一直以來,程序和執行緒的區別,這種問題一般會被面試官拿來考考面試者,可見這事就不太簡單。簡單說一點差異是,程序擁有獨立的記憶體資源資訊,而執行緒則共用父程序的資源資訊。也就是說執行緒不擁有記憶體資源,所以對系統消耗會更小。所以,執行緒也有輕量級程序的說法。
除了從資源消耗的角度來講程序執行緒的差別,還有一個值得說明的是,記憶體的可見性問題。程序與程序間的資源是隔離的,所以要想共用變數之類的,是很難做到的,但是共用執行緒變數則容易許多,因為多執行緒共用記憶體資源,所以一個執行緒對記憶體的改變,其他執行緒是可以感知到的。這就給應用做一些共用和並行控制帶來了很大的方便(其實也有程序間共用資訊,程序間通訊,只是太重了,應用層無法承受之重)。所以,一般的程式語言都會提供多執行緒的建立和使用方法。
那麼,程序執行緒是如何建立的呢?我們知道,所有上層語言,都會依賴於作業系統底層的基礎實現,進執行緒的建立作為非常核心的概念,自然也是最終要轉給作業系統處理。所以,我們想看看作業系統是如何建立程序執行緒的。
linux中程序和執行緒,擁有同樣的資料結構,也就是說沒有特別區分程序和執行緒的差別。這樣做的好處是,程序和執行緒都是作為獨立的排程單元對待,不需要特殊處理,減少了排程演演算法的複雜性。
一般的,我們可以通過 fork() 函數直接建立程序,通過 pthread_create() 函數進行執行緒的建立,當然這些都是外層的api函數。我們可以以此二者作為入口進行進執行緒的建立研究。
進執行緒作為程序排程的單元,它自然有一個對應的描述類,即 task_struct, 它描述了一個程序到底有哪些內容和能力,是一個非常重要的抽象。
// include/linux/sched.h struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif /* -1 unrunnable, 0 runnable, >0 stopped: */ volatile long state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; atomic_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ unsigned int cpu; #endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; #endif int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; int nr_cpus_allowed; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ struct sched_info sched_info; struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm; struct mm_struct *active_mm; /* Per-thread vma caching: */ struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; #endif int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_remote_wakeup:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG unsigned memcg_may_oom:1; #ifndef CONFIG_SLOB unsigned memcg_kmem_skip_account:1; #endif #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 real_start_time; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; #ifdef CONFIG_POSIX_TIMERS struct task_cputime cputime_expires; struct list_head cpu_timers[3]; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; /* * executable name, excluding path. * * - normally initialized setup_new_exec() * - access it with [gs]et_task_comm() * - lock it with task_lock() */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; /* Thread group tracking: */ u32 parent_exec_id; u32 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif #ifdef CONFIG_DEBUG_MUTEXES /* Mutex deadlock detection: */ struct mutex_waiter *blocked_on; #endif #ifdef CONFIG_TRACE_IRQFLAGS unsigned int irq_events; unsigned long hardirq_enable_ip; unsigned long hardirq_disable_ip; unsigned int hardirq_enable_event; unsigned int hardirq_disable_event; int hardirqs_enabled; int hardirq_context; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; unsigned int softirq_disable_event; unsigned int softirq_enable_event; int softirqs_enabled; int softirq_context; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #ifdef CONFIG_UBSAN unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; #ifdef CONFIG_BLOCK /* Stack plugging: */ struct blk_plug *plug; #endif /* VM state: */ struct reclaim_state *reclaim_state; struct backing_dev_info *backing_dev_info; struct io_context *io_context; /* Ptrace state: */ unsigned long ptrace_message; siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Seqence number to catch updates: */ seqcount_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_INTEL_RDT u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; struct list_head numa_entry; struct numa_group *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; #endif struct tlbflush_unmap_batch tlb_ubc; struct rcu_head rcu; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* State flags for use by tracers: */ unsigned long trace; /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; #endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ atomic_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ };
涉及到的資料結構很多,因為很多後續程序的操作,都要基於它進行,如:記憶體、io、cpu、排程器、檔案等,自然任務艱鉅。
這裡說的fork, 是應用層的api fork。用法簡單: pid_t pid = fork(); 就可以得到新的程序了,而且新的程序已經自動執行。
fork方法被呼叫一次,成功就會有兩次返回;在父程序中返回一次,返回的是子程序的pid(非0)在子程序中返回一次,返回值為0 。
這裡的fork和系統級的fork 是差不多的,下面會有詳細講解。
執行緒的建立方法為 pthread_create(),在glibc 的原始碼實現原始碼: https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_create.c;h=308db65cd4c148f8a119ed9025af194946aa2c80;hb=HEAD
首先,pthread_create() 函數是通過 versioned_symbol 進行匯出使用的。而其內部實現函數為 __pthread_create_2_1(), 如下:
versioned_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);
所以我們可以通過 __pthread_create_2_1 進行研究其他建立執行緒過程。
int __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg) { STACK_VARIABLES; const struct pthread_attr *iattr = (struct pthread_attr *) attr; struct pthread_attr default_attr; bool free_cpuset = false; bool c11 = (attr == ATTR_C11_THREAD); if (iattr == NULL || c11) { lll_lock (__default_pthread_attr_lock, LLL_PRIVATE); default_attr = __default_pthread_attr; size_t cpusetsize = default_attr.cpusetsize; if (cpusetsize > 0) { cpu_set_t *cpuset; if (__glibc_likely (__libc_use_alloca (cpusetsize))) cpuset = __alloca (cpusetsize); else { cpuset = malloc (cpusetsize); if (cpuset == NULL) { lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE); return ENOMEM; } free_cpuset = true; } memcpy (cpuset, default_attr.cpuset, cpusetsize); default_attr.cpuset = cpuset; } lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE); iattr = &default_attr; } struct pthread *pd = NULL; int err = ALLOCATE_STACK (iattr, &pd); int retval = 0; if (__glibc_unlikely (err != 0)) /* Something went wrong. Maybe a parameter of the attributes is invalid or we could not allocate memory. Note we have to translate error codes. */ { retval = err == ENOMEM ? EAGAIN : err; goto out; } /* Initialize the TCB. All initializations with zero should be performed in 'get_cached_stack'. This way we avoid doing this if the stack freshly allocated with 'mmap'. */ #if TLS_TCB_AT_TP /* Reference to the TCB itself. */ pd->header.self = pd; /* Self-reference for TLS. */ pd->header.tcb = pd; #endif /* Store the address of the start routine and the parameter. Since we do not start the function directly the stillborn thread will get the information from its thread descriptor. */ pd->start_routine = start_routine; pd->arg = arg; pd->c11 = c11; /* Copy the thread attribute flags. */ struct pthread *self = THREAD_SELF; pd->flags = ((iattr->flags & ~(ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) | (self->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))); /* Initialize the field for the ID of the thread which is waiting for us. This is a self-reference in case the thread is created detached. */ pd->joinid = iattr->flags & ATTR_FLAG_DETACHSTATE ? pd : NULL; /* The debug events are inherited from the parent. */ pd->eventbuf = self->eventbuf; /* Copy the parent's scheduling parameters. The flags will say what is valid and what is not. */ pd->schedpolicy = self->schedpolicy; pd->schedparam = self->schedparam; /* Copy the stack guard canary. */ #ifdef THREAD_COPY_STACK_GUARD THREAD_COPY_STACK_GUARD (pd); #endif /* Copy the pointer guard value. */ #ifdef THREAD_COPY_POINTER_GUARD THREAD_COPY_POINTER_GUARD (pd); #endif /* Setup tcbhead. */ tls_setup_tcbhead (pd); /* Verify the sysinfo bits were copied in allocate_stack if needed. */ #ifdef NEED_DL_SYSINFO CHECK_THREAD_SYSINFO (pd); #endif /* Inform start_thread (above) about cancellation state that might translate into inherited signal state. */ pd->parent_cancelhandling = THREAD_GETMEM (THREAD_SELF, cancelhandling); /* Determine scheduling parameters for the thread. */ if (__builtin_expect ((iattr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0, 0) && (iattr->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) != 0) { /* Use the scheduling parameters the user provided. */ if (iattr->flags & ATTR_FLAG_POLICY_SET) { pd->schedpolicy = iattr->schedpolicy; pd->flags |= ATTR_FLAG_POLICY_SET; } if (iattr->flags & ATTR_FLAG_SCHED_SET) { /* The values were validated in pthread_attr_setschedparam. */ pd->schedparam = iattr->schedparam; pd->flags |= ATTR_FLAG_SCHED_SET; } if ((pd->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) != (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) collect_default_sched (pd); } if (__glibc_unlikely (__nptl_nthreads == 1)) _IO_enable_locks (); /* Pass the descriptor to the caller. */ *newthread = (pthread_t) pd; LIBC_PROBE (pthread_create, 4, newthread, attr, start_routine, arg); /* One more thread. We cannot have the thread do this itself, since it might exist but not have been scheduled yet by the time we've returned and need to check the value to behave correctly. We must do it before creating the thread, in case it does get scheduled first and then might mistakenly think it was the only thread. In the failure case, we momentarily store a false value; this doesn't matter because there is no kosher thing a signal handler interrupting us right here can do that cares whether the thread count is correct. */ atomic_increment (&__nptl_nthreads); /* Our local value of stopped_start and thread_ran can be accessed at any time. The PD->stopped_start may only be accessed if we have ownership of PD (see CONCURRENCY NOTES above). */ bool stopped_start = false; bool thread_ran = false; /* Start the thread. */ if (__glibc_unlikely (report_thread_creation (pd))) { stopped_start = true; /* We always create the thread stopped at startup so we can notify the debugger. */ retval = create_thread (pd, iattr, &stopped_start, STACK_VARIABLES_ARGS, &thread_ran); if (retval == 0) { /* We retain ownership of PD until (a) (see CONCURRENCY NOTES above). */ /* Assert stopped_start is true in both our local copy and the PD copy. */ assert (stopped_start); assert (pd->stopped_start); /* Now fill in the information about the new thread in the newly created thread's data structure. We cannot let the new thread do this since we don't know whether it was already scheduled when we send the event. */ pd->eventbuf.eventnum = TD_CREATE; pd->eventbuf.eventdata = pd; /* Enqueue the descriptor. */ do pd->nextevent = __nptl_last_event; while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event, pd, pd->nextevent) != 0); /* Now call the function which signals the event. See CONCURRENCY NOTES for the nptl_db interface comments. */ __nptl_create_event (); } } else retval = create_thread (pd, iattr, &stopped_start, STACK_VARIABLES_ARGS, &thread_ran); if (__glibc_unlikely (retval != 0)) { if (thread_ran) /* State (c) or (d) and we may not have PD ownership (see CONCURRENCY NOTES above). We can assert that STOPPED_START must have been true because thread creation didn't fail, but thread attribute setting did. */ /* See bug 19511 which explains why doing nothing here is a resource leak for a joinable thread. */ assert (stopped_start); else { /* State (e) and we have ownership of PD (see CONCURRENCY NOTES above). */ /* Oops, we lied for a second. */ atomic_decrement (&__nptl_nthreads); /* Perhaps a thread wants to change the IDs and is waiting for this stillborn thread. */ if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0) == -2)) futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE); /* Free the resources. */ __deallocate_stack (pd); } /* We have to translate error codes. */ if (retval == ENOMEM) retval = EAGAIN; } else { /* We don't know if we have PD ownership. Once we check the local stopped_start we'll know if we're in state (a) or (b) (see CONCURRENCY NOTES above). */ if (stopped_start) /* State (a), we own PD. The thread blocked on this lock either because we're doing TD_CREATE event reporting, or for some other reason that create_thread chose. Now let it run free. */ lll_unlock (pd->lock, LLL_PRIVATE); /* We now have for sure more than one thread. The main thread might not yet have the flag set. No need to set the global variable again if this is what we use. */ THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1); } out: if (__glibc_unlikely (free_cpuset)) free (default_attr.cpuset); return retval; }
以上主要是構建建立執行緒的必要資訊,然後呼叫 create_thread() 進行執行緒的建立,而create_thread並非是核心的呼叫,我們需要再深入瞭解下。
// sysdeps/unix/sysv/linux/createThread.c static int create_thread (struct pthread *pd, const struct pthread_attr *attr, bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran) { /* Determine whether the newly created threads has to be started stopped since we have to set the scheduling parameters or set the affinity. */ if (attr != NULL && (__glibc_unlikely (attr->cpuset != NULL) || __glibc_unlikely ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0))) *stopped_start = true; pd->stopped_start = *stopped_start; if (__glibc_unlikely (*stopped_start)) /* See CONCURRENCY NOTES in nptl/pthread_creat.c. */ lll_lock (pd->lock, LLL_PRIVATE); /* We rely heavily on various flags the CLONE function understands: CLONE_VM, CLONE_FS, CLONE_FILES These flags select semantics with shared address space and file descriptors according to what POSIX requires. CLONE_SIGHAND, CLONE_THREAD This flag selects the POSIX signal semantics and various other kinds of sharing (itimers, POSIX timers, etc.). CLONE_SETTLS The sixth parameter to CLONE determines the TLS area for the new thread. CLONE_PARENT_SETTID The kernels writes the thread ID of the newly created thread into the location pointed to by the fifth parameters to CLONE. Note that it would be semantically equivalent to use CLONE_CHILD_SETTID but it is be more expensive in the kernel. CLONE_CHILD_CLEARTID The kernels clears the thread ID of a thread that has called sys_exit() in the location pointed to by the seventh parameter to CLONE. The termination signal is chosen to be zero which means no signal is sent. */ const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM | CLONE_SIGHAND | CLONE_THREAD | CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | 0); TLS_DEFINE_INIT_TP (tp, pd); // __clone 系統呼叫 if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS, clone_flags, pd, &pd->tid, tp, &pd->tid) == -1)) return errno; /* It's started now, so if we fail below, we'll have to cancel it and let it clean itself up. */ *thread_ran = true; /* Now we have the possibility to set scheduling parameters etc. */ if (attr != NULL) { INTERNAL_SYSCALL_DECL (err); int res; /* Set the affinity mask if necessary. */ if (attr->cpuset != NULL) { assert (*stopped_start); res = INTERNAL_SYSCALL (sched_setaffinity, err, 3, pd->tid, attr->cpusetsize, attr->cpuset); if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res, err))) err_out: { /* The operation failed. We have to kill the thread. We let the normal cancellation mechanism do the work. */ pid_t pid = __getpid (); INTERNAL_SYSCALL_DECL (err2); (void) INTERNAL_SYSCALL_CALL (tgkill, err2, pid, pd->tid, SIGCANCEL); return INTERNAL_SYSCALL_ERRNO (res, err); } } /* Set the scheduling parameters. */ if ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0) { assert (*stopped_start); res = INTERNAL_SYSCALL (sched_setscheduler, err, 3, pd->tid, pd->schedpolicy, &pd->schedparam); if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res, err))) goto err_out; } } return 0; }
可見最終是呼叫到了 __clone() 方法了,__clone 是一個api級的linux呼叫,此處將會根據標識,建立一個執行緒。
上面的fork()和pthread_create(), 其實都是上層的封裝,在作業系統層面,建立程序和執行緒都是使用的 do_fork();
// kernel/fork.c /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. */ long do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) { // 呼叫統一的建立程序方法 return _do_fork(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, 0); } /* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ long _do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr, unsigned long tls) { struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; long nr; /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } // 複製當前程序上下文資訊 p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); if (IS_ERR(p)) return PTR_ERR(p); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ trace_sched_process_fork(current, p); // 獲取新建立的程序id pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); get_task_struct(p); } // 將新程序放入排程佇列,以便後續可以被執行 wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } // 程序id管理 put_pid(pid); return nr; }
框架簡單清晰,先複製當前程序,然後將其喚醒等待排程,然後返回程序id。所以重點是程序如何複製和喚醒,當然其中的標識位設定是很重要的,它決定如何建立程序。
/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ static __latent_entropy struct task_struct *copy_process( unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { int retval; struct task_struct *p; /* * Don't allow sharing the root directory with processes in a different * namespace */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); /* * If the new process will be in a different pid or user namespace * do not allow it to share a thread group with the forking task. */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) return ERR_PTR(-EINVAL); } retval = -ENOMEM; // 複製當前程序的 task_struct 結構體 p = dup_task_struct(current, node); if (!p) goto fork_out; /* * This _must_ happen before we call free_task(), i.e. before we jump * to any of the bad_fork_* labels. This is to avoid freeing * p->set_child_tid which is (ab)used as a kthread's data pointer for * kernel threads (PF_KTHREAD). */ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; ftrace_graph_init_task(p); rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; #endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime.seqcount); p->vtime.starttime = 0; p->vtime.state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cpu_timers_init(p); p->start_time = ktime_get_ns(); p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_threadgroup_lock; } #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; p->hardirqs_enabled = 0; p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; p->hardirq_disable_event = 0; p->softirqs_enabled = 1; p->softirq_enable_ip = _THIS_IP_; p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; p->hardirq_context = 0; p->softirq_context = 0; #endif p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; #endif // 複製其他獨立資訊到新程序中 /* Perform scheduler related setup. Assign this task to a CPU. */ // 設定排程器,及cpu分配 retval = sched_fork(clone_flags, p); if (retval) goto bad_fork_cleanup_policy; retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; // 檔案複製,實際上是 fd 的複製 retval = copy_files(clone_flags, p); if (retval) goto bad_fork_cleanup_semundo; // 複製fs結構 retval = copy_fs(clone_flags, p); if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs; // 複製訊號監聽,如果是執行緒則不需要 retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand; // 複製記憶體,重量級操作,建立執行緒時則不會真的複製 retval = copy_mm(clone_flags, p); if (retval) goto bad_fork_cleanup_signal; // 複製各大名稱空間, pid,uts,ipc,net,cgroup retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; // 複製執行緒本地儲存(thread local storage) retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); if (retval) goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; } } #ifdef CONFIG_BLOCK p->plug = NULL; #endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { if (clone_flags & CLONE_PARENT) p->exit_signal = current->group_leader->exit_signal; else p->exit_signal = (clone_flags & CSIGNAL); p->group_leader = p; p->tgid = p->pid; } p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; cgroup_threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ retval = cgroup_can_fork(p); if (retval) goto bad_fork_free_pid; /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; } klp_copy_process(p); spin_lock(¤t->sighand->siglock); /* * Copy seccomp details explicitly here, in case they were changed * before holding sighand lock. */ copy_seccomp(p); rseq_fork(p, clone_flags); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same * tasklist_lock with adding child to the process tree * for propagate_has_child_subreaper optimization. */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; } total_forks++; spin_unlock(¤t->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); cgroup_threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); return p; bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); bad_fork_free_pid: cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); free_task(p); fork_out: return ERR_PTR(retval); }
可以看出建立一個新程序,還是一件非常重的事,copy了很多的資訊,可以說肯定會非常耗效能和資源,所以不要隨便建立不必要的程序,它是寶貴的。下面我們就幾個簡單的複製過程瞭解下,都幹了什麼。
// 複製執行緒結構外殼 static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; struct vm_struct *stack_vm_area; int err; if (node == NUMA_NO_NODE) node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; stack = alloc_thread_stack_node(tsk, node); if (!stack) goto free_tsk; stack_vm_area = task_stack_vm_area(tsk); err = arch_dup_task_struct(tsk, orig); /* * arch_dup_task_struct() clobbers the stack-related fields. Make * sure they're properly initialized before using any stack-related * functions again. */ tsk->stack = stack; #ifdef CONFIG_VMAP_STACK tsk->stack_vm_area = stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK atomic_set(&tsk->stack_refcount, 1); #endif if (err) goto free_stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under * the sighand lock in case orig has changed between now and * then. Until then, filter must be NULL to avoid messing up * the usage counts on the error path calling free_task. */ tsk->seccomp.filter = NULL; #endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif /* * One for us, one for whoever does the "release_task()" (usually * parent) */ atomic_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; account_kernel_stack(tsk, 1); kcov_task_init(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; #endif return tsk; free_stack: free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; } // 複製記憶體空間 static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; #endif tsk->mm = NULL; tsk->active_mm = NULL; /* * Are we cloning a kernel thread? * * We need to steal a active VM for that.. */ oldmm = current->mm; if (!oldmm) return 0; /* initialize the new vmacache entries */ vmacache_flush(tsk); // 一般情況下可以直接複用原有的記憶體資訊 if (clone_flags & CLONE_VM) { // 增加記憶體的使用計數,然後複用原有的指標即可 mmget(oldmm); mm = oldmm; goto good_mm; } retval = -ENOMEM; // 重量級操作,記憶體複製 mm = dup_mm(tsk); if (!mm) goto fail_nomem; good_mm: tsk->mm = mm; tsk->active_mm = mm; return 0; fail_nomem: return retval; } /* * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ static struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; mm = allocate_mm(); if (!mm) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; err = dup_mmap(mm, oldmm); if (err) goto free_pt; mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; return mm; free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; mmput(mm); fail_nomem: return NULL; }
針對複製過程,每個方法有其獨特的用處,這也體現了設計模式中的單一職責原則。複製程序結構就是複製最外層的結構體,而複製記憶體,則是針對各區域的記憶體進行依次的複製。
程序建立好之後,自然是要交給排程系統處理的。理論上,只需要將將新建的程序加入到排程佇列,後續的事則可以不用fork() 管了。具體是否是這樣呢?
// kernel/sched/core.c // 將新建程序加入到排程佇列,並設定喚醒標識 /* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Nothing relies on rq->lock after this, so its fine to * drop it. */ rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); } #endif task_rq_unlock(rq, p, &rf); } // 新程序加入佇列 void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); }
最後,我們看下put_pid() 都做了些什麼?感覺像是將pid寫入到全域性空間中,進行統一管理,但是實際上卻不是的,它實際上是一個參照計數的處理過程。
// kernel/pid.c // 減少pid_namespace的參照計數 void put_pid(struct pid *pid) { struct pid_namespace *ns; if (!pid) return; ns = pid->numbers[pid->level].ns; if ((atomic_read(&pid->count) == 1) || atomic_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } } EXPORT_SYMBOL_GPL(put_pid); // kernel/pid_namespace.c void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; // 放到init程序pid空間 while (ns != &init_pid_ns) { parent = ns->parent; if (!kref_put(&ns->kref, free_pid_ns)) break; ns = parent; } } EXPORT_SYMBOL_GPL(put_pid_ns); static void free_pid_ns(struct kref *kref) { struct pid_namespace *ns; ns = container_of(kref, struct pid_namespace, kref); destroy_pid_namespace(ns); }
建立程序執行緒,其實包含了大量的複雜操作,每個細節都值得我們深入研究,我們此處僅簡單看幾個方向,如記憶體的複製過程,也可以大致理解進執行緒的差別,相信也能在一定程度上為大家解惑。