fork.c 85 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * linux/kernel/fork.c
  4. *
  5. * Copyright (C) 1991, 1992 Linus Torvalds
  6. */
  7. /*
  8. * 'fork.c' contains the help-routines for the 'fork' system call
  9. * (see also entry.S and others).
  10. * Fork is rather simple, once you get the hang of it, but the memory
  11. * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
  12. */
  13. #include <linux/anon_inodes.h>
  14. #include <linux/slab.h>
  15. #include <linux/sched/autogroup.h>
  16. #include <linux/sched/mm.h>
  17. #include <linux/sched/coredump.h>
  18. #include <linux/sched/user.h>
  19. #include <linux/sched/numa_balancing.h>
  20. #include <linux/sched/stat.h>
  21. #include <linux/sched/task.h>
  22. #include <linux/sched/task_stack.h>
  23. #include <linux/sched/cputime.h>
  24. #include <linux/sched/ext.h>
  25. #include <linux/seq_file.h>
  26. #include <linux/rtmutex.h>
  27. #include <linux/init.h>
  28. #include <linux/unistd.h>
  29. #include <linux/module.h>
  30. #include <linux/vmalloc.h>
  31. #include <linux/completion.h>
  32. #include <linux/personality.h>
  33. #include <linux/mempolicy.h>
  34. #include <linux/sem.h>
  35. #include <linux/file.h>
  36. #include <linux/fdtable.h>
  37. #include <linux/iocontext.h>
  38. #include <linux/key.h>
  39. #include <linux/kmsan.h>
  40. #include <linux/binfmts.h>
  41. #include <linux/mman.h>
  42. #include <linux/mmu_notifier.h>
  43. #include <linux/fs.h>
  44. #include <linux/mm.h>
  45. #include <linux/mm_inline.h>
  46. #include <linux/memblock.h>
  47. #include <linux/nsproxy.h>
  48. #include <linux/capability.h>
  49. #include <linux/cpu.h>
  50. #include <linux/cgroup.h>
  51. #include <linux/security.h>
  52. #include <linux/hugetlb.h>
  53. #include <linux/seccomp.h>
  54. #include <linux/swap.h>
  55. #include <linux/syscalls.h>
  56. #include <linux/syscall_user_dispatch.h>
  57. #include <linux/jiffies.h>
  58. #include <linux/futex.h>
  59. #include <linux/compat.h>
  60. #include <linux/kthread.h>
  61. #include <linux/task_io_accounting_ops.h>
  62. #include <linux/rcupdate.h>
  63. #include <linux/ptrace.h>
  64. #include <linux/mount.h>
  65. #include <linux/audit.h>
  66. #include <linux/memcontrol.h>
  67. #include <linux/ftrace.h>
  68. #include <linux/proc_fs.h>
  69. #include <linux/profile.h>
  70. #include <linux/rmap.h>
  71. #include <linux/ksm.h>
  72. #include <linux/acct.h>
  73. #include <linux/userfaultfd_k.h>
  74. #include <linux/tsacct_kern.h>
  75. #include <linux/cn_proc.h>
  76. #include <linux/freezer.h>
  77. #include <linux/delayacct.h>
  78. #include <linux/taskstats_kern.h>
  79. #include <linux/tty.h>
  80. #include <linux/fs_struct.h>
  81. #include <linux/magic.h>
  82. #include <linux/perf_event.h>
  83. #include <linux/posix-timers.h>
  84. #include <linux/user-return-notifier.h>
  85. #include <linux/oom.h>
  86. #include <linux/khugepaged.h>
  87. #include <linux/signalfd.h>
  88. #include <linux/uprobes.h>
  89. #include <linux/aio.h>
  90. #include <linux/compiler.h>
  91. #include <linux/sysctl.h>
  92. #include <linux/kcov.h>
  93. #include <linux/livepatch.h>
  94. #include <linux/thread_info.h>
  95. #include <linux/stackleak.h>
  96. #include <linux/kasan.h>
  97. #include <linux/scs.h>
  98. #include <linux/io_uring.h>
  99. #include <linux/bpf.h>
  100. #include <linux/stackprotector.h>
  101. #include <linux/user_events.h>
  102. #include <linux/iommu.h>
  103. #include <linux/rseq.h>
  104. #include <uapi/linux/pidfd.h>
  105. #include <linux/pidfs.h>
  106. #include <linux/tick.h>
  107. #include <asm/pgalloc.h>
  108. #include <linux/uaccess.h>
  109. #include <asm/mmu_context.h>
  110. #include <asm/cacheflush.h>
  111. #include <asm/tlbflush.h>
  112. #include <trace/events/sched.h>
  113. #define CREATE_TRACE_POINTS
  114. #include <trace/events/task.h>
  115. #include <kunit/visibility.h>
  116. /*
  117. * Minimum number of threads to boot the kernel
  118. */
  119. #define MIN_THREADS 20
  120. /*
  121. * Maximum number of threads
  122. */
  123. #define MAX_THREADS FUTEX_TID_MASK
  124. /*
  125. * Counters protected by write_lock_irq(&tasklist_lock)
  126. */
  127. unsigned long total_forks; /* Handle normal Linux uptimes. */
  128. int nr_threads; /* The idle threads do not count.. */
  129. static int max_threads; /* tunable limit on nr_threads */
  130. #define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
  131. static const char * const resident_page_types[] = {
  132. NAMED_ARRAY_INDEX(MM_FILEPAGES),
  133. NAMED_ARRAY_INDEX(MM_ANONPAGES),
  134. NAMED_ARRAY_INDEX(MM_SWAPENTS),
  135. NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
  136. };
  137. DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  138. __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
  #ifdef CONFIG_PROVE_RCU
  /*
   * Ask lockdep whether tasklist_lock is currently held.
   * Only built when CONFIG_PROVE_RCU is enabled.
   *
   * Returns the result of lockdep_is_held() on tasklist_lock.
   */
  int lockdep_tasklist_lock_is_held(void)
  {
  	int held = lockdep_is_held(&tasklist_lock);

  	return held;
  }
  EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
  #endif /* CONFIG_PROVE_RCU */
  146. int nr_processes(void)
  147. {
  148. int cpu;
  149. int total = 0;
  150. for_each_possible_cpu(cpu)
  151. total += per_cpu(process_counts, cpu);
  152. return total;
  153. }
  154. void __weak arch_release_task_struct(struct task_struct *tsk)
  155. {
  156. }
  157. static struct kmem_cache *task_struct_cachep;
  158. static inline struct task_struct *alloc_task_struct_node(int node)
  159. {
  160. return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
  161. }
  162. static inline void free_task_struct(struct task_struct *tsk)
  163. {
  164. kmem_cache_free(task_struct_cachep, tsk);
  165. }
  166. /*
  167. * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  168. * kmemcache based allocator.
  169. */
  170. # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
  171. # ifdef CONFIG_VMAP_STACK
  172. /*
  173. * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
  174. * flush. Try to minimize the number of calls by caching stacks.
  175. */
  176. #define NR_CACHED_STACKS 2
  177. static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
  178. struct vm_stack {
  179. struct rcu_head rcu;
  180. struct vm_struct *stack_vm_area;
  181. };
  182. static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
  183. {
  184. unsigned int i;
  185. for (i = 0; i < NR_CACHED_STACKS; i++) {
  186. struct vm_struct *tmp = NULL;
  187. if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm))
  188. return true;
  189. }
  190. return false;
  191. }
  192. static void thread_stack_free_rcu(struct rcu_head *rh)
  193. {
  194. struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
  195. if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
  196. return;
  197. vfree(vm_stack);
  198. }
  199. static void thread_stack_delayed_free(struct task_struct *tsk)
  200. {
  201. struct vm_stack *vm_stack = tsk->stack;
  202. vm_stack->stack_vm_area = tsk->stack_vm_area;
  203. call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
  204. }
  205. static int free_vm_stack_cache(unsigned int cpu)
  206. {
  207. struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
  208. int i;
  209. for (i = 0; i < NR_CACHED_STACKS; i++) {
  210. struct vm_struct *vm_stack = cached_vm_stacks[i];
  211. if (!vm_stack)
  212. continue;
  213. vfree(vm_stack->addr);
  214. cached_vm_stacks[i] = NULL;
  215. }
  216. return 0;
  217. }
  218. static int memcg_charge_kernel_stack(struct vm_struct *vm)
  219. {
  220. int i;
  221. int ret;
  222. int nr_charged = 0;
  223. BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
  224. for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
  225. ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
  226. if (ret)
  227. goto err;
  228. nr_charged++;
  229. }
  230. return 0;
  231. err:
  232. for (i = 0; i < nr_charged; i++)
  233. memcg_kmem_uncharge_page(vm->pages[i], 0);
  234. return ret;
  235. }
  236. static int alloc_thread_stack_node(struct task_struct *tsk, int node)
  237. {
  238. struct vm_struct *vm;
  239. void *stack;
  240. int i;
  241. for (i = 0; i < NR_CACHED_STACKS; i++) {
  242. struct vm_struct *s;
  243. s = this_cpu_xchg(cached_stacks[i], NULL);
  244. if (!s)
  245. continue;
  246. /* Reset stack metadata. */
  247. kasan_unpoison_range(s->addr, THREAD_SIZE);
  248. stack = kasan_reset_tag(s->addr);
  249. /* Clear stale pointers from reused stack. */
  250. memset(stack, 0, THREAD_SIZE);
  251. if (memcg_charge_kernel_stack(s)) {
  252. vfree(s->addr);
  253. return -ENOMEM;
  254. }
  255. tsk->stack_vm_area = s;
  256. tsk->stack = stack;
  257. return 0;
  258. }
  259. /*
  260. * Allocated stacks are cached and later reused by new threads,
  261. * so memcg accounting is performed manually on assigning/releasing
  262. * stacks to tasks. Drop __GFP_ACCOUNT.
  263. */
  264. stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
  265. VMALLOC_START, VMALLOC_END,
  266. THREADINFO_GFP & ~__GFP_ACCOUNT,
  267. PAGE_KERNEL,
  268. 0, node, __builtin_return_address(0));
  269. if (!stack)
  270. return -ENOMEM;
  271. vm = find_vm_area(stack);
  272. if (memcg_charge_kernel_stack(vm)) {
  273. vfree(stack);
  274. return -ENOMEM;
  275. }
  276. /*
  277. * We can't call find_vm_area() in interrupt context, and
  278. * free_thread_stack() can be called in interrupt context,
  279. * so cache the vm_struct.
  280. */
  281. tsk->stack_vm_area = vm;
  282. stack = kasan_reset_tag(stack);
  283. tsk->stack = stack;
  284. return 0;
  285. }
  286. static void free_thread_stack(struct task_struct *tsk)
  287. {
  288. if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
  289. thread_stack_delayed_free(tsk);
  290. tsk->stack = NULL;
  291. tsk->stack_vm_area = NULL;
  292. }
  293. # else /* !CONFIG_VMAP_STACK */
  294. static void thread_stack_free_rcu(struct rcu_head *rh)
  295. {
  296. __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
  297. }
  298. static void thread_stack_delayed_free(struct task_struct *tsk)
  299. {
  300. struct rcu_head *rh = tsk->stack;
  301. call_rcu(rh, thread_stack_free_rcu);
  302. }
  303. static int alloc_thread_stack_node(struct task_struct *tsk, int node)
  304. {
  305. struct page *page = alloc_pages_node(node, THREADINFO_GFP,
  306. THREAD_SIZE_ORDER);
  307. if (likely(page)) {
  308. tsk->stack = kasan_reset_tag(page_address(page));
  309. return 0;
  310. }
  311. return -ENOMEM;
  312. }
  313. static void free_thread_stack(struct task_struct *tsk)
  314. {
  315. thread_stack_delayed_free(tsk);
  316. tsk->stack = NULL;
  317. }
  318. # endif /* CONFIG_VMAP_STACK */
  319. # else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
  320. static struct kmem_cache *thread_stack_cache;
  321. static void thread_stack_free_rcu(struct rcu_head *rh)
  322. {
  323. kmem_cache_free(thread_stack_cache, rh);
  324. }
  325. static void thread_stack_delayed_free(struct task_struct *tsk)
  326. {
  327. struct rcu_head *rh = tsk->stack;
  328. call_rcu(rh, thread_stack_free_rcu);
  329. }
  330. static int alloc_thread_stack_node(struct task_struct *tsk, int node)
  331. {
  332. unsigned long *stack;
  333. stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
  334. stack = kasan_reset_tag(stack);
  335. tsk->stack = stack;
  336. return stack ? 0 : -ENOMEM;
  337. }
  338. static void free_thread_stack(struct task_struct *tsk)
  339. {
  340. thread_stack_delayed_free(tsk);
  341. tsk->stack = NULL;
  342. }
  343. void thread_stack_cache_init(void)
  344. {
  345. thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
  346. THREAD_SIZE, THREAD_SIZE, 0, 0,
  347. THREAD_SIZE, NULL);
  348. BUG_ON(thread_stack_cache == NULL);
  349. }
  350. # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
  351. /* SLAB cache for signal_struct structures (tsk->signal) */
  352. static struct kmem_cache *signal_cachep;
  353. /* SLAB cache for sighand_struct structures (tsk->sighand) */
  354. struct kmem_cache *sighand_cachep;
  355. /* SLAB cache for files_struct structures (tsk->files) */
  356. struct kmem_cache *files_cachep;
  357. /* SLAB cache for fs_struct structures (tsk->fs) */
  358. struct kmem_cache *fs_cachep;
  359. /* SLAB cache for vm_area_struct structures */
  360. static struct kmem_cache *vm_area_cachep;
  361. /* SLAB cache for mm_struct structures (tsk->mm) */
  362. static struct kmem_cache *mm_cachep;
  #ifdef CONFIG_PER_VMA_LOCK

  /* SLAB cache for vm_area_struct.lock */
  static struct kmem_cache *vma_lock_cachep;

  /*
   * Allocate and initialise the per-VMA lock of @vma.
   *
   * On success the rwsem is initialised and vm_lock_seq is set to -1.
   * On failure vma->vm_lock is left NULL.
   *
   * Returns true on success, false if the allocation failed.
   */
  static bool vma_lock_alloc(struct vm_area_struct *vma)
  {
  	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
  	if (vma->vm_lock) {
  		init_rwsem(&vma->vm_lock->lock);
  		vma->vm_lock_seq = -1;
  		return true;
  	}

  	return false;
  }

  /* Return the per-VMA lock of @vma to its slab cache. */
  static inline void vma_lock_free(struct vm_area_struct *vma)
  {
  	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
  }

  #else /* CONFIG_PER_VMA_LOCK */

  /* Without per-VMA locks there is nothing to allocate; always succeeds. */
  static inline bool vma_lock_alloc(struct vm_area_struct *vma)
  {
  	return true;
  }

  /* Without per-VMA locks there is nothing to free. */
  static inline void vma_lock_free(struct vm_area_struct *vma)
  {
  }

  #endif /* CONFIG_PER_VMA_LOCK */
  383. struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
  384. {
  385. struct vm_area_struct *vma;
  386. vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
  387. if (!vma)
  388. return NULL;
  389. vma_init(vma, mm);
  390. if (!vma_lock_alloc(vma)) {
  391. kmem_cache_free(vm_area_cachep, vma);
  392. return NULL;
  393. }
  394. return vma;
  395. }
  396. struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
  397. {
  398. struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
  399. if (!new)
  400. return NULL;
  401. ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
  402. ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
  403. /*
  404. * orig->shared.rb may be modified concurrently, but the clone
  405. * will be reinitialized.
  406. */
  407. data_race(memcpy(new, orig, sizeof(*new)));
  408. if (!vma_lock_alloc(new)) {
  409. kmem_cache_free(vm_area_cachep, new);
  410. return NULL;
  411. }
  412. INIT_LIST_HEAD(&new->anon_vma_chain);
  413. vma_numab_state_init(new);
  414. dup_anon_vma_name(orig, new);
  415. return new;
  416. }
  417. void __vm_area_free(struct vm_area_struct *vma)
  418. {
  419. vma_numab_state_free(vma);
  420. free_anon_vma_name(vma);
  421. vma_lock_free(vma);
  422. kmem_cache_free(vm_area_cachep, vma);
  423. }
  #ifdef CONFIG_PER_VMA_LOCK
  /*
   * RCU callback queued by vm_area_free(): recover the VMA from its
   * embedded vm_rcu head and free it.
   */
  static void vm_area_free_rcu_cb(struct rcu_head *head)
  {
  	struct vm_area_struct *victim;

  	victim = container_of(head, struct vm_area_struct, vm_rcu);

  	/* The vma should not be locked while being destroyed. */
  	VM_BUG_ON_VMA(rwsem_is_locked(&victim->vm_lock->lock), victim);

  	__vm_area_free(victim);
  }
  #endif /* CONFIG_PER_VMA_LOCK */
  /*
   * Free @vma. With CONFIG_PER_VMA_LOCK the release is deferred through
   * call_rcu(); otherwise the VMA is freed on the spot.
   */
  void vm_area_free(struct vm_area_struct *vma)
  {
  #ifndef CONFIG_PER_VMA_LOCK
  	__vm_area_free(vma);
  #else
  	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
  #endif
  }
  442. static void account_kernel_stack(struct task_struct *tsk, int account)
  443. {
  444. if (IS_ENABLED(CONFIG_VMAP_STACK)) {
  445. struct vm_struct *vm = task_stack_vm_area(tsk);
  446. int i;
  447. for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
  448. mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
  449. account * (PAGE_SIZE / 1024));
  450. } else {
  451. void *stack = task_stack_page(tsk);
  452. /* All stack pages are in the same node. */
  453. mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
  454. account * (THREAD_SIZE / 1024));
  455. }
  456. }
  457. void exit_task_stack_account(struct task_struct *tsk)
  458. {
  459. account_kernel_stack(tsk, -1);
  460. if (IS_ENABLED(CONFIG_VMAP_STACK)) {
  461. struct vm_struct *vm;
  462. int i;
  463. vm = task_stack_vm_area(tsk);
  464. for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
  465. memcg_kmem_uncharge_page(vm->pages[i], 0);
  466. }
  467. }
  468. static void release_task_stack(struct task_struct *tsk)
  469. {
  470. if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
  471. return; /* Better to leak the stack than to free prematurely */
  472. free_thread_stack(tsk);
  473. }
  #ifdef CONFIG_THREAD_INFO_IN_TASK
  /*
   * Drop one reference on @tsk's stack; the stack is released when the
   * reference count reaches zero.
   */
  void put_task_stack(struct task_struct *tsk)
  {
  	if (!refcount_dec_and_test(&tsk->stack_refcount))
  		return;

  	release_task_stack(tsk);
  }
  #endif /* CONFIG_THREAD_INFO_IN_TASK */
  481. void free_task(struct task_struct *tsk)
  482. {
  483. #ifdef CONFIG_SECCOMP
  484. WARN_ON_ONCE(tsk->seccomp.filter);
  485. #endif
  486. release_user_cpus_ptr(tsk);
  487. scs_release(tsk);
  488. #ifndef CONFIG_THREAD_INFO_IN_TASK
  489. /*
  490. * The task is finally done with both the stack and thread_info,
  491. * so free both.
  492. */
  493. release_task_stack(tsk);
  494. #else
  495. /*
  496. * If the task had a separate stack allocation, it should be gone
  497. * by now.
  498. */
  499. WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
  500. #endif
  501. rt_mutex_debug_task_free(tsk);
  502. ftrace_graph_exit_task(tsk);
  503. arch_release_task_struct(tsk);
  504. if (tsk->flags & PF_KTHREAD)
  505. free_kthread_struct(tsk);
  506. bpf_task_storage_free(tsk);
  507. free_task_struct(tsk);
  508. }
  509. EXPORT_SYMBOL(free_task);
  510. static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
  511. {
  512. struct file *exe_file;
  513. exe_file = get_mm_exe_file(oldmm);
  514. RCU_INIT_POINTER(mm->exe_file, exe_file);
  515. /*
  516. * We depend on the oldmm having properly denied write access to the
  517. * exe_file already.
  518. */
  519. if (exe_file && deny_write_access(exe_file))
  520. pr_warn_once("deny_write_access() failed in %s\n", __func__);
  521. }
  522. #ifdef CONFIG_MMU
/*
 * dup_mmap - copy the VMA layout of @oldmm into the new mm @mm (MMU build).
 * @mm:    the freshly initialised mm being populated.
 * @oldmm: the mm being duplicated.
 *
 * Takes the mmap write lock on both mms for the whole operation. The maple
 * tree is first cloned wholesale with __mt_dup(); the loop then walks the
 * clone and replaces each entry with a duplicated VMA, or clears the entry
 * for VM_DONTCOPY mappings.
 *
 * Return: 0 on success; -EINTR if the lock wait is interrupted or a fatal
 * signal is pending; -ENOMEM via the fail_nomem* labels; otherwise the error
 * returned by the helper that failed.
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
					struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp;
	int retval;
	/* Pages charged for the VMA currently being copied (for unwinding). */
	unsigned long charge = 0;
	/* userfaultfd fork contexts collected while copying VMAs. */
	LIST_HEAD(uf);
	VMA_ITERATOR(vmi, mm, 0);

	if (mmap_write_lock_killable(oldmm))
		return -EINTR;
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

	/* No ordering required: file already has been exposed. */
	dup_mm_exe_file(mm, oldmm);

	/*
	 * Start from the parent's accounting; VM_DONTCOPY mappings are
	 * subtracted again inside the loop.
	 */
	mm->total_vm = oldmm->total_vm;
	mm->data_vm = oldmm->data_vm;
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

	/* Use __mt_dup() to efficiently build an identical maple tree. */
	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
	if (unlikely(retval))
		goto out;

	mt_clear_in_rcu(vmi.mas.tree);
	for_each_vma(vmi, mpnt) {
		struct file *file;

		vma_start_write(mpnt);
		if (mpnt->vm_flags & VM_DONTCOPY) {
			/* Not inherited: drop the cloned entry and its accounting. */
			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
						    mpnt->vm_end, GFP_KERNEL);
			if (retval)
				goto loop_out;

			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
			continue;
		}
		charge = 0;
		/*
		 * Don't duplicate many vmas if we've been oom-killed (for
		 * example)
		 */
		if (fatal_signal_pending(current)) {
			retval = -EINTR;
			goto loop_out;
		}
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
				goto fail_nomem;
			/* Remember the charge so fail_nomem can undo it. */
			charge = len;
		}
		tmp = vm_area_dup(mpnt);
		if (!tmp)
			goto fail_nomem;

		/* track_pfn_copy() will later take care of copying internal state. */
		if (unlikely(tmp->vm_flags & VM_PFNMAP))
			untrack_pfn_clear(tmp);

		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
			goto fail_nomem_policy;
		tmp->vm_mm = mm;
		retval = dup_userfaultfd(tmp, &uf);
		if (retval)
			goto fail_nomem_anon_vma_fork;
		if (tmp->vm_flags & VM_WIPEONFORK) {
			/*
			 * VM_WIPEONFORK gets a clean slate in the child.
			 * Don't prepare anon_vma until fault since we don't
			 * copy page for current vma.
			 */
			tmp->anon_vma = NULL;
		} else if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		vm_flags_clear(tmp, VM_LOCKED_MASK);
		/*
		 * Copy/update hugetlb private vma information.
		 */
		if (is_vm_hugetlb_page(tmp))
			hugetlb_dup_vma_private(tmp);

		/*
		 * Link the vma into the MT. After using __mt_dup(), memory
		 * allocation is not necessary here, so it cannot fail.
		 */
		vma_iter_bulk_store(&vmi, tmp);

		mm->map_count++;

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		file = tmp->vm_file;
		if (file) {
			struct address_space *mapping = file->f_mapping;

			/* The new VMA holds its own reference on the file. */
			get_file(file);
			i_mmap_lock_write(mapping);
			if (vma_is_shared_maywrite(tmp))
				mapping_allow_writable(mapping);
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_interval_tree_insert_after(tmp, mpnt,
					&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			i_mmap_unlock_write(mapping);
		}

		/*
		 * retval is 0 here (dup_userfaultfd() succeeded), so a
		 * VM_WIPEONFORK mapping, whose pages are not copied, falls
		 * through the error check below.
		 */
		if (!(tmp->vm_flags & VM_WIPEONFORK))
			retval = copy_page_range(tmp, mpnt);

		if (retval) {
			/*
			 * tmp is already stored in the tree; step to the next
			 * VMA so that loop_out marks the failure point after it.
			 */
			mpnt = vma_next(&vmi);
			goto loop_out;
		}
	}
	/* a new mm has just been created */
	retval = arch_dup_mmap(oldmm, mm);
loop_out:
	vma_iter_free(&vmi);
	if (!retval) {
		mt_set_in_rcu(vmi.mas.tree);
		ksm_fork(mm, oldmm);
		khugepaged_fork(mm, oldmm);
	} else if (mpnt) {
		/*
		 * The entire maple tree has already been duplicated. If the
		 * mmap duplication fails, mark the failure point with
		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
		 * stop releasing VMAs that have not been duplicated after this
		 * point.
		 */
		mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
		mas_store(&vmi.mas, XA_ZERO_ENTRY);
	}
out:
	/* Also reached directly when __mt_dup() fails, before any VMA was visited. */
	mmap_write_unlock(mm);
	flush_tlb_mm(oldmm);
	mmap_write_unlock(oldmm);
	if (!retval)
		dup_userfaultfd_complete(&uf);
	else
		dup_userfaultfd_fail(&uf);
	return retval;

	/*
	 * Unwind ladder for the VMA being copied when the failure occurred;
	 * each label undoes one more step before rejoining loop_out.
	 */
fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
fail_nomem_policy:
	vm_area_free(tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto loop_out;
}
  670. static inline int mm_alloc_pgd(struct mm_struct *mm)
  671. {
  672. mm->pgd = pgd_alloc(mm);
  673. if (unlikely(!mm->pgd))
  674. return -ENOMEM;
  675. return 0;
  676. }
  677. static inline void mm_free_pgd(struct mm_struct *mm)
  678. {
  679. pgd_free(mm, mm->pgd);
  680. }
  681. #else
  682. static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
  683. {
  684. mmap_write_lock(oldmm);
  685. dup_mm_exe_file(mm, oldmm);
  686. mmap_write_unlock(oldmm);
  687. return 0;
  688. }
  689. #define mm_alloc_pgd(mm) (0)
  690. #define mm_free_pgd(mm)
  691. #endif /* CONFIG_MMU */
  692. static void check_mm(struct mm_struct *mm)
  693. {
  694. int i;
  695. BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
  696. "Please make sure 'struct resident_page_types[]' is updated as well");
  697. for (i = 0; i < NR_MM_COUNTERS; i++) {
  698. long x = percpu_counter_sum(&mm->rss_stat[i]);
  699. if (unlikely(x))
  700. pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
  701. mm, resident_page_types[i], x);
  702. }
  703. if (mm_pgtables_bytes(mm))
  704. pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
  705. mm_pgtables_bytes(mm));
  706. #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
  707. VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
  708. #endif
  709. }
/*
 * mm_struct objects come from the mm_cachep slab cache. allocate_mm() may
 * return NULL and does not initialise the object; free_mm() returns it to
 * the cache.
 */
#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
  712. static void do_check_lazy_tlb(void *arg)
  713. {
  714. struct mm_struct *mm = arg;
  715. WARN_ON_ONCE(current->active_mm == mm);
  716. }
  717. static void do_shoot_lazy_tlb(void *arg)
  718. {
  719. struct mm_struct *mm = arg;
  720. if (current->active_mm == mm) {
  721. WARN_ON_ONCE(current->mm);
  722. current->active_mm = &init_mm;
  723. switch_mm(mm, &init_mm, current);
  724. }
  725. }
  726. static void cleanup_lazy_tlbs(struct mm_struct *mm)
  727. {
  728. if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
  729. /*
  730. * In this case, lazy tlb mms are refounted and would not reach
  731. * __mmdrop until all CPUs have switched away and mmdrop()ed.
  732. */
  733. return;
  734. }
  735. /*
  736. * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
  737. * requires lazy mm users to switch to another mm when the refcount
  738. * drops to zero, before the mm is freed. This requires IPIs here to
  739. * switch kernel threads to init_mm.
  740. *
  741. * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
  742. * switch with the final userspace teardown TLB flush which leaves the
  743. * mm lazy on this CPU but no others, reducing the need for additional
  744. * IPIs here. There are cases where a final IPI is still required here,
  745. * such as the final mmdrop being performed on a different CPU than the
  746. * one exiting, or kernel threads using the mm when userspace exits.
  747. *
  748. * IPI overheads have not found to be expensive, but they could be
  749. * reduced in a number of possible ways, for example (roughly
  750. * increasing order of complexity):
  751. * - The last lazy reference created by exit_mm() could instead switch
  752. * to init_mm, however it's probable this will run on the same CPU
  753. * immediately afterwards, so this may not reduce IPIs much.
  754. * - A batch of mms requiring IPIs could be gathered and freed at once.
  755. * - CPUs store active_mm where it can be remotely checked without a
  756. * lock, to filter out false-positives in the cpumask.
  757. * - After mm_users or mm_count reaches zero, switching away from the
  758. * mm could clear mm_cpumask to reduce some IPIs, perhaps together
  759. * with some batching or delaying of the final IPIs.
  760. * - A delayed freeing and RCU-like quiescing sequence based on mm
  761. * switching to avoid IPIs completely.
  762. */
  763. on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
  764. if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
  765. on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
  766. }
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * Must never be reached for init_mm (BUG). @mm is not expected to be the
 * caller's current->mm, nor - once lazy users have been cleaned up - its
 * current->active_mm (both only WARN).
 */
void __mmdrop(struct mm_struct *mm)
{
	BUG_ON(mm == &init_mm);
	WARN_ON_ONCE(mm == current->mm);

	/* Ensure no CPUs are using this as their lazy tlb mm */
	cleanup_lazy_tlbs(mm);

	WARN_ON_ONCE(mm == current->active_mm);
	/* Release what mm_init() set up, then report leftover accounting. */
	mm_free_pgd(mm);
	destroy_context(mm);
	mmu_notifier_subscriptions_destroy(mm);
	check_mm(mm);
	put_user_ns(mm->user_ns);
	mm_pasid_drop(mm);
	mm_destroy_cid(mm);
	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);

	/* Finally hand the mm_struct itself back to its slab cache. */
	free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
  790. static void mmdrop_async_fn(struct work_struct *work)
  791. {
  792. struct mm_struct *mm;
  793. mm = container_of(work, struct mm_struct, async_put_work);
  794. __mmdrop(mm);
  795. }
  796. static void mmdrop_async(struct mm_struct *mm)
  797. {
  798. if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
  799. INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
  800. schedule_work(&mm->async_put_work);
  801. }
  802. }
  803. static inline void free_signal_struct(struct signal_struct *sig)
  804. {
  805. taskstats_tgid_free(sig);
  806. sched_autogroup_exit(sig);
  807. /*
  808. * __mmdrop is not safe to call from softirq context on x86 due to
  809. * pgd_dtor so postpone it to the async context
  810. */
  811. if (sig->oom_mm)
  812. mmdrop_async(sig->oom_mm);
  813. kmem_cache_free(signal_cachep, sig);
  814. }
  815. static inline void put_signal_struct(struct signal_struct *sig)
  816. {
  817. if (refcount_dec_and_test(&sig->sigcnt))
  818. free_signal_struct(sig);
  819. }
/*
 * Final teardown of a task_struct: release the per-subsystem state hanging
 * off @tsk, drop its signal_struct reference and free the task itself.
 *
 * The WARN_ONs document the expected state on entry: the task has an
 * exit_state, its usage refcount reads zero, and it is not the caller.
 */
void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!tsk->exit_state);
	WARN_ON(refcount_read(&tsk->usage));
	WARN_ON(tsk == current);

	sched_ext_free(tsk);
	io_uring_free(tsk);
	cgroup_free(tsk);
	task_numa_free(tsk, true);
	security_task_free(tsk);
	exit_creds(tsk);
	delayacct_tsk_free(tsk);
	put_signal_struct(tsk->signal);
	sched_core_free(tsk);
	/* Must be last: frees the task_struct itself. */
	free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
  837. void __put_task_struct_rcu_cb(struct rcu_head *rhp)
  838. {
  839. struct task_struct *task = container_of(rhp, struct task_struct, rcu);
  840. __put_task_struct(task);
  841. }
  842. EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
  843. void __init __weak arch_task_cache_init(void) { }
  844. /*
  845. * set_max_threads
  846. */
  847. static void __init set_max_threads(unsigned int max_threads_suggested)
  848. {
  849. u64 threads;
  850. unsigned long nr_pages = memblock_estimated_nr_free_pages();
  851. /*
  852. * The number of threads shall be limited such that the thread
  853. * structures may only consume a small part of the available memory.
  854. */
  855. if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
  856. threads = MAX_THREADS;
  857. else
  858. threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
  859. (u64) THREAD_SIZE * 8UL);
  860. if (threads > max_threads_suggested)
  861. threads = max_threads_suggested;
  862. max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
  863. }
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/*
 * Initialized by the architecture. fork_init() passes this value as the
 * object size of the "task_struct" slab cache.
 */
int arch_task_struct_size __read_mostly;
#endif
  868. static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
  869. {
  870. /* Fetch thread_struct whitelist for the architecture. */
  871. arch_thread_struct_whitelist(offset, size);
  872. /*
  873. * Handle zero-sized whitelist or empty thread_struct, otherwise
  874. * adjust offset to position of thread_struct in task_struct.
  875. */
  876. if (unlikely(*size == 0))
  877. *offset = 0;
  878. else
  879. *offset += offsetof(struct task_struct, thread);
  880. }
/*
 * Boot-time initialisation for fork: creates the task_struct slab cache,
 * derives max_threads and the init task's default rlimits from it, and
 * initialises the related per-task facilities called below.
 */
void __init fork_init(void)
{
	int i;
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	0
#endif
	/* Align task_structs to at least an L1 cache line. */
	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
	unsigned long useroffset, usersize;

	/* create a slab on which task_structs can be allocated */
	task_struct_whitelist(&useroffset, &usersize);
	task_struct_cachep = kmem_cache_create_usercopy("task_struct",
			arch_task_struct_size, align,
			SLAB_PANIC|SLAB_ACCOUNT,
			useroffset, usersize, NULL);

	/* do the arch specific task caches init */
	arch_task_cache_init();

	/* max_threads must be set before the limits below are derived from it. */
	set_max_threads(MAX_THREADS);

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	/* RLIMIT_SIGPENDING starts out with the same values as RLIMIT_NPROC. */
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];

	for (i = 0; i < UCOUNT_COUNTS; i++)
		init_user_ns.ucount_max[i] = max_threads/2;

	/* The initial user namespace imposes no rlimit ceilings of its own. */
	set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
	set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
	set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
	set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);

#ifdef CONFIG_VMAP_STACK
	/* Register free_vm_stack_cache as the teardown callback of this hotplug state. */
	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
			  NULL, free_vm_stack_cache);
#endif

	scs_init();

	lockdep_init_task(&init_task);
	uprobes_init();
}
  916. int __weak arch_dup_task_struct(struct task_struct *dst,
  917. struct task_struct *src)
  918. {
  919. *dst = *src;
  920. return 0;
  921. }
  922. void set_task_stack_end_magic(struct task_struct *tsk)
  923. {
  924. unsigned long *stackend;
  925. stackend = end_of_stack(tsk);
  926. *stackend = STACK_END_MAGIC; /* for overflow detection */
  927. }
/*
 * dup_task_struct - allocate a new task_struct as a copy of @orig.
 * @orig: the task to copy.
 * @node: NUMA node to allocate on; NUMA_NO_NODE selects the node returned
 *        by tsk_fork_get_node(@orig).
 *
 * Copies @orig with arch_dup_task_struct(), gives the copy its own stack,
 * then resets the per-task fields below that must not be inherited.
 *
 * Return: the new task, or NULL on failure (everything allocated here is
 * released again).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
	struct task_struct *tsk;
	int err;

	if (node == NUMA_NO_NODE)
		node = tsk_fork_get_node(orig);
	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	err = arch_dup_task_struct(tsk, orig);
	if (err)
		goto free_tsk;

	err = alloc_thread_stack_node(tsk, node);
	if (err)
		goto free_tsk;

#ifdef CONFIG_THREAD_INFO_IN_TASK
	refcount_set(&tsk->stack_refcount, 1);
#endif
	account_kernel_stack(tsk, 1);

	err = scs_prepare(tsk, node);
	if (err)
		goto free_stack;

#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	set_task_stack_end_magic(tsk);
	clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
	tsk->stack_canary = get_random_canary();
#endif
	/*
	 * The structure copy left cpus_ptr aimed at @orig; if @orig uses its
	 * own embedded mask, point the copy at its own embedded mask too.
	 */
	if (orig->cpus_ptr == &orig->cpus_mask)
		tsk->cpus_ptr = &tsk->cpus_mask;
	dup_user_cpus_ptr(tsk, orig, node);

	/*
	 * One for the user space visible state that goes away when reaped.
	 * One for the scheduler.
	 */
	refcount_set(&tsk->rcu_users, 2);
	/* One for the rcu users */
	refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	/* Reset state copied from @orig that the new task must start without. */
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;
	tsk->wake_q.next = NULL;
	tsk->worker_private = NULL;

	kcov_task_init(tsk);
	kmsan_task_create(tsk);
	kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
	tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
	tsk->throttle_disk = NULL;
	tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_ARCH_HAS_CPU_PASID
	tsk->pasid_activated = 0;
#endif

#ifdef CONFIG_MEMCG
	tsk->active_memcg = NULL;
#endif

#ifdef CONFIG_CPU_SUP_INTEL
	tsk->reported_split_lock = 0;
#endif

#ifdef CONFIG_SCHED_MM_CID
	tsk->mm_cid = -1;
	tsk->last_mm_cid = -1;
	tsk->mm_cid_active = 0;
	tsk->migrate_from_cpu = -1;
#endif
	return tsk;

	/* Error unwinding, in reverse order of acquisition. */
free_stack:
	exit_task_stack_account(tsk);
	free_thread_stack(tsk);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}
  1017. __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
  1018. static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
  1019. static int __init coredump_filter_setup(char *s)
  1020. {
  1021. default_dump_filter =
  1022. (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
  1023. MMF_DUMP_FILTER_MASK;
  1024. return 1;
  1025. }
  1026. __setup("coredump_filter=", coredump_filter_setup);
  1027. #include <linux/init_task.h>
  1028. static void mm_init_aio(struct mm_struct *mm)
  1029. {
  1030. #ifdef CONFIG_AIO
  1031. spin_lock_init(&mm->ioctx_lock);
  1032. mm->ioctx_table = NULL;
  1033. #endif
  1034. }
  1035. static __always_inline void mm_clear_owner(struct mm_struct *mm,
  1036. struct task_struct *p)
  1037. {
  1038. #ifdef CONFIG_MEMCG
  1039. if (mm->owner == p)
  1040. WRITE_ONCE(mm->owner, NULL);
  1041. #endif
  1042. }
  1043. static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
  1044. {
  1045. #ifdef CONFIG_MEMCG
  1046. mm->owner = p;
  1047. #endif
  1048. }
  1049. static void mm_init_uprobes_state(struct mm_struct *mm)
  1050. {
  1051. #ifdef CONFIG_UPROBES
  1052. mm->uprobes_state.xol_area = NULL;
  1053. #endif
  1054. }
/*
 * mm_init - initialise an already allocated mm_struct.
 * @mm:      the mm to set up (zeroed by mm_alloc(), or a raw copy of the
 *           parent in dup_mm()).
 * @p:       task recorded as the owner and passed to init_new_context().
 * @user_ns: user namespace to take a reference on.
 *
 * Return: @mm on success. On failure everything acquired here is released,
 * @mm ITSELF IS FREED, and NULL is returned - callers must not touch or free
 * @mm afterwards.
 */
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
{
	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
	/* The caller starts out holding one mm_users and one mm_count reference. */
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	seqcount_init(&mm->write_protect_seq);
	mmap_init_lock(mm);
	INIT_LIST_HEAD(&mm->mmlist);
#ifdef CONFIG_PER_VMA_LOCK
	mm->mm_lock_seq = 0;
#endif
	mm_pgtables_bytes_init(mm);
	mm->map_count = 0;
	mm->locked_vm = 0;
	atomic64_set(&mm->pinned_vm, 0);
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	spin_lock_init(&mm->arg_lock);
	mm_init_cpumask(mm);
	mm_init_aio(mm);
	mm_init_owner(mm, p);
	mm_pasid_init(mm);
	RCU_INIT_POINTER(mm->exe_file, NULL);
	mmu_notifier_subscriptions_init(mm);
	init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
	mm->pmd_huge_pte = NULL;
#endif
	mm_init_uprobes_state(mm);
	hugetlb_count_init(mm);

	/*
	 * Flags are derived from the creating task's mm when it has one;
	 * otherwise the boot-time default dump filter is used.
	 */
	if (current->mm) {
		mm->flags = mmf_init_flags(current->mm->flags);
		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
	} else {
		mm->flags = default_dump_filter;
		mm->def_flags = 0;
	}

	/* Fallible steps; each failure unwinds the ones before it. */
	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

	if (init_new_context(p, mm))
		goto fail_nocontext;

	if (mm_alloc_cid(mm))
		goto fail_cid;

	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
				     NR_MM_COUNTERS))
		goto fail_pcpu;

	mm->user_ns = get_user_ns(user_ns);
	lru_gen_init_mm(mm);
	return mm;

fail_pcpu:
	mm_destroy_cid(mm);
fail_cid:
	destroy_context(mm);
fail_nocontext:
	mm_free_pgd(mm);
fail_nopgd:
	/* The mm itself is released here, not by the caller. */
	free_mm(mm);
	return NULL;
}
  1116. /*
  1117. * Allocate and initialize an mm_struct.
  1118. */
  1119. struct mm_struct *mm_alloc(void)
  1120. {
  1121. struct mm_struct *mm;
  1122. mm = allocate_mm();
  1123. if (!mm)
  1124. return NULL;
  1125. memset(mm, 0, sizeof(*mm));
  1126. return mm_init(mm, current, current_user_ns());
  1127. }
  1128. EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
/*
 * Tear down the user-visible part of @mm once mm_users has reached zero
 * (asserted below), then drop one mm_count reference via mmdrop().
 */
static inline void __mmput(struct mm_struct *mm)
{
	VM_BUG_ON(atomic_read(&mm->mm_users));

	uprobe_clear_state(mm);
	exit_aio(mm);
	ksm_exit(mm);
	khugepaged_exit(mm); /* must run before exit_mmap */
	exit_mmap(mm);
	mm_put_huge_zero_folio(mm);
	/* Drops the exe_file reference and re-allows write access to it. */
	set_mm_exe_file(mm, NULL);
	if (!list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		list_del(&mm->mmlist);
		spin_unlock(&mmlist_lock);
	}
	/* Release the binfmt module reference, if one is recorded. */
	if (mm->binfmt)
		module_put(mm->binfmt->module);
	lru_gen_del_mm(mm);
	mmdrop(mm);
}
  1149. /*
  1150. * Decrement the use count and release all resources for an mm.
  1151. */
  1152. void mmput(struct mm_struct *mm)
  1153. {
  1154. might_sleep();
  1155. if (atomic_dec_and_test(&mm->mm_users))
  1156. __mmput(mm);
  1157. }
  1158. EXPORT_SYMBOL_GPL(mmput);
  1159. #ifdef CONFIG_MMU
  1160. static void mmput_async_fn(struct work_struct *work)
  1161. {
  1162. struct mm_struct *mm = container_of(work, struct mm_struct,
  1163. async_put_work);
  1164. __mmput(mm);
  1165. }
  1166. void mmput_async(struct mm_struct *mm)
  1167. {
  1168. if (atomic_dec_and_test(&mm->mm_users)) {
  1169. INIT_WORK(&mm->async_put_work, mmput_async_fn);
  1170. schedule_work(&mm->async_put_work);
  1171. }
  1172. }
  1173. EXPORT_SYMBOL_GPL(mmput_async);
  1174. #endif
  1175. /**
  1176. * set_mm_exe_file - change a reference to the mm's executable file
  1177. * @mm: The mm to change.
  1178. * @new_exe_file: The new file to use.
  1179. *
  1180. * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
  1181. *
  1182. * Main users are mmput() and sys_execve(). Callers prevent concurrent
  1183. * invocations: in mmput() nobody alive left, in execve it happens before
  1184. * the new mm is made visible to anyone.
  1185. *
  1186. * Can only fail if new_exe_file != NULL.
  1187. */
  1188. int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
  1189. {
  1190. struct file *old_exe_file;
  1191. /*
  1192. * It is safe to dereference the exe_file without RCU as
  1193. * this function is only called if nobody else can access
  1194. * this mm -- see comment above for justification.
  1195. */
  1196. old_exe_file = rcu_dereference_raw(mm->exe_file);
  1197. if (new_exe_file) {
  1198. /*
  1199. * We expect the caller (i.e., sys_execve) to already denied
  1200. * write access, so this is unlikely to fail.
  1201. */
  1202. if (unlikely(deny_write_access(new_exe_file)))
  1203. return -EACCES;
  1204. get_file(new_exe_file);
  1205. }
  1206. rcu_assign_pointer(mm->exe_file, new_exe_file);
  1207. if (old_exe_file) {
  1208. allow_write_access(old_exe_file);
  1209. fput(old_exe_file);
  1210. }
  1211. return 0;
  1212. }
  1213. /**
  1214. * replace_mm_exe_file - replace a reference to the mm's executable file
  1215. * @mm: The mm to change.
  1216. * @new_exe_file: The new file to use.
  1217. *
  1218. * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
  1219. *
  1220. * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
  1221. */
  1222. int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
  1223. {
  1224. struct vm_area_struct *vma;
  1225. struct file *old_exe_file;
  1226. int ret = 0;
  1227. /* Forbid mm->exe_file change if old file still mapped. */
  1228. old_exe_file = get_mm_exe_file(mm);
  1229. if (old_exe_file) {
  1230. VMA_ITERATOR(vmi, mm, 0);
  1231. mmap_read_lock(mm);
  1232. for_each_vma(vmi, vma) {
  1233. if (!vma->vm_file)
  1234. continue;
  1235. if (path_equal(&vma->vm_file->f_path,
  1236. &old_exe_file->f_path)) {
  1237. ret = -EBUSY;
  1238. break;
  1239. }
  1240. }
  1241. mmap_read_unlock(mm);
  1242. fput(old_exe_file);
  1243. if (ret)
  1244. return ret;
  1245. }
  1246. ret = deny_write_access(new_exe_file);
  1247. if (ret)
  1248. return -EACCES;
  1249. get_file(new_exe_file);
  1250. /* set the new file */
  1251. mmap_write_lock(mm);
  1252. old_exe_file = rcu_dereference_raw(mm->exe_file);
  1253. rcu_assign_pointer(mm->exe_file, new_exe_file);
  1254. mmap_write_unlock(mm);
  1255. if (old_exe_file) {
  1256. allow_write_access(old_exe_file);
  1257. fput(old_exe_file);
  1258. }
  1259. return 0;
  1260. }
  1261. /**
  1262. * get_mm_exe_file - acquire a reference to the mm's executable file
  1263. * @mm: The mm of interest.
  1264. *
  1265. * Returns %NULL if mm has no associated executable file.
  1266. * User must release file via fput().
  1267. */
  1268. struct file *get_mm_exe_file(struct mm_struct *mm)
  1269. {
  1270. struct file *exe_file;
  1271. rcu_read_lock();
  1272. exe_file = get_file_rcu(&mm->exe_file);
  1273. rcu_read_unlock();
  1274. return exe_file;
  1275. }
  1276. /**
  1277. * get_task_exe_file - acquire a reference to the task's executable file
  1278. * @task: The task.
  1279. *
  1280. * Returns %NULL if task's mm (if any) has no associated executable file or
  1281. * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
  1282. * User must release file via fput().
  1283. */
  1284. struct file *get_task_exe_file(struct task_struct *task)
  1285. {
  1286. struct file *exe_file = NULL;
  1287. struct mm_struct *mm;
  1288. task_lock(task);
  1289. mm = task->mm;
  1290. if (mm) {
  1291. if (!(task->flags & PF_KTHREAD))
  1292. exe_file = get_mm_exe_file(mm);
  1293. }
  1294. task_unlock(task);
  1295. return exe_file;
  1296. }
  1297. /**
  1298. * get_task_mm - acquire a reference to the task's mm
  1299. * @task: The task.
  1300. *
  1301. * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
  1302. * this kernel workthread has transiently adopted a user mm with use_mm,
  1303. * to do its AIO) is not set and if so returns a reference to it, after
  1304. * bumping up the use count. User must release the mm via mmput()
  1305. * after use. Typically used by /proc and ptrace.
  1306. */
  1307. struct mm_struct *get_task_mm(struct task_struct *task)
  1308. {
  1309. struct mm_struct *mm;
  1310. if (task->flags & PF_KTHREAD)
  1311. return NULL;
  1312. task_lock(task);
  1313. mm = task->mm;
  1314. if (mm)
  1315. mmget(mm);
  1316. task_unlock(task);
  1317. return mm;
  1318. }
  1319. EXPORT_SYMBOL_GPL(get_task_mm);
  1320. struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
  1321. {
  1322. struct mm_struct *mm;
  1323. int err;
  1324. err = down_read_killable(&task->signal->exec_update_lock);
  1325. if (err)
  1326. return ERR_PTR(err);
  1327. mm = get_task_mm(task);
  1328. if (mm && mm != current->mm &&
  1329. !ptrace_may_access(task, mode)) {
  1330. mmput(mm);
  1331. mm = ERR_PTR(-EACCES);
  1332. }
  1333. up_read(&task->signal->exec_update_lock);
  1334. return mm;
  1335. }
  1336. static void complete_vfork_done(struct task_struct *tsk)
  1337. {
  1338. struct completion *vfork;
  1339. task_lock(tsk);
  1340. vfork = tsk->vfork_done;
  1341. if (likely(vfork)) {
  1342. tsk->vfork_done = NULL;
  1343. complete(vfork);
  1344. }
  1345. task_unlock(tsk);
  1346. }
  1347. static int wait_for_vfork_done(struct task_struct *child,
  1348. struct completion *vfork)
  1349. {
  1350. unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
  1351. int killed;
  1352. cgroup_enter_frozen();
  1353. killed = wait_for_completion_state(vfork, state);
  1354. cgroup_leave_frozen(false);
  1355. if (killed) {
  1356. task_lock(child);
  1357. child->vfork_done = NULL;
  1358. task_unlock(child);
  1359. }
  1360. put_task_struct(child);
  1361. return killed;
  1362. }
  1363. /* Please note the differences between mmput and mm_release.
  1364. * mmput is called whenever we stop holding onto a mm_struct,
  1365. * error success whatever.
  1366. *
  1367. * mm_release is called after a mm_struct has been removed
  1368. * from the current process.
  1369. *
  1370. * This difference is important for error handling, when we
  1371. * only half set up a mm_struct for a new process and need to restore
  1372. * the old one. Because we mmput the new mm_struct before
  1373. * restoring the old one. . .
  1374. * Eric Biederman 10 January 1998
  1375. */
  1376. static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
  1377. {
  1378. uprobe_free_utask(tsk);
  1379. /* Get rid of any cached register state */
  1380. deactivate_mm(tsk, mm);
  1381. /*
  1382. * Signal userspace if we're not exiting with a core dump
  1383. * because we want to leave the value intact for debugging
  1384. * purposes.
  1385. */
  1386. if (tsk->clear_child_tid) {
  1387. if (atomic_read(&mm->mm_users) > 1) {
  1388. /*
  1389. * We don't check the error code - if userspace has
  1390. * not set up a proper pointer then tough luck.
  1391. */
  1392. put_user(0, tsk->clear_child_tid);
  1393. do_futex(tsk->clear_child_tid, FUTEX_WAKE,
  1394. 1, NULL, NULL, 0, 0);
  1395. }
  1396. tsk->clear_child_tid = NULL;
  1397. }
  1398. /*
  1399. * All done, finally we can wake up parent and return this mm to him.
  1400. * Also kthread_stop() uses this completion for synchronization.
  1401. */
  1402. if (tsk->vfork_done)
  1403. complete_vfork_done(tsk);
  1404. }
  1405. void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
  1406. {
  1407. futex_exit_release(tsk);
  1408. mm_release(tsk, mm);
  1409. }
  1410. void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
  1411. {
  1412. futex_exec_release(tsk);
  1413. mm_release(tsk, mm);
  1414. }
  1415. /**
  1416. * dup_mm() - duplicates an existing mm structure
  1417. * @tsk: the task_struct with which the new mm will be associated.
  1418. * @oldmm: the mm to duplicate.
  1419. *
  1420. * Allocates a new mm structure and duplicates the provided @oldmm structure
  1421. * content into it.
  1422. *
  1423. * Return: the duplicated mm or NULL on failure.
  1424. */
  1425. static struct mm_struct *dup_mm(struct task_struct *tsk,
  1426. struct mm_struct *oldmm)
  1427. {
  1428. struct mm_struct *mm;
  1429. int err;
  1430. mm = allocate_mm();
  1431. if (!mm)
  1432. goto fail_nomem;
  1433. memcpy(mm, oldmm, sizeof(*mm));
  1434. if (!mm_init(mm, tsk, mm->user_ns))
  1435. goto fail_nomem;
  1436. uprobe_start_dup_mmap();
  1437. err = dup_mmap(mm, oldmm);
  1438. if (err)
  1439. goto free_pt;
  1440. uprobe_end_dup_mmap();
  1441. mm->hiwater_rss = get_mm_rss(mm);
  1442. mm->hiwater_vm = mm->total_vm;
  1443. if (mm->binfmt && !try_module_get(mm->binfmt->module))
  1444. goto free_pt;
  1445. return mm;
  1446. free_pt:
  1447. /* don't put binfmt in mmput, we haven't got module yet */
  1448. mm->binfmt = NULL;
  1449. mm_init_owner(mm, NULL);
  1450. mmput(mm);
  1451. if (err)
  1452. uprobe_end_dup_mmap();
  1453. fail_nomem:
  1454. return NULL;
  1455. }
  1456. static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
  1457. {
  1458. struct mm_struct *mm, *oldmm;
  1459. tsk->min_flt = tsk->maj_flt = 0;
  1460. tsk->nvcsw = tsk->nivcsw = 0;
  1461. #ifdef CONFIG_DETECT_HUNG_TASK
  1462. tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
  1463. tsk->last_switch_time = 0;
  1464. #endif
  1465. tsk->mm = NULL;
  1466. tsk->active_mm = NULL;
  1467. /*
  1468. * Are we cloning a kernel thread?
  1469. *
  1470. * We need to steal a active VM for that..
  1471. */
  1472. oldmm = current->mm;
  1473. if (!oldmm)
  1474. return 0;
  1475. if (clone_flags & CLONE_VM) {
  1476. mmget(oldmm);
  1477. mm = oldmm;
  1478. } else {
  1479. mm = dup_mm(tsk, current->mm);
  1480. if (!mm)
  1481. return -ENOMEM;
  1482. }
  1483. tsk->mm = mm;
  1484. tsk->active_mm = mm;
  1485. sched_mm_cid_fork(tsk);
  1486. return 0;
  1487. }
  1488. static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
  1489. {
  1490. struct fs_struct *fs = current->fs;
  1491. if (clone_flags & CLONE_FS) {
  1492. /* tsk->fs is already what we want */
  1493. spin_lock(&fs->lock);
  1494. /* "users" and "in_exec" locked for check_unsafe_exec() */
  1495. if (fs->in_exec) {
  1496. spin_unlock(&fs->lock);
  1497. return -EAGAIN;
  1498. }
  1499. fs->users++;
  1500. spin_unlock(&fs->lock);
  1501. return 0;
  1502. }
  1503. tsk->fs = copy_fs_struct(fs);
  1504. if (!tsk->fs)
  1505. return -ENOMEM;
  1506. return 0;
  1507. }
  1508. static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
  1509. int no_files)
  1510. {
  1511. struct files_struct *oldf, *newf;
  1512. /*
  1513. * A background process may not have any files ...
  1514. */
  1515. oldf = current->files;
  1516. if (!oldf)
  1517. return 0;
  1518. if (no_files) {
  1519. tsk->files = NULL;
  1520. return 0;
  1521. }
  1522. if (clone_flags & CLONE_FILES) {
  1523. atomic_inc(&oldf->count);
  1524. return 0;
  1525. }
  1526. newf = dup_fd(oldf, NULL);
  1527. if (IS_ERR(newf))
  1528. return PTR_ERR(newf);
  1529. tsk->files = newf;
  1530. return 0;
  1531. }
  1532. static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
  1533. {
  1534. struct sighand_struct *sig;
  1535. if (clone_flags & CLONE_SIGHAND) {
  1536. refcount_inc(&current->sighand->count);
  1537. return 0;
  1538. }
  1539. sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
  1540. RCU_INIT_POINTER(tsk->sighand, sig);
  1541. if (!sig)
  1542. return -ENOMEM;
  1543. refcount_set(&sig->count, 1);
  1544. spin_lock_irq(&current->sighand->siglock);
  1545. memcpy(sig->action, current->sighand->action, sizeof(sig->action));
  1546. spin_unlock_irq(&current->sighand->siglock);
  1547. /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
  1548. if (clone_flags & CLONE_CLEAR_SIGHAND)
  1549. flush_signal_handlers(tsk, 0);
  1550. return 0;
  1551. }
  1552. void __cleanup_sighand(struct sighand_struct *sighand)
  1553. {
  1554. if (refcount_dec_and_test(&sighand->count)) {
  1555. signalfd_cleanup(sighand);
  1556. /*
  1557. * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
  1558. * without an RCU grace period, see __lock_task_sighand().
  1559. */
  1560. kmem_cache_free(sighand_cachep, sighand);
  1561. }
  1562. }
  1563. /*
  1564. * Initialize POSIX timer handling for a thread group.
  1565. */
  1566. static void posix_cpu_timers_init_group(struct signal_struct *sig)
  1567. {
  1568. struct posix_cputimers *pct = &sig->posix_cputimers;
  1569. unsigned long cpu_limit;
  1570. cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
  1571. posix_cputimers_group_init(pct, cpu_limit);
  1572. }
  1573. static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  1574. {
  1575. struct signal_struct *sig;
  1576. if (clone_flags & CLONE_THREAD)
  1577. return 0;
  1578. sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
  1579. tsk->signal = sig;
  1580. if (!sig)
  1581. return -ENOMEM;
  1582. sig->nr_threads = 1;
  1583. sig->quick_threads = 1;
  1584. atomic_set(&sig->live, 1);
  1585. refcount_set(&sig->sigcnt, 1);
  1586. /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
  1587. sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
  1588. tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
  1589. init_waitqueue_head(&sig->wait_chldexit);
  1590. sig->curr_target = tsk;
  1591. init_sigpending(&sig->shared_pending);
  1592. INIT_HLIST_HEAD(&sig->multiprocess);
  1593. seqlock_init(&sig->stats_lock);
  1594. prev_cputime_init(&sig->prev_cputime);
  1595. #ifdef CONFIG_POSIX_TIMERS
  1596. INIT_HLIST_HEAD(&sig->posix_timers);
  1597. hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  1598. sig->real_timer.function = it_real_fn;
  1599. #endif
  1600. task_lock(current->group_leader);
  1601. memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
  1602. task_unlock(current->group_leader);
  1603. posix_cpu_timers_init_group(sig);
  1604. tty_audit_fork(sig);
  1605. sched_autogroup_fork(sig);
  1606. sig->oom_score_adj = current->signal->oom_score_adj;
  1607. sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  1608. mutex_init(&sig->cred_guard_mutex);
  1609. init_rwsem(&sig->exec_update_lock);
  1610. return 0;
  1611. }
  1612. static void copy_seccomp(struct task_struct *p)
  1613. {
  1614. #ifdef CONFIG_SECCOMP
  1615. /*
  1616. * Must be called with sighand->lock held, which is common to
  1617. * all threads in the group. Holding cred_guard_mutex is not
  1618. * needed because this new task is not yet running and cannot
  1619. * be racing exec.
  1620. */
  1621. assert_spin_locked(&current->sighand->siglock);
  1622. /* Ref-count the new filter user, and assign it. */
  1623. get_seccomp_filter(current);
  1624. p->seccomp = current->seccomp;
  1625. /*
  1626. * Explicitly enable no_new_privs here in case it got set
  1627. * between the task_struct being duplicated and holding the
  1628. * sighand lock. The seccomp state and nnp must be in sync.
  1629. */
  1630. if (task_no_new_privs(current))
  1631. task_set_no_new_privs(p);
  1632. /*
  1633. * If the parent gained a seccomp mode after copying thread
  1634. * flags and between before we held the sighand lock, we have
  1635. * to manually enable the seccomp thread flag here.
  1636. */
  1637. if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
  1638. set_task_syscall_work(p, SECCOMP);
  1639. #endif
  1640. }
  1641. SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
  1642. {
  1643. current->clear_child_tid = tidptr;
  1644. return task_pid_vnr(current);
  1645. }
  1646. static void rt_mutex_init_task(struct task_struct *p)
  1647. {
  1648. raw_spin_lock_init(&p->pi_lock);
  1649. #ifdef CONFIG_RT_MUTEXES
  1650. p->pi_waiters = RB_ROOT_CACHED;
  1651. p->pi_top_task = NULL;
  1652. p->pi_blocked_on = NULL;
  1653. #endif
  1654. }
  1655. static inline void init_task_pid_links(struct task_struct *task)
  1656. {
  1657. enum pid_type type;
  1658. for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
  1659. INIT_HLIST_NODE(&task->pid_links[type]);
  1660. }
  1661. static inline void
  1662. init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
  1663. {
  1664. if (type == PIDTYPE_PID)
  1665. task->thread_pid = pid;
  1666. else
  1667. task->signal->pids[type] = pid;
  1668. }
  1669. static inline void rcu_copy_process(struct task_struct *p)
  1670. {
  1671. #ifdef CONFIG_PREEMPT_RCU
  1672. p->rcu_read_lock_nesting = 0;
  1673. p->rcu_read_unlock_special.s = 0;
  1674. p->rcu_blocked_node = NULL;
  1675. INIT_LIST_HEAD(&p->rcu_node_entry);
  1676. #endif /* #ifdef CONFIG_PREEMPT_RCU */
  1677. #ifdef CONFIG_TASKS_RCU
  1678. p->rcu_tasks_holdout = false;
  1679. INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
  1680. p->rcu_tasks_idle_cpu = -1;
  1681. INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
  1682. #endif /* #ifdef CONFIG_TASKS_RCU */
  1683. #ifdef CONFIG_TASKS_TRACE_RCU
  1684. p->trc_reader_nesting = 0;
  1685. p->trc_reader_special.s = 0;
  1686. INIT_LIST_HEAD(&p->trc_holdout_list);
  1687. INIT_LIST_HEAD(&p->trc_blkd_node);
  1688. #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
  1689. }
  1690. /**
  1691. * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  1692. * @pid: the struct pid for which to create a pidfd
  1693. * @flags: flags of the new @pidfd
  1694. * @ret: Where to return the file for the pidfd.
  1695. *
  1696. * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  1697. * caller's file descriptor table. The pidfd is reserved but not installed yet.
  1698. *
  1699. * The helper doesn't perform checks on @pid which makes it useful for pidfds
  1700. * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
  1701. * pidfd file are prepared.
  1702. *
  1703. * If this function returns successfully the caller is responsible to either
  1704. * call fd_install() passing the returned pidfd and pidfd file as arguments in
  1705. * order to install the pidfd into its file descriptor table or they must use
  1706. * put_unused_fd() and fput() on the returned pidfd and pidfd file
  1707. * respectively.
  1708. *
  1709. * This function is useful when a pidfd must already be reserved but there
  1710. * might still be points of failure afterwards and the caller wants to ensure
  1711. * that no pidfd is leaked into its file descriptor table.
  1712. *
  1713. * Return: On success, a reserved pidfd is returned from the function and a new
  1714. * pidfd file is returned in the last argument to the function. On
  1715. * error, a negative error code is returned from the function and the
  1716. * last argument remains unchanged.
  1717. */
  1718. static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
  1719. {
  1720. int pidfd;
  1721. struct file *pidfd_file;
  1722. pidfd = get_unused_fd_flags(O_CLOEXEC);
  1723. if (pidfd < 0)
  1724. return pidfd;
  1725. pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
  1726. if (IS_ERR(pidfd_file)) {
  1727. put_unused_fd(pidfd);
  1728. return PTR_ERR(pidfd_file);
  1729. }
  1730. /*
  1731. * anon_inode_getfile() ignores everything outside of the
  1732. * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
  1733. */
  1734. pidfd_file->f_flags |= (flags & PIDFD_THREAD);
  1735. *ret = pidfd_file;
  1736. return pidfd;
  1737. }
  1738. /**
  1739. * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  1740. * @pid: the struct pid for which to create a pidfd
  1741. * @flags: flags of the new @pidfd
  1742. * @ret: Where to return the pidfd.
  1743. *
  1744. * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  1745. * caller's file descriptor table. The pidfd is reserved but not installed yet.
  1746. *
  1747. * The helper verifies that @pid is still in use, without PIDFD_THREAD the
  1748. * task identified by @pid must be a thread-group leader.
  1749. *
  1750. * If this function returns successfully the caller is responsible to either
  1751. * call fd_install() passing the returned pidfd and pidfd file as arguments in
  1752. * order to install the pidfd into its file descriptor table or they must use
  1753. * put_unused_fd() and fput() on the returned pidfd and pidfd file
  1754. * respectively.
  1755. *
  1756. * This function is useful when a pidfd must already be reserved but there
  1757. * might still be points of failure afterwards and the caller wants to ensure
  1758. * that no pidfd is leaked into its file descriptor table.
  1759. *
  1760. * Return: On success, a reserved pidfd is returned from the function and a new
  1761. * pidfd file is returned in the last argument to the function. On
  1762. * error, a negative error code is returned from the function and the
  1763. * last argument remains unchanged.
  1764. */
  1765. int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
  1766. {
  1767. bool thread = flags & PIDFD_THREAD;
  1768. if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
  1769. return -EINVAL;
  1770. return __pidfd_prepare(pid, flags, ret);
  1771. }
  1772. static void __delayed_free_task(struct rcu_head *rhp)
  1773. {
  1774. struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  1775. free_task(tsk);
  1776. }
  1777. static __always_inline void delayed_free_task(struct task_struct *tsk)
  1778. {
  1779. if (IS_ENABLED(CONFIG_MEMCG))
  1780. call_rcu(&tsk->rcu, __delayed_free_task);
  1781. else
  1782. free_task(tsk);
  1783. }
  1784. static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
  1785. {
  1786. /* Skip if kernel thread */
  1787. if (!tsk->mm)
  1788. return;
  1789. /* Skip if spawning a thread or using vfork */
  1790. if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
  1791. return;
  1792. /* We need to synchronize with __set_oom_adj */
  1793. mutex_lock(&oom_adj_mutex);
  1794. set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
  1795. /* Update the values in case they were changed after copy_signal */
  1796. tsk->signal->oom_score_adj = current->signal->oom_score_adj;
  1797. tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
  1798. mutex_unlock(&oom_adj_mutex);
  1799. }
  1800. #ifdef CONFIG_RV
  1801. static void rv_task_fork(struct task_struct *p)
  1802. {
  1803. int i;
  1804. for (i = 0; i < RV_PER_TASK_MONITORS; i++)
  1805. p->rv[i].da_mon.monitoring = false;
  1806. }
  1807. #else
  1808. #define rv_task_fork(p) do {} while (0)
  1809. #endif
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * @pid:   used as-is when it is &init_struct_pid; otherwise it is
 *         overwritten with a struct pid obtained from alloc_pid().
 * @trace: OR-ed with the CLONE_PTRACE test and handed to
 *         ptrace_init_task().
 * @node:  passed through to dup_task_struct().
 * @args:  clone arguments; args->flags supplies the clone flags.
 *
 * Return: the new task on success, or an ERR_PTR() holding a negative
 * errno. On failure the bad_fork_* labels at the end of the function
 * release what has been set up so far, falling through from the most
 * recently acquired resource to the earliest.
 */
__latent_entropy struct task_struct *copy_process(
					struct pid *pid,
					int trace,
					int node,
					struct kernel_clone_args *args)
{
	int pidfd = -1, retval;
	struct task_struct *p;
	struct multiprocess_signals delayed;
	struct file *pidfile = NULL;
	const u64 clone_flags = args->flags;
	struct nsproxy *nsp = current->nsproxy;

	/*
	 * Flag sanity checks. All of these return directly because nothing
	 * has been allocated or locked yet.
	 */

	/*
	 * Don't allow sharing the root directory with processes in a different
	 * namespace
	 */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group with the forking task.
	 */
	if (clone_flags & CLONE_THREAD) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}

	if (clone_flags & CLONE_PIDFD) {
		/*
		 * - CLONE_DETACHED is blocked so that we can potentially
		 *   reuse it later for CLONE_PIDFD.
		 */
		if (clone_flags & CLONE_DETACHED)
			return ERR_PTR(-EINVAL);
	}

	/*
	 * Force any signals received before this point to be delivered
	 * before the fork happens.  Collect up signals sent to multiple
	 * processes that happen during the fork and delay them so that
	 * they appear to happen after the fork.
	 */
	sigemptyset(&delayed.signal);
	INIT_HLIST_NODE(&delayed.node);

	spin_lock_irq(&current->sighand->siglock);
	if (!(clone_flags & CLONE_THREAD))
		hlist_add_head(&delayed.node, &current->signal->multiprocess);
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	/* A signal is already pending: bail out with -ERESTARTNOINTR. */
	retval = -ERESTARTNOINTR;
	if (task_sigpending(current))
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current, node);
	if (!p)
		goto fork_out;
	/* PF_KTHREAD is cleared first and only set back if args ask for it. */
	p->flags &= ~PF_KTHREAD;
	if (args->kthread)
		p->flags |= PF_KTHREAD;
	if (args->user_worker) {
		/*
		 * Mark us a user worker, and block any signal that isn't
		 * fatal or STOP
		 */
		p->flags |= PF_USER_WORKER;
		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
	}
	if (args->io_thread)
		p->flags |= PF_IO_WORKER;

	/* An explicit name, when given, replaces the comm of the new task. */
	if (args->name)
		strscpy_pad(p->comm, args->name, sizeof(p->comm));

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

	ftrace_graph_init_task(p);

	rt_mutex_init_task(p);

	lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * Over the RLIMIT_NPROC limit: fail with -EAGAIN unless the user is
	 * INIT_USER or the caller has CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
	 */
	retval = -EAGAIN;
	if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_cleanup_count;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	if (data_race(nr_threads >= max_threads))
		goto bad_fork_cleanup_count;

	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
	p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	/* Reset accounting state in the new task. */
	p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	p->utimescaled = p->stimescaled = 0;
#endif
	prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqcount_init(&p->vtime.seqcount);
	p->vtime.starttime = 0;
	p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
	p->io_uring = NULL;
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
	p->psi_flags = 0;
#endif

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cputimers_init(&p->posix_cputimers);
	tick_dep_init_task(p);

	p->io_context = NULL;
	audit_set_context(p, NULL);
	cgroup_fork(p);
	/*
	 * NOTE(review): retval is still -EAGAIN from the max_threads check
	 * when set_kthread_struct() fails here - confirm that is intended.
	 */
	if (args->kthread) {
		if (!set_kthread_struct(p))
			goto bad_fork_cleanup_delayacct;
	}
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		/* Never leave an ERR_PTR in the task; later cleanup sees NULL. */
		p->mempolicy = NULL;
		goto bad_fork_cleanup_delayacct;
	}
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	memset(&p->irqtrace, 0, sizeof(p->irqtrace));
	p->irqtrace.hardirq_disable_ip	= _THIS_IP_;
	p->irqtrace.softirq_enable_ip	= _THIS_IP_;
	p->softirqs_enabled		= 1;
	p->softirq_context		= 0;
#endif

	p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
	lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
	RCU_INIT_POINTER(p->bpf_storage, NULL);
	p->bpf_ctx = NULL;
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p, clone_flags);
	if (retval)
		goto bad_fork_sched_cancel_fork;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/*
	 * copy all the process information
	 *
	 * Each step below jumps, on failure, to the label that undoes the
	 * previous successful step; the labels then fall through in order.
	 */
	shm_init_task(p);
	retval = security_task_alloc(p, clone_flags);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_security;
	retval = copy_files(clone_flags, p, args->no_files);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(p, args);
	if (retval)
		goto bad_fork_cleanup_io;

	stackleak_task_init(p);

	/* A caller-supplied &init_struct_pid is used as-is; else allocate. */
	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
				args->set_tid_size);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_thread;
		}
	}

	/*
	 * This has to happen after we've potentially unshared the file
	 * descriptor table (so that the pidfd doesn't leak into the child
	 * if the fd table isn't shared).
	 */
	if (clone_flags & CLONE_PIDFD) {
		int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;

		/* Note that no task has been attached to @pid yet. */
		retval = __pidfd_prepare(pid, flags, &pidfile);
		if (retval < 0)
			goto bad_fork_free_pid;
		pidfd = retval;

		/* Report the reserved fd number to user space right away. */
		retval = put_user(pidfd, args->pidfd);
		if (retval)
			goto bad_fork_put_pidfd;
	}

#ifdef CONFIG_BLOCK
	p->plug = NULL;
#endif
	futex_init_task(p);

	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		sas_ss_reset(p);

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
	clear_task_syscall_work(p, SYSCALL_EMU);
#endif
	clear_tsk_latency_tracing(p);

	/* ok, now we should be set up.. */
	p->pid = pid_nr(pid);
	if (clone_flags & CLONE_THREAD) {
		/* A new thread joins the current task's thread group. */
		p->group_leader = current->group_leader;
		p->tgid = current->tgid;
	} else {
		/* Otherwise the new task leads its own group. */
		p->group_leader = p;
		p->tgid = p->pid;
	}

	p->nr_dirtied = 0;
	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
	p->dirty_paused_when = 0;

	p->pdeath_signal = 0;
	p->task_works = NULL;
	clear_posix_cputimers_work(p);

#ifdef CONFIG_KRETPROBES
	p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
	p->rethooks.first = NULL;
#endif

	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted that the new process's css_set can be changed
	 * between here and cgroup_post_fork() if an organisation operation is in
	 * progress.
	 */
	retval = cgroup_can_fork(p, args);
	if (retval)
		goto bad_fork_put_pidfd;

	/*
	 * Now that the cgroups are pinned, re-clone the parent cgroup and put
	 * the new task on the correct runqueue. All this *before* the task
	 * becomes visible.
	 *
	 * This isn't part of ->can_fork() because while the re-cloning is
	 * cgroup specific, it unconditionally needs to place the task on a
	 * runqueue.
	 */
	retval = sched_cgroup_fork(p, args);
	if (retval)
		goto bad_fork_cancel_cgroup;

	/*
	 * From this point on we must avoid any synchronous user-space
	 * communication until we take the tasklist-lock. In particular, we do
	 * not want user-space to be able to predict the process start-time by
	 * stalling fork(2) after we recorded the start_time but before it is
	 * visible to the system.
	 */

	p->start_time = ktime_get_ns();
	p->start_boottime = ktime_get_boottime_ns();

	/*
	 * Make it visible to the rest of the system, but dont wake it up yet.
	 * Need tasklist lock for parent etc handling!
	 */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
		if (clone_flags & CLONE_THREAD)
			p->exit_signal = -1;
		else
			p->exit_signal = current->group_leader->exit_signal;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
		p->exit_signal = args->exit_signal;
	}

	klp_copy_process(p);

	sched_core_fork(p);

	/* siglock nests inside tasklist_lock here; both are dropped below. */
	spin_lock(&current->sighand->siglock);

	rv_task_fork(p);

	rseq_fork(p, clone_flags);

	/* Don't start children in a dying pid namespace */
	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
		retval = -ENOMEM;
		goto bad_fork_core_free;
	}

	/* Let kill terminate clone/fork in the middle */
	if (fatal_signal_pending(current)) {
		retval = -EINTR;
		goto bad_fork_core_free;
	}

	/* No more failure paths after this point. */

	/*
	 * Copy seccomp details explicitly here, in case they were changed
	 * before holding sighand lock.
	 */
	copy_seccomp(p);

	init_task_pid_links(p);
	/* Tasks with pid number 0 skip all of the pid/list attachment below. */
	if (likely(p->pid)) {
		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

		init_task_pid(p, PIDTYPE_PID, pid);
		if (thread_group_leader(p)) {
			init_task_pid(p, PIDTYPE_TGID, pid);
			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
			init_task_pid(p, PIDTYPE_SID, task_session(current));

			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}
			/* Hand over the signals collected while forking. */
			p->signal->shared_pending.signal = delayed.signal;
			p->signal->tty = tty_kref_get(current->signal->tty);
			/*
			 * Inherit has_child_subreaper flag under the same
			 * tasklist_lock with adding child to the process tree
			 * for propagate_has_child_subreaper optimization.
			 */
			p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
							 p->real_parent->signal->is_child_subreaper;
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			attach_pid(p, PIDTYPE_TGID);
			attach_pid(p, PIDTYPE_PGID);
			attach_pid(p, PIDTYPE_SID);
			__this_cpu_inc(process_counts);
		} else {
			/* New thread: account for it in the shared signal_struct. */
			current->signal->nr_threads++;
			current->signal->quick_threads++;
			atomic_inc(&current->signal->live);
			refcount_inc(&current->signal->sigcnt);
			task_join_group_stop(p);
			list_add_tail_rcu(&p->thread_node,
					  &p->signal->thread_head);
		}
		attach_pid(p, PIDTYPE_PID);
		nr_threads++;
	}
	total_forks++;
	hlist_del_init(&delayed.node);
	spin_unlock(&current->sighand->siglock);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);

	/* Only now is the reserved pidfd made visible in the fd table. */
	if (pidfile)
		fd_install(pidfd, pidfile);

	proc_fork_connector(p);
	sched_post_fork(p);
	cgroup_post_fork(p, args);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);
	uprobe_copy_process(p, clone_flags);
	user_events_fork(p, clone_flags);

	copy_oom_score_adj(clone_flags, p);

	return p;

	/*
	 * Error unwinding. Every label releases one resource and falls
	 * through to the labels for everything acquired before it.
	 */
bad_fork_core_free:
	sched_core_free(p);
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
bad_fork_cancel_cgroup:
	cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
	/* Drop both halves of the pidfd: the file and the reserved number. */
	if (clone_flags & CLONE_PIDFD) {
		fput(pidfile);
		put_unused_fd(pidfd);
	}
bad_fork_free_pid:
	/* Only a pid allocated by this function is freed. */
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_thread:
	exit_thread(p);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm) {
		mm_clear_owner(p->mm, p);
		mmput(p->mm);
	}
bad_fork_cleanup_signal:
	/* With CLONE_THREAD copy_signal() allocated nothing to free. */
	if (!(clone_flags & CLONE_THREAD))
		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_perf:
	perf_event_free_task(p);
bad_fork_sched_cancel_fork:
	sched_cancel_fork(p);
bad_fork_cleanup_policy:
	lockdep_free_task(p);
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
	delayacct_tsk_free(p);
bad_fork_cleanup_count:
	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
	exit_creds(p);
bad_fork_free:
	WRITE_ONCE(p->__state, TASK_DEAD);
	exit_task_stack_account(p);
	put_task_stack(p);
	delayed_free_task(p);
fork_out:
	/* Unhook the delayed-signal collector on every failure path. */
	spin_lock_irq(&current->sighand->siglock);
	hlist_del_init(&delayed.node);
	spin_unlock_irq(&current->sighand->siglock);
	return ERR_PTR(retval);
}
  2309. static inline void init_idle_pids(struct task_struct *idle)
  2310. {
  2311. enum pid_type type;
  2312. for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
  2313. INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
  2314. init_task_pid(idle, type, &init_struct_pid);
  2315. }
  2316. }
  2317. static int idle_dummy(void *dummy)
  2318. {
  2319. /* This function is never called */
  2320. return 0;
  2321. }
  2322. struct task_struct * __init fork_idle(int cpu)
  2323. {
  2324. struct task_struct *task;
  2325. struct kernel_clone_args args = {
  2326. .flags = CLONE_VM,
  2327. .fn = &idle_dummy,
  2328. .fn_arg = NULL,
  2329. .kthread = 1,
  2330. .idle = 1,
  2331. };
  2332. task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
  2333. if (!IS_ERR(task)) {
  2334. init_idle_pids(task);
  2335. init_idle(task, cpu);
  2336. }
  2337. return task;
  2338. }
  2339. /*
  2340. * This is like kernel_clone(), but shaved down and tailored to just
  2341. * creating io_uring workers. It returns a created task, or an error pointer.
  2342. * The returned task is inactive, and the caller must fire it up through
  2343. * wake_up_new_task(p). All signals are blocked in the created task.
  2344. */
  2345. struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
  2346. {
  2347. unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
  2348. CLONE_IO;
  2349. struct kernel_clone_args args = {
  2350. .flags = ((lower_32_bits(flags) | CLONE_VM |
  2351. CLONE_UNTRACED) & ~CSIGNAL),
  2352. .exit_signal = (lower_32_bits(flags) & CSIGNAL),
  2353. .fn = fn,
  2354. .fn_arg = arg,
  2355. .io_thread = 1,
  2356. .user_worker = 1,
  2357. };
  2358. return copy_process(NULL, 0, node, &args);
  2359. }
  2360. /*
  2361. * Ok, this is the main fork-routine.
  2362. *
  2363. * It copies the process, and if successful kick-starts
  2364. * it and waits for it to finish using the VM if required.
  2365. *
  2366. * args->exit_signal is expected to be checked for sanity by the caller.
  2367. */
  2368. pid_t kernel_clone(struct kernel_clone_args *args)
  2369. {
  2370. u64 clone_flags = args->flags;
  2371. struct completion vfork;
  2372. struct pid *pid;
  2373. struct task_struct *p;
  2374. int trace = 0;
  2375. pid_t nr;
  2376. /*
  2377. * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
  2378. * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
  2379. * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
  2380. * field in struct clone_args and it still doesn't make sense to have
  2381. * them both point at the same memory location. Performing this check
  2382. * here has the advantage that we don't need to have a separate helper
  2383. * to check for legacy clone().
  2384. */
  2385. if ((clone_flags & CLONE_PIDFD) &&
  2386. (clone_flags & CLONE_PARENT_SETTID) &&
  2387. (args->pidfd == args->parent_tid))
  2388. return -EINVAL;
  2389. /*
  2390. * Determine whether and which event to report to ptracer. When
  2391. * called from kernel_thread or CLONE_UNTRACED is explicitly
  2392. * requested, no event is reported; otherwise, report if the event
  2393. * for the type of forking is enabled.
  2394. */
  2395. if (!(clone_flags & CLONE_UNTRACED)) {
  2396. if (clone_flags & CLONE_VFORK)
  2397. trace = PTRACE_EVENT_VFORK;
  2398. else if (args->exit_signal != SIGCHLD)
  2399. trace = PTRACE_EVENT_CLONE;
  2400. else
  2401. trace = PTRACE_EVENT_FORK;
  2402. if (likely(!ptrace_event_enabled(current, trace)))
  2403. trace = 0;
  2404. }
  2405. p = copy_process(NULL, trace, NUMA_NO_NODE, args);
  2406. add_latent_entropy();
  2407. if (IS_ERR(p))
  2408. return PTR_ERR(p);
  2409. /*
  2410. * Do this prior waking up the new thread - the thread pointer
  2411. * might get invalid after that point, if the thread exits quickly.
  2412. */
  2413. trace_sched_process_fork(current, p);
  2414. pid = get_task_pid(p, PIDTYPE_PID);
  2415. nr = pid_vnr(pid);
  2416. if (clone_flags & CLONE_PARENT_SETTID)
  2417. put_user(nr, args->parent_tid);
  2418. if (clone_flags & CLONE_VFORK) {
  2419. p->vfork_done = &vfork;
  2420. init_completion(&vfork);
  2421. get_task_struct(p);
  2422. }
  2423. if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
  2424. /* lock the task to synchronize with memcg migration */
  2425. task_lock(p);
  2426. lru_gen_add_mm(p->mm);
  2427. task_unlock(p);
  2428. }
  2429. wake_up_new_task(p);
  2430. /* forking complete and child started to run, tell ptracer */
  2431. if (unlikely(trace))
  2432. ptrace_event_pid(trace, pid);
  2433. if (clone_flags & CLONE_VFORK) {
  2434. if (!wait_for_vfork_done(p, &vfork))
  2435. ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
  2436. }
  2437. put_pid(pid);
  2438. return nr;
  2439. }
  2440. /*
  2441. * Create a kernel thread.
  2442. */
  2443. pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
  2444. unsigned long flags)
  2445. {
  2446. struct kernel_clone_args args = {
  2447. .flags = ((lower_32_bits(flags) | CLONE_VM |
  2448. CLONE_UNTRACED) & ~CSIGNAL),
  2449. .exit_signal = (lower_32_bits(flags) & CSIGNAL),
  2450. .fn = fn,
  2451. .fn_arg = arg,
  2452. .name = name,
  2453. .kthread = 1,
  2454. };
  2455. return kernel_clone(&args);
  2456. }
  2457. /*
  2458. * Create a user mode thread.
  2459. */
  2460. pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
  2461. {
  2462. struct kernel_clone_args args = {
  2463. .flags = ((lower_32_bits(flags) | CLONE_VM |
  2464. CLONE_UNTRACED) & ~CSIGNAL),
  2465. .exit_signal = (lower_32_bits(flags) & CSIGNAL),
  2466. .fn = fn,
  2467. .fn_arg = arg,
  2468. };
  2469. return kernel_clone(&args);
  2470. }
  #ifdef __ARCH_WANT_SYS_FORK
  /*
   * fork(): clone with no flags and SIGCHLD as the exit signal.
   * Fails with -EINVAL when the kernel is built without CONFIG_MMU.
   */
  SYSCALL_DEFINE0(fork)
  {
  #ifndef CONFIG_MMU
  	/* can not support in nommu mode */
  	return -EINVAL;
  #else
  	struct kernel_clone_args fork_args = {
  		.exit_signal = SIGCHLD,
  	};

  	return kernel_clone(&fork_args);
  #endif
  }
  #endif
  #ifdef __ARCH_WANT_SYS_VFORK
  /*
   * vfork(): clone with CLONE_VFORK | CLONE_VM and SIGCHLD as the exit signal.
   */
  SYSCALL_DEFINE0(vfork)
  {
  	struct kernel_clone_args vfork_args = {
  		.exit_signal	= SIGCHLD,
  		.flags		= CLONE_VFORK | CLONE_VM,
  	};

  	return kernel_clone(&vfork_args);
  }
  #endif
  #ifdef __ARCH_WANT_SYS_CLONE
  /*
   * Legacy clone() entry point.
   *
   * The argument order depends on the CONFIG_CLONE_BACKWARDS* option in
   * effect. In every variant the low 32 bits of clone_flags are split into
   * the exit signal (CSIGNAL bits) and the clone flags proper, and
   * parent_tidptr is used for both .parent_tid and .pidfd.
   */
  #ifdef CONFIG_CLONE_BACKWARDS
  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
  		 int __user *, parent_tidptr,
  		 unsigned long, tls,
  		 int __user *, child_tidptr)
  #elif defined(CONFIG_CLONE_BACKWARDS2)
  SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
  		 int __user *, parent_tidptr,
  		 int __user *, child_tidptr,
  		 unsigned long, tls)
  #elif defined(CONFIG_CLONE_BACKWARDS3)
  SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
  		int, stack_size,
  		int __user *, parent_tidptr,
  		int __user *, child_tidptr,
  		unsigned long, tls)
  #else
  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
  		 int __user *, parent_tidptr,
  		 int __user *, child_tidptr,
  		 unsigned long, tls)
  #endif
  {
  	const u32 low = lower_32_bits(clone_flags);
  	struct kernel_clone_args args = {
  		.flags		= low & ~CSIGNAL,
  		.exit_signal	= low & CSIGNAL,
  		.stack		= newsp,
  		.tls		= tls,
  		.parent_tid	= parent_tidptr,
  		.pidfd		= parent_tidptr,
  		.child_tid	= child_tidptr,
  	};

  	return kernel_clone(&args);
  }
  #endif
  2531. noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
  2532. struct clone_args __user *uargs,
  2533. size_t usize)
  2534. {
  2535. int err;
  2536. struct clone_args args;
  2537. pid_t *kset_tid = kargs->set_tid;
  2538. BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
  2539. CLONE_ARGS_SIZE_VER0);
  2540. BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
  2541. CLONE_ARGS_SIZE_VER1);
  2542. BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
  2543. CLONE_ARGS_SIZE_VER2);
  2544. BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
  2545. if (unlikely(usize > PAGE_SIZE))
  2546. return -E2BIG;
  2547. if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
  2548. return -EINVAL;
  2549. err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
  2550. if (err)
  2551. return err;
  2552. if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
  2553. return -EINVAL;
  2554. if (unlikely(!args.set_tid && args.set_tid_size > 0))
  2555. return -EINVAL;
  2556. if (unlikely(args.set_tid && args.set_tid_size == 0))
  2557. return -EINVAL;
  2558. /*
  2559. * Verify that higher 32bits of exit_signal are unset and that
  2560. * it is a valid signal
  2561. */
  2562. if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
  2563. !valid_signal(args.exit_signal)))
  2564. return -EINVAL;
  2565. if ((args.flags & CLONE_INTO_CGROUP) &&
  2566. (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
  2567. return -EINVAL;
  2568. *kargs = (struct kernel_clone_args){
  2569. .flags = args.flags,
  2570. .pidfd = u64_to_user_ptr(args.pidfd),
  2571. .child_tid = u64_to_user_ptr(args.child_tid),
  2572. .parent_tid = u64_to_user_ptr(args.parent_tid),
  2573. .exit_signal = args.exit_signal,
  2574. .stack = args.stack,
  2575. .stack_size = args.stack_size,
  2576. .tls = args.tls,
  2577. .set_tid_size = args.set_tid_size,
  2578. .cgroup = args.cgroup,
  2579. };
  2580. if (args.set_tid &&
  2581. copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
  2582. (kargs->set_tid_size * sizeof(pid_t))))
  2583. return -EFAULT;
  2584. kargs->set_tid = kset_tid;
  2585. return 0;
  2586. }
  2587. /**
  2588. * clone3_stack_valid - check and prepare stack
  2589. * @kargs: kernel clone args
  2590. *
  2591. * Verify that the stack arguments userspace gave us are sane.
  2592. * In addition, set the stack direction for userspace since it's easy for us to
  2593. * determine.
  2594. */
  2595. static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
  2596. {
  2597. if (kargs->stack == 0) {
  2598. if (kargs->stack_size > 0)
  2599. return false;
  2600. } else {
  2601. if (kargs->stack_size == 0)
  2602. return false;
  2603. if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
  2604. return false;
  2605. #if !defined(CONFIG_STACK_GROWSUP)
  2606. kargs->stack += kargs->stack_size;
  2607. #endif
  2608. }
  2609. return true;
  2610. }
  2611. static bool clone3_args_valid(struct kernel_clone_args *kargs)
  2612. {
  2613. /* Verify that no unknown flags are passed along. */
  2614. if (kargs->flags &
  2615. ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
  2616. return false;
  2617. /*
  2618. * - make the CLONE_DETACHED bit reusable for clone3
  2619. * - make the CSIGNAL bits reusable for clone3
  2620. */
  2621. if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
  2622. return false;
  2623. if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
  2624. (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
  2625. return false;
  2626. if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
  2627. kargs->exit_signal)
  2628. return false;
  2629. if (!clone3_stack_valid(kargs))
  2630. return false;
  2631. return true;
  2632. }
  2633. /**
  2634. * sys_clone3 - create a new process with specific properties
  2635. * @uargs: argument structure
  2636. * @size: size of @uargs
  2637. *
  2638. * clone3() is the extensible successor to clone()/clone2().
  2639. * It takes a struct as argument that is versioned by its size.
  2640. *
  2641. * Return: On success, a positive PID for the child process.
  2642. * On error, a negative errno number.
  2643. */
  2644. SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
  2645. {
  2646. int err;
  2647. struct kernel_clone_args kargs;
  2648. pid_t set_tid[MAX_PID_NS_LEVEL];
  2649. #ifdef __ARCH_BROKEN_SYS_CLONE3
  2650. #warning clone3() entry point is missing, please fix
  2651. return -ENOSYS;
  2652. #endif
  2653. kargs.set_tid = set_tid;
  2654. err = copy_clone_args_from_user(&kargs, uargs, size);
  2655. if (err)
  2656. return err;
  2657. if (!clone3_args_valid(&kargs))
  2658. return -EINVAL;
  2659. return kernel_clone(&kargs);
  2660. }
  2661. void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
  2662. {
  2663. struct task_struct *leader, *parent, *child;
  2664. int res;
  2665. read_lock(&tasklist_lock);
  2666. leader = top = top->group_leader;
  2667. down:
  2668. for_each_thread(leader, parent) {
  2669. list_for_each_entry(child, &parent->children, sibling) {
  2670. res = visitor(child, data);
  2671. if (res) {
  2672. if (res < 0)
  2673. goto out;
  2674. leader = child;
  2675. goto down;
  2676. }
  2677. up:
  2678. ;
  2679. }
  2680. }
  2681. if (leader != top) {
  2682. child = leader;
  2683. parent = child->real_parent;
  2684. leader = parent->group_leader;
  2685. goto up;
  2686. }
  2687. out:
  2688. read_unlock(&tasklist_lock);
  2689. }
  /* Fall back to 0 when ARCH_MIN_MMSTRUCT_ALIGN has not been defined yet. */
  #if !defined(ARCH_MIN_MMSTRUCT_ALIGN)
  #define ARCH_MIN_MMSTRUCT_ALIGN 0
  #endif
  2693. static void sighand_ctor(void *data)
  2694. {
  2695. struct sighand_struct *sighand = data;
  2696. spin_lock_init(&sighand->siglock);
  2697. init_waitqueue_head(&sighand->signalfd_wqh);
  2698. }
  2699. void __init mm_cache_init(void)
  2700. {
  2701. unsigned int mm_size;
  2702. /*
  2703. * The mm_cpumask is located at the end of mm_struct, and is
  2704. * dynamically sized based on the maximum CPU number this system
  2705. * can have, taking hotplug into account (nr_cpu_ids).
  2706. */
  2707. mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
  2708. mm_cachep = kmem_cache_create_usercopy("mm_struct",
  2709. mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
  2710. SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
  2711. offsetof(struct mm_struct, saved_auxv),
  2712. sizeof_field(struct mm_struct, saved_auxv),
  2713. NULL);
  2714. }
  2715. void __init proc_caches_init(void)
  2716. {
  2717. sighand_cachep = kmem_cache_create("sighand_cache",
  2718. sizeof(struct sighand_struct), 0,
  2719. SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
  2720. SLAB_ACCOUNT, sighand_ctor);
  2721. signal_cachep = kmem_cache_create("signal_cache",
  2722. sizeof(struct signal_struct), 0,
  2723. SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
  2724. NULL);
  2725. files_cachep = kmem_cache_create("files_cache",
  2726. sizeof(struct files_struct), 0,
  2727. SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
  2728. NULL);
  2729. fs_cachep = kmem_cache_create("fs_cache",
  2730. sizeof(struct fs_struct), 0,
  2731. SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
  2732. NULL);
  2733. vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
  2734. #ifdef CONFIG_PER_VMA_LOCK
  2735. vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
  2736. #endif
  2737. mmap_init();
  2738. nsproxy_cache_init();
  2739. }
  2740. /*
  2741. * Check constraints on flags passed to the unshare system call.
  2742. */
  2743. static int check_unshare_flags(unsigned long unshare_flags)
  2744. {
  2745. if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
  2746. CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
  2747. CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
  2748. CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
  2749. CLONE_NEWTIME))
  2750. return -EINVAL;
  2751. /*
  2752. * Not implemented, but pretend it works if there is nothing
  2753. * to unshare. Note that unsharing the address space or the
  2754. * signal handlers also need to unshare the signal queues (aka
  2755. * CLONE_THREAD).
  2756. */
  2757. if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
  2758. if (!thread_group_empty(current))
  2759. return -EINVAL;
  2760. }
  2761. if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
  2762. if (refcount_read(&current->sighand->count) > 1)
  2763. return -EINVAL;
  2764. }
  2765. if (unshare_flags & CLONE_VM) {
  2766. if (!current_is_single_threaded())
  2767. return -EINVAL;
  2768. }
  2769. return 0;
  2770. }
  2771. /*
  2772. * Unshare the filesystem structure if it is being shared
  2773. */
  2774. static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
  2775. {
  2776. struct fs_struct *fs = current->fs;
  2777. if (!(unshare_flags & CLONE_FS) || !fs)
  2778. return 0;
  2779. /* don't need lock here; in the worst case we'll do useless copy */
  2780. if (fs->users == 1)
  2781. return 0;
  2782. *new_fsp = copy_fs_struct(fs);
  2783. if (!*new_fsp)
  2784. return -ENOMEM;
  2785. return 0;
  2786. }
  2787. /*
  2788. * Unshare file descriptor table if it is being shared
  2789. */
  2790. static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
  2791. {
  2792. struct files_struct *fd = current->files;
  2793. if ((unshare_flags & CLONE_FILES) &&
  2794. (fd && atomic_read(&fd->count) > 1)) {
  2795. fd = dup_fd(fd, NULL);
  2796. if (IS_ERR(fd))
  2797. return PTR_ERR(fd);
  2798. *new_fdp = fd;
  2799. }
  2800. return 0;
  2801. }
  2802. /*
  2803. * unshare allows a process to 'unshare' part of the process
  2804. * context which was originally shared using clone. copy_*
  2805. * functions used by kernel_clone() cannot be used here directly
  2806. * because they modify an inactive task_struct that is being
  2807. * constructed. Here we are modifying the current, active,
  2808. * task_struct.
  2809. */
  2810. int ksys_unshare(unsigned long unshare_flags)
  2811. {
  2812. struct fs_struct *fs, *new_fs = NULL;
  2813. struct files_struct *new_fd = NULL;
  2814. struct cred *new_cred = NULL;
  2815. struct nsproxy *new_nsproxy = NULL;
  2816. int do_sysvsem = 0;
  2817. int err;
  2818. /*
  2819. * If unsharing a user namespace must also unshare the thread group
  2820. * and unshare the filesystem root and working directories.
  2821. */
  2822. if (unshare_flags & CLONE_NEWUSER)
  2823. unshare_flags |= CLONE_THREAD | CLONE_FS;
  2824. /*
  2825. * If unsharing vm, must also unshare signal handlers.
  2826. */
  2827. if (unshare_flags & CLONE_VM)
  2828. unshare_flags |= CLONE_SIGHAND;
  2829. /*
  2830. * If unsharing a signal handlers, must also unshare the signal queues.
  2831. */
  2832. if (unshare_flags & CLONE_SIGHAND)
  2833. unshare_flags |= CLONE_THREAD;
  2834. /*
  2835. * If unsharing namespace, must also unshare filesystem information.
  2836. */
  2837. if (unshare_flags & CLONE_NEWNS)
  2838. unshare_flags |= CLONE_FS;
  2839. err = check_unshare_flags(unshare_flags);
  2840. if (err)
  2841. goto bad_unshare_out;
  2842. /*
  2843. * CLONE_NEWIPC must also detach from the undolist: after switching
  2844. * to a new ipc namespace, the semaphore arrays from the old
  2845. * namespace are unreachable.
  2846. */
  2847. if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
  2848. do_sysvsem = 1;
  2849. err = unshare_fs(unshare_flags, &new_fs);
  2850. if (err)
  2851. goto bad_unshare_out;
  2852. err = unshare_fd(unshare_flags, &new_fd);
  2853. if (err)
  2854. goto bad_unshare_cleanup_fs;
  2855. err = unshare_userns(unshare_flags, &new_cred);
  2856. if (err)
  2857. goto bad_unshare_cleanup_fd;
  2858. err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
  2859. new_cred, new_fs);
  2860. if (err)
  2861. goto bad_unshare_cleanup_cred;
  2862. if (new_cred) {
  2863. err = set_cred_ucounts(new_cred);
  2864. if (err)
  2865. goto bad_unshare_cleanup_cred;
  2866. }
  2867. if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
  2868. if (do_sysvsem) {
  2869. /*
  2870. * CLONE_SYSVSEM is equivalent to sys_exit().
  2871. */
  2872. exit_sem(current);
  2873. }
  2874. if (unshare_flags & CLONE_NEWIPC) {
  2875. /* Orphan segments in old ns (see sem above). */
  2876. exit_shm(current);
  2877. shm_init_task(current);
  2878. }
  2879. if (new_nsproxy)
  2880. switch_task_namespaces(current, new_nsproxy);
  2881. task_lock(current);
  2882. if (new_fs) {
  2883. fs = current->fs;
  2884. spin_lock(&fs->lock);
  2885. current->fs = new_fs;
  2886. if (--fs->users)
  2887. new_fs = NULL;
  2888. else
  2889. new_fs = fs;
  2890. spin_unlock(&fs->lock);
  2891. }
  2892. if (new_fd)
  2893. swap(current->files, new_fd);
  2894. task_unlock(current);
  2895. if (new_cred) {
  2896. /* Install the new user namespace */
  2897. commit_creds(new_cred);
  2898. new_cred = NULL;
  2899. }
  2900. }
  2901. perf_event_namespaces(current);
  2902. bad_unshare_cleanup_cred:
  2903. if (new_cred)
  2904. put_cred(new_cred);
  2905. bad_unshare_cleanup_fd:
  2906. if (new_fd)
  2907. put_files_struct(new_fd);
  2908. bad_unshare_cleanup_fs:
  2909. if (new_fs)
  2910. free_fs_struct(new_fs);
  2911. bad_unshare_out:
  2912. return err;
  2913. }
  2914. SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
  2915. {
  2916. return ksys_unshare(unshare_flags);
  2917. }
  2918. /*
  2919. * Helper to unshare the files of the current task.
  2920. * We don't want to expose copy_files internals to
  2921. * the exec layer of the kernel.
  2922. */
  2923. int unshare_files(void)
  2924. {
  2925. struct task_struct *task = current;
  2926. struct files_struct *old, *copy = NULL;
  2927. int error;
  2928. error = unshare_fd(CLONE_FILES, &copy);
  2929. if (error || !copy)
  2930. return error;
  2931. old = task->files;
  2932. task_lock(task);
  2933. task->files = copy;
  2934. task_unlock(task);
  2935. put_files_struct(old);
  2936. return 0;
  2937. }
  2938. int sysctl_max_threads(const struct ctl_table *table, int write,
  2939. void *buffer, size_t *lenp, loff_t *ppos)
  2940. {
  2941. struct ctl_table t;
  2942. int ret;
  2943. int threads = max_threads;
  2944. int min = 1;
  2945. int max = MAX_THREADS;
  2946. t = *table;
  2947. t.data = &threads;
  2948. t.extra1 = &min;
  2949. t.extra2 = &max;
  2950. ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
  2951. if (ret || !write)
  2952. return ret;
  2953. max_threads = threads;
  2954. return 0;
  2955. }