// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017		Facebook Inc.
 * Copyright (C) 2017		Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  Ie., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware, the
 * __GFP_ACCOUNT flag should be passed.  All memcg-aware allocations share
 * one set of chunks, while unaccounted allocations and allocations made by
 * processes belonging to the root memory cgroup use the second set.
 *
 * The allocator tries to allocate from the fullest chunk first.  Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between the bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
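
/*
 * Illustrative usage sketch (not part of the allocator): how a typical
 * client consumes the dynamic percpu API that this file backs.  The struct
 * and variable names below are hypothetical; alloc_percpu(), this_cpu_inc(),
 * per_cpu_ptr(), for_each_possible_cpu() and free_percpu() are the normal
 * kernel interfaces.
 *
 *	struct hit_counter {
 *		u64 hits;
 *	};
 *
 *	struct hit_counter __percpu *ctr = alloc_percpu(struct hit_counter);
 *	u64 total = 0;
 *	int cpu;
 *
 *	if (!ctr)
 *		return -ENOMEM;
 *
 *	this_cpu_inc(ctr->hits);		fast, cpu-local update
 *
 *	for_each_possible_cpu(cpu)		slow path: aggregate all cpus
 *		total += per_cpu_ptr(ctr, cpu)->hits;
 *
 *	free_percpu(ctr);
 */
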
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"
/*
 * The slots are sorted by the size of the biggest continuous free area.
 * 1-31 bytes share the same slot.
 */
#define PCPU_SLOT_BASE_SHIFT		5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD	3

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)				\
	(void __percpu *)((unsigned long)(addr) -		\
			  (unsigned long)pcpu_base_addr +	\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)					\
	(void __force *)((unsigned long)(ptr) +			\
			 (unsigned long)pcpu_base_addr -	\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}
/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;

	if (!chunk)
		return false;

	start_addr = chunk->base_addr + chunk->start_offset;
	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
		   chunk->end_offset;

	return addr >= start_addr && addr < end_addr;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_free_slot;
	return __pcpu_size_to_slot(size);
}
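
/*
 * Worked example (illustrative only, not used by the code): with
 * PCPU_SLOT_BASE_SHIFT == 5, __pcpu_size_to_slot(1024) computes
 * fls(1024) - 5 + 2 == 11 - 5 + 2 == 8, so a chunk whose largest
 * contiguous free area is 1KiB sits in slot 8.  Very small sizes
 * bottom out at slot 1 because of the max() with 1, and a completely
 * free chunk (size == pcpu_unit_size) is routed to pcpu_free_slot.
 */
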
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	const struct pcpu_block_md *chunk_md = &chunk->chunk_md;

	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
	    chunk_md->contig_hint == 0)
		return 0;

	return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr +
	       pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets to address offsets.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}

static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}

static unsigned long pcpu_off_to_block_off(int off)
{
	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}

static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}
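
/*
 * Worked example of the conversions above (illustrative only): assuming a
 * 4K page and a 4-byte PCPU_MIN_ALLOC_SIZE, PCPU_BITMAP_BLOCK_BITS is 1024,
 * so chunk offset 1050 gives pcpu_off_to_block_index(1050) == 1 and
 * pcpu_off_to_block_off(1050) == 26, and the round trip
 * pcpu_block_off_to_off(1, 26) returns 1050.  Each bit stands for
 * PCPU_MIN_ALLOC_SIZE bytes, so bit 1050 corresponds to byte offset 4200
 * within a unit.
 */
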
/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Check to see if the allocation can fit in the block's contig hint.
 * Note, a chunk uses the same hints as a block so this can also check against
 * the chunk's contig hint.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
				  size_t align)
{
	int bit_off = ALIGN(block->contig_hint_start, align) -
		block->contig_hint_start;

	return bit_off + bits <= block->contig_hint;
}

/*
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * This determines if we should scan based on the scan_hint or first_free.
 * In general, we want to scan from first_free to fulfill allocations by
 * first fit.  However, if we know a scan_hint at position scan_hint_start
 * cannot fulfill an allocation, we can begin scanning from there knowing
 * the contig_hint will be our fallback.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
	/*
	 * The three conditions below determine if we can skip past the
	 * scan_hint.  First, does the scan hint exist.  Second, is the
	 * contig_hint after the scan_hint (possibly not true iff
	 * contig_hint == scan_hint).  Third, is the allocation request
	 * larger than the scan_hint.
	 */
	if (block->scan_hint &&
	    block->contig_hint_start > block->scan_hint_start &&
	    alloc_bits > block->scan_hint)
		return block->scan_hint_start + block->scan_hint;

	return block->first_free;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
				     int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
			return;
		}

		/*
		 * This checks three things.  First is there a contig_hint to
		 * check.  Second, have we checked this hint before by
		 * comparing the block_off.  Third, is this the same as the
		 * right contig hint.  In the last case, it spills over into
		 * the next block and should be handled by the contig area
		 * across blocks code.
		 */
		*bits = block->contig_hint;
		if (*bits && block->contig_hint_start >= block_off &&
		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
			*bit_off = pcpu_block_off_to_off(i,
					block->contig_hint_start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		*bits = block->right_free;
		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
	}
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
				 int align, int *bit_off, int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (*bits >= alloc_bits)
				return;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
		}

		/* check block->contig_hint */
		*bits = ALIGN(block->contig_hint_start, align) -
			block->contig_hint_start;
		/*
		 * This uses the block offset to determine if this has been
		 * checked in the prior iteration.
		 */
		if (block->contig_hint &&
		    block->contig_hint_start >= block_off &&
		    block->contig_hint >= *bits + alloc_bits) {
			int start = pcpu_next_hint(block, alloc_bits);

			*bits += alloc_bits + block->contig_hint_start -
				 start;
			*bit_off = pcpu_block_off_to_off(i, start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
				 align);
		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
		*bit_off = pcpu_block_off_to_off(i, *bit_off);
		if (*bits >= alloc_bits)
			return;
	}

	/* no valid offsets were found - fail condition */
	*bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
	     (bit_off) += (bits) + 1,					\
	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits));				      \
	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
	     (bit_off) += (bits),					      \
	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits)))
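
/*
 * Illustrative sketch of the iterator pattern (the real users are
 * pcpu_chunk_refresh_hint() and pcpu_find_block_fit() below):
 *
 *	int bit_off, bits = 0;
 *
 *	pcpu_for_each_md_free_region(chunk, bit_off, bits)
 *		pr_debug("free area at bit %d, %d bits long\n", bit_off, bits);
 *
 * Each pass hands back one aggregated free region; the stepping expressions
 * in the macros skip past that region before asking for the next one.
 */
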
/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, gfp);
	else
		return __vmalloc(size, gfp | __GFP_ZERO);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}

static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
			      bool move_front)
{
	if (chunk != pcpu_reserved_chunk) {
		if (move_front)
			list_move(&chunk->list, &pcpu_chunk_lists[slot]);
		else
			list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
	}
}

static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
	__pcpu_chunk_move(chunk, slot, true);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	/* leave isolated chunks in-place */
	if (chunk->isolated)
		return;

	if (oslot != nslot)
		__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}

static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
{
	lockdep_assert_held(&pcpu_lock);

	if (!chunk->isolated) {
		chunk->isolated = true;
		pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
	}
	list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
}

static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
{
	lockdep_assert_held(&pcpu_lock);

	if (chunk->isolated) {
		chunk->isolated = false;
		pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
		pcpu_chunk_relocate(chunk, -1);
	}
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * This is used to keep track of the empty pages now based on the premise
 * a md_block covers a page.  The hint update functions recognize if a block
 * is made full or broken to calculate deltas for keeping track of free pages.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
	chunk->nr_empty_pop_pages += nr;
	if (chunk != pcpu_reserved_chunk && !chunk->isolated)
		pcpu_nr_empty_pop_pages += nr;
}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * This is used to determine if the hint region [a, b) overlaps with the
 * allocated region [x, y).
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
	return (a < y) && (x < b);
}
/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
	int contig = end - start;

	block->first_free = min(block->first_free, start);
	if (start == 0)
		block->left_free = contig;

	if (end == block->nr_bits)
		block->right_free = contig;

	if (contig > block->contig_hint) {
		/* promote the old contig_hint to be the new scan_hint */
		if (start > block->contig_hint_start) {
			if (block->contig_hint > block->scan_hint) {
				block->scan_hint_start =
					block->contig_hint_start;
				block->scan_hint = block->contig_hint;
			} else if (start < block->scan_hint_start) {
				/*
				 * The old contig_hint == scan_hint.  But, the
				 * new contig is larger so hold the invariant
				 * scan_hint_start < contig_hint_start.
				 */
				block->scan_hint = 0;
			}
		} else {
			block->scan_hint = 0;
		}
		block->contig_hint_start = start;
		block->contig_hint = contig;
	} else if (contig == block->contig_hint) {
		if (block->contig_hint_start &&
		    (!start ||
		     __ffs(start) > __ffs(block->contig_hint_start))) {
			/* start has a better alignment so use it */
			block->contig_hint_start = start;
			if (start < block->scan_hint_start &&
			    block->contig_hint > block->scan_hint)
				block->scan_hint = 0;
		} else if (start > block->scan_hint_start ||
			   block->contig_hint > block->scan_hint) {
			/*
			 * Knowing contig == contig_hint, update the scan_hint
			 * if it is farther than or larger than the current
			 * scan_hint.
			 */
			block->scan_hint_start = start;
			block->scan_hint = contig;
		}
	} else {
		/*
		 * The region is smaller than the contig_hint.  So only update
		 * the scan_hint if it is larger than or equal and farther than
		 * the current scan_hint.
		 */
		if ((start < block->contig_hint_start &&
		     (contig > block->scan_hint ||
		      (contig == block->scan_hint &&
		       start > block->scan_hint_start)))) {
			block->scan_hint_start = start;
			block->scan_hint = contig;
		}
	}
}
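
/*
 * Worked example (illustrative only): take a fresh 1024-bit block where
 * contig_hint == 1024 at offset 0.  Reporting the free area [0, 1024) keeps
 * contig_hint at 1024 and sets left_free == right_free == 1024.  If a later
 * refresh reports two free areas, first [0, 100) and then [612, 1024), the
 * updates leave contig_hint == 412 starting at 612 and demote the smaller
 * [0, 100) area to the scan_hint, so a future scan can try the earlier hole
 * first while still falling back to the contig_hint.
 */
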
/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finding the final allocation spot first goes through pcpu_find_block_fit()
 * to find a block that can hold the allocation and then pcpu_alloc_area()
 * where a scan is used.  When allocations require specific alignments,
 * we can inadvertently create holes which will not be seen in the alloc
 * or free paths.
 *
 * This takes a given free area hole and updates a block as it may change the
 * scan_hint.  We need to scan backwards to ensure we don't miss free bits
 * from alignment.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
				   int bits)
{
	int s_off = pcpu_off_to_block_off(bit_off);
	int e_off = s_off + bits;
	int s_index, l_bit;
	struct pcpu_block_md *block;

	if (e_off > PCPU_BITMAP_BLOCK_BITS)
		return;

	s_index = pcpu_off_to_block_index(bit_off);
	block = chunk->md_blocks + s_index;

	/* scan backwards in case of alignment skipping free bits */
	l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
	s_off = (s_off == l_bit) ? 0 : l_bit + 1;

	pcpu_block_update(block, s_off, e_off);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * A full scan can be avoided on the allocation path as this is triggered
 * if we broke the contig_hint.  In doing so, the scan_hint will be before
 * the contig_hint or after if the scan_hint == contig_hint.  This cannot
 * be prevented on freeing as we want to find the largest area possibly
 * spanning blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits;

	/* promote scan_hint to contig_hint */
	if (!full_scan && chunk_md->scan_hint) {
		bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
		chunk_md->contig_hint_start = chunk_md->scan_hint_start;
		chunk_md->contig_hint = chunk_md->scan_hint;
		chunk_md->scan_hint = 0;
	} else {
		bit_off = chunk_md->first_free;
		chunk_md->contig_hint = 0;
	}

	bits = 0;
	pcpu_for_each_md_free_region(chunk, bit_off, bits)
		pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
	struct pcpu_block_md *block = chunk->md_blocks + index;
	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
	unsigned int start, end;	/* region start, region end */

	/* promote scan_hint to contig_hint */
	if (block->scan_hint) {
		start = block->scan_hint_start + block->scan_hint;
		block->contig_hint_start = block->scan_hint_start;
		block->contig_hint = block->scan_hint;
		block->scan_hint = 0;
	} else {
		start = block->first_free;
		block->contig_hint = 0;
	}

	block->right_free = 0;

	/* iterate over free areas and update the contig hints */
	for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
		pcpu_block_update(block, start, end);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
					 int bits)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Update s_block.
	 */
	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
		nr_empty_pages++;

	/*
	 * block->first_free must be updated if the allocation takes its place.
	 * If the allocation breaks the contig_hint, a scan is required to
	 * restore this hint.
	 */
	if (s_off == s_block->first_free)
		s_block->first_free = find_next_zero_bit(
					pcpu_index_alloc_map(chunk, s_index),
					PCPU_BITMAP_BLOCK_BITS,
					s_off + bits);

	if (pcpu_region_overlap(s_block->scan_hint_start,
				s_block->scan_hint_start + s_block->scan_hint,
				s_off,
				s_off + bits))
		s_block->scan_hint = 0;

	if (pcpu_region_overlap(s_block->contig_hint_start,
				s_block->contig_hint_start +
				s_block->contig_hint,
				s_off,
				s_off + bits)) {
		/* block contig hint is broken - scan to fix it */
		if (!s_off)
			s_block->left_free = 0;
		pcpu_block_refresh_hint(chunk, s_index);
	} else {
		/* update left and right contig manually */
		s_block->left_free = min(s_block->left_free, s_off);
		if (s_index == e_index)
			s_block->right_free = min_t(int, s_block->right_free,
					PCPU_BITMAP_BLOCK_BITS - e_off);
		else
			s_block->right_free = 0;
	}

	/*
	 * Update e_block.
	 */
	if (s_index != e_index) {
		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
			nr_empty_pages++;

		/*
		 * When the allocation is across blocks, the end is along
		 * the left part of the e_block.
		 */
		e_block->first_free = find_next_zero_bit(
				pcpu_index_alloc_map(chunk, e_index),
				PCPU_BITMAP_BLOCK_BITS, e_off);

		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
			/* reset the block */
			e_block++;
		} else {
			if (e_off > e_block->scan_hint_start)
				e_block->scan_hint = 0;

			e_block->left_free = 0;
			if (e_off > e_block->contig_hint_start) {
				/* contig hint is broken - scan to fix it */
				pcpu_block_refresh_hint(chunk, e_index);
			} else {
				e_block->right_free =
					min_t(int, e_block->right_free,
					      PCPU_BITMAP_BLOCK_BITS - e_off);
			}
		}

		/* update in-between md_blocks */
		nr_empty_pages += (e_index - s_index - 1);
		for (block = s_block + 1; block < e_block; block++) {
			block->scan_hint = 0;
			block->contig_hint = 0;
			block->left_free = 0;
			block->right_free = 0;
		}
	}

	/*
	 * If the allocation is not atomic, some blocks may not be
	 * populated with pages, while we account it here.  The number
	 * of pages will be added back with pcpu_chunk_populated()
	 * when populating pages.
	 */
	if (nr_empty_pages)
		pcpu_update_empty_pages(chunk, -nr_empty_pages);

	if (pcpu_region_overlap(chunk_md->scan_hint_start,
				chunk_md->scan_hint_start +
				chunk_md->scan_hint,
				bit_off,
				bit_off + bits))
		chunk_md->scan_hint = 0;

	/*
	 * The only time a full chunk scan is required is if the chunk
	 * contig hint is broken.  Otherwise, it means a smaller space
	 * was used and therefore the chunk contig hint is still correct.
	 */
	if (pcpu_region_overlap(chunk_md->contig_hint_start,
				chunk_md->contig_hint_start +
				chunk_md->contig_hint,
				bit_off,
				bit_off + bits))
		pcpu_chunk_refresh_hint(chunk, false);
}
/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
					int bits)
{
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	int start, end;		/* start and end of the whole free area */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Check if the freed area aligns with the block->contig_hint.
	 * If it does, then the scan to find the beginning/end of the
	 * larger free area can be avoided.
	 *
	 * start and end refer to beginning and end of the free area
	 * within each their respective blocks.  This is not necessarily
	 * the entire free area as it may span blocks past the beginning
	 * or end of the block.
	 */
	start = s_off;
	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
		start = s_block->contig_hint_start;
	} else {
		/*
		 * Scan backwards to find the extent of the free area.
		 * find_last_bit returns the starting bit, so if the start bit
		 * is returned, that means there was no last bit and the
		 * remainder of the chunk is free.
		 */
		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
					  start);
		start = (start == l_bit) ? 0 : l_bit + 1;
	}

	end = e_off;
	if (e_off == e_block->contig_hint_start)
		end = e_block->contig_hint_start + e_block->contig_hint;
	else
		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
				    PCPU_BITMAP_BLOCK_BITS, end);

	/* update s_block */
	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
		nr_empty_pages++;
	pcpu_block_update(s_block, start, e_off);

	/* freeing in the same block */
	if (s_index != e_index) {
		/* update e_block */
		if (end == PCPU_BITMAP_BLOCK_BITS)
			nr_empty_pages++;
		pcpu_block_update(e_block, 0, end);

		/* reset md_blocks in the middle */
		nr_empty_pages += (e_index - s_index - 1);
		for (block = s_block + 1; block < e_block; block++) {
			block->first_free = 0;
			block->scan_hint = 0;
			block->contig_hint_start = 0;
			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
			block->left_free = PCPU_BITMAP_BLOCK_BITS;
			block->right_free = PCPU_BITMAP_BLOCK_BITS;
		}
	}

	if (nr_empty_pages)
		pcpu_update_empty_pages(chunk, nr_empty_pages);

	/*
	 * Refresh chunk metadata when the free makes a block free or spans
	 * across blocks.  The contig_hint may be off by up to a page, but if
	 * the contig_hint is contained in a block, it will be accurate with
	 * the else condition below.
	 */
	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
		pcpu_chunk_refresh_hint(chunk, true);
	else
		pcpu_block_update(&chunk->chunk_md,
				  pcpu_block_off_to_off(s_index, start),
				  end);
}
/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
			      int *next_off)
{
	unsigned int start, end;

	start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
	end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	start = find_next_zero_bit(chunk->populated, end, start);
	if (start >= end)
		return true;

	end = find_next_bit(chunk->populated, end, start + 1);

	*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
	return false;
}
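
/*
 * Worked example (illustrative only): with a 4K page and a 4-byte
 * PCPU_MIN_ALLOC_SIZE, a request at bit_off 2000 for 100 bits covers byte
 * range [8000, 8400), i.e. pages 1 and 2 of the chunk.  If page 2 is not
 * populated, the search above hits the clear bit in chunk->populated, the
 * caller gets false, and *next_off is advanced past the unpopulated run so
 * pcpu_find_block_fit() does not rescan the same hole.
 */
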
/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk, it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, next_off;

	/*
	 * This is an optimization to prevent scanning by assuming if the
	 * allocation cannot fit in the global hint, there is memory pressure
	 * and creating a new chunk would happen soon.
	 */
	if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
		return -1;

	bit_off = pcpu_next_hint(chunk_md, alloc_bits);
	bits = 0;
	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
						   &next_off))
			break;

		bit_off = next_off;
		bits = 0;
	}

	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;

	return bit_off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * This is a modified version of bitmap_find_next_zero_area_off() to remember
 * the largest area that was skipped.  This is imperfect, but in general is
 * good enough.  The largest remembered region is the largest failed region
 * seen.  This does not include anything we possibly skipped due to alignment.
 * pcpu_block_update_scan() does scan backwards to try and recover what was
 * lost to alignment.  While this can cause scanning to miss earlier possible
 * free areas, smaller allocations will eventually fill those holes.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
					 unsigned long size,
					 unsigned long start,
					 unsigned long nr,
					 unsigned long align_mask,
					 unsigned long *largest_off,
					 unsigned long *largest_bits)
{
	unsigned long index, end, i, area_off, area_bits;
again:
	index = find_next_zero_bit(map, size, start);

	/* Align allocation */
	index = __ALIGN_MASK(index, align_mask);
	area_off = index;

	end = index + nr;
	if (end > size)
		return end;
	i = find_next_bit(map, end, index);
	if (i < end) {
		area_bits = i - area_off;
		/* remember largest unused area with best alignment */
		if (area_bits > *largest_bits ||
		    (area_bits == *largest_bits && *largest_off &&
		     (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
			*largest_off = area_off;
			*largest_bits = area_bits;
		}

		start = i + 1;
		goto again;
	}
	return index;
}
  1071. /**
  1072. * pcpu_alloc_area - allocates an area from a pcpu_chunk
  1073. * @chunk: chunk of interest
  1074. * @alloc_bits: size of request in allocation units
  1075. * @align: alignment of area (max PAGE_SIZE)
  1076. * @start: bit_off to start searching
  1077. *
  1078. * This function takes in a @start offset to begin searching to fit an
  1079. * allocation of @alloc_bits with alignment @align. It needs to scan
  1080. * the allocation map because if it fits within the block's contig hint,
  1081. * @start will be block->first_free. This is an attempt to fill the
  1082. * allocation prior to breaking the contig hint. The allocation and
  1083. * boundary maps are updated accordingly if it confirms a valid
  1084. * free area.
  1085. *
  1086. * RETURNS:
  1087. * Allocated addr offset in @chunk on success.
  1088. * -1 if no matching area is found.
  1089. */
  1090. static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
  1091. size_t align, int start)
  1092. {
  1093. struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  1094. size_t align_mask = (align) ? (align - 1) : 0;
  1095. unsigned long area_off = 0, area_bits = 0;
  1096. int bit_off, end, oslot;
  1097. lockdep_assert_held(&pcpu_lock);
  1098. oslot = pcpu_chunk_slot(chunk);
  1099. /*
  1100. * Search to find a fit.
  1101. */
  1102. end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
  1103. pcpu_chunk_map_bits(chunk));
  1104. bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
  1105. align_mask, &area_off, &area_bits);
  1106. if (bit_off >= end)
  1107. return -1;
  1108. if (area_bits)
  1109. pcpu_block_update_scan(chunk, area_off, area_bits);
  1110. /* update alloc map */
  1111. bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
  1112. /* update boundary map */
  1113. set_bit(bit_off, chunk->bound_map);
  1114. bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
  1115. set_bit(bit_off + alloc_bits, chunk->bound_map);
  1116. chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
  1117. /* update first free bit */
  1118. if (bit_off == chunk_md->first_free)
  1119. chunk_md->first_free = find_next_zero_bit(
  1120. chunk->alloc_map,
  1121. pcpu_chunk_map_bits(chunk),
  1122. bit_off + alloc_bits);
  1123. pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
  1124. pcpu_chunk_relocate(chunk, oslot);
  1125. return bit_off * PCPU_MIN_ALLOC_SIZE;
  1126. }
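/*
 * A small illustration of the bitmap updates above (hypothetical offsets,
 * assuming the usual 4-byte PCPU_MIN_ALLOC_SIZE): allocating alloc_bits = 3
 * at bit_off = 4 leaves
 *
 *   alloc_map: bits 4-6 set (the area is now in use)
 *   bound_map: bit 4 set, bits 5-6 cleared, bit 7 set
 *
 * so every allocation is delimited by a pair of set boundary bits. The value
 * returned is bit_off * PCPU_MIN_ALLOC_SIZE, i.e. byte offset 16 here.
 */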
  1127. /**
  1128. * pcpu_free_area - frees the corresponding offset
  1129. * @chunk: chunk of interest
  1130. * @off: addr offset into chunk
  1131. *
  1132. * This function determines the size of an allocation to free using
  1133. * the boundary bitmap and clears the allocation map.
  1134. *
  1135. * RETURNS:
  1136. * Number of freed bytes.
  1137. */
  1138. static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
  1139. {
  1140. struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  1141. int bit_off, bits, end, oslot, freed;
  1142. lockdep_assert_held(&pcpu_lock);
  1143. pcpu_stats_area_dealloc(chunk);
  1144. oslot = pcpu_chunk_slot(chunk);
  1145. bit_off = off / PCPU_MIN_ALLOC_SIZE;
  1146. /* find end index */
  1147. end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
  1148. bit_off + 1);
  1149. bits = end - bit_off;
  1150. bitmap_clear(chunk->alloc_map, bit_off, bits);
  1151. freed = bits * PCPU_MIN_ALLOC_SIZE;
  1152. /* update metadata */
  1153. chunk->free_bytes += freed;
  1154. /* update first free bit */
  1155. chunk_md->first_free = min(chunk_md->first_free, bit_off);
  1156. pcpu_block_update_hint_free(chunk, bit_off, bits);
  1157. pcpu_chunk_relocate(chunk, oslot);
  1158. return freed;
  1159. }
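/*
 * Continuing the hypothetical 3-bit allocation sketched above: freeing byte
 * offset 16 recomputes bit_off = 4, finds the next boundary bit at 7, and
 * therefore clears bits = 3 from the alloc_map and returns 12 bytes to the
 * chunk. The boundary bits themselves are left in place; stale boundary bits
 * in free regions are harmless and are rewritten when the area is allocated
 * again.
 */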
  1160. static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
  1161. {
  1162. block->scan_hint = 0;
  1163. block->contig_hint = nr_bits;
  1164. block->left_free = nr_bits;
  1165. block->right_free = nr_bits;
  1166. block->first_free = 0;
  1167. block->nr_bits = nr_bits;
  1168. }
  1169. static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
  1170. {
  1171. struct pcpu_block_md *md_block;
  1172. /* init the chunk's block */
  1173. pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
  1174. for (md_block = chunk->md_blocks;
  1175. md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
  1176. md_block++)
  1177. pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
  1178. }
  1179. /**
  1180. * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
  1181. * @tmp_addr: the start of the region served
  1182. * @map_size: size of the region served
  1183. *
 * This is responsible for creating the chunks that serve the first chunk. The
 * base_addr is @tmp_addr rounded down to the nearest page boundary, while the
 * region end is rounded up to the nearest page boundary. The start and end
 * offsets are tracked to determine the region actually served. All of this is
 * done to appease the bitmap allocator by avoiding partial blocks.
  1188. *
  1189. * RETURNS:
  1190. * Chunk serving the region at @tmp_addr of @map_size.
  1191. */
  1192. static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
  1193. int map_size)
  1194. {
  1195. struct pcpu_chunk *chunk;
  1196. unsigned long aligned_addr;
  1197. int start_offset, offset_bits, region_size, region_bits;
  1198. size_t alloc_size;
  1199. /* region calculations */
  1200. aligned_addr = tmp_addr & PAGE_MASK;
  1201. start_offset = tmp_addr - aligned_addr;
  1202. region_size = ALIGN(start_offset + map_size, PAGE_SIZE);
  1203. /* allocate chunk */
  1204. alloc_size = struct_size(chunk, populated,
  1205. BITS_TO_LONGS(region_size >> PAGE_SHIFT));
  1206. chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  1207. if (!chunk)
  1208. panic("%s: Failed to allocate %zu bytes\n", __func__,
  1209. alloc_size);
  1210. INIT_LIST_HEAD(&chunk->list);
  1211. chunk->base_addr = (void *)aligned_addr;
  1212. chunk->start_offset = start_offset;
  1213. chunk->end_offset = region_size - chunk->start_offset - map_size;
  1214. chunk->nr_pages = region_size >> PAGE_SHIFT;
  1215. region_bits = pcpu_chunk_map_bits(chunk);
  1216. alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
  1217. chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  1218. if (!chunk->alloc_map)
  1219. panic("%s: Failed to allocate %zu bytes\n", __func__,
  1220. alloc_size);
  1221. alloc_size =
  1222. BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
  1223. chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  1224. if (!chunk->bound_map)
  1225. panic("%s: Failed to allocate %zu bytes\n", __func__,
  1226. alloc_size);
  1227. alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
  1228. chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  1229. if (!chunk->md_blocks)
  1230. panic("%s: Failed to allocate %zu bytes\n", __func__,
  1231. alloc_size);
  1232. #ifdef NEED_PCPUOBJ_EXT
  1233. /* first chunk is free to use */
  1234. chunk->obj_exts = NULL;
  1235. #endif
  1236. pcpu_init_md_blocks(chunk);
  1237. /* manage populated page bitmap */
  1238. chunk->immutable = true;
  1239. bitmap_fill(chunk->populated, chunk->nr_pages);
  1240. chunk->nr_populated = chunk->nr_pages;
  1241. chunk->nr_empty_pop_pages = chunk->nr_pages;
  1242. chunk->free_bytes = map_size;
  1243. if (chunk->start_offset) {
  1244. /* hide the beginning of the bitmap */
  1245. offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
  1246. bitmap_set(chunk->alloc_map, 0, offset_bits);
  1247. set_bit(0, chunk->bound_map);
  1248. set_bit(offset_bits, chunk->bound_map);
  1249. chunk->chunk_md.first_free = offset_bits;
  1250. pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
  1251. }
  1252. if (chunk->end_offset) {
  1253. /* hide the end of the bitmap */
  1254. offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
  1255. bitmap_set(chunk->alloc_map,
  1256. pcpu_chunk_map_bits(chunk) - offset_bits,
  1257. offset_bits);
  1258. set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
  1259. chunk->bound_map);
  1260. set_bit(region_bits, chunk->bound_map);
  1261. pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
  1262. - offset_bits, offset_bits);
  1263. }
  1264. return chunk;
  1265. }
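/*
 * A worked example of the offset handling above, assuming 4K pages, the 4-byte
 * PCPU_MIN_ALLOC_SIZE and a page-aligned base (all numbers hypothetical):
 *
 *   tmp_addr = base + 1024, map_size = 12288
 *   => start_offset = 1024, region_size = ALIGN(1024 + 12288, 4096) = 16384
 *   => end_offset = 16384 - 1024 - 12288 = 3072, nr_pages = 4
 *
 * The chunk then manages 16384 / 4 = 4096 map bits. The first 1024 / 4 = 256
 * and the last 3072 / 4 = 768 bits are pre-marked allocated, leaving exactly
 * the 3072 bits that cover the served region available.
 */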
  1266. static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
  1267. {
  1268. struct pcpu_chunk *chunk;
  1269. int region_bits;
  1270. chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
  1271. if (!chunk)
  1272. return NULL;
  1273. INIT_LIST_HEAD(&chunk->list);
  1274. chunk->nr_pages = pcpu_unit_pages;
  1275. region_bits = pcpu_chunk_map_bits(chunk);
  1276. chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
  1277. sizeof(chunk->alloc_map[0]), gfp);
  1278. if (!chunk->alloc_map)
  1279. goto alloc_map_fail;
  1280. chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
  1281. sizeof(chunk->bound_map[0]), gfp);
  1282. if (!chunk->bound_map)
  1283. goto bound_map_fail;
  1284. chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
  1285. sizeof(chunk->md_blocks[0]), gfp);
  1286. if (!chunk->md_blocks)
  1287. goto md_blocks_fail;
  1288. #ifdef NEED_PCPUOBJ_EXT
  1289. if (need_pcpuobj_ext()) {
  1290. chunk->obj_exts =
  1291. pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
  1292. sizeof(struct pcpuobj_ext), gfp);
  1293. if (!chunk->obj_exts)
  1294. goto objcg_fail;
  1295. }
  1296. #endif
  1297. pcpu_init_md_blocks(chunk);
  1298. /* init metadata */
  1299. chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
  1300. return chunk;
  1301. #ifdef NEED_PCPUOBJ_EXT
  1302. objcg_fail:
  1303. pcpu_mem_free(chunk->md_blocks);
  1304. #endif
  1305. md_blocks_fail:
  1306. pcpu_mem_free(chunk->bound_map);
  1307. bound_map_fail:
  1308. pcpu_mem_free(chunk->alloc_map);
  1309. alloc_map_fail:
  1310. pcpu_mem_free(chunk);
  1311. return NULL;
  1312. }
  1313. static void pcpu_free_chunk(struct pcpu_chunk *chunk)
  1314. {
  1315. if (!chunk)
  1316. return;
  1317. #ifdef NEED_PCPUOBJ_EXT
  1318. pcpu_mem_free(chunk->obj_exts);
  1319. #endif
  1320. pcpu_mem_free(chunk->md_blocks);
  1321. pcpu_mem_free(chunk->bound_map);
  1322. pcpu_mem_free(chunk->alloc_map);
  1323. pcpu_mem_free(chunk);
  1324. }
  1325. /**
  1326. * pcpu_chunk_populated - post-population bookkeeping
  1327. * @chunk: pcpu_chunk which got populated
  1328. * @page_start: the start page
  1329. * @page_end: the end page
  1330. *
  1331. * Pages in [@page_start,@page_end) have been populated to @chunk. Update
  1332. * the bookkeeping information accordingly. Must be called after each
  1333. * successful population.
  1334. */
  1335. static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
  1336. int page_end)
  1337. {
  1338. int nr = page_end - page_start;
  1339. lockdep_assert_held(&pcpu_lock);
  1340. bitmap_set(chunk->populated, page_start, nr);
  1341. chunk->nr_populated += nr;
  1342. pcpu_nr_populated += nr;
  1343. pcpu_update_empty_pages(chunk, nr);
  1344. }
  1345. /**
  1346. * pcpu_chunk_depopulated - post-depopulation bookkeeping
  1347. * @chunk: pcpu_chunk which got depopulated
  1348. * @page_start: the start page
  1349. * @page_end: the end page
  1350. *
  1351. * Pages in [@page_start,@page_end) have been depopulated from @chunk.
  1352. * Update the bookkeeping information accordingly. Must be called after
  1353. * each successful depopulation.
  1354. */
  1355. static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  1356. int page_start, int page_end)
  1357. {
  1358. int nr = page_end - page_start;
  1359. lockdep_assert_held(&pcpu_lock);
  1360. bitmap_clear(chunk->populated, page_start, nr);
  1361. chunk->nr_populated -= nr;
  1362. pcpu_nr_populated -= nr;
  1363. pcpu_update_empty_pages(chunk, -nr);
  1364. }
  1365. /*
  1366. * Chunk management implementation.
  1367. *
  1368. * To allow different implementations, chunk alloc/free and
  1369. * [de]population are implemented in a separate file which is pulled
  1370. * into this file and compiled together. The following functions
  1371. * should be implemented.
  1372. *
  1373. * pcpu_populate_chunk - populate the specified range of a chunk
  1374. * pcpu_depopulate_chunk - depopulate the specified range of a chunk
  1375. * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk
  1376. * pcpu_create_chunk - create a new chunk
  1377. * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page - translate an address to its backing struct page
  1379. * pcpu_verify_alloc_info - check alloc_info is acceptable during init
  1380. */
  1381. static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
  1382. int page_start, int page_end, gfp_t gfp);
  1383. static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
  1384. int page_start, int page_end);
  1385. static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
  1386. int page_start, int page_end);
  1387. static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
  1388. static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
  1389. static struct page *pcpu_addr_to_page(void *addr);
  1390. static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
  1391. #ifdef CONFIG_NEED_PER_CPU_KM
  1392. #include "percpu-km.c"
  1393. #else
  1394. #include "percpu-vm.c"
  1395. #endif
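/*
 * With either backing implementation, page_start and page_end are
 * chunk-relative page indices. For example, the populate call issued from
 * pcpu_alloc_noprof() below,
 *
 *   pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
 *
 * asks the backend to make pages [rs, re) of each unit in the chunk usable
 * before pcpu_chunk_populated() marks them in chunk->populated.
 */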
  1396. /**
  1397. * pcpu_chunk_addr_search - determine chunk containing specified address
  1398. * @addr: address for which the chunk needs to be determined.
  1399. *
  1400. * This is an internal function that handles all but static allocations.
  1401. * Static percpu address values should never be passed into the allocator.
  1402. *
  1403. * RETURNS:
  1404. * The address of the found chunk.
  1405. */
  1406. static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  1407. {
  1408. /* is it in the dynamic region (first chunk)? */
  1409. if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
  1410. return pcpu_first_chunk;
  1411. /* is it in the reserved region? */
  1412. if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
  1413. return pcpu_reserved_chunk;
  1414. /*
  1415. * The address is relative to unit0 which might be unused and
  1416. * thus unmapped. Offset the address to the unit space of the
  1417. * current processor before looking it up in the vmalloc
  1418. * space. Note that any possible cpu id can be used here, so
  1419. * there's no need to worry about preemption or cpu hotplug.
  1420. */
  1421. addr += pcpu_unit_offsets[raw_smp_processor_id()];
  1422. return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
  1423. }
  1424. #ifdef CONFIG_MEMCG
  1425. static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
  1426. struct obj_cgroup **objcgp)
  1427. {
  1428. struct obj_cgroup *objcg;
  1429. if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
  1430. return true;
  1431. objcg = current_obj_cgroup();
  1432. if (!objcg)
  1433. return true;
  1434. if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
  1435. return false;
  1436. *objcgp = objcg;
  1437. return true;
  1438. }
  1439. static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
  1440. struct pcpu_chunk *chunk, int off,
  1441. size_t size)
  1442. {
  1443. if (!objcg)
  1444. return;
  1445. if (likely(chunk && chunk->obj_exts)) {
  1446. obj_cgroup_get(objcg);
  1447. chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;
  1448. rcu_read_lock();
  1449. mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
  1450. pcpu_obj_full_size(size));
  1451. rcu_read_unlock();
  1452. } else {
  1453. obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
  1454. }
  1455. }
  1456. static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
  1457. {
  1458. struct obj_cgroup *objcg;
  1459. if (unlikely(!chunk->obj_exts))
  1460. return;
  1461. objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup;
  1462. if (!objcg)
  1463. return;
  1464. chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL;
  1465. obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
  1466. rcu_read_lock();
  1467. mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
  1468. -pcpu_obj_full_size(size));
  1469. rcu_read_unlock();
  1470. obj_cgroup_put(objcg);
  1471. }
  1472. #else /* CONFIG_MEMCG */
  1473. static bool
  1474. pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
  1475. {
  1476. return true;
  1477. }
  1478. static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
  1479. struct pcpu_chunk *chunk, int off,
  1480. size_t size)
  1481. {
  1482. }
  1483. static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
  1484. {
  1485. }
  1486. #endif /* CONFIG_MEMCG */
  1487. #ifdef CONFIG_MEM_ALLOC_PROFILING
  1488. static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
  1489. size_t size)
  1490. {
  1491. if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) {
  1492. alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
  1493. current->alloc_tag, size);
  1494. }
  1495. }
  1496. static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
  1497. {
  1498. if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts))
  1499. alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
  1500. }
  1501. #else
  1502. static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
  1503. size_t size)
  1504. {
  1505. }
  1506. static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
  1507. {
  1508. }
  1509. #endif
  1510. /**
  1511. * pcpu_alloc - the percpu allocator
  1512. * @size: size of area to allocate in bytes
  1513. * @align: alignment of area (max PAGE_SIZE)
  1514. * @reserved: allocate from the reserved chunk if available
  1515. * @gfp: allocation flags
  1516. *
  1517. * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
  1518. * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
  1519. * then no warning will be triggered on invalid or failed allocation
  1520. * requests.
  1521. *
  1522. * RETURNS:
  1523. * Percpu pointer to the allocated area on success, NULL on failure.
  1524. */
  1525. void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
  1526. gfp_t gfp)
  1527. {
  1528. gfp_t pcpu_gfp;
  1529. bool is_atomic;
  1530. bool do_warn;
  1531. struct obj_cgroup *objcg = NULL;
  1532. static int warn_limit = 10;
  1533. struct pcpu_chunk *chunk, *next;
  1534. const char *err;
  1535. int slot, off, cpu, ret;
  1536. unsigned long flags;
  1537. void __percpu *ptr;
  1538. size_t bits, bit_align;
  1539. gfp = current_gfp_context(gfp);
  1540. /* whitelisted flags that can be passed to the backing allocators */
  1541. pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
  1542. is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
  1543. do_warn = !(gfp & __GFP_NOWARN);
  1544. /*
  1545. * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
  1546. * therefore alignment must be a minimum of that many bytes.
 * An allocation may have internal fragmentation of up to
 * PCPU_MIN_ALLOC_SIZE - 1 bytes from rounding up to that size.
  1549. */
  1550. if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
  1551. align = PCPU_MIN_ALLOC_SIZE;
  1552. size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
  1553. bits = size >> PCPU_MIN_ALLOC_SHIFT;
  1554. bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
  1555. if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
  1556. !is_power_of_2(align))) {
  1557. WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
  1558. size, align);
  1559. return NULL;
  1560. }
  1561. if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
  1562. return NULL;
  1563. if (!is_atomic) {
  1564. /*
  1565. * pcpu_balance_workfn() allocates memory under this mutex,
  1566. * and it may wait for memory reclaim. Allow current task
  1567. * to become OOM victim, in case of memory pressure.
  1568. */
  1569. if (gfp & __GFP_NOFAIL) {
  1570. mutex_lock(&pcpu_alloc_mutex);
  1571. } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
  1572. pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
  1573. return NULL;
  1574. }
  1575. }
  1576. spin_lock_irqsave(&pcpu_lock, flags);
  1577. /* serve reserved allocations from the reserved chunk if available */
  1578. if (reserved && pcpu_reserved_chunk) {
  1579. chunk = pcpu_reserved_chunk;
  1580. off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
  1581. if (off < 0) {
  1582. err = "alloc from reserved chunk failed";
  1583. goto fail_unlock;
  1584. }
  1585. off = pcpu_alloc_area(chunk, bits, bit_align, off);
  1586. if (off >= 0)
  1587. goto area_found;
  1588. err = "alloc from reserved chunk failed";
  1589. goto fail_unlock;
  1590. }
  1591. restart:
  1592. /* search through normal chunks */
  1593. for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
  1594. list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
  1595. list) {
  1596. off = pcpu_find_block_fit(chunk, bits, bit_align,
  1597. is_atomic);
  1598. if (off < 0) {
  1599. if (slot < PCPU_SLOT_FAIL_THRESHOLD)
  1600. pcpu_chunk_move(chunk, 0);
  1601. continue;
  1602. }
  1603. off = pcpu_alloc_area(chunk, bits, bit_align, off);
  1604. if (off >= 0) {
  1605. pcpu_reintegrate_chunk(chunk);
  1606. goto area_found;
  1607. }
  1608. }
  1609. }
  1610. spin_unlock_irqrestore(&pcpu_lock, flags);
  1611. if (is_atomic) {
  1612. err = "atomic alloc failed, no space left";
  1613. goto fail;
  1614. }
  1615. /* No space left. Create a new chunk. */
  1616. if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
  1617. chunk = pcpu_create_chunk(pcpu_gfp);
  1618. if (!chunk) {
  1619. err = "failed to allocate new chunk";
  1620. goto fail;
  1621. }
  1622. spin_lock_irqsave(&pcpu_lock, flags);
  1623. pcpu_chunk_relocate(chunk, -1);
  1624. } else {
  1625. spin_lock_irqsave(&pcpu_lock, flags);
  1626. }
  1627. goto restart;
  1628. area_found:
  1629. pcpu_stats_area_alloc(chunk, size);
  1630. spin_unlock_irqrestore(&pcpu_lock, flags);
  1631. /* populate if not all pages are already there */
  1632. if (!is_atomic) {
  1633. unsigned int page_end, rs, re;
  1634. rs = PFN_DOWN(off);
  1635. page_end = PFN_UP(off + size);
  1636. for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
  1637. WARN_ON(chunk->immutable);
  1638. ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
  1639. spin_lock_irqsave(&pcpu_lock, flags);
  1640. if (ret) {
  1641. pcpu_free_area(chunk, off);
  1642. err = "failed to populate";
  1643. goto fail_unlock;
  1644. }
  1645. pcpu_chunk_populated(chunk, rs, re);
  1646. spin_unlock_irqrestore(&pcpu_lock, flags);
  1647. }
  1648. mutex_unlock(&pcpu_alloc_mutex);
  1649. }
  1650. if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
  1651. pcpu_schedule_balance_work();
  1652. /* clear the areas and return address relative to base address */
  1653. for_each_possible_cpu(cpu)
  1654. memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
  1655. ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
  1656. kmemleak_alloc_percpu(ptr, size, gfp);
  1657. trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
  1658. chunk->base_addr, off, ptr,
  1659. pcpu_obj_full_size(size), gfp);
  1660. pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
  1661. pcpu_alloc_tag_alloc_hook(chunk, off, size);
  1662. return ptr;
  1663. fail_unlock:
  1664. spin_unlock_irqrestore(&pcpu_lock, flags);
  1665. fail:
  1666. trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
  1667. if (do_warn && warn_limit) {
  1668. pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
  1669. size, align, is_atomic, err);
  1670. if (!is_atomic)
  1671. dump_stack();
  1672. if (!--warn_limit)
  1673. pr_info("limit reached, disable warning\n");
  1674. }
  1675. if (is_atomic) {
  1676. /* see the flag handling in pcpu_balance_workfn() */
  1677. pcpu_atomic_alloc_failed = true;
  1678. pcpu_schedule_balance_work();
  1679. } else {
  1680. mutex_unlock(&pcpu_alloc_mutex);
  1681. }
  1682. pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
  1683. return NULL;
  1684. }
  1685. EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);
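/*
 * A usage sketch of the API that funnels into pcpu_alloc_noprof() via the
 * wrappers in include/linux/percpu.h (the "demo_stats" structure is purely
 * hypothetical and error handling is trimmed):
 *
 *   struct demo_stats { unsigned long hits; };
 *   struct demo_stats __percpu *stats;
 *   unsigned long total = 0;
 *   int cpu;
 *
 *   stats = alloc_percpu_gfp(struct demo_stats, GFP_KERNEL);
 *   if (!stats)
 *       return -ENOMEM;
 *   this_cpu_inc(stats->hits);
 *   for_each_possible_cpu(cpu)
 *       total += per_cpu_ptr(stats, cpu)->hits;
 *   free_percpu(stats);
 *
 * this_cpu_inc() updates the local CPU's copy without locking, the summation
 * walks every possible unit, and passing a gfp mask without GFP_KERNEL (e.g.
 * GFP_NOWAIT) makes the allocation atomic so it is served only from already
 * populated pages, per the is_atomic handling above.
 */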
  1686. /**
  1687. * pcpu_balance_free - manage the amount of free chunks
  1688. * @empty_only: free chunks only if there are no populated pages
  1689. *
  1690. * If empty_only is %false, reclaim all fully free chunks regardless of the
  1691. * number of populated pages. Otherwise, only reclaim chunks that have no
  1692. * populated pages.
  1693. *
  1694. * CONTEXT:
  1695. * pcpu_lock (can be dropped temporarily)
  1696. */
  1697. static void pcpu_balance_free(bool empty_only)
  1698. {
  1699. LIST_HEAD(to_free);
  1700. struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
  1701. struct pcpu_chunk *chunk, *next;
  1702. lockdep_assert_held(&pcpu_lock);
  1703. /*
  1704. * There's no reason to keep around multiple unused chunks and VM
  1705. * areas can be scarce. Destroy all free chunks except for one.
  1706. */
  1707. list_for_each_entry_safe(chunk, next, free_head, list) {
  1708. WARN_ON(chunk->immutable);
  1709. /* spare the first one */
  1710. if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
  1711. continue;
  1712. if (!empty_only || chunk->nr_empty_pop_pages == 0)
  1713. list_move(&chunk->list, &to_free);
  1714. }
  1715. if (list_empty(&to_free))
  1716. return;
  1717. spin_unlock_irq(&pcpu_lock);
  1718. list_for_each_entry_safe(chunk, next, &to_free, list) {
  1719. unsigned int rs, re;
  1720. for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
  1721. pcpu_depopulate_chunk(chunk, rs, re);
  1722. spin_lock_irq(&pcpu_lock);
  1723. pcpu_chunk_depopulated(chunk, rs, re);
  1724. spin_unlock_irq(&pcpu_lock);
  1725. }
  1726. pcpu_destroy_chunk(chunk);
  1727. cond_resched();
  1728. }
  1729. spin_lock_irq(&pcpu_lock);
  1730. }
  1731. /**
  1732. * pcpu_balance_populated - manage the amount of populated pages
  1733. *
  1734. * Maintain a certain amount of populated pages to satisfy atomic allocations.
  1735. * It is possible that this is called when physical memory is scarce causing
  1736. * OOM killer to be triggered. We should avoid doing so until an actual
  1737. * allocation causes the failure as it is possible that requests can be
  1738. * serviced from already backed regions.
  1739. *
  1740. * CONTEXT:
  1741. * pcpu_lock (can be dropped temporarily)
  1742. */
  1743. static void pcpu_balance_populated(void)
  1744. {
  1745. /* gfp flags passed to underlying allocators */
  1746. const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
  1747. struct pcpu_chunk *chunk;
  1748. int slot, nr_to_pop, ret;
  1749. lockdep_assert_held(&pcpu_lock);
  1750. /*
  1751. * Ensure there are certain number of free populated pages for
  1752. * atomic allocs. Fill up from the most packed so that atomic
  1753. * allocs don't increase fragmentation. If atomic allocation
  1754. * failed previously, always populate the maximum amount. This
  1755. * should prevent atomic allocs larger than PAGE_SIZE from keeping
  1756. * failing indefinitely; however, large atomic allocs are not
  1757. * something we support properly and can be highly unreliable and
  1758. * inefficient.
  1759. */
  1760. retry_pop:
  1761. if (pcpu_atomic_alloc_failed) {
  1762. nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
  1763. /* best effort anyway, don't worry about synchronization */
  1764. pcpu_atomic_alloc_failed = false;
  1765. } else {
  1766. nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
  1767. pcpu_nr_empty_pop_pages,
  1768. 0, PCPU_EMPTY_POP_PAGES_HIGH);
  1769. }
  1770. for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
  1771. unsigned int nr_unpop = 0, rs, re;
  1772. if (!nr_to_pop)
  1773. break;
  1774. list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
  1775. nr_unpop = chunk->nr_pages - chunk->nr_populated;
  1776. if (nr_unpop)
  1777. break;
  1778. }
  1779. if (!nr_unpop)
  1780. continue;
  1781. /* @chunk can't go away while pcpu_alloc_mutex is held */
  1782. for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
  1783. int nr = min_t(int, re - rs, nr_to_pop);
  1784. spin_unlock_irq(&pcpu_lock);
  1785. ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
  1786. cond_resched();
  1787. spin_lock_irq(&pcpu_lock);
  1788. if (!ret) {
  1789. nr_to_pop -= nr;
  1790. pcpu_chunk_populated(chunk, rs, rs + nr);
  1791. } else {
  1792. nr_to_pop = 0;
  1793. }
  1794. if (!nr_to_pop)
  1795. break;
  1796. }
  1797. }
  1798. if (nr_to_pop) {
  1799. /* ran out of chunks to populate, create a new one and retry */
  1800. spin_unlock_irq(&pcpu_lock);
  1801. chunk = pcpu_create_chunk(gfp);
  1802. cond_resched();
  1803. spin_lock_irq(&pcpu_lock);
  1804. if (chunk) {
  1805. pcpu_chunk_relocate(chunk, -1);
  1806. goto retry_pop;
  1807. }
  1808. }
  1809. }
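/*
 * As an example of the target calculation above: if PCPU_EMPTY_POP_PAGES_HIGH
 * were 4 and a single empty populated page remained, nr_to_pop would be
 * clamp(4 - 1, 0, 4) = 3, while a preceding atomic allocation failure requests
 * the full PCPU_EMPTY_POP_PAGES_HIGH regardless of the current count.
 */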
  1810. /**
  1811. * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
  1812. *
  1813. * Scan over chunks in the depopulate list and try to release unused populated
  1814. * pages back to the system. Depopulated chunks are sidelined to prevent
  1815. * repopulating these pages unless required. Fully free chunks are reintegrated
  1816. * and freed accordingly (1 is kept around). If we drop below the empty
  1817. * populated pages threshold, reintegrate the chunk if it has empty free pages.
  1818. * Each chunk is scanned in the reverse order to keep populated pages close to
  1819. * the beginning of the chunk.
  1820. *
  1821. * CONTEXT:
  1822. * pcpu_lock (can be dropped temporarily)
  1823. *
  1824. */
  1825. static void pcpu_reclaim_populated(void)
  1826. {
  1827. struct pcpu_chunk *chunk;
  1828. struct pcpu_block_md *block;
  1829. int freed_page_start, freed_page_end;
  1830. int i, end;
  1831. bool reintegrate;
  1832. lockdep_assert_held(&pcpu_lock);
  1833. /*
  1834. * Once a chunk is isolated to the to_depopulate list, the chunk is no
 * longer discoverable to allocations which may populate pages. The only
 * other accessor is the free path, which only returns areas back to the
 * allocator without touching the populated bitmap.
  1838. */
  1839. while ((chunk = list_first_entry_or_null(
  1840. &pcpu_chunk_lists[pcpu_to_depopulate_slot],
  1841. struct pcpu_chunk, list))) {
  1842. WARN_ON(chunk->immutable);
  1843. /*
  1844. * Scan chunk's pages in the reverse order to keep populated
  1845. * pages close to the beginning of the chunk.
  1846. */
  1847. freed_page_start = chunk->nr_pages;
  1848. freed_page_end = 0;
  1849. reintegrate = false;
  1850. for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
  1851. /* no more work to do */
  1852. if (chunk->nr_empty_pop_pages == 0)
  1853. break;
  1854. /* reintegrate chunk to prevent atomic alloc failures */
  1855. if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
  1856. reintegrate = true;
  1857. break;
  1858. }
  1859. /*
  1860. * If the page is empty and populated, start or
  1861. * extend the (i, end) range. If i == 0, decrease
  1862. * i and perform the depopulation to cover the last
  1863. * (first) page in the chunk.
  1864. */
  1865. block = chunk->md_blocks + i;
  1866. if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
  1867. test_bit(i, chunk->populated)) {
  1868. if (end == -1)
  1869. end = i;
  1870. if (i > 0)
  1871. continue;
  1872. i--;
  1873. }
  1874. /* depopulate if there is an active range */
  1875. if (end == -1)
  1876. continue;
  1877. spin_unlock_irq(&pcpu_lock);
  1878. pcpu_depopulate_chunk(chunk, i + 1, end + 1);
  1879. cond_resched();
  1880. spin_lock_irq(&pcpu_lock);
  1881. pcpu_chunk_depopulated(chunk, i + 1, end + 1);
  1882. freed_page_start = min(freed_page_start, i + 1);
  1883. freed_page_end = max(freed_page_end, end + 1);
  1884. /* reset the range and continue */
  1885. end = -1;
  1886. }
  1887. /* batch tlb flush per chunk to amortize cost */
  1888. if (freed_page_start < freed_page_end) {
  1889. spin_unlock_irq(&pcpu_lock);
  1890. pcpu_post_unmap_tlb_flush(chunk,
  1891. freed_page_start,
  1892. freed_page_end);
  1893. cond_resched();
  1894. spin_lock_irq(&pcpu_lock);
  1895. }
  1896. if (reintegrate || chunk->free_bytes == pcpu_unit_size)
  1897. pcpu_reintegrate_chunk(chunk);
  1898. else
  1899. list_move_tail(&chunk->list,
  1900. &pcpu_chunk_lists[pcpu_sidelined_slot]);
  1901. }
  1902. }
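/*
 * An illustrative walk of the reverse scan above, for a hypothetical 8-page
 * chunk where only pages 2-4 are both empty and populated: the loop records
 * end = 4 at i = 4, keeps extending through i = 3 and i = 2, and depopulates
 * pages [2, 5) once it reaches the busy page at i = 1. If such a run extends
 * all the way to page 0, the "i--" above lets the same fall-through path
 * depopulate pages [0, end + 1) on the final iteration.
 */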
  1903. /**
  1904. * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  1905. * @work: unused
  1906. *
 * Manage the number of fully free chunks and the number of populated pages. An
 * important thing to consider is when pages are freed and how they contribute
 * to the global counts.
  1910. */
  1911. static void pcpu_balance_workfn(struct work_struct *work)
  1912. {
  1913. /*
  1914. * pcpu_balance_free() is called twice because the first time we may
  1915. * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
  1916. * to grow other chunks. This then gives pcpu_reclaim_populated() time
  1917. * to move fully free chunks to the active list to be freed if
  1918. * appropriate.
  1919. */
  1920. mutex_lock(&pcpu_alloc_mutex);
  1921. spin_lock_irq(&pcpu_lock);
  1922. pcpu_balance_free(false);
  1923. pcpu_reclaim_populated();
  1924. pcpu_balance_populated();
  1925. pcpu_balance_free(true);
  1926. spin_unlock_irq(&pcpu_lock);
  1927. mutex_unlock(&pcpu_alloc_mutex);
  1928. }
  1929. /**
  1930. * free_percpu - free percpu area
  1931. * @ptr: pointer to area to free
  1932. *
  1933. * Free percpu area @ptr.
  1934. *
  1935. * CONTEXT:
  1936. * Can be called from atomic context.
  1937. */
  1938. void free_percpu(void __percpu *ptr)
  1939. {
  1940. void *addr;
  1941. struct pcpu_chunk *chunk;
  1942. unsigned long flags;
  1943. int size, off;
  1944. bool need_balance = false;
  1945. if (!ptr)
  1946. return;
  1947. kmemleak_free_percpu(ptr);
  1948. addr = __pcpu_ptr_to_addr(ptr);
  1949. chunk = pcpu_chunk_addr_search(addr);
  1950. off = addr - chunk->base_addr;
  1951. spin_lock_irqsave(&pcpu_lock, flags);
  1952. size = pcpu_free_area(chunk, off);
  1953. pcpu_alloc_tag_free_hook(chunk, off, size);
  1954. pcpu_memcg_free_hook(chunk, off, size);
  1955. /*
 * If there is more than one fully free chunk, wake up the grim reaper.
  1957. * If the chunk is isolated, it may be in the process of being
  1958. * reclaimed. Let reclaim manage cleaning up of that chunk.
  1959. */
  1960. if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
  1961. struct pcpu_chunk *pos;
  1962. list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
  1963. if (pos != chunk) {
  1964. need_balance = true;
  1965. break;
  1966. }
  1967. } else if (pcpu_should_reclaim_chunk(chunk)) {
  1968. pcpu_isolate_chunk(chunk);
  1969. need_balance = true;
  1970. }
  1971. trace_percpu_free_percpu(chunk->base_addr, off, ptr);
  1972. spin_unlock_irqrestore(&pcpu_lock, flags);
  1973. if (need_balance)
  1974. pcpu_schedule_balance_work();
  1975. }
  1976. EXPORT_SYMBOL_GPL(free_percpu);
  1977. bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
  1978. {
  1979. #ifdef CONFIG_SMP
  1980. const size_t static_size = __per_cpu_end - __per_cpu_start;
  1981. void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  1982. unsigned int cpu;
  1983. for_each_possible_cpu(cpu) {
  1984. void *start = per_cpu_ptr(base, cpu);
  1985. void *va = (void *)addr;
  1986. if (va >= start && va < start + static_size) {
  1987. if (can_addr) {
  1988. *can_addr = (unsigned long) (va - start);
  1989. *can_addr += (unsigned long)
  1990. per_cpu_ptr(base, get_boot_cpu_id());
  1991. }
  1992. return true;
  1993. }
  1994. }
  1995. #endif
  1996. /* on UP, can't distinguish from other static vars, always false */
  1997. return false;
  1998. }
  1999. /**
  2000. * is_kernel_percpu_address - test whether address is from static percpu area
  2001. * @addr: address to test
  2002. *
  2003. * Test whether @addr belongs to in-kernel static percpu area. Module
  2004. * static percpu areas are not considered. For those, use
  2005. * is_module_percpu_address().
  2006. *
  2007. * RETURNS:
  2008. * %true if @addr is from in-kernel static percpu area, %false otherwise.
  2009. */
  2010. bool is_kernel_percpu_address(unsigned long addr)
  2011. {
  2012. return __is_kernel_percpu_address(addr, NULL);
  2013. }
  2014. /**
  2015. * per_cpu_ptr_to_phys - convert translated percpu address to physical address
  2016. * @addr: the address to be converted to physical address
  2017. *
 * Given @addr, which is a dereferenceable address obtained via one of the
 * percpu access macros, this function translates it into its physical
  2020. * address. The caller is responsible for ensuring @addr stays valid
  2021. * until this function finishes.
  2022. *
 * The percpu allocator has a special setup for the first chunk, which currently
 * supports either embedding in the linear address space or vmalloc mapping.
 * From the second chunk onwards, the backing allocator (currently either vm or
 * km) provides the translation.
  2027. *
  2028. * The addr can be translated simply without checking if it falls into the
 * first chunk. But the current code better reflects how the percpu allocator
 * actually works, and the verification can discover bugs both in the percpu
 * allocator itself and in per_cpu_ptr_to_phys() callers. So we keep the
 * current code.
  2033. *
  2034. * RETURNS:
  2035. * The physical address for @addr.
  2036. */
  2037. phys_addr_t per_cpu_ptr_to_phys(void *addr)
  2038. {
  2039. void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  2040. bool in_first_chunk = false;
  2041. unsigned long first_low, first_high;
  2042. unsigned int cpu;
  2043. /*
  2044. * The following test on unit_low/high isn't strictly
  2045. * necessary but will speed up lookups of addresses which
  2046. * aren't in the first chunk.
  2047. *
  2048. * The address check is against full chunk sizes. pcpu_base_addr
  2049. * points to the beginning of the first chunk including the
  2050. * static region. Assumes good intent as the first chunk may
 * not be full (i.e. < pcpu_unit_pages in size).
  2052. */
  2053. first_low = (unsigned long)pcpu_base_addr +
  2054. pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
  2055. first_high = (unsigned long)pcpu_base_addr +
  2056. pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
  2057. if ((unsigned long)addr >= first_low &&
  2058. (unsigned long)addr < first_high) {
  2059. for_each_possible_cpu(cpu) {
  2060. void *start = per_cpu_ptr(base, cpu);
  2061. if (addr >= start && addr < start + pcpu_unit_size) {
  2062. in_first_chunk = true;
  2063. break;
  2064. }
  2065. }
  2066. }
  2067. if (in_first_chunk) {
  2068. if (!is_vmalloc_addr(addr))
  2069. return __pa(addr);
  2070. else
  2071. return page_to_phys(vmalloc_to_page(addr)) +
  2072. offset_in_page(addr);
  2073. } else
  2074. return page_to_phys(pcpu_addr_to_page(addr)) +
  2075. offset_in_page(addr);
  2076. }
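/*
 * A usage sketch (hypothetical "buf" pointer): code that must hand a per-CPU
 * buffer to hardware or firmware resolves the address for a specific CPU first
 * and then converts it,
 *
 *   phys_addr_t pa = per_cpu_ptr_to_phys(per_cpu_ptr(buf, cpu));
 *
 * i.e. the argument is the dereferenceable per-CPU address, not the __percpu
 * pointer returned by the allocator.
 */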
  2077. /**
  2078. * pcpu_alloc_alloc_info - allocate percpu allocation info
  2079. * @nr_groups: the number of groups
  2080. * @nr_units: the number of units
  2081. *
  2082. * Allocate ai which is large enough for @nr_groups groups containing
  2083. * @nr_units units. The returned ai's groups[0].cpu_map points to the
  2084. * cpu_map array which is long enough for @nr_units and filled with
  2085. * NR_CPUS. It's the caller's responsibility to initialize cpu_map
  2086. * pointer of other groups.
  2087. *
  2088. * RETURNS:
  2089. * Pointer to the allocated pcpu_alloc_info on success, NULL on
  2090. * failure.
  2091. */
  2092. struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
  2093. int nr_units)
  2094. {
  2095. struct pcpu_alloc_info *ai;
  2096. size_t base_size, ai_size;
  2097. void *ptr;
  2098. int unit;
  2099. base_size = ALIGN(struct_size(ai, groups, nr_groups),
  2100. __alignof__(ai->groups[0].cpu_map[0]));
  2101. ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
  2102. ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
  2103. if (!ptr)
  2104. return NULL;
  2105. ai = ptr;
  2106. ptr += base_size;
  2107. ai->groups[0].cpu_map = ptr;
  2108. for (unit = 0; unit < nr_units; unit++)
  2109. ai->groups[0].cpu_map[unit] = NR_CPUS;
  2110. ai->nr_groups = nr_groups;
  2111. ai->__ai_size = PFN_ALIGN(ai_size);
  2112. return ai;
  2113. }
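/*
 * Sketch of the resulting layout (one allocation, rounded up to a whole page):
 *
 *   [ pcpu_alloc_info | groups[0..nr_groups-1] | pad | cpu_map[0..nr_units-1] ]
 *
 * groups[0].cpu_map points at the trailing array, pre-filled with NR_CPUS;
 * callers such as pcpu_build_alloc_info() below carve it up by advancing the
 * pointer for each subsequent group.
 */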
  2114. /**
  2115. * pcpu_free_alloc_info - free percpu allocation info
  2116. * @ai: pcpu_alloc_info to free
  2117. *
  2118. * Free @ai which was allocated by pcpu_alloc_alloc_info().
  2119. */
  2120. void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
  2121. {
  2122. memblock_free(ai, ai->__ai_size);
  2123. }
  2124. /**
  2125. * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
  2126. * @lvl: loglevel
  2127. * @ai: allocation info to dump
  2128. *
  2129. * Print out information about @ai using loglevel @lvl.
  2130. */
  2131. static void pcpu_dump_alloc_info(const char *lvl,
  2132. const struct pcpu_alloc_info *ai)
  2133. {
  2134. int group_width = 1, cpu_width = 1, width;
  2135. char empty_str[] = "--------";
  2136. int alloc = 0, alloc_end = 0;
  2137. int group, v;
  2138. int upa, apl; /* units per alloc, allocs per line */
  2139. v = ai->nr_groups;
  2140. while (v /= 10)
  2141. group_width++;
  2142. v = num_possible_cpus();
  2143. while (v /= 10)
  2144. cpu_width++;
  2145. empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
  2146. upa = ai->alloc_size / ai->unit_size;
  2147. width = upa * (cpu_width + 1) + group_width + 3;
  2148. apl = rounddown_pow_of_two(max(60 / width, 1));
  2149. printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
  2150. lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
  2151. ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
  2152. for (group = 0; group < ai->nr_groups; group++) {
  2153. const struct pcpu_group_info *gi = &ai->groups[group];
  2154. int unit = 0, unit_end = 0;
  2155. BUG_ON(gi->nr_units % upa);
  2156. for (alloc_end += gi->nr_units / upa;
  2157. alloc < alloc_end; alloc++) {
  2158. if (!(alloc % apl)) {
  2159. pr_cont("\n");
  2160. printk("%spcpu-alloc: ", lvl);
  2161. }
  2162. pr_cont("[%0*d] ", group_width, group);
  2163. for (unit_end += upa; unit < unit_end; unit++)
  2164. if (gi->cpu_map[unit] != NR_CPUS)
  2165. pr_cont("%0*d ",
  2166. cpu_width, gi->cpu_map[unit]);
  2167. else
  2168. pr_cont("%s ", empty_str);
  2169. }
  2170. }
  2171. pr_cont("\n");
  2172. }
  2173. /**
  2174. * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
  2176. * @base_addr: mapped address
  2177. *
  2178. * Initialize the first percpu chunk which contains the kernel static
  2179. * percpu area. This function is to be called from arch percpu area
  2180. * setup path.
  2181. *
  2182. * @ai contains all information necessary to initialize the first
  2183. * chunk and prime the dynamic percpu allocator.
  2184. *
  2185. * @ai->static_size is the size of static percpu area.
  2186. *
  2187. * @ai->reserved_size, if non-zero, specifies the amount of bytes to
  2188. * reserve after the static area in the first chunk. This reserves
  2189. * the first chunk such that it's available only through reserved
  2190. * percpu allocation. This is primarily used to serve module percpu
  2191. * static areas on architectures where the addressing model has
  2192. * limited offset range for symbol relocations to guarantee module
  2193. * percpu symbols fall inside the relocatable range.
  2194. *
  2195. * @ai->dyn_size determines the number of bytes available for dynamic
  2196. * allocation in the first chunk. The area between @ai->static_size +
  2197. * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
  2198. *
  2199. * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
  2200. * and equal to or larger than @ai->static_size + @ai->reserved_size +
  2201. * @ai->dyn_size.
  2202. *
  2203. * @ai->atom_size is the allocation atom size and used as alignment
  2204. * for vm areas.
  2205. *
  2206. * @ai->alloc_size is the allocation size and always multiple of
  2207. * @ai->atom_size. This is larger than @ai->atom_size if
  2208. * @ai->unit_size is larger than @ai->atom_size.
  2209. *
  2210. * @ai->nr_groups and @ai->groups describe virtual memory layout of
  2211. * percpu areas. Units which should be colocated are put into the
  2212. * same group. Dynamic VM areas will be allocated according to these
  2213. * groupings. If @ai->nr_groups is zero, a single group containing
  2214. * all units is assumed.
  2215. *
  2216. * The caller should have mapped the first chunk at @base_addr and
  2217. * copied static data to each unit.
  2218. *
  2219. * The first chunk will always contain a static and a dynamic region.
  2220. * However, the static region is not managed by any chunk. If the first
  2221. * chunk also contains a reserved region, it is served by two chunks -
  2222. * one for the reserved region and one for the dynamic region. They
  2223. * share the same vm, but use offset regions in the area allocation map.
  2224. * The chunk serving the dynamic region is circulated in the chunk slots
  2225. * and available for dynamic allocation like any other chunk.
  2226. */
  2227. void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
  2228. void *base_addr)
  2229. {
  2230. size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
  2231. size_t static_size, dyn_size;
  2232. unsigned long *group_offsets;
  2233. size_t *group_sizes;
  2234. unsigned long *unit_off;
  2235. unsigned int cpu;
  2236. int *unit_map;
  2237. int group, unit, i;
  2238. unsigned long tmp_addr;
  2239. size_t alloc_size;
  2240. #define PCPU_SETUP_BUG_ON(cond) do { \
  2241. if (unlikely(cond)) { \
  2242. pr_emerg("failed to initialize, %s\n", #cond); \
  2243. pr_emerg("cpu_possible_mask=%*pb\n", \
  2244. cpumask_pr_args(cpu_possible_mask)); \
  2245. pcpu_dump_alloc_info(KERN_EMERG, ai); \
  2246. BUG(); \
  2247. } \
  2248. } while (0)
  2249. /* sanity checks */
  2250. PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
  2251. #ifdef CONFIG_SMP
  2252. PCPU_SETUP_BUG_ON(!ai->static_size);
  2253. PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
  2254. #endif
  2255. PCPU_SETUP_BUG_ON(!base_addr);
  2256. PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
  2257. PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
  2258. PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
  2259. PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
  2260. PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
  2261. PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
  2262. PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
  2263. PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
  2264. IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
  2265. PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
  2266. /* process group information and build config tables accordingly */
  2267. alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
  2268. group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  2269. if (!group_offsets)
  2270. panic("%s: Failed to allocate %zu bytes\n", __func__,
  2271. alloc_size);
  2272. alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
  2273. group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  2274. if (!group_sizes)
  2275. panic("%s: Failed to allocate %zu bytes\n", __func__,
  2276. alloc_size);
  2277. alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
  2278. unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  2279. if (!unit_map)
  2280. panic("%s: Failed to allocate %zu bytes\n", __func__,
  2281. alloc_size);
  2282. alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
  2283. unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  2284. if (!unit_off)
  2285. panic("%s: Failed to allocate %zu bytes\n", __func__,
  2286. alloc_size);
  2287. for (cpu = 0; cpu < nr_cpu_ids; cpu++)
  2288. unit_map[cpu] = UINT_MAX;
  2289. pcpu_low_unit_cpu = NR_CPUS;
  2290. pcpu_high_unit_cpu = NR_CPUS;
  2291. for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
  2292. const struct pcpu_group_info *gi = &ai->groups[group];
  2293. group_offsets[group] = gi->base_offset;
  2294. group_sizes[group] = gi->nr_units * ai->unit_size;
  2295. for (i = 0; i < gi->nr_units; i++) {
  2296. cpu = gi->cpu_map[i];
  2297. if (cpu == NR_CPUS)
  2298. continue;
  2299. PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
  2300. PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
  2301. PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
  2302. unit_map[cpu] = unit + i;
  2303. unit_off[cpu] = gi->base_offset + i * ai->unit_size;
  2304. /* determine low/high unit_cpu */
  2305. if (pcpu_low_unit_cpu == NR_CPUS ||
  2306. unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
  2307. pcpu_low_unit_cpu = cpu;
  2308. if (pcpu_high_unit_cpu == NR_CPUS ||
  2309. unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
  2310. pcpu_high_unit_cpu = cpu;
  2311. }
  2312. }
  2313. pcpu_nr_units = unit;
  2314. for_each_possible_cpu(cpu)
  2315. PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
  2316. /* we're done parsing the input, undefine BUG macro and dump config */
  2317. #undef PCPU_SETUP_BUG_ON
  2318. pcpu_dump_alloc_info(KERN_DEBUG, ai);
  2319. pcpu_nr_groups = ai->nr_groups;
  2320. pcpu_group_offsets = group_offsets;
  2321. pcpu_group_sizes = group_sizes;
  2322. pcpu_unit_map = unit_map;
  2323. pcpu_unit_offsets = unit_off;
  2324. /* determine basic parameters */
  2325. pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
  2326. pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
  2327. pcpu_atom_size = ai->atom_size;
  2328. pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
  2329. BITS_TO_LONGS(pcpu_unit_pages));
  2330. pcpu_stats_save_ai(ai);
  2331. /*
  2332. * Allocate chunk slots. The slots after the active slots are:
  2333. * sidelined_slot - isolated, depopulated chunks
  2334. * free_slot - fully free chunks
  2335. * to_depopulate_slot - isolated, chunks to depopulate
  2336. */
  2337. pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
  2338. pcpu_free_slot = pcpu_sidelined_slot + 1;
  2339. pcpu_to_depopulate_slot = pcpu_free_slot + 1;
  2340. pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
  2341. pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
  2342. sizeof(pcpu_chunk_lists[0]),
  2343. SMP_CACHE_BYTES);
  2344. if (!pcpu_chunk_lists)
  2345. panic("%s: Failed to allocate %zu bytes\n", __func__,
  2346. pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
  2347. for (i = 0; i < pcpu_nr_slots; i++)
  2348. INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
  2349. /*
  2350. * The end of the static region needs to be aligned with the
  2351. * minimum allocation size as this offsets the reserved and
  2352. * dynamic region. The first chunk ends page aligned by
  2353. * expanding the dynamic region, therefore the dynamic region
  2354. * can be shrunk to compensate while still staying above the
  2355. * configured sizes.
  2356. */
  2357. static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
  2358. dyn_size = ai->dyn_size - (static_size - ai->static_size);
  2359. /*
  2360. * Initialize first chunk:
  2361. * This chunk is broken up into 3 parts:
  2362. * < static | [reserved] | dynamic >
  2363. * - static - there is no backing chunk because these allocations can
  2364. * never be freed.
  2365. * - reserved (pcpu_reserved_chunk) - exists primarily to serve
  2366. * allocations from module load.
  2367. * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
  2368. * chunk.
  2369. */
  2370. tmp_addr = (unsigned long)base_addr + static_size;
  2371. if (ai->reserved_size)
  2372. pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
  2373. ai->reserved_size);
  2374. tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
  2375. pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
  2376. pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
  2377. pcpu_chunk_relocate(pcpu_first_chunk, -1);
  2378. /* include all regions of the first chunk */
  2379. pcpu_nr_populated += PFN_DOWN(size_sum);
  2380. pcpu_stats_chunk_alloc();
  2381. trace_percpu_create_chunk(base_addr);
  2382. /* we're done */
  2383. pcpu_base_addr = base_addr;
  2384. }
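/*
 * An example of the split performed above, with 4K pages and hypothetical
 * sizes static_size = 600K, reserved_size = 8K, dyn_size = 24K: the reserved
 * chunk serves [base + 600K, base + 608K), pcpu_first_chunk serves
 * [base + 608K, base + 632K), and the static region itself is never handed to
 * any chunk. pcpu_nr_populated is then credited with PFN_DOWN(632K) = 158
 * pages for the whole first chunk.
 */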
  2385. #ifdef CONFIG_SMP
  2386. const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
  2387. [PCPU_FC_AUTO] = "auto",
  2388. [PCPU_FC_EMBED] = "embed",
  2389. [PCPU_FC_PAGE] = "page",
  2390. };
  2391. enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
  2392. static int __init percpu_alloc_setup(char *str)
  2393. {
  2394. if (!str)
  2395. return -EINVAL;
  2396. if (0)
  2397. /* nada */;
  2398. #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
  2399. else if (!strcmp(str, "embed"))
  2400. pcpu_chosen_fc = PCPU_FC_EMBED;
  2401. #endif
  2402. #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  2403. else if (!strcmp(str, "page"))
  2404. pcpu_chosen_fc = PCPU_FC_PAGE;
  2405. #endif
  2406. else
  2407. pr_warn("unknown allocator %s specified\n", str);
  2408. return 0;
  2409. }
  2410. early_param("percpu_alloc", percpu_alloc_setup);
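/*
 * This is the handler for the documented "percpu_alloc=" boot parameter. For
 * example, booting with
 *
 *   percpu_alloc=page
 *
 * selects the page-based first chunk allocator on architectures that provide
 * it, while "embed" keeps the first chunk in the kernel's linear mapping where
 * possible. The default, PCPU_FC_AUTO, lets the setup code pick a suitable
 * allocator.
 */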
  2411. /*
  2412. * pcpu_embed_first_chunk() is used by the generic percpu setup.
  2413. * Build it if needed by the arch config or the generic setup is going
  2414. * to be used.
  2415. */
  2416. #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
  2417. !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
  2418. #define BUILD_EMBED_FIRST_CHUNK
  2419. #endif
  2420. /* build pcpu_page_first_chunk() iff needed by the arch config */
  2421. #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
  2422. #define BUILD_PAGE_FIRST_CHUNK
  2423. #endif
  2424. /* pcpu_build_alloc_info() is used by both embed and page first chunk */
  2425. #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
  2426. /**
  2427. * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
  2428. * @reserved_size: the size of reserved percpu area in bytes
  2429. * @dyn_size: minimum free size for dynamic allocation in bytes
  2430. * @atom_size: allocation atom size
  2431. * @cpu_distance_fn: callback to determine distance between cpus, optional
  2432. *
  2433. * This function determines grouping of units, their mappings to cpus
  2434. * and other parameters considering needed percpu size, allocation
  2435. * atom size and distances between CPUs.
  2436. *
  2437. * Groups are always multiples of atom size and CPUs which are of
  2438. * LOCAL_DISTANCE both ways are grouped together and share space for
  2439. * units in the same group. The returned configuration is guaranteed
  2440. * to have CPUs on different nodes on different groups and >=75% usage
  2441. * of allocated virtual address space.
  2442. *
  2443. * RETURNS:
  2444. * On success, pointer to the new allocation_info is returned. On
  2445. * failure, ERR_PTR value is returned.
  2446. */
  2447. static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
  2448. size_t reserved_size, size_t dyn_size,
  2449. size_t atom_size,
  2450. pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
  2451. {
  2452. static int group_map[NR_CPUS] __initdata;
  2453. static int group_cnt[NR_CPUS] __initdata;
  2454. static struct cpumask mask __initdata;
  2455. const size_t static_size = __per_cpu_end - __per_cpu_start;
  2456. int nr_groups = 1, nr_units = 0;
  2457. size_t size_sum, min_unit_size, alloc_size;
  2458. int upa, max_upa, best_upa; /* units_per_alloc */
  2459. int last_allocs, group, unit;
  2460. unsigned int cpu, tcpu;
  2461. struct pcpu_alloc_info *ai;
  2462. unsigned int *cpu_map;
  2463. /* this function may be called multiple times */
  2464. memset(group_map, 0, sizeof(group_map));
  2465. memset(group_cnt, 0, sizeof(group_cnt));
  2466. cpumask_clear(&mask);
  2467. /* calculate size_sum and ensure dyn_size is enough for early alloc */
  2468. size_sum = PFN_ALIGN(static_size + reserved_size +
  2469. max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
  2470. dyn_size = size_sum - static_size - reserved_size;
  2471. /*
  2472. * Determine min_unit_size, alloc_size and max_upa such that
  2473. * alloc_size is multiple of atom_size and is the smallest
  2474. * which can accommodate 4k aligned segments which are equal to
  2475. * or larger than min_unit_size.
  2476. */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

        /* determine the maximum # of units that can fit in an allocation */
        alloc_size = roundup(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
        while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                upa--;
        max_upa = upa;

        cpumask_copy(&mask, cpu_possible_mask);

        /* group cpus according to their proximity */
        for (group = 0; !cpumask_empty(&mask); group++) {
                /* pop the group's first cpu */
                cpu = cpumask_first(&mask);
                group_map[cpu] = group;
                group_cnt[group]++;
                cpumask_clear_cpu(cpu, &mask);

                for_each_cpu(tcpu, &mask) {
                        if (!cpu_distance_fn ||
                            (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
                             cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
                                group_map[tcpu] = group;
                                group_cnt[group]++;
                                cpumask_clear_cpu(tcpu, &mask);
                        }
                }
        }
        nr_groups = group;

        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.
         * Expand the unit_size until we use >= 75% of the units allocated.
         * Related to atom_size, which could be much larger than the unit_size.
         */
        last_allocs = INT_MAX;
        best_upa = 0;
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                        continue;

                for (group = 0; group < nr_groups; group++) {
                        int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[group];
                }

                /*
                 * Don't accept if wastage is over 1/3.  The
                 * greater-than comparison ensures upa==1 always
                 * passes the following check.
                 */
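                /*
                 * For example (illustrative numbers): with 4 possible CPUs
                 * split into groups of 3 and 1, upa == 2 gives allocs == 3
                 * and wasted == 2, which exceeds 4 / 3 == 1, so that upa is
                 * skipped and a smaller one is tried.
                 */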
                if (wasted > num_possible_cpus() / 3)
                        continue;

                /* and then don't consume more memory */
                if (allocs > last_allocs)
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }
        BUG_ON(!best_upa);
        upa = best_upa;

        /* allocate and fill alloc_info */
        for (group = 0; group < nr_groups; group++)
                nr_units += roundup(group_cnt[group], upa);

        ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
        if (!ai)
                return ERR_PTR(-ENOMEM);
        cpu_map = ai->groups[0].cpu_map;

        for (group = 0; group < nr_groups; group++) {
                ai->groups[group].cpu_map = cpu_map;
                cpu_map += roundup(group_cnt[group], upa);
        }

        ai->static_size = static_size;
        ai->reserved_size = reserved_size;
        ai->dyn_size = dyn_size;
        ai->unit_size = alloc_size / upa;
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;

        for (group = 0, unit = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];

                /*
                 * Initialize base_offset as if all groups are located
                 * back-to-back.  The caller should update this to
                 * reflect actual allocation.
                 */
                gi->base_offset = unit * ai->unit_size;

                for_each_possible_cpu(cpu)
                        if (group_map[cpu] == group)
                                gi->cpu_map[gi->nr_units++] = cpu;
                gi->nr_units = roundup(gi->nr_units, upa);
                unit += gi->nr_units;
        }
        BUG_ON(unit != nr_units);

        return ai;
}

static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
                                   pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
        int node = NUMA_NO_NODE;
        void *ptr;

        if (cpu_to_nd_fn)
                node = cpu_to_nd_fn(cpu);

        if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
                ptr = memblock_alloc_from(size, align, goal);
                pr_info("cpu %d has no node %d or node-local memory\n",
                        cpu, node);
                pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
                         cpu, size, (u64)__pa(ptr));
        } else {
                ptr = memblock_alloc_try_nid(size, align, goal,
                                             MEMBLOCK_ALLOC_ACCESSIBLE,
                                             node);
                pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
                         cpu, size, node, (u64)__pa(ptr));
        }
        return ptr;
#else
        return memblock_alloc_from(size, align, goal);
#endif
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
        memblock_free(ptr, size);
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to set up the first chunk, it is allocated
 * by calling pcpu_fc_alloc and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (i.e. 32-bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using pcpu_fc_free.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        void *base = (void *)ULONG_MAX;
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
        size_t size_sum, areas_size;
        unsigned long max_distance;
        int group, i, highest_group, rc = 0;

        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
        if (IS_ERR(ai))
                return PTR_ERR(ai);

        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

        areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
        if (!areas) {
                rc = -ENOMEM;
                goto out_free;
        }

        /* allocate, copy and determine base address & max_distance */
        highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
                void *ptr;

                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
                        cpu = gi->cpu_map[i];
                BUG_ON(cpu == NR_CPUS);

                /* allocate space for the whole group */
                ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
                if (!ptr) {
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
                /* kmemleak tracks the percpu allocations separately */
                kmemleak_ignore_phys(__pa(ptr));
                areas[group] = ptr;

                base = min(ptr, base);
                if (ptr > areas[highest_group])
                        highest_group = group;
        }
        max_distance = areas[highest_group] - base;
        max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > VMALLOC_TOTAL * 3 / 4) {
                pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
                        max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
                goto out_free_areas;
#endif
        }

        /*
         * Copy data and free unused parts.  This should happen after all
         * allocations are complete; otherwise, we may end up with
         * overlapping groups.
         */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
                                /* unused unit, free whole */
                                pcpu_fc_free(ptr, ai->unit_size);
                                continue;
                        }
                        /* copy and return the unused part */
                        memcpy(ptr, __per_cpu_load, ai->static_size);
                        pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
                }
        }

        /* base address is now known, determine group base offsets */
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
        }

        pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
                ai->dyn_size, ai->unit_size);

        pcpu_setup_first_chunk(ai, base);
        goto out_free;

out_free_areas:
        for (group = 0; group < ai->nr_groups; group++)
                if (areas[group])
                        pcpu_fc_free(areas[group],
                                     ai->groups[group].nr_units * ai->unit_size);
out_free:
        pcpu_free_alloc_info(ai);
        if (areas)
                memblock_free(areas, areas_size);
        return rc;
}
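
/*
 * Illustrative usage sketch: pcpu_cpu_distance() and pcpu_cpu_to_node()
 * below are placeholder names for arch-provided helpers, not symbols
 * defined in this file.  A NUMA-aware architecture that implements its
 * own setup_per_cpu_areas() would typically call the embed helper
 * roughly like this, falling back to the page allocator on failure:
 *
 *	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
 *				    PERCPU_DYNAMIC_RESERVE, PMD_SIZE,
 *				    pcpu_cpu_distance, pcpu_cpu_to_node);
 *	if (rc < 0)
 *		rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
 *					   pcpu_cpu_to_node);
 *
 * The generic setup_per_cpu_areas() further down in this file is the
 * simplest in-tree caller.
 */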
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
#include <asm/pgalloc.h>

#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE		PAGE_SIZE
#endif

#ifndef PUD_TABLE_SIZE
#define PUD_TABLE_SIZE		PAGE_SIZE
#endif

#ifndef PMD_TABLE_SIZE
#define PMD_TABLE_SIZE		PAGE_SIZE
#endif

#ifndef PTE_TABLE_SIZE
#define PTE_TABLE_SIZE		PAGE_SIZE
#endif
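
/*
 * Default page table populator for the page first chunk allocator: walk
 * the kernel page tables covering @addr and allocate any missing
 * intermediate tables from memblock.  Architectures with special
 * requirements can override this __weak implementation.
 */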
void __init __weak pcpu_populate_pte(unsigned long addr)
{
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        if (pgd_none(*pgd)) {
                p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
                if (!p4d)
                        goto err_alloc;
                pgd_populate(&init_mm, pgd, p4d);
        }

        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d)) {
                pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
                if (!pud)
                        goto err_alloc;
                p4d_populate(&init_mm, p4d, pud);
        }

        pud = pud_offset(p4d, addr);
        if (pud_none(*pud)) {
                pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
                if (!pmd)
                        goto err_alloc;
                pud_populate(&init_mm, pud, pmd);
        }

        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd)) {
                pte_t *new;

                new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
                if (!new)
                        goto err_alloc;
                pmd_populate_kernel(&init_mm, pmd, new);
        }

        return;

err_alloc:
        panic("%s: Failed to allocate memory\n", __func__);
}

/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        static struct vm_struct vm;
        struct pcpu_alloc_info *ai;
        char psize_str[16];
        int unit_pages;
        size_t pages_size;
        struct page **pages;
        int unit, i, j, rc = 0;
        int upa;
        int nr_g0_units;

        snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

        ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
        if (IS_ERR(ai))
                return PTR_ERR(ai);
        BUG_ON(ai->nr_groups != 1);
        upa = ai->alloc_size / ai->unit_size;
        nr_g0_units = roundup(num_possible_cpus(), upa);
        if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
                pcpu_free_alloc_info(ai);
                return -EINVAL;
        }

        unit_pages = ai->unit_size >> PAGE_SHIFT;

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
                               sizeof(pages[0]));
        pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
        if (!pages)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pages_size);

        /* allocate pages */
        j = 0;
        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned int cpu = ai->groups[0].cpu_map[unit];
                for (i = 0; i < unit_pages; i++) {
                        void *ptr;

                        ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
                        if (!ptr) {
                                pr_warn("failed to allocate %s page for cpu%u\n",
                                        psize_str, cpu);
                                goto enomem;
                        }
                        /* kmemleak tracks the percpu allocations separately */
                        kmemleak_ignore_phys(__pa(ptr));
                        pages[j++] = virt_to_page(ptr);
                }
        }

        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * ai->unit_size;
        vm_area_register_early(&vm, PAGE_SIZE);

        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned long unit_addr =
                        (unsigned long)vm.addr + unit * ai->unit_size;

                for (i = 0; i < unit_pages; i++)
                        pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));

                /* pte already populated, the following shouldn't fail */
                rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
                                      unit_pages);
                if (rc < 0)
                        panic("failed to map percpu area, err=%d\n", rc);

                flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);

                /* copy static data */
                memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
        }

        /* we're ready, commit */
        pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
                unit_pages, psize_str, ai->static_size,
                ai->reserved_size, ai->dyn_size);

        pcpu_setup_first_chunk(ai, vm.addr);
        goto out_free_ar;

enomem:
        while (--j >= 0)
                pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
out_free_ar:
        memblock_free(pages, pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
}
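
/*
 * Illustrative usage sketch: pcpu_cpu_to_node() is a placeholder name
 * for an arch-provided cpu-to-node helper, not a symbol defined here.
 * An architecture whose vmalloc space is too small for the embedding
 * helper would fall back to this allocator along these lines:
 *
 *	rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_cpu_to_node);
 *	if (rc < 0)
 *		panic("cannot initialize percpu area (err=%d)", rc);
 */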
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

void __init setup_per_cpu_areas(void)
{
        unsigned long delta;
        unsigned int cpu;
        int rc;

        /*
         * Always reserve area for module percpu variables.  That's
         * what the legacy allocator did.
         */
        rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
                                    PAGE_SIZE, NULL, NULL);
        if (rc < 0)
                panic("Failed to initialize percpu areas.");

        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu)
                __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
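
/*
 * Illustrative note: with this generic setup, an access such as
 * per_cpu(var, cpu) adds __per_cpu_offset[cpu] to the address of the
 * static copy of the variable, which is why the offsets above are
 * computed relative to __per_cpu_start.
 */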
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
void __init setup_per_cpu_areas(void)
{
        const size_t unit_size =
                roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
                                         PERCPU_DYNAMIC_RESERVE));
        struct pcpu_alloc_info *ai;
        void *fc;

        ai = pcpu_alloc_alloc_info(1, 1);
        fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!ai || !fc)
                panic("Failed to allocate memory for percpu areas.");
        /* kmemleak tracks the percpu allocations separately */
        kmemleak_ignore_phys(__pa(fc));

        ai->dyn_size = unit_size;
        ai->unit_size = unit_size;
        ai->atom_size = unit_size;
        ai->alloc_size = unit_size;
        ai->groups[0].nr_units = 1;
        ai->groups[0].cpu_map[0] = 0;

        pcpu_setup_first_chunk(ai, fc);
        pcpu_free_alloc_info(ai);
}

#endif /* CONFIG_SMP */

/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Metadata is
 * excluded from the number exposed in meminfo as the number of backing pages
 * scales with the number of cpus and can quickly outweigh the memory used for
 * metadata.  It also keeps this calculation nice and simple.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
        return pcpu_nr_populated * pcpu_nr_units;
}
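
/*
 * Illustrative note: the "Percpu:" line in /proc/meminfo is derived from
 * this value, with the page count converted to kilobytes by the meminfo
 * code.
 */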

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 */
static int __init percpu_enable_async(void)
{
        pcpu_async_enabled = true;
        return 0;
}
subsys_initcall(percpu_enable_async);