timer.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Kernel internal timers
  4. *
  5. * Copyright (C) 1991, 1992 Linus Torvalds
  6. *
  7. * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
  8. *
  9. * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
  10. * "A Kernel Model for Precision Timekeeping" by Dave Mills
  11. * 1998-12-24 Fixed an xtime SMP race (we need the xtime_lock rw spinlock to
  12. * serialize accesses to xtime/lost_ticks).
  13. * Copyright (C) 1998 Andrea Arcangeli
  14. * 1999-03-10 Improved NTP compatibility by Ulrich Windl
  15. * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
  16. * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
  17. * Copyright (C) 2000, 2001, 2002 Ingo Molnar
  18. * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
  19. */
  20. #include <linux/kernel_stat.h>
  21. #include <linux/export.h>
  22. #include <linux/interrupt.h>
  23. #include <linux/percpu.h>
  24. #include <linux/init.h>
  25. #include <linux/mm.h>
  26. #include <linux/swap.h>
  27. #include <linux/pid_namespace.h>
  28. #include <linux/notifier.h>
  29. #include <linux/thread_info.h>
  30. #include <linux/time.h>
  31. #include <linux/jiffies.h>
  32. #include <linux/posix-timers.h>
  33. #include <linux/cpu.h>
  34. #include <linux/syscalls.h>
  35. #include <linux/delay.h>
  36. #include <linux/tick.h>
  37. #include <linux/kallsyms.h>
  38. #include <linux/irq_work.h>
  39. #include <linux/sched/signal.h>
  40. #include <linux/sched/sysctl.h>
  41. #include <linux/sched/nohz.h>
  42. #include <linux/sched/debug.h>
  43. #include <linux/slab.h>
  44. #include <linux/compat.h>
  45. #include <linux/random.h>
  46. #include <linux/sysctl.h>
  47. #include <linux/uaccess.h>
  48. #include <asm/unistd.h>
  49. #include <asm/div64.h>
  50. #include <asm/timex.h>
  51. #include <asm/io.h>
  52. #include "tick-internal.h"
  53. #include "timer_migration.h"
  54. #define CREATE_TRACE_POINTS
  55. #include <trace/events/timer.h>
  56. __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
  57. EXPORT_SYMBOL(jiffies_64);
  58. /*
  59. * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
  60. * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
  61. * level has a different granularity.
  62. *
  63. * The level granularity is: LVL_CLK_DIV ^ level
  64. * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
  65. *
  66. * The array level of a newly armed timer depends on the relative expiry
  67. * time. The farther away the expiry time is, the higher the array level
  68. * and therefore the coarser the granularity becomes.
  69. *
  70. * Contrary to the original timer wheel implementation, which aims for 'exact'
  71. * expiry of the timers, this implementation removes the need for recascading
  72. * the timers into the lower array levels. The previous 'classic' timer wheel
  73. * implementation of the kernel already violated the 'exact' expiry by adding
  74. * slack to the expiry time to provide batched expiration. The granularity
  75. * levels provide implicit batching.
  76. *
  77. * This is an optimization of the original timer wheel implementation for the
  78. * majority of the timer wheel use cases: timeouts. The vast majority of
  79. * timeout timers (networking, disk I/O ...) are canceled before expiry. If
  80. * the timeout expires it indicates that normal operation is disturbed, so it
  81. * does not matter much whether the timeout comes with a slight delay.
  82. *
  83. * The only exceptions to this are networking timers with a small expiry
  84. * time. They rely on the granularity. Those fit into the first wheel level,
  85. * which has HZ granularity.
  86. *
  87. * We don't have cascading anymore. Timers with an expiry time above the
  88. * capacity of the last wheel level are force expired at the maximum timeout
  89. * value of the last wheel level. From data sampling we know that the maximum
  90. * value observed is 5 days (network connection tracking), so this should not
  91. * be an issue.
  92. *
  93. * The currently chosen array constants are a good compromise between
  94. * array size and granularity.
  95. *
  96. * This results in the following granularity and range levels:
  97. *
  98. * HZ 1000 steps
  99. * Level Offset Granularity Range
  100. * 0 0 1 ms 0 ms - 63 ms
  101. * 1 64 8 ms 64 ms - 511 ms
  102. * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
  103. * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
  104. * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
  105. * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
  106. * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
  107. * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
  108. * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
  109. *
  110. * HZ 300
  111. * Level Offset Granularity Range
  112. * 0 0 3 ms 0 ms - 210 ms
  113. * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
  114. * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
  115. * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
  116. * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
  117. * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
  118. * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
  119. * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
  120. * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
  121. *
  122. * HZ 250
  123. * Level Offset Granularity Range
  124. * 0 0 4 ms 0 ms - 255 ms
  125. * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
  126. * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
  127. * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
  128. * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
  129. * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
  130. * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
  131. * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
  132. * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
  133. *
  134. * HZ 100
  135. * Level Offset Granularity Range
  136. * 0 0 10 ms 0 ms - 630 ms
  137. * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
  138. * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
  139. * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
  140. * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
  141. * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
  142. * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
  143. * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
  144. */
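/*
 * Illustrative sketch (not part of the kernel source): a standalone program
 * that derives the HZ=1000 granularity/range table above from the two wheel
 * constants LVL_CLK_SHIFT and LVL_BITS. The EX_* constants mirror the macros
 * defined below; the last level's upper bound in the table is additionally
 * capped near WHEEL_TIMEOUT_MAX, which this sketch ignores.
 */
#include <stdio.h>

#define EX_LVL_CLK_SHIFT	3	/* mirrors LVL_CLK_SHIFT */
#define EX_LVL_BITS		6	/* mirrors LVL_BITS */
#define EX_LVL_SIZE		(1UL << EX_LVL_BITS)
#define EX_LVL_DEPTH		9	/* LVL_DEPTH for HZ > 100 */
#define EX_HZ			1000

int main(void)
{
	unsigned long ms_per_jiffy = 1000 / EX_HZ;

	for (int lvl = 0; lvl < EX_LVL_DEPTH; lvl++) {
		unsigned long gran  = 1UL << (lvl * EX_LVL_CLK_SHIFT);
		unsigned long start = lvl ? EX_LVL_SIZE << ((lvl - 1) * EX_LVL_CLK_SHIFT) : 0;
		unsigned long end   = (EX_LVL_SIZE << (lvl * EX_LVL_CLK_SHIFT)) - 1;

		printf("Level %d: granularity %8lu ms, range %10lu ms - %10lu ms\n",
		       lvl, gran * ms_per_jiffy, start * ms_per_jiffy,
		       end * ms_per_jiffy);
	}
	return 0;
}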
  145. /* Clock divisor for the next level */
  146. #define LVL_CLK_SHIFT 3
  147. #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
  148. #define LVL_CLK_MASK (LVL_CLK_DIV - 1)
  149. #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
  150. #define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
  151. /*
  152. * The time start value for each level to select the bucket at enqueue
  153. * time. We start from the last possible delta of the previous level
  154. * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
  155. */
  156. #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
  157. /* Size of each clock level */
  158. #define LVL_BITS 6
  159. #define LVL_SIZE (1UL << LVL_BITS)
  160. #define LVL_MASK (LVL_SIZE - 1)
  161. #define LVL_OFFS(n) ((n) * LVL_SIZE)
  162. /* Level depth */
  163. #if HZ > 100
  164. # define LVL_DEPTH 9
  165. # else
  166. # define LVL_DEPTH 8
  167. #endif
  168. /* The cutoff (max. capacity of the wheel) */
  169. #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
  170. #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
  171. /*
  172. * The resulting wheel size. If NOHZ is configured we allocate two
  173. * wheels so we have a separate storage for the deferrable timers.
  174. */
  175. #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
  176. #ifdef CONFIG_NO_HZ_COMMON
  177. /*
  178. * If multiple bases need to be locked, use the base ordering for lock
  179. * nesting, i.e. lowest number first.
  180. */
  181. # define NR_BASES 3
  182. # define BASE_LOCAL 0
  183. # define BASE_GLOBAL 1
  184. # define BASE_DEF 2
  185. #else
  186. # define NR_BASES 1
  187. # define BASE_LOCAL 0
  188. # define BASE_GLOBAL 0
  189. # define BASE_DEF 0
  190. #endif
  191. /**
  192. * struct timer_base - Per CPU timer base (number of base depends on config)
  193. * @lock: Lock protecting the timer_base
  194. * @running_timer: When expiring timers, the lock is dropped. To make
  195. * sure not to race against deleting/modifying a
  196. * currently running timer, the pointer is set to the
  197. * timer which is currently expiring. If no timer is
  198. * running, the pointer is NULL.
  199. * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around
  200. * timer expiry callback execution and when trying to
  201. * delete a running timer and it wasn't successful at
  202. * the first attempt. It prevents priority inversion
  203. * when the callback was preempted on a remote CPU and a
  204. * caller tries to delete the running timer. It also
  205. * prevents a livelock, when the task which tries to
  206. * delete a timer preempted the softirq thread which
  207. * is running the timer callback function.
  208. * @timer_waiters: PREEMPT_RT only: Tells whether there is a waiter
  209. * waiting for the end of the timer callback function
  210. * execution.
  211. * @clk: clock of the timer base; is updated before enqueue
  212. * of a timer; during expiry, it is 1 offset ahead of
  213. * jiffies to avoid endless requeuing to current
  214. * jiffies
  215. * @next_expiry: expiry value of the first timer; it is updated when
  216. * finding the next timer and during enqueue; the
  217. * value is not valid, when next_expiry_recalc is set
  218. * @cpu: Number of CPU the timer base belongs to
  219. * @next_expiry_recalc: States, whether a recalculation of next_expiry is
  220. * required. Value is set true, when a timer was
  221. * deleted.
  222. * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ
  223. * code. This state is only used in the standard
  224. * base. Deferrable timers, which are enqueued remotely,
  225. * never wake up an idle CPU, so there is no need to
  226. * support it for this base.
  227. * @timers_pending: Is set, when a timer is pending in the base. It is only
  228. * reliable when next_expiry_recalc is not set.
  229. * @pending_map: bitmap of the timer wheel; each bit reflects a
  230. * bucket of the wheel. When a bit is set, at least a
  231. * single timer is enqueued in the related bucket.
  232. * @vectors: Array of lists; Each array member reflects a bucket
  233. * of the timer wheel. The list contains all timers
  234. * which are enqueued into a specific bucket.
  235. */
  236. struct timer_base {
  237. raw_spinlock_t lock;
  238. struct timer_list *running_timer;
  239. #ifdef CONFIG_PREEMPT_RT
  240. spinlock_t expiry_lock;
  241. atomic_t timer_waiters;
  242. #endif
  243. unsigned long clk;
  244. unsigned long next_expiry;
  245. unsigned int cpu;
  246. bool next_expiry_recalc;
  247. bool is_idle;
  248. bool timers_pending;
  249. DECLARE_BITMAP(pending_map, WHEEL_SIZE);
  250. struct hlist_head vectors[WHEEL_SIZE];
  251. } ____cacheline_aligned;
  252. static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
  253. #ifdef CONFIG_NO_HZ_COMMON
  254. static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
  255. static DEFINE_MUTEX(timer_keys_mutex);
  256. static void timer_update_keys(struct work_struct *work);
  257. static DECLARE_WORK(timer_update_work, timer_update_keys);
  258. #ifdef CONFIG_SMP
  259. static unsigned int sysctl_timer_migration = 1;
  260. DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
  261. static void timers_update_migration(void)
  262. {
  263. if (sysctl_timer_migration && tick_nohz_active)
  264. static_branch_enable(&timers_migration_enabled);
  265. else
  266. static_branch_disable(&timers_migration_enabled);
  267. }
  268. #ifdef CONFIG_SYSCTL
  269. static int timer_migration_handler(const struct ctl_table *table, int write,
  270. void *buffer, size_t *lenp, loff_t *ppos)
  271. {
  272. int ret;
  273. mutex_lock(&timer_keys_mutex);
  274. ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  275. if (!ret && write)
  276. timers_update_migration();
  277. mutex_unlock(&timer_keys_mutex);
  278. return ret;
  279. }
  280. static struct ctl_table timer_sysctl[] = {
  281. {
  282. .procname = "timer_migration",
  283. .data = &sysctl_timer_migration,
  284. .maxlen = sizeof(unsigned int),
  285. .mode = 0644,
  286. .proc_handler = timer_migration_handler,
  287. .extra1 = SYSCTL_ZERO,
  288. .extra2 = SYSCTL_ONE,
  289. },
  290. };
  291. static int __init timer_sysctl_init(void)
  292. {
  293. register_sysctl("kernel", timer_sysctl);
  294. return 0;
  295. }
  296. device_initcall(timer_sysctl_init);
  297. #endif /* CONFIG_SYSCTL */
  298. #else /* CONFIG_SMP */
  299. static inline void timers_update_migration(void) { }
  300. #endif /* !CONFIG_SMP */
  301. static void timer_update_keys(struct work_struct *work)
  302. {
  303. mutex_lock(&timer_keys_mutex);
  304. timers_update_migration();
  305. static_branch_enable(&timers_nohz_active);
  306. mutex_unlock(&timer_keys_mutex);
  307. }
  308. void timers_update_nohz(void)
  309. {
  310. schedule_work(&timer_update_work);
  311. }
  312. static inline bool is_timers_nohz_active(void)
  313. {
  314. return static_branch_unlikely(&timers_nohz_active);
  315. }
  316. #else
  317. static inline bool is_timers_nohz_active(void) { return false; }
  318. #endif /* NO_HZ_COMMON */
  319. static unsigned long round_jiffies_common(unsigned long j, int cpu,
  320. bool force_up)
  321. {
  322. int rem;
  323. unsigned long original = j;
  324. /*
  325. * We don't want all cpus firing their timers at once hitting the
  326. * same lock or cachelines, so we skew each extra cpu with an extra
  327. * 3 jiffies. This 3 jiffies came originally from the mm/ code which
  328. * already did this.
  329. * The skew is done by adding 3*cpunr, then round, then subtract this
  330. * extra offset again.
  331. */
  332. j += cpu * 3;
  333. rem = j % HZ;
  334. /*
  335. * If the target jiffy is just after a whole second (which can happen
  336. * due to delays of the timer irq, long irq off times etc etc) then
  337. * we should round down to the whole second, not up. Use 1/4th second
  338. * as cutoff for this rounding as an extreme upper bound for this.
  339. * But never round down if @force_up is set.
  340. */
  341. if (rem < HZ/4 && !force_up) /* round down */
  342. j = j - rem;
  343. else /* round up */
  344. j = j - rem + HZ;
  345. /* now that we have rounded, subtract the extra skew again */
  346. j -= cpu * 3;
  347. /*
  348. * Make sure j is still in the future. Otherwise return the
  349. * unmodified value.
  350. */
  351. return time_is_after_jiffies(j) ? j : original;
  352. }
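/*
 * Illustrative sketch (not part of the kernel source): a standalone model of
 * the skew-and-round arithmetic above, assuming HZ=1000. The real function
 * additionally returns the unmodified value when the rounded time would no
 * longer be in the future (the time_is_after_jiffies() check), which this
 * model omits.
 */
#include <stdio.h>

#define EX_HZ 1000

static unsigned long ex_round_jiffies(unsigned long j, int cpu, int force_up)
{
	unsigned long rem;

	j += cpu * 3;				/* per-CPU skew */
	rem = j % EX_HZ;

	if (rem < EX_HZ / 4 && !force_up)	/* just past a second: round down */
		j -= rem;
	else					/* otherwise round up */
		j += EX_HZ - rem;

	return j - cpu * 3;			/* remove the skew again */
}

int main(void)
{
	/* 100 ms past a whole second: rounded down to the second boundary */
	printf("%lu\n", ex_round_jiffies(10100, 0, 0));	/* -> 10000 */
	/* 250 ms is already at the 1/4 second cutoff: rounded up */
	printf("%lu\n", ex_round_jiffies(10250, 0, 0));	/* -> 11000 */
	/* force_up never rounds down */
	printf("%lu\n", ex_round_jiffies(10100, 0, 1));	/* -> 11000 */
	return 0;
}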
  353. /**
  354. * __round_jiffies - function to round jiffies to a full second
  355. * @j: the time in (absolute) jiffies that should be rounded
  356. * @cpu: the processor number on which the timeout will happen
  357. *
  358. * __round_jiffies() rounds an absolute time in the future (in jiffies)
  359. * up or down to (approximately) full seconds. This is useful for timers
  360. * for which the exact time they fire does not matter too much, as long as
  361. * they fire approximately every X seconds.
  362. *
  363. * By rounding these timers to whole seconds, all such timers will fire
  364. * at the same time, rather than at various times spread out. The goal
  365. * of this is to have the CPU wake up less, which saves power.
  366. *
  367. * The exact rounding is skewed for each processor to avoid all
  368. * processors firing at the exact same time, which could lead
  369. * to lock contention or spurious cache line bouncing.
  370. *
  371. * The return value is the rounded version of the @j parameter.
  372. */
  373. unsigned long __round_jiffies(unsigned long j, int cpu)
  374. {
  375. return round_jiffies_common(j, cpu, false);
  376. }
  377. EXPORT_SYMBOL_GPL(__round_jiffies);
  378. /**
  379. * __round_jiffies_relative - function to round jiffies to a full second
  380. * @j: the time in (relative) jiffies that should be rounded
  381. * @cpu: the processor number on which the timeout will happen
  382. *
  383. * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
  384. * up or down to (approximately) full seconds. This is useful for timers
  385. * for which the exact time they fire does not matter too much, as long as
  386. * they fire approximately every X seconds.
  387. *
  388. * By rounding these timers to whole seconds, all such timers will fire
  389. * at the same time, rather than at various times spread out. The goal
  390. * of this is to have the CPU wake up less, which saves power.
  391. *
  392. * The exact rounding is skewed for each processor to avoid all
  393. * processors firing at the exact same time, which could lead
  394. * to lock contention or spurious cache line bouncing.
  395. *
  396. * The return value is the rounded version of the @j parameter.
  397. */
  398. unsigned long __round_jiffies_relative(unsigned long j, int cpu)
  399. {
  400. unsigned long j0 = jiffies;
  401. /* Use j0 because jiffies might change while we run */
  402. return round_jiffies_common(j + j0, cpu, false) - j0;
  403. }
  404. EXPORT_SYMBOL_GPL(__round_jiffies_relative);
  405. /**
  406. * round_jiffies - function to round jiffies to a full second
  407. * @j: the time in (absolute) jiffies that should be rounded
  408. *
  409. * round_jiffies() rounds an absolute time in the future (in jiffies)
  410. * up or down to (approximately) full seconds. This is useful for timers
  411. * for which the exact time they fire does not matter too much, as long as
  412. * they fire approximately every X seconds.
  413. *
  414. * By rounding these timers to whole seconds, all such timers will fire
  415. * at the same time, rather than at various times spread out. The goal
  416. * of this is to have the CPU wake up less, which saves power.
  417. *
  418. * The return value is the rounded version of the @j parameter.
  419. */
  420. unsigned long round_jiffies(unsigned long j)
  421. {
  422. return round_jiffies_common(j, raw_smp_processor_id(), false);
  423. }
  424. EXPORT_SYMBOL_GPL(round_jiffies);
  425. /**
  426. * round_jiffies_relative - function to round jiffies to a full second
  427. * @j: the time in (relative) jiffies that should be rounded
  428. *
  429. * round_jiffies_relative() rounds a time delta in the future (in jiffies)
  430. * up or down to (approximately) full seconds. This is useful for timers
  431. * for which the exact time they fire does not matter too much, as long as
  432. * they fire approximately every X seconds.
  433. *
  434. * By rounding these timers to whole seconds, all such timers will fire
  435. * at the same time, rather than at various times spread out. The goal
  436. * of this is to have the CPU wake up less, which saves power.
  437. *
  438. * The return value is the rounded version of the @j parameter.
  439. */
  440. unsigned long round_jiffies_relative(unsigned long j)
  441. {
  442. return __round_jiffies_relative(j, raw_smp_processor_id());
  443. }
  444. EXPORT_SYMBOL_GPL(round_jiffies_relative);
  445. /**
  446. * __round_jiffies_up - function to round jiffies up to a full second
  447. * @j: the time in (absolute) jiffies that should be rounded
  448. * @cpu: the processor number on which the timeout will happen
  449. *
  450. * This is the same as __round_jiffies() except that it will never
  451. * round down. This is useful for timeouts for which the exact time
  452. * of firing does not matter too much, as long as they don't fire too
  453. * early.
  454. */
  455. unsigned long __round_jiffies_up(unsigned long j, int cpu)
  456. {
  457. return round_jiffies_common(j, cpu, true);
  458. }
  459. EXPORT_SYMBOL_GPL(__round_jiffies_up);
  460. /**
  461. * __round_jiffies_up_relative - function to round jiffies up to a full second
  462. * @j: the time in (relative) jiffies that should be rounded
  463. * @cpu: the processor number on which the timeout will happen
  464. *
  465. * This is the same as __round_jiffies_relative() except that it will never
  466. * round down. This is useful for timeouts for which the exact time
  467. * of firing does not matter too much, as long as they don't fire too
  468. * early.
  469. */
  470. unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
  471. {
  472. unsigned long j0 = jiffies;
  473. /* Use j0 because jiffies might change while we run */
  474. return round_jiffies_common(j + j0, cpu, true) - j0;
  475. }
  476. EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
  477. /**
  478. * round_jiffies_up - function to round jiffies up to a full second
  479. * @j: the time in (absolute) jiffies that should be rounded
  480. *
  481. * This is the same as round_jiffies() except that it will never
  482. * round down. This is useful for timeouts for which the exact time
  483. * of firing does not matter too much, as long as they don't fire too
  484. * early.
  485. */
  486. unsigned long round_jiffies_up(unsigned long j)
  487. {
  488. return round_jiffies_common(j, raw_smp_processor_id(), true);
  489. }
  490. EXPORT_SYMBOL_GPL(round_jiffies_up);
  491. /**
  492. * round_jiffies_up_relative - function to round jiffies up to a full second
  493. * @j: the time in (relative) jiffies that should be rounded
  494. *
  495. * This is the same as round_jiffies_relative() except that it will never
  496. * round down. This is useful for timeouts for which the exact time
  497. * of firing does not matter too much, as long as they don't fire too
  498. * early.
  499. */
  500. unsigned long round_jiffies_up_relative(unsigned long j)
  501. {
  502. return __round_jiffies_up_relative(j, raw_smp_processor_id());
  503. }
  504. EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
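/*
 * Illustrative sketch (not part of the kernel source): the usage pattern the
 * round_jiffies*() helpers are meant for. The status timer below is made up;
 * rounding the period to a full second lets it coalesce with other
 * roughly-per-second timers instead of waking the CPU at an arbitrary point.
 */
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list ex_status_timer;

static void ex_status_fn(struct timer_list *t)
{
	/* ... emit a periodic, non-critical status update ... */

	/* Re-arm roughly ten seconds out, snapped to a second boundary */
	mod_timer(&ex_status_timer, round_jiffies(jiffies + 10 * HZ));
}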
  505. static inline unsigned int timer_get_idx(struct timer_list *timer)
  506. {
  507. return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
  508. }
  509. static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
  510. {
  511. timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
  512. idx << TIMER_ARRAYSHIFT;
  513. }
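/*
 * Illustrative sketch (not part of the kernel source): how the two helpers
 * above pack the wheel index into the upper bits of timer->flags. The EX_*
 * values are assumed to mirror TIMER_ARRAYSHIFT/TIMER_ARRAYMASK from
 * include/linux/timer.h; the low bits keep the CPU number and flag bits.
 */
#include <stdio.h>

#define EX_TIMER_ARRAYSHIFT	22		/* assumed TIMER_ARRAYSHIFT */
#define EX_TIMER_ARRAYMASK	0xFFC00000u	/* assumed TIMER_ARRAYMASK */

int main(void)
{
	unsigned int flags = 5;		/* e.g. CPU 5, no other flags set */
	unsigned int idx   = 200;	/* a wheel bucket index (< WHEEL_SIZE) */

	/* timer_set_idx(): clear the old index, merge in the new one */
	flags = (flags & ~EX_TIMER_ARRAYMASK) | (idx << EX_TIMER_ARRAYSHIFT);

	/* timer_get_idx(): mask and shift it back out, low bits untouched */
	printf("idx=%u low=%u\n",
	       (flags & EX_TIMER_ARRAYMASK) >> EX_TIMER_ARRAYSHIFT,
	       flags & ~EX_TIMER_ARRAYMASK);
	return 0;
}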
  514. /*
  515. * Helper function to calculate the array index for a given expiry
  516. * time.
  517. */
  518. static inline unsigned calc_index(unsigned long expires, unsigned lvl,
  519. unsigned long *bucket_expiry)
  520. {
  521. /*
  522. * The timer wheel has to guarantee that a timer does not fire
  523. * early. Early expiry can happen due to:
  524. * - Timer is armed at the edge of a tick
  525. * - Truncation of the expiry time in the outer wheel levels
  526. *
  527. * Round up with level granularity to prevent this.
  528. */
  529. expires = (expires >> LVL_SHIFT(lvl)) + 1;
  530. *bucket_expiry = expires << LVL_SHIFT(lvl);
  531. return LVL_OFFS(lvl) + (expires & LVL_MASK);
  532. }
  533. static int calc_wheel_index(unsigned long expires, unsigned long clk,
  534. unsigned long *bucket_expiry)
  535. {
  536. unsigned long delta = expires - clk;
  537. unsigned int idx;
  538. if (delta < LVL_START(1)) {
  539. idx = calc_index(expires, 0, bucket_expiry);
  540. } else if (delta < LVL_START(2)) {
  541. idx = calc_index(expires, 1, bucket_expiry);
  542. } else if (delta < LVL_START(3)) {
  543. idx = calc_index(expires, 2, bucket_expiry);
  544. } else if (delta < LVL_START(4)) {
  545. idx = calc_index(expires, 3, bucket_expiry);
  546. } else if (delta < LVL_START(5)) {
  547. idx = calc_index(expires, 4, bucket_expiry);
  548. } else if (delta < LVL_START(6)) {
  549. idx = calc_index(expires, 5, bucket_expiry);
  550. } else if (delta < LVL_START(7)) {
  551. idx = calc_index(expires, 6, bucket_expiry);
  552. } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
  553. idx = calc_index(expires, 7, bucket_expiry);
  554. } else if ((long) delta < 0) {
  555. idx = clk & LVL_MASK;
  556. *bucket_expiry = clk;
  557. } else {
  558. /*
  559. * Force obscenely large timeouts to expire at the
  560. * capacity limit of the wheel.
  561. */
  562. if (delta >= WHEEL_TIMEOUT_CUTOFF)
  563. expires = clk + WHEEL_TIMEOUT_MAX;
  564. idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
  565. }
  566. return idx;
  567. }
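/*
 * Illustrative sketch (not part of the kernel source): a standalone walk
 * through the level and bucket selection of calc_wheel_index()/calc_index()
 * for one expiry value, using locally mirrored wheel constants (HZ=1000,
 * LVL_DEPTH=9 assumed). The negative-delta and WHEEL_TIMEOUT_CUTOFF cases
 * handled by the real code are left out.
 */
#include <stdio.h>

#define EX_LVL_CLK_SHIFT	3
#define EX_LVL_BITS		6
#define EX_LVL_SIZE		(1UL << EX_LVL_BITS)
#define EX_LVL_MASK		(EX_LVL_SIZE - 1)
#define EX_LVL_SHIFT(n)		((n) * EX_LVL_CLK_SHIFT)
#define EX_LVL_OFFS(n)		((n) * EX_LVL_SIZE)
#define EX_LVL_START(n)		((EX_LVL_SIZE - 1) << (((n) - 1) * EX_LVL_CLK_SHIFT))
#define EX_LVL_DEPTH		9

static unsigned ex_calc_index(unsigned long expires, unsigned lvl,
			      unsigned long *bucket_expiry)
{
	/* Round up with the level granularity so the timer never fires early */
	expires = (expires >> EX_LVL_SHIFT(lvl)) + 1;
	*bucket_expiry = expires << EX_LVL_SHIFT(lvl);
	return EX_LVL_OFFS(lvl) + (expires & EX_LVL_MASK);
}

int main(void)
{
	unsigned long clk = 1000000, expires = clk + 1000, bucket_expiry;
	unsigned long delta = expires - clk;
	unsigned lvl = 0;

	/* Pick the first level whose start value is still above the delta */
	while (lvl < EX_LVL_DEPTH - 1 && delta >= EX_LVL_START(lvl + 1))
		lvl++;

	unsigned idx = ex_calc_index(expires, lvl, &bucket_expiry);

	printf("delta %lu -> level %u, bucket %u, effective expiry %lu\n",
	       delta, lvl, idx, bucket_expiry);
	return 0;
}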
  568. static void
  569. trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
  570. {
  571. /*
  572. * Deferrable timers do not prevent the CPU from entering dynticks and
  573. * are not taken into account on the idle/nohz_full path. An IPI when a
  574. * new deferrable timer is enqueued will wake up the remote CPU but
  575. * nothing will be done with the deferrable timer base. Therefore skip
  576. * the remote IPI for deferrable timers completely.
  577. */
  578. if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
  579. return;
  580. /*
  581. * We might have to IPI the remote CPU if the base is idle and the
  582. * timer is pinned. If it is a non-pinned timer, it is only queued
  583. * on the remote CPU when the timer was running during queueing. Then
  584. * everything is handled by remote CPU anyway. If the other CPU is
  585. * on the way to idle then it can't set base->is_idle as we hold
  586. * the base lock:
  587. */
  588. if (base->is_idle) {
  589. WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
  590. tick_nohz_full_cpu(base->cpu)));
  591. wake_up_nohz_cpu(base->cpu);
  592. }
  593. }
  594. /*
  595. * Enqueue the timer into the hash bucket, mark it pending in
  596. * the bitmap, store the index in the timer flags then wake up
  597. * the target CPU if needed.
  598. */
  599. static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
  600. unsigned int idx, unsigned long bucket_expiry)
  601. {
  602. hlist_add_head(&timer->entry, base->vectors + idx);
  603. __set_bit(idx, base->pending_map);
  604. timer_set_idx(timer, idx);
  605. trace_timer_start(timer, bucket_expiry);
  606. /*
  607. * Check whether this is the new first expiring timer. The
  608. * effective expiry time of the timer is required here
  609. * (bucket_expiry) instead of timer->expires.
  610. */
  611. if (time_before(bucket_expiry, base->next_expiry)) {
  612. /*
  613. * Set the next expiry time and kick the CPU so it
  614. * can reevaluate the wheel:
  615. */
  616. WRITE_ONCE(base->next_expiry, bucket_expiry);
  617. base->timers_pending = true;
  618. base->next_expiry_recalc = false;
  619. trigger_dyntick_cpu(base, timer);
  620. }
  621. }
  622. static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
  623. {
  624. unsigned long bucket_expiry;
  625. unsigned int idx;
  626. idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
  627. enqueue_timer(base, timer, idx, bucket_expiry);
  628. }
  629. #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
  630. static const struct debug_obj_descr timer_debug_descr;
  631. struct timer_hint {
  632. void (*function)(struct timer_list *t);
  633. long offset;
  634. };
  635. #define TIMER_HINT(fn, container, timr, hintfn) \
  636. { \
  637. .function = fn, \
  638. .offset = offsetof(container, hintfn) - \
  639. offsetof(container, timr) \
  640. }
  641. static const struct timer_hint timer_hints[] = {
  642. TIMER_HINT(delayed_work_timer_fn,
  643. struct delayed_work, timer, work.func),
  644. TIMER_HINT(kthread_delayed_work_timer_fn,
  645. struct kthread_delayed_work, timer, work.func),
  646. };
  647. static void *timer_debug_hint(void *addr)
  648. {
  649. struct timer_list *timer = addr;
  650. int i;
  651. for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
  652. if (timer_hints[i].function == timer->function) {
  653. void (**fn)(void) = addr + timer_hints[i].offset;
  654. return *fn;
  655. }
  656. }
  657. return timer->function;
  658. }
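/*
 * Illustrative sketch (not part of the kernel source): the offsetof() trick
 * behind TIMER_HINT, shown with a made-up container "struct ex_work". Given
 * only the address of the embedded timer member, the hint adds a fixed
 * offset to reach the member holding the callback of real interest.
 */
#include <stdio.h>
#include <stddef.h>

struct ex_work {
	void (*func)(void);	/* what the debug hint should report */
	long timer;		/* stand-in for the embedded timer_list */
};

static void ex_real_callback(void) { }

int main(void)
{
	struct ex_work w = { .func = ex_real_callback };
	void *timer_addr = &w.timer;	/* all the debug code gets is this */
	long offset = offsetof(struct ex_work, func) -
		      offsetof(struct ex_work, timer);
	void (**fn)(void) = (void (**)(void))((char *)timer_addr + offset);

	printf("hint resolves to the real callback: %d\n", *fn == ex_real_callback);
	return 0;
}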
  659. static bool timer_is_static_object(void *addr)
  660. {
  661. struct timer_list *timer = addr;
  662. return (timer->entry.pprev == NULL &&
  663. timer->entry.next == TIMER_ENTRY_STATIC);
  664. }
  665. /*
  666. * timer_fixup_init is called when:
  667. * - an active object is initialized
  668. */
  669. static bool timer_fixup_init(void *addr, enum debug_obj_state state)
  670. {
  671. struct timer_list *timer = addr;
  672. switch (state) {
  673. case ODEBUG_STATE_ACTIVE:
  674. del_timer_sync(timer);
  675. debug_object_init(timer, &timer_debug_descr);
  676. return true;
  677. default:
  678. return false;
  679. }
  680. }
  681. /* Stub timer callback for improperly used timers. */
  682. static void stub_timer(struct timer_list *unused)
  683. {
  684. WARN_ON(1);
  685. }
  686. /*
  687. * timer_fixup_activate is called when:
  688. * - an active object is activated
  689. * - an unknown non-static object is activated
  690. */
  691. static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
  692. {
  693. struct timer_list *timer = addr;
  694. switch (state) {
  695. case ODEBUG_STATE_NOTAVAILABLE:
  696. timer_setup(timer, stub_timer, 0);
  697. return true;
  698. case ODEBUG_STATE_ACTIVE:
  699. WARN_ON(1);
  700. fallthrough;
  701. default:
  702. return false;
  703. }
  704. }
  705. /*
  706. * timer_fixup_free is called when:
  707. * - an active object is freed
  708. */
  709. static bool timer_fixup_free(void *addr, enum debug_obj_state state)
  710. {
  711. struct timer_list *timer = addr;
  712. switch (state) {
  713. case ODEBUG_STATE_ACTIVE:
  714. del_timer_sync(timer);
  715. debug_object_free(timer, &timer_debug_descr);
  716. return true;
  717. default:
  718. return false;
  719. }
  720. }
  721. /*
  722. * timer_fixup_assert_init is called when:
  723. * - an untracked/uninit-ed object is found
  724. */
  725. static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
  726. {
  727. struct timer_list *timer = addr;
  728. switch (state) {
  729. case ODEBUG_STATE_NOTAVAILABLE:
  730. timer_setup(timer, stub_timer, 0);
  731. return true;
  732. default:
  733. return false;
  734. }
  735. }
  736. static const struct debug_obj_descr timer_debug_descr = {
  737. .name = "timer_list",
  738. .debug_hint = timer_debug_hint,
  739. .is_static_object = timer_is_static_object,
  740. .fixup_init = timer_fixup_init,
  741. .fixup_activate = timer_fixup_activate,
  742. .fixup_free = timer_fixup_free,
  743. .fixup_assert_init = timer_fixup_assert_init,
  744. };
  745. static inline void debug_timer_init(struct timer_list *timer)
  746. {
  747. debug_object_init(timer, &timer_debug_descr);
  748. }
  749. static inline void debug_timer_activate(struct timer_list *timer)
  750. {
  751. debug_object_activate(timer, &timer_debug_descr);
  752. }
  753. static inline void debug_timer_deactivate(struct timer_list *timer)
  754. {
  755. debug_object_deactivate(timer, &timer_debug_descr);
  756. }
  757. static inline void debug_timer_assert_init(struct timer_list *timer)
  758. {
  759. debug_object_assert_init(timer, &timer_debug_descr);
  760. }
  761. static void do_init_timer(struct timer_list *timer,
  762. void (*func)(struct timer_list *),
  763. unsigned int flags,
  764. const char *name, struct lock_class_key *key);
  765. void init_timer_on_stack_key(struct timer_list *timer,
  766. void (*func)(struct timer_list *),
  767. unsigned int flags,
  768. const char *name, struct lock_class_key *key)
  769. {
  770. debug_object_init_on_stack(timer, &timer_debug_descr);
  771. do_init_timer(timer, func, flags, name, key);
  772. }
  773. EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
  774. void destroy_timer_on_stack(struct timer_list *timer)
  775. {
  776. debug_object_free(timer, &timer_debug_descr);
  777. }
  778. EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
  779. #else
  780. static inline void debug_timer_init(struct timer_list *timer) { }
  781. static inline void debug_timer_activate(struct timer_list *timer) { }
  782. static inline void debug_timer_deactivate(struct timer_list *timer) { }
  783. static inline void debug_timer_assert_init(struct timer_list *timer) { }
  784. #endif
  785. static inline void debug_init(struct timer_list *timer)
  786. {
  787. debug_timer_init(timer);
  788. trace_timer_init(timer);
  789. }
  790. static inline void debug_deactivate(struct timer_list *timer)
  791. {
  792. debug_timer_deactivate(timer);
  793. trace_timer_cancel(timer);
  794. }
  795. static inline void debug_assert_init(struct timer_list *timer)
  796. {
  797. debug_timer_assert_init(timer);
  798. }
  799. static void do_init_timer(struct timer_list *timer,
  800. void (*func)(struct timer_list *),
  801. unsigned int flags,
  802. const char *name, struct lock_class_key *key)
  803. {
  804. timer->entry.pprev = NULL;
  805. timer->function = func;
  806. if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
  807. flags &= TIMER_INIT_FLAGS;
  808. timer->flags = flags | raw_smp_processor_id();
  809. lockdep_init_map(&timer->lockdep_map, name, key, 0);
  810. }
  811. /**
  812. * init_timer_key - initialize a timer
  813. * @timer: the timer to be initialized
  814. * @func: timer callback function
  815. * @flags: timer flags
  816. * @name: name of the timer
  817. * @key: lockdep class key of the fake lock used for tracking timer
  818. * sync lock dependencies
  819. *
  820. * init_timer_key() must be done to a timer prior to calling *any* of the
  821. * other timer functions.
  822. */
  823. void init_timer_key(struct timer_list *timer,
  824. void (*func)(struct timer_list *), unsigned int flags,
  825. const char *name, struct lock_class_key *key)
  826. {
  827. debug_init(timer);
  828. do_init_timer(timer, func, flags, name, key);
  829. }
  830. EXPORT_SYMBOL(init_timer_key);
  831. static inline void detach_timer(struct timer_list *timer, bool clear_pending)
  832. {
  833. struct hlist_node *entry = &timer->entry;
  834. debug_deactivate(timer);
  835. __hlist_del(entry);
  836. if (clear_pending)
  837. entry->pprev = NULL;
  838. entry->next = LIST_POISON2;
  839. }
  840. static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
  841. bool clear_pending)
  842. {
  843. unsigned idx = timer_get_idx(timer);
  844. if (!timer_pending(timer))
  845. return 0;
  846. if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
  847. __clear_bit(idx, base->pending_map);
  848. base->next_expiry_recalc = true;
  849. }
  850. detach_timer(timer, clear_pending);
  851. return 1;
  852. }
  853. static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
  854. {
  855. int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
  856. struct timer_base *base;
  857. base = per_cpu_ptr(&timer_bases[index], cpu);
  858. /*
  859. * If the timer is deferrable and NO_HZ_COMMON is set then we need
  860. * to use the deferrable base.
  861. */
  862. if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
  863. base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
  864. return base;
  865. }
  866. static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
  867. {
  868. int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
  869. struct timer_base *base;
  870. base = this_cpu_ptr(&timer_bases[index]);
  871. /*
  872. * If the timer is deferrable and NO_HZ_COMMON is set then we need
  873. * to use the deferrable base.
  874. */
  875. if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
  876. base = this_cpu_ptr(&timer_bases[BASE_DEF]);
  877. return base;
  878. }
  879. static inline struct timer_base *get_timer_base(u32 tflags)
  880. {
  881. return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
  882. }
  883. static inline void __forward_timer_base(struct timer_base *base,
  884. unsigned long basej)
  885. {
  886. /*
  887. * Check whether we can forward the base. We can only do that when
  888. * @basej is past base->clk otherwise we might rewind base->clk.
  889. */
  890. if (time_before_eq(basej, base->clk))
  891. return;
  892. /*
  893. * If the next expiry value is > jiffies, then we fast forward to
  894. * jiffies otherwise we forward to the next expiry value.
  895. */
  896. if (time_after(base->next_expiry, basej)) {
  897. base->clk = basej;
  898. } else {
  899. if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
  900. return;
  901. base->clk = base->next_expiry;
  902. }
  903. }
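/*
 * Illustrative sketch (not part of the kernel source): a standalone model of
 * the clock-forwarding rule above. Plain comparisons stand in for the
 * wrapping-safe time_before()/time_after() macros, and the WARN_ON_ONCE()
 * sanity check is omitted.
 */
#include <stdio.h>

static unsigned long ex_forward(unsigned long clk, unsigned long next_expiry,
				unsigned long basej)
{
	if (basej <= clk)		/* nothing to forward */
		return clk;
	if (next_expiry > basej)	/* no timer due yet: catch up to jiffies */
		return basej;
	return next_expiry;		/* otherwise stop at the first expiry */
}

int main(void)
{
	printf("%lu\n", ex_forward(100, 500, 200));	/* -> 200 */
	printf("%lu\n", ex_forward(100, 150, 200));	/* -> 150 */
	return 0;
}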
  904. static inline void forward_timer_base(struct timer_base *base)
  905. {
  906. __forward_timer_base(base, READ_ONCE(jiffies));
  907. }
  908. /*
  909. * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
  910. * that all timers which are tied to this base are locked, and the base itself
  911. * is locked too.
  912. *
  913. * So __run_timers/migrate_timers can safely modify all timers which could
  914. * be found in the base->vectors array.
  915. *
  916. * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
  917. * to wait until the migration is done.
  918. */
  919. static struct timer_base *lock_timer_base(struct timer_list *timer,
  920. unsigned long *flags)
  921. __acquires(timer->base->lock)
  922. {
  923. for (;;) {
  924. struct timer_base *base;
  925. u32 tf;
  926. /*
  927. * We need to use READ_ONCE() here, otherwise the compiler
  928. * might re-read @tf between the check for TIMER_MIGRATING
  929. * and spin_lock().
  930. */
  931. tf = READ_ONCE(timer->flags);
  932. if (!(tf & TIMER_MIGRATING)) {
  933. base = get_timer_base(tf);
  934. raw_spin_lock_irqsave(&base->lock, *flags);
  935. if (timer->flags == tf)
  936. return base;
  937. raw_spin_unlock_irqrestore(&base->lock, *flags);
  938. }
  939. cpu_relax();
  940. }
  941. }
  942. #define MOD_TIMER_PENDING_ONLY 0x01
  943. #define MOD_TIMER_REDUCE 0x02
  944. #define MOD_TIMER_NOTPENDING 0x04
  945. static inline int
  946. __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
  947. {
  948. unsigned long clk = 0, flags, bucket_expiry;
  949. struct timer_base *base, *new_base;
  950. unsigned int idx = UINT_MAX;
  951. int ret = 0;
  952. debug_assert_init(timer);
  953. /*
  954. * This is a common optimization triggered by the networking code - if
  955. * the timer is re-modified to have the same timeout or ends up in the
  956. * same array bucket then just return:
  957. */
  958. if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
  959. /*
  960. * The downside of this optimization is that it can result in
  961. * larger granularity than you would get from adding a new
  962. * timer with this expiry.
  963. */
  964. long diff = timer->expires - expires;
  965. if (!diff)
  966. return 1;
  967. if (options & MOD_TIMER_REDUCE && diff <= 0)
  968. return 1;
  969. /*
  970. * We lock timer base and calculate the bucket index right
  971. * here. If the timer ends up in the same bucket, then we
  972. * just update the expiry time and avoid the whole
  973. * dequeue/enqueue dance.
  974. */
  975. base = lock_timer_base(timer, &flags);
  976. /*
  977. * Has @timer been shutdown? This needs to be evaluated
  978. * while holding base lock to prevent a race against the
  979. * shutdown code.
  980. */
  981. if (!timer->function)
  982. goto out_unlock;
  983. forward_timer_base(base);
  984. if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
  985. time_before_eq(timer->expires, expires)) {
  986. ret = 1;
  987. goto out_unlock;
  988. }
  989. clk = base->clk;
  990. idx = calc_wheel_index(expires, clk, &bucket_expiry);
  991. /*
  992. * Retrieve and compare the array index of the pending
  993. * timer. If it matches set the expiry to the new value so a
  994. * subsequent call will exit in the expires check above.
  995. */
  996. if (idx == timer_get_idx(timer)) {
  997. if (!(options & MOD_TIMER_REDUCE))
  998. timer->expires = expires;
  999. else if (time_after(timer->expires, expires))
  1000. timer->expires = expires;
  1001. ret = 1;
  1002. goto out_unlock;
  1003. }
  1004. } else {
  1005. base = lock_timer_base(timer, &flags);
  1006. /*
  1007. * Has @timer been shutdown? This needs to be evaluated
  1008. * while holding base lock to prevent a race against the
  1009. * shutdown code.
  1010. */
  1011. if (!timer->function)
  1012. goto out_unlock;
  1013. forward_timer_base(base);
  1014. }
  1015. ret = detach_if_pending(timer, base, false);
  1016. if (!ret && (options & MOD_TIMER_PENDING_ONLY))
  1017. goto out_unlock;
  1018. new_base = get_timer_this_cpu_base(timer->flags);
  1019. if (base != new_base) {
  1020. /*
  1021. * We are trying to schedule the timer on the new base.
  1022. * However we can't change timer's base while it is running,
  1023. * otherwise timer_delete_sync() can't detect that the timer's
  1024. * handler has not yet finished. This also guarantees that the
  1025. * timer is serialized wrt itself.
  1026. */
  1027. if (likely(base->running_timer != timer)) {
  1028. /* See the comment in lock_timer_base() */
  1029. timer->flags |= TIMER_MIGRATING;
  1030. raw_spin_unlock(&base->lock);
  1031. base = new_base;
  1032. raw_spin_lock(&base->lock);
  1033. WRITE_ONCE(timer->flags,
  1034. (timer->flags & ~TIMER_BASEMASK) | base->cpu);
  1035. forward_timer_base(base);
  1036. }
  1037. }
  1038. debug_timer_activate(timer);
  1039. timer->expires = expires;
  1040. /*
  1041. * If 'idx' was calculated above and the base time did not advance
  1042. * between calculating 'idx' and possibly switching the base, only
  1043. * enqueue_timer() is required. Otherwise we need to (re)calculate
  1044. * the wheel index via internal_add_timer().
  1045. */
  1046. if (idx != UINT_MAX && clk == base->clk)
  1047. enqueue_timer(base, timer, idx, bucket_expiry);
  1048. else
  1049. internal_add_timer(base, timer);
  1050. out_unlock:
  1051. raw_spin_unlock_irqrestore(&base->lock, flags);
  1052. return ret;
  1053. }
  1054. /**
  1055. * mod_timer_pending - Modify a pending timer's timeout
  1056. * @timer: The pending timer to be modified
  1057. * @expires: New absolute timeout in jiffies
  1058. *
  1059. * mod_timer_pending() is the same for pending timers as mod_timer(), but
  1060. * will not activate inactive timers.
  1061. *
  1062. * If @timer->function == NULL then the start operation is silently
  1063. * discarded.
  1064. *
  1065. * Return:
  1066. * * %0 - The timer was inactive and not modified or was in
  1067. * shutdown state and the operation was discarded
  1068. * * %1 - The timer was active and requeued to expire at @expires
  1069. */
  1070. int mod_timer_pending(struct timer_list *timer, unsigned long expires)
  1071. {
  1072. return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
  1073. }
  1074. EXPORT_SYMBOL(mod_timer_pending);
  1075. /**
  1076. * mod_timer - Modify a timer's timeout
  1077. * @timer: The timer to be modified
  1078. * @expires: New absolute timeout in jiffies
  1079. *
  1080. * mod_timer(timer, expires) is equivalent to:
  1081. *
  1082. * del_timer(timer); timer->expires = expires; add_timer(timer);
  1083. *
  1084. * mod_timer() is more efficient than the above open coded sequence. In
  1085. * case that the timer is inactive, the del_timer() part is a NOP. The
  1086. * timer is in any case activated with the new expiry time @expires.
  1087. *
  1088. * Note that if there are multiple unserialized concurrent users of the
  1089. * same timer, then mod_timer() is the only safe way to modify the timeout,
  1090. * since add_timer() cannot modify an already running timer.
  1091. *
  1092. * If @timer->function == NULL then the start operation is silently
  1093. * discarded. In this case the return value is 0 and meaningless.
  1094. *
  1095. * Return:
  1096. * * %0 - The timer was inactive and started or was in shutdown
  1097. * state and the operation was discarded
  1098. * * %1 - The timer was active and requeued to expire at @expires or
  1099. * the timer was active and not modified because @expires did
  1100. * not change the effective expiry time
  1101. */
  1102. int mod_timer(struct timer_list *timer, unsigned long expires)
  1103. {
  1104. return __mod_timer(timer, expires, 0);
  1105. }
  1106. EXPORT_SYMBOL(mod_timer);
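/*
 * Illustrative sketch (not part of the kernel source): the typical client
 * pattern the kernel-doc above describes. "struct ex_dev" and its fields are
 * made up for the example; the callback re-arms the timer with mod_timer()
 * one second out, and shutdown uses timer_delete_sync().
 */
#include <linux/timer.h>
#include <linux/jiffies.h>

struct ex_dev {
	struct timer_list poll_timer;
	unsigned long polls;
};

static void ex_poll(struct timer_list *t)
{
	struct ex_dev *dev = from_timer(dev, t, poll_timer);

	dev->polls++;
	/* Re-arm: activates the timer whether or not it is still pending */
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void ex_dev_start(struct ex_dev *dev)
{
	timer_setup(&dev->poll_timer, ex_poll, 0);
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void ex_dev_stop(struct ex_dev *dev)
{
	timer_delete_sync(&dev->poll_timer);
}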
  1107. /**
  1108. * timer_reduce - Modify a timer's timeout if it would reduce the timeout
  1109. * @timer: The timer to be modified
  1110. * @expires: New absolute timeout in jiffies
  1111. *
  1112. * timer_reduce() is very similar to mod_timer(), except that it will only
  1113. * modify an enqueued timer if that would reduce the expiration time. If
  1114. * @timer is not enqueued it starts the timer.
  1115. *
  1116. * If @timer->function == NULL then the start operation is silently
  1117. * discarded.
  1118. *
  1119. * Return:
  1120. * * %0 - The timer was inactive and started or was in shutdown
  1121. * state and the operation was discarded
  1122. * * %1 - The timer was active and requeued to expire at @expires or
  1123. * the timer was active and not modified because @expires
  1124. * did not change the effective expiry time such that the
  1125. * timer would expire earlier than already scheduled
  1126. */
  1127. int timer_reduce(struct timer_list *timer, unsigned long expires)
  1128. {
  1129. return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
  1130. }
  1131. EXPORT_SYMBOL(timer_reduce);
  1132. /**
  1133. * add_timer - Start a timer
  1134. * @timer: The timer to be started
  1135. *
  1136. * Start @timer to expire at @timer->expires in the future. @timer->expires
  1137. * is the absolute expiry time measured in 'jiffies'. When the timer expires
  1138. * timer->function(timer) will be invoked from soft interrupt context.
  1139. *
  1140. * The @timer->expires and @timer->function fields must be set prior
  1141. * to calling this function.
  1142. *
  1143. * If @timer->function == NULL then the start operation is silently
  1144. * discarded.
  1145. *
  1146. * If @timer->expires is already in the past @timer will be queued to
  1147. * expire at the next timer tick.
  1148. *
  1149. * This can only operate on an inactive timer. Attempts to invoke this on
  1150. * an active timer are rejected with a warning.
  1151. */
  1152. void add_timer(struct timer_list *timer)
  1153. {
  1154. if (WARN_ON_ONCE(timer_pending(timer)))
  1155. return;
  1156. __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
  1157. }
  1158. EXPORT_SYMBOL(add_timer);
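/*
 * Illustrative sketch with hypothetical names ('wdog', 'wdog_timer_fn',
 * 'wdog_arm'): arming a one-shot timer with add_timer(). Unlike mod_timer(),
 * the caller sets @timer->expires itself and must guarantee that the timer
 * is not already pending.
 */
struct wdog {
	struct timer_list timer;
};

static void wdog_timer_fn(struct timer_list *t)
{
	/* ... handle the expired watchdog ... */
}

static void wdog_arm(struct wdog *w)
{
	timer_setup(&w->timer, wdog_timer_fn, 0);
	w->timer.expires = jiffies + 10 * HZ;	/* hypothetical 10s timeout */
	add_timer(&w->timer);
}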
  1159. /**
  1160. * add_timer_local() - Start a timer on the local CPU
  1161. * @timer: The timer to be started
  1162. *
  1163. * Same as add_timer() except that the timer flag TIMER_PINNED is set.
  1164. *
  1165. * See add_timer() for further details.
  1166. */
  1167. void add_timer_local(struct timer_list *timer)
  1168. {
  1169. if (WARN_ON_ONCE(timer_pending(timer)))
  1170. return;
  1171. timer->flags |= TIMER_PINNED;
  1172. __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
  1173. }
  1174. EXPORT_SYMBOL(add_timer_local);
  1175. /**
  1176. * add_timer_global() - Start a timer without TIMER_PINNED flag set
  1177. * @timer: The timer to be started
  1178. *
  1179. * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
  1180. *
  1181. * See add_timer() for further details.
  1182. */
  1183. void add_timer_global(struct timer_list *timer)
  1184. {
  1185. if (WARN_ON_ONCE(timer_pending(timer)))
  1186. return;
  1187. timer->flags &= ~TIMER_PINNED;
  1188. __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
  1189. }
  1190. EXPORT_SYMBOL(add_timer_global);
  1191. /**
  1192. * add_timer_on - Start a timer on a particular CPU
  1193. * @timer: The timer to be started
  1194. * @cpu: The CPU to start it on
  1195. *
  1196. * Same as add_timer() except that it starts the timer on the given CPU and
1197. * the TIMER_PINNED flag is set. If the timer should not be pinned on the
1198. * next round, rearm it with add_timer_global() instead, as that clears the
1199. * TIMER_PINNED flag.
  1200. *
  1201. * See add_timer() for further details.
  1202. */
  1203. void add_timer_on(struct timer_list *timer, int cpu)
  1204. {
  1205. struct timer_base *new_base, *base;
  1206. unsigned long flags;
  1207. debug_assert_init(timer);
  1208. if (WARN_ON_ONCE(timer_pending(timer)))
  1209. return;
  1210. /* Make sure timer flags have TIMER_PINNED flag set */
  1211. timer->flags |= TIMER_PINNED;
  1212. new_base = get_timer_cpu_base(timer->flags, cpu);
  1213. /*
  1214. * If @timer was on a different CPU, it should be migrated with the
  1215. * old base locked to prevent other operations proceeding with the
  1216. * wrong base locked. See lock_timer_base().
  1217. */
  1218. base = lock_timer_base(timer, &flags);
  1219. /*
  1220. * Has @timer been shutdown? This needs to be evaluated while
  1221. * holding base lock to prevent a race against the shutdown code.
  1222. */
  1223. if (!timer->function)
  1224. goto out_unlock;
  1225. if (base != new_base) {
  1226. timer->flags |= TIMER_MIGRATING;
  1227. raw_spin_unlock(&base->lock);
  1228. base = new_base;
  1229. raw_spin_lock(&base->lock);
  1230. WRITE_ONCE(timer->flags,
  1231. (timer->flags & ~TIMER_BASEMASK) | cpu);
  1232. }
  1233. forward_timer_base(base);
  1234. debug_timer_activate(timer);
  1235. internal_add_timer(base, timer);
  1236. out_unlock:
  1237. raw_spin_unlock_irqrestore(&base->lock, flags);
  1238. }
  1239. EXPORT_SYMBOL_GPL(add_timer_on);
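/*
 * Illustrative sketch with a hypothetical helper ('hb_timer_start'): starting
 * a heartbeat timer on a specific CPU. add_timer_on() sets TIMER_PINNED
 * implicitly; if the timer should be allowed to float again on a later round,
 * rearm it with add_timer_global() as described above. @hb is assumed to have
 * been prepared with timer_setup().
 */
static void hb_timer_start(struct timer_list *hb, int cpu)
{
	hb->expires = jiffies + HZ;
	add_timer_on(hb, cpu);
}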
  1240. /**
  1241. * __timer_delete - Internal function: Deactivate a timer
  1242. * @timer: The timer to be deactivated
  1243. * @shutdown: If true, this indicates that the timer is about to be
  1244. * shutdown permanently.
  1245. *
  1246. * If @shutdown is true then @timer->function is set to NULL under the
1247. * timer base lock which prevents further rearming of the timer. In that
  1248. * case any attempt to rearm @timer after this function returns will be
  1249. * silently ignored.
  1250. *
  1251. * Return:
  1252. * * %0 - The timer was not pending
  1253. * * %1 - The timer was pending and deactivated
  1254. */
  1255. static int __timer_delete(struct timer_list *timer, bool shutdown)
  1256. {
  1257. struct timer_base *base;
  1258. unsigned long flags;
  1259. int ret = 0;
  1260. debug_assert_init(timer);
  1261. /*
  1262. * If @shutdown is set then the lock has to be taken whether the
  1263. * timer is pending or not to protect against a concurrent rearm
  1264. * which might hit between the lockless pending check and the lock
  1265. * acquisition. By taking the lock it is ensured that such a newly
  1266. * enqueued timer is dequeued and cannot end up with
  1267. * timer->function == NULL in the expiry code.
  1268. *
  1269. * If timer->function is currently executed, then this makes sure
  1270. * that the callback cannot requeue the timer.
  1271. */
  1272. if (timer_pending(timer) || shutdown) {
  1273. base = lock_timer_base(timer, &flags);
  1274. ret = detach_if_pending(timer, base, true);
  1275. if (shutdown)
  1276. timer->function = NULL;
  1277. raw_spin_unlock_irqrestore(&base->lock, flags);
  1278. }
  1279. return ret;
  1280. }
  1281. /**
  1282. * timer_delete - Deactivate a timer
  1283. * @timer: The timer to be deactivated
  1284. *
  1285. * The function only deactivates a pending timer, but contrary to
  1286. * timer_delete_sync() it does not take into account whether the timer's
  1287. * callback function is concurrently executed on a different CPU or not.
1288. * Nor does it prevent rearming of the timer. If @timer can be rearmed
  1289. * concurrently then the return value of this function is meaningless.
  1290. *
  1291. * Return:
  1292. * * %0 - The timer was not pending
  1293. * * %1 - The timer was pending and deactivated
  1294. */
  1295. int timer_delete(struct timer_list *timer)
  1296. {
  1297. return __timer_delete(timer, false);
  1298. }
  1299. EXPORT_SYMBOL(timer_delete);
  1300. /**
  1301. * timer_shutdown - Deactivate a timer and prevent rearming
  1302. * @timer: The timer to be deactivated
  1303. *
1304. * The function does not wait for a possibly running timer callback on a
  1305. * different CPU but it prevents rearming of the timer. Any attempt to arm
  1306. * @timer after this function returns will be silently ignored.
  1307. *
  1308. * This function is useful for teardown code and should only be used when
  1309. * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
  1310. *
  1311. * Return:
  1312. * * %0 - The timer was not pending
  1313. * * %1 - The timer was pending
  1314. */
  1315. int timer_shutdown(struct timer_list *timer)
  1316. {
  1317. return __timer_delete(timer, true);
  1318. }
  1319. EXPORT_SYMBOL_GPL(timer_shutdown);
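/*
 * Illustrative sketch with hypothetical names ('conn', 'conn_close'):
 * teardown while holding a lock which the timer callback also takes, so
 * timer_shutdown_sync() cannot be used here. timer_shutdown() at least
 * guarantees that nothing can rearm the timer anymore; the object must still
 * not be freed before a possibly running callback is known to have finished.
 */
struct conn {
	spinlock_t lock;
	struct timer_list retransmit_timer;
};

static void conn_close(struct conn *c)
{
	spin_lock_bh(&c->lock);
	timer_shutdown(&c->retransmit_timer);	/* no further rearming */
	spin_unlock_bh(&c->lock);
}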
  1320. /**
  1321. * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
  1322. * @timer: Timer to deactivate
  1323. * @shutdown: If true, this indicates that the timer is about to be
  1324. * shutdown permanently.
  1325. *
  1326. * If @shutdown is true then @timer->function is set to NULL under the
  1327. * timer base lock which prevents further rearming of the timer. Any
  1328. * attempt to rearm @timer after this function returns will be silently
  1329. * ignored.
  1330. *
  1331. * This function cannot guarantee that the timer cannot be rearmed
  1332. * right after dropping the base lock if @shutdown is false. That
  1333. * needs to be prevented by the calling code if necessary.
  1334. *
  1335. * Return:
  1336. * * %0 - The timer was not pending
  1337. * * %1 - The timer was pending and deactivated
  1338. * * %-1 - The timer callback function is running on a different CPU
  1339. */
  1340. static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
  1341. {
  1342. struct timer_base *base;
  1343. unsigned long flags;
  1344. int ret = -1;
  1345. debug_assert_init(timer);
  1346. base = lock_timer_base(timer, &flags);
  1347. if (base->running_timer != timer)
  1348. ret = detach_if_pending(timer, base, true);
  1349. if (shutdown)
  1350. timer->function = NULL;
  1351. raw_spin_unlock_irqrestore(&base->lock, flags);
  1352. return ret;
  1353. }
  1354. /**
  1355. * try_to_del_timer_sync - Try to deactivate a timer
  1356. * @timer: Timer to deactivate
  1357. *
  1358. * This function tries to deactivate a timer. On success the timer is not
  1359. * queued and the timer callback function is not running on any CPU.
  1360. *
  1361. * This function does not guarantee that the timer cannot be rearmed right
  1362. * after dropping the base lock. That needs to be prevented by the calling
  1363. * code if necessary.
  1364. *
  1365. * Return:
  1366. * * %0 - The timer was not pending
  1367. * * %1 - The timer was pending and deactivated
  1368. * * %-1 - The timer callback function is running on a different CPU
  1369. */
  1370. int try_to_del_timer_sync(struct timer_list *timer)
  1371. {
  1372. return __try_to_del_timer_sync(timer, false);
  1373. }
  1374. EXPORT_SYMBOL(try_to_del_timer_sync);
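/*
 * Illustrative sketch with hypothetical names ('foo_dev', 'foo_stop_locked'):
 * deleting a timer while holding a lock which the callback also acquires.
 * timer_delete_sync() would deadlock here, so the lock is dropped and the
 * deletion retried whenever the callback is seen running on another CPU.
 */
struct foo_dev {
	spinlock_t lock;
	struct timer_list timer;
};

static void foo_stop_locked(struct foo_dev *dev)
	__must_hold(&dev->lock)
{
	while (try_to_del_timer_sync(&dev->timer) < 0) {
		spin_unlock(&dev->lock);
		cpu_relax();
		spin_lock(&dev->lock);
	}
}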
  1375. #ifdef CONFIG_PREEMPT_RT
  1376. static __init void timer_base_init_expiry_lock(struct timer_base *base)
  1377. {
  1378. spin_lock_init(&base->expiry_lock);
  1379. }
  1380. static inline void timer_base_lock_expiry(struct timer_base *base)
  1381. {
  1382. spin_lock(&base->expiry_lock);
  1383. }
  1384. static inline void timer_base_unlock_expiry(struct timer_base *base)
  1385. {
  1386. spin_unlock(&base->expiry_lock);
  1387. }
  1388. /*
  1389. * The counterpart to del_timer_wait_running().
  1390. *
  1391. * If there is a waiter for base->expiry_lock, then it was waiting for the
  1392. * timer callback to finish. Drop expiry_lock and reacquire it. That allows
  1393. * the waiter to acquire the lock and make progress.
  1394. */
  1395. static void timer_sync_wait_running(struct timer_base *base)
  1396. __releases(&base->lock) __releases(&base->expiry_lock)
  1397. __acquires(&base->expiry_lock) __acquires(&base->lock)
  1398. {
  1399. if (atomic_read(&base->timer_waiters)) {
  1400. raw_spin_unlock_irq(&base->lock);
  1401. spin_unlock(&base->expiry_lock);
  1402. spin_lock(&base->expiry_lock);
  1403. raw_spin_lock_irq(&base->lock);
  1404. }
  1405. }
  1406. /*
  1407. * This function is called on PREEMPT_RT kernels when the fast path
  1408. * deletion of a timer failed because the timer callback function was
  1409. * running.
  1410. *
  1411. * This prevents priority inversion, if the softirq thread on a remote CPU
1412. * got preempted, and it prevents a livelock when the task which tries to
1413. * delete a timer has preempted the softirq thread running the timer callback
  1414. * function.
  1415. */
  1416. static void del_timer_wait_running(struct timer_list *timer)
  1417. {
  1418. u32 tf;
  1419. tf = READ_ONCE(timer->flags);
  1420. if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
  1421. struct timer_base *base = get_timer_base(tf);
  1422. /*
  1423. * Mark the base as contended and grab the expiry lock,
  1424. * which is held by the softirq across the timer
  1425. * callback. Drop the lock immediately so the softirq can
  1426. * expire the next timer. In theory the timer could already
  1427. * be running again, but that's more than unlikely and just
  1428. * causes another wait loop.
  1429. */
  1430. atomic_inc(&base->timer_waiters);
  1431. spin_lock_bh(&base->expiry_lock);
  1432. atomic_dec(&base->timer_waiters);
  1433. spin_unlock_bh(&base->expiry_lock);
  1434. }
  1435. }
  1436. #else
  1437. static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
  1438. static inline void timer_base_lock_expiry(struct timer_base *base) { }
  1439. static inline void timer_base_unlock_expiry(struct timer_base *base) { }
  1440. static inline void timer_sync_wait_running(struct timer_base *base) { }
  1441. static inline void del_timer_wait_running(struct timer_list *timer) { }
  1442. #endif
  1443. /**
  1444. * __timer_delete_sync - Internal function: Deactivate a timer and wait
  1445. * for the handler to finish.
  1446. * @timer: The timer to be deactivated
  1447. * @shutdown: If true, @timer->function will be set to NULL under the
  1448. * timer base lock which prevents rearming of @timer
  1449. *
  1450. * If @shutdown is not set the timer can be rearmed later. If the timer can
1451. * be rearmed concurrently, i.e. after dropping the base lock, then the
  1452. * return value is meaningless.
  1453. *
  1454. * If @shutdown is set then @timer->function is set to NULL under timer
  1455. * base lock which prevents rearming of the timer. Any attempt to rearm
  1456. * a shutdown timer is silently ignored.
  1457. *
  1458. * If the timer should be reused after shutdown it has to be initialized
  1459. * again.
  1460. *
  1461. * Return:
  1462. * * %0 - The timer was not pending
  1463. * * %1 - The timer was pending and deactivated
  1464. */
  1465. static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
  1466. {
  1467. int ret;
  1468. #ifdef CONFIG_LOCKDEP
  1469. unsigned long flags;
  1470. /*
  1471. * If lockdep gives a backtrace here, please reference
  1472. * the synchronization rules above.
  1473. */
  1474. local_irq_save(flags);
  1475. lock_map_acquire(&timer->lockdep_map);
  1476. lock_map_release(&timer->lockdep_map);
  1477. local_irq_restore(flags);
  1478. #endif
  1479. /*
  1480. * don't use it in hardirq context, because it
  1481. * could lead to deadlock.
  1482. */
  1483. WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));
  1484. /*
  1485. * Must be able to sleep on PREEMPT_RT because of the slowpath in
  1486. * del_timer_wait_running().
  1487. */
  1488. if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
  1489. lockdep_assert_preemption_enabled();
  1490. do {
  1491. ret = __try_to_del_timer_sync(timer, shutdown);
  1492. if (unlikely(ret < 0)) {
  1493. del_timer_wait_running(timer);
  1494. cpu_relax();
  1495. }
  1496. } while (ret < 0);
  1497. return ret;
  1498. }
  1499. /**
  1500. * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
  1501. * @timer: The timer to be deactivated
  1502. *
  1503. * Synchronization rules: Callers must prevent restarting of the timer,
  1504. * otherwise this function is meaningless. It must not be called from
  1505. * interrupt contexts unless the timer is an irqsafe one. The caller must
  1506. * not hold locks which would prevent completion of the timer's callback
  1507. * function. The timer's handler must not call add_timer_on(). Upon exit
  1508. * the timer is not queued and the handler is not running on any CPU.
  1509. *
  1510. * For !irqsafe timers, the caller must not hold locks that are held in
1511. * interrupt context, even if the lock has nothing to do with the timer in
  1512. * question. Here's why::
  1513. *
1514. *    CPU0                             CPU1
1515. *    ----                             ----
1516. *                                     <SOFTIRQ>
1517. *                                       call_timer_fn();
1518. *                                       base->running_timer = mytimer;
1519. *    spin_lock_irq(somelock);
1520. *                                     <IRQ>
1521. *                                        spin_lock(somelock);
1522. *    timer_delete_sync(mytimer);
1523. *    while (base->running_timer == mytimer);
  1524. *
  1525. * Now timer_delete_sync() will never return and never release somelock.
  1526. * The interrupt on the other CPU is waiting to grab somelock but it has
  1527. * interrupted the softirq that CPU0 is waiting to finish.
  1528. *
  1529. * This function cannot guarantee that the timer is not rearmed again by
  1530. * some concurrent or preempting code, right after it dropped the base
  1531. * lock. If there is the possibility of a concurrent rearm then the return
  1532. * value of the function is meaningless.
  1533. *
  1534. * If such a guarantee is needed, e.g. for teardown situations then use
  1535. * timer_shutdown_sync() instead.
  1536. *
  1537. * Return:
  1538. * * %0 - The timer was not pending
  1539. * * %1 - The timer was pending and deactivated
  1540. */
  1541. int timer_delete_sync(struct timer_list *timer)
  1542. {
  1543. return __timer_delete_sync(timer, false);
  1544. }
  1545. EXPORT_SYMBOL(timer_delete_sync);
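/*
 * Illustrative sketch with hypothetical names ('bar_dev', 'bar_remove'): the
 * common timer_delete_sync() case: device removal where no lock relevant to
 * the callback is held and the caller ensures nothing rearms the timer.
 */
struct bar_dev {
	bool going_away;
	struct timer_list poll_timer;
};

static void bar_remove(struct bar_dev *bar)
{
	/* All paths that could rearm the timer check 'going_away' first. */
	WRITE_ONCE(bar->going_away, true);
	timer_delete_sync(&bar->poll_timer);
	/* From here on the callback is neither running nor queued. */
}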
  1546. /**
  1547. * timer_shutdown_sync - Shutdown a timer and prevent rearming
  1548. * @timer: The timer to be shutdown
  1549. *
  1550. * When the function returns it is guaranteed that:
  1551. * - @timer is not queued
  1552. * - The callback function of @timer is not running
  1553. * - @timer cannot be enqueued again. Any attempt to rearm
  1554. * @timer is silently ignored.
  1555. *
  1556. * See timer_delete_sync() for synchronization rules.
  1557. *
  1558. * This function is useful for final teardown of an infrastructure where
  1559. * the timer is subject to a circular dependency problem.
  1560. *
  1561. * A common pattern for this is a timer and a workqueue where the timer can
  1562. * schedule work and work can arm the timer. On shutdown the workqueue must
  1563. * be destroyed and the timer must be prevented from rearming. Unless the
1564. * code has conditionals like 'if (mything->in_shutdown)' to prevent that,
1565. * there is no way to get this correct with timer_delete_sync().
1566. *
1567. * timer_shutdown_sync() solves this problem. The correct ordering of
  1568. * calls in this case is:
  1569. *
  1570. * timer_shutdown_sync(&mything->timer);
  1571. * workqueue_destroy(&mything->workqueue);
  1572. *
  1573. * After this 'mything' can be safely freed.
  1574. *
  1575. * This obviously implies that the timer is not required to be functional
  1576. * for the rest of the shutdown operation.
  1577. *
  1578. * Return:
  1579. * * %0 - The timer was not pending
  1580. * * %1 - The timer was pending
  1581. */
  1582. int timer_shutdown_sync(struct timer_list *timer)
  1583. {
  1584. return __timer_delete_sync(timer, true);
  1585. }
  1586. EXPORT_SYMBOL_GPL(timer_shutdown_sync);
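/*
 * Illustrative sketch of the teardown ordering described above, with
 * hypothetical names ('mything', 'mything_teardown'). During normal operation
 * the timer queues work and the work rearms the timer; timer_shutdown_sync()
 * breaks that cycle before the workqueue is destroyed. Assumes
 * <linux/workqueue.h> for the workqueue calls.
 */
struct mything {
	struct timer_list timer;
	struct work_struct work;
	struct workqueue_struct *workqueue;
};

static void mything_teardown(struct mything *thing)
{
	/* After this, the work item can no longer (re)arm the timer. */
	timer_shutdown_sync(&thing->timer);
	/* After this, the timer callback can no longer queue the work. */
	destroy_workqueue(thing->workqueue);
	/* 'thing' can be freed by the caller now. */
}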
  1587. static void call_timer_fn(struct timer_list *timer,
  1588. void (*fn)(struct timer_list *),
  1589. unsigned long baseclk)
  1590. {
  1591. int count = preempt_count();
  1592. #ifdef CONFIG_LOCKDEP
  1593. /*
  1594. * It is permissible to free the timer from inside the
1595. * function that is called from it; we need to take this into
  1596. * account for lockdep too. To avoid bogus "held lock freed"
  1597. * warnings as well as problems when looking into
  1598. * timer->lockdep_map, make a copy and use that here.
  1599. */
  1600. struct lockdep_map lockdep_map;
  1601. lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
  1602. #endif
  1603. /*
  1604. * Couple the lock chain with the lock chain at
  1605. * timer_delete_sync() by acquiring the lock_map around the fn()
  1606. * call here and in timer_delete_sync().
  1607. */
  1608. lock_map_acquire(&lockdep_map);
  1609. trace_timer_expire_entry(timer, baseclk);
  1610. fn(timer);
  1611. trace_timer_expire_exit(timer);
  1612. lock_map_release(&lockdep_map);
  1613. if (count != preempt_count()) {
  1614. WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
  1615. fn, count, preempt_count());
  1616. /*
  1617. * Restore the preempt count. That gives us a decent
  1618. * chance to survive and extract information. If the
  1619. * callback kept a lock held, bad luck, but not worse
  1620. * than the BUG() we had.
  1621. */
  1622. preempt_count_set(count);
  1623. }
  1624. }
  1625. static void expire_timers(struct timer_base *base, struct hlist_head *head)
  1626. {
  1627. /*
  1628. * This value is required only for tracing. base->clk was
  1629. * incremented directly before expire_timers was called. But expiry
  1630. * is related to the old base->clk value.
  1631. */
  1632. unsigned long baseclk = base->clk - 1;
  1633. while (!hlist_empty(head)) {
  1634. struct timer_list *timer;
  1635. void (*fn)(struct timer_list *);
  1636. timer = hlist_entry(head->first, struct timer_list, entry);
  1637. base->running_timer = timer;
  1638. detach_timer(timer, true);
  1639. fn = timer->function;
  1640. if (WARN_ON_ONCE(!fn)) {
  1641. /* Should never happen. Emphasis on should! */
  1642. base->running_timer = NULL;
  1643. continue;
  1644. }
  1645. if (timer->flags & TIMER_IRQSAFE) {
  1646. raw_spin_unlock(&base->lock);
  1647. call_timer_fn(timer, fn, baseclk);
  1648. raw_spin_lock(&base->lock);
  1649. base->running_timer = NULL;
  1650. } else {
  1651. raw_spin_unlock_irq(&base->lock);
  1652. call_timer_fn(timer, fn, baseclk);
  1653. raw_spin_lock_irq(&base->lock);
  1654. base->running_timer = NULL;
  1655. timer_sync_wait_running(base);
  1656. }
  1657. }
  1658. }
  1659. static int collect_expired_timers(struct timer_base *base,
  1660. struct hlist_head *heads)
  1661. {
  1662. unsigned long clk = base->clk = base->next_expiry;
  1663. struct hlist_head *vec;
  1664. int i, levels = 0;
  1665. unsigned int idx;
  1666. for (i = 0; i < LVL_DEPTH; i++) {
  1667. idx = (clk & LVL_MASK) + i * LVL_SIZE;
  1668. if (__test_and_clear_bit(idx, base->pending_map)) {
  1669. vec = base->vectors + idx;
  1670. hlist_move_list(vec, heads++);
  1671. levels++;
  1672. }
  1673. /* Is it time to look at the next level? */
  1674. if (clk & LVL_CLK_MASK)
  1675. break;
  1676. /* Shift clock for the next level granularity */
  1677. clk >>= LVL_CLK_SHIFT;
  1678. }
  1679. return levels;
  1680. }
  1681. /*
  1682. * Find the next pending bucket of a level. Search from level start (@offset)
  1683. * + @clk upwards and if nothing there, search from start of the level
  1684. * (@offset) up to @offset + clk.
  1685. */
  1686. static int next_pending_bucket(struct timer_base *base, unsigned offset,
  1687. unsigned clk)
  1688. {
  1689. unsigned pos, start = offset + clk;
  1690. unsigned end = offset + LVL_SIZE;
  1691. pos = find_next_bit(base->pending_map, end, start);
  1692. if (pos < end)
  1693. return pos - start;
  1694. pos = find_next_bit(base->pending_map, start, offset);
  1695. return pos < start ? pos + LVL_SIZE - start : -1;
  1696. }
  1697. /*
  1698. * Search the first expiring timer in the various clock levels. Caller must
  1699. * hold base->lock.
  1700. *
  1701. * Store next expiry time in base->next_expiry.
  1702. */
  1703. static void timer_recalc_next_expiry(struct timer_base *base)
  1704. {
  1705. unsigned long clk, next, adj;
  1706. unsigned lvl, offset = 0;
  1707. next = base->clk + NEXT_TIMER_MAX_DELTA;
  1708. clk = base->clk;
  1709. for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
  1710. int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
  1711. unsigned long lvl_clk = clk & LVL_CLK_MASK;
  1712. if (pos >= 0) {
  1713. unsigned long tmp = clk + (unsigned long) pos;
  1714. tmp <<= LVL_SHIFT(lvl);
  1715. if (time_before(tmp, next))
  1716. next = tmp;
  1717. /*
  1718. * If the next expiration happens before we reach
  1719. * the next level, no need to check further.
  1720. */
  1721. if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
  1722. break;
  1723. }
  1724. /*
  1725. * Clock for the next level. If the current level clock lower
  1726. * bits are zero, we look at the next level as is. If not we
  1727. * need to advance it by one because that's going to be the
  1728. * next expiring bucket in that level. base->clk is the next
  1729. * expiring jiffy. So in case of:
  1730. *
  1731. * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1732. *  0    0    0    0    0    0
  1733. *
  1734. * we have to look at all levels @index 0. With
  1735. *
  1736. * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1737. *  0    0    0    0    0    2
  1738. *
  1739. * LVL0 has the next expiring bucket @index 2. The upper
  1740. * levels have the next expiring bucket @index 1.
  1741. *
  1742. * In case that the propagation wraps the next level the same
  1743. * rules apply:
  1744. *
  1745. * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1746. *  0    0    0    0    F    2
  1747. *
  1748. * So after looking at LVL0 we get:
  1749. *
  1750. * LVL5 LVL4 LVL3 LVL2 LVL1
1751. *  0    0    0    1    0
  1752. *
  1753. * So no propagation from LVL1 to LVL2 because that happened
  1754. * with the add already, but then we need to propagate further
  1755. * from LVL2 to LVL3.
  1756. *
  1757. * So the simple check whether the lower bits of the current
  1758. * level are 0 or not is sufficient for all cases.
  1759. */
  1760. adj = lvl_clk ? 1 : 0;
  1761. clk >>= LVL_CLK_SHIFT;
  1762. clk += adj;
  1763. }
  1764. WRITE_ONCE(base->next_expiry, next);
  1765. base->next_expiry_recalc = false;
  1766. base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
  1767. }
  1768. #ifdef CONFIG_NO_HZ_COMMON
  1769. /*
1770. * Check whether the next hrtimer event is before the next timer wheel
1771. * event:
  1772. */
  1773. static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
  1774. {
  1775. u64 nextevt = hrtimer_get_next_event();
  1776. /*
  1777. * If high resolution timers are enabled
  1778. * hrtimer_get_next_event() returns KTIME_MAX.
  1779. */
  1780. if (expires <= nextevt)
  1781. return expires;
  1782. /*
  1783. * If the next timer is already expired, return the tick base
  1784. * time so the tick is fired immediately.
  1785. */
  1786. if (nextevt <= basem)
  1787. return basem;
  1788. /*
  1789. * Round up to the next jiffy. High resolution timers are
  1790. * off, so the hrtimers are expired in the tick and we need to
  1791. * make sure that this tick really expires the timer to avoid
  1792. * a ping pong of the nohz stop code.
  1793. *
  1794. * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
  1795. */
  1796. return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
  1797. }
  1798. static unsigned long next_timer_interrupt(struct timer_base *base,
  1799. unsigned long basej)
  1800. {
  1801. if (base->next_expiry_recalc)
  1802. timer_recalc_next_expiry(base);
  1803. /*
  1804. * Move next_expiry for the empty base into the future to prevent an
  1805. * unnecessary raise of the timer softirq when the next_expiry value
  1806. * will be reached even if there is no timer pending.
  1807. *
  1808. * This update is also required to make timer_base::next_expiry values
1809. * easily comparable to find out which base holds the first pending timer.
  1810. */
  1811. if (!base->timers_pending)
  1812. WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
  1813. return base->next_expiry;
  1814. }
  1815. static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
  1816. struct timer_base *base_local,
  1817. struct timer_base *base_global,
  1818. struct timer_events *tevt)
  1819. {
  1820. unsigned long nextevt, nextevt_local, nextevt_global;
  1821. bool local_first;
  1822. nextevt_local = next_timer_interrupt(base_local, basej);
  1823. nextevt_global = next_timer_interrupt(base_global, basej);
  1824. local_first = time_before_eq(nextevt_local, nextevt_global);
  1825. nextevt = local_first ? nextevt_local : nextevt_global;
  1826. /*
  1827. * If the @nextevt is at max. one tick away, use @nextevt and store
  1828. * it in the local expiry value. The next global event is irrelevant in
  1829. * this case and can be left as KTIME_MAX.
  1830. */
  1831. if (time_before_eq(nextevt, basej + 1)) {
  1832. /* If we missed a tick already, force 0 delta */
  1833. if (time_before(nextevt, basej))
  1834. nextevt = basej;
  1835. tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC;
  1836. /*
  1837. * This is required for the remote check only but it doesn't
1838. * hurt when it is done for both call sites:
  1839. *
  1840. * * The remote callers will only take care of the global timers
1841. * as local timers will be handled by the CPU itself. When not
  1842. * updating tevt->global with the already missed first global
  1843. * timer, it is possible that it will be missed completely.
  1844. *
  1845. * * The local callers will ignore the tevt->global anyway, when
  1846. * nextevt is max. one tick away.
  1847. */
  1848. if (!local_first)
  1849. tevt->global = tevt->local;
  1850. return nextevt;
  1851. }
  1852. /*
  1853. * Update tevt.* values:
  1854. *
  1855. * If the local queue expires first, then the global event can be
  1856. * ignored. If the global queue is empty, nothing to do either.
  1857. */
  1858. if (!local_first && base_global->timers_pending)
  1859. tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC;
  1860. if (base_local->timers_pending)
  1861. tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC;
  1862. return nextevt;
  1863. }
  1864. # ifdef CONFIG_SMP
  1865. /**
  1866. * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
  1867. * @basej: base time jiffies
  1868. * @basem: base time clock monotonic
  1869. * @tevt: Pointer to the storage for the expiry values
  1870. * @cpu: Remote CPU
  1871. *
  1872. * Stores the next pending local and global timer expiry values in the
  1873. * struct pointed to by @tevt. If a queue is empty the corresponding
  1874. * field is set to KTIME_MAX. If local event expires before global
  1875. * event, global event is set to KTIME_MAX as well.
  1876. *
  1877. * Caller needs to make sure timer base locks are held (use
  1878. * timer_lock_remote_bases() for this purpose).
  1879. */
  1880. void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
  1881. struct timer_events *tevt,
  1882. unsigned int cpu)
  1883. {
  1884. struct timer_base *base_local, *base_global;
  1885. /* Preset local / global events */
  1886. tevt->local = tevt->global = KTIME_MAX;
  1887. base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
  1888. base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
  1889. lockdep_assert_held(&base_local->lock);
  1890. lockdep_assert_held(&base_global->lock);
  1891. fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt);
  1892. }
  1893. /**
  1894. * timer_unlock_remote_bases - unlock timer bases of cpu
  1895. * @cpu: Remote CPU
  1896. *
  1897. * Unlocks the remote timer bases.
  1898. */
  1899. void timer_unlock_remote_bases(unsigned int cpu)
  1900. __releases(timer_bases[BASE_LOCAL]->lock)
  1901. __releases(timer_bases[BASE_GLOBAL]->lock)
  1902. {
  1903. struct timer_base *base_local, *base_global;
  1904. base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
  1905. base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
  1906. raw_spin_unlock(&base_global->lock);
  1907. raw_spin_unlock(&base_local->lock);
  1908. }
  1909. /**
  1910. * timer_lock_remote_bases - lock timer bases of cpu
  1911. * @cpu: Remote CPU
  1912. *
  1913. * Locks the remote timer bases.
  1914. */
  1915. void timer_lock_remote_bases(unsigned int cpu)
  1916. __acquires(timer_bases[BASE_LOCAL]->lock)
  1917. __acquires(timer_bases[BASE_GLOBAL]->lock)
  1918. {
  1919. struct timer_base *base_local, *base_global;
  1920. base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
  1921. base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
  1922. lockdep_assert_irqs_disabled();
  1923. raw_spin_lock(&base_local->lock);
  1924. raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
  1925. }
  1926. /**
  1927. * timer_base_is_idle() - Return whether timer base is set idle
  1928. *
1929. * Returns the is_idle value of the local timer base.
  1930. */
  1931. bool timer_base_is_idle(void)
  1932. {
  1933. return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);
  1934. }
  1935. static void __run_timer_base(struct timer_base *base);
  1936. /**
  1937. * timer_expire_remote() - expire global timers of cpu
  1938. * @cpu: Remote CPU
  1939. *
  1940. * Expire timers of global base of remote CPU.
  1941. */
  1942. void timer_expire_remote(unsigned int cpu)
  1943. {
  1944. struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
  1945. __run_timer_base(base);
  1946. }
  1947. static void timer_use_tmigr(unsigned long basej, u64 basem,
  1948. unsigned long *nextevt, bool *tick_stop_path,
  1949. bool timer_base_idle, struct timer_events *tevt)
  1950. {
  1951. u64 next_tmigr;
  1952. if (timer_base_idle)
  1953. next_tmigr = tmigr_cpu_new_timer(tevt->global);
  1954. else if (tick_stop_path)
  1955. next_tmigr = tmigr_cpu_deactivate(tevt->global);
  1956. else
  1957. next_tmigr = tmigr_quick_check(tevt->global);
  1958. /*
  1959. * If the CPU is the last going idle in timer migration hierarchy, make
  1960. * sure the CPU will wake up in time to handle remote timers.
  1961. * next_tmigr == KTIME_MAX if other CPUs are still active.
  1962. */
  1963. if (next_tmigr < tevt->local) {
  1964. u64 tmp;
  1965. /* If we missed a tick already, force 0 delta */
  1966. if (next_tmigr < basem)
  1967. next_tmigr = basem;
  1968. tmp = div_u64(next_tmigr - basem, TICK_NSEC);
  1969. *nextevt = basej + (unsigned long)tmp;
  1970. tevt->local = next_tmigr;
  1971. }
  1972. }
  1973. # else
  1974. static void timer_use_tmigr(unsigned long basej, u64 basem,
  1975. unsigned long *nextevt, bool *tick_stop_path,
  1976. bool timer_base_idle, struct timer_events *tevt)
  1977. {
  1978. /*
  1979. * Make sure first event is written into tevt->local to not miss a
  1980. * timer on !SMP systems.
  1981. */
  1982. tevt->local = min_t(u64, tevt->local, tevt->global);
  1983. }
  1984. # endif /* CONFIG_SMP */
  1985. static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
  1986. bool *idle)
  1987. {
  1988. struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
  1989. struct timer_base *base_local, *base_global;
  1990. unsigned long nextevt;
  1991. bool idle_is_possible;
  1992. /*
  1993. * When the CPU is offline, the tick is cancelled and nothing is supposed
  1994. * to try to stop it.
  1995. */
  1996. if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
  1997. if (idle)
  1998. *idle = true;
  1999. return tevt.local;
  2000. }
  2001. base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
  2002. base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);
  2003. raw_spin_lock(&base_local->lock);
  2004. raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
  2005. nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
  2006. base_global, &tevt);
  2007. /*
  2008. * If the next event is only one jiffy ahead there is no need to call
2009. * timer migration hierarchy related functions. In that case the value for
2010. * the next global timer in the @tevt struct equals KTIME_MAX. This is also
2011. * true when the timer base is idle.
  2012. *
  2013. * The proper timer migration hierarchy function depends on the callsite
  2014. * and whether timer base is idle or not. @nextevt will be updated when
  2015. * this CPU needs to handle the first timer migration hierarchy
  2016. * event. See timer_use_tmigr() for detailed information.
  2017. */
  2018. idle_is_possible = time_after(nextevt, basej + 1);
  2019. if (idle_is_possible)
  2020. timer_use_tmigr(basej, basem, &nextevt, idle,
  2021. base_local->is_idle, &tevt);
  2022. /*
  2023. * We have a fresh next event. Check whether we can forward the
  2024. * base.
  2025. */
  2026. __forward_timer_base(base_local, basej);
  2027. __forward_timer_base(base_global, basej);
  2028. /*
  2029. * Set base->is_idle only when caller is timer_base_try_to_set_idle()
  2030. */
  2031. if (idle) {
  2032. /*
  2033. * Bases are idle if the next event is more than a tick
  2034. * away. Caution: @nextevt could have changed by enqueueing a
  2035. * global timer into timer migration hierarchy. Therefore a new
  2036. * check is required here.
  2037. *
  2038. * If the base is marked idle then any timer add operation must
  2039. * forward the base clk itself to keep granularity small. This
  2040. * idle logic is only maintained for the BASE_LOCAL and
2041. * BASE_GLOBAL bases; deferrable timers may still see large
  2042. * granularity skew (by design).
  2043. */
  2044. if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
  2045. base_local->is_idle = true;
  2046. /*
  2047. * Global timers queued locally while running in a task
  2048. * in nohz_full mode need a self-IPI to kick reprogramming
  2049. * in IRQ tail.
  2050. */
  2051. if (tick_nohz_full_cpu(base_local->cpu))
  2052. base_global->is_idle = true;
  2053. trace_timer_base_idle(true, base_local->cpu);
  2054. }
  2055. *idle = base_local->is_idle;
  2056. /*
  2057. * When timer base is not set idle, undo the effect of
  2058. * tmigr_cpu_deactivate() to prevent inconsistent states - active
  2059. * timer base but inactive timer migration hierarchy.
  2060. *
  2061. * When timer base was already marked idle, nothing will be
  2062. * changed here.
  2063. */
  2064. if (!base_local->is_idle && idle_is_possible)
  2065. tmigr_cpu_activate();
  2066. }
  2067. raw_spin_unlock(&base_global->lock);
  2068. raw_spin_unlock(&base_local->lock);
  2069. return cmp_next_hrtimer_event(basem, tevt.local);
  2070. }
  2071. /**
  2072. * get_next_timer_interrupt() - return the time (clock mono) of the next timer
  2073. * @basej: base time jiffies
  2074. * @basem: base time clock monotonic
  2075. *
  2076. * Returns the tick aligned clock monotonic time of the next pending timer or
  2077. * KTIME_MAX if no timer is pending. If timer of global base was queued into
  2078. * timer migration hierarchy, first global timer is not taken into account. If
  2079. * it was the last CPU of timer migration hierarchy going idle, first global
  2080. * event is taken into account.
  2081. */
  2082. u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
  2083. {
  2084. return __get_next_timer_interrupt(basej, basem, NULL);
  2085. }
  2086. /**
  2087. * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
  2088. * @basej: base time jiffies
  2089. * @basem: base time clock monotonic
  2090. * @idle: pointer to store the value of timer_base->is_idle on return;
  2091. * *idle contains the information whether tick was already stopped
  2092. *
  2093. * Returns the tick aligned clock monotonic time of the next pending timer or
  2094. * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
  2095. * returned as well.
  2096. */
  2097. u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
  2098. {
  2099. if (*idle)
  2100. return KTIME_MAX;
  2101. return __get_next_timer_interrupt(basej, basem, idle);
  2102. }
  2103. /**
  2104. * timer_clear_idle - Clear the idle state of the timer base
  2105. *
  2106. * Called with interrupts disabled
  2107. */
  2108. void timer_clear_idle(void)
  2109. {
  2110. /*
  2111. * We do this unlocked. The worst outcome is a remote pinned timer
  2112. * enqueue sending a pointless IPI, but taking the lock would just
  2113. * make the window for sending the IPI a few instructions smaller
2114. * at the cost of taking the lock in the exit from idle
  2115. * path. Required for BASE_LOCAL only.
  2116. */
  2117. __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
  2118. if (tick_nohz_full_cpu(smp_processor_id()))
  2119. __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
  2120. trace_timer_base_idle(false, smp_processor_id());
  2121. /* Activate without holding the timer_base->lock */
  2122. tmigr_cpu_activate();
  2123. }
  2124. #endif
  2125. /**
  2126. * __run_timers - run all expired timers (if any) on this CPU.
  2127. * @base: the timer vector to be processed.
  2128. */
  2129. static inline void __run_timers(struct timer_base *base)
  2130. {
  2131. struct hlist_head heads[LVL_DEPTH];
  2132. int levels;
  2133. lockdep_assert_held(&base->lock);
  2134. if (base->running_timer)
  2135. return;
  2136. while (time_after_eq(jiffies, base->clk) &&
  2137. time_after_eq(jiffies, base->next_expiry)) {
  2138. levels = collect_expired_timers(base, heads);
  2139. /*
  2140. * The two possible reasons for not finding any expired
  2141. * timer at this clk are that all matching timers have been
  2142. * dequeued or no timer has been queued since
  2143. * base::next_expiry was set to base::clk +
  2144. * NEXT_TIMER_MAX_DELTA.
  2145. */
  2146. WARN_ON_ONCE(!levels && !base->next_expiry_recalc
  2147. && base->timers_pending);
  2148. /*
  2149. * While executing timers, base->clk is set 1 offset ahead of
  2150. * jiffies to avoid endless requeuing to current jiffies.
  2151. */
  2152. base->clk++;
  2153. timer_recalc_next_expiry(base);
  2154. while (levels--)
  2155. expire_timers(base, heads + levels);
  2156. }
  2157. }
  2158. static void __run_timer_base(struct timer_base *base)
  2159. {
  2160. /* Can race against a remote CPU updating next_expiry under the lock */
  2161. if (time_before(jiffies, READ_ONCE(base->next_expiry)))
  2162. return;
  2163. timer_base_lock_expiry(base);
  2164. raw_spin_lock_irq(&base->lock);
  2165. __run_timers(base);
  2166. raw_spin_unlock_irq(&base->lock);
  2167. timer_base_unlock_expiry(base);
  2168. }
  2169. static void run_timer_base(int index)
  2170. {
  2171. struct timer_base *base = this_cpu_ptr(&timer_bases[index]);
  2172. __run_timer_base(base);
  2173. }
  2174. /*
  2175. * This function runs timers and the timer-tq in bottom half context.
  2176. */
  2177. static __latent_entropy void run_timer_softirq(void)
  2178. {
  2179. run_timer_base(BASE_LOCAL);
  2180. if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
  2181. run_timer_base(BASE_GLOBAL);
  2182. run_timer_base(BASE_DEF);
  2183. if (is_timers_nohz_active())
  2184. tmigr_handle_remote();
  2185. }
  2186. }
  2187. /*
  2188. * Called by the local, per-CPU timer interrupt on SMP.
  2189. */
  2190. static void run_local_timers(void)
  2191. {
  2192. struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
  2193. hrtimer_run_queues();
  2194. for (int i = 0; i < NR_BASES; i++, base++) {
  2195. /*
  2196. * Raise the softirq only if required.
  2197. *
  2198. * timer_base::next_expiry can be written by a remote CPU while
2199. * holding the lock. If this write happens at the same time as
2200. * the lockless local read, the sanity checker could complain about
  2201. * data corruption.
  2202. *
  2203. * There are two possible situations where
  2204. * timer_base::next_expiry is written by a remote CPU:
  2205. *
  2206. * 1. Remote CPU expires global timers of this CPU and updates
  2207. * timer_base::next_expiry of BASE_GLOBAL afterwards in
  2208. * next_timer_interrupt() or timer_recalc_next_expiry(). The
  2209. * worst outcome is a superfluous raise of the timer softirq
  2210. * when the not yet updated value is read.
  2211. *
  2212. * 2. A new first pinned timer is enqueued by a remote CPU
  2213. * and therefore timer_base::next_expiry of BASE_LOCAL is
  2214. * updated. When this update is missed, this isn't a
  2215. * problem, as an IPI is executed nevertheless when the CPU
  2216. * was idle before. When the CPU wasn't idle but the update
  2217. * is missed, then the timer would expire one jiffy late -
  2218. * bad luck.
  2219. *
  2220. * Those unlikely corner cases where the worst outcome is only a
  2221. * one jiffy delay or a superfluous raise of the softirq are
2222. * not as expensive as always doing the check while holding
  2223. * the lock.
  2224. *
2225. * Possible remote writers are using WRITE_ONCE(). The local reader
2226. * therefore uses READ_ONCE().
  2227. */
  2228. if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
  2229. (i == BASE_DEF && tmigr_requires_handle_remote())) {
  2230. raise_softirq(TIMER_SOFTIRQ);
  2231. return;
  2232. }
  2233. }
  2234. }
  2235. /*
  2236. * Called from the timer interrupt handler to charge one tick to the current
  2237. * process. user_tick is 1 if the tick is user time, 0 for system.
  2238. */
  2239. void update_process_times(int user_tick)
  2240. {
  2241. struct task_struct *p = current;
  2242. /* Note: this timer irq context must be accounted for as well. */
  2243. account_process_tick(p, user_tick);
  2244. run_local_timers();
  2245. rcu_sched_clock_irq(user_tick);
  2246. #ifdef CONFIG_IRQ_WORK
  2247. if (in_irq())
  2248. irq_work_tick();
  2249. #endif
  2250. sched_tick();
  2251. if (IS_ENABLED(CONFIG_POSIX_TIMERS))
  2252. run_posix_cpu_timers();
  2253. }
  2254. /*
  2255. * Since schedule_timeout()'s timer is defined on the stack, it must store
  2256. * the target task on the stack as well.
  2257. */
  2258. struct process_timer {
  2259. struct timer_list timer;
  2260. struct task_struct *task;
  2261. };
  2262. static void process_timeout(struct timer_list *t)
  2263. {
  2264. struct process_timer *timeout = from_timer(timeout, t, timer);
  2265. wake_up_process(timeout->task);
  2266. }
  2267. /**
  2268. * schedule_timeout - sleep until timeout
  2269. * @timeout: timeout value in jiffies
  2270. *
  2271. * Make the current task sleep until @timeout jiffies have elapsed.
  2272. * The function behavior depends on the current task state
  2273. * (see also set_current_state() description):
  2274. *
  2275. * %TASK_RUNNING - the scheduler is called, but the task does not sleep
  2276. * at all. That happens because sched_submit_work() does nothing for
  2277. * tasks in %TASK_RUNNING state.
  2278. *
  2279. * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
  2280. * pass before the routine returns unless the current task is explicitly
  2281. * woken up, (e.g. by wake_up_process()).
  2282. *
  2283. * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
  2284. * delivered to the current task or the current task is explicitly woken
  2285. * up.
  2286. *
  2287. * The current task state is guaranteed to be %TASK_RUNNING when this
  2288. * routine returns.
  2289. *
  2290. * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
  2291. * the CPU away without a bound on the timeout. In this case the return
  2292. * value will be %MAX_SCHEDULE_TIMEOUT.
  2293. *
  2294. * Returns 0 when the timer has expired otherwise the remaining time in
  2295. * jiffies will be returned. In all cases the return value is guaranteed
  2296. * to be non-negative.
  2297. */
  2298. signed long __sched schedule_timeout(signed long timeout)
  2299. {
  2300. struct process_timer timer;
  2301. unsigned long expire;
  2302. switch (timeout)
  2303. {
  2304. case MAX_SCHEDULE_TIMEOUT:
  2305. /*
  2306. * These two special cases are useful to be comfortable
  2307. * in the caller. Nothing more. We could take
  2308. * MAX_SCHEDULE_TIMEOUT from one of the negative value
  2309. * but I' d like to return a valid offset (>=0) to allow
  2310. * the caller to do everything it want with the retval.
  2311. */
  2312. schedule();
  2313. goto out;
  2314. default:
  2315. /*
  2316. * Another bit of PARANOID. Note that the retval will be
  2317. * 0 since no piece of kernel is supposed to do a check
  2318. * for a negative retval of schedule_timeout() (since it
  2319. * should never happens anyway). You just have the printk()
  2320. * that will tell you if something is gone wrong and where.
  2321. */
  2322. if (timeout < 0) {
  2323. printk(KERN_ERR "schedule_timeout: wrong timeout "
  2324. "value %lx\n", timeout);
  2325. dump_stack();
  2326. __set_current_state(TASK_RUNNING);
  2327. goto out;
  2328. }
  2329. }
  2330. expire = timeout + jiffies;
  2331. timer.task = current;
  2332. timer_setup_on_stack(&timer.timer, process_timeout, 0);
  2333. __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
  2334. schedule();
  2335. del_timer_sync(&timer.timer);
  2336. /* Remove the timer from the object tracker */
  2337. destroy_timer_on_stack(&timer.timer);
  2338. timeout = expire - jiffies;
  2339. out:
  2340. return timeout < 0 ? 0 : timeout;
  2341. }
  2342. EXPORT_SYMBOL(schedule_timeout);
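/*
 * Illustrative sketch (hypothetical helper 'wait_for_flag'): the classic
 * set_current_state()/schedule_timeout() loop for waiting on a condition
 * with a bounded timeout. In real code wait_event_timeout() is usually
 * preferable; this only spells out the contract documented above.
 */
static long wait_for_flag(atomic_t *flag, long timeout)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	while (!atomic_read(flag) && timeout) {
		timeout = schedule_timeout(timeout);
		set_current_state(TASK_UNINTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	return timeout;	/* 0 means the wait timed out */
}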
  2343. /*
  2344. * We can use __set_current_state() here because schedule_timeout() calls
  2345. * schedule() unconditionally.
  2346. */
  2347. signed long __sched schedule_timeout_interruptible(signed long timeout)
  2348. {
  2349. __set_current_state(TASK_INTERRUPTIBLE);
  2350. return schedule_timeout(timeout);
  2351. }
  2352. EXPORT_SYMBOL(schedule_timeout_interruptible);
  2353. signed long __sched schedule_timeout_killable(signed long timeout)
  2354. {
  2355. __set_current_state(TASK_KILLABLE);
  2356. return schedule_timeout(timeout);
  2357. }
  2358. EXPORT_SYMBOL(schedule_timeout_killable);
  2359. signed long __sched schedule_timeout_uninterruptible(signed long timeout)
  2360. {
  2361. __set_current_state(TASK_UNINTERRUPTIBLE);
  2362. return schedule_timeout(timeout);
  2363. }
  2364. EXPORT_SYMBOL(schedule_timeout_uninterruptible);
  2365. /*
  2366. * Like schedule_timeout_uninterruptible(), except this task will not contribute
  2367. * to load average.
  2368. */
  2369. signed long __sched schedule_timeout_idle(signed long timeout)
  2370. {
  2371. __set_current_state(TASK_IDLE);
  2372. return schedule_timeout(timeout);
  2373. }
  2374. EXPORT_SYMBOL(schedule_timeout_idle);
  2375. #ifdef CONFIG_HOTPLUG_CPU
  2376. static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
  2377. {
  2378. struct timer_list *timer;
  2379. int cpu = new_base->cpu;
  2380. while (!hlist_empty(head)) {
  2381. timer = hlist_entry(head->first, struct timer_list, entry);
  2382. detach_timer(timer, false);
  2383. timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
  2384. internal_add_timer(new_base, timer);
  2385. }
  2386. }
  2387. int timers_prepare_cpu(unsigned int cpu)
  2388. {
  2389. struct timer_base *base;
  2390. int b;
  2391. for (b = 0; b < NR_BASES; b++) {
  2392. base = per_cpu_ptr(&timer_bases[b], cpu);
  2393. base->clk = jiffies;
  2394. base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
  2395. base->next_expiry_recalc = false;
  2396. base->timers_pending = false;
  2397. base->is_idle = false;
  2398. }
  2399. return 0;
  2400. }
  2401. int timers_dead_cpu(unsigned int cpu)
  2402. {
  2403. struct timer_base *old_base;
  2404. struct timer_base *new_base;
  2405. int b, i;
  2406. for (b = 0; b < NR_BASES; b++) {
  2407. old_base = per_cpu_ptr(&timer_bases[b], cpu);
  2408. new_base = get_cpu_ptr(&timer_bases[b]);
  2409. /*
  2410. * The caller is globally serialized and nobody else
  2411. * takes two locks at once, deadlock is not possible.
  2412. */
  2413. raw_spin_lock_irq(&new_base->lock);
  2414. raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
  2415. /*
  2416. * The current CPUs base clock might be stale. Update it
  2417. * before moving the timers over.
  2418. */
  2419. forward_timer_base(new_base);
  2420. WARN_ON_ONCE(old_base->running_timer);
  2421. old_base->running_timer = NULL;
  2422. for (i = 0; i < WHEEL_SIZE; i++)
  2423. migrate_timer_list(new_base, old_base->vectors + i);
  2424. raw_spin_unlock(&old_base->lock);
  2425. raw_spin_unlock_irq(&new_base->lock);
  2426. put_cpu_ptr(&timer_bases);
  2427. }
  2428. return 0;
  2429. }
  2430. #endif /* CONFIG_HOTPLUG_CPU */
  2431. static void __init init_timer_cpu(int cpu)
  2432. {
  2433. struct timer_base *base;
  2434. int i;
  2435. for (i = 0; i < NR_BASES; i++) {
  2436. base = per_cpu_ptr(&timer_bases[i], cpu);
  2437. base->cpu = cpu;
  2438. raw_spin_lock_init(&base->lock);
  2439. base->clk = jiffies;
  2440. base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
  2441. timer_base_init_expiry_lock(base);
  2442. }
  2443. }
  2444. static void __init init_timer_cpus(void)
  2445. {
  2446. int cpu;
  2447. for_each_possible_cpu(cpu)
  2448. init_timer_cpu(cpu);
  2449. }
  2450. void __init init_timers(void)
  2451. {
  2452. init_timer_cpus();
  2453. posix_cputimers_init_work();
  2454. open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
  2455. }
  2456. /**
  2457. * msleep - sleep safely even with waitqueue interruptions
  2458. * @msecs: Time in milliseconds to sleep for
  2459. */
  2460. void msleep(unsigned int msecs)
  2461. {
  2462. unsigned long timeout = msecs_to_jiffies(msecs);
  2463. while (timeout)
  2464. timeout = schedule_timeout_uninterruptible(timeout);
  2465. }
  2466. EXPORT_SYMBOL(msleep);
  2467. /**
  2468. * msleep_interruptible - sleep waiting for signals
  2469. * @msecs: Time in milliseconds to sleep for
  2470. */
  2471. unsigned long msleep_interruptible(unsigned int msecs)
  2472. {
  2473. unsigned long timeout = msecs_to_jiffies(msecs);
  2474. while (timeout && !signal_pending(current))
  2475. timeout = schedule_timeout_interruptible(timeout);
  2476. return jiffies_to_msecs(timeout);
  2477. }
  2478. EXPORT_SYMBOL(msleep_interruptible);
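/*
 * Illustrative sketch (hypothetical helper 'wait_for_ready'): a cancellable
 * polling loop. msleep_interruptible() returns the remaining time when a
 * signal cut the sleep short, which is used here to back out early.
 */
static int wait_for_ready(const bool *ready)
{
	while (!READ_ONCE(*ready)) {
		if (msleep_interruptible(100))
			return -ERESTARTSYS;	/* interrupted by a signal */
	}
	return 0;
}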
  2479. /**
  2480. * usleep_range_state - Sleep for an approximate time in a given state
  2481. * @min: Minimum time in usecs to sleep
  2482. * @max: Maximum time in usecs to sleep
2483. * @state: State the current task will be in while sleeping
  2484. *
  2485. * In non-atomic context where the exact wakeup time is flexible, use
  2486. * usleep_range_state() instead of udelay(). The sleep improves responsiveness
  2487. * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
  2488. * power usage by allowing hrtimers to take advantage of an already-
  2489. * scheduled interrupt instead of scheduling a new one just for this sleep.
  2490. */
  2491. void __sched usleep_range_state(unsigned long min, unsigned long max,
  2492. unsigned int state)
  2493. {
  2494. ktime_t exp = ktime_add_us(ktime_get(), min);
  2495. u64 delta = (u64)(max - min) * NSEC_PER_USEC;
  2496. for (;;) {
  2497. __set_current_state(state);
  2498. /* Do not return before the requested sleep time has elapsed */
  2499. if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
  2500. break;
  2501. }
  2502. }
  2503. EXPORT_SYMBOL(usleep_range_state);
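/*
 * Illustrative sketch (hypothetical register, delay bounds and helper name):
 * most callers go through the usleep_range() wrapper from <linux/delay.h>,
 * which invokes usleep_range_state() with TASK_UNINTERRUPTIBLE. Assumes
 * <linux/io.h> for the MMIO accessors.
 */
static void mychip_pulse_reset(void __iomem *ctrl)
{
	writel(1, ctrl);
	/* Hypothetical datasheet minimum of 100us, with up to 100us of slack. */
	usleep_range(100, 200);
	writel(0, ctrl);
}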