edac_device.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572
  1. /*
  2. * edac_device.c
  3. * (C) 2007 www.douglaskthompson.com
  4. *
  5. * This file may be distributed under the terms of the
  6. * GNU General Public License.
  7. *
  8. * Written by Doug Thompson <norsk5@xmission.com>
  9. *
  10. * edac_device API implementation
  11. * 19 Jan 2007
  12. */
#include <asm/page.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/sysctl.h>
#include <linux/timer.h>
#include "edac_device.h"
#include "edac_module.h"
  27. /* lock for the list: 'edac_device_list', manipulation of this list
  28. * is protected by the 'device_ctls_mutex' lock
  29. */
  30. static DEFINE_MUTEX(device_ctls_mutex);
  31. static LIST_HEAD(edac_device_list);
  32. /* Default workqueue processing interval on this instance, in msecs */
  33. #define DEFAULT_POLL_INTERVAL 1000
  34. #ifdef CONFIG_EDAC_DEBUG
  35. static void edac_device_dump_device(struct edac_device_ctl_info *edac_dev)
  36. {
  37. edac_dbg(3, "\tedac_dev = %p dev_idx=%d\n",
  38. edac_dev, edac_dev->dev_idx);
  39. edac_dbg(4, "\tedac_dev->edac_check = %p\n", edac_dev->edac_check);
  40. edac_dbg(3, "\tdev = %p\n", edac_dev->dev);
  41. edac_dbg(3, "\tmod_name:ctl_name = %s:%s\n",
  42. edac_dev->mod_name, edac_dev->ctl_name);
  43. edac_dbg(3, "\tpvt_info = %p\n\n", edac_dev->pvt_info);
  44. }
  45. #endif /* CONFIG_EDAC_DEBUG */
  46. /*
  47. * @off_val: zero, 1, or other based offset
  48. */
  49. struct edac_device_ctl_info *
  50. edac_device_alloc_ctl_info(unsigned pvt_sz, char *dev_name, unsigned nr_instances,
  51. char *blk_name, unsigned nr_blocks, unsigned off_val,
  52. int device_index)
  53. {
  54. struct edac_device_block *dev_blk, *blk_p, *blk;
  55. struct edac_device_instance *dev_inst, *inst;
  56. struct edac_device_ctl_info *dev_ctl;
  57. unsigned instance, block;
  58. void *pvt;
  59. int err;
  60. edac_dbg(4, "instances=%d blocks=%d\n", nr_instances, nr_blocks);
  61. dev_ctl = kzalloc(sizeof(struct edac_device_ctl_info), GFP_KERNEL);
  62. if (!dev_ctl)
  63. return NULL;
  64. dev_inst = kcalloc(nr_instances, sizeof(struct edac_device_instance), GFP_KERNEL);
  65. if (!dev_inst)
  66. goto free;
  67. dev_ctl->instances = dev_inst;
  68. dev_blk = kcalloc(nr_instances * nr_blocks, sizeof(struct edac_device_block), GFP_KERNEL);
  69. if (!dev_blk)
  70. goto free;
  71. dev_ctl->blocks = dev_blk;
  72. if (pvt_sz) {
  73. pvt = kzalloc(pvt_sz, GFP_KERNEL);
  74. if (!pvt)
  75. goto free;
  76. dev_ctl->pvt_info = pvt;
  77. }
  78. dev_ctl->dev_idx = device_index;
  79. dev_ctl->nr_instances = nr_instances;
  80. /* Default logging of CEs and UEs */
  81. dev_ctl->log_ce = 1;
  82. dev_ctl->log_ue = 1;
  83. /* Name of this edac device */
  84. snprintf(dev_ctl->name, sizeof(dev_ctl->name),"%s", dev_name);
  85. /* Initialize every Instance */
  86. for (instance = 0; instance < nr_instances; instance++) {
  87. inst = &dev_inst[instance];
  88. inst->ctl = dev_ctl;
  89. inst->nr_blocks = nr_blocks;
  90. blk_p = &dev_blk[instance * nr_blocks];
  91. inst->blocks = blk_p;
  92. /* name of this instance */
  93. snprintf(inst->name, sizeof(inst->name), "%s%u", dev_name, instance);
  94. /* Initialize every block in each instance */
  95. for (block = 0; block < nr_blocks; block++) {
  96. blk = &blk_p[block];
  97. blk->instance = inst;
  98. snprintf(blk->name, sizeof(blk->name),
  99. "%s%d", blk_name, block + off_val);
  100. edac_dbg(4, "instance=%d inst_p=%p block=#%d block_p=%p name='%s'\n",
  101. instance, inst, block, blk, blk->name);
  102. }
  103. }
  104. /* Mark this instance as merely ALLOCATED */
  105. dev_ctl->op_state = OP_ALLOC;
  106. /*
  107. * Initialize the 'root' kobj for the edac_device controller
  108. */
  109. err = edac_device_register_sysfs_main_kobj(dev_ctl);
  110. if (err)
  111. goto free;
  112. /* at this point, the root kobj is valid, and in order to
  113. * 'free' the object, then the function:
  114. * edac_device_unregister_sysfs_main_kobj() must be called
  115. * which will perform kobj unregistration and the actual free
  116. * will occur during the kobject callback operation
  117. */
  118. return dev_ctl;
  119. free:
  120. __edac_device_free_ctl_info(dev_ctl);
  121. return NULL;
  122. }
  123. EXPORT_SYMBOL_GPL(edac_device_alloc_ctl_info);
  124. void edac_device_free_ctl_info(struct edac_device_ctl_info *ctl_info)
  125. {
  126. edac_device_unregister_sysfs_main_kobj(ctl_info);
  127. }
  128. EXPORT_SYMBOL_GPL(edac_device_free_ctl_info);
  129. /*
  130. * find_edac_device_by_dev
  131. * scans the edac_device list for a specific 'struct device *'
  132. *
  133. * lock to be held prior to call: device_ctls_mutex
  134. *
  135. * Return:
  136. * pointer to control structure managing 'dev'
  137. * NULL if not found on list
  138. */
  139. static struct edac_device_ctl_info *find_edac_device_by_dev(struct device *dev)
  140. {
  141. struct edac_device_ctl_info *edac_dev;
  142. struct list_head *item;
  143. edac_dbg(0, "\n");
  144. list_for_each(item, &edac_device_list) {
  145. edac_dev = list_entry(item, struct edac_device_ctl_info, link);
  146. if (edac_dev->dev == dev)
  147. return edac_dev;
  148. }
  149. return NULL;
  150. }
  151. /*
  152. * add_edac_dev_to_global_list
  153. * Before calling this function, caller must
  154. * assign a unique value to edac_dev->dev_idx.
  155. *
  156. * lock to be held prior to call: device_ctls_mutex
  157. *
  158. * Return:
  159. * 0 on success
  160. * 1 on failure.
  161. */
  162. static int add_edac_dev_to_global_list(struct edac_device_ctl_info *edac_dev)
  163. {
  164. struct list_head *item, *insert_before;
  165. struct edac_device_ctl_info *rover;
  166. insert_before = &edac_device_list;
  167. /* Determine if already on the list */
  168. rover = find_edac_device_by_dev(edac_dev->dev);
  169. if (unlikely(rover != NULL))
  170. goto fail0;
  171. /* Insert in ascending order by 'dev_idx', so find position */
  172. list_for_each(item, &edac_device_list) {
  173. rover = list_entry(item, struct edac_device_ctl_info, link);
  174. if (rover->dev_idx >= edac_dev->dev_idx) {
  175. if (unlikely(rover->dev_idx == edac_dev->dev_idx))
  176. goto fail1;
  177. insert_before = item;
  178. break;
  179. }
  180. }
  181. list_add_tail_rcu(&edac_dev->link, insert_before);
  182. return 0;
  183. fail0:
  184. edac_printk(KERN_WARNING, EDAC_MC,
  185. "%s (%s) %s %s already assigned %d\n",
  186. dev_name(rover->dev), edac_dev_name(rover),
  187. rover->mod_name, rover->ctl_name, rover->dev_idx);
  188. return 1;
  189. fail1:
  190. edac_printk(KERN_WARNING, EDAC_MC,
  191. "bug in low-level driver: attempt to assign\n"
  192. " duplicate dev_idx %d in %s()\n", rover->dev_idx,
  193. __func__);
  194. return 1;
  195. }
  196. /*
  197. * del_edac_device_from_global_list
  198. */
  199. static void del_edac_device_from_global_list(struct edac_device_ctl_info
  200. *edac_device)
  201. {
  202. list_del_rcu(&edac_device->link);
  203. /* these are for safe removal of devices from global list while
  204. * NMI handlers may be traversing list
  205. */
  206. synchronize_rcu();
  207. INIT_LIST_HEAD(&edac_device->link);
  208. }
  209. /*
  210. * edac_device_workq_function
  211. * performs the operation scheduled by a workq request
  212. *
  213. * this workq is embedded within an edac_device_ctl_info
  214. * structure, that needs to be polled for possible error events.
  215. *
  216. * This operation is to acquire the list mutex lock
  217. * (thus preventing insertation or deletion)
  218. * and then call the device's poll function IFF this device is
  219. * running polled and there is a poll function defined.
  220. */
  221. static void edac_device_workq_function(struct work_struct *work_req)
  222. {
  223. struct delayed_work *d_work = to_delayed_work(work_req);
  224. struct edac_device_ctl_info *edac_dev = to_edac_device_ctl_work(d_work);
  225. mutex_lock(&device_ctls_mutex);
  226. /* If we are being removed, bail out immediately */
  227. if (edac_dev->op_state == OP_OFFLINE) {
  228. mutex_unlock(&device_ctls_mutex);
  229. return;
  230. }
  231. /* Only poll controllers that are running polled and have a check */
  232. if ((edac_dev->op_state == OP_RUNNING_POLL) &&
  233. (edac_dev->edac_check != NULL)) {
  234. edac_dev->edac_check(edac_dev);
  235. }
  236. mutex_unlock(&device_ctls_mutex);
  237. /* Reschedule the workq for the next time period to start again
  238. * if the number of msec is for 1 sec, then adjust to the next
  239. * whole one second to save timers firing all over the period
  240. * between integral seconds
  241. */
  242. if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
  243. edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
  244. else
  245. edac_queue_work(&edac_dev->work, edac_dev->delay);
  246. }
  247. /*
  248. * edac_device_workq_setup
  249. * initialize a workq item for this edac_device instance
  250. * passing in the new delay period in msec
  251. */
  252. static void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev,
  253. unsigned msec)
  254. {
  255. edac_dbg(0, "\n");
  256. /* take the arg 'msec' and set it into the control structure
  257. * to used in the time period calculation
  258. * then calc the number of jiffies that represents
  259. */
  260. edac_dev->poll_msec = msec;
  261. edac_dev->delay = msecs_to_jiffies(msec);
  262. INIT_DELAYED_WORK(&edac_dev->work, edac_device_workq_function);
  263. /* optimize here for the 1 second case, which will be normal value, to
  264. * fire ON the 1 second time event. This helps reduce all sorts of
  265. * timers firing on sub-second basis, while they are happy
  266. * to fire together on the 1 second exactly
  267. */
  268. if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
  269. edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
  270. else
  271. edac_queue_work(&edac_dev->work, edac_dev->delay);
  272. }
  273. /*
  274. * edac_device_workq_teardown
  275. * stop the workq processing on this edac_dev
  276. */
  277. static void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev)
  278. {
  279. if (!edac_dev->edac_check)
  280. return;
  281. edac_dev->op_state = OP_OFFLINE;
  282. edac_stop_work(&edac_dev->work);
  283. }
  284. /*
  285. * edac_device_reset_delay_period
  286. *
  287. * need to stop any outstanding workq queued up at this time
  288. * because we will be resetting the sleep time.
  289. * Then restart the workq on the new delay
  290. */
  291. void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev,
  292. unsigned long msec)
  293. {
  294. edac_dev->poll_msec = msec;
  295. edac_dev->delay = msecs_to_jiffies(msec);
  296. /* See comment in edac_device_workq_setup() above */
  297. if (edac_dev->poll_msec == DEFAULT_POLL_INTERVAL)
  298. edac_mod_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay));
  299. else
  300. edac_mod_work(&edac_dev->work, edac_dev->delay);
  301. }
  302. int edac_device_alloc_index(void)
  303. {
  304. static atomic_t device_indexes = ATOMIC_INIT(0);
  305. return atomic_inc_return(&device_indexes) - 1;
  306. }
  307. EXPORT_SYMBOL_GPL(edac_device_alloc_index);
  308. int edac_device_add_device(struct edac_device_ctl_info *edac_dev)
  309. {
  310. edac_dbg(0, "\n");
  311. #ifdef CONFIG_EDAC_DEBUG
  312. if (edac_debug_level >= 3)
  313. edac_device_dump_device(edac_dev);
  314. #endif
  315. mutex_lock(&device_ctls_mutex);
  316. if (add_edac_dev_to_global_list(edac_dev))
  317. goto fail0;
  318. /* set load time so that error rate can be tracked */
  319. edac_dev->start_time = jiffies;
  320. /* create this instance's sysfs entries */
  321. if (edac_device_create_sysfs(edac_dev)) {
  322. edac_device_printk(edac_dev, KERN_WARNING,
  323. "failed to create sysfs device\n");
  324. goto fail1;
  325. }
  326. /* If there IS a check routine, then we are running POLLED */
  327. if (edac_dev->edac_check != NULL) {
  328. /* This instance is NOW RUNNING */
  329. edac_dev->op_state = OP_RUNNING_POLL;
  330. edac_device_workq_setup(edac_dev, edac_dev->poll_msec ?: DEFAULT_POLL_INTERVAL);
  331. } else {
  332. edac_dev->op_state = OP_RUNNING_INTERRUPT;
  333. }
  334. /* Report action taken */
  335. edac_device_printk(edac_dev, KERN_INFO,
  336. "Giving out device to module %s controller %s: DEV %s (%s)\n",
  337. edac_dev->mod_name, edac_dev->ctl_name, edac_dev->dev_name,
  338. edac_op_state_to_string(edac_dev->op_state));
  339. mutex_unlock(&device_ctls_mutex);
  340. return 0;
  341. fail1:
  342. /* Some error, so remove the entry from the lsit */
  343. del_edac_device_from_global_list(edac_dev);
  344. fail0:
  345. mutex_unlock(&device_ctls_mutex);
  346. return 1;
  347. }
  348. EXPORT_SYMBOL_GPL(edac_device_add_device);
  349. struct edac_device_ctl_info *edac_device_del_device(struct device *dev)
  350. {
  351. struct edac_device_ctl_info *edac_dev;
  352. edac_dbg(0, "\n");
  353. mutex_lock(&device_ctls_mutex);
  354. /* Find the structure on the list, if not there, then leave */
  355. edac_dev = find_edac_device_by_dev(dev);
  356. if (edac_dev == NULL) {
  357. mutex_unlock(&device_ctls_mutex);
  358. return NULL;
  359. }
  360. /* mark this instance as OFFLINE */
  361. edac_dev->op_state = OP_OFFLINE;
  362. /* deregister from global list */
  363. del_edac_device_from_global_list(edac_dev);
  364. mutex_unlock(&device_ctls_mutex);
  365. /* clear workq processing on this instance */
  366. edac_device_workq_teardown(edac_dev);
  367. /* Tear down the sysfs entries for this instance */
  368. edac_device_remove_sysfs(edac_dev);
  369. edac_printk(KERN_INFO, EDAC_MC,
  370. "Removed device %d for %s %s: DEV %s\n",
  371. edac_dev->dev_idx,
  372. edac_dev->mod_name, edac_dev->ctl_name, edac_dev_name(edac_dev));
  373. return edac_dev;
  374. }
  375. EXPORT_SYMBOL_GPL(edac_device_del_device);
  376. static inline int edac_device_get_log_ce(struct edac_device_ctl_info *edac_dev)
  377. {
  378. return edac_dev->log_ce;
  379. }
  380. static inline int edac_device_get_log_ue(struct edac_device_ctl_info *edac_dev)
  381. {
  382. return edac_dev->log_ue;
  383. }
  384. static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
  385. *edac_dev)
  386. {
  387. return edac_dev->panic_on_ue;
  388. }
  389. void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
  390. unsigned int count, int inst_nr, int block_nr,
  391. const char *msg)
  392. {
  393. struct edac_device_instance *instance;
  394. struct edac_device_block *block = NULL;
  395. if (!count)
  396. return;
  397. if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
  398. edac_device_printk(edac_dev, KERN_ERR,
  399. "INTERNAL ERROR: 'instance' out of range "
  400. "(%d >= %d)\n", inst_nr,
  401. edac_dev->nr_instances);
  402. return;
  403. }
  404. instance = edac_dev->instances + inst_nr;
  405. if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
  406. edac_device_printk(edac_dev, KERN_ERR,
  407. "INTERNAL ERROR: instance %d 'block' "
  408. "out of range (%d >= %d)\n",
  409. inst_nr, block_nr,
  410. instance->nr_blocks);
  411. return;
  412. }
  413. if (instance->nr_blocks > 0) {
  414. block = instance->blocks + block_nr;
  415. block->counters.ce_count += count;
  416. }
  417. /* Propagate the count up the 'totals' tree */
  418. instance->counters.ce_count += count;
  419. edac_dev->counters.ce_count += count;
  420. if (edac_device_get_log_ce(edac_dev))
  421. edac_device_printk(edac_dev, KERN_WARNING,
  422. "CE: %s instance: %s block: %s count: %d '%s'\n",
  423. edac_dev->ctl_name, instance->name,
  424. block ? block->name : "N/A", count, msg);
  425. }
  426. EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
  427. void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
  428. unsigned int count, int inst_nr, int block_nr,
  429. const char *msg)
  430. {
  431. struct edac_device_instance *instance;
  432. struct edac_device_block *block = NULL;
  433. if (!count)
  434. return;
  435. if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
  436. edac_device_printk(edac_dev, KERN_ERR,
  437. "INTERNAL ERROR: 'instance' out of range "
  438. "(%d >= %d)\n", inst_nr,
  439. edac_dev->nr_instances);
  440. return;
  441. }
  442. instance = edac_dev->instances + inst_nr;
  443. if ((block_nr >= instance->nr_blocks) || (block_nr < 0)) {
  444. edac_device_printk(edac_dev, KERN_ERR,
  445. "INTERNAL ERROR: instance %d 'block' "
  446. "out of range (%d >= %d)\n",
  447. inst_nr, block_nr,
  448. instance->nr_blocks);
  449. return;
  450. }
  451. if (instance->nr_blocks > 0) {
  452. block = instance->blocks + block_nr;
  453. block->counters.ue_count += count;
  454. }
  455. /* Propagate the count up the 'totals' tree */
  456. instance->counters.ue_count += count;
  457. edac_dev->counters.ue_count += count;
  458. if (edac_device_get_log_ue(edac_dev))
  459. edac_device_printk(edac_dev, KERN_EMERG,
  460. "UE: %s instance: %s block: %s count: %d '%s'\n",
  461. edac_dev->ctl_name, instance->name,
  462. block ? block->name : "N/A", count, msg);
  463. if (edac_device_get_panic_on_ue(edac_dev))
  464. panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n",
  465. edac_dev->ctl_name, instance->name,
  466. block ? block->name : "N/A", count, msg);
  467. }
  468. EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);