nbd.c

  1. /*
  2. * Network block device - make block devices work over TCP
  3. *
  4. * Note that you cannot swap over this thing, yet. It seems to work, but
  5. * it deadlocks sometimes - you cannot swap over TCP in general.
  6. *
  7. * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
  8. * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
  9. *
  10. * This file is released under GPLv2 or later.
  11. *
  12. * (part of code stolen from loop.c)
  13. */
  14. #include <linux/major.h>
  15. #include <linux/blkdev.h>
  16. #include <linux/module.h>
  17. #include <linux/init.h>
  18. #include <linux/sched.h>
  19. #include <linux/sched/mm.h>
  20. #include <linux/fs.h>
  21. #include <linux/bio.h>
  22. #include <linux/stat.h>
  23. #include <linux/errno.h>
  24. #include <linux/file.h>
  25. #include <linux/ioctl.h>
  26. #include <linux/mutex.h>
  27. #include <linux/compiler.h>
  28. #include <linux/err.h>
  29. #include <linux/kernel.h>
  30. #include <linux/slab.h>
  31. #include <net/sock.h>
  32. #include <linux/net.h>
  33. #include <linux/kthread.h>
  34. #include <linux/types.h>
  35. #include <linux/debugfs.h>
  36. #include <linux/blk-mq.h>
  37. #include <linux/uaccess.h>
  38. #include <asm/types.h>
  39. #include <linux/nbd.h>
  40. #include <linux/nbd-netlink.h>
  41. #include <net/genetlink.h>
  42. static DEFINE_IDR(nbd_index_idr);
  43. static DEFINE_MUTEX(nbd_index_mutex);
  44. static int nbd_total_devices = 0;
  45. struct nbd_sock {
  46. struct socket *sock;
  47. struct mutex tx_lock;
  48. struct request *pending;
  49. int sent;
  50. bool dead;
  51. int fallback_index;
  52. int cookie;
  53. };
  54. struct recv_thread_args {
  55. struct work_struct work;
  56. struct nbd_device *nbd;
  57. int index;
  58. };
  59. struct link_dead_args {
  60. struct work_struct work;
  61. int index;
  62. };
  63. #define NBD_TIMEDOUT 0
  64. #define NBD_DISCONNECT_REQUESTED 1
  65. #define NBD_DISCONNECTED 2
  66. #define NBD_HAS_PID_FILE 3
  67. #define NBD_HAS_CONFIG_REF 4
  68. #define NBD_BOUND 5
  69. #define NBD_DESTROY_ON_DISCONNECT 6
  70. #define NBD_DISCONNECT_ON_CLOSE 7
  71. struct nbd_config {
  72. u32 flags;
  73. unsigned long runtime_flags;
  74. u64 dead_conn_timeout;
  75. struct nbd_sock **socks;
  76. int num_connections;
  77. atomic_t live_connections;
  78. wait_queue_head_t conn_wait;
  79. atomic_t recv_threads;
  80. wait_queue_head_t recv_wq;
  81. loff_t blksize;
  82. loff_t bytesize;
  83. #if IS_ENABLED(CONFIG_DEBUG_FS)
  84. struct dentry *dbg_dir;
  85. #endif
  86. };
  87. struct nbd_device {
  88. struct blk_mq_tag_set tag_set;
  89. int index;
  90. refcount_t config_refs;
  91. refcount_t refs;
  92. struct nbd_config *config;
  93. struct mutex config_lock;
  94. struct gendisk *disk;
  95. struct workqueue_struct *recv_workq;
  96. struct list_head list;
  97. struct task_struct *task_recv;
  98. struct task_struct *task_setup;
  99. };
  100. #define NBD_CMD_REQUEUED 1
  101. struct nbd_cmd {
  102. struct nbd_device *nbd;
  103. struct mutex lock;
  104. int index;
  105. int cookie;
  106. blk_status_t status;
  107. unsigned long flags;
  108. u32 cmd_cookie;
  109. };
  110. #if IS_ENABLED(CONFIG_DEBUG_FS)
  111. static struct dentry *nbd_dbg_dir;
  112. #endif
  113. #define nbd_name(nbd) ((nbd)->disk->disk_name)
  114. #define NBD_MAGIC 0x68797548
  115. #define NBD_DEF_BLKSIZE 1024
  116. static unsigned int nbds_max = 16;
  117. static int max_part = 16;
  118. static int part_shift;
  119. static int nbd_dev_dbg_init(struct nbd_device *nbd);
  120. static void nbd_dev_dbg_close(struct nbd_device *nbd);
  121. static void nbd_config_put(struct nbd_device *nbd);
  122. static void nbd_connect_reply(struct genl_info *info, int index);
  123. static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
  124. static void nbd_dead_link_work(struct work_struct *work);
  125. static void nbd_disconnect_and_put(struct nbd_device *nbd);
  126. static inline struct device *nbd_to_dev(struct nbd_device *nbd)
  127. {
  128. return disk_to_dev(nbd->disk);
  129. }
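/*
 * Requeue a command at most once per submission: nbd_requeue_cmd() sets
 * NBD_CMD_REQUEUED (cleared again in nbd_queue_rq()) so that a racing
 * timeout and send path do not requeue the same request twice.
 */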
  130. static void nbd_requeue_cmd(struct nbd_cmd *cmd)
  131. {
  132. struct request *req = blk_mq_rq_from_pdu(cmd);
  133. if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
  134. blk_mq_requeue_request(req, true);
  135. }
  136. #define NBD_COOKIE_BITS 32
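/*
 * The 64-bit handle put on the wire packs the per-command reissue cookie
 * into the upper 32 bits and the unique blk-mq tag into the lower 32 bits,
 * e.g. cookie 2 with tag 5 gives the handle 0x0000000200000005. The two
 * helpers below undo that split when a reply comes back.
 */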
  137. static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
  138. {
  139. struct request *req = blk_mq_rq_from_pdu(cmd);
  140. u32 tag = blk_mq_unique_tag(req);
  141. u64 cookie = cmd->cmd_cookie;
  142. return (cookie << NBD_COOKIE_BITS) | tag;
  143. }
  144. static u32 nbd_handle_to_tag(u64 handle)
  145. {
  146. return (u32)handle;
  147. }
  148. static u32 nbd_handle_to_cookie(u64 handle)
  149. {
  150. return (u32)(handle >> NBD_COOKIE_BITS);
  151. }
  152. static const char *nbdcmd_to_ascii(int cmd)
  153. {
  154. switch (cmd) {
  155. case NBD_CMD_READ: return "read";
  156. case NBD_CMD_WRITE: return "write";
  157. case NBD_CMD_DISC: return "disconnect";
  158. case NBD_CMD_FLUSH: return "flush";
  159. case NBD_CMD_TRIM: return "trim/discard";
  160. }
  161. return "invalid";
  162. }
  163. static ssize_t pid_show(struct device *dev,
  164. struct device_attribute *attr, char *buf)
  165. {
  166. struct gendisk *disk = dev_to_disk(dev);
  167. struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
  168. return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
  169. }
  170. static const struct device_attribute pid_attr = {
  171. .attr = { .name = "pid", .mode = 0444},
  172. .show = pid_show,
  173. };
  174. static void nbd_dev_remove(struct nbd_device *nbd)
  175. {
  176. struct gendisk *disk = nbd->disk;
  177. struct request_queue *q;
  178. if (disk) {
  179. q = disk->queue;
  180. del_gendisk(disk);
  181. blk_cleanup_queue(q);
  182. blk_mq_free_tag_set(&nbd->tag_set);
  183. disk->private_data = NULL;
  184. put_disk(disk);
  185. }
  186. kfree(nbd);
  187. }
  188. static void nbd_put(struct nbd_device *nbd)
  189. {
  190. if (refcount_dec_and_mutex_lock(&nbd->refs,
  191. &nbd_index_mutex)) {
  192. idr_remove(&nbd_index_idr, nbd->index);
  193. nbd_dev_remove(nbd);
  194. mutex_unlock(&nbd_index_mutex);
  195. }
  196. }
  197. static int nbd_disconnected(struct nbd_config *config)
  198. {
  199. return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
  200. test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
  201. }
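/*
 * Take a connection out of service: optionally schedule nbd_dead_link_work()
 * for the device, shut the socket down so blocked senders and receivers
 * error out, and if this was the last live connection honour a pending
 * disconnect request. Callers hold nsock->tx_lock.
 */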
  202. static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
  203. int notify)
  204. {
  205. if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
  206. struct link_dead_args *args;
  207. args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
  208. if (args) {
  209. INIT_WORK(&args->work, nbd_dead_link_work);
  210. args->index = nbd->index;
  211. queue_work(system_wq, &args->work);
  212. }
  213. }
  214. if (!nsock->dead) {
  215. kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
  216. if (atomic_dec_return(&nbd->config->live_connections) == 0) {
  217. if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
  218. &nbd->config->runtime_flags)) {
  219. set_bit(NBD_DISCONNECTED,
  220. &nbd->config->runtime_flags);
  221. dev_info(nbd_to_dev(nbd),
  222. "Disconnected due to user request.\n");
  223. }
  224. }
  225. }
  226. nsock->dead = true;
  227. nsock->pending = NULL;
  228. nsock->sent = 0;
  229. }
  230. static void nbd_size_clear(struct nbd_device *nbd)
  231. {
  232. if (nbd->config->bytesize) {
  233. set_capacity(nbd->disk, 0);
  234. kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
  235. }
  236. }
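/*
 * Push the configured block size and byte size into the queue limits and
 * the gendisk capacity (in 512-byte sectors), mirror them onto the block
 * device if one exists, and emit a uevent so userspace sees the resize.
 */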
  237. static void nbd_size_update(struct nbd_device *nbd, bool start)
  238. {
  239. struct nbd_config *config = nbd->config;
  240. struct block_device *bdev = bdget_disk(nbd->disk, 0);
  241. if (config->flags & NBD_FLAG_SEND_TRIM) {
  242. nbd->disk->queue->limits.discard_granularity = config->blksize;
  243. nbd->disk->queue->limits.discard_alignment = config->blksize;
  244. blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
  245. }
  246. blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
  247. blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
  248. set_capacity(nbd->disk, config->bytesize >> 9);
  249. if (bdev) {
  250. if (bdev->bd_disk) {
  251. bd_set_size(bdev, config->bytesize);
  252. if (start)
  253. set_blocksize(bdev, config->blksize);
  254. } else
  255. bdev->bd_invalidated = 1;
  256. bdput(bdev);
  257. }
  258. kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
  259. }
  260. static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
  261. loff_t nr_blocks)
  262. {
  263. struct nbd_config *config = nbd->config;
  264. config->blksize = blocksize;
  265. config->bytesize = blocksize * nr_blocks;
  266. if (nbd->task_recv != NULL)
  267. nbd_size_update(nbd, false);
  268. }
  269. static void nbd_complete_rq(struct request *req)
  270. {
  271. struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
  272. dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
  273. cmd->status ? "failed" : "done");
  274. blk_mq_end_request(req, cmd->status);
  275. }
  276. /*
  277. * Forcibly shut down the sockets, causing all listeners to error out
  278. */
  279. static void sock_shutdown(struct nbd_device *nbd)
  280. {
  281. struct nbd_config *config = nbd->config;
  282. int i;
  283. if (config->num_connections == 0)
  284. return;
  285. if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
  286. return;
  287. for (i = 0; i < config->num_connections; i++) {
  288. struct nbd_sock *nsock = config->socks[i];
  289. mutex_lock(&nsock->tx_lock);
  290. nbd_mark_nsock_dead(nbd, nsock, 0);
  291. mutex_unlock(&nsock->tx_lock);
  292. }
  293. dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
  294. }
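/*
 * blk-mq timeout handler. With more than one connection the command is
 * requeued onto another socket and the one it timed out on is marked dead;
 * with a single connection the device is shut down and the request fails
 * with BLK_STS_IOERR.
 */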
  295. static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
  296. bool reserved)
  297. {
  298. struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
  299. struct nbd_device *nbd = cmd->nbd;
  300. struct nbd_config *config;
  301. if (!mutex_trylock(&cmd->lock))
  302. return BLK_EH_RESET_TIMER;
  303. if (!refcount_inc_not_zero(&nbd->config_refs)) {
  304. cmd->status = BLK_STS_TIMEOUT;
  305. mutex_unlock(&cmd->lock);
  306. goto done;
  307. }
  308. config = nbd->config;
  309. if (config->num_connections > 1) {
  310. dev_err_ratelimited(nbd_to_dev(nbd),
  311. "Connection timed out, retrying (%d/%d alive)\n",
  312. atomic_read(&config->live_connections),
  313. config->num_connections);
  314. /*
  315. * Hooray, we have more connections; requeue this IO and the
  316. * submit path will put it on a real connection.
  317. */
  318. if (config->socks && config->num_connections > 1) {
  319. if (cmd->index < config->num_connections) {
  320. struct nbd_sock *nsock =
  321. config->socks[cmd->index];
  322. mutex_lock(&nsock->tx_lock);
  323. /* We can have multiple outstanding requests, so
  324. * we don't want to mark the nsock dead if we've
  325. * already reconnected with a new socket; only
  326. * mark it dead if it's the same socket this
  327. * command was sent out on.
  328. */
  329. if (cmd->cookie == nsock->cookie)
  330. nbd_mark_nsock_dead(nbd, nsock, 1);
  331. mutex_unlock(&nsock->tx_lock);
  332. }
  333. mutex_unlock(&cmd->lock);
  334. nbd_requeue_cmd(cmd);
  335. nbd_config_put(nbd);
  336. return BLK_EH_DONE;
  337. }
  338. } else {
  339. dev_err_ratelimited(nbd_to_dev(nbd),
  340. "Connection timed out\n");
  341. }
  342. set_bit(NBD_TIMEDOUT, &config->runtime_flags);
  343. cmd->status = BLK_STS_IOERR;
  344. mutex_unlock(&cmd->lock);
  345. sock_shutdown(nbd);
  346. nbd_config_put(nbd);
  347. done:
  348. blk_mq_complete_request(req);
  349. return BLK_EH_DONE;
  350. }
  351. /*
  352. * Send or receive packet.
  353. */
  354. static int sock_xmit(struct nbd_device *nbd, int index, int send,
  355. struct iov_iter *iter, int msg_flags, int *sent)
  356. {
  357. struct nbd_config *config = nbd->config;
  358. struct socket *sock = config->socks[index]->sock;
  359. int result;
  360. struct msghdr msg;
  361. unsigned int noreclaim_flag;
  362. if (unlikely(!sock)) {
  363. dev_err_ratelimited(disk_to_dev(nbd->disk),
  364. "Attempted %s on closed socket in sock_xmit\n",
  365. (send ? "send" : "recv"));
  366. return -EINVAL;
  367. }
  368. msg.msg_iter = *iter;
  369. noreclaim_flag = memalloc_noreclaim_save();
  370. do {
  371. sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
  372. msg.msg_name = NULL;
  373. msg.msg_namelen = 0;
  374. msg.msg_control = NULL;
  375. msg.msg_controllen = 0;
  376. msg.msg_flags = msg_flags | MSG_NOSIGNAL;
  377. if (send)
  378. result = sock_sendmsg(sock, &msg);
  379. else
  380. result = sock_recvmsg(sock, &msg, msg.msg_flags);
  381. if (result <= 0) {
  382. if (result == 0)
  383. result = -EPIPE; /* short read */
  384. break;
  385. }
  386. if (sent)
  387. *sent += result;
  388. } while (msg_data_left(&msg));
  389. memalloc_noreclaim_restore(noreclaim_flag);
  390. return result;
  391. }
  392. /*
  393. * Different settings for sk->sk_sndtimeo can result in different return values
  394. * if there is a signal pending when we enter sendmsg, because reasons?
  395. */
  396. static inline int was_interrupted(int result)
  397. {
  398. return result == -ERESTARTSYS || result == -EINTR;
  399. }
  400. /* always call with the tx_lock held */
  401. static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
  402. {
  403. struct request *req = blk_mq_rq_from_pdu(cmd);
  404. struct nbd_config *config = nbd->config;
  405. struct nbd_sock *nsock = config->socks[index];
  406. int result;
  407. struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
  408. struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
  409. struct iov_iter from;
  410. unsigned long size = blk_rq_bytes(req);
  411. struct bio *bio;
  412. u64 handle;
  413. u32 type;
  414. u32 nbd_cmd_flags = 0;
  415. int sent = nsock->sent, skip = 0;
  416. iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
  417. switch (req_op(req)) {
  418. case REQ_OP_DISCARD:
  419. type = NBD_CMD_TRIM;
  420. break;
  421. case REQ_OP_FLUSH:
  422. type = NBD_CMD_FLUSH;
  423. break;
  424. case REQ_OP_WRITE:
  425. type = NBD_CMD_WRITE;
  426. break;
  427. case REQ_OP_READ:
  428. type = NBD_CMD_READ;
  429. break;
  430. default:
  431. return -EIO;
  432. }
  433. if (rq_data_dir(req) == WRITE &&
  434. (config->flags & NBD_FLAG_READ_ONLY)) {
  435. dev_err_ratelimited(disk_to_dev(nbd->disk),
  436. "Write on read-only\n");
  437. return -EIO;
  438. }
  439. if (req->cmd_flags & REQ_FUA)
  440. nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
  441. /* We did a partial send previously, and we at least sent the whole
  442. * request struct, so just go and send the rest of the pages in the
  443. * request.
  444. */
  445. if (sent) {
  446. if (sent >= sizeof(request)) {
  447. skip = sent - sizeof(request);
  448. goto send_pages;
  449. }
  450. iov_iter_advance(&from, sent);
  451. } else {
  452. cmd->cmd_cookie++;
  453. }
  454. cmd->index = index;
  455. cmd->cookie = nsock->cookie;
  456. request.type = htonl(type | nbd_cmd_flags);
  457. if (type != NBD_CMD_FLUSH) {
  458. request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
  459. request.len = htonl(size);
  460. }
  461. handle = nbd_cmd_handle(cmd);
  462. memcpy(request.handle, &handle, sizeof(handle));
  463. dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
  464. req, nbdcmd_to_ascii(type),
  465. (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
  466. result = sock_xmit(nbd, index, 1, &from,
  467. (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
  468. if (result <= 0) {
  469. if (was_interrupted(result)) {
  470. /* If we haven't sent anything we can just return BUSY,
  471. * however if we have sent something we need to make
  472. * sure this req is the only one sent on this socket
  473. * until it is completely done.
  474. */
  475. if (sent) {
  476. nsock->pending = req;
  477. nsock->sent = sent;
  478. }
  479. set_bit(NBD_CMD_REQUEUED, &cmd->flags);
  480. return BLK_STS_RESOURCE;
  481. }
  482. dev_err_ratelimited(disk_to_dev(nbd->disk),
  483. "Send control failed (result %d)\n", result);
  484. return -EAGAIN;
  485. }
  486. send_pages:
  487. if (type != NBD_CMD_WRITE)
  488. goto out;
  489. bio = req->bio;
  490. while (bio) {
  491. struct bio *next = bio->bi_next;
  492. struct bvec_iter iter;
  493. struct bio_vec bvec;
  494. bio_for_each_segment(bvec, bio, iter) {
  495. bool is_last = !next && bio_iter_last(bvec, iter);
  496. int flags = is_last ? 0 : MSG_MORE;
  497. dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
  498. req, bvec.bv_len);
  499. iov_iter_bvec(&from, ITER_BVEC | WRITE,
  500. &bvec, 1, bvec.bv_len);
  501. if (skip) {
  502. if (skip >= iov_iter_count(&from)) {
  503. skip -= iov_iter_count(&from);
  504. continue;
  505. }
  506. iov_iter_advance(&from, skip);
  507. skip = 0;
  508. }
  509. result = sock_xmit(nbd, index, 1, &from, flags, &sent);
  510. if (result <= 0) {
  511. if (was_interrupted(result)) {
  512. /* We've already sent the header, we
  513. * have no choice but to set pending and
  514. * return BUSY.
  515. */
  516. nsock->pending = req;
  517. nsock->sent = sent;
  518. set_bit(NBD_CMD_REQUEUED, &cmd->flags);
  519. return BLK_STS_RESOURCE;
  520. }
  521. dev_err(disk_to_dev(nbd->disk),
  522. "Send data failed (result %d)\n",
  523. result);
  524. return -EAGAIN;
  525. }
  526. /*
  527. * The completion might already have come in,
  528. * so break for the last one instead of letting
  529. * the iterator do it. This prevents use-after-free
  530. * of the bio.
  531. */
  532. if (is_last)
  533. break;
  534. }
  535. bio = next;
  536. }
  537. out:
  538. nsock->pending = NULL;
  539. nsock->sent = 0;
  540. return 0;
  541. }
  542. /* NULL returned = something went wrong, inform userspace */
  543. static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
  544. {
  545. struct nbd_config *config = nbd->config;
  546. int result;
  547. struct nbd_reply reply;
  548. struct nbd_cmd *cmd;
  549. struct request *req = NULL;
  550. u64 handle;
  551. u16 hwq;
  552. u32 tag;
  553. struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
  554. struct iov_iter to;
  555. int ret = 0;
  556. reply.magic = 0;
  557. iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
  558. result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
  559. if (result <= 0) {
  560. if (!nbd_disconnected(config))
  561. dev_err(disk_to_dev(nbd->disk),
  562. "Receive control failed (result %d)\n", result);
  563. return ERR_PTR(result);
  564. }
  565. if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
  566. dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
  567. (unsigned long)ntohl(reply.magic));
  568. return ERR_PTR(-EPROTO);
  569. }
  570. memcpy(&handle, reply.handle, sizeof(handle));
  571. tag = nbd_handle_to_tag(handle);
  572. hwq = blk_mq_unique_tag_to_hwq(tag);
  573. if (hwq < nbd->tag_set.nr_hw_queues)
  574. req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
  575. blk_mq_unique_tag_to_tag(tag));
  576. if (!req || !blk_mq_request_started(req)) {
  577. dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
  578. tag, req);
  579. return ERR_PTR(-ENOENT);
  580. }
  581. cmd = blk_mq_rq_to_pdu(req);
  582. mutex_lock(&cmd->lock);
  583. if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
  584. dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
  585. req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
  586. ret = -ENOENT;
  587. goto out;
  588. }
  589. if (cmd->status != BLK_STS_OK) {
  590. dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
  591. req);
  592. ret = -ENOENT;
  593. goto out;
  594. }
  595. if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
  596. dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
  597. req);
  598. ret = -ENOENT;
  599. goto out;
  600. }
  601. if (ntohl(reply.error)) {
  602. dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
  603. ntohl(reply.error));
  604. cmd->status = BLK_STS_IOERR;
  605. goto out;
  606. }
  607. dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
  608. if (rq_data_dir(req) != WRITE) {
  609. struct req_iterator iter;
  610. struct bio_vec bvec;
  611. rq_for_each_segment(bvec, req, iter) {
  612. iov_iter_bvec(&to, ITER_BVEC | READ,
  613. &bvec, 1, bvec.bv_len);
  614. result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
  615. if (result <= 0) {
  616. dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
  617. result);
  618. /*
  619. * If we've disconnected or we only have 1
  620. * connection then we need to make sure we
  621. * complete this request, otherwise error out
  622. * and let the timeout stuff handle resubmitting
  623. * this request onto another connection.
  624. */
  625. if (nbd_disconnected(config) ||
  626. config->num_connections <= 1) {
  627. cmd->status = BLK_STS_IOERR;
  628. goto out;
  629. }
  630. ret = -EIO;
  631. goto out;
  632. }
  633. dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
  634. req, bvec.bv_len);
  635. }
  636. }
  637. out:
  638. mutex_unlock(&cmd->lock);
  639. return ret ? ERR_PTR(ret) : cmd;
  640. }
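/*
 * Per-connection receive worker: keep pulling replies off the socket and
 * completing the matching requests until nbd_read_stat() fails, then mark
 * the connection dead and drop the config reference taken when the worker
 * was queued.
 */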
  641. static void recv_work(struct work_struct *work)
  642. {
  643. struct recv_thread_args *args = container_of(work,
  644. struct recv_thread_args,
  645. work);
  646. struct nbd_device *nbd = args->nbd;
  647. struct nbd_config *config = nbd->config;
  648. struct nbd_cmd *cmd;
  649. while (1) {
  650. cmd = nbd_read_stat(nbd, args->index);
  651. if (IS_ERR(cmd)) {
  652. struct nbd_sock *nsock = config->socks[args->index];
  653. mutex_lock(&nsock->tx_lock);
  654. nbd_mark_nsock_dead(nbd, nsock, 1);
  655. mutex_unlock(&nsock->tx_lock);
  656. break;
  657. }
  658. blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
  659. }
  660. nbd_config_put(nbd);
  661. atomic_dec(&config->recv_threads);
  662. wake_up(&config->recv_wq);
  663. kfree(args);
  664. }
  665. static void nbd_clear_req(struct request *req, void *data, bool reserved)
  666. {
  667. struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
  668. mutex_lock(&cmd->lock);
  669. cmd->status = BLK_STS_IOERR;
  670. mutex_unlock(&cmd->lock);
  671. blk_mq_complete_request(req);
  672. }
  673. static void nbd_clear_que(struct nbd_device *nbd)
  674. {
  675. blk_mq_quiesce_queue(nbd->disk->queue);
  676. blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
  677. blk_mq_unquiesce_queue(nbd->disk->queue);
  678. dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
  679. }
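/*
 * The socket this command was headed for is dead; pick another live
 * connection to send it on. Returns the replacement index, or -1 if the
 * device is disconnected or no live connection remains.
 */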
  680. static int find_fallback(struct nbd_device *nbd, int index)
  681. {
  682. struct nbd_config *config = nbd->config;
  683. int new_index = -1;
  684. struct nbd_sock *nsock = config->socks[index];
  685. int fallback = nsock->fallback_index;
  686. if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
  687. return new_index;
  688. if (config->num_connections <= 1) {
  689. dev_err_ratelimited(disk_to_dev(nbd->disk),
  690. "Attempted send on invalid socket\n");
  691. return new_index;
  692. }
  693. if (fallback >= 0 && fallback < config->num_connections &&
  694. !config->socks[fallback]->dead)
  695. return fallback;
  696. if (nsock->fallback_index < 0 ||
  697. nsock->fallback_index >= config->num_connections ||
  698. config->socks[nsock->fallback_index]->dead) {
  699. int i;
  700. for (i = 0; i < config->num_connections; i++) {
  701. if (i == index)
  702. continue;
  703. if (!config->socks[i]->dead) {
  704. new_index = i;
  705. break;
  706. }
  707. }
  708. nsock->fallback_index = new_index;
  709. if (new_index < 0) {
  710. dev_err_ratelimited(disk_to_dev(nbd->disk),
  711. "Dead connection, failed to find a fallback\n");
  712. return new_index;
  713. }
  714. }
  715. new_index = nsock->fallback_index;
  716. return new_index;
  717. }
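/*
 * No live connections are left: if a dead connection timeout is configured
 * and we are not disconnected, wait up to that long for userspace to plug
 * in a replacement socket (see nbd_reconnect_socket(), which wakes
 * conn_wait). Returns non-zero if a connection came back in time.
 */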
  718. static int wait_for_reconnect(struct nbd_device *nbd)
  719. {
  720. struct nbd_config *config = nbd->config;
  721. if (!config->dead_conn_timeout)
  722. return 0;
  723. if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
  724. return 0;
  725. return wait_event_timeout(config->conn_wait,
  726. atomic_read(&config->live_connections) > 0,
  727. config->dead_conn_timeout) > 0;
  728. }
  729. static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
  730. {
  731. struct request *req = blk_mq_rq_from_pdu(cmd);
  732. struct nbd_device *nbd = cmd->nbd;
  733. struct nbd_config *config;
  734. struct nbd_sock *nsock;
  735. int ret;
  736. if (!refcount_inc_not_zero(&nbd->config_refs)) {
  737. dev_err_ratelimited(disk_to_dev(nbd->disk),
  738. "Socks array is empty\n");
  739. blk_mq_start_request(req);
  740. return -EINVAL;
  741. }
  742. config = nbd->config;
  743. if (index >= config->num_connections) {
  744. dev_err_ratelimited(disk_to_dev(nbd->disk),
  745. "Attempted send on invalid socket\n");
  746. nbd_config_put(nbd);
  747. blk_mq_start_request(req);
  748. return -EINVAL;
  749. }
  750. cmd->status = BLK_STS_OK;
  751. again:
  752. nsock = config->socks[index];
  753. mutex_lock(&nsock->tx_lock);
  754. if (nsock->dead) {
  755. int old_index = index;
  756. index = find_fallback(nbd, index);
  757. mutex_unlock(&nsock->tx_lock);
  758. if (index < 0) {
  759. if (wait_for_reconnect(nbd)) {
  760. index = old_index;
  761. goto again;
  762. }
  763. /* All the sockets should already be down at this point;
  764. * we just want to make sure that DISCONNECTED is set so
  765. * any requests that come in that were queued waiting
  766. * for the reconnect timer don't trigger the timer again
  767. * and instead just error out.
  768. */
  769. sock_shutdown(nbd);
  770. nbd_config_put(nbd);
  771. blk_mq_start_request(req);
  772. return -EIO;
  773. }
  774. goto again;
  775. }
  776. /* Handle the case that we have a pending request that was partially
  777. * transmitted that _has_ to be serviced first. We need to call requeue
  778. * here so that it gets put _after_ the request that is already on the
  779. * dispatch list.
  780. */
  781. blk_mq_start_request(req);
  782. if (unlikely(nsock->pending && nsock->pending != req)) {
  783. nbd_requeue_cmd(cmd);
  784. ret = 0;
  785. goto out;
  786. }
  787. /*
  788. * Some failures are related to the link going down, so anything that
  789. * returns EAGAIN can be retried on a different socket.
  790. */
  791. ret = nbd_send_cmd(nbd, cmd, index);
  792. if (ret == -EAGAIN) {
  793. dev_err_ratelimited(disk_to_dev(nbd->disk),
  794. "Request send failed, requeueing\n");
  795. nbd_mark_nsock_dead(nbd, nsock, 1);
  796. nbd_requeue_cmd(cmd);
  797. ret = 0;
  798. }
  799. out:
  800. mutex_unlock(&nsock->tx_lock);
  801. nbd_config_put(nbd);
  802. return ret;
  803. }
  804. static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
  805. const struct blk_mq_queue_data *bd)
  806. {
  807. struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
  808. int ret;
  809. /*
  810. * Since we look at the bios to send the request over the network we
  811. * need to make sure the completion work doesn't mark this request done
  812. * before we are done doing our send. This keeps us from dereferencing
  813. * freed data if we have particularly fast completions (ie we get the
  814. * completion before we exit sock_xmit on the last bvec) or in the case
  815. * that the server is misbehaving (or there was an error) before we're
  816. * done sending everything over the wire.
  817. */
  818. mutex_lock(&cmd->lock);
  819. clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
  820. /* We can be called directly from the user space process, which means we
  821. * could possibly have signals pending so our sendmsg will fail. In
  822. * this case we need to return that we are busy, otherwise error out as
  823. * appropriate.
  824. */
  825. ret = nbd_handle_cmd(cmd, hctx->queue_num);
  826. if (ret < 0)
  827. ret = BLK_STS_IOERR;
  828. else if (!ret)
  829. ret = BLK_STS_OK;
  830. mutex_unlock(&cmd->lock);
  831. return ret;
  832. }
  833. static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
  834. int *err)
  835. {
  836. struct socket *sock;
  837. *err = 0;
  838. sock = sockfd_lookup(fd, err);
  839. if (!sock)
  840. return NULL;
  841. if (sock->ops->shutdown == sock_no_shutdown) {
  842. dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
  843. *err = -EINVAL;
  844. sockfd_put(sock);
  845. return NULL;
  846. }
  847. return sock;
  848. }
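/*
 * Add a userspace-supplied socket fd as another connection. The queue is
 * frozen while the ->socks array is reallocated so no request can observe
 * it mid-resize, and on the ioctl path only the task that began the setup
 * may add further sockets.
 */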
  849. static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
  850. bool netlink)
  851. {
  852. struct nbd_config *config = nbd->config;
  853. struct socket *sock;
  854. struct nbd_sock **socks;
  855. struct nbd_sock *nsock;
  856. int err;
  857. sock = nbd_get_socket(nbd, arg, &err);
  858. if (!sock)
  859. return err;
  860. /*
  861. * We need to make sure we don't get any errant requests while we're
  862. * reallocating the ->socks array.
  863. */
  864. blk_mq_freeze_queue(nbd->disk->queue);
  865. if (!netlink && !nbd->task_setup &&
  866. !test_bit(NBD_BOUND, &config->runtime_flags))
  867. nbd->task_setup = current;
  868. if (!netlink &&
  869. (nbd->task_setup != current ||
  870. test_bit(NBD_BOUND, &config->runtime_flags))) {
  871. dev_err(disk_to_dev(nbd->disk),
  872. "Device being setup by another task");
  873. err = -EBUSY;
  874. goto put_socket;
  875. }
  876. nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
  877. if (!nsock) {
  878. err = -ENOMEM;
  879. goto put_socket;
  880. }
  881. socks = krealloc(config->socks, (config->num_connections + 1) *
  882. sizeof(struct nbd_sock *), GFP_KERNEL);
  883. if (!socks) {
  884. kfree(nsock);
  885. err = -ENOMEM;
  886. goto put_socket;
  887. }
  888. config->socks = socks;
  889. nsock->fallback_index = -1;
  890. nsock->dead = false;
  891. mutex_init(&nsock->tx_lock);
  892. nsock->sock = sock;
  893. nsock->pending = NULL;
  894. nsock->sent = 0;
  895. nsock->cookie = 0;
  896. socks[config->num_connections++] = nsock;
  897. atomic_inc(&config->live_connections);
  898. blk_mq_unfreeze_queue(nbd->disk->queue);
  899. return 0;
  900. put_socket:
  901. blk_mq_unfreeze_queue(nbd->disk->queue);
  902. sockfd_put(sock);
  903. return err;
  904. }
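/*
 * Replace the socket of a dead connection with a freshly supplied one,
 * bump the nsock cookie so replies to commands sent on the old socket are
 * treated as stale, and start a new receive worker for the connection.
 */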
  905. static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
  906. {
  907. struct nbd_config *config = nbd->config;
  908. struct socket *sock, *old;
  909. struct recv_thread_args *args;
  910. int i;
  911. int err;
  912. sock = nbd_get_socket(nbd, arg, &err);
  913. if (!sock)
  914. return err;
  915. args = kzalloc(sizeof(*args), GFP_KERNEL);
  916. if (!args) {
  917. sockfd_put(sock);
  918. return -ENOMEM;
  919. }
  920. for (i = 0; i < config->num_connections; i++) {
  921. struct nbd_sock *nsock = config->socks[i];
  922. if (!nsock->dead)
  923. continue;
  924. mutex_lock(&nsock->tx_lock);
  925. if (!nsock->dead) {
  926. mutex_unlock(&nsock->tx_lock);
  927. continue;
  928. }
  929. sk_set_memalloc(sock->sk);
  930. if (nbd->tag_set.timeout)
  931. sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
  932. atomic_inc(&config->recv_threads);
  933. refcount_inc(&nbd->config_refs);
  934. old = nsock->sock;
  935. nsock->fallback_index = -1;
  936. nsock->sock = sock;
  937. nsock->dead = false;
  938. INIT_WORK(&args->work, recv_work);
  939. args->index = i;
  940. args->nbd = nbd;
  941. nsock->cookie++;
  942. mutex_unlock(&nsock->tx_lock);
  943. sockfd_put(old);
  944. clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
  945. /* We take the tx_lock in an error path in recv_work, so we
  946. * need to queue_work outside of the tx_lock.
  947. */
  948. queue_work(nbd->recv_workq, &args->work);
  949. atomic_inc(&config->live_connections);
  950. wake_up(&config->conn_wait);
  951. return 0;
  952. }
  953. sockfd_put(sock);
  954. kfree(args);
  955. return -ENOSPC;
  956. }
  957. static void nbd_bdev_reset(struct block_device *bdev)
  958. {
  959. if (bdev->bd_openers > 1)
  960. return;
  961. bd_set_size(bdev, 0);
  962. }
  963. static void nbd_parse_flags(struct nbd_device *nbd)
  964. {
  965. struct nbd_config *config = nbd->config;
  966. if (config->flags & NBD_FLAG_READ_ONLY)
  967. set_disk_ro(nbd->disk, true);
  968. else
  969. set_disk_ro(nbd->disk, false);
  970. if (config->flags & NBD_FLAG_SEND_TRIM)
  971. blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
  972. if (config->flags & NBD_FLAG_SEND_FLUSH) {
  973. if (config->flags & NBD_FLAG_SEND_FUA)
  974. blk_queue_write_cache(nbd->disk->queue, true, true);
  975. else
  976. blk_queue_write_cache(nbd->disk->queue, true, false);
  977. }
  978. else
  979. blk_queue_write_cache(nbd->disk->queue, false, false);
  980. }
  981. static void send_disconnects(struct nbd_device *nbd)
  982. {
  983. struct nbd_config *config = nbd->config;
  984. struct nbd_request request = {
  985. .magic = htonl(NBD_REQUEST_MAGIC),
  986. .type = htonl(NBD_CMD_DISC),
  987. };
  988. struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
  989. struct iov_iter from;
  990. int i, ret;
  991. for (i = 0; i < config->num_connections; i++) {
  992. struct nbd_sock *nsock = config->socks[i];
  993. iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
  994. mutex_lock(&nsock->tx_lock);
  995. ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
  996. if (ret <= 0)
  997. dev_err(disk_to_dev(nbd->disk),
  998. "Send disconnect failed %d\n", ret);
  999. mutex_unlock(&nsock->tx_lock);
  1000. }
  1001. }
  1002. static int nbd_disconnect(struct nbd_device *nbd)
  1003. {
  1004. struct nbd_config *config = nbd->config;
  1005. dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
  1006. set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
  1007. send_disconnects(nbd);
  1008. return 0;
  1009. }
  1010. static void nbd_clear_sock(struct nbd_device *nbd)
  1011. {
  1012. sock_shutdown(nbd);
  1013. nbd_clear_que(nbd);
  1014. nbd->task_setup = NULL;
  1015. }
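/*
 * Drop a config reference. The final put tears the configuration down:
 * sockets are shut down and released, the receive workqueue is destroyed,
 * queue limits are reset, and the device and module references taken when
 * the config was created are dropped.
 */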
  1016. static void nbd_config_put(struct nbd_device *nbd)
  1017. {
  1018. if (refcount_dec_and_mutex_lock(&nbd->config_refs,
  1019. &nbd->config_lock)) {
  1020. struct nbd_config *config = nbd->config;
  1021. nbd_dev_dbg_close(nbd);
  1022. nbd_size_clear(nbd);
  1023. if (test_and_clear_bit(NBD_HAS_PID_FILE,
  1024. &config->runtime_flags))
  1025. device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
  1026. nbd->task_recv = NULL;
  1027. nbd_clear_sock(nbd);
  1028. if (config->num_connections) {
  1029. int i;
  1030. for (i = 0; i < config->num_connections; i++) {
  1031. sockfd_put(config->socks[i]->sock);
  1032. kfree(config->socks[i]);
  1033. }
  1034. kfree(config->socks);
  1035. }
  1036. kfree(nbd->config);
  1037. nbd->config = NULL;
  1038. if (nbd->recv_workq)
  1039. destroy_workqueue(nbd->recv_workq);
  1040. nbd->recv_workq = NULL;
  1041. nbd->tag_set.timeout = 0;
  1042. nbd->disk->queue->limits.discard_granularity = 0;
  1043. nbd->disk->queue->limits.discard_alignment = 0;
  1044. blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
  1045. blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
  1046. mutex_unlock(&nbd->config_lock);
  1047. nbd_put(nbd);
  1048. module_put(THIS_MODULE);
  1049. }
  1050. }
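/*
 * Bring the device up: size the tag set to one hardware queue per
 * connection, expose the receiver pid in sysfs, create the debugfs
 * entries, and start one receive worker per connection.
 */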
  1051. static int nbd_start_device(struct nbd_device *nbd)
  1052. {
  1053. struct nbd_config *config = nbd->config;
  1054. int num_connections = config->num_connections;
  1055. int error = 0, i;
  1056. if (nbd->task_recv)
  1057. return -EBUSY;
  1058. if (!config->socks)
  1059. return -EINVAL;
  1060. if (num_connections > 1 &&
  1061. !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
  1062. dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
  1063. return -EINVAL;
  1064. }
  1065. nbd->recv_workq = alloc_workqueue("knbd%d-recv",
  1066. WQ_MEM_RECLAIM | WQ_HIGHPRI |
  1067. WQ_UNBOUND, 0, nbd->index);
  1068. if (!nbd->recv_workq) {
  1069. dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
  1070. return -ENOMEM;
  1071. }
  1072. blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
  1073. nbd->task_recv = current;
  1074. nbd_parse_flags(nbd);
  1075. error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
  1076. if (error) {
  1077. dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
  1078. return error;
  1079. }
  1080. set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
  1081. nbd_dev_dbg_init(nbd);
  1082. for (i = 0; i < num_connections; i++) {
  1083. struct recv_thread_args *args;
  1084. args = kzalloc(sizeof(*args), GFP_KERNEL);
  1085. if (!args) {
  1086. sock_shutdown(nbd);
  1087. /*
  1088. * If num_connections is m (m > 2) and the first n
  1089. * (1 < n < m) kzallocs succeed but allocation n + 1
  1090. * fails, n recv threads are already running. Flush the
  1091. * workqueue here so those threads cannot drop the last
  1092. * config ref and try to destroy the workqueue from
  1093. * inside the workqueue.
  1094. */
  1095. if (i)
  1096. flush_workqueue(nbd->recv_workq);
  1097. return -ENOMEM;
  1098. }
  1099. sk_set_memalloc(config->socks[i]->sock->sk);
  1100. if (nbd->tag_set.timeout)
  1101. config->socks[i]->sock->sk->sk_sndtimeo =
  1102. nbd->tag_set.timeout;
  1103. atomic_inc(&config->recv_threads);
  1104. refcount_inc(&nbd->config_refs);
  1105. INIT_WORK(&args->work, recv_work);
  1106. args->nbd = nbd;
  1107. args->index = i;
  1108. queue_work(nbd->recv_workq, &args->work);
  1109. }
  1110. nbd_size_update(nbd, true);
  1111. return error;
  1112. }
  1113. static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
  1114. {
  1115. struct nbd_config *config = nbd->config;
  1116. int ret;
  1117. ret = nbd_start_device(nbd);
  1118. if (ret)
  1119. return ret;
  1120. if (max_part)
  1121. bdev->bd_invalidated = 1;
  1122. mutex_unlock(&nbd->config_lock);
  1123. ret = wait_event_interruptible(config->recv_wq,
  1124. atomic_read(&config->recv_threads) == 0);
  1125. if (ret)
  1126. sock_shutdown(nbd);
  1127. flush_workqueue(nbd->recv_workq);
  1128. mutex_lock(&nbd->config_lock);
  1129. nbd_bdev_reset(bdev);
  1130. /* user requested, ignore socket errors */
  1131. if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
  1132. ret = 0;
  1133. if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
  1134. ret = -ETIMEDOUT;
  1135. return ret;
  1136. }
  1137. static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
  1138. struct block_device *bdev)
  1139. {
  1140. sock_shutdown(nbd);
  1141. __invalidate_device(bdev, true);
  1142. nbd_bdev_reset(bdev);
  1143. if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
  1144. &nbd->config->runtime_flags))
  1145. nbd_config_put(nbd);
  1146. }
  1147. static bool nbd_is_valid_blksize(unsigned long blksize)
  1148. {
  1149. if (!blksize || !is_power_of_2(blksize) || blksize < 512 ||
  1150. blksize > PAGE_SIZE)
  1151. return false;
  1152. return true;
  1153. }
  1154. /* Must be called with config_lock held */
  1155. static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
  1156. unsigned int cmd, unsigned long arg)
  1157. {
  1158. struct nbd_config *config = nbd->config;
  1159. switch (cmd) {
  1160. case NBD_DISCONNECT:
  1161. return nbd_disconnect(nbd);
  1162. case NBD_CLEAR_SOCK:
  1163. nbd_clear_sock_ioctl(nbd, bdev);
  1164. return 0;
  1165. case NBD_SET_SOCK:
  1166. return nbd_add_socket(nbd, arg, false);
  1167. case NBD_SET_BLKSIZE:
  1168. if (!arg)
  1169. arg = NBD_DEF_BLKSIZE;
  1170. if (!nbd_is_valid_blksize(arg))
  1171. return -EINVAL;
  1172. nbd_size_set(nbd, arg,
  1173. div_s64(config->bytesize, arg));
  1174. return 0;
  1175. case NBD_SET_SIZE:
  1176. nbd_size_set(nbd, config->blksize,
  1177. div_s64(arg, config->blksize));
  1178. return 0;
  1179. case NBD_SET_SIZE_BLOCKS:
  1180. nbd_size_set(nbd, config->blksize, arg);
  1181. return 0;
  1182. case NBD_SET_TIMEOUT:
  1183. if (arg) {
  1184. nbd->tag_set.timeout = arg * HZ;
  1185. blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
  1186. }
  1187. return 0;
  1188. case NBD_SET_FLAGS:
  1189. config->flags = arg;
  1190. return 0;
  1191. case NBD_DO_IT:
  1192. return nbd_start_device_ioctl(nbd, bdev);
  1193. case NBD_CLEAR_QUE:
  1194. /*
  1195. * This is for compatibility only. The queue is always cleared
  1196. * by NBD_DO_IT or NBD_CLEAR_SOCK.
  1197. */
  1198. return 0;
  1199. case NBD_PRINT_DEBUG:
  1200. /*
  1201. * For compatibility only, we no longer keep a list of
  1202. * outstanding requests.
  1203. */
  1204. return 0;
  1205. }
  1206. return -ENOTTY;
  1207. }
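/*
 * For reference, a minimal sketch of how userspace typically drives the
 * ioctl interface above (illustrative only; nbd_fd is an open /dev/nbdX
 * descriptor and sock_fd a connected socket to the server):
 *
 *	ioctl(nbd_fd, NBD_SET_BLKSIZE, 4096);
 *	ioctl(nbd_fd, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(nbd_fd, NBD_SET_FLAGS, server_flags);
 *	ioctl(nbd_fd, NBD_SET_SOCK, sock_fd);
 *	ioctl(nbd_fd, NBD_DO_IT);       (blocks until disconnect)
 */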
  1208. static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
  1209. unsigned int cmd, unsigned long arg)
  1210. {
  1211. struct nbd_device *nbd = bdev->bd_disk->private_data;
  1212. struct nbd_config *config = nbd->config;
  1213. int error = -EINVAL;
  1214. if (!capable(CAP_SYS_ADMIN))
  1215. return -EPERM;
  1216. /* The block layer will pass back some non-nbd ioctls in case we have
  1217. * special handling for them, but we don't, so just return an error.
  1218. */
  1219. if (_IOC_TYPE(cmd) != 0xab)
  1220. return -EINVAL;
  1221. mutex_lock(&nbd->config_lock);
  1222. /* Don't allow ioctl operations on a nbd device that was created with
  1223. * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
  1224. */
  1225. if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
  1226. (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
  1227. error = __nbd_ioctl(bdev, nbd, cmd, arg);
  1228. else
  1229. dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
  1230. mutex_unlock(&nbd->config_lock);
  1231. return error;
  1232. }
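/*
 * Allocate a fresh config. The module reference taken here pairs with the
 * module_put() in nbd_config_put(), so the module cannot be unloaded while
 * a configured device exists.
 */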
  1233. static struct nbd_config *nbd_alloc_config(void)
  1234. {
  1235. struct nbd_config *config;
  1236. config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
  1237. if (!config)
  1238. return NULL;
  1239. atomic_set(&config->recv_threads, 0);
  1240. init_waitqueue_head(&config->recv_wq);
  1241. init_waitqueue_head(&config->conn_wait);
  1242. config->blksize = NBD_DEF_BLKSIZE;
  1243. atomic_set(&config->live_connections, 0);
  1244. try_module_get(THIS_MODULE);
  1245. return config;
  1246. }
  1247. static int nbd_open(struct block_device *bdev, fmode_t mode)
  1248. {
  1249. struct nbd_device *nbd;
  1250. int ret = 0;
  1251. mutex_lock(&nbd_index_mutex);
  1252. nbd = bdev->bd_disk->private_data;
  1253. if (!nbd) {
  1254. ret = -ENXIO;
  1255. goto out;
  1256. }
  1257. if (!refcount_inc_not_zero(&nbd->refs)) {
  1258. ret = -ENXIO;
  1259. goto out;
  1260. }
  1261. if (!refcount_inc_not_zero(&nbd->config_refs)) {
  1262. struct nbd_config *config;
  1263. mutex_lock(&nbd->config_lock);
  1264. if (refcount_inc_not_zero(&nbd->config_refs)) {
  1265. mutex_unlock(&nbd->config_lock);
  1266. goto out;
  1267. }
  1268. config = nbd->config = nbd_alloc_config();
  1269. if (!config) {
  1270. ret = -ENOMEM;
  1271. mutex_unlock(&nbd->config_lock);
  1272. goto out;
  1273. }
  1274. refcount_set(&nbd->config_refs, 1);
  1275. refcount_inc(&nbd->refs);
  1276. mutex_unlock(&nbd->config_lock);
  1277. bdev->bd_invalidated = 1;
  1278. } else if (nbd_disconnected(nbd->config)) {
  1279. bdev->bd_invalidated = 1;
  1280. }
  1281. out:
  1282. mutex_unlock(&nbd_index_mutex);
  1283. return ret;
  1284. }
  1285. static void nbd_release(struct gendisk *disk, fmode_t mode)
  1286. {
  1287. struct nbd_device *nbd = disk->private_data;
  1288. struct block_device *bdev = bdget_disk(disk, 0);
  1289. if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
  1290. bdev->bd_openers == 0)
  1291. nbd_disconnect_and_put(nbd);
  1292. bdput(bdev);
  1293. nbd_config_put(nbd);
  1294. nbd_put(nbd);
  1295. }
  1296. static const struct block_device_operations nbd_fops =
  1297. {
  1298. .owner = THIS_MODULE,
  1299. .open = nbd_open,
  1300. .release = nbd_release,
  1301. .ioctl = nbd_ioctl,
  1302. .compat_ioctl = nbd_ioctl,
  1303. };
  1304. #if IS_ENABLED(CONFIG_DEBUG_FS)
  1305. static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
  1306. {
  1307. struct nbd_device *nbd = s->private;
  1308. if (nbd->task_recv)
  1309. seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
  1310. return 0;
  1311. }
  1312. static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
  1313. {
  1314. return single_open(file, nbd_dbg_tasks_show, inode->i_private);
  1315. }
  1316. static const struct file_operations nbd_dbg_tasks_ops = {
  1317. .open = nbd_dbg_tasks_open,
  1318. .read = seq_read,
  1319. .llseek = seq_lseek,
  1320. .release = single_release,
  1321. };
  1322. static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
  1323. {
  1324. struct nbd_device *nbd = s->private;
  1325. u32 flags = nbd->config->flags;
  1326. seq_printf(s, "Hex: 0x%08x\n\n", flags);
  1327. seq_puts(s, "Known flags:\n");
  1328. if (flags & NBD_FLAG_HAS_FLAGS)
  1329. seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
  1330. if (flags & NBD_FLAG_READ_ONLY)
  1331. seq_puts(s, "NBD_FLAG_READ_ONLY\n");
  1332. if (flags & NBD_FLAG_SEND_FLUSH)
  1333. seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
  1334. if (flags & NBD_FLAG_SEND_FUA)
  1335. seq_puts(s, "NBD_FLAG_SEND_FUA\n");
  1336. if (flags & NBD_FLAG_SEND_TRIM)
  1337. seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
  1338. return 0;
  1339. }
  1340. static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
  1341. {
  1342. return single_open(file, nbd_dbg_flags_show, inode->i_private);
  1343. }
  1344. static const struct file_operations nbd_dbg_flags_ops = {
  1345. .open = nbd_dbg_flags_open,
  1346. .read = seq_read,
  1347. .llseek = seq_lseek,
  1348. .release = single_release,
  1349. };
  1350. static int nbd_dev_dbg_init(struct nbd_device *nbd)
  1351. {
  1352. struct dentry *dir;
  1353. struct nbd_config *config = nbd->config;
  1354. if (!nbd_dbg_dir)
  1355. return -EIO;
  1356. dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
  1357. if (!dir) {
  1358. dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
  1359. nbd_name(nbd));
  1360. return -EIO;
  1361. }
  1362. config->dbg_dir = dir;
  1363. debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
  1364. debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
  1365. debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
  1366. debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
  1367. debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
  1368. return 0;
  1369. }
  1370. static void nbd_dev_dbg_close(struct nbd_device *nbd)
  1371. {
  1372. debugfs_remove_recursive(nbd->config->dbg_dir);
  1373. }
  1374. static int nbd_dbg_init(void)
  1375. {
  1376. struct dentry *dbg_dir;
  1377. dbg_dir = debugfs_create_dir("nbd", NULL);
  1378. if (!dbg_dir)
  1379. return -EIO;
  1380. nbd_dbg_dir = dbg_dir;
  1381. return 0;
  1382. }
  1383. static void nbd_dbg_close(void)
  1384. {
  1385. debugfs_remove_recursive(nbd_dbg_dir);
  1386. }
  1387. #else /* IS_ENABLED(CONFIG_DEBUG_FS) */
  1388. static int nbd_dev_dbg_init(struct nbd_device *nbd)
  1389. {
  1390. return 0;
  1391. }
  1392. static void nbd_dev_dbg_close(struct nbd_device *nbd)
  1393. {
  1394. }
  1395. static int nbd_dbg_init(void)
  1396. {
  1397. return 0;
  1398. }
  1399. static void nbd_dbg_close(void)
  1400. {
  1401. }
  1402. #endif
  1403. static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
  1404. unsigned int hctx_idx, unsigned int numa_node)
  1405. {
  1406. struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
  1407. cmd->nbd = set->driver_data;
  1408. cmd->flags = 0;
  1409. mutex_init(&cmd->lock);
  1410. return 0;
  1411. }
  1412. static const struct blk_mq_ops nbd_mq_ops = {
  1413. .queue_rq = nbd_queue_rq,
  1414. .complete = nbd_complete_rq,
  1415. .init_request = nbd_init_request,
  1416. .timeout = nbd_xmit_timeout,
  1417. };
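/*
 * Allocate and register a new nbd device: gendisk, blk-mq tag set and
 * queue, plus an entry in the index idr. A negative index requests the
 * first free slot.
 */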
static int nbd_dev_add(int index)
{
	struct nbd_device *nbd;
	struct gendisk *disk;
	struct request_queue *q;
	int err = -ENOMEM;

	nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
	if (!nbd)
		goto out;

	disk = alloc_disk(1 << part_shift);
	if (!disk)
		goto out_free_nbd;

	if (index >= 0) {
		err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
				GFP_KERNEL);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
		if (err >= 0)
			index = err;
	}
	if (err < 0)
		goto out_free_disk;

	nbd->index = index;
	nbd->disk = disk;
	nbd->tag_set.ops = &nbd_mq_ops;
	nbd->tag_set.nr_hw_queues = 1;
	nbd->tag_set.queue_depth = 128;
	nbd->tag_set.numa_node = NUMA_NO_NODE;
	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
		BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
	nbd->tag_set.driver_data = nbd;

	err = blk_mq_alloc_tag_set(&nbd->tag_set);
	if (err)
		goto out_free_idr;

	q = blk_mq_init_queue(&nbd->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_free_tags;
	}
	disk->queue = q;

	/*
	 * Tell the block layer that we are not a rotational device
	 */
	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
	disk->queue->limits.discard_granularity = 0;
	disk->queue->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(disk->queue, 0);
	blk_queue_max_segment_size(disk->queue, UINT_MAX);
	blk_queue_max_segments(disk->queue, USHRT_MAX);
	blk_queue_max_hw_sectors(disk->queue, 65536);
	disk->queue->limits.max_sectors = 256;

	mutex_init(&nbd->config_lock);
	refcount_set(&nbd->config_refs, 0);
	refcount_set(&nbd->refs, 1);
	INIT_LIST_HEAD(&nbd->list);
	disk->major = NBD_MAJOR;
	disk->first_minor = index << part_shift;
	disk->fops = &nbd_fops;
	disk->private_data = nbd;
	sprintf(disk->disk_name, "nbd%d", index);
	add_disk(disk);
	nbd_total_devices++;
	return index;

out_free_tags:
	blk_mq_free_tag_set(&nbd->tag_set);
out_free_idr:
	idr_remove(&nbd_index_idr, index);
out_free_disk:
	put_disk(disk);
out_free_nbd:
	kfree(nbd);
out:
	return err;
}
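
/* idr_for_each() callback: pick the first device with no active config. */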
static int find_free_cb(int id, void *ptr, void *data)
{
	struct nbd_device *nbd = ptr;
	struct nbd_device **found = data;

	if (!refcount_read(&nbd->config_refs)) {
		*found = nbd;
		return 1;
	}
	return 0;
}

/* Netlink interface. */
static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
	[NBD_ATTR_INDEX] = { .type = NLA_U32 },
	[NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 },
	[NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 },
	[NBD_ATTR_TIMEOUT] = { .type = NLA_U64 },
	[NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 },
	[NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 },
	[NBD_ATTR_SOCKETS] = { .type = NLA_NESTED },
	[NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
	[NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED },
};

static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
	[NBD_SOCK_FD] = { .type = NLA_U32 },
};

/* We don't use this right now since we don't parse the incoming list, but we
 * still want it here so userspace knows what to expect.
 */
static const struct nla_policy __attribute__((unused))
nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
	[NBD_DEVICE_INDEX] = { .type = NLA_U32 },
	[NBD_DEVICE_CONNECTED] = { .type = NLA_U8 },
};
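
/*
 * NBD_CMD_CONNECT handler: look up (or create) the requested device,
 * allocate its config, apply the size/timeout/flag attributes, attach the
 * supplied sockets and start the device.
 */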
static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
{
	struct nbd_device *nbd = NULL;
	struct nbd_config *config;
	int index = -1;
	int ret;
	bool put_dev = false;

	if (!netlink_capable(skb, CAP_SYS_ADMIN))
		return -EPERM;

	if (info->attrs[NBD_ATTR_INDEX])
		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
	if (!info->attrs[NBD_ATTR_SOCKETS]) {
		printk(KERN_ERR "nbd: must specify at least one socket\n");
		return -EINVAL;
	}
	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
		printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
		return -EINVAL;
	}
again:
	mutex_lock(&nbd_index_mutex);
	if (index == -1) {
		ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
		if (ret == 0) {
			int new_index;

			new_index = nbd_dev_add(-1);
			if (new_index < 0) {
				mutex_unlock(&nbd_index_mutex);
				printk(KERN_ERR "nbd: failed to add new device\n");
				return new_index;
			}
			nbd = idr_find(&nbd_index_idr, new_index);
		}
	} else {
		nbd = idr_find(&nbd_index_idr, index);
		if (!nbd) {
			ret = nbd_dev_add(index);
			if (ret < 0) {
				mutex_unlock(&nbd_index_mutex);
				printk(KERN_ERR "nbd: failed to add new device\n");
				return ret;
			}
			nbd = idr_find(&nbd_index_idr, index);
		}
	}
	if (!nbd) {
		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
		       index);
		mutex_unlock(&nbd_index_mutex);
		return -EINVAL;
	}
	if (!refcount_inc_not_zero(&nbd->refs)) {
		mutex_unlock(&nbd_index_mutex);
		if (index == -1)
			goto again;
		printk(KERN_ERR "nbd: device at index %d is going down\n",
		       index);
		return -EINVAL;
	}
	mutex_unlock(&nbd_index_mutex);

	mutex_lock(&nbd->config_lock);
	if (refcount_read(&nbd->config_refs)) {
		mutex_unlock(&nbd->config_lock);
		nbd_put(nbd);
		if (index == -1)
			goto again;
		printk(KERN_ERR "nbd: nbd%d already in use\n", index);
		return -EBUSY;
	}
	if (WARN_ON(nbd->config)) {
		mutex_unlock(&nbd->config_lock);
		nbd_put(nbd);
		return -EINVAL;
	}
	config = nbd->config = nbd_alloc_config();
	if (!nbd->config) {
		mutex_unlock(&nbd->config_lock);
		nbd_put(nbd);
		printk(KERN_ERR "nbd: couldn't allocate config\n");
		return -ENOMEM;
	}
	refcount_set(&nbd->config_refs, 1);
	set_bit(NBD_BOUND, &config->runtime_flags);

	if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
		u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);

		nbd_size_set(nbd, config->blksize,
			     div64_u64(bytes, config->blksize));
	}
	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
		u64 bsize =
			nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);

		if (!bsize)
			bsize = NBD_DEF_BLKSIZE;
		if (!nbd_is_valid_blksize(bsize)) {
			ret = -EINVAL;
			goto out;
		}
		nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
	}
	if (info->attrs[NBD_ATTR_TIMEOUT]) {
		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);

		nbd->tag_set.timeout = timeout * HZ;
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	}
	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
		config->dead_conn_timeout =
			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
		config->dead_conn_timeout *= HZ;
	}
	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
		config->flags =
			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);

		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
			set_bit(NBD_DESTROY_ON_DISCONNECT,
				&config->runtime_flags);
			put_dev = true;
		}
		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
			set_bit(NBD_DISCONNECT_ON_CLOSE,
				&config->runtime_flags);
		}
	}
	if (info->attrs[NBD_ATTR_SOCKETS]) {
		struct nlattr *attr;
		int rem, fd;

		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
				    rem) {
			struct nlattr *socks[NBD_SOCK_MAX+1];

			if (nla_type(attr) != NBD_SOCK_ITEM) {
				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
				ret = -EINVAL;
				goto out;
			}
			ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
					       nbd_sock_policy, info->extack);
			if (ret != 0) {
				printk(KERN_ERR "nbd: error processing sock list\n");
				ret = -EINVAL;
				goto out;
			}
			if (!socks[NBD_SOCK_FD])
				continue;
			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
			ret = nbd_add_socket(nbd, fd, true);
			if (ret)
				goto out;
		}
	}
	ret = nbd_start_device(nbd);
out:
	mutex_unlock(&nbd->config_lock);
	if (!ret) {
		set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
		refcount_inc(&nbd->config_refs);
		nbd_connect_reply(info, nbd->index);
	}
	nbd_config_put(nbd);
	if (put_dev)
		nbd_put(nbd);
	return ret;
}
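
/*
 * Disconnect the device, clear its sockets, wait for the receive workqueue
 * to drain and drop the config reference taken at connect time
 * (NBD_HAS_CONFIG_REF), if any.
 */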
static void nbd_disconnect_and_put(struct nbd_device *nbd)
{
	mutex_lock(&nbd->config_lock);
	nbd_disconnect(nbd);
	nbd_clear_sock(nbd);
	mutex_unlock(&nbd->config_lock);
	/*
	 * Make sure recv thread has finished, so it does not drop the last
	 * config ref and try to destroy the workqueue from inside the work
	 * queue.
	 */
	flush_workqueue(nbd->recv_workq);
	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
			       &nbd->config->runtime_flags))
		nbd_config_put(nbd);
}
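
/* NBD_CMD_DISCONNECT handler: find the device by index and tear it down. */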
static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
{
	struct nbd_device *nbd;
	int index;

	if (!netlink_capable(skb, CAP_SYS_ADMIN))
		return -EPERM;

	if (!info->attrs[NBD_ATTR_INDEX]) {
		printk(KERN_ERR "nbd: must specify an index to disconnect\n");
		return -EINVAL;
	}
	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
	mutex_lock(&nbd_index_mutex);
	nbd = idr_find(&nbd_index_idr, index);
	if (!nbd) {
		mutex_unlock(&nbd_index_mutex);
		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
		       index);
		return -EINVAL;
	}
	if (!refcount_inc_not_zero(&nbd->refs)) {
		mutex_unlock(&nbd_index_mutex);
		printk(KERN_ERR "nbd: device at index %d is going down\n",
		       index);
		return -EINVAL;
	}
	mutex_unlock(&nbd_index_mutex);
	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		nbd_put(nbd);
		return 0;
	}
	nbd_disconnect_and_put(nbd);
	nbd_config_put(nbd);
	nbd_put(nbd);
	return 0;
}
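
/*
 * NBD_CMD_RECONFIGURE handler: update timeouts and client flags on a live
 * device and feed any new sockets to nbd_reconnect_socket().
 */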
static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
{
	struct nbd_device *nbd = NULL;
	struct nbd_config *config;
	int index;
	int ret = 0;
	bool put_dev = false;

	if (!netlink_capable(skb, CAP_SYS_ADMIN))
		return -EPERM;

	if (!info->attrs[NBD_ATTR_INDEX]) {
		printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
		return -EINVAL;
	}
	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
	mutex_lock(&nbd_index_mutex);
	nbd = idr_find(&nbd_index_idr, index);
	if (!nbd) {
		mutex_unlock(&nbd_index_mutex);
		printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
		       index);
		return -EINVAL;
	}
	if (!refcount_inc_not_zero(&nbd->refs)) {
		mutex_unlock(&nbd_index_mutex);
		printk(KERN_ERR "nbd: device at index %d is going down\n",
		       index);
		return -EINVAL;
	}
	mutex_unlock(&nbd_index_mutex);

	if (!refcount_inc_not_zero(&nbd->config_refs)) {
		dev_err(nbd_to_dev(nbd),
			"not configured, cannot reconfigure\n");
		nbd_put(nbd);
		return -EINVAL;
	}

	mutex_lock(&nbd->config_lock);
	config = nbd->config;
	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
	    !nbd->task_recv) {
		dev_err(nbd_to_dev(nbd),
			"not configured, cannot reconfigure\n");
		ret = -EINVAL;
		goto out;
	}

	if (info->attrs[NBD_ATTR_TIMEOUT]) {
		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);

		nbd->tag_set.timeout = timeout * HZ;
		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
	}
	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
		config->dead_conn_timeout =
			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
		config->dead_conn_timeout *= HZ;
	}
	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);

		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
					      &config->runtime_flags))
				put_dev = true;
		} else {
			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
					       &config->runtime_flags))
				refcount_inc(&nbd->refs);
		}

		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
			set_bit(NBD_DISCONNECT_ON_CLOSE,
				&config->runtime_flags);
		} else {
			clear_bit(NBD_DISCONNECT_ON_CLOSE,
				  &config->runtime_flags);
		}
	}

	if (info->attrs[NBD_ATTR_SOCKETS]) {
		struct nlattr *attr;
		int rem, fd;

		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
				    rem) {
			struct nlattr *socks[NBD_SOCK_MAX+1];

			if (nla_type(attr) != NBD_SOCK_ITEM) {
				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
				ret = -EINVAL;
				goto out;
			}
			ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
					       nbd_sock_policy, info->extack);
			if (ret != 0) {
				printk(KERN_ERR "nbd: error processing sock list\n");
				ret = -EINVAL;
				goto out;
			}
			if (!socks[NBD_SOCK_FD])
				continue;
			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
			ret = nbd_reconnect_socket(nbd, fd);
			if (ret) {
				if (ret == -ENOSPC)
					ret = 0;
				goto out;
			}
			dev_info(nbd_to_dev(nbd), "reconnected socket\n");
		}
	}
out:
	mutex_unlock(&nbd->config_lock);
	nbd_config_put(nbd);
	nbd_put(nbd);
	if (put_dev)
		nbd_put(nbd);
	return ret;
}

static const struct genl_ops nbd_connect_genl_ops[] = {
	{
		.cmd = NBD_CMD_CONNECT,
		.policy = nbd_attr_policy,
		.doit = nbd_genl_connect,
	},
	{
		.cmd = NBD_CMD_DISCONNECT,
		.policy = nbd_attr_policy,
		.doit = nbd_genl_disconnect,
	},
	{
		.cmd = NBD_CMD_RECONFIGURE,
		.policy = nbd_attr_policy,
		.doit = nbd_genl_reconfigure,
	},
	{
		.cmd = NBD_CMD_STATUS,
		.policy = nbd_attr_policy,
		.doit = nbd_genl_status,
	},
};

static const struct genl_multicast_group nbd_mcast_grps[] = {
	{ .name = NBD_GENL_MCAST_GROUP_NAME, },
};

static struct genl_family nbd_genl_family __ro_after_init = {
	.hdrsize = 0,
	.name = NBD_GENL_FAMILY_NAME,
	.version = NBD_GENL_VERSION,
	.module = THIS_MODULE,
	.ops = nbd_connect_genl_ops,
	.n_ops = ARRAY_SIZE(nbd_connect_genl_ops),
	.maxattr = NBD_ATTR_MAX,
	.mcgrps = nbd_mcast_grps,
	.n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
};
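
/* Append one NBD_DEVICE_ITEM (index + connected flag) to a status reply. */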
static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
{
	struct nlattr *dev_opt;
	u8 connected = 0;
	int ret;

	/* This is a little racy, but for status it's ok.  The
	 * reason we don't take a ref here is because we can't
	 * take a ref in the index == -1 case as we would need
	 * to put under the nbd_index_mutex, which could
	 * deadlock if we are configured to remove ourselves
	 * once we're disconnected.
	 */
	if (refcount_read(&nbd->config_refs))
		connected = 1;
	dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM);
	if (!dev_opt)
		return -EMSGSIZE;
	ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
	if (ret)
		return -EMSGSIZE;
	ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
			 connected);
	if (ret)
		return -EMSGSIZE;
	nla_nest_end(reply, dev_opt);
	return 0;
}

static int status_cb(int id, void *ptr, void *data)
{
	struct nbd_device *nbd = ptr;

	return populate_nbd_status(nbd, (struct sk_buff *)data);
}
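
/*
 * NBD_CMD_STATUS handler: reply with the status of a single device, or of
 * every device when no index is given.
 */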
static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr *dev_list;
	struct sk_buff *reply;
	void *reply_head;
	size_t msg_size;
	int index = -1;
	int ret = -ENOMEM;

	if (info->attrs[NBD_ATTR_INDEX])
		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);

	mutex_lock(&nbd_index_mutex);

	msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
				  nla_attr_size(sizeof(u8)));
	msg_size *= (index == -1) ? nbd_total_devices : 1;

	reply = genlmsg_new(msg_size, GFP_KERNEL);
	if (!reply)
		goto out;
	reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
				       NBD_CMD_STATUS);
	if (!reply_head) {
		nlmsg_free(reply);
		goto out;
	}

	dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
	if (index == -1) {
		ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
		if (ret) {
			nlmsg_free(reply);
			goto out;
		}
	} else {
		struct nbd_device *nbd;

		nbd = idr_find(&nbd_index_idr, index);
		if (nbd) {
			ret = populate_nbd_status(nbd, reply);
			if (ret) {
				nlmsg_free(reply);
				goto out;
			}
		}
	}
	nla_nest_end(reply, dev_list);
	genlmsg_end(reply, reply_head);
	genlmsg_reply(reply, info);
	ret = 0;
out:
	mutex_unlock(&nbd_index_mutex);
	return ret;
}
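
/* Tell the NBD_CMD_CONNECT caller which device index it was given. */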
static void nbd_connect_reply(struct genl_info *info, int index)
{
	struct sk_buff *skb;
	void *msg_head;
	int ret;

	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
	if (!skb)
		return;
	msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
				     NBD_CMD_CONNECT);
	if (!msg_head) {
		nlmsg_free(skb);
		return;
	}
	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
	if (ret) {
		nlmsg_free(skb);
		return;
	}
	genlmsg_end(skb, msg_head);
	genlmsg_reply(skb, info);
}
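
/* Broadcast NBD_CMD_LINK_DEAD for the given index to the nbd multicast group. */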
static void nbd_mcast_index(int index)
{
	struct sk_buff *skb;
	void *msg_head;
	int ret;

	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
	if (!skb)
		return;
	msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
			       NBD_CMD_LINK_DEAD);
	if (!msg_head) {
		nlmsg_free(skb);
		return;
	}
	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
	if (ret) {
		nlmsg_free(skb);
		return;
	}
	genlmsg_end(skb, msg_head);
	genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
}
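
/* Deferred work that emits the link-dead notification for a device. */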
static void nbd_dead_link_work(struct work_struct *work)
{
	struct link_dead_args *args = container_of(work, struct link_dead_args,
						   work);
	nbd_mcast_index(args->index);
	kfree(args);
}
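
/*
 * Module init: validate max_part/nbds_max, register the block major and the
 * netlink family, then pre-create nbds_max devices.
 */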
static int __init nbd_init(void)
{
	int i;

	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	if (register_blkdev(NBD_MAJOR, "nbd"))
		return -EIO;

	if (genl_register_family(&nbd_genl_family)) {
		unregister_blkdev(NBD_MAJOR, "nbd");
		return -EINVAL;
	}
	nbd_dbg_init();

	mutex_lock(&nbd_index_mutex);
	for (i = 0; i < nbds_max; i++)
		nbd_dev_add(i);
	mutex_unlock(&nbd_index_mutex);
	return 0;
}
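
/* idr_for_each() callback: collect every device onto the removal list. */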
static int nbd_exit_cb(int id, void *ptr, void *data)
{
	struct list_head *list = (struct list_head *)data;
	struct nbd_device *nbd = ptr;

	list_add_tail(&nbd->list, list);
	return 0;
}
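
/*
 * Module exit: gather all devices from the IDR, drop the initial reference
 * on each, then unregister the netlink family and the block major.
 */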
static void __exit nbd_cleanup(void)
{
	struct nbd_device *nbd;
	LIST_HEAD(del_list);

	nbd_dbg_close();

	mutex_lock(&nbd_index_mutex);
	idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
	mutex_unlock(&nbd_index_mutex);

	while (!list_empty(&del_list)) {
		nbd = list_first_entry(&del_list, struct nbd_device, list);
		list_del_init(&nbd->list);
		if (refcount_read(&nbd->refs) != 1)
			printk(KERN_ERR "nbd: possibly leaking a device\n");
		nbd_put(nbd);
	}

	idr_destroy(&nbd_index_idr);
	genl_unregister_family(&nbd_genl_family);
	unregister_blkdev(NBD_MAJOR, "nbd");
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");