ublk_drv.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * Userspace block device - a block device whose I/O is handled in userspace
  4. *
  5. * Makes full use of io_uring passthrough commands to communicate with the
  6. * ublk userspace daemon (ublksrvd) for handling basic I/O requests.
  7. *
  8. * Copyright 2022 Ming Lei <ming.lei@redhat.com>
  9. *
  10. * (part of code stolen from loop.c)
  11. */
  12. #include <linux/module.h>
  13. #include <linux/moduleparam.h>
  14. #include <linux/sched.h>
  15. #include <linux/fs.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/file.h>
  18. #include <linux/stat.h>
  19. #include <linux/errno.h>
  20. #include <linux/major.h>
  21. #include <linux/wait.h>
  22. #include <linux/blkdev.h>
  23. #include <linux/init.h>
  24. #include <linux/swap.h>
  25. #include <linux/slab.h>
  26. #include <linux/compat.h>
  27. #include <linux/mutex.h>
  28. #include <linux/writeback.h>
  29. #include <linux/completion.h>
  30. #include <linux/highmem.h>
  31. #include <linux/sysfs.h>
  32. #include <linux/miscdevice.h>
  33. #include <linux/falloc.h>
  34. #include <linux/uio.h>
  35. #include <linux/ioprio.h>
  36. #include <linux/sched/mm.h>
  37. #include <linux/uaccess.h>
  38. #include <linux/cdev.h>
  39. #include <linux/io_uring/cmd.h>
  40. #include <linux/blk-mq.h>
  41. #include <linux/delay.h>
  42. #include <linux/mm.h>
  43. #include <asm/page.h>
  44. #include <linux/task_work.h>
  45. #include <linux/namei.h>
  46. #include <linux/kref.h>
  47. #include <uapi/linux/ublk_cmd.h>
  48. #define UBLK_MINORS (1U << MINORBITS)
  49. /* private ioctl command mirror */
  50. #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
  51. /* All UBLK_F_* have to be included into UBLK_F_ALL */
  52. #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
  53. | UBLK_F_URING_CMD_COMP_IN_TASK \
  54. | UBLK_F_NEED_GET_DATA \
  55. | UBLK_F_USER_RECOVERY \
  56. | UBLK_F_USER_RECOVERY_REISSUE \
  57. | UBLK_F_UNPRIVILEGED_DEV \
  58. | UBLK_F_CMD_IOCTL_ENCODE \
  59. | UBLK_F_USER_COPY \
  60. | UBLK_F_ZONED)
  61. /* All UBLK_PARAM_TYPE_* should be included here */
  62. #define UBLK_PARAM_TYPE_ALL \
  63. (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
  64. UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
  65. struct ublk_rq_data {
  66. struct llist_node node;
  67. struct kref ref;
  68. };
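/*
 * Driver-private data stored in the io_uring command's pdu area; it records
 * which queue and tag a command belongs to, so the task-work and cancel
 * paths can find them again via ublk_get_uring_cmd_pdu().
 */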
  69. struct ublk_uring_cmd_pdu {
  70. struct ublk_queue *ubq;
  71. u16 tag;
  72. };
  73. /*
  74. * io command is active: the sqe cmd has been received and its cqe isn't done yet
  75. *
  76. * If the flag is set, the io command is owned by the ublk driver and is
  77. * waiting for an incoming blk-mq request from the ublk block device.
  78. *
  79. * If the flag is cleared, the io command is about to be completed and is
  80. * owned by the ublk server.
  81. */
  82. #define UBLK_IO_FLAG_ACTIVE 0x01
  83. /*
  84. * IO command is completed via cqe, and it is being handled by ublksrv, and
  85. * not committed yet
  86. *
  87. * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used for
  88. * cross verification
  89. */
  90. #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
  91. /*
  92. * IO command is aborted, so this flag is set in case of
  93. * !UBLK_IO_FLAG_ACTIVE.
  94. *
  95. * After this flag is observed, any pending or new incoming request
  96. * associated with this io command will be failed immediately
  97. */
  98. #define UBLK_IO_FLAG_ABORTED 0x04
  99. /*
  100. * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command requires
  101. * fetching the data buffer address from ublksrv.
  102. *
  103. * Then, bio data could be copied into this data buffer for a WRITE request
  104. * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
  105. */
  106. #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
  107. /* atomic RW with ubq->cancel_lock */
  108. #define UBLK_IO_FLAG_CANCELED 0x80000000
  109. struct ublk_io {
  110. /* userspace buffer address from io cmd */
  111. __u64 addr;
  112. unsigned int flags;
  113. int res;
  114. struct io_uring_cmd *cmd;
  115. };
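/*
 * Per hardware-queue state: the daemon task serving the queue, the io
 * descriptor buffer mmap'ed to userspace, and one ublk_io slot per tag.
 */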
  116. struct ublk_queue {
  117. int q_id;
  118. int q_depth;
  119. unsigned long flags;
  120. struct task_struct *ubq_daemon;
  121. char *io_cmd_buf;
  122. struct llist_head io_cmds;
  123. unsigned long io_addr; /* mapped vm address */
  124. unsigned int max_io_sz;
  125. bool force_abort;
  126. bool timeout;
  127. bool canceling;
  128. unsigned short nr_io_ready; /* how many ios setup */
  129. spinlock_t cancel_lock;
  130. struct ublk_device *dev;
  131. struct ublk_io ios[];
  132. };
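/*
 * One instance per ublk device: owns the gendisk, the char device used by
 * the daemon, the per-queue array and the control-path state/parameters.
 */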
  133. struct ublk_device {
  134. struct gendisk *ub_disk;
  135. char *__queues;
  136. unsigned int queue_size;
  137. struct ublksrv_ctrl_dev_info dev_info;
  138. struct blk_mq_tag_set tag_set;
  139. struct cdev cdev;
  140. struct device cdev_dev;
  141. #define UB_STATE_OPEN 0
  142. #define UB_STATE_USED 1
  143. #define UB_STATE_DELETED 2
  144. unsigned long state;
  145. int ub_number;
  146. struct mutex mutex;
  147. spinlock_t lock;
  148. struct mm_struct *mm;
  149. struct ublk_params params;
  150. struct completion completion;
  151. unsigned int nr_queues_ready;
  152. unsigned int nr_privileged_daemon;
  153. struct work_struct quiesce_work;
  154. struct work_struct stop_work;
  155. };
  156. /* header of ublk_params */
  157. struct ublk_params_header {
  158. __u32 len;
  159. __u32 types;
  160. };
  161. static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
  162. static inline unsigned int ublk_req_build_flags(struct request *req);
  163. static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
  164. int tag);
  165. static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
  166. {
  167. return ub->dev_info.flags & UBLK_F_USER_COPY;
  168. }
  169. static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
  170. {
  171. return ub->dev_info.flags & UBLK_F_ZONED;
  172. }
  173. static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
  174. {
  175. return ubq->flags & UBLK_F_ZONED;
  176. }
  177. #ifdef CONFIG_BLK_DEV_ZONED
  178. struct ublk_zoned_report_desc {
  179. __u64 sector;
  180. __u32 operation;
  181. __u32 nr_zones;
  182. };
  183. static DEFINE_XARRAY(ublk_zoned_report_descs);
  184. static int ublk_zoned_insert_report_desc(const struct request *req,
  185. struct ublk_zoned_report_desc *desc)
  186. {
  187. return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
  188. desc, GFP_KERNEL);
  189. }
  190. static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
  191. const struct request *req)
  192. {
  193. return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
  194. }
  195. static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
  196. const struct request *req)
  197. {
  198. return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
  199. }
  200. static int ublk_get_nr_zones(const struct ublk_device *ub)
  201. {
  202. const struct ublk_param_basic *p = &ub->params.basic;
  203. /* Zone size is a power of 2 */
  204. return p->dev_sectors >> ilog2(p->chunk_sectors);
  205. }
  206. static int ublk_revalidate_disk_zones(struct ublk_device *ub)
  207. {
  208. return blk_revalidate_disk_zones(ub->ub_disk);
  209. }
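/* sanity-check the zoned parameters against the device geometry before applying them */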
  210. static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
  211. {
  212. const struct ublk_param_zoned *p = &ub->params.zoned;
  213. int nr_zones;
  214. if (!ublk_dev_is_zoned(ub))
  215. return -EINVAL;
  216. if (!p->max_zone_append_sectors)
  217. return -EINVAL;
  218. nr_zones = ublk_get_nr_zones(ub);
  219. if (p->max_active_zones > nr_zones)
  220. return -EINVAL;
  221. if (p->max_open_zones > nr_zones)
  222. return -EINVAL;
  223. return 0;
  224. }
  225. static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
  226. {
  227. ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
  228. }
  229. /* Based on virtblk_alloc_report_buffer */
  230. static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
  231. unsigned int nr_zones, size_t *buflen)
  232. {
  233. struct request_queue *q = ublk->ub_disk->queue;
  234. size_t bufsize;
  235. void *buf;
  236. nr_zones = min_t(unsigned int, nr_zones,
  237. ublk->ub_disk->nr_zones);
  238. bufsize = nr_zones * sizeof(struct blk_zone);
  239. bufsize =
  240. min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
  241. while (bufsize >= sizeof(struct blk_zone)) {
  242. buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
  243. if (buf) {
  244. *buflen = bufsize;
  245. return buf;
  246. }
  247. bufsize >>= 1;
  248. }
  249. *buflen = 0;
  250. return NULL;
  251. }
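/*
 * report_zones method: issue UBLK_IO_OP_REPORT_ZONES passthrough requests to
 * the daemon in buffer-sized batches and feed each returned zone to the
 * callback.
 */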
  252. static int ublk_report_zones(struct gendisk *disk, sector_t sector,
  253. unsigned int nr_zones, report_zones_cb cb, void *data)
  254. {
  255. struct ublk_device *ub = disk->private_data;
  256. unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
  257. unsigned int first_zone = sector >> ilog2(zone_size_sectors);
  258. unsigned int done_zones = 0;
  259. unsigned int max_zones_per_request;
  260. int ret;
  261. struct blk_zone *buffer;
  262. size_t buffer_length;
  263. nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
  264. nr_zones);
  265. buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
  266. if (!buffer)
  267. return -ENOMEM;
  268. max_zones_per_request = buffer_length / sizeof(struct blk_zone);
  269. while (done_zones < nr_zones) {
  270. unsigned int remaining_zones = nr_zones - done_zones;
  271. unsigned int zones_in_request =
  272. min_t(unsigned int, remaining_zones, max_zones_per_request);
  273. struct request *req;
  274. struct ublk_zoned_report_desc desc;
  275. blk_status_t status;
  276. memset(buffer, 0, buffer_length);
  277. req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
  278. if (IS_ERR(req)) {
  279. ret = PTR_ERR(req);
  280. goto out;
  281. }
  282. desc.operation = UBLK_IO_OP_REPORT_ZONES;
  283. desc.sector = sector;
  284. desc.nr_zones = zones_in_request;
  285. ret = ublk_zoned_insert_report_desc(req, &desc);
  286. if (ret)
  287. goto free_req;
  288. ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
  289. GFP_KERNEL);
  290. if (ret)
  291. goto erase_desc;
  292. status = blk_execute_rq(req, 0);
  293. ret = blk_status_to_errno(status);
  294. erase_desc:
  295. ublk_zoned_erase_report_desc(req);
  296. free_req:
  297. blk_mq_free_request(req);
  298. if (ret)
  299. goto out;
  300. for (unsigned int i = 0; i < zones_in_request; i++) {
  301. struct blk_zone *zone = buffer + i;
  302. /* A zero length zone means no more zones in this response */
  303. if (!zone->len)
  304. break;
  305. ret = cb(zone, i, data);
  306. if (ret)
  307. goto out;
  308. done_zones++;
  309. sector += zone_size_sectors;
  310. }
  311. }
  312. ret = done_zones;
  313. out:
  314. kvfree(buffer);
  315. return ret;
  316. }
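/* translate zone management, zone append and REPORT_ZONES requests into io descriptors */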
  317. static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
  318. struct request *req)
  319. {
  320. struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
  321. struct ublk_io *io = &ubq->ios[req->tag];
  322. struct ublk_zoned_report_desc *desc;
  323. u32 ublk_op;
  324. switch (req_op(req)) {
  325. case REQ_OP_ZONE_OPEN:
  326. ublk_op = UBLK_IO_OP_ZONE_OPEN;
  327. break;
  328. case REQ_OP_ZONE_CLOSE:
  329. ublk_op = UBLK_IO_OP_ZONE_CLOSE;
  330. break;
  331. case REQ_OP_ZONE_FINISH:
  332. ublk_op = UBLK_IO_OP_ZONE_FINISH;
  333. break;
  334. case REQ_OP_ZONE_RESET:
  335. ublk_op = UBLK_IO_OP_ZONE_RESET;
  336. break;
  337. case REQ_OP_ZONE_APPEND:
  338. ublk_op = UBLK_IO_OP_ZONE_APPEND;
  339. break;
  340. case REQ_OP_ZONE_RESET_ALL:
  341. ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
  342. break;
  343. case REQ_OP_DRV_IN:
  344. desc = ublk_zoned_get_report_desc(req);
  345. if (!desc)
  346. return BLK_STS_IOERR;
  347. ublk_op = desc->operation;
  348. switch (ublk_op) {
  349. case UBLK_IO_OP_REPORT_ZONES:
  350. iod->op_flags = ublk_op | ublk_req_build_flags(req);
  351. iod->nr_zones = desc->nr_zones;
  352. iod->start_sector = desc->sector;
  353. return BLK_STS_OK;
  354. default:
  355. return BLK_STS_IOERR;
  356. }
  357. case REQ_OP_DRV_OUT:
  358. /* We do not support drv_out */
  359. return BLK_STS_NOTSUPP;
  360. default:
  361. return BLK_STS_IOERR;
  362. }
  363. iod->op_flags = ublk_op | ublk_req_build_flags(req);
  364. iod->nr_sectors = blk_rq_sectors(req);
  365. iod->start_sector = blk_rq_pos(req);
  366. iod->addr = io->addr;
  367. return BLK_STS_OK;
  368. }
  369. #else
  370. #define ublk_report_zones (NULL)
  371. static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
  372. {
  373. return -EOPNOTSUPP;
  374. }
  375. static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
  376. {
  377. }
  378. static int ublk_revalidate_disk_zones(struct ublk_device *ub)
  379. {
  380. return 0;
  381. }
  382. static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
  383. struct request *req)
  384. {
  385. return BLK_STS_NOTSUPP;
  386. }
  387. #endif
  388. static inline void __ublk_complete_rq(struct request *req);
  389. static void ublk_complete_rq(struct kref *ref);
  390. static dev_t ublk_chr_devt;
  391. static const struct class ublk_chr_class = {
  392. .name = "ublk-char",
  393. };
  394. static DEFINE_IDR(ublk_index_idr);
  395. static DEFINE_SPINLOCK(ublk_idr_lock);
  396. static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
  397. static DEFINE_MUTEX(ublk_ctl_mutex);
  398. /*
  399. * Max number of ublk devices allowed to be added
  400. *
  401. * This could be extended to a per-user limit in the future, or even be
  402. * controlled by cgroup.
  403. */
  404. #define UBLK_MAX_UBLKS UBLK_MINORS
  405. static unsigned int ublks_max = 64;
  406. static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
  407. static struct miscdevice ublk_misc;
  408. static inline unsigned ublk_pos_to_hwq(loff_t pos)
  409. {
  410. return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
  411. UBLK_QID_BITS_MASK;
  412. }
  413. static inline unsigned ublk_pos_to_buf_off(loff_t pos)
  414. {
  415. return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
  416. }
  417. static inline unsigned ublk_pos_to_tag(loff_t pos)
  418. {
  419. return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
  420. UBLK_TAG_BITS_MASK;
  421. }
  422. static void ublk_dev_param_basic_apply(struct ublk_device *ub)
  423. {
  424. const struct ublk_param_basic *p = &ub->params.basic;
  425. if (p->attrs & UBLK_ATTR_READ_ONLY)
  426. set_disk_ro(ub->ub_disk, true);
  427. set_capacity(ub->ub_disk, p->dev_sectors);
  428. }
  429. static int ublk_validate_params(const struct ublk_device *ub)
  430. {
  431. /* basic param is the only one which must be set */
  432. if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
  433. const struct ublk_param_basic *p = &ub->params.basic;
  434. if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
  435. return -EINVAL;
  436. if (p->logical_bs_shift > p->physical_bs_shift)
  437. return -EINVAL;
  438. if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
  439. return -EINVAL;
  440. if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
  441. return -EINVAL;
  442. } else
  443. return -EINVAL;
  444. if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
  445. const struct ublk_param_discard *p = &ub->params.discard;
  446. /* So far, only single-segment discard is supported */
  447. if (p->max_discard_sectors && p->max_discard_segments != 1)
  448. return -EINVAL;
  449. if (!p->discard_granularity)
  450. return -EINVAL;
  451. }
  452. /* dev_t is read-only */
  453. if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
  454. return -EINVAL;
  455. if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
  456. return ublk_dev_param_zoned_validate(ub);
  457. else if (ublk_dev_is_zoned(ub))
  458. return -EINVAL;
  459. return 0;
  460. }
  461. static void ublk_apply_params(struct ublk_device *ub)
  462. {
  463. ublk_dev_param_basic_apply(ub);
  464. if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
  465. ublk_dev_param_zoned_apply(ub);
  466. }
  467. static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
  468. {
  469. return ubq->flags & UBLK_F_USER_COPY;
  470. }
  471. static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
  472. {
  473. /*
  474. * read()/write() is involved in user copy, so request reference
  475. * has to be grabbed
  476. */
  477. return ublk_support_user_copy(ubq);
  478. }
  479. static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
  480. struct request *req)
  481. {
  482. if (ublk_need_req_ref(ubq)) {
  483. struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
  484. kref_init(&data->ref);
  485. }
  486. }
  487. static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
  488. struct request *req)
  489. {
  490. if (ublk_need_req_ref(ubq)) {
  491. struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
  492. return kref_get_unless_zero(&data->ref);
  493. }
  494. return true;
  495. }
  496. static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
  497. struct request *req)
  498. {
  499. if (ublk_need_req_ref(ubq)) {
  500. struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
  501. kref_put(&data->ref, ublk_complete_rq);
  502. } else {
  503. __ublk_complete_rq(req);
  504. }
  505. }
  506. static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
  507. {
  508. return ubq->flags & UBLK_F_NEED_GET_DATA;
  509. }
  510. /* Called in slow path only, keep it noinline for tracing purposes */
  511. static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
  512. {
  513. if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
  514. return ub;
  515. return NULL;
  516. }
  517. /* Called in slow path only, keep it noinline for tracing purposes */
  518. static noinline void ublk_put_device(struct ublk_device *ub)
  519. {
  520. put_device(&ub->cdev_dev);
  521. }
  522. static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
  523. int qid)
  524. {
  525. return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
  526. }
  527. static inline bool ublk_rq_has_data(const struct request *rq)
  528. {
  529. return bio_has_data(rq->bio);
  530. }
  531. static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
  532. int tag)
  533. {
  534. return (struct ublksrv_io_desc *)
  535. &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
  536. }
  537. static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
  538. {
  539. return ublk_get_queue(ub, q_id)->io_cmd_buf;
  540. }
  541. static inline int __ublk_queue_cmd_buf_size(int depth)
  542. {
  543. return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
  544. }
  545. static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
  546. {
  547. struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
  548. return __ublk_queue_cmd_buf_size(ubq->q_depth);
  549. }
  550. static int ublk_max_cmd_buf_size(void)
  551. {
  552. return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
  553. }
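/* helpers for checking the USER_RECOVERY feature flags at queue/device level */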
  554. static inline bool ublk_queue_can_use_recovery_reissue(
  555. struct ublk_queue *ubq)
  556. {
  557. return (ubq->flags & UBLK_F_USER_RECOVERY) &&
  558. (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
  559. }
  560. static inline bool ublk_queue_can_use_recovery(
  561. struct ublk_queue *ubq)
  562. {
  563. return ubq->flags & UBLK_F_USER_RECOVERY;
  564. }
  565. static inline bool ublk_can_use_recovery(struct ublk_device *ub)
  566. {
  567. return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
  568. }
  569. static void ublk_free_disk(struct gendisk *disk)
  570. {
  571. struct ublk_device *ub = disk->private_data;
  572. clear_bit(UB_STATE_USED, &ub->state);
  573. ublk_put_device(ub);
  574. }
  575. static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
  576. unsigned int *owner_gid)
  577. {
  578. kuid_t uid;
  579. kgid_t gid;
  580. current_uid_gid(&uid, &gid);
  581. *owner_uid = from_kuid(&init_user_ns, uid);
  582. *owner_gid = from_kgid(&init_user_ns, gid);
  583. }
  584. static int ublk_open(struct gendisk *disk, blk_mode_t mode)
  585. {
  586. struct ublk_device *ub = disk->private_data;
  587. if (capable(CAP_SYS_ADMIN))
  588. return 0;
  589. /*
  590. * If this is an unprivileged device, only its owner may open
  591. * the disk. Otherwise it could be a trap set by a malicious
  592. * user who deliberately grants this disk's privileges to
  593. * other users.
  594. *
  595. * This is also reasonable because anyone can create an
  596. * unprivileged device without needing a grant from others.
  597. */
  598. if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
  599. unsigned int curr_uid, curr_gid;
  600. ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
  601. if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
  602. ub->dev_info.owner_gid)
  603. return -EPERM;
  604. }
  605. return 0;
  606. }
  607. static const struct block_device_operations ub_fops = {
  608. .owner = THIS_MODULE,
  609. .open = ublk_open,
  610. .free_disk = ublk_free_disk,
  611. .report_zones = ublk_report_zones,
  612. };
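/*
 * Helpers for copying data between the request's bio pages and the daemon's
 * user buffer; at most UBLK_MAX_PIN_PAGES user pages are pinned at a time.
 */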
  613. #define UBLK_MAX_PIN_PAGES 32
  614. struct ublk_io_iter {
  615. struct page *pages[UBLK_MAX_PIN_PAGES];
  616. struct bio *bio;
  617. struct bvec_iter iter;
  618. };
  619. /* copy 'total' bytes between the bio vectors and the pinned pages */
  620. static void ublk_copy_io_pages(struct ublk_io_iter *data,
  621. size_t total, size_t pg_off, int dir)
  622. {
  623. unsigned done = 0;
  624. unsigned pg_idx = 0;
  625. while (done < total) {
  626. struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
  627. unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
  628. (unsigned)(PAGE_SIZE - pg_off));
  629. void *bv_buf = bvec_kmap_local(&bv);
  630. void *pg_buf = kmap_local_page(data->pages[pg_idx]);
  631. if (dir == ITER_DEST)
  632. memcpy(pg_buf + pg_off, bv_buf, bytes);
  633. else
  634. memcpy(bv_buf, pg_buf + pg_off, bytes);
  635. kunmap_local(pg_buf);
  636. kunmap_local(bv_buf);
  637. /* advance page array */
  638. pg_off += bytes;
  639. if (pg_off == PAGE_SIZE) {
  640. pg_idx += 1;
  641. pg_off = 0;
  642. }
  643. done += bytes;
  644. /* advance bio */
  645. bio_advance_iter_single(data->bio, &data->iter, bytes);
  646. if (!data->iter.bi_size) {
  647. data->bio = data->bio->bi_next;
  648. if (data->bio == NULL)
  649. break;
  650. data->iter = data->bio->bi_iter;
  651. }
  652. }
  653. }
  654. static bool ublk_advance_io_iter(const struct request *req,
  655. struct ublk_io_iter *iter, unsigned int offset)
  656. {
  657. struct bio *bio = req->bio;
  658. for_each_bio(bio) {
  659. if (bio->bi_iter.bi_size > offset) {
  660. iter->bio = bio;
  661. iter->iter = bio->bi_iter;
  662. bio_advance_iter(iter->bio, &iter->iter, offset);
  663. return true;
  664. }
  665. offset -= bio->bi_iter.bi_size;
  666. }
  667. return false;
  668. }
  669. /*
  670. * Copy data between the request pages and the iov_iter; 'offset'
  671. * is the starting linear offset within the request.
  672. */
  673. static size_t ublk_copy_user_pages(const struct request *req,
  674. unsigned offset, struct iov_iter *uiter, int dir)
  675. {
  676. struct ublk_io_iter iter;
  677. size_t done = 0;
  678. if (!ublk_advance_io_iter(req, &iter, offset))
  679. return 0;
  680. while (iov_iter_count(uiter) && iter.bio) {
  681. unsigned nr_pages;
  682. ssize_t len;
  683. size_t off;
  684. int i;
  685. len = iov_iter_get_pages2(uiter, iter.pages,
  686. iov_iter_count(uiter),
  687. UBLK_MAX_PIN_PAGES, &off);
  688. if (len <= 0)
  689. return done;
  690. ublk_copy_io_pages(&iter, len, off, dir);
  691. nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
  692. for (i = 0; i < nr_pages; i++) {
  693. if (dir == ITER_DEST)
  694. set_page_dirty(iter.pages[i]);
  695. put_page(iter.pages[i]);
  696. }
  697. done += len;
  698. }
  699. return done;
  700. }
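/* WRITE data is copied to the daemon before dispatch; READ (and driver-in) data is copied back on completion */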
  701. static inline bool ublk_need_map_req(const struct request *req)
  702. {
  703. return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
  704. }
  705. static inline bool ublk_need_unmap_req(const struct request *req)
  706. {
  707. return ublk_rq_has_data(req) &&
  708. (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
  709. }
  710. static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
  711. struct ublk_io *io)
  712. {
  713. const unsigned int rq_bytes = blk_rq_bytes(req);
  714. if (ublk_support_user_copy(ubq))
  715. return rq_bytes;
  716. /*
  717. * No zero copy: we delay copying WRITE request data into the ublksrv
  718. * context, and the big benefit is that pinning pages in the current
  719. * context is pretty fast, see ublk_pin_user_pages
  720. */
  721. if (ublk_need_map_req(req)) {
  722. struct iov_iter iter;
  723. const int dir = ITER_DEST;
  724. import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
  725. return ublk_copy_user_pages(req, 0, &iter, dir);
  726. }
  727. return rq_bytes;
  728. }
  729. static int ublk_unmap_io(const struct ublk_queue *ubq,
  730. const struct request *req,
  731. struct ublk_io *io)
  732. {
  733. const unsigned int rq_bytes = blk_rq_bytes(req);
  734. if (ublk_support_user_copy(ubq))
  735. return rq_bytes;
  736. if (ublk_need_unmap_req(req)) {
  737. struct iov_iter iter;
  738. const int dir = ITER_SOURCE;
  739. WARN_ON_ONCE(io->res > rq_bytes);
  740. import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
  741. return ublk_copy_user_pages(req, 0, &iter, dir);
  742. }
  743. return rq_bytes;
  744. }
  745. static inline unsigned int ublk_req_build_flags(struct request *req)
  746. {
  747. unsigned flags = 0;
  748. if (req->cmd_flags & REQ_FAILFAST_DEV)
  749. flags |= UBLK_IO_F_FAILFAST_DEV;
  750. if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
  751. flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
  752. if (req->cmd_flags & REQ_FAILFAST_DRIVER)
  753. flags |= UBLK_IO_F_FAILFAST_DRIVER;
  754. if (req->cmd_flags & REQ_META)
  755. flags |= UBLK_IO_F_META;
  756. if (req->cmd_flags & REQ_FUA)
  757. flags |= UBLK_IO_F_FUA;
  758. if (req->cmd_flags & REQ_NOUNMAP)
  759. flags |= UBLK_IO_F_NOUNMAP;
  760. if (req->cmd_flags & REQ_SWAP)
  761. flags |= UBLK_IO_F_SWAP;
  762. return flags;
  763. }
  764. static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
  765. {
  766. struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
  767. struct ublk_io *io = &ubq->ios[req->tag];
  768. enum req_op op = req_op(req);
  769. u32 ublk_op;
  770. if (!ublk_queue_is_zoned(ubq) &&
  771. (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
  772. return BLK_STS_IOERR;
  773. switch (req_op(req)) {
  774. case REQ_OP_READ:
  775. ublk_op = UBLK_IO_OP_READ;
  776. break;
  777. case REQ_OP_WRITE:
  778. ublk_op = UBLK_IO_OP_WRITE;
  779. break;
  780. case REQ_OP_FLUSH:
  781. ublk_op = UBLK_IO_OP_FLUSH;
  782. break;
  783. case REQ_OP_DISCARD:
  784. ublk_op = UBLK_IO_OP_DISCARD;
  785. break;
  786. case REQ_OP_WRITE_ZEROES:
  787. ublk_op = UBLK_IO_OP_WRITE_ZEROES;
  788. break;
  789. default:
  790. if (ublk_queue_is_zoned(ubq))
  791. return ublk_setup_iod_zoned(ubq, req);
  792. return BLK_STS_IOERR;
  793. }
  794. /* translate to UBLK_IO_OP_* since kernel REQ_OP_* values may change */
  795. iod->op_flags = ublk_op | ublk_req_build_flags(req);
  796. iod->nr_sectors = blk_rq_sectors(req);
  797. iod->start_sector = blk_rq_pos(req);
  798. iod->addr = io->addr;
  799. return BLK_STS_OK;
  800. }
  801. static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
  802. struct io_uring_cmd *ioucmd)
  803. {
  804. return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
  805. }
  806. static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
  807. {
  808. return ubq->ubq_daemon->flags & PF_EXITING;
  809. }
  810. /* todo: handle partial completion */
  811. static inline void __ublk_complete_rq(struct request *req)
  812. {
  813. struct ublk_queue *ubq = req->mq_hctx->driver_data;
  814. struct ublk_io *io = &ubq->ios[req->tag];
  815. unsigned int unmapped_bytes;
  816. blk_status_t res = BLK_STS_OK;
  817. /* called from ublk_abort_queue() code path */
  818. if (io->flags & UBLK_IO_FLAG_ABORTED) {
  819. res = BLK_STS_IOERR;
  820. goto exit;
  821. }
  822. /* fail READ IO if nothing has been read */
  823. if (!io->res && req_op(req) == REQ_OP_READ)
  824. io->res = -EIO;
  825. if (io->res < 0) {
  826. res = errno_to_blk_status(io->res);
  827. goto exit;
  828. }
  829. /*
  830. * FLUSH, DISCARD and WRITE_ZEROES don't return any data, so end them
  831. * directly.
  832. *
  833. * None of them needs unmapping.
  834. */
  835. if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
  836. req_op(req) != REQ_OP_DRV_IN)
  837. goto exit;
  838. /* for a READ request, copy the data at iod->addr into the rq buffers */
  839. unmapped_bytes = ublk_unmap_io(ubq, req, io);
  840. /*
  841. * Extremely unlikely, since the data was filled in just before.
  842. *
  843. * Re-check simply for this unlikely case.
  844. */
  845. if (unlikely(unmapped_bytes < io->res))
  846. io->res = unmapped_bytes;
  847. if (blk_update_request(req, BLK_STS_OK, io->res))
  848. blk_mq_requeue_request(req, true);
  849. else
  850. __blk_mq_end_request(req, BLK_STS_OK);
  851. return;
  852. exit:
  853. blk_mq_end_request(req, res);
  854. }
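/* kref release callback: runs once the last reference to the request data is dropped */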
  855. static void ublk_complete_rq(struct kref *ref)
  856. {
  857. struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
  858. ref);
  859. struct request *req = blk_mq_rq_from_pdu(data);
  860. __ublk_complete_rq(req);
  861. }
  862. /*
  863. * Since __ublk_rq_task_work always fails requests immediately during
  864. * exiting, __ublk_fail_req() is only called from the abort context during
  865. * exiting, so no lock is needed.
  866. *
  867. * Also, aborting may not have started yet; keep in mind that a failed
  868. * request may be issued again by the block layer.
  869. */
  870. static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
  871. struct request *req)
  872. {
  873. WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
  874. if (ublk_queue_can_use_recovery_reissue(ubq))
  875. blk_mq_requeue_request(req, false);
  876. else
  877. ublk_put_req_ref(ubq, req);
  878. }
  879. static void ubq_complete_io_cmd(struct ublk_io *io, int res,
  880. unsigned issue_flags)
  881. {
  882. /* mark this cmd owned by ublksrv */
  883. io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
  884. /*
  885. * Clear ACTIVE since we are done with this sqe/cmd slot;
  886. * we can only accept an io cmd when it is not active.
  887. */
  888. io->flags &= ~UBLK_IO_FLAG_ACTIVE;
  889. /* tell ublksrv one io request is coming */
  890. io_uring_cmd_done(io->cmd, res, 0, issue_flags);
  891. }
  892. #define UBLK_REQUEUE_DELAY_MS 3
  893. static inline void __ublk_abort_rq(struct ublk_queue *ubq,
  894. struct request *rq)
  895. {
  896. /* We cannot process this rq so just requeue it. */
  897. if (ublk_queue_can_use_recovery(ubq))
  898. blk_mq_requeue_request(rq, false);
  899. else
  900. blk_mq_end_request(rq, BLK_STS_IOERR);
  901. }
  902. static inline void __ublk_rq_task_work(struct request *req,
  903. unsigned issue_flags)
  904. {
  905. struct ublk_queue *ubq = req->mq_hctx->driver_data;
  906. int tag = req->tag;
  907. struct ublk_io *io = &ubq->ios[tag];
  908. unsigned int mapped_bytes;
  909. pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
  910. __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
  911. ublk_get_iod(ubq, req->tag)->addr);
  912. /*
  913. * Task is exiting if either:
  914. *
  915. * (1) current != ubq_daemon.
  916. * io_uring_cmd_complete_in_task() tries to run task_work
  917. * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
  918. *
  919. * (2) current->flags & PF_EXITING.
  920. */
  921. if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
  922. __ublk_abort_rq(ubq, req);
  923. return;
  924. }
  925. if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
  926. /*
  927. * We have not handled UBLK_IO_NEED_GET_DATA command yet,
  928. * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
  929. * and notify it.
  930. */
  931. if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
  932. io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
  933. pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
  934. __func__, io->cmd->cmd_op, ubq->q_id,
  935. req->tag, io->flags);
  936. ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
  937. return;
  938. }
  939. /*
  940. * We have handled UBLK_IO_NEED_GET_DATA command,
  941. * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
  942. * do the copy work.
  943. */
  944. io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
  945. /* update iod->addr because ublksrv may have passed a new io buffer */
  946. ublk_get_iod(ubq, req->tag)->addr = io->addr;
  947. pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
  948. __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
  949. ublk_get_iod(ubq, req->tag)->addr);
  950. }
  951. mapped_bytes = ublk_map_io(ubq, req, io);
  952. /* partially mapped, update io descriptor */
  953. if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
  954. /*
  955. * Nothing mapped, retry until we succeed.
  956. *
  957. * We may never succeed in mapping any bytes here because
  958. * of OOM. TODO: reserve one buffer with single page pinned
  959. * for providing forward progress guarantee.
  960. */
  961. if (unlikely(!mapped_bytes)) {
  962. blk_mq_requeue_request(req, false);
  963. blk_mq_delay_kick_requeue_list(req->q,
  964. UBLK_REQUEUE_DELAY_MS);
  965. return;
  966. }
  967. ublk_get_iod(ubq, req->tag)->nr_sectors =
  968. mapped_bytes >> 9;
  969. }
  970. ublk_init_req_ref(ubq, req);
  971. ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
  972. }
  973. static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
  974. unsigned issue_flags)
  975. {
  976. struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
  977. struct ublk_rq_data *data, *tmp;
  978. io_cmds = llist_reverse_order(io_cmds);
  979. llist_for_each_entry_safe(data, tmp, io_cmds, node)
  980. __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
  981. }
  982. static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
  983. {
  984. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  985. struct ublk_queue *ubq = pdu->ubq;
  986. ublk_forward_io_cmds(ubq, issue_flags);
  987. }
  988. static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
  989. {
  990. struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
  991. if (llist_add(&data->node, &ubq->io_cmds)) {
  992. struct ublk_io *io = &ubq->ios[rq->tag];
  993. io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
  994. }
  995. }
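/*
 * blk-mq timeout handler: kill the daemon of an unprivileged device on the
 * first timeout; otherwise abort the queue once the daemon is dying and all
 * commands are already in flight.
 */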
  996. static enum blk_eh_timer_return ublk_timeout(struct request *rq)
  997. {
  998. struct ublk_queue *ubq = rq->mq_hctx->driver_data;
  999. unsigned int nr_inflight = 0;
  1000. int i;
  1001. if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
  1002. if (!ubq->timeout) {
  1003. send_sig(SIGKILL, ubq->ubq_daemon, 0);
  1004. ubq->timeout = true;
  1005. }
  1006. return BLK_EH_DONE;
  1007. }
  1008. if (!ubq_daemon_is_dying(ubq))
  1009. return BLK_EH_RESET_TIMER;
  1010. for (i = 0; i < ubq->q_depth; i++) {
  1011. struct ublk_io *io = &ubq->ios[i];
  1012. if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
  1013. nr_inflight++;
  1014. }
  1015. /* cancelable uring_cmd can't help us if all commands are in-flight */
  1016. if (nr_inflight == ubq->q_depth) {
  1017. struct ublk_device *ub = ubq->dev;
  1018. if (ublk_abort_requests(ub, ubq)) {
  1019. if (ublk_can_use_recovery(ub))
  1020. schedule_work(&ub->quiesce_work);
  1021. else
  1022. schedule_work(&ub->stop_work);
  1023. }
  1024. return BLK_EH_DONE;
  1025. }
  1026. return BLK_EH_RESET_TIMER;
  1027. }
  1028. static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
  1029. const struct blk_mq_queue_data *bd)
  1030. {
  1031. struct ublk_queue *ubq = hctx->driver_data;
  1032. struct request *rq = bd->rq;
  1033. blk_status_t res;
  1034. /* fill iod to slot in io cmd buffer */
  1035. res = ublk_setup_iod(ubq, rq);
  1036. if (unlikely(res != BLK_STS_OK))
  1037. return BLK_STS_IOERR;
  1038. /* With recovery feature enabled, force_abort is set in
  1039. * ublk_stop_dev() before calling del_gendisk(). We have to
  1040. * abort all requeued and new rqs here to let del_gendisk()
  1041. * move on. Besides, we must not call io_uring_cmd_complete_in_task()
  1042. * here, to avoid a UAF on the io_uring ctx.
  1043. *
  1044. * Note: force_abort is guaranteed to be seen because it is set
  1045. * before the request queue is unquiesced.
  1046. */
  1047. if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
  1048. return BLK_STS_IOERR;
  1049. if (unlikely(ubq->canceling)) {
  1050. __ublk_abort_rq(ubq, rq);
  1051. return BLK_STS_OK;
  1052. }
  1053. blk_mq_start_request(bd->rq);
  1054. ublk_queue_cmd(ubq, rq);
  1055. return BLK_STS_OK;
  1056. }
  1057. static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
  1058. unsigned int hctx_idx)
  1059. {
  1060. struct ublk_device *ub = driver_data;
  1061. struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
  1062. hctx->driver_data = ubq;
  1063. return 0;
  1064. }
  1065. static const struct blk_mq_ops ublk_mq_ops = {
  1066. .queue_rq = ublk_queue_rq,
  1067. .init_hctx = ublk_init_hctx,
  1068. .timeout = ublk_timeout,
  1069. };
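/* the per-device char device (/dev/ublkcN) may be opened by only one process at a time */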
  1070. static int ublk_ch_open(struct inode *inode, struct file *filp)
  1071. {
  1072. struct ublk_device *ub = container_of(inode->i_cdev,
  1073. struct ublk_device, cdev);
  1074. if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
  1075. return -EBUSY;
  1076. filp->private_data = ub;
  1077. return 0;
  1078. }
  1079. static int ublk_ch_release(struct inode *inode, struct file *filp)
  1080. {
  1081. struct ublk_device *ub = filp->private_data;
  1082. clear_bit(UB_STATE_OPEN, &ub->state);
  1083. return 0;
  1084. }
  1085. /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
  1086. static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
  1087. {
  1088. struct ublk_device *ub = filp->private_data;
  1089. size_t sz = vma->vm_end - vma->vm_start;
  1090. unsigned max_sz = ublk_max_cmd_buf_size();
  1091. unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
  1092. int q_id, ret = 0;
  1093. spin_lock(&ub->lock);
  1094. if (!ub->mm)
  1095. ub->mm = current->mm;
  1096. if (current->mm != ub->mm)
  1097. ret = -EINVAL;
  1098. spin_unlock(&ub->lock);
  1099. if (ret)
  1100. return ret;
  1101. if (vma->vm_flags & VM_WRITE)
  1102. return -EPERM;
  1103. end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
  1104. if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
  1105. return -EINVAL;
  1106. q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
  1107. pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
  1108. __func__, q_id, current->pid, vma->vm_start,
  1109. phys_off, (unsigned long)sz);
  1110. if (sz != ublk_queue_cmd_buf_size(ub, q_id))
  1111. return -EINVAL;
  1112. pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
  1113. return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
  1114. }
  1115. static void ublk_commit_completion(struct ublk_device *ub,
  1116. const struct ublksrv_io_cmd *ub_cmd)
  1117. {
  1118. u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
  1119. struct ublk_queue *ubq = ublk_get_queue(ub, qid);
  1120. struct ublk_io *io = &ubq->ios[tag];
  1121. struct request *req;
  1122. /* now this cmd slot is owned by the ublk driver */
  1123. io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
  1124. io->res = ub_cmd->result;
  1125. /* find the io request and complete */
  1126. req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
  1127. if (WARN_ON_ONCE(unlikely(!req)))
  1128. return;
  1129. if (req_op(req) == REQ_OP_ZONE_APPEND)
  1130. req->__sector = ub_cmd->zone_append_lba;
  1131. if (likely(!blk_should_fake_timeout(req->q)))
  1132. ublk_put_req_ref(ubq, req);
  1133. }
  1134. /*
  1135. * Called from ubq_daemon context via the cancel fn while the ublk
  1136. * blk-mq queue is quiesced, so we run exclusively with respect to both
  1137. * blk-mq and ubq_daemon context, and everything is serialized.
  1138. */
  1139. static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
  1140. {
  1141. int i;
  1142. for (i = 0; i < ubq->q_depth; i++) {
  1143. struct ublk_io *io = &ubq->ios[i];
  1144. if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
  1145. struct request *rq;
  1146. /*
  1147. * Either we fail the request or ublk_rq_task_work_fn
  1148. * will do it
  1149. */
  1150. rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
  1151. if (rq && blk_mq_request_started(rq)) {
  1152. io->flags |= UBLK_IO_FLAG_ABORTED;
  1153. __ublk_fail_req(ubq, io, rq);
  1154. }
  1155. }
  1156. }
  1157. }
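/*
 * Mark the queue as canceling and, with the queue quiesced, fail any started
 * requests that the daemon will never complete. Returns true when this call
 * did the marking and the disk was still live, so the caller should schedule
 * the quiesce/stop work.
 */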
  1158. static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
  1159. {
  1160. struct gendisk *disk;
  1161. spin_lock(&ubq->cancel_lock);
  1162. if (ubq->canceling) {
  1163. spin_unlock(&ubq->cancel_lock);
  1164. return false;
  1165. }
  1166. ubq->canceling = true;
  1167. spin_unlock(&ubq->cancel_lock);
  1168. spin_lock(&ub->lock);
  1169. disk = ub->ub_disk;
  1170. if (disk)
  1171. get_device(disk_to_dev(disk));
  1172. spin_unlock(&ub->lock);
  1173. /* Our disk is already dead */
  1174. if (!disk)
  1175. return false;
  1176. /* Now we are serialized with ublk_queue_rq() */
  1177. blk_mq_quiesce_queue(disk->queue);
  1178. /* aborting the queue is for making forward progress */
  1179. ublk_abort_queue(ub, ubq);
  1180. blk_mq_unquiesce_queue(disk->queue);
  1181. put_device(disk_to_dev(disk));
  1182. return true;
  1183. }
  1184. static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
  1185. unsigned int issue_flags)
  1186. {
  1187. bool done;
  1188. if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
  1189. return;
  1190. spin_lock(&ubq->cancel_lock);
  1191. done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
  1192. if (!done)
  1193. io->flags |= UBLK_IO_FLAG_CANCELED;
  1194. spin_unlock(&ubq->cancel_lock);
  1195. if (!done)
  1196. io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
  1197. }
  1198. /*
  1199. * The ublk char device won't be closed while the cancel fn is running, so
  1200. * both the ublk device and the queue are guaranteed to be live
  1201. */
  1202. static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
  1203. unsigned int issue_flags)
  1204. {
  1205. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  1206. struct ublk_queue *ubq = pdu->ubq;
  1207. struct task_struct *task;
  1208. struct ublk_device *ub;
  1209. bool need_schedule;
  1210. struct ublk_io *io;
  1211. if (WARN_ON_ONCE(!ubq))
  1212. return;
  1213. if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
  1214. return;
  1215. task = io_uring_cmd_get_task(cmd);
  1216. if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
  1217. return;
  1218. ub = ubq->dev;
  1219. need_schedule = ublk_abort_requests(ub, ubq);
  1220. io = &ubq->ios[pdu->tag];
  1221. WARN_ON_ONCE(io->cmd != cmd);
  1222. ublk_cancel_cmd(ubq, io, issue_flags);
  1223. if (need_schedule) {
  1224. if (ublk_can_use_recovery(ub))
  1225. schedule_work(&ub->quiesce_work);
  1226. else
  1227. schedule_work(&ub->stop_work);
  1228. }
  1229. }
  1230. static inline bool ublk_queue_ready(struct ublk_queue *ubq)
  1231. {
  1232. return ubq->nr_io_ready == ubq->q_depth;
  1233. }
  1234. static void ublk_cancel_queue(struct ublk_queue *ubq)
  1235. {
  1236. int i;
  1237. for (i = 0; i < ubq->q_depth; i++)
  1238. ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
  1239. }
  1240. /* Cancel all pending commands, must be called after del_gendisk() returns */
  1241. static void ublk_cancel_dev(struct ublk_device *ub)
  1242. {
  1243. int i;
  1244. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  1245. ublk_cancel_queue(ublk_get_queue(ub, i));
  1246. }
  1247. static bool ublk_check_inflight_rq(struct request *rq, void *data)
  1248. {
  1249. bool *idle = data;
  1250. if (blk_mq_request_started(rq)) {
  1251. *idle = false;
  1252. return false;
  1253. }
  1254. return true;
  1255. }
  1256. static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
  1257. {
  1258. bool idle;
  1259. WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
  1260. while (true) {
  1261. idle = true;
  1262. blk_mq_tagset_busy_iter(&ub->tag_set,
  1263. ublk_check_inflight_rq, &idle);
  1264. if (idle)
  1265. break;
  1266. msleep(UBLK_REQUEUE_DELAY_MS);
  1267. }
  1268. }
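/* quiesce the queue and wait until all started requests have drained, then mark the device QUIESCED */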
  1269. static void __ublk_quiesce_dev(struct ublk_device *ub)
  1270. {
  1271. pr_devel("%s: quiesce ub: dev_id %d state %s\n",
  1272. __func__, ub->dev_info.dev_id,
  1273. ub->dev_info.state == UBLK_S_DEV_LIVE ?
  1274. "LIVE" : "QUIESCED");
  1275. blk_mq_quiesce_queue(ub->ub_disk->queue);
  1276. ublk_wait_tagset_rqs_idle(ub);
  1277. ub->dev_info.state = UBLK_S_DEV_QUIESCED;
  1278. }
  1279. static void ublk_quiesce_work_fn(struct work_struct *work)
  1280. {
  1281. struct ublk_device *ub =
  1282. container_of(work, struct ublk_device, quiesce_work);
  1283. mutex_lock(&ub->mutex);
  1284. if (ub->dev_info.state != UBLK_S_DEV_LIVE)
  1285. goto unlock;
  1286. __ublk_quiesce_dev(ub);
  1287. unlock:
  1288. mutex_unlock(&ub->mutex);
  1289. ublk_cancel_dev(ub);
  1290. }
  1291. static void ublk_unquiesce_dev(struct ublk_device *ub)
  1292. {
  1293. int i;
  1294. pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
  1295. __func__, ub->dev_info.dev_id,
  1296. ub->dev_info.state == UBLK_S_DEV_LIVE ?
  1297. "LIVE" : "QUIESCED");
  1298. /* quiesce_work has run. We let requeued rqs be aborted
  1299. * before running fallback_wq. "force_abort" must be seen
1300. * after the request queue is unquiesced. Then del_gendisk()
  1301. * can move on.
  1302. */
  1303. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  1304. ublk_get_queue(ub, i)->force_abort = true;
  1305. blk_mq_unquiesce_queue(ub->ub_disk->queue);
  1306. /* We may have requeued some rqs in ublk_quiesce_queue() */
  1307. blk_mq_kick_requeue_list(ub->ub_disk->queue);
  1308. }
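/*
 * Mark the device DEAD and clear ub->ub_disk under ub->lock so that
 * ublk_abort_queue() never sees a half torn-down disk; the caller is
 * responsible for putting the returned gendisk.
 */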
  1309. static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
  1310. {
  1311. struct gendisk *disk;
  1312. /* Sync with ublk_abort_queue() by holding the lock */
  1313. spin_lock(&ub->lock);
  1314. disk = ub->ub_disk;
  1315. ub->dev_info.state = UBLK_S_DEV_DEAD;
  1316. ub->dev_info.ublksrv_pid = -1;
  1317. ub->ub_disk = NULL;
  1318. spin_unlock(&ub->lock);
  1319. return disk;
  1320. }
  1321. static void ublk_stop_dev(struct ublk_device *ub)
  1322. {
  1323. struct gendisk *disk;
  1324. mutex_lock(&ub->mutex);
  1325. if (ub->dev_info.state == UBLK_S_DEV_DEAD)
  1326. goto unlock;
  1327. if (ublk_can_use_recovery(ub)) {
  1328. if (ub->dev_info.state == UBLK_S_DEV_LIVE)
  1329. __ublk_quiesce_dev(ub);
  1330. ublk_unquiesce_dev(ub);
  1331. }
  1332. del_gendisk(ub->ub_disk);
  1333. disk = ublk_detach_disk(ub);
  1334. put_disk(disk);
  1335. unlock:
  1336. mutex_unlock(&ub->mutex);
  1337. ublk_cancel_dev(ub);
  1338. }
  1339. /* device can only be started after all IOs are ready */
  1340. static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
  1341. {
  1342. mutex_lock(&ub->mutex);
  1343. ubq->nr_io_ready++;
  1344. if (ublk_queue_ready(ubq)) {
  1345. ubq->ubq_daemon = current;
  1346. get_task_struct(ubq->ubq_daemon);
  1347. ub->nr_queues_ready++;
  1348. if (capable(CAP_SYS_ADMIN))
  1349. ub->nr_privileged_daemon++;
  1350. }
  1351. if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
  1352. complete_all(&ub->completion);
  1353. mutex_unlock(&ub->mutex);
  1354. }
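/*
 * UBLK_IO_NEED_GET_DATA: dispatch the request to the ubq daemon again so
 * the server can set up the data buffer for it.
 */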
  1355. static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
  1356. int tag)
  1357. {
  1358. struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
  1359. struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
  1360. ublk_queue_cmd(ubq, req);
  1361. }
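/*
 * Only ioctl-encoded ('u') opcodes are accepted by default; legacy plain
 * opcodes (_IOC_TYPE == 0) are allowed only when
 * CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is enabled.
 */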
  1362. static inline int ublk_check_cmd_op(u32 cmd_op)
  1363. {
  1364. u32 ioc_type = _IOC_TYPE(cmd_op);
  1365. if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
  1366. return -EOPNOTSUPP;
  1367. if (ioc_type != 'u' && ioc_type != 0)
  1368. return -EOPNOTSUPP;
  1369. return 0;
  1370. }
  1371. static inline void ublk_fill_io_cmd(struct ublk_io *io,
  1372. struct io_uring_cmd *cmd, unsigned long buf_addr)
  1373. {
  1374. io->cmd = cmd;
  1375. io->flags |= UBLK_IO_FLAG_ACTIVE;
  1376. io->addr = buf_addr;
  1377. }
  1378. static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
  1379. unsigned int issue_flags,
  1380. struct ublk_queue *ubq, unsigned int tag)
  1381. {
  1382. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  1383. /*
1384. * Safe to refer to @ubq since the ublk_queue won't go away until its
  1385. * commands are completed
  1386. */
  1387. pdu->ubq = ubq;
  1388. pdu->tag = tag;
  1389. io_uring_cmd_mark_cancelable(cmd, issue_flags);
  1390. }
  1391. static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
  1392. unsigned int issue_flags,
  1393. const struct ublksrv_io_cmd *ub_cmd)
  1394. {
  1395. struct ublk_device *ub = cmd->file->private_data;
  1396. struct ublk_queue *ubq;
  1397. struct ublk_io *io;
  1398. u32 cmd_op = cmd->cmd_op;
  1399. unsigned tag = ub_cmd->tag;
  1400. int ret = -EINVAL;
  1401. struct request *req;
  1402. pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
  1403. __func__, cmd->cmd_op, ub_cmd->q_id, tag,
  1404. ub_cmd->result);
  1405. if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
  1406. goto out;
  1407. ubq = ublk_get_queue(ub, ub_cmd->q_id);
  1408. if (!ubq || ub_cmd->q_id != ubq->q_id)
  1409. goto out;
  1410. if (ubq->ubq_daemon && ubq->ubq_daemon != current)
  1411. goto out;
  1412. if (tag >= ubq->q_depth)
  1413. goto out;
  1414. io = &ubq->ios[tag];
  1415. /* there is pending io cmd, something must be wrong */
  1416. if (io->flags & UBLK_IO_FLAG_ACTIVE) {
  1417. ret = -EBUSY;
  1418. goto out;
  1419. }
  1420. /*
  1421. * ensure that the user issues UBLK_IO_NEED_GET_DATA
1422. * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
  1423. */
  1424. if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
  1425. ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
  1426. goto out;
  1427. ret = ublk_check_cmd_op(cmd_op);
  1428. if (ret)
  1429. goto out;
  1430. ret = -EINVAL;
  1431. switch (_IOC_NR(cmd_op)) {
  1432. case UBLK_IO_FETCH_REQ:
1433. /* UBLK_IO_FETCH_REQ is only allowed before the queue is set up */
  1434. if (ublk_queue_ready(ubq)) {
  1435. ret = -EBUSY;
  1436. goto out;
  1437. }
  1438. /*
1439. * The io is being handled by the server, so COMMIT_AND_FETCH_REQ is expected
  1440. * instead of FETCH_REQ
  1441. */
  1442. if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
  1443. goto out;
  1444. if (!ublk_support_user_copy(ubq)) {
  1445. /*
1446. * FETCH_REQ has to provide an IO buffer if NEED_GET_DATA
1447. * is not enabled
  1448. */
  1449. if (!ub_cmd->addr && !ublk_need_get_data(ubq))
  1450. goto out;
  1451. } else if (ub_cmd->addr) {
  1452. /* User copy requires addr to be unset */
  1453. ret = -EINVAL;
  1454. goto out;
  1455. }
  1456. ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
  1457. ublk_mark_io_ready(ub, ubq);
  1458. break;
  1459. case UBLK_IO_COMMIT_AND_FETCH_REQ:
  1460. req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
  1461. if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
  1462. goto out;
  1463. if (!ublk_support_user_copy(ubq)) {
  1464. /*
1465. * COMMIT_AND_FETCH_REQ has to provide an IO buffer if
1466. * NEED_GET_DATA is not enabled or it is a READ IO.
  1467. */
  1468. if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
  1469. req_op(req) == REQ_OP_READ))
  1470. goto out;
  1471. } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
  1472. /*
  1473. * User copy requires addr to be unset when command is
  1474. * not zone append
  1475. */
  1476. ret = -EINVAL;
  1477. goto out;
  1478. }
  1479. ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
  1480. ublk_commit_completion(ub, ub_cmd);
  1481. break;
  1482. case UBLK_IO_NEED_GET_DATA:
  1483. if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
  1484. goto out;
  1485. ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
  1486. ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
  1487. break;
  1488. default:
  1489. goto out;
  1490. }
  1491. ublk_prep_cancel(cmd, issue_flags, ubq, tag);
  1492. return -EIOCBQUEUED;
  1493. out:
  1494. io_uring_cmd_done(cmd, ret, 0, issue_flags);
  1495. pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
  1496. __func__, cmd_op, tag, ret, io->flags);
  1497. return -EIOCBQUEUED;
  1498. }
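/*
 * Look up the request for @tag, grab a reference on it, and validate that
 * it has been started, carries data, and that @offset falls within it.
 * Returns NULL (with any taken reference dropped) if the checks fail.
 */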
  1499. static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
  1500. struct ublk_queue *ubq, int tag, size_t offset)
  1501. {
  1502. struct request *req;
  1503. if (!ublk_need_req_ref(ubq))
  1504. return NULL;
  1505. req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
  1506. if (!req)
  1507. return NULL;
  1508. if (!ublk_get_req_ref(ubq, req))
  1509. return NULL;
  1510. if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
  1511. goto fail_put;
  1512. if (!ublk_rq_has_data(req))
  1513. goto fail_put;
  1514. if (offset > blk_rq_bytes(req))
  1515. goto fail_put;
  1516. return req;
  1517. fail_put:
  1518. ublk_put_req_ref(ubq, req);
  1519. return NULL;
  1520. }
  1521. static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
  1522. unsigned int issue_flags)
  1523. {
  1524. /*
  1525. * Not necessary for async retry, but let's keep it simple and always
  1526. * copy the values to avoid any potential reuse.
  1527. */
  1528. const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
  1529. const struct ublksrv_io_cmd ub_cmd = {
  1530. .q_id = READ_ONCE(ub_src->q_id),
  1531. .tag = READ_ONCE(ub_src->tag),
  1532. .result = READ_ONCE(ub_src->result),
  1533. .addr = READ_ONCE(ub_src->addr)
  1534. };
  1535. WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
  1536. return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
  1537. }
  1538. static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
  1539. unsigned int issue_flags)
  1540. {
  1541. ublk_ch_uring_cmd_local(cmd, issue_flags);
  1542. }
  1543. static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
  1544. {
  1545. if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
  1546. ublk_uring_cmd_cancel_fn(cmd, issue_flags);
  1547. return 0;
  1548. }
1549. /* a well-implemented server won't hit the unlocked path */
  1550. if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
  1551. io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
  1552. return -EIOCBQUEUED;
  1553. }
  1554. return ublk_ch_uring_cmd_local(cmd, issue_flags);
  1555. }
  1556. static inline bool ublk_check_ubuf_dir(const struct request *req,
  1557. int ubuf_dir)
  1558. {
  1559. /* copy ubuf to request pages */
  1560. if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
  1561. ubuf_dir == ITER_SOURCE)
  1562. return true;
  1563. /* copy request pages to ubuf */
  1564. if ((req_op(req) == REQ_OP_WRITE ||
  1565. req_op(req) == REQ_OP_ZONE_APPEND) &&
  1566. ubuf_dir == ITER_DEST)
  1567. return true;
  1568. return false;
  1569. }
  1570. static struct request *ublk_check_and_get_req(struct kiocb *iocb,
  1571. struct iov_iter *iter, size_t *off, int dir)
  1572. {
  1573. struct ublk_device *ub = iocb->ki_filp->private_data;
  1574. struct ublk_queue *ubq;
  1575. struct request *req;
  1576. size_t buf_off;
  1577. u16 tag, q_id;
  1578. if (!ub)
  1579. return ERR_PTR(-EACCES);
  1580. if (!user_backed_iter(iter))
  1581. return ERR_PTR(-EACCES);
  1582. if (ub->dev_info.state == UBLK_S_DEV_DEAD)
  1583. return ERR_PTR(-EACCES);
  1584. tag = ublk_pos_to_tag(iocb->ki_pos);
  1585. q_id = ublk_pos_to_hwq(iocb->ki_pos);
  1586. buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
  1587. if (q_id >= ub->dev_info.nr_hw_queues)
  1588. return ERR_PTR(-EINVAL);
  1589. ubq = ublk_get_queue(ub, q_id);
  1590. if (!ubq)
  1591. return ERR_PTR(-EINVAL);
  1592. if (tag >= ubq->q_depth)
  1593. return ERR_PTR(-EINVAL);
  1594. req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
  1595. if (!req)
  1596. return ERR_PTR(-EINVAL);
  1597. if (!req->mq_hctx || !req->mq_hctx->driver_data)
  1598. goto fail;
  1599. if (!ublk_check_ubuf_dir(req, dir))
  1600. goto fail;
  1601. *off = buf_off;
  1602. return req;
  1603. fail:
  1604. ublk_put_req_ref(ubq, req);
  1605. return ERR_PTR(-EACCES);
  1606. }
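/*
 * read()/write() on /dev/ublkcN implement the USER_COPY feature: the file
 * position encodes (queue, tag, buffer offset), and data is copied between
 * the user buffer and the pages of the matching request.
 */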
  1607. static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
  1608. {
  1609. struct ublk_queue *ubq;
  1610. struct request *req;
  1611. size_t buf_off;
  1612. size_t ret;
  1613. req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
  1614. if (IS_ERR(req))
  1615. return PTR_ERR(req);
  1616. ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
  1617. ubq = req->mq_hctx->driver_data;
  1618. ublk_put_req_ref(ubq, req);
  1619. return ret;
  1620. }
  1621. static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
  1622. {
  1623. struct ublk_queue *ubq;
  1624. struct request *req;
  1625. size_t buf_off;
  1626. size_t ret;
  1627. req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
  1628. if (IS_ERR(req))
  1629. return PTR_ERR(req);
  1630. ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
  1631. ubq = req->mq_hctx->driver_data;
  1632. ublk_put_req_ref(ubq, req);
  1633. return ret;
  1634. }
  1635. static const struct file_operations ublk_ch_fops = {
  1636. .owner = THIS_MODULE,
  1637. .open = ublk_ch_open,
  1638. .release = ublk_ch_release,
  1639. .read_iter = ublk_ch_read_iter,
  1640. .write_iter = ublk_ch_write_iter,
  1641. .uring_cmd = ublk_ch_uring_cmd,
  1642. .mmap = ublk_ch_mmap,
  1643. };
  1644. static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
  1645. {
  1646. int size = ublk_queue_cmd_buf_size(ub, q_id);
  1647. struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
  1648. if (ubq->ubq_daemon)
  1649. put_task_struct(ubq->ubq_daemon);
  1650. if (ubq->io_cmd_buf)
  1651. free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
  1652. }
  1653. static int ublk_init_queue(struct ublk_device *ub, int q_id)
  1654. {
  1655. struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
  1656. gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
  1657. void *ptr;
  1658. int size;
  1659. spin_lock_init(&ubq->cancel_lock);
  1660. ubq->flags = ub->dev_info.flags;
  1661. ubq->q_id = q_id;
  1662. ubq->q_depth = ub->dev_info.queue_depth;
  1663. size = ublk_queue_cmd_buf_size(ub, q_id);
  1664. ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
  1665. if (!ptr)
  1666. return -ENOMEM;
  1667. ubq->io_cmd_buf = ptr;
  1668. ubq->dev = ub;
  1669. return 0;
  1670. }
  1671. static void ublk_deinit_queues(struct ublk_device *ub)
  1672. {
  1673. int nr_queues = ub->dev_info.nr_hw_queues;
  1674. int i;
  1675. if (!ub->__queues)
  1676. return;
  1677. for (i = 0; i < nr_queues; i++)
  1678. ublk_deinit_queue(ub, i);
  1679. kfree(ub->__queues);
  1680. }
  1681. static int ublk_init_queues(struct ublk_device *ub)
  1682. {
  1683. int nr_queues = ub->dev_info.nr_hw_queues;
  1684. int depth = ub->dev_info.queue_depth;
  1685. int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
  1686. int i, ret = -ENOMEM;
  1687. ub->queue_size = ubq_size;
  1688. ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
  1689. if (!ub->__queues)
  1690. return ret;
  1691. for (i = 0; i < nr_queues; i++) {
  1692. if (ublk_init_queue(ub, i))
  1693. goto fail;
  1694. }
  1695. init_completion(&ub->completion);
  1696. return 0;
  1697. fail:
  1698. ublk_deinit_queues(ub);
  1699. return ret;
  1700. }
  1701. static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
  1702. {
  1703. int i = idx;
  1704. int err;
  1705. spin_lock(&ublk_idr_lock);
1706. /* allocate id; if @idx >= 0, we're requesting that specific id */
  1707. if (i >= 0) {
  1708. err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
  1709. if (err == -ENOSPC)
  1710. err = -EEXIST;
  1711. } else {
  1712. err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
  1713. GFP_NOWAIT);
  1714. }
  1715. spin_unlock(&ublk_idr_lock);
  1716. if (err >= 0)
  1717. ub->ub_number = err;
  1718. return err;
  1719. }
  1720. static void ublk_free_dev_number(struct ublk_device *ub)
  1721. {
  1722. spin_lock(&ublk_idr_lock);
  1723. idr_remove(&ublk_index_idr, ub->ub_number);
  1724. wake_up_all(&ublk_idr_wq);
  1725. spin_unlock(&ublk_idr_lock);
  1726. }
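/*
 * Final release of the ublk char device: undo everything set up in
 * ublk_ctrl_add_dev() once the last device reference is dropped.
 */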
  1727. static void ublk_cdev_rel(struct device *dev)
  1728. {
  1729. struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
  1730. blk_mq_free_tag_set(&ub->tag_set);
  1731. ublk_deinit_queues(ub);
  1732. ublk_free_dev_number(ub);
  1733. mutex_destroy(&ub->mutex);
  1734. kfree(ub);
  1735. }
  1736. static int ublk_add_chdev(struct ublk_device *ub)
  1737. {
  1738. struct device *dev = &ub->cdev_dev;
  1739. int minor = ub->ub_number;
  1740. int ret;
  1741. dev->parent = ublk_misc.this_device;
  1742. dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
  1743. dev->class = &ublk_chr_class;
  1744. dev->release = ublk_cdev_rel;
  1745. device_initialize(dev);
  1746. ret = dev_set_name(dev, "ublkc%d", minor);
  1747. if (ret)
  1748. goto fail;
  1749. cdev_init(&ub->cdev, &ublk_ch_fops);
  1750. ret = cdev_device_add(&ub->cdev, dev);
  1751. if (ret)
  1752. goto fail;
  1753. ublks_added++;
  1754. return 0;
  1755. fail:
  1756. put_device(dev);
  1757. return ret;
  1758. }
  1759. static void ublk_stop_work_fn(struct work_struct *work)
  1760. {
  1761. struct ublk_device *ub =
  1762. container_of(work, struct ublk_device, stop_work);
  1763. ublk_stop_dev(ub);
  1764. }
  1765. /* align max io buffer size with PAGE_SIZE */
  1766. static void ublk_align_max_io_size(struct ublk_device *ub)
  1767. {
  1768. unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
  1769. ub->dev_info.max_io_buf_bytes =
  1770. round_down(max_io_bytes, PAGE_SIZE);
  1771. }
  1772. static int ublk_add_tag_set(struct ublk_device *ub)
  1773. {
  1774. ub->tag_set.ops = &ublk_mq_ops;
  1775. ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
  1776. ub->tag_set.queue_depth = ub->dev_info.queue_depth;
  1777. ub->tag_set.numa_node = NUMA_NO_NODE;
  1778. ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
  1779. ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
  1780. ub->tag_set.driver_data = ub;
  1781. return blk_mq_alloc_tag_set(&ub->tag_set);
  1782. }
  1783. static void ublk_remove(struct ublk_device *ub)
  1784. {
  1785. ublk_stop_dev(ub);
  1786. cancel_work_sync(&ub->stop_work);
  1787. cancel_work_sync(&ub->quiesce_work);
  1788. cdev_device_del(&ub->cdev, &ub->cdev_dev);
  1789. ublk_put_device(ub);
  1790. ublks_added--;
  1791. }
  1792. static struct ublk_device *ublk_get_device_from_id(int idx)
  1793. {
  1794. struct ublk_device *ub = NULL;
  1795. if (idx < 0)
  1796. return NULL;
  1797. spin_lock(&ublk_idr_lock);
  1798. ub = idr_find(&ublk_index_idr, idx);
  1799. if (ub)
  1800. ub = ublk_get_device(ub);
  1801. spin_unlock(&ublk_idr_lock);
  1802. return ub;
  1803. }
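/*
 * UBLK_CMD_START_DEV: build the queue limits from the validated parameters,
 * wait until every queue has finished FETCH_REQ setup, then allocate and
 * add the ublkb%d disk.
 */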
  1804. static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
  1805. {
  1806. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  1807. const struct ublk_param_basic *p = &ub->params.basic;
  1808. int ublksrv_pid = (int)header->data[0];
  1809. struct queue_limits lim = {
  1810. .logical_block_size = 1 << p->logical_bs_shift,
  1811. .physical_block_size = 1 << p->physical_bs_shift,
  1812. .io_min = 1 << p->io_min_shift,
  1813. .io_opt = 1 << p->io_opt_shift,
  1814. .max_hw_sectors = p->max_sectors,
  1815. .chunk_sectors = p->chunk_sectors,
  1816. .virt_boundary_mask = p->virt_boundary_mask,
  1817. .max_segments = USHRT_MAX,
  1818. .max_segment_size = UINT_MAX,
  1819. .dma_alignment = 3,
  1820. };
  1821. struct gendisk *disk;
  1822. int ret = -EINVAL;
  1823. if (ublksrv_pid <= 0)
  1824. return -EINVAL;
  1825. if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
  1826. return -EINVAL;
  1827. if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
  1828. const struct ublk_param_discard *pd = &ub->params.discard;
  1829. lim.discard_alignment = pd->discard_alignment;
  1830. lim.discard_granularity = pd->discard_granularity;
  1831. lim.max_hw_discard_sectors = pd->max_discard_sectors;
  1832. lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
  1833. lim.max_discard_segments = pd->max_discard_segments;
  1834. }
  1835. if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
  1836. const struct ublk_param_zoned *p = &ub->params.zoned;
  1837. if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
  1838. return -EOPNOTSUPP;
  1839. lim.features |= BLK_FEAT_ZONED;
  1840. lim.max_active_zones = p->max_active_zones;
  1841. lim.max_open_zones = p->max_open_zones;
  1842. lim.max_zone_append_sectors = p->max_zone_append_sectors;
  1843. }
  1844. if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
  1845. lim.features |= BLK_FEAT_WRITE_CACHE;
  1846. if (ub->params.basic.attrs & UBLK_ATTR_FUA)
  1847. lim.features |= BLK_FEAT_FUA;
  1848. }
  1849. if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
  1850. lim.features |= BLK_FEAT_ROTATIONAL;
  1851. if (wait_for_completion_interruptible(&ub->completion) != 0)
  1852. return -EINTR;
  1853. mutex_lock(&ub->mutex);
  1854. if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
  1855. test_bit(UB_STATE_USED, &ub->state)) {
  1856. ret = -EEXIST;
  1857. goto out_unlock;
  1858. }
  1859. disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
  1860. if (IS_ERR(disk)) {
  1861. ret = PTR_ERR(disk);
  1862. goto out_unlock;
  1863. }
  1864. sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
  1865. disk->fops = &ub_fops;
  1866. disk->private_data = ub;
  1867. ub->dev_info.ublksrv_pid = ublksrv_pid;
  1868. ub->ub_disk = disk;
  1869. ublk_apply_params(ub);
1870. /* don't probe partitions if any ubq daemon is untrusted */
  1871. if (ub->nr_privileged_daemon != ub->nr_queues_ready)
  1872. set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
  1873. ublk_get_device(ub);
  1874. ub->dev_info.state = UBLK_S_DEV_LIVE;
  1875. if (ublk_dev_is_zoned(ub)) {
  1876. ret = ublk_revalidate_disk_zones(ub);
  1877. if (ret)
  1878. goto out_put_cdev;
  1879. }
  1880. ret = add_disk(disk);
  1881. if (ret)
  1882. goto out_put_cdev;
  1883. set_bit(UB_STATE_USED, &ub->state);
  1884. out_put_cdev:
  1885. if (ret) {
  1886. ublk_detach_disk(ub);
  1887. ublk_put_device(ub);
  1888. }
  1889. if (ret)
  1890. put_disk(disk);
  1891. out_unlock:
  1892. mutex_unlock(&ub->mutex);
  1893. return ret;
  1894. }
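/*
 * UBLK_CMD_GET_QUEUE_AFFINITY: report the mask of CPUs that map to the
 * requested hardware queue, typically used by the server to place the
 * queue's daemon.
 */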
  1895. static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
  1896. struct io_uring_cmd *cmd)
  1897. {
  1898. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  1899. void __user *argp = (void __user *)(unsigned long)header->addr;
  1900. cpumask_var_t cpumask;
  1901. unsigned long queue;
  1902. unsigned int retlen;
  1903. unsigned int i;
  1904. int ret;
  1905. if (header->len * BITS_PER_BYTE < nr_cpu_ids)
  1906. return -EINVAL;
  1907. if (header->len & (sizeof(unsigned long)-1))
  1908. return -EINVAL;
  1909. if (!header->addr)
  1910. return -EINVAL;
  1911. queue = header->data[0];
  1912. if (queue >= ub->dev_info.nr_hw_queues)
  1913. return -EINVAL;
  1914. if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
  1915. return -ENOMEM;
  1916. for_each_possible_cpu(i) {
  1917. if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
  1918. cpumask_set_cpu(i, cpumask);
  1919. }
  1920. ret = -EFAULT;
  1921. retlen = min_t(unsigned short, header->len, cpumask_size());
  1922. if (copy_to_user(argp, cpumask, retlen))
  1923. goto out_free_cpumask;
  1924. if (retlen != header->len &&
  1925. clear_user(argp + retlen, header->len - retlen))
  1926. goto out_free_cpumask;
  1927. ret = 0;
  1928. out_free_cpumask:
  1929. free_cpumask_var(cpumask);
  1930. return ret;
  1931. }
  1932. static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
  1933. {
  1934. pr_devel("%s: dev id %d flags %llx\n", __func__,
  1935. info->dev_id, info->flags);
  1936. pr_devel("\t nr_hw_queues %d queue_depth %d\n",
  1937. info->nr_hw_queues, info->queue_depth);
  1938. }
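/*
 * UBLK_CMD_ADD_DEV: validate the dev_info passed from userspace, allocate
 * the ublk_device together with its queues and tag set, and finally create
 * the ublkc%d char device which the server daemon opens.
 */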
  1939. static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
  1940. {
  1941. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  1942. void __user *argp = (void __user *)(unsigned long)header->addr;
  1943. struct ublksrv_ctrl_dev_info info;
  1944. struct ublk_device *ub;
  1945. int ret = -EINVAL;
  1946. if (header->len < sizeof(info) || !header->addr)
  1947. return -EINVAL;
  1948. if (header->queue_id != (u16)-1) {
  1949. pr_warn("%s: queue_id is wrong %x\n",
  1950. __func__, header->queue_id);
  1951. return -EINVAL;
  1952. }
  1953. if (copy_from_user(&info, argp, sizeof(info)))
  1954. return -EFAULT;
  1955. if (capable(CAP_SYS_ADMIN))
  1956. info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
  1957. else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
  1958. return -EPERM;
  1959. /*
1960. * an unprivileged device can't be trusted, and RECOVERY or
1961. * RECOVERY_REISSUE may still hang error handling, so recovery
1962. * features can't be supported for unprivileged ublk for now
1963. *
1964. * TODO: provide forward progress for the RECOVERY handler, so that
1965. * unprivileged devices can benefit from it
  1966. */
  1967. if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
  1968. info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
  1969. UBLK_F_USER_RECOVERY);
  1970. /*
1971. * For USER_COPY, we depend on userspace to fill the request
1972. * buffer via pwrite() to the ublk char device, which can't be
1973. * used for an unprivileged device
  1974. */
  1975. if (info.flags & UBLK_F_USER_COPY)
  1976. return -EINVAL;
  1977. }
  1978. /* the created device is always owned by current user */
  1979. ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
  1980. if (header->dev_id != info.dev_id) {
  1981. pr_warn("%s: dev id not match %u %u\n",
  1982. __func__, header->dev_id, info.dev_id);
  1983. return -EINVAL;
  1984. }
  1985. if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
  1986. pr_warn("%s: dev id is too large. Max supported is %d\n",
  1987. __func__, UBLK_MAX_UBLKS - 1);
  1988. return -EINVAL;
  1989. }
  1990. ublk_dump_dev_info(&info);
  1991. ret = mutex_lock_killable(&ublk_ctl_mutex);
  1992. if (ret)
  1993. return ret;
  1994. ret = -EACCES;
  1995. if (ublks_added >= ublks_max)
  1996. goto out_unlock;
  1997. ret = -ENOMEM;
  1998. ub = kzalloc(sizeof(*ub), GFP_KERNEL);
  1999. if (!ub)
  2000. goto out_unlock;
  2001. mutex_init(&ub->mutex);
  2002. spin_lock_init(&ub->lock);
  2003. INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
  2004. INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
  2005. ret = ublk_alloc_dev_number(ub, header->dev_id);
  2006. if (ret < 0)
  2007. goto out_free_ub;
  2008. memcpy(&ub->dev_info, &info, sizeof(info));
  2009. /* update device id */
  2010. ub->dev_info.dev_id = ub->ub_number;
  2011. /*
2012. * The 64-bit flags are copied back to userspace as the feature
2013. * negotiation result, so clear the flags the driver doesn't
2014. * support yet; then userspace gets the correct set of flags
2015. * (features) to handle.
  2016. */
  2017. ub->dev_info.flags &= UBLK_F_ALL;
  2018. ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
  2019. UBLK_F_URING_CMD_COMP_IN_TASK;
  2020. /* GET_DATA isn't needed any more with USER_COPY */
  2021. if (ublk_dev_is_user_copy(ub))
  2022. ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
  2023. /* Zoned storage support requires user copy feature */
  2024. if (ublk_dev_is_zoned(ub) &&
  2025. (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
  2026. ret = -EINVAL;
  2027. goto out_free_dev_number;
  2028. }
  2029. /* We are not ready to support zero copy */
  2030. ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
  2031. ub->dev_info.nr_hw_queues = min_t(unsigned int,
  2032. ub->dev_info.nr_hw_queues, nr_cpu_ids);
  2033. ublk_align_max_io_size(ub);
  2034. ret = ublk_init_queues(ub);
  2035. if (ret)
  2036. goto out_free_dev_number;
  2037. ret = ublk_add_tag_set(ub);
  2038. if (ret)
  2039. goto out_deinit_queues;
  2040. ret = -EFAULT;
  2041. if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
  2042. goto out_free_tag_set;
  2043. /*
2044. * Add the char dev so that the ublksrv daemon can be set up.
2045. * ublk_add_chdev() will clean up everything if it fails.
  2046. */
  2047. ret = ublk_add_chdev(ub);
  2048. goto out_unlock;
  2049. out_free_tag_set:
  2050. blk_mq_free_tag_set(&ub->tag_set);
  2051. out_deinit_queues:
  2052. ublk_deinit_queues(ub);
  2053. out_free_dev_number:
  2054. ublk_free_dev_number(ub);
  2055. out_free_ub:
  2056. mutex_destroy(&ub->mutex);
  2057. kfree(ub);
  2058. out_unlock:
  2059. mutex_unlock(&ublk_ctl_mutex);
  2060. return ret;
  2061. }
  2062. static inline bool ublk_idr_freed(int id)
  2063. {
  2064. void *ptr;
  2065. spin_lock(&ublk_idr_lock);
  2066. ptr = idr_find(&ublk_index_idr, id);
  2067. spin_unlock(&ublk_idr_lock);
  2068. return ptr == NULL;
  2069. }
  2070. static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
  2071. {
  2072. struct ublk_device *ub = *p_ub;
  2073. int idx = ub->ub_number;
  2074. int ret;
  2075. ret = mutex_lock_killable(&ublk_ctl_mutex);
  2076. if (ret)
  2077. return ret;
  2078. if (!test_bit(UB_STATE_DELETED, &ub->state)) {
  2079. ublk_remove(ub);
  2080. set_bit(UB_STATE_DELETED, &ub->state);
  2081. }
  2082. /* Mark the reference as consumed */
  2083. *p_ub = NULL;
  2084. ublk_put_device(ub);
  2085. mutex_unlock(&ublk_ctl_mutex);
  2086. /*
2087. * Wait until the idr entry is removed, then the index can be reused
2088. * after the DEL_DEV command returns.
2089. *
2090. * If we return because of a user interrupt, a future delete command
2091. * may come:
2092. *
2093. * - if the device number isn't freed, this device won't (and needn't)
2094. * be deleted again, since UB_STATE_DELETED is set, and the device
2095. * will be released after the last reference is dropped
2096. *
2097. * - if the device number is already freed, we will not find this
2098. * device via ublk_get_device_from_id()
  2099. */
  2100. if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
  2101. return -EINTR;
  2102. return 0;
  2103. }
  2104. static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
  2105. {
  2106. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2107. pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
  2108. __func__, cmd->cmd_op, header->dev_id, header->queue_id,
  2109. header->data[0], header->addr, header->len);
  2110. }
  2111. static int ublk_ctrl_stop_dev(struct ublk_device *ub)
  2112. {
  2113. ublk_stop_dev(ub);
  2114. cancel_work_sync(&ub->stop_work);
  2115. cancel_work_sync(&ub->quiesce_work);
  2116. return 0;
  2117. }
  2118. static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
  2119. struct io_uring_cmd *cmd)
  2120. {
  2121. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2122. void __user *argp = (void __user *)(unsigned long)header->addr;
  2123. if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
  2124. return -EINVAL;
  2125. if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
  2126. return -EFAULT;
  2127. return 0;
  2128. }
2129. /* TYPE_DEVT is read-only, so fill it in before returning to userspace */
  2130. static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
  2131. {
  2132. ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
  2133. ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
  2134. if (ub->ub_disk) {
  2135. ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
  2136. ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
  2137. } else {
  2138. ub->params.devt.disk_major = 0;
  2139. ub->params.devt.disk_minor = 0;
  2140. }
  2141. ub->params.types |= UBLK_PARAM_TYPE_DEVT;
  2142. }
  2143. static int ublk_ctrl_get_params(struct ublk_device *ub,
  2144. struct io_uring_cmd *cmd)
  2145. {
  2146. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2147. void __user *argp = (void __user *)(unsigned long)header->addr;
  2148. struct ublk_params_header ph;
  2149. int ret;
  2150. if (header->len <= sizeof(ph) || !header->addr)
  2151. return -EINVAL;
  2152. if (copy_from_user(&ph, argp, sizeof(ph)))
  2153. return -EFAULT;
  2154. if (ph.len > header->len || !ph.len)
  2155. return -EINVAL;
  2156. if (ph.len > sizeof(struct ublk_params))
  2157. ph.len = sizeof(struct ublk_params);
  2158. mutex_lock(&ub->mutex);
  2159. ublk_ctrl_fill_params_devt(ub);
  2160. if (copy_to_user(argp, &ub->params, ph.len))
  2161. ret = -EFAULT;
  2162. else
  2163. ret = 0;
  2164. mutex_unlock(&ub->mutex);
  2165. return ret;
  2166. }
  2167. static int ublk_ctrl_set_params(struct ublk_device *ub,
  2168. struct io_uring_cmd *cmd)
  2169. {
  2170. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2171. void __user *argp = (void __user *)(unsigned long)header->addr;
  2172. struct ublk_params_header ph;
  2173. int ret = -EFAULT;
  2174. if (header->len <= sizeof(ph) || !header->addr)
  2175. return -EINVAL;
  2176. if (copy_from_user(&ph, argp, sizeof(ph)))
  2177. return -EFAULT;
  2178. if (ph.len > header->len || !ph.len || !ph.types)
  2179. return -EINVAL;
  2180. if (ph.len > sizeof(struct ublk_params))
  2181. ph.len = sizeof(struct ublk_params);
  2182. /* parameters can only be changed when device isn't live */
  2183. mutex_lock(&ub->mutex);
  2184. if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
  2185. ret = -EACCES;
  2186. } else if (copy_from_user(&ub->params, argp, ph.len)) {
  2187. ret = -EFAULT;
  2188. } else {
  2189. /* clear all we don't support yet */
  2190. ub->params.types &= UBLK_PARAM_TYPE_ALL;
  2191. ret = ublk_validate_params(ub);
  2192. if (ret)
  2193. ub->params.types = 0;
  2194. }
  2195. mutex_unlock(&ub->mutex);
  2196. return ret;
  2197. }
  2198. static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
  2199. {
  2200. int i;
  2201. WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
  2202. /* All old ioucmds have to be completed */
  2203. ubq->nr_io_ready = 0;
  2204. /* old daemon is PF_EXITING, put it now */
  2205. put_task_struct(ubq->ubq_daemon);
  2206. /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
  2207. ubq->ubq_daemon = NULL;
  2208. ubq->timeout = false;
  2209. ubq->canceling = false;
  2210. for (i = 0; i < ubq->q_depth; i++) {
  2211. struct ublk_io *io = &ubq->ios[i];
  2212. /* forget everything now and be ready for new FETCH_REQ */
  2213. io->flags = 0;
  2214. io->cmd = NULL;
  2215. io->addr = 0;
  2216. }
  2217. }
  2218. static int ublk_ctrl_start_recovery(struct ublk_device *ub,
  2219. struct io_uring_cmd *cmd)
  2220. {
  2221. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2222. int ret = -EINVAL;
  2223. int i;
  2224. mutex_lock(&ub->mutex);
  2225. if (!ublk_can_use_recovery(ub))
  2226. goto out_unlock;
  2227. if (!ub->nr_queues_ready)
  2228. goto out_unlock;
  2229. /*
2230. * START_RECOVERY is only allowed after:
2231. *
2232. * (1) UB_STATE_OPEN is not set, which means the dying process has exited
2233. * and the related io_uring ctx is freed, so the file struct of /dev/ublkcX
2234. * is released.
2235. *
2236. * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
2237. * (a) has quiesced the request queue
2238. * (b) has requeued every inflight rq whose io_flags is ACTIVE
2239. * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
2240. * (d) has completed/canceled all ioucmds owned by the dying process
  2241. */
  2242. if (test_bit(UB_STATE_OPEN, &ub->state) ||
  2243. ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
  2244. ret = -EBUSY;
  2245. goto out_unlock;
  2246. }
  2247. pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
  2248. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  2249. ublk_queue_reinit(ub, ublk_get_queue(ub, i));
  2250. /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
  2251. ub->mm = NULL;
  2252. ub->nr_queues_ready = 0;
  2253. ub->nr_privileged_daemon = 0;
  2254. init_completion(&ub->completion);
  2255. ret = 0;
  2256. out_unlock:
  2257. mutex_unlock(&ub->mutex);
  2258. return ret;
  2259. }
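/*
 * UBLK_CMD_END_USER_RECOVERY: wait until the new daemons have fetched all
 * ios, then unquiesce the request queue and switch the device back to LIVE.
 */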
  2260. static int ublk_ctrl_end_recovery(struct ublk_device *ub,
  2261. struct io_uring_cmd *cmd)
  2262. {
  2263. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2264. int ublksrv_pid = (int)header->data[0];
  2265. int ret = -EINVAL;
  2266. pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
  2267. __func__, ub->dev_info.nr_hw_queues, header->dev_id);
2268. /* wait until the new ubq_daemons have sent all FETCH_REQs */
  2269. if (wait_for_completion_interruptible(&ub->completion))
  2270. return -EINTR;
  2271. pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
  2272. __func__, ub->dev_info.nr_hw_queues, header->dev_id);
  2273. mutex_lock(&ub->mutex);
  2274. if (!ublk_can_use_recovery(ub))
  2275. goto out_unlock;
  2276. if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
  2277. ret = -EBUSY;
  2278. goto out_unlock;
  2279. }
  2280. ub->dev_info.ublksrv_pid = ublksrv_pid;
  2281. pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
  2282. __func__, ublksrv_pid, header->dev_id);
  2283. blk_mq_unquiesce_queue(ub->ub_disk->queue);
  2284. pr_devel("%s: queue unquiesced, dev id %d.\n",
  2285. __func__, header->dev_id);
  2286. blk_mq_kick_requeue_list(ub->ub_disk->queue);
  2287. ub->dev_info.state = UBLK_S_DEV_LIVE;
  2288. ret = 0;
  2289. out_unlock:
  2290. mutex_unlock(&ub->mutex);
  2291. return ret;
  2292. }
  2293. static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
  2294. {
  2295. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2296. void __user *argp = (void __user *)(unsigned long)header->addr;
  2297. u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;
  2298. if (header->len != UBLK_FEATURES_LEN || !header->addr)
  2299. return -EINVAL;
  2300. if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
  2301. return -EFAULT;
  2302. return 0;
  2303. }
  2304. /*
  2305. * All control commands are sent via /dev/ublk-control, so we have to check
  2306. * the destination device's permission
  2307. */
  2308. static int ublk_char_dev_permission(struct ublk_device *ub,
  2309. const char *dev_path, int mask)
  2310. {
  2311. int err;
  2312. struct path path;
  2313. struct kstat stat;
  2314. err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
  2315. if (err)
  2316. return err;
  2317. err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
  2318. if (err)
  2319. goto exit;
  2320. err = -EPERM;
  2321. if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
  2322. goto exit;
  2323. err = inode_permission(&nop_mnt_idmap,
  2324. d_backing_inode(path.dentry), mask);
  2325. exit:
  2326. path_put(&path);
  2327. return err;
  2328. }
  2329. static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
  2330. struct io_uring_cmd *cmd)
  2331. {
  2332. struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
  2333. bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
  2334. void __user *argp = (void __user *)(unsigned long)header->addr;
  2335. char *dev_path = NULL;
  2336. int ret = 0;
  2337. int mask;
  2338. if (!unprivileged) {
  2339. if (!capable(CAP_SYS_ADMIN))
  2340. return -EPERM;
  2341. /*
2342. * The newly added UBLK_CMD_GET_DEV_INFO2 command includes
2343. * char_dev_path in its payload too, since userspace may not
2344. * know whether the specified device was created in unprivileged
2345. * mode.
  2346. */
  2347. if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
  2348. return 0;
  2349. }
  2350. /*
2351. * The user has to provide the char device path for unprivileged ublk
2352. *
2353. * header->addr always points to the dev path buffer, and
2354. * header->dev_path_len records the length of the dev path buffer.
  2355. */
  2356. if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
  2357. return -EINVAL;
  2358. if (header->len < header->dev_path_len)
  2359. return -EINVAL;
  2360. dev_path = memdup_user_nul(argp, header->dev_path_len);
  2361. if (IS_ERR(dev_path))
  2362. return PTR_ERR(dev_path);
  2363. ret = -EINVAL;
  2364. switch (_IOC_NR(cmd->cmd_op)) {
  2365. case UBLK_CMD_GET_DEV_INFO:
  2366. case UBLK_CMD_GET_DEV_INFO2:
  2367. case UBLK_CMD_GET_QUEUE_AFFINITY:
  2368. case UBLK_CMD_GET_PARAMS:
  2369. case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
  2370. mask = MAY_READ;
  2371. break;
  2372. case UBLK_CMD_START_DEV:
  2373. case UBLK_CMD_STOP_DEV:
  2374. case UBLK_CMD_ADD_DEV:
  2375. case UBLK_CMD_DEL_DEV:
  2376. case UBLK_CMD_SET_PARAMS:
  2377. case UBLK_CMD_START_USER_RECOVERY:
  2378. case UBLK_CMD_END_USER_RECOVERY:
  2379. mask = MAY_READ | MAY_WRITE;
  2380. break;
  2381. default:
  2382. goto exit;
  2383. }
  2384. ret = ublk_char_dev_permission(ub, dev_path, mask);
  2385. if (!ret) {
  2386. header->len -= header->dev_path_len;
  2387. header->addr += header->dev_path_len;
  2388. }
  2389. pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
  2390. __func__, ub->ub_number, cmd->cmd_op,
  2391. ub->dev_info.owner_uid, ub->dev_info.owner_gid,
  2392. dev_path, ret);
  2393. exit:
  2394. kfree(dev_path);
  2395. return ret;
  2396. }
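/*
 * Top-level handler for /dev/ublk-control commands: validate the opcode,
 * resolve the target device (except for ADD_DEV), check permission, and
 * dispatch on _IOC_NR(cmd_op).
 */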
  2397. static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
  2398. unsigned int issue_flags)
  2399. {
  2400. const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
  2401. struct ublk_device *ub = NULL;
  2402. u32 cmd_op = cmd->cmd_op;
  2403. int ret = -EINVAL;
  2404. if (issue_flags & IO_URING_F_NONBLOCK)
  2405. return -EAGAIN;
  2406. ublk_ctrl_cmd_dump(cmd);
  2407. if (!(issue_flags & IO_URING_F_SQE128))
  2408. goto out;
  2409. ret = ublk_check_cmd_op(cmd_op);
  2410. if (ret)
  2411. goto out;
  2412. if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
  2413. ret = ublk_ctrl_get_features(cmd);
  2414. goto out;
  2415. }
  2416. if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
  2417. ret = -ENODEV;
  2418. ub = ublk_get_device_from_id(header->dev_id);
  2419. if (!ub)
  2420. goto out;
  2421. ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
  2422. if (ret)
  2423. goto put_dev;
  2424. }
  2425. switch (_IOC_NR(cmd_op)) {
  2426. case UBLK_CMD_START_DEV:
  2427. ret = ublk_ctrl_start_dev(ub, cmd);
  2428. break;
  2429. case UBLK_CMD_STOP_DEV:
  2430. ret = ublk_ctrl_stop_dev(ub);
  2431. break;
  2432. case UBLK_CMD_GET_DEV_INFO:
  2433. case UBLK_CMD_GET_DEV_INFO2:
  2434. ret = ublk_ctrl_get_dev_info(ub, cmd);
  2435. break;
  2436. case UBLK_CMD_ADD_DEV:
  2437. ret = ublk_ctrl_add_dev(cmd);
  2438. break;
  2439. case UBLK_CMD_DEL_DEV:
  2440. ret = ublk_ctrl_del_dev(&ub, true);
  2441. break;
  2442. case UBLK_CMD_DEL_DEV_ASYNC:
  2443. ret = ublk_ctrl_del_dev(&ub, false);
  2444. break;
  2445. case UBLK_CMD_GET_QUEUE_AFFINITY:
  2446. ret = ublk_ctrl_get_queue_affinity(ub, cmd);
  2447. break;
  2448. case UBLK_CMD_GET_PARAMS:
  2449. ret = ublk_ctrl_get_params(ub, cmd);
  2450. break;
  2451. case UBLK_CMD_SET_PARAMS:
  2452. ret = ublk_ctrl_set_params(ub, cmd);
  2453. break;
  2454. case UBLK_CMD_START_USER_RECOVERY:
  2455. ret = ublk_ctrl_start_recovery(ub, cmd);
  2456. break;
  2457. case UBLK_CMD_END_USER_RECOVERY:
  2458. ret = ublk_ctrl_end_recovery(ub, cmd);
  2459. break;
  2460. default:
  2461. ret = -EOPNOTSUPP;
  2462. break;
  2463. }
  2464. put_dev:
  2465. if (ub)
  2466. ublk_put_device(ub);
  2467. out:
  2468. io_uring_cmd_done(cmd, ret, 0, issue_flags);
  2469. pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
  2470. __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
  2471. return -EIOCBQUEUED;
  2472. }
  2473. static const struct file_operations ublk_ctl_fops = {
  2474. .open = nonseekable_open,
  2475. .uring_cmd = ublk_ctrl_uring_cmd,
  2476. .owner = THIS_MODULE,
  2477. .llseek = noop_llseek,
  2478. };
  2479. static struct miscdevice ublk_misc = {
  2480. .minor = MISC_DYNAMIC_MINOR,
  2481. .name = "ublk-control",
  2482. .fops = &ublk_ctl_fops,
  2483. };
  2484. static int __init ublk_init(void)
  2485. {
  2486. int ret;
  2487. BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
  2488. UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
  2489. init_waitqueue_head(&ublk_idr_wq);
  2490. ret = misc_register(&ublk_misc);
  2491. if (ret)
  2492. return ret;
  2493. ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
  2494. if (ret)
  2495. goto unregister_mis;
  2496. ret = class_register(&ublk_chr_class);
  2497. if (ret)
  2498. goto free_chrdev_region;
  2499. return 0;
  2500. free_chrdev_region:
  2501. unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
  2502. unregister_mis:
  2503. misc_deregister(&ublk_misc);
  2504. return ret;
  2505. }
  2506. static void __exit ublk_exit(void)
  2507. {
  2508. struct ublk_device *ub;
  2509. int id;
  2510. idr_for_each_entry(&ublk_index_idr, ub, id)
  2511. ublk_remove(ub);
  2512. class_unregister(&ublk_chr_class);
  2513. misc_deregister(&ublk_misc);
  2514. idr_destroy(&ublk_index_idr);
  2515. unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
  2516. }
  2517. module_init(ublk_init);
  2518. module_exit(ublk_exit);
  2519. static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
  2520. {
  2521. return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
  2522. }
  2523. static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
  2524. {
  2525. return sysfs_emit(buf, "%u\n", ublks_max);
  2526. }
  2527. static const struct kernel_param_ops ublk_max_ublks_ops = {
  2528. .set = ublk_set_max_ublks,
  2529. .get = ublk_get_max_ublks,
  2530. };
  2531. module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
2532. MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add (default: 64)");
  2533. MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
  2534. MODULE_DESCRIPTION("Userspace block device");
  2535. MODULE_LICENSE("GPL");