- // SPDX-License-Identifier: GPL-2.0-or-later
- /*
- * Userspace block device - a block device whose IO is handled from userspace
- *
- * Makes full use of the io_uring passthrough command for communicating with
- * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
- *
- * Copyright 2022 Ming Lei <ming.lei@redhat.com>
- *
- * (part of code stolen from loop.c)
- */
- #include <linux/module.h>
- #include <linux/moduleparam.h>
- #include <linux/sched.h>
- #include <linux/fs.h>
- #include <linux/pagemap.h>
- #include <linux/file.h>
- #include <linux/stat.h>
- #include <linux/errno.h>
- #include <linux/major.h>
- #include <linux/wait.h>
- #include <linux/blkdev.h>
- #include <linux/init.h>
- #include <linux/swap.h>
- #include <linux/slab.h>
- #include <linux/compat.h>
- #include <linux/mutex.h>
- #include <linux/writeback.h>
- #include <linux/completion.h>
- #include <linux/highmem.h>
- #include <linux/sysfs.h>
- #include <linux/miscdevice.h>
- #include <linux/falloc.h>
- #include <linux/uio.h>
- #include <linux/ioprio.h>
- #include <linux/sched/mm.h>
- #include <linux/uaccess.h>
- #include <linux/cdev.h>
- #include <linux/io_uring/cmd.h>
- #include <linux/blk-mq.h>
- #include <linux/delay.h>
- #include <linux/mm.h>
- #include <asm/page.h>
- #include <linux/task_work.h>
- #include <linux/namei.h>
- #include <linux/kref.h>
- #include <uapi/linux/ublk_cmd.h>
- #define UBLK_MINORS (1U << MINORBITS)
- /* private ioctl command mirror */
- #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
- /* All UBLK_F_* have to be included into UBLK_F_ALL */
- #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
- | UBLK_F_URING_CMD_COMP_IN_TASK \
- | UBLK_F_NEED_GET_DATA \
- | UBLK_F_USER_RECOVERY \
- | UBLK_F_USER_RECOVERY_REISSUE \
- | UBLK_F_UNPRIVILEGED_DEV \
- | UBLK_F_CMD_IOCTL_ENCODE \
- | UBLK_F_USER_COPY \
- | UBLK_F_ZONED)
- /* All UBLK_PARAM_TYPE_* should be included here */
- #define UBLK_PARAM_TYPE_ALL \
- (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
- UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
- struct ublk_rq_data {
- struct llist_node node;
- struct kref ref;
- };
- struct ublk_uring_cmd_pdu {
- struct ublk_queue *ubq;
- u16 tag;
- };
- /*
- * io command is active: the sqe cmd has been received, and its cqe isn't done
- *
- * If the flag is set, the io command is owned by the ublk driver and is
- * waiting for an incoming blk-mq request from the ublk block device.
- *
- * If the flag is cleared, the io command has been completed and is owned by
- * the ublk server.
- */
- #define UBLK_IO_FLAG_ACTIVE 0x01
- /*
- * The IO command has been completed via cqe and is being handled by ublksrv,
- * but not committed yet.
- *
- * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
- * for cross verification.
- */
- #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
- /*
- * The IO command has been aborted, so this flag is only set when
- * UBLK_IO_FLAG_ACTIVE is cleared.
- *
- * After this flag is observed, any pending or newly incoming request
- * associated with this io command is failed immediately.
- */
- #define UBLK_IO_FLAG_ABORTED 0x04
- /*
- * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command needs to get the
- * data buffer address from ublksrv.
- *
- * Then bio data can be copied into this data buffer for a WRITE request
- * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
- */
- #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
- /* atomic RW with ubq->cancel_lock */
- #define UBLK_IO_FLAG_CANCELED 0x80000000
- struct ublk_io {
- /* userspace buffer address from io cmd */
- __u64 addr;
- unsigned int flags;
- int res;
- struct io_uring_cmd *cmd;
- };
- struct ublk_queue {
- int q_id;
- int q_depth;
- unsigned long flags;
- struct task_struct *ubq_daemon;
- char *io_cmd_buf;
- struct llist_head io_cmds;
- unsigned long io_addr; /* mapped vm address */
- unsigned int max_io_sz;
- bool force_abort;
- bool timeout;
- bool canceling;
- unsigned short nr_io_ready; /* how many ios setup */
- spinlock_t cancel_lock;
- struct ublk_device *dev;
- struct ublk_io ios[];
- };
- struct ublk_device {
- struct gendisk *ub_disk;
- char *__queues;
- unsigned int queue_size;
- struct ublksrv_ctrl_dev_info dev_info;
- struct blk_mq_tag_set tag_set;
- struct cdev cdev;
- struct device cdev_dev;
- #define UB_STATE_OPEN 0
- #define UB_STATE_USED 1
- #define UB_STATE_DELETED 2
- unsigned long state;
- int ub_number;
- struct mutex mutex;
- spinlock_t lock;
- struct mm_struct *mm;
- struct ublk_params params;
- struct completion completion;
- unsigned int nr_queues_ready;
- unsigned int nr_privileged_daemon;
- struct work_struct quiesce_work;
- struct work_struct stop_work;
- };
- /* header of ublk_params */
- struct ublk_params_header {
- __u32 len;
- __u32 types;
- };
- static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
- static inline unsigned int ublk_req_build_flags(struct request *req);
- static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag);
- static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
- {
- return ub->dev_info.flags & UBLK_F_USER_COPY;
- }
- static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
- {
- return ub->dev_info.flags & UBLK_F_ZONED;
- }
- static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
- {
- return ubq->flags & UBLK_F_ZONED;
- }
- #ifdef CONFIG_BLK_DEV_ZONED
- struct ublk_zoned_report_desc {
- __u64 sector;
- __u32 operation;
- __u32 nr_zones;
- };
- static DEFINE_XARRAY(ublk_zoned_report_descs);
- static int ublk_zoned_insert_report_desc(const struct request *req,
- struct ublk_zoned_report_desc *desc)
- {
- return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
- desc, GFP_KERNEL);
- }
- static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
- const struct request *req)
- {
- return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
- }
- static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
- const struct request *req)
- {
- return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
- }
- static int ublk_get_nr_zones(const struct ublk_device *ub)
- {
- const struct ublk_param_basic *p = &ub->params.basic;
- /* Zone size is a power of 2 */
- return p->dev_sectors >> ilog2(p->chunk_sectors);
- }
- static int ublk_revalidate_disk_zones(struct ublk_device *ub)
- {
- return blk_revalidate_disk_zones(ub->ub_disk);
- }
- static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
- {
- const struct ublk_param_zoned *p = &ub->params.zoned;
- int nr_zones;
- if (!ublk_dev_is_zoned(ub))
- return -EINVAL;
- if (!p->max_zone_append_sectors)
- return -EINVAL;
- nr_zones = ublk_get_nr_zones(ub);
- if (p->max_active_zones > nr_zones)
- return -EINVAL;
- if (p->max_open_zones > nr_zones)
- return -EINVAL;
- return 0;
- }
- static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
- {
- ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
- }
- /* Based on virtblk_alloc_report_buffer */
- static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
- unsigned int nr_zones, size_t *buflen)
- {
- struct request_queue *q = ublk->ub_disk->queue;
- size_t bufsize;
- void *buf;
- nr_zones = min_t(unsigned int, nr_zones,
- ublk->ub_disk->nr_zones);
- bufsize = nr_zones * sizeof(struct blk_zone);
- bufsize =
- min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
- while (bufsize >= sizeof(struct blk_zone)) {
- buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
- if (buf) {
- *buflen = bufsize;
- return buf;
- }
- bufsize >>= 1;
- }
- *buflen = 0;
- return NULL;
- }
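- /*
- * Report zones by issuing REQ_OP_DRV_IN passthrough requests to the ublk
- * server. The per-request ublk_zoned_report_desc is kept in an xarray keyed
- * by the request pointer so that ublk_setup_iod_zoned() can recover the
- * REPORT_ZONES parameters. A zero-length zone in the returned buffer ends
- * the response early.
- */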
- static int ublk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
- {
- struct ublk_device *ub = disk->private_data;
- unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
- unsigned int first_zone = sector >> ilog2(zone_size_sectors);
- unsigned int done_zones = 0;
- unsigned int max_zones_per_request;
- int ret;
- struct blk_zone *buffer;
- size_t buffer_length;
- nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
- nr_zones);
- buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
- if (!buffer)
- return -ENOMEM;
- max_zones_per_request = buffer_length / sizeof(struct blk_zone);
- while (done_zones < nr_zones) {
- unsigned int remaining_zones = nr_zones - done_zones;
- unsigned int zones_in_request =
- min_t(unsigned int, remaining_zones, max_zones_per_request);
- struct request *req;
- struct ublk_zoned_report_desc desc;
- blk_status_t status;
- memset(buffer, 0, buffer_length);
- req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
- goto out;
- }
- desc.operation = UBLK_IO_OP_REPORT_ZONES;
- desc.sector = sector;
- desc.nr_zones = zones_in_request;
- ret = ublk_zoned_insert_report_desc(req, &desc);
- if (ret)
- goto free_req;
- ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
- GFP_KERNEL);
- if (ret)
- goto erase_desc;
- status = blk_execute_rq(req, 0);
- ret = blk_status_to_errno(status);
- erase_desc:
- ublk_zoned_erase_report_desc(req);
- free_req:
- blk_mq_free_request(req);
- if (ret)
- goto out;
- for (unsigned int i = 0; i < zones_in_request; i++) {
- struct blk_zone *zone = buffer + i;
- /* A zero length zone means no more zones in this response */
- if (!zone->len)
- break;
- ret = cb(zone, i, data);
- if (ret)
- goto out;
- done_zones++;
- sector += zone_size_sectors;
- }
- }
- ret = done_zones;
- out:
- kvfree(buffer);
- return ret;
- }
- static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
- struct request *req)
- {
- struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
- struct ublk_io *io = &ubq->ios[req->tag];
- struct ublk_zoned_report_desc *desc;
- u32 ublk_op;
- switch (req_op(req)) {
- case REQ_OP_ZONE_OPEN:
- ublk_op = UBLK_IO_OP_ZONE_OPEN;
- break;
- case REQ_OP_ZONE_CLOSE:
- ublk_op = UBLK_IO_OP_ZONE_CLOSE;
- break;
- case REQ_OP_ZONE_FINISH:
- ublk_op = UBLK_IO_OP_ZONE_FINISH;
- break;
- case REQ_OP_ZONE_RESET:
- ublk_op = UBLK_IO_OP_ZONE_RESET;
- break;
- case REQ_OP_ZONE_APPEND:
- ublk_op = UBLK_IO_OP_ZONE_APPEND;
- break;
- case REQ_OP_ZONE_RESET_ALL:
- ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
- break;
- case REQ_OP_DRV_IN:
- desc = ublk_zoned_get_report_desc(req);
- if (!desc)
- return BLK_STS_IOERR;
- ublk_op = desc->operation;
- switch (ublk_op) {
- case UBLK_IO_OP_REPORT_ZONES:
- iod->op_flags = ublk_op | ublk_req_build_flags(req);
- iod->nr_zones = desc->nr_zones;
- iod->start_sector = desc->sector;
- return BLK_STS_OK;
- default:
- return BLK_STS_IOERR;
- }
- case REQ_OP_DRV_OUT:
- /* We do not support drv_out */
- return BLK_STS_NOTSUPP;
- default:
- return BLK_STS_IOERR;
- }
- iod->op_flags = ublk_op | ublk_req_build_flags(req);
- iod->nr_sectors = blk_rq_sectors(req);
- iod->start_sector = blk_rq_pos(req);
- iod->addr = io->addr;
- return BLK_STS_OK;
- }
- #else
- #define ublk_report_zones (NULL)
- static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
- {
- return -EOPNOTSUPP;
- }
- static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
- {
- }
- static int ublk_revalidate_disk_zones(struct ublk_device *ub)
- {
- return 0;
- }
- static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
- struct request *req)
- {
- return BLK_STS_NOTSUPP;
- }
- #endif
- static inline void __ublk_complete_rq(struct request *req);
- static void ublk_complete_rq(struct kref *ref);
- static dev_t ublk_chr_devt;
- static const struct class ublk_chr_class = {
- .name = "ublk-char",
- };
- static DEFINE_IDR(ublk_index_idr);
- static DEFINE_SPINLOCK(ublk_idr_lock);
- static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
- static DEFINE_MUTEX(ublk_ctl_mutex);
- /*
- * Max number of ublk devices allowed to be added
- *
- * It can be extended to a per-user limit in the future, or even controlled
- * by cgroup.
- */
- #define UBLK_MAX_UBLKS UBLK_MINORS
- static unsigned int ublks_max = 64;
- static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
- static struct miscdevice ublk_misc;
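- /*
- * The user-copy interface encodes the target io in the pseudo file offset:
- * offsets start at UBLKSRV_IO_BUF_OFFSET, the queue id and tag live in the
- * bits selected by UBLK_QID_OFF/UBLK_TAG_OFF, and the low bits give the
- * byte offset inside that io's buffer.
- */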
- static inline unsigned ublk_pos_to_hwq(loff_t pos)
- {
- return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
- UBLK_QID_BITS_MASK;
- }
- static inline unsigned ublk_pos_to_buf_off(loff_t pos)
- {
- return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
- }
- static inline unsigned ublk_pos_to_tag(loff_t pos)
- {
- return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
- UBLK_TAG_BITS_MASK;
- }
- static void ublk_dev_param_basic_apply(struct ublk_device *ub)
- {
- const struct ublk_param_basic *p = &ub->params.basic;
- if (p->attrs & UBLK_ATTR_READ_ONLY)
- set_disk_ro(ub->ub_disk, true);
- set_capacity(ub->ub_disk, p->dev_sectors);
- }
- static int ublk_validate_params(const struct ublk_device *ub)
- {
- /* basic param is the only one which must be set */
- if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
- const struct ublk_param_basic *p = &ub->params.basic;
- if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
- return -EINVAL;
- if (p->logical_bs_shift > p->physical_bs_shift)
- return -EINVAL;
- if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
- return -EINVAL;
- if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
- return -EINVAL;
- } else
- return -EINVAL;
- if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
- const struct ublk_param_discard *p = &ub->params.discard;
- /* So far, only support single segment discard */
- if (p->max_discard_sectors && p->max_discard_segments != 1)
- return -EINVAL;
- if (!p->discard_granularity)
- return -EINVAL;
- }
- /* dev_t is read-only */
- if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
- return -EINVAL;
- if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
- return ublk_dev_param_zoned_validate(ub);
- else if (ublk_dev_is_zoned(ub))
- return -EINVAL;
- return 0;
- }
- static void ublk_apply_params(struct ublk_device *ub)
- {
- ublk_dev_param_basic_apply(ub);
- if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
- ublk_dev_param_zoned_apply(ub);
- }
- static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
- {
- return ubq->flags & UBLK_F_USER_COPY;
- }
- static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
- {
- /*
- * read()/write() is involved in user copy, so request reference
- * has to be grabbed
- */
- return ublk_support_user_copy(ubq);
- }
- static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
- struct request *req)
- {
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_init(&data->ref);
- }
- }
- static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
- struct request *req)
- {
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- return kref_get_unless_zero(&data->ref);
- }
- return true;
- }
- static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
- struct request *req)
- {
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_put(&data->ref, ublk_complete_rq);
- } else {
- __ublk_complete_rq(req);
- }
- }
- static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
- {
- return ubq->flags & UBLK_F_NEED_GET_DATA;
- }
- /* Called in slow path only, keep it noinline for trace purpose */
- static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
- {
- if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
- return ub;
- return NULL;
- }
- /* Called in slow path only, keep it noinline for trace purpose */
- static noinline void ublk_put_device(struct ublk_device *ub)
- {
- put_device(&ub->cdev_dev);
- }
- static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
- int qid)
- {
- return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
- }
- static inline bool ublk_rq_has_data(const struct request *rq)
- {
- return bio_has_data(rq->bio);
- }
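- /*
- * io_cmd_buf is a flat array of ublksrv_io_desc entries indexed by tag; it
- * is the per-queue buffer that ublk_ch_mmap() exposes to the ublk server.
- */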
- static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag)
- {
- return (struct ublksrv_io_desc *)
- &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
- }
- static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
- {
- return ublk_get_queue(ub, q_id)->io_cmd_buf;
- }
- static inline int __ublk_queue_cmd_buf_size(int depth)
- {
- return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
- }
- static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
- {
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- return __ublk_queue_cmd_buf_size(ubq->q_depth);
- }
- static int ublk_max_cmd_buf_size(void)
- {
- return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
- }
- static inline bool ublk_queue_can_use_recovery_reissue(
- struct ublk_queue *ubq)
- {
- return (ubq->flags & UBLK_F_USER_RECOVERY) &&
- (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
- }
- static inline bool ublk_queue_can_use_recovery(
- struct ublk_queue *ubq)
- {
- return ubq->flags & UBLK_F_USER_RECOVERY;
- }
- static inline bool ublk_can_use_recovery(struct ublk_device *ub)
- {
- return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
- }
- static void ublk_free_disk(struct gendisk *disk)
- {
- struct ublk_device *ub = disk->private_data;
- clear_bit(UB_STATE_USED, &ub->state);
- ublk_put_device(ub);
- }
- static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
- unsigned int *owner_gid)
- {
- kuid_t uid;
- kgid_t gid;
- current_uid_gid(&uid, &gid);
- *owner_uid = from_kuid(&init_user_ns, uid);
- *owner_gid = from_kgid(&init_user_ns, gid);
- }
- static int ublk_open(struct gendisk *disk, blk_mode_t mode)
- {
- struct ublk_device *ub = disk->private_data;
- if (capable(CAP_SYS_ADMIN))
- return 0;
- /*
- * For an unprivileged device, only the owner can open the disk.
- * Otherwise it could be a trap set up by a malicious user who
- * deliberately grants this disk's privileges to other users.
- *
- * This restriction is also reasonable given that anyone can create
- * an unprivileged device without needing anyone else's grant.
- */
- if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
- unsigned int curr_uid, curr_gid;
- ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
- if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
- ub->dev_info.owner_gid)
- return -EPERM;
- }
- return 0;
- }
- static const struct block_device_operations ub_fops = {
- .owner = THIS_MODULE,
- .open = ublk_open,
- .free_disk = ublk_free_disk,
- .report_zones = ublk_report_zones,
- };
- #define UBLK_MAX_PIN_PAGES 32
- struct ublk_io_iter {
- struct page *pages[UBLK_MAX_PIN_PAGES];
- struct bio *bio;
- struct bvec_iter iter;
- };
- /* copy 'total' bytes between the pinned pages and the request's bio vectors */
- static void ublk_copy_io_pages(struct ublk_io_iter *data,
- size_t total, size_t pg_off, int dir)
- {
- unsigned done = 0;
- unsigned pg_idx = 0;
- while (done < total) {
- struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
- (unsigned)(PAGE_SIZE - pg_off));
- void *bv_buf = bvec_kmap_local(&bv);
- void *pg_buf = kmap_local_page(data->pages[pg_idx]);
- if (dir == ITER_DEST)
- memcpy(pg_buf + pg_off, bv_buf, bytes);
- else
- memcpy(bv_buf, pg_buf + pg_off, bytes);
- kunmap_local(pg_buf);
- kunmap_local(bv_buf);
- /* advance page array */
- pg_off += bytes;
- if (pg_off == PAGE_SIZE) {
- pg_idx += 1;
- pg_off = 0;
- }
- done += bytes;
- /* advance bio */
- bio_advance_iter_single(data->bio, &data->iter, bytes);
- if (!data->iter.bi_size) {
- data->bio = data->bio->bi_next;
- if (data->bio == NULL)
- break;
- data->iter = data->bio->bi_iter;
- }
- }
- }
- static bool ublk_advance_io_iter(const struct request *req,
- struct ublk_io_iter *iter, unsigned int offset)
- {
- struct bio *bio = req->bio;
- for_each_bio(bio) {
- if (bio->bi_iter.bi_size > offset) {
- iter->bio = bio;
- iter->iter = bio->bi_iter;
- bio_advance_iter(iter->bio, &iter->iter, offset);
- return true;
- }
- offset -= bio->bi_iter.bi_size;
- }
- return false;
- }
- /*
- * Copy data between request pages and io_iter, where 'offset' is the
- * starting linear byte offset within the request.
- */
- static size_t ublk_copy_user_pages(const struct request *req,
- unsigned offset, struct iov_iter *uiter, int dir)
- {
- struct ublk_io_iter iter;
- size_t done = 0;
- if (!ublk_advance_io_iter(req, &iter, offset))
- return 0;
- while (iov_iter_count(uiter) && iter.bio) {
- unsigned nr_pages;
- ssize_t len;
- size_t off;
- int i;
- len = iov_iter_get_pages2(uiter, iter.pages,
- iov_iter_count(uiter),
- UBLK_MAX_PIN_PAGES, &off);
- if (len <= 0)
- return done;
- ublk_copy_io_pages(&iter, len, off, dir);
- nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
- for (i = 0; i < nr_pages; i++) {
- if (dir == ITER_DEST)
- set_page_dirty(iter.pages[i]);
- put_page(iter.pages[i]);
- }
- done += len;
- }
- return done;
- }
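- /*
- * "map" copies WRITE payload from the request pages into the server buffer
- * at io->addr before the io command is delivered; "unmap" copies READ (and
- * DRV_IN) payload from the server buffer back into the request pages on
- * completion. With user copy enabled, the server moves data itself via
- * read()/write() on the char device, so neither path copies anything.
- */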
- static inline bool ublk_need_map_req(const struct request *req)
- {
- return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
- }
- static inline bool ublk_need_unmap_req(const struct request *req)
- {
- return ublk_rq_has_data(req) &&
- (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
- }
- static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
- struct ublk_io *io)
- {
- const unsigned int rq_bytes = blk_rq_bytes(req);
- if (ublk_support_user_copy(ubq))
- return rq_bytes;
- /*
- * No zero copy: we delay copying WRITE request data into the ublksrv
- * context, and the big benefit is that pinning pages in the current
- * context is pretty fast, see ublk_copy_user_pages
- */
- if (ublk_need_map_req(req)) {
- struct iov_iter iter;
- const int dir = ITER_DEST;
- import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
- return ublk_copy_user_pages(req, 0, &iter, dir);
- }
- return rq_bytes;
- }
- static int ublk_unmap_io(const struct ublk_queue *ubq,
- const struct request *req,
- struct ublk_io *io)
- {
- const unsigned int rq_bytes = blk_rq_bytes(req);
- if (ublk_support_user_copy(ubq))
- return rq_bytes;
- if (ublk_need_unmap_req(req)) {
- struct iov_iter iter;
- const int dir = ITER_SOURCE;
- WARN_ON_ONCE(io->res > rq_bytes);
- import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
- return ublk_copy_user_pages(req, 0, &iter, dir);
- }
- return rq_bytes;
- }
- static inline unsigned int ublk_req_build_flags(struct request *req)
- {
- unsigned flags = 0;
- if (req->cmd_flags & REQ_FAILFAST_DEV)
- flags |= UBLK_IO_F_FAILFAST_DEV;
- if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
- flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
- if (req->cmd_flags & REQ_FAILFAST_DRIVER)
- flags |= UBLK_IO_F_FAILFAST_DRIVER;
- if (req->cmd_flags & REQ_META)
- flags |= UBLK_IO_F_META;
- if (req->cmd_flags & REQ_FUA)
- flags |= UBLK_IO_F_FUA;
- if (req->cmd_flags & REQ_NOUNMAP)
- flags |= UBLK_IO_F_NOUNMAP;
- if (req->cmd_flags & REQ_SWAP)
- flags |= UBLK_IO_F_SWAP;
- return flags;
- }
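- /*
- * Fill the ublksrv_io_desc slot for this request: translate the blk-mq
- * operation into a UBLK_IO_OP_* code, record the sector range and server
- * buffer address, and hand zone management ops off to the zoned variant.
- */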
- static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
- {
- struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
- struct ublk_io *io = &ubq->ios[req->tag];
- enum req_op op = req_op(req);
- u32 ublk_op;
- if (!ublk_queue_is_zoned(ubq) &&
- (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
- return BLK_STS_IOERR;
- switch (req_op(req)) {
- case REQ_OP_READ:
- ublk_op = UBLK_IO_OP_READ;
- break;
- case REQ_OP_WRITE:
- ublk_op = UBLK_IO_OP_WRITE;
- break;
- case REQ_OP_FLUSH:
- ublk_op = UBLK_IO_OP_FLUSH;
- break;
- case REQ_OP_DISCARD:
- ublk_op = UBLK_IO_OP_DISCARD;
- break;
- case REQ_OP_WRITE_ZEROES:
- ublk_op = UBLK_IO_OP_WRITE_ZEROES;
- break;
- default:
- if (ublk_queue_is_zoned(ubq))
- return ublk_setup_iod_zoned(ubq, req);
- return BLK_STS_IOERR;
- }
- /* translate to ublk opcodes since the kernel's op values may change */
- iod->op_flags = ublk_op | ublk_req_build_flags(req);
- iod->nr_sectors = blk_rq_sectors(req);
- iod->start_sector = blk_rq_pos(req);
- iod->addr = io->addr;
- return BLK_STS_OK;
- }
- static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
- struct io_uring_cmd *ioucmd)
- {
- return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
- }
- static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
- {
- return ubq->ubq_daemon->flags & PF_EXITING;
- }
- /* todo: handle partial completion */
- static inline void __ublk_complete_rq(struct request *req)
- {
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- struct ublk_io *io = &ubq->ios[req->tag];
- unsigned int unmapped_bytes;
- blk_status_t res = BLK_STS_OK;
- /* called from ublk_abort_queue() code path */
- if (io->flags & UBLK_IO_FLAG_ABORTED) {
- res = BLK_STS_IOERR;
- goto exit;
- }
- /* fail READ IO if nothing was read */
- if (!io->res && req_op(req) == REQ_OP_READ)
- io->res = -EIO;
- if (io->res < 0) {
- res = errno_to_blk_status(io->res);
- goto exit;
- }
- /*
- * FLUSH, DISCARD or WRITE_ZEROES usually won't return a byte count, so end
- * them directly.
- *
- * None of them needs unmapping either.
- */
- if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
- req_op(req) != REQ_OP_DRV_IN)
- goto exit;
- /* for READ request, writing data in iod->addr to rq buffers */
- unmapped_bytes = ublk_unmap_io(ubq, req, io);
- /*
- * Extremely unlikely since the data was just filled in above.
- *
- * Clamp io->res simply for this unlikely case.
- */
- if (unlikely(unmapped_bytes < io->res))
- io->res = unmapped_bytes;
- if (blk_update_request(req, BLK_STS_OK, io->res))
- blk_mq_requeue_request(req, true);
- else
- __blk_mq_end_request(req, BLK_STS_OK);
- return;
- exit:
- blk_mq_end_request(req, res);
- }
- static void ublk_complete_rq(struct kref *ref)
- {
- struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
- ref);
- struct request *req = blk_mq_rq_from_pdu(data);
- __ublk_complete_rq(req);
- }
- /*
- * Since __ublk_rq_task_work always fails requests immediately during
- * exiting, __ublk_fail_req() is only called from the abort context during
- * exiting, so no lock is needed.
- *
- * Also, aborting may not have started yet; keep in mind that a failed
- * request may be issued by the block layer again.
- */
- static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
- struct request *req)
- {
- WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
- if (ublk_queue_can_use_recovery_reissue(ubq))
- blk_mq_requeue_request(req, false);
- else
- ublk_put_req_ref(ubq, req);
- }
- static void ubq_complete_io_cmd(struct ublk_io *io, int res,
- unsigned issue_flags)
- {
- /* mark this cmd owned by ublksrv */
- io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
- /*
- * Clear ACTIVE since we are done with this sqe/cmd slot.
- * We can only accept an io cmd when it is not active.
- */
- io->flags &= ~UBLK_IO_FLAG_ACTIVE;
- /* tell ublksrv one io request is coming */
- io_uring_cmd_done(io->cmd, res, 0, issue_flags);
- }
- #define UBLK_REQUEUE_DELAY_MS 3
- static inline void __ublk_abort_rq(struct ublk_queue *ubq,
- struct request *rq)
- {
- /* We cannot process this rq, so requeue it (with recovery) or fail it. */
- if (ublk_queue_can_use_recovery(ubq))
- blk_mq_requeue_request(rq, false);
- else
- blk_mq_end_request(rq, BLK_STS_IOERR);
- }
- static inline void __ublk_rq_task_work(struct request *req,
- unsigned issue_flags)
- {
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- int tag = req->tag;
- struct ublk_io *io = &ubq->ios[tag];
- unsigned int mapped_bytes;
- pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
- ublk_get_iod(ubq, req->tag)->addr);
- /*
- * Task is exiting if either:
- *
- * (1) current != ubq_daemon.
- * io_uring_cmd_complete_in_task() tries to run task_work
- * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
- *
- * (2) current->flags & PF_EXITING.
- */
- if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
- __ublk_abort_rq(ubq, req);
- return;
- }
- if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
- /*
- * We have not handled the UBLK_IO_NEED_GET_DATA command yet,
- * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
- * and notify it.
- */
- if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
- io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
- pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
- __func__, io->cmd->cmd_op, ubq->q_id,
- req->tag, io->flags);
- ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
- return;
- }
- /*
- * We have handled UBLK_IO_NEED_GET_DATA command,
- * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
- * do the copy work.
- */
- io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
- /* update iod->addr because ublksrv may have passed a new io buffer */
- ublk_get_iod(ubq, req->tag)->addr = io->addr;
- pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
- ublk_get_iod(ubq, req->tag)->addr);
- }
- mapped_bytes = ublk_map_io(ubq, req, io);
- /* partially mapped, update io descriptor */
- if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
- /*
- * Nothing mapped, retry until we succeed.
- *
- * We may never succeed in mapping any bytes here because
- * of OOM. TODO: reserve one buffer with single page pinned
- * for providing forward progress guarantee.
- */
- if (unlikely(!mapped_bytes)) {
- blk_mq_requeue_request(req, false);
- blk_mq_delay_kick_requeue_list(req->q,
- UBLK_REQUEUE_DELAY_MS);
- return;
- }
- ublk_get_iod(ubq, req->tag)->nr_sectors =
- mapped_bytes >> 9;
- }
- ublk_init_req_ref(ubq, req);
- ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
- }
- static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
- unsigned issue_flags)
- {
- struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
- struct ublk_rq_data *data, *tmp;
- io_cmds = llist_reverse_order(io_cmds);
- llist_for_each_entry_safe(data, tmp, io_cmds, node)
- __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
- }
- static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
- {
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- struct ublk_queue *ubq = pdu->ubq;
- ublk_forward_io_cmds(ubq, issue_flags);
- }
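- /*
- * Add the request to the queue's lockless io_cmds list. Only the adder that
- * finds the list empty schedules task work via io_uring_cmd_complete_in_task();
- * later additions are batched and drained together in ublk_forward_io_cmds().
- */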
- static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
- {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
- if (llist_add(&data->node, &ubq->io_cmds)) {
- struct ublk_io *io = &ubq->ios[rq->tag];
- io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
- }
- }
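- /*
- * Timeout handling: for unprivileged devices the daemon is killed once;
- * otherwise the timer is reset unless the daemon is dying while every io
- * command is owned by the server, in which case the requests are aborted
- * and the quiesce/stop work is scheduled.
- */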
- static enum blk_eh_timer_return ublk_timeout(struct request *rq)
- {
- struct ublk_queue *ubq = rq->mq_hctx->driver_data;
- unsigned int nr_inflight = 0;
- int i;
- if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
- if (!ubq->timeout) {
- send_sig(SIGKILL, ubq->ubq_daemon, 0);
- ubq->timeout = true;
- }
- return BLK_EH_DONE;
- }
- if (!ubq_daemon_is_dying(ubq))
- return BLK_EH_RESET_TIMER;
- for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
- nr_inflight++;
- }
- /* cancelable uring_cmd can't help us if all commands are in-flight */
- if (nr_inflight == ubq->q_depth) {
- struct ublk_device *ub = ubq->dev;
- if (ublk_abort_requests(ub, ubq)) {
- if (ublk_can_use_recovery(ub))
- schedule_work(&ub->quiesce_work);
- else
- schedule_work(&ub->stop_work);
- }
- return BLK_EH_DONE;
- }
- return BLK_EH_RESET_TIMER;
- }
- static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
- {
- struct ublk_queue *ubq = hctx->driver_data;
- struct request *rq = bd->rq;
- blk_status_t res;
- /* fill iod to slot in io cmd buffer */
- res = ublk_setup_iod(ubq, rq);
- if (unlikely(res != BLK_STS_OK))
- return BLK_STS_IOERR;
- /* With the recovery feature enabled, force_abort is set in
- * ublk_stop_dev() before calling del_gendisk(). We have to
- * abort all requeued and new rqs here to let del_gendisk()
- * move on. Besides, we cannot call io_uring_cmd_complete_in_task(),
- * so as to avoid a UAF on the io_uring ctx.
- *
- * Note: force_abort is guaranteed to be seen because it is set
- * before the request queue is unquiesced.
- */
- if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
- return BLK_STS_IOERR;
- if (unlikely(ubq->canceling)) {
- __ublk_abort_rq(ubq, rq);
- return BLK_STS_OK;
- }
- blk_mq_start_request(bd->rq);
- ublk_queue_cmd(ubq, rq);
- return BLK_STS_OK;
- }
- static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
- unsigned int hctx_idx)
- {
- struct ublk_device *ub = driver_data;
- struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
- hctx->driver_data = ubq;
- return 0;
- }
- static const struct blk_mq_ops ublk_mq_ops = {
- .queue_rq = ublk_queue_rq,
- .init_hctx = ublk_init_hctx,
- .timeout = ublk_timeout,
- };
- static int ublk_ch_open(struct inode *inode, struct file *filp)
- {
- struct ublk_device *ub = container_of(inode->i_cdev,
- struct ublk_device, cdev);
- if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
- return -EBUSY;
- filp->private_data = ub;
- return 0;
- }
- static int ublk_ch_release(struct inode *inode, struct file *filp)
- {
- struct ublk_device *ub = filp->private_data;
- clear_bit(UB_STATE_OPEN, &ub->state);
- return 0;
- }
- /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
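- /*
- * vm_pgoff selects the queue: offsets start at UBLKSRV_CMD_BUF_OFFSET and
- * advance by ublk_max_cmd_buf_size() per queue. The mapping must be
- * read-only and exactly the size of that queue's command buffer.
- */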
- static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
- {
- struct ublk_device *ub = filp->private_data;
- size_t sz = vma->vm_end - vma->vm_start;
- unsigned max_sz = ublk_max_cmd_buf_size();
- unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
- int q_id, ret = 0;
- spin_lock(&ub->lock);
- if (!ub->mm)
- ub->mm = current->mm;
- if (current->mm != ub->mm)
- ret = -EINVAL;
- spin_unlock(&ub->lock);
- if (ret)
- return ret;
- if (vma->vm_flags & VM_WRITE)
- return -EPERM;
- end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
- if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
- return -EINVAL;
- q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
- pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
- __func__, q_id, current->pid, vma->vm_start,
- phys_off, (unsigned long)sz);
- if (sz != ublk_queue_cmd_buf_size(ub, q_id))
- return -EINVAL;
- pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
- }
- static void ublk_commit_completion(struct ublk_device *ub,
- const struct ublksrv_io_cmd *ub_cmd)
- {
- u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
- struct ublk_queue *ubq = ublk_get_queue(ub, qid);
- struct ublk_io *io = &ubq->ios[tag];
- struct request *req;
- /* now this cmd slot is owned by the ublk driver */
- io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
- io->res = ub_cmd->result;
- /* find the io request and complete */
- req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
- if (WARN_ON_ONCE(unlikely(!req)))
- return;
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
- }
- /*
- * Called from the ubq_daemon context via the cancel fn; meanwhile the ublk
- * blk-mq queue is quiesced, so we run exclusively with the blk-mq and
- * ubq_daemon contexts and everything is serialized.
- */
- static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
- {
- int i;
- for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
- struct request *rq;
- /*
- * Either we fail the request or ublk_rq_task_work_fn
- * will do it
- */
- rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq && blk_mq_request_started(rq)) {
- io->flags |= UBLK_IO_FLAG_ABORTED;
- __ublk_fail_req(ubq, io, rq);
- }
- }
- }
- }
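- /*
- * Mark the queue as canceling (under cancel_lock), then quiesce the blk-mq
- * queue while holding a reference on the disk, abort the inactive io slots
- * via ublk_abort_queue(), and unquiesce. Returns false if cancel was
- * already in progress or the disk is gone.
- */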
- static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
- {
- struct gendisk *disk;
- spin_lock(&ubq->cancel_lock);
- if (ubq->canceling) {
- spin_unlock(&ubq->cancel_lock);
- return false;
- }
- ubq->canceling = true;
- spin_unlock(&ubq->cancel_lock);
- spin_lock(&ub->lock);
- disk = ub->ub_disk;
- if (disk)
- get_device(disk_to_dev(disk));
- spin_unlock(&ub->lock);
- /* Our disk is already gone */
- if (!disk)
- return false;
- /* Now we are serialized with ublk_queue_rq() */
- blk_mq_quiesce_queue(disk->queue);
- /* abort queue is for making forward progress */
- ublk_abort_queue(ub, ubq);
- blk_mq_unquiesce_queue(disk->queue);
- put_device(disk_to_dev(disk));
- return true;
- }
- static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
- unsigned int issue_flags)
- {
- bool done;
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
- return;
- spin_lock(&ubq->cancel_lock);
- done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
- if (!done)
- io->flags |= UBLK_IO_FLAG_CANCELED;
- spin_unlock(&ubq->cancel_lock);
- if (!done)
- io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
- }
- /*
- * The ublk char device won't be closed when calling cancel fn, so both
- * ublk device and queue are guaranteed to be live
- */
- static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
- {
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- struct ublk_queue *ubq = pdu->ubq;
- struct task_struct *task;
- struct ublk_device *ub;
- bool need_schedule;
- struct ublk_io *io;
- if (WARN_ON_ONCE(!ubq))
- return;
- if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
- return;
- task = io_uring_cmd_get_task(cmd);
- if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
- return;
- ub = ubq->dev;
- need_schedule = ublk_abort_requests(ub, ubq);
- io = &ubq->ios[pdu->tag];
- WARN_ON_ONCE(io->cmd != cmd);
- ublk_cancel_cmd(ubq, io, issue_flags);
- if (need_schedule) {
- if (ublk_can_use_recovery(ub))
- schedule_work(&ub->quiesce_work);
- else
- schedule_work(&ub->stop_work);
- }
- }
- static inline bool ublk_queue_ready(struct ublk_queue *ubq)
- {
- return ubq->nr_io_ready == ubq->q_depth;
- }
- static void ublk_cancel_queue(struct ublk_queue *ubq)
- {
- int i;
- for (i = 0; i < ubq->q_depth; i++)
- ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
- }
- /* Cancel all pending commands, must be called after del_gendisk() returns */
- static void ublk_cancel_dev(struct ublk_device *ub)
- {
- int i;
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_cancel_queue(ublk_get_queue(ub, i));
- }
- static bool ublk_check_inflight_rq(struct request *rq, void *data)
- {
- bool *idle = data;
- if (blk_mq_request_started(rq)) {
- *idle = false;
- return false;
- }
- return true;
- }
- static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
- {
- bool idle;
- WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
- while (true) {
- idle = true;
- blk_mq_tagset_busy_iter(&ub->tag_set,
- ublk_check_inflight_rq, &idle);
- if (idle)
- break;
- msleep(UBLK_REQUEUE_DELAY_MS);
- }
- }
- static void __ublk_quiesce_dev(struct ublk_device *ub)
- {
- pr_devel("%s: quiesce ub: dev_id %d state %s\n",
- __func__, ub->dev_info.dev_id,
- ub->dev_info.state == UBLK_S_DEV_LIVE ?
- "LIVE" : "QUIESCED");
- blk_mq_quiesce_queue(ub->ub_disk->queue);
- ublk_wait_tagset_rqs_idle(ub);
- ub->dev_info.state = UBLK_S_DEV_QUIESCED;
- }
- static void ublk_quiesce_work_fn(struct work_struct *work)
- {
- struct ublk_device *ub =
- container_of(work, struct ublk_device, quiesce_work);
- mutex_lock(&ub->mutex);
- if (ub->dev_info.state != UBLK_S_DEV_LIVE)
- goto unlock;
- __ublk_quiesce_dev(ub);
- unlock:
- mutex_unlock(&ub->mutex);
- ublk_cancel_dev(ub);
- }
- static void ublk_unquiesce_dev(struct ublk_device *ub)
- {
- int i;
- pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
- __func__, ub->dev_info.dev_id,
- ub->dev_info.state == UBLK_S_DEV_LIVE ?
- "LIVE" : "QUIESCED");
- /* quiesce_work has run. We let requeued rqs be aborted
- * before running fallback_wq. "force_abort" must be seen
- * after the request queue is unquiesced. Then del_gendisk()
- * can move on.
- */
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_get_queue(ub, i)->force_abort = true;
- blk_mq_unquiesce_queue(ub->ub_disk->queue);
- /* We may have requeued some rqs in ublk_quiesce_queue() */
- blk_mq_kick_requeue_list(ub->ub_disk->queue);
- }
- static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
- {
- struct gendisk *disk;
- /* Sync with ublk_abort_queue() by holding the lock */
- spin_lock(&ub->lock);
- disk = ub->ub_disk;
- ub->dev_info.state = UBLK_S_DEV_DEAD;
- ub->dev_info.ublksrv_pid = -1;
- ub->ub_disk = NULL;
- spin_unlock(&ub->lock);
- return disk;
- }
- static void ublk_stop_dev(struct ublk_device *ub)
- {
- struct gendisk *disk;
- mutex_lock(&ub->mutex);
- if (ub->dev_info.state == UBLK_S_DEV_DEAD)
- goto unlock;
- if (ublk_can_use_recovery(ub)) {
- if (ub->dev_info.state == UBLK_S_DEV_LIVE)
- __ublk_quiesce_dev(ub);
- ublk_unquiesce_dev(ub);
- }
- del_gendisk(ub->ub_disk);
- disk = ublk_detach_disk(ub);
- put_disk(disk);
- unlock:
- mutex_unlock(&ub->mutex);
- ublk_cancel_dev(ub);
- }
- /* device can only be started after all IOs are ready */
- static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
- {
- mutex_lock(&ub->mutex);
- ubq->nr_io_ready++;
- if (ublk_queue_ready(ubq)) {
- ubq->ubq_daemon = current;
- get_task_struct(ubq->ubq_daemon);
- ub->nr_queues_ready++;
- if (capable(CAP_SYS_ADMIN))
- ub->nr_privileged_daemon++;
- }
- if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
- complete_all(&ub->completion);
- mutex_unlock(&ub->mutex);
- }
- static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
- int tag)
- {
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
- ublk_queue_cmd(ubq, req);
- }
- static inline int ublk_check_cmd_op(u32 cmd_op)
- {
- u32 ioc_type = _IOC_TYPE(cmd_op);
- if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
- return -EOPNOTSUPP;
- if (ioc_type != 'u' && ioc_type != 0)
- return -EOPNOTSUPP;
- return 0;
- }
- static inline void ublk_fill_io_cmd(struct ublk_io *io,
- struct io_uring_cmd *cmd, unsigned long buf_addr)
- {
- io->cmd = cmd;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
- io->addr = buf_addr;
- }
- static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
- unsigned int issue_flags,
- struct ublk_queue *ubq, unsigned int tag)
- {
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- /*
- * Safe to refer to @ubq since the ublk_queue won't be freed until its
- * commands are completed
- */
- pdu->ubq = ubq;
- pdu->tag = tag;
- io_uring_cmd_mark_cancelable(cmd, issue_flags);
- }
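- /*
- * Core handler for the per-io uring commands (FETCH_REQ,
- * COMMIT_AND_FETCH_REQ, NEED_GET_DATA): validate queue/tag and the io flag
- * state machine, stash the command and buffer address via ublk_fill_io_cmd(),
- * and keep the command queued (-EIOCBQUEUED) until the driver completes it.
- */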
- static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
- unsigned int issue_flags,
- const struct ublksrv_io_cmd *ub_cmd)
- {
- struct ublk_device *ub = cmd->file->private_data;
- struct ublk_queue *ubq;
- struct ublk_io *io;
- u32 cmd_op = cmd->cmd_op;
- unsigned tag = ub_cmd->tag;
- int ret = -EINVAL;
- struct request *req;
- pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
- __func__, cmd->cmd_op, ub_cmd->q_id, tag,
- ub_cmd->result);
- if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
- goto out;
- ubq = ublk_get_queue(ub, ub_cmd->q_id);
- if (!ubq || ub_cmd->q_id != ubq->q_id)
- goto out;
- if (ubq->ubq_daemon && ubq->ubq_daemon != current)
- goto out;
- if (tag >= ubq->q_depth)
- goto out;
- io = &ubq->ios[tag];
- /* there is pending io cmd, something must be wrong */
- if (io->flags & UBLK_IO_FLAG_ACTIVE) {
- ret = -EBUSY;
- goto out;
- }
- /*
- * ensure that the user issues UBLK_IO_NEED_GET_DATA
- * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
- */
- if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
- ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
- goto out;
- ret = ublk_check_cmd_op(cmd_op);
- if (ret)
- goto out;
- ret = -EINVAL;
- switch (_IOC_NR(cmd_op)) {
- case UBLK_IO_FETCH_REQ:
- /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
- if (ublk_queue_ready(ubq)) {
- ret = -EBUSY;
- goto out;
- }
- /*
- * The io is being handled by the server, so COMMIT_AND_FETCH_REQ is
- * expected instead of FETCH_REQ
- */
- if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
- goto out;
- if (!ublk_support_user_copy(ubq)) {
- /*
- * FETCH_REQ has to provide an IO buffer if NEED GET
- * DATA is not enabled
- */
- if (!ub_cmd->addr && !ublk_need_get_data(ubq))
- goto out;
- } else if (ub_cmd->addr) {
- /* User copy requires addr to be unset */
- ret = -EINVAL;
- goto out;
- }
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_mark_io_ready(ub, ubq);
- break;
- case UBLK_IO_COMMIT_AND_FETCH_REQ:
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
- if (!ublk_support_user_copy(ubq)) {
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if
- * NEED GET DATA is not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
- req_op(req) == REQ_OP_READ))
- goto out;
- } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
- /*
- * User copy requires addr to be unset when command is
- * not zone append
- */
- ret = -EINVAL;
- goto out;
- }
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_commit_completion(ub, ub_cmd);
- break;
- case UBLK_IO_NEED_GET_DATA:
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
- break;
- default:
- goto out;
- }
- ublk_prep_cancel(cmd, issue_flags, ubq, tag);
- return -EIOCBQUEUED;
- out:
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
- pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
- __func__, cmd_op, tag, ret, io->flags);
- return -EIOCBQUEUED;
- }
- static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
- struct ublk_queue *ubq, int tag, size_t offset)
- {
- struct request *req;
- if (!ublk_need_req_ref(ubq))
- return NULL;
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
- if (!req)
- return NULL;
- if (!ublk_get_req_ref(ubq, req))
- return NULL;
- if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
- goto fail_put;
- if (!ublk_rq_has_data(req))
- goto fail_put;
- if (offset > blk_rq_bytes(req))
- goto fail_put;
- return req;
- fail_put:
- ublk_put_req_ref(ubq, req);
- return NULL;
- }
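- /*
- * Copy the SQE command payload to an on-stack ublksrv_io_cmd with
- * READ_ONCE() so later processing can't observe userspace rewriting
- * the shared SQE memory.
- */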
- static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
- {
- /*
- * Not necessary for async retry, but let's keep it simple and always
- * copy the values to avoid any potential reuse.
- */
- const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
- const struct ublksrv_io_cmd ub_cmd = {
- .q_id = READ_ONCE(ub_src->q_id),
- .tag = READ_ONCE(ub_src->tag),
- .result = READ_ONCE(ub_src->result),
- .addr = READ_ONCE(ub_src->addr)
- };
- WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
- return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
- }
- static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
- {
- ublk_ch_uring_cmd_local(cmd, issue_flags);
- }
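- /*
- * .uring_cmd entry for /dev/ublkcN: handle cancellation, punt to task
- * work if called without the uring lock, otherwise handle inline.
- */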
- static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
- {
- if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
- ublk_uring_cmd_cancel_fn(cmd, issue_flags);
- return 0;
- }
- /* a well-implemented server won't hit the unlocked path */
- if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
- io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
- return -EIOCBQUEUED;
- }
- return ublk_ch_uring_cmd_local(cmd, issue_flags);
- }
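- /*
- * For the user-copy path the data direction must match the request:
- * READ/DRV_IN requests are filled from the server's buffer (ITER_SOURCE),
- * WRITE/ZONE_APPEND requests are copied out to it (ITER_DEST).
- */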
- static inline bool ublk_check_ubuf_dir(const struct request *req,
- int ubuf_dir)
- {
- /* copy ubuf to request pages */
- if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
- ubuf_dir == ITER_SOURCE)
- return true;
- /* copy request pages to ubuf */
- if ((req_op(req) == REQ_OP_WRITE ||
- req_op(req) == REQ_OP_ZONE_APPEND) &&
- ubuf_dir == ITER_DEST)
- return true;
- return false;
- }
- static struct request *ublk_check_and_get_req(struct kiocb *iocb,
- struct iov_iter *iter, size_t *off, int dir)
- {
- struct ublk_device *ub = iocb->ki_filp->private_data;
- struct ublk_queue *ubq;
- struct request *req;
- size_t buf_off;
- u16 tag, q_id;
- if (!ub)
- return ERR_PTR(-EACCES);
- if (!user_backed_iter(iter))
- return ERR_PTR(-EACCES);
- if (ub->dev_info.state == UBLK_S_DEV_DEAD)
- return ERR_PTR(-EACCES);
- tag = ublk_pos_to_tag(iocb->ki_pos);
- q_id = ublk_pos_to_hwq(iocb->ki_pos);
- buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
- if (q_id >= ub->dev_info.nr_hw_queues)
- return ERR_PTR(-EINVAL);
- ubq = ublk_get_queue(ub, q_id);
- if (!ubq)
- return ERR_PTR(-EINVAL);
- if (tag >= ubq->q_depth)
- return ERR_PTR(-EINVAL);
- req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
- if (!req)
- return ERR_PTR(-EINVAL);
- if (!req->mq_hctx || !req->mq_hctx->driver_data)
- goto fail;
- if (!ublk_check_ubuf_dir(req, dir))
- goto fail;
- *off = buf_off;
- return req;
- fail:
- ublk_put_req_ref(ubq, req);
- return ERR_PTR(-EACCES);
- }
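- /*
- * read_iter/write_iter implement UBLK_F_USER_COPY: the server pread()s or
- * pwrite()s the char device at an offset that encodes queue id, tag and
- * in-request offset (ublk_pos_to_*), and data is copied between the user
- * buffer and the request pages while a request reference is held.
- */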
- static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
- {
- struct ublk_queue *ubq;
- struct request *req;
- size_t buf_off;
- size_t ret;
- req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
- if (IS_ERR(req))
- return PTR_ERR(req);
- ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
- return ret;
- }
- static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
- {
- struct ublk_queue *ubq;
- struct request *req;
- size_t buf_off;
- size_t ret;
- req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
- if (IS_ERR(req))
- return PTR_ERR(req);
- ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
- ubq = req->mq_hctx->driver_data;
- ublk_put_req_ref(ubq, req);
- return ret;
- }
- static const struct file_operations ublk_ch_fops = {
- .owner = THIS_MODULE,
- .open = ublk_ch_open,
- .release = ublk_ch_release,
- .read_iter = ublk_ch_read_iter,
- .write_iter = ublk_ch_write_iter,
- .uring_cmd = ublk_ch_uring_cmd,
- .mmap = ublk_ch_mmap,
- };
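- /*
- * Per-queue setup/teardown: each queue owns a page-backed io command
- * buffer (io descriptors) that the server maps via mmap on /dev/ublkcN.
- */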
- static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
- {
- int size = ublk_queue_cmd_buf_size(ub, q_id);
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- if (ubq->ubq_daemon)
- put_task_struct(ubq->ubq_daemon);
- if (ubq->io_cmd_buf)
- free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
- }
- static int ublk_init_queue(struct ublk_device *ub, int q_id)
- {
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
- void *ptr;
- int size;
- spin_lock_init(&ubq->cancel_lock);
- ubq->flags = ub->dev_info.flags;
- ubq->q_id = q_id;
- ubq->q_depth = ub->dev_info.queue_depth;
- size = ublk_queue_cmd_buf_size(ub, q_id);
- ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
- if (!ptr)
- return -ENOMEM;
- ubq->io_cmd_buf = ptr;
- ubq->dev = ub;
- return 0;
- }
- static void ublk_deinit_queues(struct ublk_device *ub)
- {
- int nr_queues = ub->dev_info.nr_hw_queues;
- int i;
- if (!ub->__queues)
- return;
- for (i = 0; i < nr_queues; i++)
- ublk_deinit_queue(ub, i);
- kfree(ub->__queues);
- }
- static int ublk_init_queues(struct ublk_device *ub)
- {
- int nr_queues = ub->dev_info.nr_hw_queues;
- int depth = ub->dev_info.queue_depth;
- int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
- int i, ret = -ENOMEM;
- ub->queue_size = ubq_size;
- ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
- if (!ub->__queues)
- return ret;
- for (i = 0; i < nr_queues; i++) {
- if (ublk_init_queue(ub, i))
- goto fail;
- }
- init_completion(&ub->completion);
- return 0;
- fail:
- ublk_deinit_queues(ub);
- return ret;
- }
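- /*
- * Device-number allocation: a non-negative @idx asks for that exact id
- * (-EEXIST if taken), otherwise any free id below UBLK_MAX_UBLKS is used.
- */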
- static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
- {
- int i = idx;
- int err;
- spin_lock(&ublk_idr_lock);
- /* allocate an id; if @idx >= 0, we're requesting that specific id */
- if (i >= 0) {
- err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
- if (err == -ENOSPC)
- err = -EEXIST;
- } else {
- err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
- GFP_NOWAIT);
- }
- spin_unlock(&ublk_idr_lock);
- if (err >= 0)
- ub->ub_number = err;
- return err;
- }
- static void ublk_free_dev_number(struct ublk_device *ub)
- {
- spin_lock(&ublk_idr_lock);
- idr_remove(&ublk_index_idr, ub->ub_number);
- wake_up_all(&ublk_idr_wq);
- spin_unlock(&ublk_idr_lock);
- }
- static void ublk_cdev_rel(struct device *dev)
- {
- struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
- blk_mq_free_tag_set(&ub->tag_set);
- ublk_deinit_queues(ub);
- ublk_free_dev_number(ub);
- mutex_destroy(&ub->mutex);
- kfree(ub);
- }
- static int ublk_add_chdev(struct ublk_device *ub)
- {
- struct device *dev = &ub->cdev_dev;
- int minor = ub->ub_number;
- int ret;
- dev->parent = ublk_misc.this_device;
- dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
- dev->class = &ublk_chr_class;
- dev->release = ublk_cdev_rel;
- device_initialize(dev);
- ret = dev_set_name(dev, "ublkc%d", minor);
- if (ret)
- goto fail;
- cdev_init(&ub->cdev, &ublk_ch_fops);
- ret = cdev_device_add(&ub->cdev, dev);
- if (ret)
- goto fail;
- ublks_added++;
- return 0;
- fail:
- put_device(dev);
- return ret;
- }
- static void ublk_stop_work_fn(struct work_struct *work)
- {
- struct ublk_device *ub =
- container_of(work, struct ublk_device, stop_work);
- ublk_stop_dev(ub);
- }
- /* align max io buffer size with PAGE_SIZE */
- static void ublk_align_max_io_size(struct ublk_device *ub)
- {
- unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
- ub->dev_info.max_io_buf_bytes =
- round_down(max_io_bytes, PAGE_SIZE);
- }
- static int ublk_add_tag_set(struct ublk_device *ub)
- {
- ub->tag_set.ops = &ublk_mq_ops;
- ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
- ub->tag_set.queue_depth = ub->dev_info.queue_depth;
- ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
- ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
- ub->tag_set.driver_data = ub;
- return blk_mq_alloc_tag_set(&ub->tag_set);
- }
- static void ublk_remove(struct ublk_device *ub)
- {
- ublk_stop_dev(ub);
- cancel_work_sync(&ub->stop_work);
- cancel_work_sync(&ub->quiesce_work);
- cdev_device_del(&ub->cdev, &ub->cdev_dev);
- ublk_put_device(ub);
- ublks_added--;
- }
- static struct ublk_device *ublk_get_device_from_id(int idx)
- {
- struct ublk_device *ub = NULL;
- if (idx < 0)
- return NULL;
- spin_lock(&ublk_idr_lock);
- ub = idr_find(&ublk_index_idr, idx);
- if (ub)
- ub = ublk_get_device(ub);
- spin_unlock(&ublk_idr_lock);
- return ub;
- }
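- /*
- * START_DEV: build queue_limits from the validated parameters, wait until
- * every queue daemon has sent FETCH_REQ (ub->completion), allocate the
- * ublkb%d gendisk, apply params and switch the device to UBLK_S_DEV_LIVE.
- */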
- static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- const struct ublk_param_basic *p = &ub->params.basic;
- int ublksrv_pid = (int)header->data[0];
- struct queue_limits lim = {
- .logical_block_size = 1 << p->logical_bs_shift,
- .physical_block_size = 1 << p->physical_bs_shift,
- .io_min = 1 << p->io_min_shift,
- .io_opt = 1 << p->io_opt_shift,
- .max_hw_sectors = p->max_sectors,
- .chunk_sectors = p->chunk_sectors,
- .virt_boundary_mask = p->virt_boundary_mask,
- .max_segments = USHRT_MAX,
- .max_segment_size = UINT_MAX,
- .dma_alignment = 3,
- };
- struct gendisk *disk;
- int ret = -EINVAL;
- if (ublksrv_pid <= 0)
- return -EINVAL;
- if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
- return -EINVAL;
- if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
- const struct ublk_param_discard *pd = &ub->params.discard;
- lim.discard_alignment = pd->discard_alignment;
- lim.discard_granularity = pd->discard_granularity;
- lim.max_hw_discard_sectors = pd->max_discard_sectors;
- lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
- lim.max_discard_segments = pd->max_discard_segments;
- }
- if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
- const struct ublk_param_zoned *p = &ub->params.zoned;
- if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
- return -EOPNOTSUPP;
- lim.features |= BLK_FEAT_ZONED;
- lim.max_active_zones = p->max_active_zones;
- lim.max_open_zones = p->max_open_zones;
- lim.max_zone_append_sectors = p->max_zone_append_sectors;
- }
- if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
- lim.features |= BLK_FEAT_WRITE_CACHE;
- if (ub->params.basic.attrs & UBLK_ATTR_FUA)
- lim.features |= BLK_FEAT_FUA;
- }
- if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
- lim.features |= BLK_FEAT_ROTATIONAL;
- if (wait_for_completion_interruptible(&ub->completion) != 0)
- return -EINTR;
- mutex_lock(&ub->mutex);
- if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
- test_bit(UB_STATE_USED, &ub->state)) {
- ret = -EEXIST;
- goto out_unlock;
- }
- disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
- if (IS_ERR(disk)) {
- ret = PTR_ERR(disk);
- goto out_unlock;
- }
- sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
- disk->fops = &ub_fops;
- disk->private_data = ub;
- ub->dev_info.ublksrv_pid = ublksrv_pid;
- ub->ub_disk = disk;
- ublk_apply_params(ub);
- /* don't probe partitions if any ubq daemon is untrusted */
- if (ub->nr_privileged_daemon != ub->nr_queues_ready)
- set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
- ublk_get_device(ub);
- ub->dev_info.state = UBLK_S_DEV_LIVE;
- if (ublk_dev_is_zoned(ub)) {
- ret = ublk_revalidate_disk_zones(ub);
- if (ret)
- goto out_put_cdev;
- }
- ret = add_disk(disk);
- if (ret)
- goto out_put_cdev;
- set_bit(UB_STATE_USED, &ub->state);
- out_put_cdev:
- if (ret) {
- ublk_detach_disk(ub);
- ublk_put_device(ub);
- put_disk(disk);
- }
- out_unlock:
- mutex_unlock(&ub->mutex);
- return ret;
- }
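- /*
- * GET_QUEUE_AFFINITY: report the CPUs mapped to the given hw queue so the
- * server can pin each queue daemon accordingly.
- */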
- static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
- struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- void __user *argp = (void __user *)(unsigned long)header->addr;
- cpumask_var_t cpumask;
- unsigned long queue;
- unsigned int retlen;
- unsigned int i;
- int ret;
- if (header->len * BITS_PER_BYTE < nr_cpu_ids)
- return -EINVAL;
- if (header->len & (sizeof(unsigned long)-1))
- return -EINVAL;
- if (!header->addr)
- return -EINVAL;
- queue = header->data[0];
- if (queue >= ub->dev_info.nr_hw_queues)
- return -EINVAL;
- if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
- return -ENOMEM;
- for_each_possible_cpu(i) {
- if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
- cpumask_set_cpu(i, cpumask);
- }
- ret = -EFAULT;
- retlen = min_t(unsigned short, header->len, cpumask_size());
- if (copy_to_user(argp, cpumask, retlen))
- goto out_free_cpumask;
- if (retlen != header->len &&
- clear_user(argp + retlen, header->len - retlen))
- goto out_free_cpumask;
- ret = 0;
- out_free_cpumask:
- free_cpumask_var(cpumask);
- return ret;
- }
- static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
- {
- pr_devel("%s: dev id %d flags %llx\n", __func__,
- info->dev_id, info->flags);
- pr_devel("\t nr_hw_queues %d queue_depth %d\n",
- info->nr_hw_queues, info->queue_depth);
- }
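- /*
- * ADD_DEV: validate the userspace-supplied dev_info, apply the
- * unprivileged-device restrictions, allocate device number, queues and
- * tag set, copy the negotiated flags back and finally add the char device.
- */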
- static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- void __user *argp = (void __user *)(unsigned long)header->addr;
- struct ublksrv_ctrl_dev_info info;
- struct ublk_device *ub;
- int ret = -EINVAL;
- if (header->len < sizeof(info) || !header->addr)
- return -EINVAL;
- if (header->queue_id != (u16)-1) {
- pr_warn("%s: queue_id is wrong %x\n",
- __func__, header->queue_id);
- return -EINVAL;
- }
- if (copy_from_user(&info, argp, sizeof(info)))
- return -EFAULT;
- if (capable(CAP_SYS_ADMIN))
- info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
- else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
- return -EPERM;
- /*
- * An unprivileged device can't be trusted, and RECOVERY and
- * RECOVERY_REISSUE may still hang error handling, so recovery
- * features can't be supported for unprivileged ublk yet.
- *
- * TODO: provide forward progress for the RECOVERY handler, so that
- * unprivileged devices can benefit from it
- */
- if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
- info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
- UBLK_F_USER_RECOVERY);
- /*
- * For USER_COPY, we depend on userspace to fill the request
- * buffer by pwrite() to the ublk char device, which can't be
- * used with an unprivileged device
- */
- if (info.flags & UBLK_F_USER_COPY)
- return -EINVAL;
- }
- /* the created device is always owned by current user */
- ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
- if (header->dev_id != info.dev_id) {
- pr_warn("%s: dev id not match %u %u\n",
- __func__, header->dev_id, info.dev_id);
- return -EINVAL;
- }
- if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
- pr_warn("%s: dev id is too large. Max supported is %d\n",
- __func__, UBLK_MAX_UBLKS - 1);
- return -EINVAL;
- }
- ublk_dump_dev_info(&info);
- ret = mutex_lock_killable(&ublk_ctl_mutex);
- if (ret)
- return ret;
- ret = -EACCES;
- if (ublks_added >= ublks_max)
- goto out_unlock;
- ret = -ENOMEM;
- ub = kzalloc(sizeof(*ub), GFP_KERNEL);
- if (!ub)
- goto out_unlock;
- mutex_init(&ub->mutex);
- spin_lock_init(&ub->lock);
- INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
- INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
- ret = ublk_alloc_dev_number(ub, header->dev_id);
- if (ret < 0)
- goto out_free_ub;
- memcpy(&ub->dev_info, &info, sizeof(info));
- /* update device id */
- ub->dev_info.dev_id = ub->ub_number;
- /*
- * The 64-bit flags are copied back to userspace as the feature
- * negotiation result, so clear any flags the driver doesn't
- * support yet; then userspace gets the correct flags
- * (features) to handle.
- */
- ub->dev_info.flags &= UBLK_F_ALL;
- ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
- UBLK_F_URING_CMD_COMP_IN_TASK;
- /* GET_DATA isn't needed any more with USER_COPY */
- if (ublk_dev_is_user_copy(ub))
- ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
- /* Zoned storage support requires user copy feature */
- if (ublk_dev_is_zoned(ub) &&
- (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
- ret = -EINVAL;
- goto out_free_dev_number;
- }
- /* We are not ready to support zero copy */
- ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
- ub->dev_info.nr_hw_queues = min_t(unsigned int,
- ub->dev_info.nr_hw_queues, nr_cpu_ids);
- ublk_align_max_io_size(ub);
- ret = ublk_init_queues(ub);
- if (ret)
- goto out_free_dev_number;
- ret = ublk_add_tag_set(ub);
- if (ret)
- goto out_deinit_queues;
- ret = -EFAULT;
- if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
- goto out_free_tag_set;
- /*
- * Add the char dev so that the ublksrv daemon can be set up.
- * ublk_add_chdev() will cleanup everything if it fails.
- */
- ret = ublk_add_chdev(ub);
- goto out_unlock;
- out_free_tag_set:
- blk_mq_free_tag_set(&ub->tag_set);
- out_deinit_queues:
- ublk_deinit_queues(ub);
- out_free_dev_number:
- ublk_free_dev_number(ub);
- out_free_ub:
- mutex_destroy(&ub->mutex);
- kfree(ub);
- out_unlock:
- mutex_unlock(&ublk_ctl_mutex);
- return ret;
- }
- static inline bool ublk_idr_freed(int id)
- {
- void *ptr;
- spin_lock(&ublk_idr_lock);
- ptr = idr_find(&ublk_index_idr, id);
- spin_unlock(&ublk_idr_lock);
- return ptr == NULL;
- }
- static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
- {
- struct ublk_device *ub = *p_ub;
- int idx = ub->ub_number;
- int ret;
- ret = mutex_lock_killable(&ublk_ctl_mutex);
- if (ret)
- return ret;
- if (!test_bit(UB_STATE_DELETED, &ub->state)) {
- ublk_remove(ub);
- set_bit(UB_STATE_DELETED, &ub->state);
- }
- /* Mark the reference as consumed */
- *p_ub = NULL;
- ublk_put_device(ub);
- mutex_unlock(&ublk_ctl_mutex);
- /*
- * Wait until the idr entry is removed, so the id can be reused
- * after the DEL_DEV command returns.
- *
- * If we return because of a user interrupt, a future delete
- * command may come:
- *
- * - the device number isn't freed, this device won't or needn't
- * be deleted again, since UB_STATE_DELETED is set, and device
- * will be released after the last reference is dropped
- *
- * - the device number is freed already, we will not find this
- * device via ublk_get_device_from_id()
- */
- if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
- return -EINTR;
- return 0;
- }
- static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
- __func__, cmd->cmd_op, header->dev_id, header->queue_id,
- header->data[0], header->addr, header->len);
- }
- static int ublk_ctrl_stop_dev(struct ublk_device *ub)
- {
- ublk_stop_dev(ub);
- cancel_work_sync(&ub->stop_work);
- cancel_work_sync(&ub->quiesce_work);
- return 0;
- }
- static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
- struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- void __user *argp = (void __user *)(unsigned long)header->addr;
- if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
- return -EINVAL;
- if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
- return -EFAULT;
- return 0;
- }
- /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
- static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
- {
- ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
- ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
- if (ub->ub_disk) {
- ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
- ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
- } else {
- ub->params.devt.disk_major = 0;
- ub->params.devt.disk_minor = 0;
- }
- ub->params.types |= UBLK_PARAM_TYPE_DEVT;
- }
- static int ublk_ctrl_get_params(struct ublk_device *ub,
- struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- void __user *argp = (void __user *)(unsigned long)header->addr;
- struct ublk_params_header ph;
- int ret;
- if (header->len <= sizeof(ph) || !header->addr)
- return -EINVAL;
- if (copy_from_user(&ph, argp, sizeof(ph)))
- return -EFAULT;
- if (ph.len > header->len || !ph.len)
- return -EINVAL;
- if (ph.len > sizeof(struct ublk_params))
- ph.len = sizeof(struct ublk_params);
- mutex_lock(&ub->mutex);
- ublk_ctrl_fill_params_devt(ub);
- if (copy_to_user(argp, &ub->params, ph.len))
- ret = -EFAULT;
- else
- ret = 0;
- mutex_unlock(&ub->mutex);
- return ret;
- }
- static int ublk_ctrl_set_params(struct ublk_device *ub,
- struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- void __user *argp = (void __user *)(unsigned long)header->addr;
- struct ublk_params_header ph;
- int ret = -EFAULT;
- if (header->len <= sizeof(ph) || !header->addr)
- return -EINVAL;
- if (copy_from_user(&ph, argp, sizeof(ph)))
- return -EFAULT;
- if (ph.len > header->len || !ph.len || !ph.types)
- return -EINVAL;
- if (ph.len > sizeof(struct ublk_params))
- ph.len = sizeof(struct ublk_params);
- /* parameters can only be changed when device isn't live */
- mutex_lock(&ub->mutex);
- if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
- ret = -EACCES;
- } else if (copy_from_user(&ub->params, argp, ph.len)) {
- ret = -EFAULT;
- } else {
- /* clear all we don't support yet */
- ub->params.types &= UBLK_PARAM_TYPE_ALL;
- ret = ublk_validate_params(ub);
- if (ret)
- ub->params.types = 0;
- }
- mutex_unlock(&ub->mutex);
- return ret;
- }
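- /*
- * Recovery helper: after the old daemon has died, reset per-queue state so
- * a fresh server process can mmap the buffers and re-issue FETCH_REQ for
- * every tag.
- */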
- static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
- {
- int i;
- WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
- /* All old ioucmds have to be completed */
- ubq->nr_io_ready = 0;
- /* old daemon is PF_EXITING, put it now */
- put_task_struct(ubq->ubq_daemon);
- /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
- ubq->ubq_daemon = NULL;
- ubq->timeout = false;
- ubq->canceling = false;
- for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
- /* forget everything now and be ready for new FETCH_REQ */
- io->flags = 0;
- io->cmd = NULL;
- io->addr = 0;
- }
- }
- static int ublk_ctrl_start_recovery(struct ublk_device *ub,
- struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- int ret = -EINVAL;
- int i;
- mutex_lock(&ub->mutex);
- if (!ublk_can_use_recovery(ub))
- goto out_unlock;
- if (!ub->nr_queues_ready)
- goto out_unlock;
- /*
- * START_RECOVERY is only allowed after:
- *
- * (1) UB_STATE_OPEN is not set, which means the dying process has exited
- * and the related io_uring ctx is freed, so the file struct of /dev/ublkcX
- * is released.
- *
- * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
- * (a) has quiesced the request queue
- * (b) has requeued every inflight rq whose io_flags is ACTIVE
- * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
- * (d) has completed/canceled all ioucmds owned by the dying process
- */
- if (test_bit(UB_STATE_OPEN, &ub->state) ||
- ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
- ret = -EBUSY;
- goto out_unlock;
- }
- pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_queue_reinit(ub, ublk_get_queue(ub, i));
- /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
- ub->mm = NULL;
- ub->nr_queues_ready = 0;
- ub->nr_privileged_daemon = 0;
- init_completion(&ub->completion);
- ret = 0;
- out_unlock:
- mutex_unlock(&ub->mutex);
- return ret;
- }
- static int ublk_ctrl_end_recovery(struct ublk_device *ub,
- struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- int ublksrv_pid = (int)header->data[0];
- int ret = -EINVAL;
- pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
- __func__, ub->dev_info.nr_hw_queues, header->dev_id);
- /* wait until the new ubq_daemons have sent all FETCH_REQs */
- if (wait_for_completion_interruptible(&ub->completion))
- return -EINTR;
- pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
- __func__, ub->dev_info.nr_hw_queues, header->dev_id);
- mutex_lock(&ub->mutex);
- if (!ublk_can_use_recovery(ub))
- goto out_unlock;
- if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
- ret = -EBUSY;
- goto out_unlock;
- }
- ub->dev_info.ublksrv_pid = ublksrv_pid;
- pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
- __func__, ublksrv_pid, header->dev_id);
- blk_mq_unquiesce_queue(ub->ub_disk->queue);
- pr_devel("%s: queue unquiesced, dev id %d.\n",
- __func__, header->dev_id);
- blk_mq_kick_requeue_list(ub->ub_disk->queue);
- ub->dev_info.state = UBLK_S_DEV_LIVE;
- ret = 0;
- out_unlock:
- mutex_unlock(&ub->mutex);
- return ret;
- }
- static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- void __user *argp = (void __user *)(unsigned long)header->addr;
- u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;
- if (header->len != UBLK_FEATURES_LEN || !header->addr)
- return -EINVAL;
- if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
- return -EFAULT;
- return 0;
- }
- /*
- * All control commands are sent via /dev/ublk-control, so we have to check
- * the destination device's permission
- */
- static int ublk_char_dev_permission(struct ublk_device *ub,
- const char *dev_path, int mask)
- {
- int err;
- struct path path;
- struct kstat stat;
- err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
- if (err)
- return err;
- err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
- if (err)
- goto exit;
- err = -EPERM;
- if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
- goto exit;
- err = inode_permission(&nop_mnt_idmap,
- d_backing_inode(path.dentry), mask);
- exit:
- path_put(&path);
- return err;
- }
- static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
- struct io_uring_cmd *cmd)
- {
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
- bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
- void __user *argp = (void __user *)(unsigned long)header->addr;
- char *dev_path = NULL;
- int ret = 0;
- int mask;
- if (!unprivileged) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /*
- * The newly added UBLK_CMD_GET_DEV_INFO2 command carries
- * char_dev_path in its payload too, since userspace may not
- * know whether the specified device was created in
- * unprivileged mode.
- */
- if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
- return 0;
- }
- /*
- * User has to provide the char device path for unprivileged ublk
- *
- * header->addr always points to the dev path buffer, and
- * header->dev_path_len records the length of the dev path buffer.
- */
- if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
- return -EINVAL;
- if (header->len < header->dev_path_len)
- return -EINVAL;
- dev_path = memdup_user_nul(argp, header->dev_path_len);
- if (IS_ERR(dev_path))
- return PTR_ERR(dev_path);
- ret = -EINVAL;
- switch (_IOC_NR(cmd->cmd_op)) {
- case UBLK_CMD_GET_DEV_INFO:
- case UBLK_CMD_GET_DEV_INFO2:
- case UBLK_CMD_GET_QUEUE_AFFINITY:
- case UBLK_CMD_GET_PARAMS:
- case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
- mask = MAY_READ;
- break;
- case UBLK_CMD_START_DEV:
- case UBLK_CMD_STOP_DEV:
- case UBLK_CMD_ADD_DEV:
- case UBLK_CMD_DEL_DEV:
- case UBLK_CMD_SET_PARAMS:
- case UBLK_CMD_START_USER_RECOVERY:
- case UBLK_CMD_END_USER_RECOVERY:
- mask = MAY_READ | MAY_WRITE;
- break;
- default:
- goto exit;
- }
- ret = ublk_char_dev_permission(ub, dev_path, mask);
- if (!ret) {
- header->len -= header->dev_path_len;
- header->addr += header->dev_path_len;
- }
- pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
- __func__, ub->ub_number, cmd->cmd_op,
- ub->dev_info.owner_uid, ub->dev_info.owner_gid,
- dev_path, ret);
- exit:
- kfree(dev_path);
- return ret;
- }
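- /*
- * Top-level dispatcher for control commands on /dev/ublk-control; 128-byte
- * SQEs are required, and every command except ADD_DEV first looks up the
- * target device and checks permission.
- */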
- static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
- unsigned int issue_flags)
- {
- const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
- struct ublk_device *ub = NULL;
- u32 cmd_op = cmd->cmd_op;
- int ret = -EINVAL;
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
- ublk_ctrl_cmd_dump(cmd);
- if (!(issue_flags & IO_URING_F_SQE128))
- goto out;
- ret = ublk_check_cmd_op(cmd_op);
- if (ret)
- goto out;
- if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
- ret = ublk_ctrl_get_features(cmd);
- goto out;
- }
- if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
- ret = -ENODEV;
- ub = ublk_get_device_from_id(header->dev_id);
- if (!ub)
- goto out;
- ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
- if (ret)
- goto put_dev;
- }
- switch (_IOC_NR(cmd_op)) {
- case UBLK_CMD_START_DEV:
- ret = ublk_ctrl_start_dev(ub, cmd);
- break;
- case UBLK_CMD_STOP_DEV:
- ret = ublk_ctrl_stop_dev(ub);
- break;
- case UBLK_CMD_GET_DEV_INFO:
- case UBLK_CMD_GET_DEV_INFO2:
- ret = ublk_ctrl_get_dev_info(ub, cmd);
- break;
- case UBLK_CMD_ADD_DEV:
- ret = ublk_ctrl_add_dev(cmd);
- break;
- case UBLK_CMD_DEL_DEV:
- ret = ublk_ctrl_del_dev(&ub, true);
- break;
- case UBLK_CMD_DEL_DEV_ASYNC:
- ret = ublk_ctrl_del_dev(&ub, false);
- break;
- case UBLK_CMD_GET_QUEUE_AFFINITY:
- ret = ublk_ctrl_get_queue_affinity(ub, cmd);
- break;
- case UBLK_CMD_GET_PARAMS:
- ret = ublk_ctrl_get_params(ub, cmd);
- break;
- case UBLK_CMD_SET_PARAMS:
- ret = ublk_ctrl_set_params(ub, cmd);
- break;
- case UBLK_CMD_START_USER_RECOVERY:
- ret = ublk_ctrl_start_recovery(ub, cmd);
- break;
- case UBLK_CMD_END_USER_RECOVERY:
- ret = ublk_ctrl_end_recovery(ub, cmd);
- break;
- default:
- ret = -EOPNOTSUPP;
- break;
- }
- put_dev:
- if (ub)
- ublk_put_device(ub);
- out:
- io_uring_cmd_done(cmd, ret, 0, issue_flags);
- pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
- __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
- return -EIOCBQUEUED;
- }
- static const struct file_operations ublk_ctl_fops = {
- .open = nonseekable_open,
- .uring_cmd = ublk_ctrl_uring_cmd,
- .owner = THIS_MODULE,
- .llseek = noop_llseek,
- };
- static struct miscdevice ublk_misc = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "ublk-control",
- .fops = &ublk_ctl_fops,
- };
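- /*
- * Module init: register the ublk-control misc device, reserve the char
- * major for the ublkcN nodes and register the ublk-char class.
- */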
- static int __init ublk_init(void)
- {
- int ret;
- BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
- UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
- init_waitqueue_head(&ublk_idr_wq);
- ret = misc_register(&ublk_misc);
- if (ret)
- return ret;
- ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
- if (ret)
- goto unregister_mis;
- ret = class_register(&ublk_chr_class);
- if (ret)
- goto free_chrdev_region;
- return 0;
- free_chrdev_region:
- unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
- unregister_mis:
- misc_deregister(&ublk_misc);
- return ret;
- }
- static void __exit ublk_exit(void)
- {
- struct ublk_device *ub;
- int id;
- idr_for_each_entry(&ublk_index_idr, ub, id)
- ublk_remove(ub);
- class_unregister(&ublk_chr_class);
- misc_deregister(&ublk_misc);
- idr_destroy(&ublk_index_idr);
- unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
- }
- module_init(ublk_init);
- module_exit(ublk_exit);
- static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
- {
- return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
- }
- static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
- {
- return sysfs_emit(buf, "%u\n", ublks_max);
- }
- static const struct kernel_param_ops ublk_max_ublks_ops = {
- .set = ublk_set_max_ublks,
- .get = ublk_get_max_ublks,
- };
- module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
- MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add (default: 64)");
- MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
- MODULE_DESCRIPTION("Userspace block device");
- MODULE_LICENSE("GPL");
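- /*
- * Illustrative userspace sketch (not part of the driver): one way a server
- * might query UBLK_U_CMD_GET_FEATURES over /dev/ublk-control. This is a
- * minimal sketch assuming liburing and the UAPI <linux/ublk_cmd.h>; the raw
- * SQE command-area access follows the SQE128 layout and may need adjustment
- * for other liburing versions.
- */
- #include <errno.h>
- #include <fcntl.h>
- #include <stdint.h>
- #include <string.h>
- #include <unistd.h>
- #include <liburing.h>
- #include <linux/ublk_cmd.h>
- static int ublk_get_features(uint64_t *features)
- {
- struct io_uring ring;
- struct io_uring_sqe *sqe;
- struct io_uring_cqe *cqe;
- struct ublksrv_ctrl_cmd *cc;
- int fd, ret;
- fd = open("/dev/ublk-control", O_RDWR);
- if (fd < 0)
- return -errno;
- /* control commands are only accepted with 128-byte SQEs */
- ret = io_uring_queue_init(4, &ring, IORING_SETUP_SQE128);
- if (ret)
- goto out_close;
- sqe = io_uring_get_sqe(&ring);
- memset(sqe, 0, sizeof(*sqe));
- sqe->opcode = IORING_OP_URING_CMD;
- sqe->fd = fd;
- sqe->cmd_op = UBLK_U_CMD_GET_FEATURES;
- /* the ublksrv_ctrl_cmd payload lives in the SQE command area */
- cc = (struct ublksrv_ctrl_cmd *)sqe->cmd;
- memset(cc, 0, sizeof(*cc));
- cc->dev_id = (uint32_t)-1;
- cc->queue_id = (uint16_t)-1;
- cc->addr = (uint64_t)(uintptr_t)features;
- cc->len = UBLK_FEATURES_LEN; /* the driver requires exactly this length */
- io_uring_submit(&ring);
- ret = io_uring_wait_cqe(&ring, &cqe);
- if (!ret) {
- ret = cqe->res; /* 0 on success, negative errno otherwise */
- io_uring_cqe_seen(&ring, cqe);
- }
- io_uring_queue_exit(&ring);
- out_close:
- close(fd);
- return ret;
- }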