pci-hyperv.c
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) Microsoft Corporation.
  4. *
  5. * Author:
  6. * Jake Oshins <jakeo@microsoft.com>
  7. *
  8. * This driver acts as a paravirtual front-end for PCI Express root buses.
  9. * When a PCI Express function (either an entire device or an SR-IOV
  10. * Virtual Function) is being passed through to the VM, this driver exposes
  11. * a new bus to the guest VM. This is modeled as a root PCI bus because
  12. * no bridges are being exposed to the VM. In fact, with a "Generation 2"
  13. * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
  14. * until a device has been exposed using this driver.
  15. *
  16. * Each root PCI bus has its own PCI domain, which is called "Segment" in
  17. * the PCI Firmware Specifications. Thus while each device passed through
  18. * to the VM using this front-end will appear at "device 0", the domain will
  19. * be unique. Typically, each bus will have one PCI function on it, though
  20. * this driver does support more than one.
  21. *
  22. * In order to map the interrupts from the device through to the guest VM,
  23. * this driver also implements an IRQ Domain, which handles interrupts (either
  24. * MSI or MSI-X) associated with the functions on the bus. As interrupts are
  25. * set up, torn down, or reaffined, this driver communicates with the
  26. * underlying hypervisor to adjust the mappings in the I/O MMU so that each
  27. * interrupt will be delivered to the correct virtual processor at the right
  28. * vector. This driver does not support level-triggered (line-based)
  29. * interrupts, and will report that the Interrupt Line register in the
  30. * function's configuration space is zero.
  31. *
  32. * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
  33. * facilities. For instance, the configuration space of a function exposed
  34. * by Hyper-V is mapped into a single page of memory space, and the
  35. * read and write handlers for config space must be aware of this mechanism.
  36. * Similarly, device setup and teardown involves messages sent to and from
  37. * the PCI back-end driver in Hyper-V.
  38. */
  39. #include <linux/kernel.h>
  40. #include <linux/module.h>
  41. #include <linux/pci.h>
  42. #include <linux/pci-ecam.h>
  43. #include <linux/delay.h>
  44. #include <linux/semaphore.h>
  45. #include <linux/irq.h>
  46. #include <linux/msi.h>
  47. #include <linux/hyperv.h>
  48. #include <linux/refcount.h>
  49. #include <linux/irqdomain.h>
  50. #include <linux/acpi.h>
  51. #include <linux/sizes.h>
  52. #include <asm/mshyperv.h>
  53. /*
  54. * Protocol versions. The low word is the minor version, the high word the
  55. * major version.
  56. */
  57. #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
  58. #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
  59. #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
  60. enum pci_protocol_version_t {
  61. PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */
  62. PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */
  63. PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */
  64. PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4), /* WS2022 */
  65. };
  66. #define CPU_AFFINITY_ALL -1ULL
  67. /*
  68. * Supported protocol versions in the order of probing - highest go
  69. * first.
  70. */
  71. static enum pci_protocol_version_t pci_protocol_versions[] = {
  72. PCI_PROTOCOL_VERSION_1_4,
  73. PCI_PROTOCOL_VERSION_1_3,
  74. PCI_PROTOCOL_VERSION_1_2,
  75. PCI_PROTOCOL_VERSION_1_1,
  76. };
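/*
 * Illustrative sketch (not part of the upstream driver): the macros above
 * pack the major version into the high 16 bits and the minor version into
 * the low bits, so a negotiated value such as PCI_PROTOCOL_VERSION_1_4
 * splits back into "1" and "4" for logging.
 */
static inline void hv_pci_example_log_version(struct hv_device *hdev,
					      enum pci_protocol_version_t ver)
{
	/* For PCI_PROTOCOL_VERSION_1_4 this logs "vPCI protocol 1.4". */
	dev_info(&hdev->device, "vPCI protocol %u.%u\n",
		 PCI_MAJOR_VERSION(ver), PCI_MINOR_VERSION(ver));
}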
  77. #define PCI_CONFIG_MMIO_LENGTH 0x2000
  78. #define CFG_PAGE_OFFSET 0x1000
  79. #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
  80. #define MAX_SUPPORTED_MSI_MESSAGES 0x400
  81. #define STATUS_REVISION_MISMATCH 0xC0000059
  82. /* space for 32bit serial number as string */
  83. #define SLOT_NAME_SIZE 11
  84. /*
  85. * Size of requestor for VMbus; the value is based on the observation
  86. * that having more than one request outstanding is 'rare', and so 64
  87. * should be generous in ensuring that we don't ever run out.
  88. */
  89. #define HV_PCI_RQSTOR_SIZE 64
  90. /*
  91. * Message Types
  92. */
  93. enum pci_message_type {
  94. /*
  95. * Version 1.1
  96. */
  97. PCI_MESSAGE_BASE = 0x42490000,
  98. PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0,
  99. PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1,
  100. PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4,
  101. PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
  102. PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6,
  103. PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7,
  104. PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8,
  105. PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9,
  106. PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA,
  107. PCI_EJECT = PCI_MESSAGE_BASE + 0xB,
  108. PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC,
  109. PCI_REENABLE = PCI_MESSAGE_BASE + 0xD,
  110. PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE,
  111. PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF,
  112. PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10,
  113. PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11,
  114. PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12,
  115. PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13,
  116. PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14,
  117. PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15,
  118. PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16,
  119. PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17,
  120. PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */
  121. PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19,
  122. PCI_RESOURCES_ASSIGNED3 = PCI_MESSAGE_BASE + 0x1A,
  123. PCI_CREATE_INTERRUPT_MESSAGE3 = PCI_MESSAGE_BASE + 0x1B,
  124. PCI_MESSAGE_MAXIMUM
  125. };
  126. /*
  127. * Structures defining the virtual PCI Express protocol.
  128. */
  129. union pci_version {
  130. struct {
  131. u16 minor_version;
  132. u16 major_version;
  133. } parts;
  134. u32 version;
  135. } __packed;
  136. /*
  137. * Function numbers are 8-bits wide on Express, as interpreted through ARI,
  138. * which is all this driver does. This representation is the one used in
  139. * Windows, which is what is expected when sending this back and forth with
  140. * the Hyper-V parent partition.
  141. */
  142. union win_slot_encoding {
  143. struct {
  144. u32 dev:5;
  145. u32 func:3;
  146. u32 reserved:24;
  147. } bits;
  148. u32 slot;
  149. } __packed;
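/*
 * Illustrative example (not part of the upstream driver): with this layout a
 * function at Linux devfn 0x0a (device 1, function 2) is reported to the
 * host as wslot 0x41: dev = 1 in bits 0-4, func = 2 in bits 5-7.
 */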
  150. /*
  151. * Pretty much as defined in the PCI Specifications.
  152. */
  153. struct pci_function_description {
  154. u16 v_id; /* vendor ID */
  155. u16 d_id; /* device ID */
  156. u8 rev;
  157. u8 prog_intf;
  158. u8 subclass;
  159. u8 base_class;
  160. u32 subsystem_id;
  161. union win_slot_encoding win_slot;
  162. u32 ser; /* serial number */
  163. } __packed;
  164. enum pci_device_description_flags {
  165. HV_PCI_DEVICE_FLAG_NONE = 0x0,
  166. HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1,
  167. };
  168. struct pci_function_description2 {
  169. u16 v_id; /* vendor ID */
  170. u16 d_id; /* device ID */
  171. u8 rev;
  172. u8 prog_intf;
  173. u8 subclass;
  174. u8 base_class;
  175. u32 subsystem_id;
  176. union win_slot_encoding win_slot;
  177. u32 ser; /* serial number */
  178. u32 flags;
  179. u16 virtual_numa_node;
  180. u16 reserved;
  181. } __packed;
  182. /**
  183. * struct hv_msi_desc
  184. * @vector: IDT entry
  185. * @delivery_mode: As defined in Intel's Programmer's
  186. * Reference Manual, Volume 3, Chapter 8.
  187. * @vector_count: Number of contiguous entries in the
  188. * Interrupt Descriptor Table that are
  189. * occupied by this Message-Signaled
  190. * Interrupt. For "MSI", as first defined
  191. * in PCI 2.2, this can be between 1 and
  192. * 32. For "MSI-X," as first defined in PCI
  193. * 3.0, this must be 1, as each MSI-X table
  194. * entry would have its own descriptor.
  195. * @reserved: Empty space
  196. * @cpu_mask: All the target virtual processors.
  197. */
  198. struct hv_msi_desc {
  199. u8 vector;
  200. u8 delivery_mode;
  201. u16 vector_count;
  202. u32 reserved;
  203. u64 cpu_mask;
  204. } __packed;
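/*
 * Illustrative sketch (not part of the upstream driver): filling a minimal
 * hv_msi_desc for a single MSI vector aimed at virtual processor 0. The
 * delivery mode value here is a placeholder; the driver derives the real one
 * from the architecture (see DELIVERY_MODE further down in this file).
 */
static inline void hv_pci_example_fill_msi_desc(struct hv_msi_desc *d, u8 vector)
{
	memset(d, 0, sizeof(*d));
	d->vector = vector;
	d->delivery_mode = 0;	/* placeholder; fixed delivery on x86 */
	d->vector_count = 1;	/* must be 1 for MSI-X entries */
	d->cpu_mask = 1;	/* bit 0: target VP 0 */
}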
  205. /**
  206. * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
  207. * @vector: IDT entry
  208. * @delivery_mode: As defined in Intel's Programmer's
  209. * Reference Manual, Volume 3, Chapter 8.
  210. * @vector_count: Number of contiguous entries in the
  211. * Interrupt Descriptor Table that are
  212. * occupied by this Message-Signaled
  213. * Interrupt. For "MSI", as first defined
  214. * in PCI 2.2, this can be between 1 and
  215. * 32. For "MSI-X," as first defined in PCI
  216. * 3.0, this must be 1, as each MSI-X table
  217. * entry would have its own descriptor.
  218. * @processor_count: number of bits enabled in array.
  219. * @processor_array: All the target virtual processors.
  220. */
  221. struct hv_msi_desc2 {
  222. u8 vector;
  223. u8 delivery_mode;
  224. u16 vector_count;
  225. u16 processor_count;
  226. u16 processor_array[32];
  227. } __packed;
  228. /*
  229. * struct hv_msi_desc3 - 1.3 version of hv_msi_desc
  230. * Everything is the same as in 'hv_msi_desc2' except that the size of the
  231. * 'vector' field is larger to support bigger vector values. For ex: LPI
  232. * vectors on ARM.
  233. */
  234. struct hv_msi_desc3 {
  235. u32 vector;
  236. u8 delivery_mode;
  237. u8 reserved;
  238. u16 vector_count;
  239. u16 processor_count;
  240. u16 processor_array[32];
  241. } __packed;
  242. /**
  243. * struct tran_int_desc
  244. * @reserved: unused, padding
  245. * @vector_count: same as in hv_msi_desc
  246. * @data: This is the "data payload" value that is
  247. * written by the device when it generates
  248. * a message-signaled interrupt, either MSI
  249. * or MSI-X.
  250. * @address: This is the address to which the data
  251. * payload is written on interrupt
  252. * generation.
  253. */
  254. struct tran_int_desc {
  255. u16 reserved;
  256. u16 vector_count;
  257. u32 data;
  258. u64 address;
  259. } __packed;
  260. /*
  261. * A generic message format for virtual PCI.
  262. * Specific message formats are defined later in the file.
  263. */
  264. struct pci_message {
  265. u32 type;
  266. } __packed;
  267. struct pci_child_message {
  268. struct pci_message message_type;
  269. union win_slot_encoding wslot;
  270. } __packed;
  271. struct pci_incoming_message {
  272. struct vmpacket_descriptor hdr;
  273. struct pci_message message_type;
  274. } __packed;
  275. struct pci_response {
  276. struct vmpacket_descriptor hdr;
  277. s32 status; /* negative values are failures */
  278. } __packed;
  279. struct pci_packet {
  280. void (*completion_func)(void *context, struct pci_response *resp,
  281. int resp_packet_size);
  282. void *compl_ctxt;
  283. struct pci_message message[];
  284. };
  285. /*
  286. * Specific message types supporting the PCI protocol.
  287. */
  288. /*
  289. * Version negotiation message. Sent from the guest to the host.
  290. * The guest is free to try different versions until the host
  291. * accepts the version.
  292. *
  293. * pci_version: The protocol version requested.
  294. * is_last_attempt: If TRUE, this is the last version guest will request.
  295. * reservedz: Reserved field, set to zero.
  296. */
  297. struct pci_version_request {
  298. struct pci_message message_type;
  299. u32 protocol_version;
  300. } __packed;
  301. /*
  302. * Bus D0 Entry. This is sent from the guest to the host when the virtual
  303. * bus (PCI Express port) is ready for action.
  304. */
  305. struct pci_bus_d0_entry {
  306. struct pci_message message_type;
  307. u32 reserved;
  308. u64 mmio_base;
  309. } __packed;
  310. struct pci_bus_relations {
  311. struct pci_incoming_message incoming;
  312. u32 device_count;
  313. struct pci_function_description func[];
  314. } __packed;
  315. struct pci_bus_relations2 {
  316. struct pci_incoming_message incoming;
  317. u32 device_count;
  318. struct pci_function_description2 func[];
  319. } __packed;
  320. struct pci_q_res_req_response {
  321. struct vmpacket_descriptor hdr;
  322. s32 status; /* negative values are failures */
  323. u32 probed_bar[PCI_STD_NUM_BARS];
  324. } __packed;
  325. struct pci_set_power {
  326. struct pci_message message_type;
  327. union win_slot_encoding wslot;
  328. u32 power_state; /* In Windows terms */
  329. u32 reserved;
  330. } __packed;
  331. struct pci_set_power_response {
  332. struct vmpacket_descriptor hdr;
  333. s32 status; /* negative values are failures */
  334. union win_slot_encoding wslot;
  335. u32 resultant_state; /* In Windows terms */
  336. u32 reserved;
  337. } __packed;
  338. struct pci_resources_assigned {
  339. struct pci_message message_type;
  340. union win_slot_encoding wslot;
  341. u8 memory_range[0x14][6]; /* not used here */
  342. u32 msi_descriptors;
  343. u32 reserved[4];
  344. } __packed;
  345. struct pci_resources_assigned2 {
  346. struct pci_message message_type;
  347. union win_slot_encoding wslot;
  348. u8 memory_range[0x14][6]; /* not used here */
  349. u32 msi_descriptor_count;
  350. u8 reserved[70];
  351. } __packed;
  352. struct pci_create_interrupt {
  353. struct pci_message message_type;
  354. union win_slot_encoding wslot;
  355. struct hv_msi_desc int_desc;
  356. } __packed;
  357. struct pci_create_int_response {
  358. struct pci_response response;
  359. u32 reserved;
  360. struct tran_int_desc int_desc;
  361. } __packed;
  362. struct pci_create_interrupt2 {
  363. struct pci_message message_type;
  364. union win_slot_encoding wslot;
  365. struct hv_msi_desc2 int_desc;
  366. } __packed;
  367. struct pci_create_interrupt3 {
  368. struct pci_message message_type;
  369. union win_slot_encoding wslot;
  370. struct hv_msi_desc3 int_desc;
  371. } __packed;
  372. struct pci_delete_interrupt {
  373. struct pci_message message_type;
  374. union win_slot_encoding wslot;
  375. struct tran_int_desc int_desc;
  376. } __packed;
  377. /*
  378. * Note: the VM must pass a valid block id, wslot and bytes_requested.
  379. */
  380. struct pci_read_block {
  381. struct pci_message message_type;
  382. u32 block_id;
  383. union win_slot_encoding wslot;
  384. u32 bytes_requested;
  385. } __packed;
  386. struct pci_read_block_response {
  387. struct vmpacket_descriptor hdr;
  388. u32 status;
  389. u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
  390. } __packed;
  391. /*
  392. * Note: the VM must pass a valid block id, wslot and byte_count.
  393. */
  394. struct pci_write_block {
  395. struct pci_message message_type;
  396. u32 block_id;
  397. union win_slot_encoding wslot;
  398. u32 byte_count;
  399. u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
  400. } __packed;
  401. struct pci_dev_inval_block {
  402. struct pci_incoming_message incoming;
  403. union win_slot_encoding wslot;
  404. u64 block_mask;
  405. } __packed;
  406. struct pci_dev_incoming {
  407. struct pci_incoming_message incoming;
  408. union win_slot_encoding wslot;
  409. } __packed;
  410. struct pci_eject_response {
  411. struct pci_message message_type;
  412. union win_slot_encoding wslot;
  413. u32 status;
  414. } __packed;
  415. static int pci_ring_size = VMBUS_RING_SIZE(SZ_16K);
  416. /*
  417. * Driver specific state.
  418. */
  419. enum hv_pcibus_state {
  420. hv_pcibus_init = 0,
  421. hv_pcibus_probed,
  422. hv_pcibus_installed,
  423. hv_pcibus_removing,
  424. hv_pcibus_maximum
  425. };
  426. struct hv_pcibus_device {
  427. #ifdef CONFIG_X86
  428. struct pci_sysdata sysdata;
  429. #elif defined(CONFIG_ARM64)
  430. struct pci_config_window sysdata;
  431. #endif
  432. struct pci_host_bridge *bridge;
  433. struct fwnode_handle *fwnode;
  434. /* Protocol version negotiated with the host */
  435. enum pci_protocol_version_t protocol_version;
  436. struct mutex state_lock;
  437. enum hv_pcibus_state state;
  438. struct hv_device *hdev;
  439. resource_size_t low_mmio_space;
  440. resource_size_t high_mmio_space;
  441. struct resource *mem_config;
  442. struct resource *low_mmio_res;
  443. struct resource *high_mmio_res;
  444. struct completion *survey_event;
  445. struct pci_bus *pci_bus;
  446. spinlock_t config_lock; /* Avoid two threads writing index page */
  447. spinlock_t device_list_lock; /* Protect lists below */
  448. void __iomem *cfg_addr;
  449. struct list_head children;
  450. struct list_head dr_list;
  451. struct msi_domain_info msi_info;
  452. struct irq_domain *irq_domain;
  453. struct workqueue_struct *wq;
  454. /* Highest slot of child device with resources allocated */
  455. int wslot_res_allocated;
  456. bool use_calls; /* Use hypercalls to access mmio cfg space */
  457. };
  458. /*
  459. * Tracks "Device Relations" messages from the host, which must be both
  460. * processed in order and deferred so that they don't run in the context
  461. * of the incoming packet callback.
  462. */
  463. struct hv_dr_work {
  464. struct work_struct wrk;
  465. struct hv_pcibus_device *bus;
  466. };
  467. struct hv_pcidev_description {
  468. u16 v_id; /* vendor ID */
  469. u16 d_id; /* device ID */
  470. u8 rev;
  471. u8 prog_intf;
  472. u8 subclass;
  473. u8 base_class;
  474. u32 subsystem_id;
  475. union win_slot_encoding win_slot;
  476. u32 ser; /* serial number */
  477. u32 flags;
  478. u16 virtual_numa_node;
  479. };
  480. struct hv_dr_state {
  481. struct list_head list_entry;
  482. u32 device_count;
  483. struct hv_pcidev_description func[] __counted_by(device_count);
  484. };
  485. struct hv_pci_dev {
  486. /* List protected by pci_rescan_remove_lock */
  487. struct list_head list_entry;
  488. refcount_t refs;
  489. struct pci_slot *pci_slot;
  490. struct hv_pcidev_description desc;
  491. bool reported_missing;
  492. struct hv_pcibus_device *hbus;
  493. struct work_struct wrk;
  494. void (*block_invalidate)(void *context, u64 block_mask);
  495. void *invalidate_context;
  496. /*
  497. * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
  498. * read it back, for each of the BAR offsets within config space.
  499. */
  500. u32 probed_bar[PCI_STD_NUM_BARS];
  501. };
  502. struct hv_pci_compl {
  503. struct completion host_event;
  504. s32 completion_status;
  505. };
  506. static void hv_pci_onchannelcallback(void *context);
  507. #ifdef CONFIG_X86
  508. #define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED
  509. #define FLOW_HANDLER handle_edge_irq
  510. #define FLOW_NAME "edge"
  511. static int hv_pci_irqchip_init(void)
  512. {
  513. return 0;
  514. }
  515. static struct irq_domain *hv_pci_get_root_domain(void)
  516. {
  517. return x86_vector_domain;
  518. }
  519. static unsigned int hv_msi_get_int_vector(struct irq_data *data)
  520. {
  521. struct irq_cfg *cfg = irqd_cfg(data);
  522. return cfg->vector;
  523. }
  524. #define hv_msi_prepare pci_msi_prepare
  525. /**
  526. * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
  527. * affinity.
  528. * @data: Describes the IRQ
  529. *
  530. * Build a new destination for the MSI and make a hypercall to
  531. * update the Interrupt Redirection Table. "Device Logical ID"
  532. * is built out of this PCI bus's instance GUID and the function
  533. * number of the device.
  534. */
  535. static void hv_arch_irq_unmask(struct irq_data *data)
  536. {
  537. struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
  538. struct hv_retarget_device_interrupt *params;
  539. struct tran_int_desc *int_desc;
  540. struct hv_pcibus_device *hbus;
  541. const struct cpumask *dest;
  542. cpumask_var_t tmp;
  543. struct pci_bus *pbus;
  544. struct pci_dev *pdev;
  545. unsigned long flags;
  546. u32 var_size = 0;
  547. int cpu, nr_bank;
  548. u64 res;
  549. dest = irq_data_get_effective_affinity_mask(data);
  550. pdev = msi_desc_to_pci_dev(msi_desc);
  551. pbus = pdev->bus;
  552. hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
  553. int_desc = data->chip_data;
  554. if (!int_desc) {
  555. dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n",
  556. __func__, data->irq);
  557. return;
  558. }
  559. local_irq_save(flags);
  560. params = *this_cpu_ptr(hyperv_pcpu_input_arg);
  561. memset(params, 0, sizeof(*params));
  562. params->partition_id = HV_PARTITION_ID_SELF;
  563. params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
  564. params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
  565. params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
  566. params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
  567. (hbus->hdev->dev_instance.b[4] << 16) |
  568. (hbus->hdev->dev_instance.b[7] << 8) |
  569. (hbus->hdev->dev_instance.b[6] & 0xf8) |
  570. PCI_FUNC(pdev->devfn);
  571. params->int_target.vector = hv_msi_get_int_vector(data);
  572. if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
  573. /*
  574. * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
  575. * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
  576. * with >64 VP support.
  577. * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
  578. * is not sufficient for this hypercall.
  579. */
  580. params->int_target.flags |=
  581. HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
  582. if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
  583. res = 1;
  584. goto out;
  585. }
  586. cpumask_and(tmp, dest, cpu_online_mask);
  587. nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
  588. free_cpumask_var(tmp);
  589. if (nr_bank <= 0) {
  590. res = 1;
  591. goto out;
  592. }
  593. /*
  594. * var-sized hypercall, var-size starts after vp_mask (thus
  595. * vp_set.format does not count, but vp_set.valid_bank_mask
  596. * does).
  597. */
  598. var_size = 1 + nr_bank;
  599. } else {
  600. for_each_cpu_and(cpu, dest, cpu_online_mask) {
  601. params->int_target.vp_mask |=
  602. (1ULL << hv_cpu_number_to_vp_number(cpu));
  603. }
  604. }
  605. res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
  606. params, NULL);
  607. out:
  608. local_irq_restore(flags);
  609. /*
  610. * During hibernation, when a CPU is offlined, the kernel tries
  611. * to move the interrupt to the remaining CPUs that haven't
  612. * been offlined yet. In this case, the hv_do_hypercall() above
  613. * always fails since the vmbus channel has been closed:
  614. * refer to cpu_disable_common() -> fixup_irqs() ->
  615. * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
  616. *
  617. * Suppress the error message for hibernation because the failure
  618. * during hibernation does not matter (at this time all the devices
  619. * have been frozen). Note: the correct affinity info is still updated
  620. * into the irqdata data structure in migrate_one_irq() ->
  621. * irq_do_set_affinity(), so later when the VM resumes,
  622. * hv_pci_restore_msi_state() is able to correctly restore the
  623. * interrupt with the correct affinity.
  624. */
  625. if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
  626. dev_err(&hbus->hdev->device,
  627. "%s() failed: %#llx", __func__, res);
  628. }
  629. #elif defined(CONFIG_ARM64)
  630. /*
  631. * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit
  632. * of room at the start to allow for SPIs to be specified through ACPI and
  633. * starting with a power of two to satisfy power of 2 multi-MSI requirement.
  634. */
  635. #define HV_PCI_MSI_SPI_START 64
  636. #define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START)
  637. #define DELIVERY_MODE 0
  638. #define FLOW_HANDLER NULL
  639. #define FLOW_NAME NULL
  640. #define hv_msi_prepare NULL
  641. struct hv_pci_chip_data {
  642. DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR);
  643. struct mutex map_lock;
  644. };
  645. /* Hyper-V vPCI MSI GIC IRQ domain */
  646. static struct irq_domain *hv_msi_gic_irq_domain;
  647. /* Hyper-V PCI MSI IRQ chip */
  648. static struct irq_chip hv_arm64_msi_irq_chip = {
  649. .name = "MSI",
  650. .irq_set_affinity = irq_chip_set_affinity_parent,
  651. .irq_eoi = irq_chip_eoi_parent,
  652. .irq_mask = irq_chip_mask_parent,
  653. .irq_unmask = irq_chip_unmask_parent
  654. };
  655. static unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
  656. {
  657. return irqd->parent_data->hwirq;
  658. }
  659. /*
  660. * @nr_bm_irqs: Indicates the number of IRQs that were allocated from
  661. * the bitmap.
  662. * @nr_dom_irqs: Indicates the number of IRQs that were allocated from
  663. * the parent domain.
  664. */
  665. static void hv_pci_vec_irq_free(struct irq_domain *domain,
  666. unsigned int virq,
  667. unsigned int nr_bm_irqs,
  668. unsigned int nr_dom_irqs)
  669. {
  670. struct hv_pci_chip_data *chip_data = domain->host_data;
  671. struct irq_data *d = irq_domain_get_irq_data(domain, virq);
  672. int first = d->hwirq - HV_PCI_MSI_SPI_START;
  673. int i;
  674. mutex_lock(&chip_data->map_lock);
  675. bitmap_release_region(chip_data->spi_map,
  676. first,
  677. get_count_order(nr_bm_irqs));
  678. mutex_unlock(&chip_data->map_lock);
  679. for (i = 0; i < nr_dom_irqs; i++) {
  680. if (i)
  681. d = irq_domain_get_irq_data(domain, virq + i);
  682. irq_domain_reset_irq_data(d);
  683. }
  684. irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs);
  685. }
  686. static void hv_pci_vec_irq_domain_free(struct irq_domain *domain,
  687. unsigned int virq,
  688. unsigned int nr_irqs)
  689. {
  690. hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs);
  691. }
  692. static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain,
  693. unsigned int nr_irqs,
  694. irq_hw_number_t *hwirq)
  695. {
  696. struct hv_pci_chip_data *chip_data = domain->host_data;
  697. int index;
  698. /* Find and allocate region from the SPI bitmap */
  699. mutex_lock(&chip_data->map_lock);
  700. index = bitmap_find_free_region(chip_data->spi_map,
  701. HV_PCI_MSI_SPI_NR,
  702. get_count_order(nr_irqs));
  703. mutex_unlock(&chip_data->map_lock);
  704. if (index < 0)
  705. return -ENOSPC;
  706. *hwirq = index + HV_PCI_MSI_SPI_START;
  707. return 0;
  708. }
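/*
 * Illustrative sketch (not part of the upstream driver): the bitmap allocator
 * above hands out power-of-two sized, naturally aligned regions, so asking
 * for three vectors actually reserves four SPIs (get_count_order(3) == 2).
 */
static inline int hv_pci_example_spi_alloc(struct irq_domain *domain)
{
	irq_hw_number_t hwirq;

	/* Reserves a block of 4 SPIs; hwirq is the first one. */
	return hv_pci_vec_alloc_device_irq(domain, 3, &hwirq);
}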
  709. static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
  710. unsigned int virq,
  711. irq_hw_number_t hwirq)
  712. {
  713. struct irq_fwspec fwspec;
  714. struct irq_data *d;
  715. int ret;
  716. fwspec.fwnode = domain->parent->fwnode;
  717. fwspec.param_count = 2;
  718. fwspec.param[0] = hwirq;
  719. fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
  720. ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
  721. if (ret)
  722. return ret;
  723. /*
  724. * Since the interrupt specifier is not coming from ACPI or DT, the
  725. * trigger type will need to be set explicitly. Otherwise, it will be
  726. * set to whatever is in the GIC configuration.
  727. */
  728. d = irq_domain_get_irq_data(domain->parent, virq);
  729. return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
  730. }
  731. static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain,
  732. unsigned int virq, unsigned int nr_irqs,
  733. void *args)
  734. {
  735. irq_hw_number_t hwirq;
  736. unsigned int i;
  737. int ret;
  738. ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq);
  739. if (ret)
  740. return ret;
  741. for (i = 0; i < nr_irqs; i++) {
  742. ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i,
  743. hwirq + i);
  744. if (ret) {
  745. hv_pci_vec_irq_free(domain, virq, nr_irqs, i);
  746. return ret;
  747. }
  748. irq_domain_set_hwirq_and_chip(domain, virq + i,
  749. hwirq + i,
  750. &hv_arm64_msi_irq_chip,
  751. domain->host_data);
  752. pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i);
  753. }
  754. return 0;
  755. }
  756. /*
  757. * Pick the first cpu as the irq affinity that can be temporarily used for
  758. * composing MSI from the hypervisor. GIC will eventually set the right
  759. * affinity for the irq and the 'unmask' will retarget the interrupt to that
  760. * cpu.
  761. */
  762. static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
  763. struct irq_data *irqd, bool reserve)
  764. {
  765. int cpu = cpumask_first(cpu_present_mask);
  766. irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
  767. return 0;
  768. }
  769. static const struct irq_domain_ops hv_pci_domain_ops = {
  770. .alloc = hv_pci_vec_irq_domain_alloc,
  771. .free = hv_pci_vec_irq_domain_free,
  772. .activate = hv_pci_vec_irq_domain_activate,
  773. };
  774. static int hv_pci_irqchip_init(void)
  775. {
  776. static struct hv_pci_chip_data *chip_data;
  777. struct fwnode_handle *fn = NULL;
  778. int ret = -ENOMEM;
  779. chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
  780. if (!chip_data)
  781. return ret;
  782. mutex_init(&chip_data->map_lock);
  783. fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64");
  784. if (!fn)
  785. goto free_chip;
  786. /*
  787. * Once enabled, the IRQ domain should not be removed, since there is no
  788. * way to ensure that all the corresponding devices are also gone and
  789. * no interrupts will be generated.
  790. */
  791. hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
  792. fn, &hv_pci_domain_ops,
  793. chip_data);
  794. if (!hv_msi_gic_irq_domain) {
  795. pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
  796. goto free_chip;
  797. }
  798. return 0;
  799. free_chip:
  800. kfree(chip_data);
  801. if (fn)
  802. irq_domain_free_fwnode(fn);
  803. return ret;
  804. }
  805. static struct irq_domain *hv_pci_get_root_domain(void)
  806. {
  807. return hv_msi_gic_irq_domain;
  808. }
  809. /*
  810. * SPIs are used for interrupts of PCI devices and are managed via GICD
  811. * registers which Hyper-V already supports, so no hypercall needed.
  812. */
  813. static void hv_arch_irq_unmask(struct irq_data *data) { }
  814. #endif /* CONFIG_ARM64 */
  815. /**
  816. * hv_pci_generic_compl() - Invoked for a completion packet
  817. * @context: Set up by the sender of the packet.
  818. * @resp: The response packet
  819. * @resp_packet_size: Size in bytes of the packet
  820. *
  821. * This function is used to trigger an event and report status
  822. * for any message for which the completion packet contains a
  823. * status and nothing else.
  824. */
  825. static void hv_pci_generic_compl(void *context, struct pci_response *resp,
  826. int resp_packet_size)
  827. {
  828. struct hv_pci_compl *comp_pkt = context;
  829. comp_pkt->completion_status = resp->status;
  830. complete(&comp_pkt->host_event);
  831. }
  832. static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
  833. u32 wslot);
  834. static void get_pcichild(struct hv_pci_dev *hpdev)
  835. {
  836. refcount_inc(&hpdev->refs);
  837. }
  838. static void put_pcichild(struct hv_pci_dev *hpdev)
  839. {
  840. if (refcount_dec_and_test(&hpdev->refs))
  841. kfree(hpdev);
  842. }
  843. /*
  844. * There is no good way to get notified from vmbus_onoffer_rescind(),
  845. * so let's use polling here, since this is not a hot path.
  846. */
  847. static int wait_for_response(struct hv_device *hdev,
  848. struct completion *comp)
  849. {
  850. while (true) {
  851. if (hdev->channel->rescind) {
  852. dev_warn_once(&hdev->device, "The device is gone.\n");
  853. return -ENODEV;
  854. }
  855. if (wait_for_completion_timeout(comp, HZ / 10))
  856. break;
  857. }
  858. return 0;
  859. }
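/*
 * Illustrative sketch (not part of the upstream driver): the request/response
 * pattern used throughout this file. A pci_packet with a completion callback
 * is sent on the VMbus channel and the sender blocks in wait_for_response()
 * until the host answers or the channel is rescinded. Request-id handling is
 * simplified here; the real code routes ids through the VMbus requestor.
 */
static inline int hv_pci_example_send_and_wait(struct hv_device *hdev,
					       struct pci_message *msg,
					       u32 msg_size)
{
	struct {
		struct pci_packet pkt;
		u8 buf[64];	/* generous room for the message body */
	} ctxt;
	struct hv_pci_compl comp_pkt;
	int ret;

	if (msg_size > sizeof(ctxt.buf))
		return -EINVAL;

	init_completion(&comp_pkt.host_event);
	ctxt.pkt.completion_func = hv_pci_generic_compl;
	ctxt.pkt.compl_ctxt = &comp_pkt;
	memcpy(ctxt.pkt.message, msg, msg_size);

	ret = vmbus_sendpacket(hdev->channel, ctxt.pkt.message, msg_size,
			       (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		return ret;

	ret = wait_for_response(hdev, &comp_pkt.host_event);
	if (ret)
		return ret;

	return comp_pkt.completion_status < 0 ? -EPROTO : 0;
}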
  860. /**
  861. * devfn_to_wslot() - Convert from Linux PCI slot to Windows
  862. * @devfn: The Linux representation of PCI slot
  863. *
  864. * Windows uses a slightly different representation of PCI slot.
  865. *
  866. * Return: The Windows representation
  867. */
  868. static u32 devfn_to_wslot(int devfn)
  869. {
  870. union win_slot_encoding wslot;
  871. wslot.slot = 0;
  872. wslot.bits.dev = PCI_SLOT(devfn);
  873. wslot.bits.func = PCI_FUNC(devfn);
  874. return wslot.slot;
  875. }
  876. /**
  877. * wslot_to_devfn() - Convert from Windows PCI slot to Linux
  878. * @wslot: The Windows representation of PCI slot
  879. *
  880. * Windows uses a slightly different representation of PCI slot.
  881. *
  882. * Return: The Linux representation
  883. */
  884. static int wslot_to_devfn(u32 wslot)
  885. {
  886. union win_slot_encoding slot_no;
  887. slot_no.slot = wslot;
  888. return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
  889. }
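/*
 * Illustrative sketch (not part of the upstream driver): the two helpers
 * above are inverses of each other for any devfn whose device number fits in
 * 5 bits and whose function number fits in 3 bits.
 */
static inline bool hv_pci_example_wslot_roundtrip(void)
{
	int devfn = PCI_DEVFN(1, 2);	/* hypothetical device 1, function 2 */

	return wslot_to_devfn(devfn_to_wslot(devfn)) == devfn;
}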
  890. static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size, u32 *val)
  891. {
  892. struct hv_mmio_read_input *in;
  893. struct hv_mmio_read_output *out;
  894. u64 ret;
  895. /*
  896. * Must be called with interrupts disabled so it is safe
  897. * to use the per-cpu input argument page. Use it for
  898. * both input and output.
  899. */
  900. in = *this_cpu_ptr(hyperv_pcpu_input_arg);
  901. out = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*in);
  902. in->gpa = gpa;
  903. in->size = size;
  904. ret = hv_do_hypercall(HVCALL_MMIO_READ, in, out);
  905. if (hv_result_success(ret)) {
  906. switch (size) {
  907. case 1:
  908. *val = *(u8 *)(out->data);
  909. break;
  910. case 2:
  911. *val = *(u16 *)(out->data);
  912. break;
  913. default:
  914. *val = *(u32 *)(out->data);
  915. break;
  916. }
  917. } else
  918. dev_err(dev, "MMIO read hypercall error %llx addr %llx size %d\n",
  919. ret, gpa, size);
  920. }
  921. static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size, u32 val)
  922. {
  923. struct hv_mmio_write_input *in;
  924. u64 ret;
  925. /*
  926. * Must be called with interrupts disabled so it is safe
  927. * to use the per-cpu input argument memory.
  928. */
  929. in = *this_cpu_ptr(hyperv_pcpu_input_arg);
  930. in->gpa = gpa;
  931. in->size = size;
  932. switch (size) {
  933. case 1:
  934. *(u8 *)(in->data) = val;
  935. break;
  936. case 2:
  937. *(u16 *)(in->data) = val;
  938. break;
  939. default:
  940. *(u32 *)(in->data) = val;
  941. break;
  942. }
  943. ret = hv_do_hypercall(HVCALL_MMIO_WRITE, in, NULL);
  944. if (!hv_result_success(ret))
  945. dev_err(dev, "MMIO write hypercall error %llx addr %llx size %d\n",
  946. ret, gpa, size);
  947. }
  948. /*
  949. * PCI Configuration Space for these root PCI buses is implemented as a pair
  950. * of pages in memory-mapped I/O space. Writing to the first page chooses
  951. * the PCI function being written or read. Once the first page has been
  952. * written to, the following page maps in the entire configuration space of
  953. * the function.
  954. */
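/*
 * Illustrative sketch (not part of the upstream driver): the "select, then
 * access" sequence described above, with the config_lock and the hypercall
 * path omitted for brevity. The real accessors below add both.
 */
static inline u32 hv_pci_example_cfg_read32(struct hv_pcibus_device *hbus,
					    u32 wslot, int where)
{
	/* Page 0: choose which function the window refers to. */
	writel(wslot, hbus->cfg_addr);
	/* Order the select against the following access. */
	mb();
	/* Page 1: the selected function's config space. */
	return readl(hbus->cfg_addr + CFG_PAGE_OFFSET + where);
}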
  955. /**
  956. * _hv_pcifront_read_config() - Internal PCI config read
  957. * @hpdev: The PCI driver's representation of the device
  958. * @where: Offset within config space
  959. * @size: Size of the transfer
  960. * @val: Pointer to the buffer receiving the data
  961. */
  962. static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
  963. int size, u32 *val)
  964. {
  965. struct hv_pcibus_device *hbus = hpdev->hbus;
  966. struct device *dev = &hbus->hdev->device;
  967. int offset = where + CFG_PAGE_OFFSET;
  968. unsigned long flags;
  969. /*
  970. * If the attempt is to read the IDs or the ROM BAR, simulate that.
  971. */
  972. if (where + size <= PCI_COMMAND) {
  973. memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
  974. } else if (where >= PCI_CLASS_REVISION && where + size <=
  975. PCI_CACHE_LINE_SIZE) {
  976. memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
  977. PCI_CLASS_REVISION, size);
  978. } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
  979. PCI_ROM_ADDRESS) {
  980. memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
  981. PCI_SUBSYSTEM_VENDOR_ID, size);
  982. } else if (where >= PCI_ROM_ADDRESS && where + size <=
  983. PCI_CAPABILITY_LIST) {
  984. /* ROM BARs are unimplemented */
  985. *val = 0;
  986. } else if ((where >= PCI_INTERRUPT_LINE && where + size <= PCI_INTERRUPT_PIN) ||
  987. (where >= PCI_INTERRUPT_PIN && where + size <= PCI_MIN_GNT)) {
  988. /*
  989. * Interrupt Line and Interrupt PIN are hard-wired to zero
  990. * because this front-end only supports message-signaled
  991. * interrupts.
  992. */
  993. *val = 0;
  994. } else if (where + size <= CFG_PAGE_SIZE) {
  995. spin_lock_irqsave(&hbus->config_lock, flags);
  996. if (hbus->use_calls) {
  997. phys_addr_t addr = hbus->mem_config->start + offset;
  998. hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
  999. hpdev->desc.win_slot.slot);
  1000. hv_pci_read_mmio(dev, addr, size, val);
  1001. } else {
  1002. void __iomem *addr = hbus->cfg_addr + offset;
  1003. /* Choose the function to be read. (See comment above) */
  1004. writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
  1005. /* Make sure the function was chosen before reading. */
  1006. mb();
  1007. /* Read from that function's config space. */
  1008. switch (size) {
  1009. case 1:
  1010. *val = readb(addr);
  1011. break;
  1012. case 2:
  1013. *val = readw(addr);
  1014. break;
  1015. default:
  1016. *val = readl(addr);
  1017. break;
  1018. }
  1019. /*
  1020. * Make sure the read was done before we release the
  1021. * spinlock allowing consecutive reads/writes.
  1022. */
  1023. mb();
  1024. }
  1025. spin_unlock_irqrestore(&hbus->config_lock, flags);
  1026. } else {
  1027. dev_err(dev, "Attempt to read beyond a function's config space.\n");
  1028. }
  1029. }
  1030. static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
  1031. {
  1032. struct hv_pcibus_device *hbus = hpdev->hbus;
  1033. struct device *dev = &hbus->hdev->device;
  1034. u32 val;
  1035. u16 ret;
  1036. unsigned long flags;
  1037. spin_lock_irqsave(&hbus->config_lock, flags);
  1038. if (hbus->use_calls) {
  1039. phys_addr_t addr = hbus->mem_config->start +
  1040. CFG_PAGE_OFFSET + PCI_VENDOR_ID;
  1041. hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
  1042. hpdev->desc.win_slot.slot);
  1043. hv_pci_read_mmio(dev, addr, 2, &val);
  1044. ret = val; /* Truncates to 16 bits */
  1045. } else {
  1046. void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET +
  1047. PCI_VENDOR_ID;
  1048. /* Choose the function to be read. (See comment above) */
  1049. writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
  1050. /* Make sure the function was chosen before we start reading. */
  1051. mb();
  1052. /* Read from that function's config space. */
  1053. ret = readw(addr);
  1054. /*
  1055. * mb() is not required here, because the
  1056. * spin_unlock_irqrestore() is a barrier.
  1057. */
  1058. }
  1059. spin_unlock_irqrestore(&hbus->config_lock, flags);
  1060. return ret;
  1061. }
  1062. /**
  1063. * _hv_pcifront_write_config() - Internal PCI config write
  1064. * @hpdev: The PCI driver's representation of the device
  1065. * @where: Offset within config space
  1066. * @size: Size of the transfer
  1067. * @val: The data being transferred
  1068. */
  1069. static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
  1070. int size, u32 val)
  1071. {
  1072. struct hv_pcibus_device *hbus = hpdev->hbus;
  1073. struct device *dev = &hbus->hdev->device;
  1074. int offset = where + CFG_PAGE_OFFSET;
  1075. unsigned long flags;
  1076. if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
  1077. where + size <= PCI_CAPABILITY_LIST) {
  1078. /* SSIDs and ROM BARs are read-only */
  1079. } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
  1080. spin_lock_irqsave(&hbus->config_lock, flags);
  1081. if (hbus->use_calls) {
  1082. phys_addr_t addr = hbus->mem_config->start + offset;
  1083. hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
  1084. hpdev->desc.win_slot.slot);
  1085. hv_pci_write_mmio(dev, addr, size, val);
  1086. } else {
  1087. void __iomem *addr = hbus->cfg_addr + offset;
  1088. /* Choose the function to write. (See comment above) */
  1089. writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
  1090. /* Make sure the function was chosen before writing. */
  1091. wmb();
  1092. /* Write to that function's config space. */
  1093. switch (size) {
  1094. case 1:
  1095. writeb(val, addr);
  1096. break;
  1097. case 2:
  1098. writew(val, addr);
  1099. break;
  1100. default:
  1101. writel(val, addr);
  1102. break;
  1103. }
  1104. /*
  1105. * Make sure the write was done before we release the
  1106. * spinlock allowing consecutive reads/writes.
  1107. */
  1108. mb();
  1109. }
  1110. spin_unlock_irqrestore(&hbus->config_lock, flags);
  1111. } else {
  1112. dev_err(dev, "Attempt to write beyond a function's config space.\n");
  1113. }
  1114. }
  1115. /**
  1116. * hv_pcifront_read_config() - Read configuration space
  1117. * @bus: PCI Bus structure
  1118. * @devfn: Device/function
  1119. * @where: Offset from base
  1120. * @size: Byte/word/dword
  1121. * @val: Value to be read
  1122. *
  1123. * Return: PCIBIOS_SUCCESSFUL on success
  1124. * PCIBIOS_DEVICE_NOT_FOUND on failure
  1125. */
  1126. static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
  1127. int where, int size, u32 *val)
  1128. {
  1129. struct hv_pcibus_device *hbus =
  1130. container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
  1131. struct hv_pci_dev *hpdev;
  1132. hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
  1133. if (!hpdev)
  1134. return PCIBIOS_DEVICE_NOT_FOUND;
  1135. _hv_pcifront_read_config(hpdev, where, size, val);
  1136. put_pcichild(hpdev);
  1137. return PCIBIOS_SUCCESSFUL;
  1138. }
  1139. /**
  1140. * hv_pcifront_write_config() - Write configuration space
  1141. * @bus: PCI Bus structure
  1142. * @devfn: Device/function
  1143. * @where: Offset from base
  1144. * @size: Byte/word/dword
  1145. * @val: Value to be written to device
  1146. *
  1147. * Return: PCIBIOS_SUCCESSFUL on success
  1148. * PCIBIOS_DEVICE_NOT_FOUND on failure
  1149. */
  1150. static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
  1151. int where, int size, u32 val)
  1152. {
  1153. struct hv_pcibus_device *hbus =
  1154. container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
  1155. struct hv_pci_dev *hpdev;
  1156. hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
  1157. if (!hpdev)
  1158. return PCIBIOS_DEVICE_NOT_FOUND;
  1159. _hv_pcifront_write_config(hpdev, where, size, val);
  1160. put_pcichild(hpdev);
  1161. return PCIBIOS_SUCCESSFUL;
  1162. }
  1163. /* PCIe operations */
  1164. static struct pci_ops hv_pcifront_ops = {
  1165. .read = hv_pcifront_read_config,
  1166. .write = hv_pcifront_write_config,
  1167. };
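/*
 * Illustrative note, not part of the original file: hv_pcifront_ops is how the
 * PCI core reaches the accessors above. For example, a child device driver
 * calling
 *
 *	u16 vendor;
 *	pci_read_config_word(pdev, PCI_VENDOR_ID, &vendor);
 *
 * ends up in hv_pcifront_read_config(bus, devfn, PCI_VENDOR_ID, 2, &val),
 * which maps devfn to a wslot and forwards to _hv_pcifront_read_config().
 */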
  1168. /*
  1169. * Paravirtual backchannel
  1170. *
  1171. * Hyper-V SR-IOV provides a backchannel mechanism in software for
  1172. * communication between a VF driver and a PF driver. These
  1173. * "configuration blocks" are similar in concept to PCI configuration space,
  1174. * but instead of doing reads and writes in 32-bit chunks through a very slow
  1175. * path, packets of up to 128 bytes can be sent or received asynchronously.
  1176. *
  1177. * Nearly every SR-IOV device contains just such a communications channel in
  1178. * hardware, so using this one in software is usually optional. Using the
  1179. * software channel, however, allows driver implementers to leverage software
  1180. * tools that fuzz the communications channel looking for vulnerabilities.
  1181. *
  1182. * The usage model for these packets puts the responsibility for reading or
  1183. * writing on the VF driver. The VF driver sends a read or a write packet,
  1184. * indicating which "block" is being referred to by number.
  1185. *
  1186. * If the PF driver wishes to initiate communication, it can "invalidate" one or
  1187. * more of the first 64 blocks. This invalidation is delivered via a callback
1188. * supplied by the VF driver to this driver.
  1189. *
  1190. * No protocol is implied, except that supplied by the PF and VF drivers.
  1191. */
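/*
 * Hedged usage sketch, for illustration only: from the VF driver's side the
 * backchannel reduces to three operations, implemented below and exposed to
 * VF drivers through a small wrapper interface rather than called directly.
 * MY_BLOCK_ID, ctx and my_invalidate() are hypothetical names:
 *
 *	u8 buf[HV_CONFIG_BLOCK_SIZE_MAX];
 *	unsigned int got;
 *
 *	hv_register_block_invalidate(vf_pdev, ctx, my_invalidate);
 *	hv_read_config_block(vf_pdev, buf, sizeof(buf), MY_BLOCK_ID, &got);
 *	hv_write_config_block(vf_pdev, buf, got, MY_BLOCK_ID);
 *
 * What the bytes in a block mean is entirely up to the PF and VF drivers.
 */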
  1192. struct hv_read_config_compl {
  1193. struct hv_pci_compl comp_pkt;
  1194. void *buf;
  1195. unsigned int len;
  1196. unsigned int bytes_returned;
  1197. };
  1198. /**
  1199. * hv_pci_read_config_compl() - Invoked when a response packet
  1200. * for a read config block operation arrives.
  1201. * @context: Identifies the read config operation
  1202. * @resp: The response packet itself
  1203. * @resp_packet_size: Size in bytes of the response packet
  1204. */
  1205. static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
  1206. int resp_packet_size)
  1207. {
  1208. struct hv_read_config_compl *comp = context;
  1209. struct pci_read_block_response *read_resp =
  1210. (struct pci_read_block_response *)resp;
  1211. unsigned int data_len, hdr_len;
  1212. hdr_len = offsetof(struct pci_read_block_response, bytes);
  1213. if (resp_packet_size < hdr_len) {
  1214. comp->comp_pkt.completion_status = -1;
  1215. goto out;
  1216. }
  1217. data_len = resp_packet_size - hdr_len;
  1218. if (data_len > 0 && read_resp->status == 0) {
  1219. comp->bytes_returned = min(comp->len, data_len);
  1220. memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
  1221. } else {
  1222. comp->bytes_returned = 0;
  1223. }
  1224. comp->comp_pkt.completion_status = read_resp->status;
  1225. out:
  1226. complete(&comp->comp_pkt.host_event);
  1227. }
  1228. /**
  1229. * hv_read_config_block() - Sends a read config block request to
  1230. * the back-end driver running in the Hyper-V parent partition.
  1231. * @pdev: The PCI driver's representation for this device.
  1232. * @buf: Buffer into which the config block will be copied.
  1233. * @len: Size in bytes of buf.
  1234. * @block_id: Identifies the config block which has been requested.
  1235. * @bytes_returned: Size which came back from the back-end driver.
  1236. *
  1237. * Return: 0 on success, -errno on failure
  1238. */
  1239. static int hv_read_config_block(struct pci_dev *pdev, void *buf,
  1240. unsigned int len, unsigned int block_id,
  1241. unsigned int *bytes_returned)
  1242. {
  1243. struct hv_pcibus_device *hbus =
  1244. container_of(pdev->bus->sysdata, struct hv_pcibus_device,
  1245. sysdata);
  1246. struct {
  1247. struct pci_packet pkt;
  1248. char buf[sizeof(struct pci_read_block)];
  1249. } pkt;
  1250. struct hv_read_config_compl comp_pkt;
  1251. struct pci_read_block *read_blk;
  1252. int ret;
  1253. if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
  1254. return -EINVAL;
  1255. init_completion(&comp_pkt.comp_pkt.host_event);
  1256. comp_pkt.buf = buf;
  1257. comp_pkt.len = len;
  1258. memset(&pkt, 0, sizeof(pkt));
  1259. pkt.pkt.completion_func = hv_pci_read_config_compl;
  1260. pkt.pkt.compl_ctxt = &comp_pkt;
  1261. read_blk = (struct pci_read_block *)&pkt.pkt.message;
  1262. read_blk->message_type.type = PCI_READ_BLOCK;
  1263. read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
  1264. read_blk->block_id = block_id;
  1265. read_blk->bytes_requested = len;
  1266. ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
  1267. sizeof(*read_blk), (unsigned long)&pkt.pkt,
  1268. VM_PKT_DATA_INBAND,
  1269. VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
  1270. if (ret)
  1271. return ret;
  1272. ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
  1273. if (ret)
  1274. return ret;
  1275. if (comp_pkt.comp_pkt.completion_status != 0 ||
  1276. comp_pkt.bytes_returned == 0) {
  1277. dev_err(&hbus->hdev->device,
  1278. "Read Config Block failed: 0x%x, bytes_returned=%d\n",
  1279. comp_pkt.comp_pkt.completion_status,
  1280. comp_pkt.bytes_returned);
  1281. return -EIO;
  1282. }
  1283. *bytes_returned = comp_pkt.bytes_returned;
  1284. return 0;
  1285. }
  1286. /**
  1287. * hv_pci_write_config_compl() - Invoked when a response packet for a write
  1288. * config block operation arrives.
  1289. * @context: Identifies the write config operation
  1290. * @resp: The response packet itself
  1291. * @resp_packet_size: Size in bytes of the response packet
  1292. */
  1293. static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
  1294. int resp_packet_size)
  1295. {
  1296. struct hv_pci_compl *comp_pkt = context;
  1297. comp_pkt->completion_status = resp->status;
  1298. complete(&comp_pkt->host_event);
  1299. }
  1300. /**
  1301. * hv_write_config_block() - Sends a write config block request to the
  1302. * back-end driver running in the Hyper-V parent partition.
  1303. * @pdev: The PCI driver's representation for this device.
  1304. * @buf: Buffer from which the config block will be copied.
  1305. * @len: Size in bytes of buf.
  1306. * @block_id: Identifies the config block which is being written.
  1307. *
  1308. * Return: 0 on success, -errno on failure
  1309. */
  1310. static int hv_write_config_block(struct pci_dev *pdev, void *buf,
  1311. unsigned int len, unsigned int block_id)
  1312. {
  1313. struct hv_pcibus_device *hbus =
  1314. container_of(pdev->bus->sysdata, struct hv_pcibus_device,
  1315. sysdata);
  1316. struct {
  1317. struct pci_packet pkt;
  1318. char buf[sizeof(struct pci_write_block)];
  1319. u32 reserved;
  1320. } pkt;
  1321. struct hv_pci_compl comp_pkt;
  1322. struct pci_write_block *write_blk;
  1323. u32 pkt_size;
  1324. int ret;
  1325. if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
  1326. return -EINVAL;
  1327. init_completion(&comp_pkt.host_event);
  1328. memset(&pkt, 0, sizeof(pkt));
  1329. pkt.pkt.completion_func = hv_pci_write_config_compl;
  1330. pkt.pkt.compl_ctxt = &comp_pkt;
  1331. write_blk = (struct pci_write_block *)&pkt.pkt.message;
  1332. write_blk->message_type.type = PCI_WRITE_BLOCK;
  1333. write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
  1334. write_blk->block_id = block_id;
  1335. write_blk->byte_count = len;
  1336. memcpy(write_blk->bytes, buf, len);
  1337. pkt_size = offsetof(struct pci_write_block, bytes) + len;
  1338. /*
  1339. * This quirk is required on some hosts shipped around 2018, because
  1340. * these hosts don't check the pkt_size correctly (new hosts have been
  1341. * fixed since early 2019). The quirk is also safe on very old hosts
  1342. * and new hosts, because, on them, what really matters is the length
  1343. * specified in write_blk->byte_count.
  1344. */
  1345. pkt_size += sizeof(pkt.reserved);
  1346. ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
  1347. (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
  1348. VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
  1349. if (ret)
  1350. return ret;
  1351. ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
  1352. if (ret)
  1353. return ret;
  1354. if (comp_pkt.completion_status != 0) {
  1355. dev_err(&hbus->hdev->device,
  1356. "Write Config Block failed: 0x%x\n",
  1357. comp_pkt.completion_status);
  1358. return -EIO;
  1359. }
  1360. return 0;
  1361. }
  1362. /**
1363. * hv_register_block_invalidate() - Register a callback to be invoked when a
1364. * config block invalidation arrives from the back-end driver.
1365. * @pdev: The PCI driver's representation for this device.
1366. * @context: Opaque pointer passed back to the callback; identifies the device.
1367. * @block_invalidate: Callback invoked with a mask of the invalidated blocks.
  1368. *
  1369. * Return: 0 on success, -errno on failure
  1370. */
  1371. static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
  1372. void (*block_invalidate)(void *context,
  1373. u64 block_mask))
  1374. {
  1375. struct hv_pcibus_device *hbus =
  1376. container_of(pdev->bus->sysdata, struct hv_pcibus_device,
  1377. sysdata);
  1378. struct hv_pci_dev *hpdev;
  1379. hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
  1380. if (!hpdev)
  1381. return -ENODEV;
  1382. hpdev->block_invalidate = block_invalidate;
  1383. hpdev->invalidate_context = context;
  1384. put_pcichild(hpdev);
  1385. return 0;
  1386. }
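/*
 * Illustrative sketch of a callback as registered above (hypothetical names,
 * not part of this driver). block_mask is a u64 covering the first 64 blocks,
 * so a set bit N is taken to mean block N was invalidated by the PF driver:
 *
 *	static void my_invalidate(void *context, u64 block_mask)
 *	{
 *		struct my_vf_state *vf = context;
 *
 *		if (block_mask & BIT_ULL(MY_BLOCK_ID))
 *			schedule_work(&vf->reread_block_work);
 *	}
 */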
  1387. /* Interrupt management hooks */
  1388. static void hv_int_desc_free(struct hv_pci_dev *hpdev,
  1389. struct tran_int_desc *int_desc)
  1390. {
  1391. struct pci_delete_interrupt *int_pkt;
  1392. struct {
  1393. struct pci_packet pkt;
  1394. u8 buffer[sizeof(struct pci_delete_interrupt)];
  1395. } ctxt;
  1396. if (!int_desc->vector_count) {
  1397. kfree(int_desc);
  1398. return;
  1399. }
  1400. memset(&ctxt, 0, sizeof(ctxt));
  1401. int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
  1402. int_pkt->message_type.type =
  1403. PCI_DELETE_INTERRUPT_MESSAGE;
  1404. int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
  1405. int_pkt->int_desc = *int_desc;
  1406. vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
  1407. 0, VM_PKT_DATA_INBAND, 0);
  1408. kfree(int_desc);
  1409. }
  1410. /**
  1411. * hv_msi_free() - Free the MSI.
  1412. * @domain: The interrupt domain pointer
  1413. * @info: Extra MSI-related context
  1414. * @irq: Identifies the IRQ.
  1415. *
  1416. * The Hyper-V parent partition and hypervisor are tracking the
  1417. * messages that are in use, keeping the interrupt redirection
  1418. * table up to date. This callback sends a message that frees
  1419. * the IRT entry and related tracking nonsense.
  1420. */
  1421. static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
  1422. unsigned int irq)
  1423. {
  1424. struct hv_pcibus_device *hbus;
  1425. struct hv_pci_dev *hpdev;
  1426. struct pci_dev *pdev;
  1427. struct tran_int_desc *int_desc;
  1428. struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
  1429. struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
  1430. pdev = msi_desc_to_pci_dev(msi);
  1431. hbus = info->data;
  1432. int_desc = irq_data_get_irq_chip_data(irq_data);
  1433. if (!int_desc)
  1434. return;
  1435. irq_data->chip_data = NULL;
  1436. hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
  1437. if (!hpdev) {
  1438. kfree(int_desc);
  1439. return;
  1440. }
  1441. hv_int_desc_free(hpdev, int_desc);
  1442. put_pcichild(hpdev);
  1443. }
  1444. static void hv_irq_mask(struct irq_data *data)
  1445. {
  1446. pci_msi_mask_irq(data);
  1447. if (data->parent_data->chip->irq_mask)
  1448. irq_chip_mask_parent(data);
  1449. }
  1450. static void hv_irq_unmask(struct irq_data *data)
  1451. {
  1452. hv_arch_irq_unmask(data);
  1453. if (data->parent_data->chip->irq_unmask)
  1454. irq_chip_unmask_parent(data);
  1455. pci_msi_unmask_irq(data);
  1456. }
  1457. struct compose_comp_ctxt {
  1458. struct hv_pci_compl comp_pkt;
  1459. struct tran_int_desc int_desc;
  1460. };
  1461. static void hv_pci_compose_compl(void *context, struct pci_response *resp,
  1462. int resp_packet_size)
  1463. {
  1464. struct compose_comp_ctxt *comp_pkt = context;
  1465. struct pci_create_int_response *int_resp =
  1466. (struct pci_create_int_response *)resp;
  1467. if (resp_packet_size < sizeof(*int_resp)) {
  1468. comp_pkt->comp_pkt.completion_status = -1;
  1469. goto out;
  1470. }
  1471. comp_pkt->comp_pkt.completion_status = resp->status;
  1472. comp_pkt->int_desc = int_resp->int_desc;
  1473. out:
  1474. complete(&comp_pkt->comp_pkt.host_event);
  1475. }
  1476. static u32 hv_compose_msi_req_v1(
  1477. struct pci_create_interrupt *int_pkt,
  1478. u32 slot, u8 vector, u16 vector_count)
  1479. {
  1480. int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
  1481. int_pkt->wslot.slot = slot;
  1482. int_pkt->int_desc.vector = vector;
  1483. int_pkt->int_desc.vector_count = vector_count;
  1484. int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
  1485. /*
  1486. * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
  1487. * hv_irq_unmask().
  1488. */
  1489. int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
  1490. return sizeof(*int_pkt);
  1491. }
  1492. /*
  1493. * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
  1494. * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
  1495. * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V
  1496. * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is
  1497. * not irrelevant because Hyper-V chooses the physical CPU to handle the
1498. * interrupts based on the vCPU specified in the message sent to the vPCI VSP in
  1499. * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest,
  1500. * but assigning too many vPCI device interrupts to the same pCPU can cause a
  1501. * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V
  1502. * to spread out the pCPUs that it selects.
  1503. *
  1504. * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
  1505. * to always return the same dummy vCPU, because a second call to
  1506. * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
  1507. * new pCPU for the interrupt. But for the multi-MSI case, the second call to
  1508. * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
  1509. * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
  1510. * the pCPUs are spread out. All interrupts for a multi-MSI device end up using
  1511. * the same pCPU, even though the vCPUs will be spread out by later calls
1512. * to hv_irq_unmask(); that is the best we can do now.
  1513. *
  1514. * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not*
  1515. * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an
  1516. * enhancement is planned for a future version. With that enhancement, the
  1517. * dummy vCPU selection won't matter, and interrupts for the same multi-MSI
  1518. * device will be spread across multiple pCPUs.
  1519. */
  1520. /*
  1521. * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
  1522. * by subsequent retarget in hv_irq_unmask().
  1523. */
  1524. static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
  1525. {
  1526. return cpumask_first_and(affinity, cpu_online_mask);
  1527. }
  1528. /*
  1529. * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
  1530. */
  1531. static int hv_compose_multi_msi_req_get_cpu(void)
  1532. {
  1533. static DEFINE_SPINLOCK(multi_msi_cpu_lock);
  1534. /* -1 means starting with CPU 0 */
  1535. static int cpu_next = -1;
  1536. unsigned long flags;
  1537. int cpu;
  1538. spin_lock_irqsave(&multi_msi_cpu_lock, flags);
  1539. cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids,
  1540. false);
  1541. cpu = cpu_next;
  1542. spin_unlock_irqrestore(&multi_msi_cpu_lock, flags);
  1543. return cpu;
  1544. }
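/*
 * Worked example, for illustration only: with CPUs 0-3 online, successive
 * multi-MSI composes get dummy vCPUs 0, 1, 2, 3, 0, ... from
 * hv_compose_multi_msi_req_get_cpu(), while hv_compose_msi_req_get_cpu()
 * returns the first online CPU in the affinity mask every time. Only the
 * multi-MSI case needs the round-robin, for the reasons in the comment block
 * above.
 */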
  1545. static u32 hv_compose_msi_req_v2(
  1546. struct pci_create_interrupt2 *int_pkt, int cpu,
  1547. u32 slot, u8 vector, u16 vector_count)
  1548. {
  1549. int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
  1550. int_pkt->wslot.slot = slot;
  1551. int_pkt->int_desc.vector = vector;
  1552. int_pkt->int_desc.vector_count = vector_count;
  1553. int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
  1554. int_pkt->int_desc.processor_array[0] =
  1555. hv_cpu_number_to_vp_number(cpu);
  1556. int_pkt->int_desc.processor_count = 1;
  1557. return sizeof(*int_pkt);
  1558. }
  1559. static u32 hv_compose_msi_req_v3(
  1560. struct pci_create_interrupt3 *int_pkt, int cpu,
  1561. u32 slot, u32 vector, u16 vector_count)
  1562. {
  1563. int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
  1564. int_pkt->wslot.slot = slot;
  1565. int_pkt->int_desc.vector = vector;
  1566. int_pkt->int_desc.reserved = 0;
  1567. int_pkt->int_desc.vector_count = vector_count;
  1568. int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
  1569. int_pkt->int_desc.processor_array[0] =
  1570. hv_cpu_number_to_vp_number(cpu);
  1571. int_pkt->int_desc.processor_count = 1;
  1572. return sizeof(*int_pkt);
  1573. }
  1574. /**
  1575. * hv_compose_msi_msg() - Supplies a valid MSI address/data
  1576. * @data: Everything about this MSI
  1577. * @msg: Buffer that is filled in by this function
  1578. *
  1579. * This function unpacks the IRQ looking for target CPU set, IDT
  1580. * vector and mode and sends a message to the parent partition
  1581. * asking for a mapping for that tuple in this partition. The
  1582. * response supplies a data value and address to which that data
  1583. * should be written to trigger that interrupt.
  1584. */
  1585. static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
  1586. {
  1587. struct hv_pcibus_device *hbus;
  1588. struct vmbus_channel *channel;
  1589. struct hv_pci_dev *hpdev;
  1590. struct pci_bus *pbus;
  1591. struct pci_dev *pdev;
  1592. const struct cpumask *dest;
  1593. struct compose_comp_ctxt comp;
  1594. struct tran_int_desc *int_desc;
  1595. struct msi_desc *msi_desc;
  1596. /*
  1597. * vector_count should be u16: see hv_msi_desc, hv_msi_desc2
  1598. * and hv_msi_desc3. vector must be u32: see hv_msi_desc3.
  1599. */
  1600. u16 vector_count;
  1601. u32 vector;
  1602. struct {
  1603. struct pci_packet pci_pkt;
  1604. union {
  1605. struct pci_create_interrupt v1;
  1606. struct pci_create_interrupt2 v2;
  1607. struct pci_create_interrupt3 v3;
  1608. } int_pkts;
  1609. } __packed ctxt;
  1610. bool multi_msi;
  1611. u64 trans_id;
  1612. u32 size;
  1613. int ret;
  1614. int cpu;
  1615. msi_desc = irq_data_get_msi_desc(data);
  1616. multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
  1617. msi_desc->nvec_used > 1;
  1618. /* Reuse the previous allocation */
  1619. if (data->chip_data && multi_msi) {
  1620. int_desc = data->chip_data;
  1621. msg->address_hi = int_desc->address >> 32;
  1622. msg->address_lo = int_desc->address & 0xffffffff;
  1623. msg->data = int_desc->data;
  1624. return;
  1625. }
  1626. pdev = msi_desc_to_pci_dev(msi_desc);
  1627. dest = irq_data_get_effective_affinity_mask(data);
  1628. pbus = pdev->bus;
  1629. hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
  1630. channel = hbus->hdev->channel;
  1631. hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
  1632. if (!hpdev)
  1633. goto return_null_message;
  1634. /* Free any previous message that might have already been composed. */
  1635. if (data->chip_data && !multi_msi) {
  1636. int_desc = data->chip_data;
  1637. data->chip_data = NULL;
  1638. hv_int_desc_free(hpdev, int_desc);
  1639. }
  1640. int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
  1641. if (!int_desc)
  1642. goto drop_reference;
  1643. if (multi_msi) {
  1644. /*
  1645. * If this is not the first MSI of Multi MSI, we already have
  1646. * a mapping. Can exit early.
  1647. */
  1648. if (msi_desc->irq != data->irq) {
  1649. data->chip_data = int_desc;
  1650. int_desc->address = msi_desc->msg.address_lo |
  1651. (u64)msi_desc->msg.address_hi << 32;
  1652. int_desc->data = msi_desc->msg.data +
  1653. (data->irq - msi_desc->irq);
  1654. msg->address_hi = msi_desc->msg.address_hi;
  1655. msg->address_lo = msi_desc->msg.address_lo;
  1656. msg->data = int_desc->data;
  1657. put_pcichild(hpdev);
  1658. return;
  1659. }
  1660. /*
  1661. * The vector we select here is a dummy value. The correct
  1662. * value gets sent to the hypervisor in unmask(). This needs
  1663. * to be aligned with the count, and also not zero. Multi-msi
  1664. * is powers of 2 up to 32, so 32 will always work here.
  1665. */
  1666. vector = 32;
  1667. vector_count = msi_desc->nvec_used;
  1668. cpu = hv_compose_multi_msi_req_get_cpu();
  1669. } else {
  1670. vector = hv_msi_get_int_vector(data);
  1671. vector_count = 1;
  1672. cpu = hv_compose_msi_req_get_cpu(dest);
  1673. }
  1674. /*
  1675. * hv_compose_msi_req_v1 and v2 are for x86 only, meaning 'vector'
  1676. * can't exceed u8. Cast 'vector' down to u8 for v1/v2 explicitly
  1677. * for better readability.
  1678. */
  1679. memset(&ctxt, 0, sizeof(ctxt));
  1680. init_completion(&comp.comp_pkt.host_event);
  1681. ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
  1682. ctxt.pci_pkt.compl_ctxt = &comp;
  1683. switch (hbus->protocol_version) {
  1684. case PCI_PROTOCOL_VERSION_1_1:
  1685. size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
  1686. hpdev->desc.win_slot.slot,
  1687. (u8)vector,
  1688. vector_count);
  1689. break;
  1690. case PCI_PROTOCOL_VERSION_1_2:
  1691. case PCI_PROTOCOL_VERSION_1_3:
  1692. size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
  1693. cpu,
  1694. hpdev->desc.win_slot.slot,
  1695. (u8)vector,
  1696. vector_count);
  1697. break;
  1698. case PCI_PROTOCOL_VERSION_1_4:
  1699. size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
  1700. cpu,
  1701. hpdev->desc.win_slot.slot,
  1702. vector,
  1703. vector_count);
  1704. break;
  1705. default:
  1706. /* As we only negotiate protocol versions known to this driver,
1707. * this path should never be hit. However, this is not a hot
1708. * path, so we print a message to aid future updates.
  1709. */
  1710. dev_err(&hbus->hdev->device,
  1711. "Unexpected vPCI protocol, update driver.");
  1712. goto free_int_desc;
  1713. }
  1714. ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
  1715. size, (unsigned long)&ctxt.pci_pkt,
  1716. &trans_id, VM_PKT_DATA_INBAND,
  1717. VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
  1718. if (ret) {
  1719. dev_err(&hbus->hdev->device,
  1720. "Sending request for interrupt failed: 0x%x",
  1721. comp.comp_pkt.completion_status);
  1722. goto free_int_desc;
  1723. }
  1724. /*
  1725. * Prevents hv_pci_onchannelcallback() from running concurrently
  1726. * in the tasklet.
  1727. */
  1728. tasklet_disable_in_atomic(&channel->callback_event);
  1729. /*
  1730. * Since this function is called with IRQ locks held, can't
  1731. * do normal wait for completion; instead poll.
  1732. */
  1733. while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
  1734. unsigned long flags;
  1735. /* 0xFFFF means an invalid PCI VENDOR ID. */
  1736. if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
  1737. dev_err_once(&hbus->hdev->device,
  1738. "the device has gone\n");
  1739. goto enable_tasklet;
  1740. }
  1741. /*
  1742. * Make sure that the ring buffer data structure doesn't get
  1743. * freed while we dereference the ring buffer pointer. Test
  1744. * for the channel's onchannel_callback being NULL within a
  1745. * sched_lock critical section. See also the inline comments
  1746. * in vmbus_reset_channel_cb().
  1747. */
  1748. spin_lock_irqsave(&channel->sched_lock, flags);
  1749. if (unlikely(channel->onchannel_callback == NULL)) {
  1750. spin_unlock_irqrestore(&channel->sched_lock, flags);
  1751. goto enable_tasklet;
  1752. }
  1753. hv_pci_onchannelcallback(hbus);
  1754. spin_unlock_irqrestore(&channel->sched_lock, flags);
  1755. udelay(100);
  1756. }
  1757. tasklet_enable(&channel->callback_event);
  1758. if (comp.comp_pkt.completion_status < 0) {
  1759. dev_err(&hbus->hdev->device,
  1760. "Request for interrupt failed: 0x%x",
  1761. comp.comp_pkt.completion_status);
  1762. goto free_int_desc;
  1763. }
  1764. /*
  1765. * Record the assignment so that this can be unwound later. Using
  1766. * irq_set_chip_data() here would be appropriate, but the lock it takes
  1767. * is already held.
  1768. */
  1769. *int_desc = comp.int_desc;
  1770. data->chip_data = int_desc;
  1771. /* Pass up the result. */
  1772. msg->address_hi = comp.int_desc.address >> 32;
  1773. msg->address_lo = comp.int_desc.address & 0xffffffff;
  1774. msg->data = comp.int_desc.data;
  1775. put_pcichild(hpdev);
  1776. return;
  1777. enable_tasklet:
  1778. tasklet_enable(&channel->callback_event);
  1779. /*
  1780. * The completion packet on the stack becomes invalid after 'return';
  1781. * remove the ID from the VMbus requestor if the identifier is still
  1782. * mapped to/associated with the packet. (The identifier could have
  1783. * been 're-used', i.e., already removed and (re-)mapped.)
  1784. *
  1785. * Cf. hv_pci_onchannelcallback().
  1786. */
  1787. vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt);
  1788. free_int_desc:
  1789. kfree(int_desc);
  1790. drop_reference:
  1791. put_pcichild(hpdev);
  1792. return_null_message:
  1793. msg->address_hi = 0;
  1794. msg->address_lo = 0;
  1795. msg->data = 0;
  1796. }
  1797. /* HW Interrupt Chip Descriptor */
  1798. static struct irq_chip hv_msi_irq_chip = {
  1799. .name = "Hyper-V PCIe MSI",
  1800. .irq_compose_msi_msg = hv_compose_msi_msg,
  1801. .irq_set_affinity = irq_chip_set_affinity_parent,
  1802. #ifdef CONFIG_X86
  1803. .irq_ack = irq_chip_ack_parent,
  1804. #elif defined(CONFIG_ARM64)
  1805. .irq_eoi = irq_chip_eoi_parent,
  1806. #endif
  1807. .irq_mask = hv_irq_mask,
  1808. .irq_unmask = hv_irq_unmask,
  1809. };
  1810. static struct msi_domain_ops hv_msi_ops = {
  1811. .msi_prepare = hv_msi_prepare,
  1812. .msi_free = hv_msi_free,
  1813. };
  1814. /**
  1815. * hv_pcie_init_irq_domain() - Initialize IRQ domain
  1816. * @hbus: The root PCI bus
  1817. *
  1818. * This function creates an IRQ domain which will be used for
  1819. * interrupts from devices that have been passed through. These
  1820. * devices only support MSI and MSI-X, not line-based interrupts
  1821. * or simulations of line-based interrupts through PCIe's
  1822. * fabric-layer messages. Because interrupts are remapped, we
  1823. * can support multi-message MSI here.
  1824. *
  1825. * Return: '0' on success and error value on failure
  1826. */
  1827. static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
  1828. {
  1829. hbus->msi_info.chip = &hv_msi_irq_chip;
  1830. hbus->msi_info.ops = &hv_msi_ops;
  1831. hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
  1832. MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
  1833. MSI_FLAG_PCI_MSIX);
  1834. hbus->msi_info.handler = FLOW_HANDLER;
  1835. hbus->msi_info.handler_name = FLOW_NAME;
  1836. hbus->msi_info.data = hbus;
  1837. hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode,
  1838. &hbus->msi_info,
  1839. hv_pci_get_root_domain());
  1840. if (!hbus->irq_domain) {
  1841. dev_err(&hbus->hdev->device,
  1842. "Failed to build an MSI IRQ domain\n");
  1843. return -ENODEV;
  1844. }
  1845. dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain);
  1846. return 0;
  1847. }
  1848. /**
  1849. * get_bar_size() - Get the address space consumed by a BAR
  1850. * @bar_val: Value that a BAR returned after -1 was written
  1851. * to it.
  1852. *
  1853. * This function returns the size of the BAR, rounded up to 1
  1854. * page. It has to be rounded up because the hypervisor's page
  1855. * table entry that maps the BAR into the VM can't specify an
  1856. * offset within a page. The invariant is that the hypervisor
1857. * must place any BAR smaller than a page at the
  1858. * beginning of a page.
  1859. *
  1860. * Return: Size in bytes of the consumed MMIO space.
  1861. */
  1862. static u64 get_bar_size(u64 bar_val)
  1863. {
  1864. return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
  1865. PAGE_SIZE);
  1866. }
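/*
 * Worked example, for illustration only, assuming 4 KiB pages: a 32-bit
 * memory BAR that probes as 0xffffc000 (the callers below extend it with
 * all-ones upper bits) decodes to 1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)
 * = 0x4000, i.e. 16 KiB, already page-aligned. One that probes as 0xffffff00
 * decodes to 256 bytes, which get_bar_size() rounds up to a full 4 KiB page.
 */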
  1867. /**
  1868. * survey_child_resources() - Total all MMIO requirements
  1869. * @hbus: Root PCI bus, as understood by this driver
  1870. */
  1871. static void survey_child_resources(struct hv_pcibus_device *hbus)
  1872. {
  1873. struct hv_pci_dev *hpdev;
  1874. resource_size_t bar_size = 0;
  1875. unsigned long flags;
  1876. struct completion *event;
  1877. u64 bar_val;
  1878. int i;
  1879. /* If nobody is waiting on the answer, don't compute it. */
  1880. event = xchg(&hbus->survey_event, NULL);
  1881. if (!event)
  1882. return;
  1883. /* If the answer has already been computed, go with it. */
  1884. if (hbus->low_mmio_space || hbus->high_mmio_space) {
  1885. complete(event);
  1886. return;
  1887. }
  1888. spin_lock_irqsave(&hbus->device_list_lock, flags);
  1889. /*
  1890. * Due to an interesting quirk of the PCI spec, all memory regions
  1891. * for a child device are a power of 2 in size and aligned in memory,
  1892. * so it's sufficient to just add them up without tracking alignment.
  1893. */
  1894. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  1895. for (i = 0; i < PCI_STD_NUM_BARS; i++) {
  1896. if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
  1897. dev_err(&hbus->hdev->device,
  1898. "There's an I/O BAR in this list!\n");
  1899. if (hpdev->probed_bar[i] != 0) {
  1900. /*
  1901. * A probed BAR has all the upper bits set that
  1902. * can be changed.
  1903. */
  1904. bar_val = hpdev->probed_bar[i];
  1905. if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
  1906. bar_val |=
  1907. ((u64)hpdev->probed_bar[++i] << 32);
  1908. else
  1909. bar_val |= 0xffffffff00000000ULL;
  1910. bar_size = get_bar_size(bar_val);
  1911. if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
  1912. hbus->high_mmio_space += bar_size;
  1913. else
  1914. hbus->low_mmio_space += bar_size;
  1915. }
  1916. }
  1917. }
  1918. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  1919. complete(event);
  1920. }
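/*
 * Worked example, for illustration only, of the summing shortcut above: three
 * 32-bit BARs of 64 KiB, 16 KiB and 4 KiB add up to 84 KiB of low MMIO.
 * Because every BAR size is a power of 2 and naturally aligned, handing out
 * addresses largest-first (as prepopulate_bars() does) always fits within
 * that total, so no per-BAR alignment bookkeeping is needed while summing.
 */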
  1921. /**
  1922. * prepopulate_bars() - Fill in BARs with defaults
  1923. * @hbus: Root PCI bus, as understood by this driver
  1924. *
  1925. * The core PCI driver code seems much, much happier if the BARs
  1926. * for a device have values upon first scan. So fill them in.
  1927. * The algorithm below works down from large sizes to small,
  1928. * attempting to pack the assignments optimally. The assumption,
  1929. * enforced in other parts of the code, is that the beginning of
  1930. * the memory-mapped I/O space will be aligned on the largest
  1931. * BAR size.
  1932. */
  1933. static void prepopulate_bars(struct hv_pcibus_device *hbus)
  1934. {
  1935. resource_size_t high_size = 0;
  1936. resource_size_t low_size = 0;
  1937. resource_size_t high_base = 0;
  1938. resource_size_t low_base = 0;
  1939. resource_size_t bar_size;
  1940. struct hv_pci_dev *hpdev;
  1941. unsigned long flags;
  1942. u64 bar_val;
  1943. u32 command;
  1944. bool high;
  1945. int i;
  1946. if (hbus->low_mmio_space) {
  1947. low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
  1948. low_base = hbus->low_mmio_res->start;
  1949. }
  1950. if (hbus->high_mmio_space) {
  1951. high_size = 1ULL <<
  1952. (63 - __builtin_clzll(hbus->high_mmio_space));
  1953. high_base = hbus->high_mmio_res->start;
  1954. }
  1955. spin_lock_irqsave(&hbus->device_list_lock, flags);
  1956. /*
  1957. * Clear the memory enable bit, in case it's already set. This occurs
  1958. * in the suspend path of hibernation, where the device is suspended,
  1959. * resumed and suspended again: see hibernation_snapshot() and
  1960. * hibernation_platform_enter().
  1961. *
  1962. * If the memory enable bit is already set, Hyper-V silently ignores
  1963. * the below BAR updates, and the related PCI device driver can not
  1964. * work, because reading from the device register(s) always returns
  1965. * 0xFFFFFFFF (PCI_ERROR_RESPONSE).
  1966. */
  1967. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  1968. _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
  1969. command &= ~PCI_COMMAND_MEMORY;
  1970. _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
  1971. }
  1972. /* Pick addresses for the BARs. */
  1973. do {
  1974. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  1975. for (i = 0; i < PCI_STD_NUM_BARS; i++) {
  1976. bar_val = hpdev->probed_bar[i];
  1977. if (bar_val == 0)
  1978. continue;
  1979. high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
  1980. if (high) {
  1981. bar_val |=
  1982. ((u64)hpdev->probed_bar[i + 1]
  1983. << 32);
  1984. } else {
  1985. bar_val |= 0xffffffffULL << 32;
  1986. }
  1987. bar_size = get_bar_size(bar_val);
  1988. if (high) {
  1989. if (high_size != bar_size) {
  1990. i++;
  1991. continue;
  1992. }
  1993. _hv_pcifront_write_config(hpdev,
  1994. PCI_BASE_ADDRESS_0 + (4 * i),
  1995. 4,
  1996. (u32)(high_base & 0xffffff00));
  1997. i++;
  1998. _hv_pcifront_write_config(hpdev,
  1999. PCI_BASE_ADDRESS_0 + (4 * i),
  2000. 4, (u32)(high_base >> 32));
  2001. high_base += bar_size;
  2002. } else {
  2003. if (low_size != bar_size)
  2004. continue;
  2005. _hv_pcifront_write_config(hpdev,
  2006. PCI_BASE_ADDRESS_0 + (4 * i),
  2007. 4,
  2008. (u32)(low_base & 0xffffff00));
  2009. low_base += bar_size;
  2010. }
  2011. }
  2012. if (high_size <= 1 && low_size <= 1) {
  2013. /*
  2014. * No need to set the PCI_COMMAND_MEMORY bit as
  2015. * the core PCI driver doesn't require the bit
  2016. * to be pre-set. Actually here we intentionally
  2017. * keep the bit off so that the PCI BAR probing
  2018. * in the core PCI driver doesn't cause Hyper-V
  2019. * to unnecessarily unmap/map the virtual BARs
  2020. * from/to the physical BARs multiple times.
  2021. * This reduces the VM boot time significantly
  2022. * if the BAR sizes are huge.
  2023. */
  2024. break;
  2025. }
  2026. }
  2027. high_size >>= 1;
  2028. low_size >>= 1;
  2029. } while (high_size || low_size);
  2030. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2031. }
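/*
 * Worked example, for illustration only, with hypothetical addresses: given
 * one 64 KiB, one 16 KiB and one 4 KiB low BAR and a low MMIO window starting
 * at 0xfb500000, low_size starts at 64 KiB. The first pass places the 64 KiB
 * BAR at 0xfb500000; as low_size halves, later passes place the 16 KiB BAR at
 * 0xfb510000 and the 4 KiB BAR at 0xfb514000. Each BAR lands naturally
 * aligned because assignment proceeds from the largest size down.
 */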
  2032. /*
  2033. * Assign entries in sysfs pci slot directory.
  2034. *
  2035. * Note that this function does not need to lock the children list
  2036. * because it is called from pci_devices_present_work which
  2037. * is serialized with hv_eject_device_work because they are on the
  2038. * same ordered workqueue. Therefore hbus->children list will not change
  2039. * even when pci_create_slot sleeps.
  2040. */
  2041. static void hv_pci_assign_slots(struct hv_pcibus_device *hbus)
  2042. {
  2043. struct hv_pci_dev *hpdev;
  2044. char name[SLOT_NAME_SIZE];
  2045. int slot_nr;
  2046. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  2047. if (hpdev->pci_slot)
  2048. continue;
  2049. slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot));
  2050. snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser);
  2051. hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr,
  2052. name, NULL);
  2053. if (IS_ERR(hpdev->pci_slot)) {
2054. pr_warn("pci_create_slot %s failed\n", name);
  2055. hpdev->pci_slot = NULL;
  2056. }
  2057. }
  2058. }
  2059. /*
  2060. * Remove entries in sysfs pci slot directory.
  2061. */
  2062. static void hv_pci_remove_slots(struct hv_pcibus_device *hbus)
  2063. {
  2064. struct hv_pci_dev *hpdev;
  2065. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  2066. if (!hpdev->pci_slot)
  2067. continue;
  2068. pci_destroy_slot(hpdev->pci_slot);
  2069. hpdev->pci_slot = NULL;
  2070. }
  2071. }
  2072. /*
  2073. * Set NUMA node for the devices on the bus
  2074. */
  2075. static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
  2076. {
  2077. struct pci_dev *dev;
  2078. struct pci_bus *bus = hbus->bridge->bus;
  2079. struct hv_pci_dev *hv_dev;
  2080. list_for_each_entry(dev, &bus->devices, bus_list) {
  2081. hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn));
  2082. if (!hv_dev)
  2083. continue;
  2084. if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
  2085. hv_dev->desc.virtual_numa_node < num_possible_nodes())
  2086. /*
  2087. * The kernel may boot with some NUMA nodes offline
  2088. * (e.g. in a KDUMP kernel) or with NUMA disabled via
  2089. * "numa=off". In those cases, adjust the host provided
  2090. * NUMA node to a valid NUMA node used by the kernel.
  2091. */
  2092. set_dev_node(&dev->dev,
  2093. numa_map_to_online_node(
  2094. hv_dev->desc.virtual_numa_node));
  2095. put_pcichild(hv_dev);
  2096. }
  2097. }
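/*
 * Illustrative example, hypothetical values: if the host reports
 * virtual_numa_node 2 but the guest booted with "numa=off" so only node 0 is
 * online, numa_map_to_online_node(2) falls back to an online node (node 0
 * here), so set_dev_node() never points the device at a node the kernel did
 * not bring up.
 */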
  2098. /**
  2099. * create_root_hv_pci_bus() - Expose a new root PCI bus
  2100. * @hbus: Root PCI bus, as understood by this driver
  2101. *
  2102. * Return: 0 on success, -errno on failure
  2103. */
  2104. static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
  2105. {
  2106. int error;
  2107. struct pci_host_bridge *bridge = hbus->bridge;
  2108. bridge->dev.parent = &hbus->hdev->device;
  2109. bridge->sysdata = &hbus->sysdata;
  2110. bridge->ops = &hv_pcifront_ops;
  2111. error = pci_scan_root_bus_bridge(bridge);
  2112. if (error)
  2113. return error;
  2114. pci_lock_rescan_remove();
  2115. hv_pci_assign_numa_node(hbus);
  2116. pci_bus_assign_resources(bridge->bus);
  2117. hv_pci_assign_slots(hbus);
  2118. pci_bus_add_devices(bridge->bus);
  2119. pci_unlock_rescan_remove();
  2120. hbus->state = hv_pcibus_installed;
  2121. return 0;
  2122. }
  2123. struct q_res_req_compl {
  2124. struct completion host_event;
  2125. struct hv_pci_dev *hpdev;
  2126. };
  2127. /**
  2128. * q_resource_requirements() - Query Resource Requirements
  2129. * @context: The completion context.
  2130. * @resp: The response that came from the host.
  2131. * @resp_packet_size: The size in bytes of resp.
  2132. *
  2133. * This function is invoked on completion of a Query Resource
  2134. * Requirements packet.
  2135. */
  2136. static void q_resource_requirements(void *context, struct pci_response *resp,
  2137. int resp_packet_size)
  2138. {
  2139. struct q_res_req_compl *completion = context;
  2140. struct pci_q_res_req_response *q_res_req =
  2141. (struct pci_q_res_req_response *)resp;
  2142. s32 status;
  2143. int i;
  2144. status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status;
  2145. if (status < 0) {
  2146. dev_err(&completion->hpdev->hbus->hdev->device,
  2147. "query resource requirements failed: %x\n",
  2148. status);
  2149. } else {
  2150. for (i = 0; i < PCI_STD_NUM_BARS; i++) {
  2151. completion->hpdev->probed_bar[i] =
  2152. q_res_req->probed_bar[i];
  2153. }
  2154. }
  2155. complete(&completion->host_event);
  2156. }
  2157. /**
  2158. * new_pcichild_device() - Create a new child device
  2159. * @hbus: The internal struct tracking this root PCI bus.
  2160. * @desc: The information supplied so far from the host
  2161. * about the device.
  2162. *
  2163. * This function creates the tracking structure for a new child
  2164. * device and kicks off the process of figuring out what it is.
  2165. *
  2166. * Return: Pointer to the new tracking struct
  2167. */
  2168. static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
  2169. struct hv_pcidev_description *desc)
  2170. {
  2171. struct hv_pci_dev *hpdev;
  2172. struct pci_child_message *res_req;
  2173. struct q_res_req_compl comp_pkt;
  2174. struct {
  2175. struct pci_packet init_packet;
  2176. u8 buffer[sizeof(struct pci_child_message)];
  2177. } pkt;
  2178. unsigned long flags;
  2179. int ret;
  2180. hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL);
  2181. if (!hpdev)
  2182. return NULL;
  2183. hpdev->hbus = hbus;
  2184. memset(&pkt, 0, sizeof(pkt));
  2185. init_completion(&comp_pkt.host_event);
  2186. comp_pkt.hpdev = hpdev;
  2187. pkt.init_packet.compl_ctxt = &comp_pkt;
  2188. pkt.init_packet.completion_func = q_resource_requirements;
  2189. res_req = (struct pci_child_message *)&pkt.init_packet.message;
  2190. res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
  2191. res_req->wslot.slot = desc->win_slot.slot;
  2192. ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
  2193. sizeof(struct pci_child_message),
  2194. (unsigned long)&pkt.init_packet,
  2195. VM_PKT_DATA_INBAND,
  2196. VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
  2197. if (ret)
  2198. goto error;
  2199. if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
  2200. goto error;
  2201. hpdev->desc = *desc;
  2202. refcount_set(&hpdev->refs, 1);
  2203. get_pcichild(hpdev);
  2204. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2205. list_add_tail(&hpdev->list_entry, &hbus->children);
  2206. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2207. return hpdev;
  2208. error:
  2209. kfree(hpdev);
  2210. return NULL;
  2211. }
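/*
 * Hedged sketch, not part of the original code, of the request/response
 * pattern used above and throughout this file: stack a pci_packet plus a
 * message buffer, point completion_func/compl_ctxt at a completion wrapper,
 * send with VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED and block in
 * wait_for_response(); hv_pci_onchannelcallback() later matches the
 * transaction ID back to the packet and calls the completion function.
 * my_compl() and struct MY_MSG are hypothetical stand-ins:
 *
 *	struct { struct pci_packet pkt; u8 buf[sizeof(struct MY_MSG)]; } pkt;
 *	struct hv_pci_compl comp;
 *
 *	memset(&pkt, 0, sizeof(pkt));
 *	init_completion(&comp.host_event);
 *	pkt.pkt.completion_func = my_compl;
 *	pkt.pkt.compl_ctxt = &comp;
 *	... fill in the request at pkt.pkt.message ...
 *	vmbus_sendpacket(hbus->hdev->channel, pkt.pkt.message,
 *			 sizeof(struct MY_MSG), (unsigned long)&pkt.pkt,
 *			 VM_PKT_DATA_INBAND,
 *			 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 *	wait_for_response(hbus->hdev, &comp.host_event);
 */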
  2212. /**
  2213. * get_pcichild_wslot() - Find device from slot
  2214. * @hbus: Root PCI bus, as understood by this driver
  2215. * @wslot: Location on the bus
  2216. *
  2217. * This function looks up a PCI device and returns the internal
  2218. * representation of it. It acquires a reference on it, so that
  2219. * the device won't be deleted while somebody is using it. The
  2220. * caller is responsible for calling put_pcichild() to release
  2221. * this reference.
  2222. *
  2223. * Return: Internal representation of a PCI device
  2224. */
  2225. static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
  2226. u32 wslot)
  2227. {
  2228. unsigned long flags;
  2229. struct hv_pci_dev *iter, *hpdev = NULL;
  2230. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2231. list_for_each_entry(iter, &hbus->children, list_entry) {
  2232. if (iter->desc.win_slot.slot == wslot) {
  2233. hpdev = iter;
  2234. get_pcichild(hpdev);
  2235. break;
  2236. }
  2237. }
  2238. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2239. return hpdev;
  2240. }
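/*
 * Usage sketch, for illustration only (this is the same pattern the config
 * accessors above follow): every successful get_pcichild_wslot() is paired
 * with a put_pcichild() once the caller is done with the device.
 *
 *	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
 *	if (!hpdev)
 *		return PCIBIOS_DEVICE_NOT_FOUND;
 *	... use hpdev ...
 *	put_pcichild(hpdev);
 */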
  2241. /**
  2242. * pci_devices_present_work() - Handle new list of child devices
  2243. * @work: Work struct embedded in struct hv_dr_work
  2244. *
  2245. * "Bus Relations" is the Windows term for "children of this
  2246. * bus." The terminology is preserved here for people trying to
  2247. * debug the interaction between Hyper-V and Linux. This
  2248. * function is called when the parent partition reports a list
  2249. * of functions that should be observed under this PCI Express
  2250. * port (bus).
  2251. *
  2252. * This function updates the list, and must tolerate being
  2253. * called multiple times with the same information. The typical
  2254. * number of child devices is one, with very atypical cases
  2255. * involving three or four, so the algorithms used here can be
  2256. * simple and inefficient.
  2257. *
  2258. * It must also treat the omission of a previously observed device as
  2259. * notification that the device no longer exists.
  2260. *
  2261. * Note that this function is serialized with hv_eject_device_work(),
  2262. * because both are pushed to the ordered workqueue hbus->wq.
  2263. */
  2264. static void pci_devices_present_work(struct work_struct *work)
  2265. {
  2266. u32 child_no;
  2267. bool found;
  2268. struct hv_pcidev_description *new_desc;
  2269. struct hv_pci_dev *hpdev;
  2270. struct hv_pcibus_device *hbus;
  2271. struct list_head removed;
  2272. struct hv_dr_work *dr_wrk;
  2273. struct hv_dr_state *dr = NULL;
  2274. unsigned long flags;
  2275. dr_wrk = container_of(work, struct hv_dr_work, wrk);
  2276. hbus = dr_wrk->bus;
  2277. kfree(dr_wrk);
  2278. INIT_LIST_HEAD(&removed);
  2279. /* Pull this off the queue and process it if it was the last one. */
  2280. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2281. while (!list_empty(&hbus->dr_list)) {
  2282. dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
  2283. list_entry);
  2284. list_del(&dr->list_entry);
  2285. /* Throw this away if the list still has stuff in it. */
  2286. if (!list_empty(&hbus->dr_list)) {
  2287. kfree(dr);
  2288. continue;
  2289. }
  2290. }
  2291. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2292. if (!dr)
  2293. return;
  2294. mutex_lock(&hbus->state_lock);
  2295. /* First, mark all existing children as reported missing. */
  2296. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2297. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  2298. hpdev->reported_missing = true;
  2299. }
  2300. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2301. /* Next, add back any reported devices. */
  2302. for (child_no = 0; child_no < dr->device_count; child_no++) {
  2303. found = false;
  2304. new_desc = &dr->func[child_no];
  2305. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2306. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  2307. if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) &&
  2308. (hpdev->desc.v_id == new_desc->v_id) &&
  2309. (hpdev->desc.d_id == new_desc->d_id) &&
  2310. (hpdev->desc.ser == new_desc->ser)) {
  2311. hpdev->reported_missing = false;
  2312. found = true;
  2313. }
  2314. }
  2315. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2316. if (!found) {
  2317. hpdev = new_pcichild_device(hbus, new_desc);
  2318. if (!hpdev)
  2319. dev_err(&hbus->hdev->device,
  2320. "couldn't record a child device.\n");
  2321. }
  2322. }
  2323. /* Move missing children to a list on the stack. */
  2324. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2325. do {
  2326. found = false;
  2327. list_for_each_entry(hpdev, &hbus->children, list_entry) {
  2328. if (hpdev->reported_missing) {
  2329. found = true;
  2330. put_pcichild(hpdev);
  2331. list_move_tail(&hpdev->list_entry, &removed);
  2332. break;
  2333. }
  2334. }
  2335. } while (found);
  2336. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2337. /* Delete everything that should no longer exist. */
  2338. while (!list_empty(&removed)) {
  2339. hpdev = list_first_entry(&removed, struct hv_pci_dev,
  2340. list_entry);
  2341. list_del(&hpdev->list_entry);
  2342. if (hpdev->pci_slot)
  2343. pci_destroy_slot(hpdev->pci_slot);
  2344. put_pcichild(hpdev);
  2345. }
  2346. switch (hbus->state) {
  2347. case hv_pcibus_installed:
  2348. /*
  2349. * Tell the core to rescan bus
  2350. * because there may have been changes.
  2351. */
  2352. pci_lock_rescan_remove();
  2353. pci_scan_child_bus(hbus->bridge->bus);
  2354. hv_pci_assign_numa_node(hbus);
  2355. hv_pci_assign_slots(hbus);
  2356. pci_unlock_rescan_remove();
  2357. break;
  2358. case hv_pcibus_init:
  2359. case hv_pcibus_probed:
  2360. survey_child_resources(hbus);
  2361. break;
  2362. default:
  2363. break;
  2364. }
  2365. mutex_unlock(&hbus->state_lock);
  2366. kfree(dr);
  2367. }
  2368. /**
  2369. * hv_pci_start_relations_work() - Queue work to start device discovery
  2370. * @hbus: Root PCI bus, as understood by this driver
  2371. * @dr: The list of children returned from host
  2372. *
  2373. * Return: 0 on success, -errno on failure
  2374. */
  2375. static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
  2376. struct hv_dr_state *dr)
  2377. {
  2378. struct hv_dr_work *dr_wrk;
  2379. unsigned long flags;
  2380. bool pending_dr;
  2381. if (hbus->state == hv_pcibus_removing) {
  2382. dev_info(&hbus->hdev->device,
  2383. "PCI VMBus BUS_RELATIONS: ignored\n");
  2384. return -ENOENT;
  2385. }
  2386. dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
  2387. if (!dr_wrk)
  2388. return -ENOMEM;
  2389. INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
  2390. dr_wrk->bus = hbus;
  2391. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2392. /*
  2393. * If pending_dr is true, we have already queued a work,
  2394. * which will see the new dr. Otherwise, we need to
  2395. * queue a new work.
  2396. */
  2397. pending_dr = !list_empty(&hbus->dr_list);
  2398. list_add_tail(&dr->list_entry, &hbus->dr_list);
  2399. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2400. if (pending_dr)
  2401. kfree(dr_wrk);
  2402. else
  2403. queue_work(hbus->wq, &dr_wrk->wrk);
  2404. return 0;
  2405. }
  2406. /**
  2407. * hv_pci_devices_present() - Handle list of new children
  2408. * @hbus: Root PCI bus, as understood by this driver
  2409. * @relations: Packet from host listing children
  2410. *
  2411. * Process a new list of devices on the bus. The list of devices is
  2412. * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS,
  2413. * whenever a new list of devices for this bus appears.
  2414. */
  2415. static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
  2416. struct pci_bus_relations *relations)
  2417. {
  2418. struct hv_dr_state *dr;
  2419. int i;
  2420. dr = kzalloc(struct_size(dr, func, relations->device_count),
  2421. GFP_NOWAIT);
  2422. if (!dr)
  2423. return;
  2424. dr->device_count = relations->device_count;
  2425. for (i = 0; i < dr->device_count; i++) {
  2426. dr->func[i].v_id = relations->func[i].v_id;
  2427. dr->func[i].d_id = relations->func[i].d_id;
  2428. dr->func[i].rev = relations->func[i].rev;
  2429. dr->func[i].prog_intf = relations->func[i].prog_intf;
  2430. dr->func[i].subclass = relations->func[i].subclass;
  2431. dr->func[i].base_class = relations->func[i].base_class;
  2432. dr->func[i].subsystem_id = relations->func[i].subsystem_id;
  2433. dr->func[i].win_slot = relations->func[i].win_slot;
  2434. dr->func[i].ser = relations->func[i].ser;
  2435. }
  2436. if (hv_pci_start_relations_work(hbus, dr))
  2437. kfree(dr);
  2438. }
  2439. /**
  2440. * hv_pci_devices_present2() - Handle list of new children
  2441. * @hbus: Root PCI bus, as understood by this driver
  2442. * @relations: Packet from host listing children
  2443. *
  2444. * This function is the v2 version of hv_pci_devices_present()
  2445. */
  2446. static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
  2447. struct pci_bus_relations2 *relations)
  2448. {
  2449. struct hv_dr_state *dr;
  2450. int i;
  2451. dr = kzalloc(struct_size(dr, func, relations->device_count),
  2452. GFP_NOWAIT);
  2453. if (!dr)
  2454. return;
  2455. dr->device_count = relations->device_count;
  2456. for (i = 0; i < dr->device_count; i++) {
  2457. dr->func[i].v_id = relations->func[i].v_id;
  2458. dr->func[i].d_id = relations->func[i].d_id;
  2459. dr->func[i].rev = relations->func[i].rev;
  2460. dr->func[i].prog_intf = relations->func[i].prog_intf;
  2461. dr->func[i].subclass = relations->func[i].subclass;
  2462. dr->func[i].base_class = relations->func[i].base_class;
  2463. dr->func[i].subsystem_id = relations->func[i].subsystem_id;
  2464. dr->func[i].win_slot = relations->func[i].win_slot;
  2465. dr->func[i].ser = relations->func[i].ser;
  2466. dr->func[i].flags = relations->func[i].flags;
  2467. dr->func[i].virtual_numa_node =
  2468. relations->func[i].virtual_numa_node;
  2469. }
  2470. if (hv_pci_start_relations_work(hbus, dr))
  2471. kfree(dr);
  2472. }
  2473. /**
  2474. * hv_eject_device_work() - Asynchronously handles ejection
  2475. * @work: Work struct embedded in internal device struct
  2476. *
  2477. * This function handles ejecting a device. Windows will
  2478. * attempt to gracefully eject a device, waiting 60 seconds to
  2479. * hear back from the guest OS that this completed successfully.
  2480. * If this timer expires, the device will be forcibly removed.
  2481. */
  2482. static void hv_eject_device_work(struct work_struct *work)
  2483. {
  2484. struct pci_eject_response *ejct_pkt;
  2485. struct hv_pcibus_device *hbus;
  2486. struct hv_pci_dev *hpdev;
  2487. struct pci_dev *pdev;
  2488. unsigned long flags;
  2489. int wslot;
  2490. struct {
  2491. struct pci_packet pkt;
  2492. u8 buffer[sizeof(struct pci_eject_response)];
  2493. } ctxt;
  2494. hpdev = container_of(work, struct hv_pci_dev, wrk);
  2495. hbus = hpdev->hbus;
  2496. mutex_lock(&hbus->state_lock);
  2497. /*
  2498. * Ejection can come before or after the PCI bus has been set up, so
  2499. * attempt to find it and tear down the bus state, if it exists. This
  2500. * must be done without constructs like pci_domain_nr(hbus->bridge->bus)
  2501. * because hbus->bridge->bus may not exist yet.
  2502. */
  2503. wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
  2504. pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot);
  2505. if (pdev) {
  2506. pci_lock_rescan_remove();
  2507. pci_stop_and_remove_bus_device(pdev);
  2508. pci_dev_put(pdev);
  2509. pci_unlock_rescan_remove();
  2510. }
  2511. spin_lock_irqsave(&hbus->device_list_lock, flags);
  2512. list_del(&hpdev->list_entry);
  2513. spin_unlock_irqrestore(&hbus->device_list_lock, flags);
  2514. if (hpdev->pci_slot)
  2515. pci_destroy_slot(hpdev->pci_slot);
  2516. memset(&ctxt, 0, sizeof(ctxt));
  2517. ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
  2518. ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
  2519. ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
  2520. vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
  2521. sizeof(*ejct_pkt), 0,
  2522. VM_PKT_DATA_INBAND, 0);
  2523. /* For the get_pcichild() in hv_pci_eject_device() */
  2524. put_pcichild(hpdev);
2525. /* For the two refs taken in new_pcichild_device() */
  2526. put_pcichild(hpdev);
  2527. put_pcichild(hpdev);
  2528. /* hpdev has been freed. Do not use it any more. */
  2529. mutex_unlock(&hbus->state_lock);
  2530. }
  2531. /**
  2532. * hv_pci_eject_device() - Handles device ejection
  2533. * @hpdev: Internal device tracking struct
  2534. *
  2535. * This function is invoked when an ejection packet arrives. It
  2536. * just schedules work so that we don't re-enter the packet
  2537. * delivery code handling the ejection.
  2538. */
  2539. static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
  2540. {
  2541. struct hv_pcibus_device *hbus = hpdev->hbus;
  2542. struct hv_device *hdev = hbus->hdev;
  2543. if (hbus->state == hv_pcibus_removing) {
  2544. dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
  2545. return;
  2546. }
  2547. get_pcichild(hpdev);
  2548. INIT_WORK(&hpdev->wrk, hv_eject_device_work);
  2549. queue_work(hbus->wq, &hpdev->wrk);
  2550. }
/**
 * hv_pci_onchannelcallback() - Handles incoming packets
 * @context:	Internal bus tracking struct
 *
 * This function is invoked whenever the host sends a packet to
 * this channel (which is private to this root PCI bus).
 */
static void hv_pci_onchannelcallback(void *context)
{
	const int packet_size = 0x100;
	int ret;
	struct hv_pcibus_device *hbus = context;
	struct vmbus_channel *chan = hbus->hdev->channel;
	u32 bytes_recvd;
	u64 req_id, req_addr;
	struct vmpacket_descriptor *desc;
	unsigned char *buffer;
	int bufferlen = packet_size;
	struct pci_packet *comp_packet;
	struct pci_response *response;
	struct pci_incoming_message *new_message;
	struct pci_bus_relations *bus_rel;
	struct pci_bus_relations2 *bus_rel2;
	struct pci_dev_inval_block *inval;
	struct pci_dev_incoming *dev_message;
	struct hv_pci_dev *hpdev;
	unsigned long flags;

	buffer = kmalloc(bufferlen, GFP_ATOMIC);
	if (!buffer)
		return;

	while (1) {
		ret = vmbus_recvpacket_raw(chan, buffer, bufferlen,
					   &bytes_recvd, &req_id);

		if (ret == -ENOBUFS) {
			kfree(buffer);
			/* Handle large packet */
			bufferlen = bytes_recvd;
			buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
			if (!buffer)
				return;
			continue;
		}

		/* Zero length indicates there are no more packets. */
		if (ret || !bytes_recvd)
			break;

		/*
		 * All incoming packets must be at least as large as a
		 * response.
		 */
		if (bytes_recvd <= sizeof(struct pci_response))
			continue;
		desc = (struct vmpacket_descriptor *)buffer;

		switch (desc->type) {
		case VM_PKT_COMP:

			lock_requestor(chan, flags);
			req_addr = __vmbus_request_addr_match(chan, req_id,
							      VMBUS_RQST_ADDR_ANY);
			if (req_addr == VMBUS_RQST_ERROR) {
				unlock_requestor(chan, flags);
				dev_err(&hbus->hdev->device,
					"Invalid transaction ID %llx\n",
					req_id);
				break;
			}
			comp_packet = (struct pci_packet *)req_addr;
			response = (struct pci_response *)buffer;
			/*
			 * Call ->completion_func() within the critical section to make
			 * sure that the packet pointer is still valid during the call:
			 * here 'valid' means that there's a task still waiting for the
			 * completion, and that the packet data is still on the waiting
			 * task's stack. Cf. hv_compose_msi_msg().
			 */
			comp_packet->completion_func(comp_packet->compl_ctxt,
						     response,
						     bytes_recvd);
			unlock_requestor(chan, flags);
			break;

		case VM_PKT_DATA_INBAND:

			new_message = (struct pci_incoming_message *)buffer;
			switch (new_message->message_type.type) {
			case PCI_BUS_RELATIONS:

				bus_rel = (struct pci_bus_relations *)buffer;
				if (bytes_recvd < sizeof(*bus_rel) ||
				    bytes_recvd <
					struct_size(bus_rel, func,
						    bus_rel->device_count)) {
					dev_err(&hbus->hdev->device,
						"bus relations too small\n");
					break;
				}

				hv_pci_devices_present(hbus, bus_rel);
				break;

			case PCI_BUS_RELATIONS2:

				bus_rel2 = (struct pci_bus_relations2 *)buffer;
				if (bytes_recvd < sizeof(*bus_rel2) ||
				    bytes_recvd <
					struct_size(bus_rel2, func,
						    bus_rel2->device_count)) {
					dev_err(&hbus->hdev->device,
						"bus relations v2 too small\n");
					break;
				}

				hv_pci_devices_present2(hbus, bus_rel2);
				break;

			case PCI_EJECT:

				dev_message = (struct pci_dev_incoming *)buffer;
				if (bytes_recvd < sizeof(*dev_message)) {
					dev_err(&hbus->hdev->device,
						"eject message too small\n");
					break;
				}
				hpdev = get_pcichild_wslot(hbus,
							   dev_message->wslot.slot);
				if (hpdev) {
					hv_pci_eject_device(hpdev);
					put_pcichild(hpdev);
				}
				break;

			case PCI_INVALIDATE_BLOCK:

				inval = (struct pci_dev_inval_block *)buffer;
				if (bytes_recvd < sizeof(*inval)) {
					dev_err(&hbus->hdev->device,
						"invalidate message too small\n");
					break;
				}
				hpdev = get_pcichild_wslot(hbus,
							   inval->wslot.slot);
				if (hpdev) {
					if (hpdev->block_invalidate) {
						hpdev->block_invalidate(
						    hpdev->invalidate_context,
						    inval->block_mask);
					}
					put_pcichild(hpdev);
				}
				break;

			default:
				dev_warn(&hbus->hdev->device,
					 "Unimplemented protocol message %x\n",
					 new_message->message_type.type);
				break;
			}
			break;

		default:
			dev_err(&hbus->hdev->device,
				"unhandled packet type %d, tid %llx len %d\n",
				desc->type, req_id, bytes_recvd);
			break;
		}
	}

	kfree(buffer);
}

/**
 * hv_pci_protocol_negotiation() - Set up protocol
 * @hdev:		VMBus's tracking struct for this root PCI bus.
 * @version:		Array of supported channel protocol versions in
 *			the order of probing - highest goes first.
 * @num_version:	Number of elements in the version array.
 *
 * This driver is intended to support running on Windows 10
 * (server) and later versions. It will not run on earlier
 * versions, as they assume that many of the operations which
 * Linux needs accomplished with a spinlock held were done via
 * asynchronous messaging via VMBus. Windows 10 increases the
 * surface area of PCI emulation so that these actions can take
 * place by suspending a virtual processor for their duration.
 *
 * This function negotiates the channel protocol version,
 * failing if the host doesn't support the necessary protocol
 * level.
 */
static int hv_pci_protocol_negotiation(struct hv_device *hdev,
					enum pci_protocol_version_t version[],
					int num_version)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_version_request *version_req;
	struct hv_pci_compl comp_pkt;
	struct pci_packet *pkt;
	int ret;
	int i;

	/*
	 * Initiate the handshake with the host and negotiate
	 * a version that the host can support. We start with the
	 * highest version number and go down if the host cannot
	 * support it.
	 */
	pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	init_completion(&comp_pkt.host_event);
	pkt->completion_func = hv_pci_generic_compl;
	pkt->compl_ctxt = &comp_pkt;
	version_req = (struct pci_version_request *)&pkt->message;
	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;

	for (i = 0; i < num_version; i++) {
		version_req->protocol_version = version[i];
		ret = vmbus_sendpacket(hdev->channel, version_req,
				sizeof(struct pci_version_request),
				(unsigned long)pkt, VM_PKT_DATA_INBAND,
				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
		if (!ret)
			ret = wait_for_response(hdev, &comp_pkt.host_event);

		if (ret) {
			dev_err(&hdev->device,
				"PCI Pass-through VSP failed to request version: %d",
				ret);
			goto exit;
		}

		if (comp_pkt.completion_status >= 0) {
			hbus->protocol_version = version[i];
			dev_info(&hdev->device,
				"PCI VMBus probing: Using version %#x\n",
				hbus->protocol_version);
			goto exit;
		}

		if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
			dev_err(&hdev->device,
				"PCI Pass-through VSP failed version request: %#x",
				comp_pkt.completion_status);
			ret = -EPROTO;
			goto exit;
		}

		reinit_completion(&comp_pkt.host_event);
	}

	dev_err(&hdev->device,
		"PCI pass-through VSP failed to find supported version");
	ret = -EPROTO;

exit:
	kfree(pkt);
	return ret;
}

/**
 * hv_pci_free_bridge_windows() - Release memory regions for the
 * bus
 * @hbus:	Root PCI bus, as understood by this driver
 */
static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
{
	/*
	 * Set the resources back to the way they looked when they
	 * were allocated by setting IORESOURCE_BUSY again.
	 */

	if (hbus->low_mmio_space && hbus->low_mmio_res) {
		hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
		vmbus_free_mmio(hbus->low_mmio_res->start,
				resource_size(hbus->low_mmio_res));
	}

	if (hbus->high_mmio_space && hbus->high_mmio_res) {
		hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
		vmbus_free_mmio(hbus->high_mmio_res->start,
				resource_size(hbus->high_mmio_res));
	}
}

/**
 * hv_pci_allocate_bridge_windows() - Allocate memory regions
 * for the bus
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * This function calls vmbus_allocate_mmio(), which is itself a
 * bit of a compromise. Ideally, we might change the pnp layer
 * in the kernel such that it comprehends either PCI devices
 * which are "grandchildren of ACPI," with some intermediate bus
 * node (in this case, VMBus) or change it such that it
 * understands VMBus. The pnp layer, however, has been declared
 * deprecated, and not subject to change.
 *
 * The workaround, implemented here, is to ask VMBus to allocate
 * MMIO space for this bus. VMBus itself knows which ranges are
 * appropriate by looking at its own ACPI objects. Then, after
 * these ranges are claimed, they're modified to look like they
 * would have looked if the ACPI and pnp code had allocated
 * bridge windows. These descriptors have to exist in this form
 * in order to satisfy the code which will get invoked when the
 * endpoint PCI function driver calls request_mem_region() or
 * request_mem_region_exclusive().
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
{
	resource_size_t align;
	int ret;

	if (hbus->low_mmio_space) {
		align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
		ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
					  (u64)(u32)0xffffffff,
					  hbus->low_mmio_space,
					  align, false);
		if (ret) {
			dev_err(&hbus->hdev->device,
				"Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
				hbus->low_mmio_space);
			return ret;
		}

		/* Modify this resource to become a bridge window. */
		hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
		hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
		pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res);
	}

	if (hbus->high_mmio_space) {
		align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
		ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
					  0x100000000, -1,
					  hbus->high_mmio_space, align,
					  false);
		if (ret) {
			dev_err(&hbus->hdev->device,
				"Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
				hbus->high_mmio_space);
			goto release_low_mmio;
		}

		/* Modify this resource to become a bridge window. */
		hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
		hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
		pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res);
	}

	return 0;

release_low_mmio:
	if (hbus->low_mmio_res) {
		vmbus_free_mmio(hbus->low_mmio_res->start,
				resource_size(hbus->low_mmio_res));
	}

	return ret;
}

/**
 * hv_allocate_config_window() - Find MMIO space for PCI Config
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * This function claims memory-mapped I/O space for accessing
 * configuration space for the functions on this bus.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
{
	int ret;

	/*
	 * Set up a region of MMIO space to use for accessing configuration
	 * space.
	 */
	ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
				  PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
	if (ret)
		return ret;

	/*
	 * vmbus_allocate_mmio() gets used for allocating both device endpoint
	 * resource claims (those which cannot be overlapped) and the ranges
	 * which are valid for the children of this bus, which are intended
	 * to be overlapped by those children. Set the flag on this claim
	 * meaning that this region can't be overlapped.
	 */
	hbus->mem_config->flags |= IORESOURCE_BUSY;

	return 0;
}

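/* Release the config-space MMIO region claimed by hv_allocate_config_window(). */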
static void hv_free_config_window(struct hv_pcibus_device *hbus)
{
	vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
}

static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);

/**
 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_enter_d0(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_bus_d0_entry *d0_entry;
	struct hv_pci_compl comp_pkt;
	struct pci_packet *pkt;
	bool retry = true;
	int ret;

enter_d0_retry:
	/*
	 * Tell the host that the bus is ready to use, and moved into the
	 * powered-on state. This includes telling the host which region
	 * of memory-mapped I/O space has been chosen for configuration space
	 * access.
	 */
	pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	init_completion(&comp_pkt.host_event);
	pkt->completion_func = hv_pci_generic_compl;
	pkt->compl_ctxt = &comp_pkt;
	d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
	d0_entry->mmio_base = hbus->mem_config->start;

	ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (!ret)
		ret = wait_for_response(hdev, &comp_pkt.host_event);

	if (ret)
		goto exit;

	/*
	 * In certain cases (kdump), the PCI device of interest was not
	 * cleanly shut down and its resources are still held on the host
	 * side, so the host could return an invalid device status.
	 * We need to explicitly request that the host release the resources
	 * and then try to enter D0 again.
	 */
	if (comp_pkt.completion_status < 0 && retry) {
		retry = false;

		dev_err(&hdev->device, "Retrying D0 Entry\n");

		/*
		 * hv_pci_bus_exit() calls hv_send_resources_released()
		 * to free up the resources of its child devices.
		 * In the kdump kernel we need to set
		 * wslot_res_allocated to 255 so it scans all child
		 * devices to release resources allocated in the
		 * normal kernel before the panic happened.
		 */
		hbus->wslot_res_allocated = 255;

		ret = hv_pci_bus_exit(hdev, true);

		if (ret == 0) {
			kfree(pkt);
			goto enter_d0_retry;
		}
		dev_err(&hdev->device,
			"Retrying D0 failed with ret %d\n", ret);
	}

	if (comp_pkt.completion_status < 0) {
		dev_err(&hdev->device,
			"PCI Pass-through VSP failed D0 Entry with status %x\n",
			comp_pkt.completion_status);
		ret = -EPROTO;
		goto exit;
	}

	ret = 0;

exit:
	kfree(pkt);
	return ret;
}

/**
 * hv_pci_query_relations() - Ask host to send list of child
 * devices
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_query_relations(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_message message;
	struct completion comp;
	int ret;

	/* Ask the host to send along the list of child devices */
	init_completion(&comp);
	if (cmpxchg(&hbus->survey_event, NULL, &comp))
		return -ENOTEMPTY;

	memset(&message, 0, sizeof(message));
	message.type = PCI_QUERY_BUS_RELATIONS;

	ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
			       0, VM_PKT_DATA_INBAND, 0);
	if (!ret)
		ret = wait_for_response(hdev, &comp);

	/*
	 * In the case of fast device addition/removal, it's possible that
	 * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we
	 * already got a PCI_BUS_RELATIONS* message from the host and the
	 * channel callback already scheduled a work to hbus->wq, which can be
	 * running pci_devices_present_work() -> survey_child_resources() ->
	 * complete(&hbus->survey_event), even after hv_pci_query_relations()
	 * exits and the stack variable 'comp' is no longer valid; as a result,
	 * a hang or a page fault may happen when the complete() calls
	 * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from
	 * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is
	 * -ENODEV, there can't be any more work item scheduled to hbus->wq
	 * after the flush_workqueue(): see vmbus_onoffer_rescind() ->
	 * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() ->
	 * channel->rescind = true.
	 */
	flush_workqueue(hbus->wq);

	return ret;
}

/**
 * hv_send_resources_allocated() - Report local resource choices
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * The host OS is expecting to be sent a request as a message
 * which contains all the resources that the device will use.
 * The response contains those same resources, "translated"
 * which is to say, the values which should be used by the
 * hardware, when it delivers an interrupt. (MMIO resources are
 * used in local terms.) This is nice for Windows, and lines up
 * with the FDO/PDO split, which doesn't exist in Linux. Linux
 * is deeply expecting to scan an emulated PCI configuration
 * space. So this message is sent here only to drive the state
 * machine on the host forward.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_allocated(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_resources_assigned *res_assigned;
	struct pci_resources_assigned2 *res_assigned2;
	struct hv_pci_compl comp_pkt;
	struct hv_pci_dev *hpdev;
	struct pci_packet *pkt;
	size_t size_res;
	int wslot;
	int ret;

	size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
			? sizeof(*res_assigned) : sizeof(*res_assigned2);

	pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	ret = 0;

	for (wslot = 0; wslot < 256; wslot++) {
		hpdev = get_pcichild_wslot(hbus, wslot);
		if (!hpdev)
			continue;

		memset(pkt, 0, sizeof(*pkt) + size_res);
		init_completion(&comp_pkt.host_event);
		pkt->completion_func = hv_pci_generic_compl;
		pkt->compl_ctxt = &comp_pkt;

		if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
			res_assigned =
				(struct pci_resources_assigned *)&pkt->message;
			res_assigned->message_type.type =
				PCI_RESOURCES_ASSIGNED;
			res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
		} else {
			res_assigned2 =
				(struct pci_resources_assigned2 *)&pkt->message;
			res_assigned2->message_type.type =
				PCI_RESOURCES_ASSIGNED2;
			res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
		}
		put_pcichild(hpdev);

		ret = vmbus_sendpacket(hdev->channel, &pkt->message,
				size_res, (unsigned long)pkt,
				VM_PKT_DATA_INBAND,
				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
		if (!ret)
			ret = wait_for_response(hdev, &comp_pkt.host_event);
		if (ret)
			break;

		if (comp_pkt.completion_status < 0) {
			ret = -EPROTO;
			dev_err(&hdev->device,
				"resource allocated returned 0x%x",
				comp_pkt.completion_status);
			break;
		}

		hbus->wslot_res_allocated = wslot;
	}

	kfree(pkt);
	return ret;
}

/**
 * hv_send_resources_released() - Report local resources
 * released
 * @hdev:	VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_released(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct pci_child_message pkt;
	struct hv_pci_dev *hpdev;
	int wslot;
	int ret;

	for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) {
		hpdev = get_pcichild_wslot(hbus, wslot);
		if (!hpdev)
			continue;

		memset(&pkt, 0, sizeof(pkt));
		pkt.message_type.type = PCI_RESOURCES_RELEASED;
		pkt.wslot.slot = hpdev->desc.win_slot.slot;

		put_pcichild(hpdev);

		ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
				       VM_PKT_DATA_INBAND, 0);
		if (ret)
			return ret;

		hbus->wslot_res_allocated = wslot - 1;
	}

	hbus->wslot_res_allocated = -1;

	return 0;
}

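/*
 * Bitmap of PCI domain numbers currently in use by this driver, used to
 * give each emulated root bus a unique domain.
 */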
#define HVPCI_DOM_MAP_SIZE (64 * 1024)
static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);

/*
 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
 * as invalid for passthrough PCI devices of this driver.
 */
#define HVPCI_DOM_INVALID 0

/**
 * hv_get_dom_num() - Get a valid PCI domain number
 * Check if the PCI domain number is in use, and return another number if
 * it is in use.
 *
 * @dom: Requested domain number
 *
 * return: domain number on success, HVPCI_DOM_INVALID on failure
 */
static u16 hv_get_dom_num(u16 dom)
{
	unsigned int i;

	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
		return dom;

	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
		if (test_and_set_bit(i, hvpci_dom_map) == 0)
			return i;
	}

	return HVPCI_DOM_INVALID;
}

/**
 * hv_put_dom_num() - Mark the PCI domain number as free
 * @dom: Domain number to be freed
 */
static void hv_put_dom_num(u16 dom)
{
	clear_bit(dom, hvpci_dom_map);
}

/**
 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
 * @hdev:	VMBus's tracking struct for this root PCI bus
 * @dev_id:	Identifies the device itself
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_probe(struct hv_device *hdev,
			const struct hv_vmbus_device_id *dev_id)
{
	struct pci_host_bridge *bridge;
	struct hv_pcibus_device *hbus;
	u16 dom_req, dom;
	char *name;
	int ret;

	bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
	if (!bridge)
		return -ENOMEM;

	hbus = kzalloc(sizeof(*hbus), GFP_KERNEL);
	if (!hbus)
		return -ENOMEM;

	hbus->bridge = bridge;
	mutex_init(&hbus->state_lock);
	hbus->state = hv_pcibus_init;
	hbus->wslot_res_allocated = -1;

	/*
	 * The PCI bus "domain" is what is called "segment" in ACPI and other
	 * specs. Pull it from the instance ID, to get something usually
	 * unique. In rare cases of collision, we will find out another number
	 * not in use.
	 *
	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
	 * together with this guest driver can guarantee that (1) The only
	 * domain used by Gen1 VMs for something that looks like a physical
	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
	 * (2) There will be no overlap between domains (after fixing possible
	 * collisions) in the same VM.
	 */
	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
	dom = hv_get_dom_num(dom_req);

	if (dom == HVPCI_DOM_INVALID) {
		dev_err(&hdev->device,
			"Unable to use dom# 0x%x or other numbers", dom_req);
		ret = -EINVAL;
		goto free_bus;
	}

	if (dom != dom_req)
		dev_info(&hdev->device,
			 "PCI dom# 0x%x has collision, using 0x%x",
			 dom_req, dom);

	hbus->bridge->domain_nr = dom;
#ifdef CONFIG_X86
	hbus->sysdata.domain = dom;
	hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS);
#elif defined(CONFIG_ARM64)
	/*
	 * Set the PCI bus parent to be the corresponding VMbus
	 * device. Then the VMbus device will be assigned as the
	 * ACPI companion in pcibios_root_bridge_prepare() and
	 * pci_dma_configure() will propagate device coherence
	 * information to devices created on the bus.
	 */
	hbus->sysdata.parent = hdev->device.parent;
	hbus->use_calls = false;
#endif

	hbus->hdev = hdev;
	INIT_LIST_HEAD(&hbus->children);
	INIT_LIST_HEAD(&hbus->dr_list);
	spin_lock_init(&hbus->config_lock);
	spin_lock_init(&hbus->device_list_lock);
	hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
					   hbus->bridge->domain_nr);
	if (!hbus->wq) {
		ret = -ENOMEM;
		goto free_dom;
	}

	hdev->channel->next_request_id_callback = vmbus_next_request_id;
	hdev->channel->request_addr_callback = vmbus_request_addr;
	hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;

	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
			 hv_pci_onchannelcallback, hbus);
	if (ret)
		goto destroy_wq;

	hv_set_drvdata(hdev, hbus);

	ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
					  ARRAY_SIZE(pci_protocol_versions));
	if (ret)
		goto close;

	ret = hv_allocate_config_window(hbus);
	if (ret)
		goto close;

	hbus->cfg_addr = ioremap(hbus->mem_config->start,
				 PCI_CONFIG_MMIO_LENGTH);
	if (!hbus->cfg_addr) {
		dev_err(&hdev->device,
			"Unable to map a virtual address for config space\n");
		ret = -ENOMEM;
		goto free_config;
	}

	name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance);
	if (!name) {
		ret = -ENOMEM;
		goto unmap;
	}

	hbus->fwnode = irq_domain_alloc_named_fwnode(name);
	kfree(name);
	if (!hbus->fwnode) {
		ret = -ENOMEM;
		goto unmap;
	}

	ret = hv_pcie_init_irq_domain(hbus);
	if (ret)
		goto free_fwnode;

	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto free_irq_domain;

	mutex_lock(&hbus->state_lock);

	ret = hv_pci_enter_d0(hdev);
	if (ret)
		goto release_state_lock;

	ret = hv_pci_allocate_bridge_windows(hbus);
	if (ret)
		goto exit_d0;

	ret = hv_send_resources_allocated(hdev);
	if (ret)
		goto free_windows;

	prepopulate_bars(hbus);

	hbus->state = hv_pcibus_probed;

	ret = create_root_hv_pci_bus(hbus);
	if (ret)
		goto free_windows;

	mutex_unlock(&hbus->state_lock);
	return 0;

free_windows:
	hv_pci_free_bridge_windows(hbus);
exit_d0:
	(void) hv_pci_bus_exit(hdev, true);
release_state_lock:
	mutex_unlock(&hbus->state_lock);
free_irq_domain:
	irq_domain_remove(hbus->irq_domain);
free_fwnode:
	irq_domain_free_fwnode(hbus->fwnode);
unmap:
	iounmap(hbus->cfg_addr);
free_config:
	hv_free_config_window(hbus);
close:
	vmbus_close(hdev->channel);
destroy_wq:
	destroy_workqueue(hbus->wq);
free_dom:
	hv_put_dom_num(hbus->bridge->domain_nr);
free_bus:
	kfree(hbus);
	return ret;
}

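/**
 * hv_pci_bus_exit() - Tear down this bus's state on the host
 * @hdev:	VMBus's tracking struct for this root PCI bus
 * @keep_devs:	If true, keep the internal child device tracking and only
 *		report resources as released before sending PCI_BUS_D0EXIT
 *
 * Return: 0 on success, -errno on failure
 */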
static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct vmbus_channel *chan = hdev->channel;
	struct {
		struct pci_packet teardown_packet;
		u8 buffer[sizeof(struct pci_message)];
	} pkt;
	struct hv_pci_compl comp_pkt;
	struct hv_pci_dev *hpdev, *tmp;
	unsigned long flags;
	u64 trans_id;
	int ret;

	/*
	 * After the host sends the RESCIND_CHANNEL message, it doesn't
	 * access the per-channel ringbuffer any longer.
	 */
	if (chan->rescind)
		return 0;

	if (!keep_devs) {
		struct list_head removed;

		/* Move all present children to the list on stack */
		INIT_LIST_HEAD(&removed);
		spin_lock_irqsave(&hbus->device_list_lock, flags);
		list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry)
			list_move_tail(&hpdev->list_entry, &removed);
		spin_unlock_irqrestore(&hbus->device_list_lock, flags);

		/* Remove all children in the list */
		list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) {
			list_del(&hpdev->list_entry);
			if (hpdev->pci_slot)
				pci_destroy_slot(hpdev->pci_slot);
			/* For the two refs got in new_pcichild_device() */
			put_pcichild(hpdev);
			put_pcichild(hpdev);
		}
	}

	ret = hv_send_resources_released(hdev);
	if (ret) {
		dev_err(&hdev->device,
			"Couldn't send resources released packet(s)\n");
		return ret;
	}

	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
	init_completion(&comp_pkt.host_event);
	pkt.teardown_packet.completion_func = hv_pci_generic_compl;
	pkt.teardown_packet.compl_ctxt = &comp_pkt;
	pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;

	ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message,
				     sizeof(struct pci_message),
				     (unsigned long)&pkt.teardown_packet,
				     &trans_id, VM_PKT_DATA_INBAND,
				     VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		return ret;

	if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) {
		/*
		 * The completion packet on the stack becomes invalid after
		 * 'return'; remove the ID from the VMbus requestor if the
		 * identifier is still mapped to/associated with the packet.
		 *
		 * Cf. hv_pci_onchannelcallback().
		 */
		vmbus_request_addr_match(chan, trans_id,
					 (unsigned long)&pkt.teardown_packet);
		return -ETIMEDOUT;
	}

	return 0;
}

/**
 * hv_pci_remove() - Remove routine for this VMBus channel
 * @hdev:	VMBus's tracking struct for this root PCI bus
 */
static void hv_pci_remove(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus;

	hbus = hv_get_drvdata(hdev);
	if (hbus->state == hv_pcibus_installed) {
		tasklet_disable(&hdev->channel->callback_event);
		hbus->state = hv_pcibus_removing;
		tasklet_enable(&hdev->channel->callback_event);
		destroy_workqueue(hbus->wq);
		hbus->wq = NULL;
		/*
		 * At this point, no work is running or can be scheduled
		 * on hbus->wq. We can't race with hv_pci_devices_present()
		 * or hv_pci_eject_device(); it's safe to proceed.
		 */

		/* Remove the bus from PCI's point of view. */
		pci_lock_rescan_remove();
		pci_stop_root_bus(hbus->bridge->bus);
		hv_pci_remove_slots(hbus);
		pci_remove_root_bus(hbus->bridge->bus);
		pci_unlock_rescan_remove();
	}

	hv_pci_bus_exit(hdev, false);

	vmbus_close(hdev->channel);

	iounmap(hbus->cfg_addr);
	hv_free_config_window(hbus);
	hv_pci_free_bridge_windows(hbus);
	irq_domain_remove(hbus->irq_domain);
	irq_domain_free_fwnode(hbus->fwnode);

	hv_put_dom_num(hbus->bridge->domain_nr);

	kfree(hbus);
}

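/*
 * Suspend callback for this root PCI bus: quiesce the work queue, tear down
 * the bus state on the host (keeping the internal child device tracking) and
 * close the VMBus channel.
 */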
static int hv_pci_suspend(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	enum hv_pcibus_state old_state;
	int ret;

	/*
	 * hv_pci_suspend() must make sure there are no pending work items
	 * before calling vmbus_close(), since it runs in a process context
	 * as a callback in dpm_suspend(). When it starts to run, the channel
	 * callback hv_pci_onchannelcallback(), which runs in a tasklet
	 * context, can be still running concurrently and scheduling new work
	 * items onto hbus->wq in hv_pci_devices_present() and
	 * hv_pci_eject_device(), and the work item handlers can access the
	 * vmbus channel, which can be being closed by hv_pci_suspend(), e.g.
	 * the work item handler pci_devices_present_work() ->
	 * new_pcichild_device() writes to the vmbus channel.
	 *
	 * To eliminate the race, hv_pci_suspend() disables the channel
	 * callback tasklet, sets hbus->state to hv_pcibus_removing, and
	 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
	 * it knows that no new work item can be scheduled, and then it flushes
	 * hbus->wq and safely closes the vmbus channel.
	 */
	tasklet_disable(&hdev->channel->callback_event);

	/* Change the hbus state to prevent new work items. */
	old_state = hbus->state;
	if (hbus->state == hv_pcibus_installed)
		hbus->state = hv_pcibus_removing;

	tasklet_enable(&hdev->channel->callback_event);

	if (old_state != hv_pcibus_installed)
		return -EINVAL;

	flush_workqueue(hbus->wq);

	ret = hv_pci_bus_exit(hdev, true);
	if (ret)
		return ret;

	vmbus_close(hdev->channel);

	return 0;
}

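/*
 * pci_walk_bus() callback: re-compose the MSI/MSI-X message for every
 * interrupt already associated with this device. See the comment above
 * hv_pci_restore_msi_state() below.
 */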
static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
{
	struct irq_data *irq_data;
	struct msi_desc *entry;
	int ret = 0;

	if (!pdev->msi_enabled && !pdev->msix_enabled)
		return 0;

	msi_lock_descs(&pdev->dev);
	msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) {
		irq_data = irq_get_irq_data(entry->irq);
		if (WARN_ON_ONCE(!irq_data)) {
			ret = -EINVAL;
			break;
		}

		hv_compose_msi_msg(irq_data, &entry->msg);
	}
	msi_unlock_descs(&pdev->dev);

	return ret;
}

/*
 * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
 * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg()
 * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping
 * Table entries.
 */
static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
{
	pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL);
}

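/*
 * Resume callback for this root PCI bus: reopen the VMBus channel,
 * renegotiate the previously used protocol version, and bring the bus back
 * into D0 with its resources and MSI state restored.
 */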
static int hv_pci_resume(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	enum pci_protocol_version_t version[1];
	int ret;

	hbus->state = hv_pcibus_init;

	hdev->channel->next_request_id_callback = vmbus_next_request_id;
	hdev->channel->request_addr_callback = vmbus_request_addr;
	hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;

	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
			 hv_pci_onchannelcallback, hbus);
	if (ret)
		return ret;

	/* Only use the version that was in use before hibernation. */
	version[0] = hbus->protocol_version;
	ret = hv_pci_protocol_negotiation(hdev, version, 1);
	if (ret)
		goto out;

	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto out;

	mutex_lock(&hbus->state_lock);

	ret = hv_pci_enter_d0(hdev);
	if (ret)
		goto release_state_lock;

	ret = hv_send_resources_allocated(hdev);
	if (ret)
		goto release_state_lock;

	prepopulate_bars(hbus);

	hv_pci_restore_msi_state(hbus);

	hbus->state = hv_pcibus_installed;
	mutex_unlock(&hbus->state_lock);
	return 0;

release_state_lock:
	mutex_unlock(&hbus->state_lock);
out:
	vmbus_close(hdev->channel);
	return ret;
}

static const struct hv_vmbus_device_id hv_pci_id_table[] = {
	/* PCI Pass-through Class ID */
	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
	{ HV_PCIE_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);

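/* VMBus driver entry points for the Hyper-V paravirtual PCI bus */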
static struct hv_driver hv_pci_drv = {
	.name		= "hv_pci",
	.id_table	= hv_pci_id_table,
	.probe		= hv_pci_probe,
	.remove		= hv_pci_remove,
	.suspend	= hv_pci_suspend,
	.resume		= hv_pci_resume,
};

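/* Module exit: unregister the VMBus driver and detach the PCI block r/w hooks. */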
static void __exit exit_hv_pci_drv(void)
{
	vmbus_driver_unregister(&hv_pci_drv);

	hvpci_block_ops.read_block = NULL;
	hvpci_block_ops.write_block = NULL;
	hvpci_block_ops.reg_blk_invalidate = NULL;
}

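/*
 * Module init: require Hyper-V, perform arch-specific irqchip setup, reserve
 * the invalid domain number, wire up the PCI block r/w interface and register
 * the VMBus driver.
 */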
static int __init init_hv_pci_drv(void)
{
	int ret;

	if (!hv_is_hyperv_initialized())
		return -ENODEV;

	ret = hv_pci_irqchip_init();
	if (ret)
		return ret;

	/* Set the invalid domain number's bit, so it will not be used */
	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);

	/* Initialize PCI block r/w interface */
	hvpci_block_ops.read_block = hv_read_config_block;
	hvpci_block_ops.write_block = hv_write_config_block;
	hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;

	return vmbus_driver_register(&hv_pci_drv);
}

module_init(init_hv_pci_drv);
module_exit(exit_hv_pci_drv);

MODULE_DESCRIPTION("Hyper-V PCI");
MODULE_LICENSE("GPL v2");