mdsmap.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/ceph/ceph_debug.h>
  3. #include <linux/bug.h>
  4. #include <linux/err.h>
  5. #include <linux/random.h>
  6. #include <linux/slab.h>
  7. #include <linux/types.h>
  8. #include <linux/ceph/messenger.h>
  9. #include <linux/ceph/decode.h>
  10. #include "mdsmap.h"
  11. #include "mds_client.h"
  12. #include "super.h"
  13. #define CEPH_MDS_IS_READY(i, ignore_laggy) \
  14. (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
  15. static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
  16. {
  17. int n = 0;
  18. int i, j;
  19. /* count */
  20. for (i = 0; i < m->possible_max_rank; i++)
  21. if (CEPH_MDS_IS_READY(i, ignore_laggy))
  22. n++;
  23. if (n == 0)
  24. return -1;
  25. /* pick */
  26. n = get_random_u32_below(n);
  27. for (j = 0, i = 0; i < m->possible_max_rank; i++) {
  28. if (CEPH_MDS_IS_READY(i, ignore_laggy))
  29. j++;
  30. if (j > n)
  31. break;
  32. }
  33. return i;
  34. }
  35. /*
  36. * choose a random mds that is "up" (i.e. has a state > 0), or -1.
  37. */
  38. int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
  39. {
  40. int mds;
  41. mds = __mdsmap_get_random_mds(m, false);
  42. if (mds == m->possible_max_rank || mds == -1)
  43. mds = __mdsmap_get_random_mds(m, true);
  44. return mds == m->possible_max_rank ? -1 : mds;
  45. }
/*
 * Skip one fixed-size field of the given type.
 *
 * @p is a pointer to the decode cursor (void **), @end the end of the
 * buffer.  If advancing the cursor by sizeof(type) would move it past
 * @end, control jumps to the label @bad; otherwise the cursor is advanced
 * and the field's contents are discarded.
 */
#define __decode_and_drop_type(p, end, type, bad)		\
	do {							\
		if (*p + sizeof(type) > end)			\
			goto bad;				\
		*p += sizeof(type);				\
	} while (0)
/*
 * Skip an encoded set: a u32 element count followed by that many
 * fixed-size elements of the given type.
 *
 * The count is read with ceph_decode_32_safe() and the payload size is
 * checked with ceph_decode_need() before the cursor is advanced; both
 * helpers are handed the label @bad (presumably as their failure target).
 * The local "n" is private to the macro and shadows any "n" in the caller.
 *
 * NOTE(review): "need" is a size_t product of a u32 count; where size_t
 * is 32 bits this multiplication could wrap - confirm whether that
 * matters for the callers.
 */
#define __decode_and_drop_set(p, end, type, bad)		\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = sizeof(type) * n;			\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
/*
 * Skip an encoded map: a u32 entry count followed by that many
 * (key, value) pairs, where both key and value are fixed-size types.
 *
 * Works like __decode_and_drop_set() with an element size of
 * sizeof(ktype) + sizeof(vtype).  The local "n" is private to the macro
 * and shadows any "n" in the caller.
 *
 * NOTE(review): "need" is a size_t product of a u32 count; where size_t
 * is 32 bits this multiplication could wrap - confirm whether that
 * matters for the callers.
 */
#define __decode_and_drop_map(p, end, ktype, vtype, bad)	\
	do {							\
		u32 n;						\
		size_t need;					\
		ceph_decode_32_safe(p, end, n, bad);		\
		need = (sizeof(ktype) + sizeof(vtype)) * n;	\
		ceph_decode_need(p, end, need, bad);		\
		*p += need;					\
	} while (0)
  70. static int __decode_and_drop_compat_set(void **p, void* end)
  71. {
  72. int i;
  73. /* compat, ro_compat, incompat*/
  74. for (i = 0; i < 3; i++) {
  75. u32 n;
  76. ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
  77. /* mask */
  78. *p += sizeof(u64);
  79. /* names (map<u64, string>) */
  80. n = ceph_decode_32(p);
  81. while (n-- > 0) {
  82. u32 len;
  83. ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
  84. bad);
  85. *p += sizeof(u64);
  86. len = ceph_decode_32(p);
  87. ceph_decode_need(p, end, len, bad);
  88. *p += len;
  89. }
  90. }
  91. return 0;
  92. bad:
  93. return -1;
  94. }
/*
 * Decode an MDS map
 *
 * Ignore any fields we don't care about (there are quite a few of
 * them).
 *
 * @mdsc:  MDS client; only mdsc->fsc->client is used here (for logging)
 * @p:     in/out decode cursor; on success it is set to the end of the map
 * @end:   end of the buffer available for decoding
 * @msgr2: passed through to ceph_decode_entity_addrvec()
 *
 * The map has a "core" part and an "extended" part.  A decode failure in
 * the core part frees everything and returns an ERR_PTR (-EINVAL from the
 * bounds checks, the helper's error code from address decoding, -ENOMEM
 * from allocations).  A decode failure in the extended part jumps to
 * bad_ext, which is also the normal exit: the map decoded so far is
 * returned as a success.
 *
 * Returns the newly allocated map; a map returned successfully is released
 * with ceph_mdsmap_destroy().
 */
struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
				       void *end, bool msgr2)
{
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mdsmap *m;
	const void *start = *p;	/* kept for the hex dump on the corrupt path */
	int i, j, n;
	int err;
	u8 mdsmap_v;
	u16 mdsmap_ev;
	u32 target;

	/* zeroed, so every field not decoded below starts out as 0/NULL */
	m = kzalloc(sizeof(*m), GFP_NOFS);
	if (!m)
		return ERR_PTR(-ENOMEM);

	ceph_decode_need(p, end, 1 + 1, bad);
	mdsmap_v = ceph_decode_8(p);
	*p += sizeof(u8);			/* mdsmap_cv */
	if (mdsmap_v >= 4) {
		/* v4+ carries an explicit payload length: narrow end to it */
		u32 mdsmap_len;
		ceph_decode_32_safe(p, end, mdsmap_len, bad);
		if (end < *p + mdsmap_len)
			goto bad;
		end = *p + mdsmap_len;
	}

	/*
	 * One bounds check for the fixed-size header decoded below: eight
	 * u32 fields plus the u64 max_file_size, up to and including the
	 * mds_info count.
	 */
	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
	m->m_epoch = ceph_decode_32(p);
	m->m_client_epoch = ceph_decode_32(p);
	m->m_last_failure = ceph_decode_32(p);
	m->m_root = ceph_decode_32(p);
	m->m_session_timeout = ceph_decode_32(p);
	m->m_session_autoclose = ceph_decode_32(p);
	m->m_max_file_size = ceph_decode_64(p);
	m->m_max_mds = ceph_decode_32(p);

	/*
	 * pick out the active nodes as the m_num_active_mds, the
	 * m_num_active_mds may be larger than m_max_mds when decreasing
	 * the max_mds on the cluster side, in other cases it should be less
	 * than or equal to m_max_mds.
	 */
	m->m_num_active_mds = n = ceph_decode_32(p);

	/*
	 * the possible max rank, it may be larger than the m_num_active_mds,
	 * for example if the mds_max == 2 in the cluster, when the MDS(0)
	 * was laggy and being replaced by a new MDS, we will temporarily
	 * receive a new mds map with n_num_mds == 1 and the active MDS(1),
	 * and the mds rank >= m_num_active_mds.
	 */
	m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);

	m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
	if (!m->m_info)
		goto nomem;

	/* pick out active nodes from mds_info (state > 0) */
	for (i = 0; i < n; i++) {
		u64 global_id;
		u32 namelen;
		s32 mds, inc, state;
		u8 info_v;
		void *info_end = NULL;	/* stays NULL when no length is encoded */
		struct ceph_entity_addr addr;
		u32 num_export_targets;
		void *pexport_targets = NULL;
		struct ceph_timespec laggy_since;
		struct ceph_mds_info *info;
		bool laggy;

		ceph_decode_need(p, end, sizeof(u64) + 1, bad);
		global_id = ceph_decode_64(p);
		info_v = ceph_decode_8(p);
		if (info_v >= 4) {
			/* v4+ entries are length-prefixed; remember their end */
			u32 info_len;
			ceph_decode_need(p, end, 1 + sizeof(u32), bad);
			*p += sizeof(u8);	/* info_cv */
			info_len = ceph_decode_32(p);
			info_end = *p + info_len;
			if (info_end > end)
				goto bad;
		}

		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
		*p += sizeof(u64);	/* u64 field not used here */
		namelen = ceph_decode_32(p);	/* skip mds name */
		/*
		 * NOTE(review): namelen is skipped without a bounds check of
		 * its own; the ceph_decode_32_safe() that follows presumably
		 * rejects a cursor that ended up past end - confirm.
		 */
		*p += namelen;

		ceph_decode_32_safe(p, end, mds, bad);
		ceph_decode_32_safe(p, end, inc, bad);
		ceph_decode_32_safe(p, end, state, bad);
		*p += sizeof(u64);		/* state_seq */
		/* v8+ entries encode an address vector instead of one address */
		if (info_v >= 8)
			err = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
		else
			err = ceph_decode_entity_addr(p, end, &addr);
		if (err)
			goto corrupt;

		ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since),
				      bad);
		/* any non-zero laggy_since timestamp marks the daemon laggy */
		laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
		/*
		 * Skip a u32 and a length-prefixed string that are not used
		 * here (presumably standby-related fields - confirm against
		 * the encoder).
		 */
		*p += sizeof(u32);
		ceph_decode_32_safe(p, end, namelen, bad);
		*p += namelen;
		if (info_v >= 2) {
			/*
			 * Remember where the targets start; they are only
			 * copied out once the entry has been accepted.
			 *
			 * NOTE(review): the skip is validated only through
			 * the info_end comparison below, which applies to
			 * info_v >= 4; for older entries no bounds check is
			 * visible before the targets are read back - confirm.
			 */
			ceph_decode_32_safe(p, end, num_export_targets, bad);
			pexport_targets = *p;
			*p += num_export_targets * sizeof(u32);
		} else {
			num_export_targets = 0;
		}

		/* drop trailing fields we do not know; overrunning is an error */
		if (info_end && *p != info_end) {
			if (*p > info_end)
				goto bad;
			*p = info_end;
		}

		doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id,
		      mds, inc, ceph_pr_addr(&addr),
		      ceph_mds_state_name(state), laggy ? "(laggy)" : "");

		/* entries without a usable rank or state are skipped, not fatal */
		if (mds < 0 || mds >= m->possible_max_rank) {
			pr_warn_client(cl, "got incorrect mds(%d)\n", mds);
			continue;
		}

		if (state <= 0) {
			doutc(cl, "got incorrect state(%s)\n",
			      ceph_mds_state_name(state));
			continue;
		}

		info = &m->m_info[mds];
		info->global_id = global_id;
		info->state = state;
		info->addr = addr;
		info->laggy = laggy;
		info->num_export_targets = num_export_targets;
		if (num_export_targets) {
			/*
			 * NOTE(review): if two entries carry the same rank,
			 * the earlier export_targets allocation looks like it
			 * would be overwritten without being freed - confirm.
			 */
			info->export_targets = kcalloc(num_export_targets,
						       sizeof(u32), GFP_NOFS);
			if (!info->export_targets)
				goto nomem;
			for (j = 0; j < num_export_targets; j++) {
				target = ceph_decode_32(&pexport_targets);
				info->export_targets[j] = target;
			}
		} else {
			info->export_targets = NULL;
		}
	}

	/* pg_pools */
	ceph_decode_32_safe(p, end, n, bad);
	m->m_num_data_pg_pools = n;
	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
	if (!m->m_data_pg_pools)
		goto nomem;
	/* n data pools plus the cas pool that follows them */
	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
	for (i = 0; i < n; i++)
		m->m_data_pg_pools[i] = ceph_decode_64(p);
	m->m_cas_pg_pool = ceph_decode_64(p);
	/* default; replaced by the explicit flag when mdsmap_ev >= 8 */
	m->m_enabled = m->m_epoch > 1;

	/*
	 * Extended part.  From here on a short or unparsable buffer is not
	 * an error: every check jumps to bad_ext, which returns the map as
	 * decoded so far.
	 */
	mdsmap_ev = 1;
	if (mdsmap_v >= 2) {
		ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
	}
	if (mdsmap_ev >= 3) {
		if (__decode_and_drop_compat_set(p, end) < 0)
			goto bad_ext;
	}
	/* metadata_pool */
	if (mdsmap_ev < 5) {
		__decode_and_drop_type(p, end, u32, bad_ext);
	} else {
		__decode_and_drop_type(p, end, u64, bad_ext);
	}

	/* created + modified + tableserver */
	__decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
	__decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
	__decode_and_drop_type(p, end, u32, bad_ext);

	/* in */
	{
		int num_laggy = 0;
		ceph_decode_32_safe(p, end, n, bad_ext);
		ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);

		/* count how many of the listed ranks were marked laggy above */
		for (i = 0; i < n; i++) {
			s32 mds = ceph_decode_32(p);
			if (mds >= 0 && mds < m->possible_max_rank) {
				if (m->m_info[mds].laggy)
					num_laggy++;
			}
		}
		m->m_num_laggy = num_laggy;

		/* grow the table if the "in" set is larger than it */
		if (n > m->possible_max_rank) {
			void *new_m_info = krealloc(m->m_info,
						    n * sizeof(*m->m_info),
						    GFP_NOFS | __GFP_ZERO);
			if (!new_m_info)
				goto nomem;
			m->m_info = new_m_info;
		}
		/*
		 * NOTE(review): possible_max_rank is replaced by n even when
		 * n is smaller; ceph_mdsmap_destroy() only frees
		 * export_targets for ranks below possible_max_rank - confirm
		 * that nothing above n can be left allocated.
		 */
		m->possible_max_rank = n;
	}

	/* inc */
	__decode_and_drop_map(p, end, u32, u32, bad_ext);
	/* up */
	__decode_and_drop_map(p, end, u32, u64, bad_ext);
	/* failed */
	__decode_and_drop_set(p, end, u32, bad_ext);
	/* stopped */
	__decode_and_drop_set(p, end, u32, bad_ext);

	if (mdsmap_ev >= 4) {
		/* last_failure_osd_epoch */
		__decode_and_drop_type(p, end, u32, bad_ext);
	}
	if (mdsmap_ev >= 6) {
		/* ever_allowed_snaps */
		__decode_and_drop_type(p, end, u8, bad_ext);
		/* explicitly_allowed_snaps */
		__decode_and_drop_type(p, end, u8, bad_ext);
	}
	if (mdsmap_ev >= 7) {
		/* inline_data_enabled */
		__decode_and_drop_type(p, end, u8, bad_ext);
	}
	if (mdsmap_ev >= 8) {
		/* enabled */
		ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
		/* fs_name */
		ceph_decode_skip_string(p, end, bad_ext);
	}
	/* damaged */
	if (mdsmap_ev >= 9) {
		/* only whether the set is non-empty is kept */
		size_t need;
		ceph_decode_32_safe(p, end, n, bad_ext);
		need = sizeof(u32) * n;
		ceph_decode_need(p, end, need, bad_ext);
		*p += need;
		m->m_damaged = n > 0;
	} else {
		m->m_damaged = false;
	}
	if (mdsmap_ev >= 17) {
		/* balancer */
		ceph_decode_skip_string(p, end, bad_ext);
		/* standby_count_wanted */
		ceph_decode_skip_32(p, end, bad_ext);
		/* old_max_mds */
		ceph_decode_skip_32(p, end, bad_ext);
		/* min_compat_client */
		ceph_decode_skip_8(p, end, bad_ext);
		/* required_client_features */
		ceph_decode_skip_set(p, end, 64, bad_ext);
		/* bal_rank_mask */
		ceph_decode_skip_string(p, end, bad_ext);
	}
	if (mdsmap_ev >= 18) {
		/* left at 0 (from kzalloc) for older encodings */
		ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
	}
bad_ext:
	/*
	 * Reached on normal completion as well as on an extended-part decode
	 * failure; either way the rest of the buffer is consumed and the map
	 * is returned.
	 */
	doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
	      !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
	*p = end;
	doutc(cl, "success epoch %u\n", m->m_epoch);
	return m;
nomem:
	/* allocation failure: free the map without dumping the buffer */
	err = -ENOMEM;
	goto out_err;
corrupt:
	/* err has been set by whoever jumped here (directly or via bad) */
	pr_err_client(cl, "corrupt mdsmap\n");
	print_hex_dump(KERN_DEBUG, "mdsmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
out_err:
	ceph_mdsmap_destroy(m);
	return ERR_PTR(err);
bad:
	/* core-part bounds check failed */
	err = -EINVAL;
	goto corrupt;
}
  369. void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
  370. {
  371. int i;
  372. if (m->m_info) {
  373. for (i = 0; i < m->possible_max_rank; i++)
  374. kfree(m->m_info[i].export_targets);
  375. kfree(m->m_info);
  376. }
  377. kfree(m->m_data_pg_pools);
  378. kfree(m);
  379. }
  380. bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
  381. {
  382. int i, nr_active = 0;
  383. if (!m->m_enabled)
  384. return false;
  385. if (m->m_damaged)
  386. return false;
  387. if (m->m_num_laggy == m->m_num_active_mds)
  388. return false;
  389. for (i = 0; i < m->possible_max_rank; i++) {
  390. if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
  391. nr_active++;
  392. }
  393. return nr_active > 0;
  394. }