| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475 |
- // SPDX-License-Identifier: GPL-2.0-or-later
- /* vnode and volume validity verification.
- *
- * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
- #include <linux/kernel.h>
- #include <linux/module.h>
- #include <linux/sched.h>
- #include "internal.h"
- /*
- * Data validation is managed through a number of mechanisms from the server:
- *
- * (1) On first contact with a server (such as if it has just been rebooted),
- * the server sends us a CB.InitCallBackState* request.
- *
- * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
- * calls, the server maintains a time-limited per-vnode promise that it
- * will send us a CB.CallBack request if a third party alters the vnodes
- * accessed.
- *
- * Note that vnode-level callbacks may also be sent for other reasons,
- * such as filelock release.
- *
- * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
- * calls, each server maintains a time-limited per-volume promise that it
- * will send us a CB.CallBack request if the RO volume is updated to a
- * snapshot of the RW volume ("vos release"). This is an atomic event
- * that cuts over all instances of the RO volume across multiple servers
- * simultaneously.
- *
- * Note that volume-level callbacks may also be sent for other reasons,
- * such as the volumeserver taking over control of the volume from the
- * fileserver.
- *
- * Note also that each server maintains an independent time limit on an
- * independent callback.
- *
- * (4) Certain RPC calls include a volume information record "VolSync" in
- * their reply. This contains a creation date for the volume that should
- * remain unchanged for a RW volume (but will be changed if the volume is
- * restored from backup) or will be bumped to the time of snapshotting
- * when a RO volume is released.
- *
- * In order to track these events, the following are provided:
- *
- * ->cb_v_break. A counter of events that might mean that the contents of
- * a volume have been altered since we last checked a vnode.
- *
- * ->cb_v_check. A counter of the number of events that we've sent a
- * query to the server for. Everything's up to date if this equals
- * cb_v_break.
- *
- * ->cb_scrub. A counter of the number of regression events for which we
- * have to completely wipe the cache.
- *
- * ->cb_ro_snapshot. A counter of the number of times that we've
- * recognised that a RO volume has been updated.
- *
- * ->cb_break. A counter of events that might mean that the contents of a
- * vnode have been altered.
- *
- * ->cb_expires_at. The time at which the callback promise expires or
- * AFS_NO_CB_PROMISE if we have no promise.
- *
- * The way we manage things is:
- *
- * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
- * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
- * volume and volume's server record.
- *
- * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
- * callback break on all the volumes that have been using that server
- * (ie. increment ->cb_v_break and reset ->cb_expires_at).
- *
- * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
- * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
- * dispatch a work item to unmap all PTEs to the vnode's pagecache to
- * force reentry to the filesystem for revalidation.
- *
- * (4) When entering the filesystem, we call afs_validate() to check the
- * validity of a vnode. This first checks to see if ->cb_v_check and
- * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
- * exclusively and perform an FS.FetchStatus on the vnode.
- *
- * After checking the volume, we check the vnode. If there's a mismatch
- * between the volume counters and the vnode's mirrors of those counters,
- * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
- *
- * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
- * parsed:
- *
- * (A) If the Creation timestamp has changed on a RW volume or regressed
- * on a RO volume, we try to increment ->cb_scrub; if it advances on a
- * RO volume, we assume "vos release" happened and try to increment
- * ->cb_ro_snapshot.
- *
- * (B) If the Update timestamp has regressed, we try to increment
- * ->cb_scrub.
- *
- * Note that in both of these cases, we only do the increment if we can
- * cmpxchg the value of the timestamp from the value we noted before the
- * op. This tries to prevent parallel ops from fighting one another.
- *
- * volume->cb_v_check is then set to ->cb_v_break.
- *
- * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
- * parsed and used to set the promise in ->cb_expires_at for the vnode,
- * the volume and the volume's server record.
- *
- * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
- * the vnode.
- */
- /*
- * Check the validity of a vnode/inode and its parent volume.
- */
- bool afs_check_validity(const struct afs_vnode *vnode)
- {
- const struct afs_volume *volume = vnode->volume;
- time64_t deadline = ktime_get_real_seconds() + 10;
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
- return true;
- if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
- atomic64_read(&vnode->cb_expires_at) <= deadline ||
- volume->cb_expires_at <= deadline ||
- vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
- vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
- test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
- _debug("inval");
- return false;
- }
- return true;
- }
- /*
- * See if the server we've just talked to is currently excluded.
- */
- static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
- {
- const struct afs_server_entry *se;
- const struct afs_server_list *slist;
- bool is_excluded = true;
- int i;
- rcu_read_lock();
- slist = rcu_dereference(volume->servers);
- for (i = 0; i < slist->nr_servers; i++) {
- se = &slist->servers[i];
- if (op->server == se->server) {
- is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
- break;
- }
- }
- rcu_read_unlock();
- return is_excluded;
- }
- /*
- * Update the volume's server list when the creation time changes and see if
- * the server we've just talked to is currently excluded.
- */
- static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
- {
- int ret;
- if (__afs_is_server_excluded(op, volume))
- return 1;
- set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
- ret = afs_check_volume_status(op->volume, op);
- if (ret < 0)
- return ret;
- return __afs_is_server_excluded(op, volume);
- }
- /*
- * Handle a change to the volume creation time in the VolSync record.
- */
- static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
- {
- unsigned int snap;
- time64_t cur = volume->creation_time;
- time64_t old = op->pre_volsync.creation;
- time64_t new = op->volsync.creation;
- int ret;
- _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
- if (cur == TIME64_MIN) {
- volume->creation_time = new;
- return 0;
- }
- if (new == cur)
- return 0;
- /* Try to advance the creation timestamp from what we had before the
- * operation to what we got back from the server. This should
- * hopefully ensure that in a race between multiple operations only one
- * of them will do this.
- */
- if (cur != old)
- return 0;
- /* If the creation time changes in an unexpected way, we need to scrub
- * our caches. For a RW vol, this will only change if the volume is
- * restored from a backup; for a RO/Backup vol, this will advance when
- * the volume is updated to a new snapshot (eg. "vos release").
- */
- if (volume->type == AFSVL_RWVOL)
- goto regressed;
- if (volume->type == AFSVL_BACKVOL) {
- if (new < old)
- goto regressed;
- goto advance;
- }
- /* We have an RO volume, we need to query the VL server and look at the
- * server flags to see if RW->RO replication is in progress.
- */
- ret = afs_is_server_excluded(op, volume);
- if (ret < 0)
- return ret;
- if (ret > 0) {
- snap = atomic_read(&volume->cb_ro_snapshot);
- trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
- return ret;
- }
- advance:
- snap = atomic_inc_return(&volume->cb_ro_snapshot);
- trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
- volume->creation_time = new;
- return 0;
- regressed:
- atomic_inc(&volume->cb_scrub);
- trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
- volume->creation_time = new;
- return 0;
- }
- /*
- * Handle a change to the volume update time in the VolSync record.
- */
- static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
- {
- enum afs_cb_break_reason reason = afs_cb_break_no_break;
- time64_t cur = volume->update_time;
- time64_t old = op->pre_volsync.update;
- time64_t new = op->volsync.update;
- _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
- if (cur == TIME64_MIN) {
- volume->update_time = new;
- return;
- }
- if (new == cur)
- return;
- /* If the volume update time changes in an unexpected way, we need to
- * scrub our caches. For a RW vol, this will advance on every
- * modification op; for a RO/Backup vol, this will advance when the
- * volume is updated to a new snapshot (eg. "vos release").
- */
- if (new < old)
- reason = afs_cb_break_for_update_regress;
- /* Try to advance the update timestamp from what we had before the
- * operation to what we got back from the server. This should
- * hopefully ensure that in a race between multiple operations only one
- * of them will do this.
- */
- if (cur == old) {
- if (reason == afs_cb_break_for_update_regress) {
- atomic_inc(&volume->cb_scrub);
- trace_afs_cb_v_break(volume->vid, 0, reason);
- }
- volume->update_time = new;
- }
- }
- static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
- {
- int ret = 0;
- if (likely(op->volsync.creation == volume->creation_time &&
- op->volsync.update == volume->update_time))
- return 0;
- mutex_lock(&volume->volsync_lock);
- if (op->volsync.creation != volume->creation_time) {
- ret = afs_update_volume_creation_time(op, volume);
- if (ret < 0)
- goto out;
- }
- if (op->volsync.update != volume->update_time)
- afs_update_volume_update_time(op, volume);
- out:
- mutex_unlock(&volume->volsync_lock);
- return ret;
- }
- /*
- * Update the state of a volume, including recording the expiration time of the
- * callback promise. Returns 1 to redo the operation from the start.
- */
- int afs_update_volume_state(struct afs_operation *op)
- {
- struct afs_server_list *slist = op->server_list;
- struct afs_server_entry *se = &slist->servers[op->server_index];
- struct afs_callback *cb = &op->file[0].scb.callback;
- struct afs_volume *volume = op->volume;
- unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
- unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
- int ret;
- _enter("%llx", op->volume->vid);
- if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
- ret = afs_update_volume_times(op, volume);
- if (ret != 0) {
- _leave(" = %d", ret);
- return ret;
- }
- }
- if (op->cb_v_break == cb_v_break &&
- (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
- time64_t expires_at = cb->expires_at;
- if (!op->file[0].scb.have_cb)
- expires_at = op->file[1].scb.callback.expires_at;
- se->cb_expires_at = expires_at;
- volume->cb_expires_at = expires_at;
- }
- if (cb_v_check < op->cb_v_break)
- atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
- return 0;
- }
- /*
- * mark the data attached to an inode as obsolete due to a write on the server
- * - might also want to ditch all the outstanding writes and dirty pages
- */
- static void afs_zap_data(struct afs_vnode *vnode)
- {
- _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
- afs_invalidate_cache(vnode, 0);
- /* nuke all the non-dirty pages that aren't locked, mapped or being
- * written back in a regular file and completely discard the pages in a
- * directory or symlink */
- if (S_ISREG(vnode->netfs.inode.i_mode))
- filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
- else
- filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
- }
- /*
- * validate a vnode/inode
- * - there are several things we need to check
- * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
- * symlink)
- * - parent dir metadata changed (security changes)
- * - dentry data changed (write, truncate)
- * - dentry metadata changed (security changes)
- */
- int afs_validate(struct afs_vnode *vnode, struct key *key)
- {
- struct afs_volume *volume = vnode->volume;
- unsigned int cb_ro_snapshot, cb_scrub;
- time64_t deadline = ktime_get_real_seconds() + 10;
- bool zap = false, locked_vol = false;
- int ret;
- _enter("{v={%llx:%llu} fl=%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, vnode->flags,
- key_serial(key));
- if (afs_check_validity(vnode))
- return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
- ret = down_write_killable(&vnode->validate_lock);
- if (ret < 0)
- goto error;
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- ret = -ESTALE;
- goto error_unlock;
- }
- /* Validate a volume after the v_break has changed or the volume
- * callback expired. We only want to do this once per volume per
- * v_break change. The actual work will be done when parsing the
- * status fetch reply.
- */
- if (volume->cb_expires_at <= deadline ||
- atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
- ret = mutex_lock_interruptible(&volume->cb_check_lock);
- if (ret < 0)
- goto error_unlock;
- locked_vol = true;
- }
- cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
- cb_scrub = atomic_read(&volume->cb_scrub);
- if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
- vnode->cb_scrub != cb_scrub)
- unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
- if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
- vnode->cb_scrub != cb_scrub ||
- volume->cb_expires_at <= deadline ||
- atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
- atomic64_read(&vnode->cb_expires_at) <= deadline
- ) {
- ret = afs_fetch_status(vnode, key, false, NULL);
- if (ret < 0) {
- if (ret == -ENOENT) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
- goto error_unlock;
- }
- _debug("new promise [fl=%lx]", vnode->flags);
- }
- /* We can drop the volume lock now as. */
- if (locked_vol) {
- mutex_unlock(&volume->cb_check_lock);
- locked_vol = false;
- }
- cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
- cb_scrub = atomic_read(&volume->cb_scrub);
- _debug("vnode inval %x==%x %x==%x",
- vnode->cb_ro_snapshot, cb_ro_snapshot,
- vnode->cb_scrub, cb_scrub);
- if (vnode->cb_scrub != cb_scrub)
- zap = true;
- vnode->cb_ro_snapshot = cb_ro_snapshot;
- vnode->cb_scrub = cb_scrub;
- /* if the vnode's data version number changed then its contents are
- * different */
- zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
- if (zap)
- afs_zap_data(vnode);
- up_write(&vnode->validate_lock);
- _leave(" = 0");
- return 0;
- error_unlock:
- if (locked_vol)
- mutex_unlock(&volume->cb_check_lock);
- up_write(&vnode->validate_lock);
- error:
- _leave(" = %d", ret);
- return ret;
- }
|