diff options
Diffstat (limited to 'xlators')
710 files changed, 339072 insertions, 353856 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am index 876a31f8e81..ef20cbb64fa 100644 --- a/xlators/Makefile.am +++ b/xlators/Makefile.am @@ -1,16 +1,12 @@ -if ENABLE_EXPERIMENTAL - EXPERIMENTAL = experimental -endif - if BUILD_GNFS GNFS_DIR = nfs endif -DIST_SUBDIRS = cluster storage protocol performance debug features encryption \ - mount nfs mgmt system playground meta experimental +DIST_SUBDIRS = cluster storage protocol performance debug features \ + mount nfs mgmt system playground meta -SUBDIRS = cluster storage protocol performance debug features encryption \ - mount ${GNFS_DIR} mgmt system playground meta $(EXPERIMENTAL) +SUBDIRS = cluster storage protocol performance debug features \ + mount ${GNFS_DIR} mgmt system playground meta EXTRA_DIST = xlator.sym diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am index 903fbb39f12..8e067d5ab58 100644 --- a/xlators/cluster/Makefile.am +++ b/xlators/cluster/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = stripe afr dht ec +SUBDIRS = afr dht ec CLEANFILES = diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c index 11594a2b7ae..032ab5c8001 100644 --- a/xlators/cluster/afr/src/afr-common.c +++ b/xlators/cluster/afr/src/afr-common.c @@ -15,22 +15,20 @@ #include <stdlib.h> #include <signal.h> -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "statedump.h" -#include "events.h" -#include "upcall-utils.h" +#include <glusterfs/dict.h> +#include <glusterfs/hashfn.h> +#include <glusterfs/list.h> +#include <glusterfs/call-stub.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/statedump.h> +#include <glusterfs/events.h> +#include <glusterfs/upcall-utils.h> #include "afr-inode-read.h" #include "afr-inode-write.h" @@ -40,120 +38,798 @@ #include "afr-self-heal.h" #include "afr-self-heald.h" #include "afr-messages.h" -#include "compound-fop-utils.h" int32_t -afr_quorum_errno (afr_private_t *priv) +afr_quorum_errno(afr_private_t *priv) { - if (priv->quorum_reads) - return ENOTCONN; - return EROFS; + return ENOTCONN; +} + +gf_boolean_t +afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name, + pid_t pid) +{ + if (!__is_root_gfid(pargfid)) { + return _gf_false; + } + + if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) { + /*For backward compatibility /.landfill is private*/ + return _gf_true; + } + + if (pid == GF_CLIENT_PID_GSYNCD) { + /*geo-rep needs to create/sync private directory on slave because + * it appears in changelog*/ + return _gf_false; + } + + if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) { + if (strcmp(name, priv->anon_inode_name) == 0) { + /* anonymous-inode dir is private*/ + return _gf_true; + } + } else { + if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) == + 0) { + /* anonymous-inode dir prefix is private for geo-rep to work*/ + return _gf_true; + } + } + + return _gf_false; +} + +void +afr_fill_success_replies(afr_local_t *local, afr_private_t *priv, + unsigned char *replies) +{ + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + replies[i] = 1; + } else { + replies[i] = 0; + } + } } int -afr_fav_child_reset_sink_xattrs (void *opaque); +afr_fav_child_reset_sink_xattrs(void *opaque); int -afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *frame, - void *opaque); +afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque); static void -afr_discover_done (call_frame_t *frame, xlator_t *this); +afr_discover_done(call_frame_t *frame, xlator_t *this); -gf_boolean_t -afr_is_consistent_io_possible (afr_local_t *local, afr_private_t *priv, - int32_t *op_errno) +int +afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - if (priv->consistent_io && local->call_count != priv->child_count) { - gf_msg (THIS->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOLS_DOWN, "All subvolumes are not up"); - if (op_errno) - *op_errno = ENOTCONN; - return _gf_false; + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; + + local->cont.lk.dom_lock_op_ret[i] = op_ret; + local->cont.lk.dom_lock_op_errno[i] = op_errno; + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to acquire %s on %s", + uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM, + priv->children[i]->name); + } else { + local->cont.lk.dom_locked_nodes[i] = 1; + } + + syncbarrier_wake(&local->barrier); + + return 0; +} + +int +afr_dom_lock_acquire(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + int i = 0; + + priv = frame->this->private; + local = frame->local; + local->cont.lk.dom_locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + if (!local->cont.lk.dom_locked_nodes) { + return -ENOMEM; + } + local->cont.lk.dom_lock_op_ret = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_ret) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + local->cont.lk.dom_lock_op_errno = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno), + gf_afr_mt_int32_t); + if (!local->cont.lk.dom_lock_op_errno) { + return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */ + } + flock.l_type = F_WRLCK; + + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLK, &flock, NULL); + + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) + goto blocking_lock; + + /*If any of the bricks returned EAGAIN, we still need blocking locks.*/ + if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) != + priv->child_count) { + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.dom_lock_op_ret[i] == -1 && + local->cont.lk.dom_lock_op_errno[i] == EAGAIN) + goto blocking_lock; } - return _gf_true; + } + + return 0; + +blocking_lock: + afr_dom_lock_release(frame); + AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM, + local->fd, F_SETLKW, &flock, NULL); + if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) { + afr_dom_lock_release(frame); + return -afr_quorum_errno(priv); + } + + return 0; } -call_frame_t * -afr_copy_frame (call_frame_t *base) +int +afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *frame = NULL; - int op_errno = 0; + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = (long)cookie; - frame = copy_frame (base); - if (!frame) - return NULL; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) { - AFR_STACK_DESTROY (frame); - return NULL; - } + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to release %s on %s", local->loc.path, + AFR_LK_HEAL_DOM, priv->children[i]->name); + } + local->cont.lk.dom_locked_nodes[i] = 0; - return frame; + syncbarrier_wake(&local->barrier); + + return 0; } -/* Check if an entry or inode could be undergoing a transaction. */ -gf_boolean_t -afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local, - xlator_t *this) +void +afr_dom_lock_release(call_frame_t *frame) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + struct gf_flock flock = { + 0, + }; + + local = frame->local; + priv = frame->this->private; + locked_on = local->cont.lk.dom_locked_nodes; + if (AFR_COUNT(locked_on, priv->child_count) == 0) + return; + flock.l_type = F_UNLCK; + + AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk, + AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL); + + return; +} + +static void +afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info) { - int i = 0; - int tmp = 0; - afr_private_t *priv = NULL; - GF_UNUSED char *key = NULL; + if (!info) + return; + if (info->xdata_req) + dict_unref(info->xdata_req); + if (info->fd) + fd_unref(info->fd); + GF_FREE(info->locked_nodes); + GF_FREE(info->child_up_event_gen); + GF_FREE(info->child_down_event_gen); + GF_FREE(info); +} - priv = this->private; +static int +afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -ENOMEM; + + info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t); + if (!info) { + goto cleanup; + } + INIT_LIST_HEAD(&info->pos); + info->fd = fd_ref(local->fd); + info->cmd = local->cont.lk.cmd; + info->pid = frame->root->pid; + info->flock = local->cont.lk.user_flock; + info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL); + if (!info->xdata_req) { + goto cleanup; + } + info->lk_owner = frame->root->lk_owner; + info->locked_nodes = GF_MALLOC( + sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char); + if (!info->locked_nodes) { + goto cleanup; + } + memcpy(info->locked_nodes, local->cont.lk.locked_nodes, + sizeof(*info->locked_nodes) * priv->child_count); + info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen), + priv->child_count, gf_afr_mt_int32_t); + if (!info->child_up_event_gen) { + goto cleanup; + } + info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen), + priv->child_count, + gf_afr_mt_int32_t); + if (!info->child_down_event_gen) { + goto cleanup; + } + + LOCK(&local->fd->lock); + { + fd_ctx = __afr_fd_ctx_get(local->fd, this); + if (fd_ctx) + fd_ctx->lk_heal_info = info; + } + UNLOCK(&local->fd->lock); + if (!fd_ctx) { + goto cleanup; + } + + LOCK(&priv->lock); + { + list_add_tail(&info->pos, &priv->saved_locks); + } + UNLOCK(&priv->lock); + + return 0; +cleanup: + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to add lock to healq", + uuid_utoa(local->fd->inode->gfid)); + if (info) { + afr_lk_heal_info_cleanup(info); + if (fd_ctx) { + LOCK(&local->fd->lock); + { + fd_ctx->lk_heal_info = NULL; + } + UNLOCK(&local->fd->lock); + } + } + return ret; +} - if (type == AFR_ENTRY_TRANSACTION) - key = GLUSTERFS_PARENT_ENTRYLK; - else if (type == AFR_DATA_TRANSACTION) - /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once - * pl_inodelk_xattr_fill supports separate keys for different - * domains.*/ - key = GLUSTERFS_INODELK_COUNT; +static int +afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = local->cont.lk.user_flock; + afr_lk_heal_info_t *info = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = -EINVAL; + + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx || !fd_ctx->lk_heal_info) { + goto out; + } + + info = fd_ctx->lk_heal_info; + if ((info->flock.l_start != flock.l_start) || + (info->flock.l_whence != flock.l_whence) || + (info->flock.l_len != flock.l_len)) { + /*TODO: Compare lkowners too.*/ + goto out; + } + + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + UNLOCK(&priv->lock); + + afr_lk_heal_info_cleanup(info); + fd_ctx->lk_heal_info = NULL; + ret = 0; +out: + if (ret) + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM, + "%s: Failed to remove lock from healq", + uuid_utoa(local->fd->inode->gfid)); + return ret; +} - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].xdata) - continue; - if (dict_get_int32 (local->replies[i].xdata, key, &tmp) == 0) - if (tmp) - return _gf_true; - } +int +afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; - return _gf_false; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed to heal lock on child %d for %s", i, + uuid_utoa(local->fd->inode->gfid)); + } + syncbarrier_wake(&local->barrier); + return 0; } int -__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) +afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - uint64_t ctx_int = 0; - int ret = -1; - afr_inode_ctx_t *tmp_ctx = NULL; + afr_local_t *local = frame->local; + int i = (long)cookie; - ret = __inode_ctx_get (inode, this, &ctx_int); - if (ret) { - tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), - gf_afr_mt_inode_ctx_t); - if (!tmp_ctx) - goto out; - - ctx_int = (long) tmp_ctx; - ret = __inode_ctx_set (inode, this, &ctx_int); - if (ret) { - GF_FREE (tmp_ctx); - goto out; - } - tmp_ctx->spb_choice = -1; - tmp_ctx->read_subvol = 0; - } else { - tmp_ctx = (afr_inode_ctx_t *) ctx_int; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret != 0) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid)); + } else { + local->cont.lk.getlk_rsp[i] = *lock; + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +static gf_boolean_t +afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + afr_local_t *local = frame->local; + struct gf_flock flock = { + 0, + }; + gf_boolean_t ret = _gf_true; + char *wind_on = alloca0(priv->child_count); + unsigned char *success_replies = alloca0(priv->child_count); + local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp), + priv->child_count, gf_afr_mt_gf_lock); + + flock = info->flock; + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + wind_on[i] = 1; + } + + AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock, + info->xdata_req); + + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) == 0) { + ret = _gf_false; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret != 0) + continue; + if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK) + continue; + /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/ + if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner, + &info->lk_owner)) { + ret = _gf_false; + break; + } + } +out: + afr_local_replies_wipe(local, priv); + GF_FREE(local->cont.lk.getlk_rsp); + local->cont.lk.getlk_rsp = NULL; + return ret; +} + +static void +afr_mark_fd_bad(fd_t *fd, xlator_t *this) +{ + afr_fd_ctx_t *fd_ctx = NULL; + + if (!fd) + return; + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + if (fd_ctx) { + fd_ctx->is_fd_bad = _gf_true; + fd_ctx->lk_heal_info = NULL; } + } + UNLOCK(&fd->lock); +} - *ctx = tmp_ctx; - ret = 0; +static void +afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info) +{ + LOCK(&priv->lock); + { + list_del(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + UNLOCK(&priv->lock); +} + +static void +afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv, + afr_lk_heal_info_t *info) +{ + int i = 0; + int op_errno = 0; + int32_t *current_event_gen = NULL; + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + char *wind_on = alloca0(priv->child_count); + gf_boolean_t retry = _gf_true; + + frame->root->pid = info->pid; + lk_owner_copy(&frame->root->lk_owner, &info->lk_owner); + + op_errno = -afr_dom_lock_acquire(frame); + if ((op_errno != 0)) { + goto release; + } + + if (!afr_does_lk_owner_match(frame, priv, info)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM, + "Ignoring lock heal for %s since lk-onwers mismatch. " + "Lock possibly pre-empted by another client.", + uuid_utoa(info->fd->inode->gfid)); + goto release; + } + + for (i = 0; i < priv->child_count; i++) { + if (info->locked_nodes[i]) + continue; + wind_on[i] = 1; + } + + current_event_gen = alloca(priv->child_count); + memcpy(current_event_gen, info->child_up_event_gen, + priv->child_count * sizeof *current_event_gen); + AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd, + &info->flock, info->xdata_req); + + LOCK(&priv->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) { + continue; + } + + if ((current_event_gen[i] == info->child_up_event_gen[i]) && + (current_event_gen[i] > info->child_down_event_gen[i])) { + info->locked_nodes[i] = 1; + retry = _gf_false; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->saved_locks); + } else { + /*We received subsequent child up/down events while heal was in + * progress; don't mark child as healed. Attempt again on the + * new child up*/ + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM, + "Event gen mismatch: skipped healing lock on child %d " + "for %s.", + i, uuid_utoa(info->fd->inode->gfid)); + } + } + } + UNLOCK(&priv->lock); + +release: + afr_dom_lock_release(frame); + if (retry) + afr_add_lock_to_lkhealq(priv, info); + return; +} + +static int +afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + STACK_DESTROY(frame->root); + return 0; +} + +static int +afr_lock_heal(void *opaque) +{ + call_frame_t *frame = (call_frame_t *)opaque; + call_frame_t *iter_frame = NULL; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + struct list_head healq = { + 0, + }; + int ret = 0; + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + return ENOMEM; + } + + INIT_LIST_HEAD(&healq); + LOCK(&priv->lock); + { + list_splice_init(&priv->lk_healq, &healq); + } + UNLOCK(&priv->lock); + + list_for_each_entry_safe(info, tmp, &healq, pos) + { + GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) < + priv->child_count)); + ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd); + afr_lock_heal_do(iter_frame, priv, info); + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = ENOTCONN; + gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN, + AFR_MSG_LK_HEAL_DOM, + "Aborting processing of lk_healq." + "Healing will be reattempted on next child up for locks " + "that are still in quorum."); + LOCK(&priv->lock); + { + list_add_tail(&healq, &priv->lk_healq); + } + UNLOCK(&priv->lock); + break; + } + } + + AFR_STACK_DESTROY(iter_frame); + return ret; +} + +static int +__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child) +{ + int ret = 0; + call_frame_t *frame = NULL; + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_up_event_gen[child] = priv->event_generation; + list_del_init(&info->pos); + list_add_tail(&info->pos, &priv->lk_healq); + } + + frame = create_frame(this, this->ctx->pool); + if (!frame) + return -1; + + ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame, + frame); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM, + "Failed to launch lock heal synctask"); + + return ret; +} + +static int +__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child) +{ + afr_lk_heal_info_t *info = NULL; + afr_lk_heal_info_t *tmp = NULL; + + if (priv->shd.iamshd) + return 0; + list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos) + { + info->child_down_event_gen[child] = priv->event_generation; + if (info->locked_nodes[child] == 1) + info->locked_nodes[child] = 0; + if (!afr_has_quorum(info->locked_nodes, this, NULL)) { + /* Since the lock was lost on quorum no. of nodes, we should + * not attempt to heal it anymore. Some other client could have + * acquired the lock, modified data and released it and this + * client wouldn't know about it if we heal it.*/ + afr_mark_fd_bad(info->fd, this); + list_del(&info->pos); + afr_lk_heal_info_cleanup(info); + /* We're not winding an unlock on the node where the lock is still + * present because when fencing logic switches over to the new + * client (since we marked the fd bad), it should preempt any + * existing lock. */ + } + } + return 0; +} + +gf_boolean_t +afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv, + int32_t *op_errno) +{ + if (priv->consistent_io && local->call_count != priv->child_count) { + gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are not up"); + if (op_errno) + *op_errno = ENOTCONN; + return _gf_false; + } + return _gf_true; +} + +gf_boolean_t +afr_is_lock_mode_mandatory(dict_t *xdata) +{ + int ret = 0; + uint32_t lk_mode = GF_LK_ADVISORY; + + ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode); + if (!ret && lk_mode == GF_LK_MANDATORY) + return _gf_true; + + return _gf_false; +} + +call_frame_t * +afr_copy_frame(call_frame_t *base) +{ + afr_local_t *local = NULL; + call_frame_t *frame = NULL; + int op_errno = 0; + + frame = copy_frame(base); + if (!frame) + return NULL; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) { + AFR_STACK_DESTROY(frame); + return NULL; + } + + return frame; +} + +/* Check if an entry or inode could be undergoing a transaction. */ +gf_boolean_t +afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local, + xlator_t *this) +{ + int i = 0; + int tmp = 0; + afr_private_t *priv = NULL; + GF_UNUSED char *key = NULL; + int keylen = 0; + + priv = this->private; + + if (type == AFR_ENTRY_TRANSACTION) { + key = GLUSTERFS_PARENT_ENTRYLK; + keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK); + } else if (type == AFR_DATA_TRANSACTION) { + /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once + * pl_inodelk_xattr_fill supports separate keys for different + * domains.*/ + key = GLUSTERFS_INODELK_COUNT; + keylen = SLEN(GLUSTERFS_INODELK_COUNT); + } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].xdata) + continue; + if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0) + if (tmp) + return _gf_true; + } + + return _gf_false; +} + +static void +afr_inode_ctx_destroy(afr_inode_ctx_t *ctx) +{ + int i = 0; + + if (!ctx) + return; + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + GF_FREE(ctx->pre_op_done[i]); + } + + GF_FREE(ctx); +} + +int +__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) +{ + uint64_t ctx_int = 0; + int ret = -1; + int i = -1; + int num_locks = -1; + afr_inode_ctx_t *ictx = NULL; + afr_lock_t *lock = NULL; + afr_private_t *priv = this->private; + + ret = __inode_ctx_get(inode, this, &ctx_int); + if (ret == 0) { + *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int; + return 0; + } + + ictx = GF_CALLOC(1, sizeof(afr_inode_ctx_t), gf_afr_mt_inode_ctx_t); + if (!ictx) + goto out; + + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + ictx->pre_op_done[i] = GF_CALLOC(sizeof *ictx->pre_op_done[i], + priv->child_count, gf_afr_mt_int32_t); + if (!ictx->pre_op_done[i]) { + ret = -ENOMEM; + goto out; + } + } + + num_locks = sizeof(ictx->lock) / sizeof(afr_lock_t); + for (i = 0; i < num_locks; i++) { + lock = &ictx->lock[i]; + INIT_LIST_HEAD(&lock->post_op); + INIT_LIST_HEAD(&lock->frozen); + INIT_LIST_HEAD(&lock->waiting); + INIT_LIST_HEAD(&lock->owners); + } + + ctx_int = (uint64_t)(uintptr_t)ictx; + ret = __inode_ctx_set(inode, this, &ctx_int); + if (ret) { + goto out; + } + + ictx->spb_choice = -1; + ictx->read_subvol = 0; + ictx->write_subvol = 0; + ictx->lock_count = 0; + ret = 0; + *ctx = ictx; out: - return ret; + if (ret) { + afr_inode_ctx_destroy(ictx); + } + return ret; } /* @@ -187,1646 +863,1681 @@ out: */ int -__afr_set_in_flight_sb_status (xlator_t *this, afr_local_t *local, - inode_t *inode) -{ - int i = 0; - int ret = -1; - int txn_type = 0; - int count = 0; - int index = -1; - uint16_t datamap_old = 0; - uint16_t metadatamap_old = 0; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint16_t tmp_map = 0; - uint16_t mask = 0; - uint32_t event = 0; - uint64_t val = 0; - afr_private_t *priv = NULL; - afr_inode_ctx_t *ctx = NULL; - - priv = this->private; - txn_type = local->transaction.type; - - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret < 0) - return ret; - - val = ctx->read_subvol; - - metadatamap_old = metadatamap = (val & 0x000000000000ffff); - datamap_old = datamap = (val & 0x00000000ffff0000) >> 16; - event = (val & 0xffffffff00000000) >> 32; - - if (txn_type == AFR_DATA_TRANSACTION) - tmp_map = datamap; - else if (txn_type == AFR_METADATA_TRANSACTION) - tmp_map = metadatamap; - - count = gf_bits_count (tmp_map); - - if (count == 1) - index = gf_bits_index (tmp_map); - - for (i = 0; i < priv->child_count; i++) { - mask = 0; - if (!local->transaction.failed_subvols[i]) - continue; - - mask = 1 << i; - if (txn_type == AFR_METADATA_TRANSACTION) - metadatamap &= ~mask; - else if (txn_type == AFR_DATA_TRANSACTION) - datamap &= ~mask; - } - - switch (txn_type) { +__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local, + inode_t *inode) +{ + int i = 0; + int txn_type = 0; + int count = 0; + int index = -1; + uint16_t datamap_old = 0; + uint16_t metadatamap_old = 0; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint16_t tmp_map = 0; + uint16_t mask = 0; + uint32_t event = 0; + uint64_t val = 0; + afr_private_t *priv = NULL; + + priv = this->private; + txn_type = local->transaction.type; + + if (txn_type == AFR_DATA_TRANSACTION) + val = local->inode_ctx->write_subvol; + else + val = local->inode_ctx->read_subvol; + + metadatamap_old = metadatamap = (val & 0x000000000000ffff); + datamap_old = datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; + + if (txn_type == AFR_DATA_TRANSACTION) + tmp_map = datamap; + else if (txn_type == AFR_METADATA_TRANSACTION) + tmp_map = metadatamap; + + count = gf_bits_count(tmp_map); + + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.failed_subvols[i]) + continue; + + mask = 1 << i; + if (txn_type == AFR_METADATA_TRANSACTION) + metadatamap &= ~mask; + else if (txn_type == AFR_DATA_TRANSACTION) + datamap &= ~mask; + } + + switch (txn_type) { case AFR_METADATA_TRANSACTION: - if ((metadatamap_old != 0) && (metadatamap == 0) && - (count == 1)) { - local->transaction.in_flight_sb_errno = - local->replies[index].op_errno; - local->transaction.in_flight_sb = _gf_true; - metadatamap |= (1 << index); - } - if (metadatamap_old != metadatamap) { - event = 0; - } - break; + if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) { + index = gf_bits_index(tmp_map); + local->transaction.in_flight_sb_errno = local->replies[index] + .op_errno; + local->transaction.in_flight_sb = _gf_true; + metadatamap |= (1 << index); + } + if (metadatamap_old != metadatamap) { + __afr_inode_need_refresh_set(inode, this); + } + break; case AFR_DATA_TRANSACTION: - if ((datamap_old != 0) && (datamap == 0) && (count == 1)) { - local->transaction.in_flight_sb_errno = - local->replies[index].op_errno; - local->transaction.in_flight_sb = _gf_true; - datamap |= (1 << index); - } - if (datamap_old != datamap) - event = 0; - break; + if ((datamap_old != 0) && (datamap == 0) && (count == 1)) { + index = gf_bits_index(tmp_map); + local->transaction.in_flight_sb_errno = local->replies[index] + .op_errno; + local->transaction.in_flight_sb = _gf_true; + datamap |= (1 << index); + } + if (datamap_old != datamap) + __afr_inode_need_refresh_set(inode, this); + break; default: - break; - } + break; + } - val = ((uint64_t) metadatamap) | - (((uint64_t) datamap) << 16) | - (((uint64_t) event) << 32); + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); - ctx->read_subvol = val; + if (txn_type == AFR_DATA_TRANSACTION) + local->inode_ctx->write_subvol = val; + local->inode_ctx->read_subvol = val; - return ret; + return 0; } gf_boolean_t -afr_is_symmetric_error (call_frame_t *frame, xlator_t *this) +afr_is_symmetric_error(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int op_errno = 0; - int i_errno = 0; - gf_boolean_t matching_errors = _gf_true; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int op_errno = 0; + int i_errno = 0; + gf_boolean_t matching_errors = _gf_true; + int i = 0; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret != -1) { - /* Operation succeeded on at least one subvol, - so it is not a failed-everywhere situation. - */ - matching_errors = _gf_false; - break; - } - i_errno = local->replies[i].op_errno; - - if (i_errno == ENOTCONN) { - /* ENOTCONN is not a symmetric error. We do not - know if the operation was performed on the - backend or not. - */ - matching_errors = _gf_false; - break; - } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret != -1) { + /* Operation succeeded on at least one subvol, + so it is not a failed-everywhere situation. + */ + matching_errors = _gf_false; + break; + } + i_errno = local->replies[i].op_errno; - if (!op_errno) { - op_errno = i_errno; - } else if (op_errno != i_errno) { - /* Mismatching op_errno's */ - matching_errors = _gf_false; - break; - } + if (i_errno == ENOTCONN) { + /* ENOTCONN is not a symmetric error. We do not + know if the operation was performed on the + backend or not. + */ + matching_errors = _gf_false; + break; + } + + if (!op_errno) { + op_errno = i_errno; + } else if (op_errno != i_errno) { + /* Mismatching op_errno's */ + matching_errors = _gf_false; + break; } + } - return matching_errors; + return matching_errors; } int -afr_set_in_flight_sb_status (xlator_t *this, call_frame_t *frame, - inode_t *inode) +afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, inode_t *inode) { - int ret = -1; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + int ret = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - /* If this transaction saw no failures, then exit. */ - if (AFR_COUNT (local->transaction.failed_subvols, - priv->child_count) == 0) - return 0; + /* If this transaction saw no failures, then exit. */ + if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == 0) + return 0; - if (afr_is_symmetric_error (frame, this)) - return 0; + if (afr_is_symmetric_error(frame, this)) + return 0; - LOCK (&inode->lock); - { - ret = __afr_set_in_flight_sb_status (this, local, inode); - } - UNLOCK (&inode->lock); + LOCK(&inode->lock); + { + ret = __afr_set_in_flight_sb_status(this, local, inode); + } + UNLOCK(&inode->lock); - return ret; + return ret; } int -__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this, - unsigned char *data, unsigned char *metadata, - int *event_p) -{ - afr_private_t *priv = NULL; - int ret = -1; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint32_t event = 0; - uint64_t val = 0; - int i = 0; - afr_inode_ctx_t *ctx = NULL; - - priv = this->private; - - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret < 0) - return ret; +__afr_inode_read_subvol_get_small(inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int *event_p) +{ + afr_private_t *priv = NULL; + int ret = -1; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint32_t event = 0; + uint64_t val = 0; + int i = 0; + afr_inode_ctx_t *ctx = NULL; + + priv = this->private; + + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0) + return ret; - val = ctx->read_subvol; + val = ctx->read_subvol; - metadatamap = (val & 0x000000000000ffff); - datamap = (val & 0x00000000ffff0000) >> 16; - event = (val & 0xffffffff00000000) >> 32; + metadatamap = (val & 0x000000000000ffff); + datamap = (val & 0x00000000ffff0000) >> 16; + event = (val & 0xffffffff00000000) >> 32; - for (i = 0; i < priv->child_count; i++) { - if (metadata) - metadata[i] = (metadatamap >> i) & 1; - if (data) - data[i] = (datamap >> i) & 1; - } + for (i = 0; i < priv->child_count; i++) { + if (metadata) + metadata[i] = (metadatamap >> i) & 1; + if (data) + data[i] = (datamap >> i) & 1; + } - if (event_p) - *event_p = event; - return ret; + if (event_p) + *event_p = event; + return ret; } - int -__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this, - unsigned char *data, unsigned char *metadata, - int event) +__afr_inode_read_subvol_set_small(inode_t *inode, xlator_t *this, + unsigned char *data, unsigned char *metadata, + int event) { - afr_private_t *priv = NULL; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint64_t val = 0; - int i = 0; - int ret = -1; - afr_inode_ctx_t *ctx = NULL; + afr_private_t *priv = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int i = 0; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; - priv = this->private; + priv = this->private; - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret) - goto out; + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (data[i]) - datamap |= (1 << i); - if (metadata[i]) - metadatamap |= (1 << i); - } + for (i = 0; i < priv->child_count; i++) { + if (data[i]) + datamap |= (1 << i); + if (metadata[i]) + metadatamap |= (1 << i); + } - val = ((uint64_t) metadatamap) | - (((uint64_t) datamap) << 16) | - (((uint64_t) event) << 32); + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); - ctx->read_subvol = val; + ctx->read_subvol = val; - ret = 0; + ret = 0; out: - return ret; + return ret; } int -__afr_inode_event_gen_reset_small (inode_t *inode, xlator_t *this) +__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - int ret = -1; - uint16_t datamap = 0; - uint16_t metadatamap = 0; - uint32_t event = 0; - uint64_t val = 0; - afr_inode_ctx_t *ctx = NULL; - - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret) - return ret; - - val = ctx->read_subvol; + afr_private_t *priv = NULL; + int ret = -1; - metadatamap = (val & 0x000000000000ffff) >> 0; - datamap = (val & 0x00000000ffff0000) >> 16; - event = 0; + priv = this->private; - val = ((uint64_t) metadatamap) | - (((uint64_t) datamap) << 16) | - (((uint64_t) event) << 32); + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_get_small(inode, this, data, metadata, + event_p); + else + /* TBD: allocate structure with array and read from it */ + ret = -1; - ctx->read_subvol = val; - - return ret; + return ret; } - int -__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, - unsigned char *data, unsigned char *metadata, - int *event_p) +__afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, + int *spb_choice) { - afr_private_t *priv = NULL; - int ret = -1; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - priv = this->private; - - if (priv->child_count <= 16) - ret = __afr_inode_read_subvol_get_small (inode, this, data, - metadata, event_p); - else - /* TBD: allocate structure with array and read from it */ - ret = -1; + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0) + return ret; - return ret; + *spb_choice = ctx->spb_choice; + return 0; } int -__afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, - int *spb_choice) +__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - afr_inode_ctx_t *ctx = NULL; - int ret = -1; + afr_private_t *priv = NULL; + int ret = -1; - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret < 0) - return ret; + priv = this->private; - *spb_choice = ctx->spb_choice; - return 0; + if (priv->child_count <= 16) + ret = __afr_inode_read_subvol_set_small(inode, this, data, metadata, + event); + else + ret = -1; + + return ret; } int -__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, - unsigned char *metadata, int event) +__afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, + int spb_choice) { - afr_private_t *priv = NULL; - int ret = -1; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - priv = this->private; + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto out; - if (priv->child_count <= 16) - ret = __afr_inode_read_subvol_set_small (inode, this, data, - metadata, event); - else - ret = -1; + ctx->spb_choice = spb_choice; - return ret; + ret = 0; +out: + return ret; } int -__afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, - int spb_choice) +afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int *event_p) { - afr_inode_ctx_t *ctx = NULL; - int ret = -1; - - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret) - goto out; + int ret = -1; - ctx->spb_choice = spb_choice; + GF_VALIDATE_OR_GOTO(this->name, inode, out); - ret = 0; + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_get(inode, this, data, metadata, event_p); + } + UNLOCK(&inode->lock); out: - return ret; + return ret; } int -__afr_inode_event_gen_reset (inode_t *inode, xlator_t *this) +afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type) { - afr_private_t *priv = NULL; - int ret = -1; - - priv = this->private; - - if (priv->child_count <= 16) - ret = __afr_inode_event_gen_reset_small (inode, this); - else - ret = -1; + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + unsigned char *data = alloca0(priv->child_count); + unsigned char *metadata = alloca0(priv->child_count); + int data_count = 0; + int metadata_count = 0; + int event_generation = 0; + int ret = 0; + + ret = afr_inode_read_subvol_get(inode, this, data, metadata, + &event_generation); + if (ret == -1) + return -EIO; + + data_count = AFR_COUNT(data, priv->child_count); + metadata_count = AFR_COUNT(metadata, priv->child_count); + + if (inode->ia_type == IA_IFDIR) { + /* For directories, allow even if it is in data split-brain. */ + if (type == AFR_METADATA_TRANSACTION || local->op == GF_FOP_STAT || + local->op == GF_FOP_FSTAT) { + if (!metadata_count) + return -EIO; + } + } else { + /* For files, abort in case of data/metadata split-brain. */ + if (!data_count || !metadata_count) { + return -EIO; + } + } - return ret; + if (type == AFR_METADATA_TRANSACTION && readable) + memcpy(readable, metadata, priv->child_count * sizeof *metadata); + if (type == AFR_DATA_TRANSACTION && readable) { + if (!data_count) + memcpy(readable, local->child_up, + priv->child_count * sizeof *readable); + else + memcpy(readable, data, priv->child_count * sizeof *data); + } + if (event_p) + *event_p = event_generation; + return 0; } - -int -afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data, - unsigned char *metadata, int *event_p) +static int +afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this, + int *spb_choice) { - int ret = -1; - - GF_VALIDATE_OR_GOTO (this->name, inode, out); + int ret = -1; + GF_VALIDATE_OR_GOTO(this->name, inode, out); - LOCK(&inode->lock); - { - ret = __afr_inode_read_subvol_get (inode, this, data, - metadata, event_p); - } - UNLOCK(&inode->lock); + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_get(inode, this, spb_choice); + } + UNLOCK(&inode->lock); out: - return ret; + return ret; } +/* + * frame is used to get the favourite policy. Since + * afr_inode_split_brain_choice_get was called with afr_open, it is possible to + * have a frame with out local->replies. So in that case, frame is passed as + * null, hence this function will handle the frame NULL case. + */ int -afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this, - unsigned char *readable, int *event_p, int type) +afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this, + call_frame_t *frame, int *spb_subvol) { + int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - afr_private_t *priv = this->private; - afr_local_t *local = frame->local; - unsigned char *data = alloca0 (priv->child_count); - unsigned char *metadata = alloca0 (priv->child_count); - int data_count = 0; - int metadata_count = 0; - int event_generation = 0; - int ret = 0; - - ret = afr_inode_read_subvol_get (inode, this, data, metadata, - &event_generation); - if (ret == -1) - return -EIO; + GF_VALIDATE_OR_GOTO("afr", this, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out); - data_count = AFR_COUNT (data, priv->child_count); - metadata_count = AFR_COUNT (metadata, priv->child_count); + priv = this->private; - if (inode->ia_type == IA_IFDIR) { - /* For directories, allow even if it is in data split-brain. */ - if (type == AFR_METADATA_TRANSACTION || - local->op == GF_FOP_STAT || local->op == GF_FOP_FSTAT) { - if (!metadata_count) - return -EIO; - } - } else { - /* For files, abort in case of data/metadata split-brain. */ - if (!data_count || !metadata_count) { - return -EIO; - } - } - - if (type == AFR_METADATA_TRANSACTION && readable) - memcpy (readable, metadata, priv->child_count * sizeof *metadata); - if (type == AFR_DATA_TRANSACTION && readable) { - if (!data_count) - memcpy (readable, local->child_up, - priv->child_count * sizeof *readable); - else - memcpy (readable, data, priv->child_count * sizeof *data); + ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol); + if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) { + local = frame->local; + *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (*spb_subvol >= 0) { + ret = 0; } - if (event_p) - *event_p = event_generation; - return 0; -} - -int -afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this, - int *spb_choice) -{ - int ret = -1; - - GF_VALIDATE_OR_GOTO (this->name, inode, out); + } - LOCK(&inode->lock); - { - ret = __afr_inode_split_brain_choice_get (inode, this, - spb_choice); - } - UNLOCK(&inode->lock); out: - return ret; + return ret; } - - int -afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data, - unsigned char *metadata, int event) +afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data, + unsigned char *metadata, int event) { - int ret = -1; + int ret = -1; - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - LOCK(&inode->lock); - { - ret = __afr_inode_read_subvol_set (inode, this, data, metadata, - event); - } - UNLOCK(&inode->lock); + LOCK(&inode->lock); + { + ret = __afr_inode_read_subvol_set(inode, this, data, metadata, event); + } + UNLOCK(&inode->lock); out: - return ret; + return ret; } - int -afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this, - int spb_choice) +afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice) { - int ret = -1; + int ret = -1; - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - LOCK(&inode->lock); - { - ret = __afr_inode_split_brain_choice_set (inode, this, - spb_choice); - } - UNLOCK(&inode->lock); + LOCK(&inode->lock); + { + ret = __afr_inode_split_brain_choice_set(inode, this, spb_choice); + } + UNLOCK(&inode->lock); out: - return ret; + return ret; } - /* The caller of this should perform afr_inode_refresh, if this function * returns _gf_true */ gf_boolean_t -afr_is_inode_refresh_reqd (inode_t *inode, xlator_t *this, - int event_gen1, int event_gen2) +afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1, + int event_gen2) { - gf_boolean_t need_refresh = _gf_false; - afr_inode_ctx_t *ctx = NULL; - int ret = -1; + gf_boolean_t need_refresh = _gf_false; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - LOCK(&inode->lock); - { - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret) - goto unlock; + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret) + goto unlock; - need_refresh = ctx->need_refresh; - /* Hoping that the caller will do inode_refresh followed by - * this, hence setting the need_refresh to false */ - ctx->need_refresh = _gf_false; - } + need_refresh = ctx->need_refresh; + /* Hoping that the caller will do inode_refresh followed by + * this, hence setting the need_refresh to false */ + ctx->need_refresh = _gf_false; + } unlock: - UNLOCK(&inode->lock); + UNLOCK(&inode->lock); - if (event_gen1 != event_gen2) - need_refresh = _gf_true; + if (event_gen1 != event_gen2) + need_refresh = _gf_true; out: - return need_refresh; + return need_refresh; } - -static int -afr_inode_need_refresh_set (inode_t *inode, xlator_t *this) +int +__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { - int ret = -1; - afr_inode_ctx_t *ctx = NULL; + int ret = -1; + afr_inode_ctx_t *ctx = NULL; - GF_VALIDATE_OR_GOTO (this->name, inode, out); + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret == 0) { + ctx->need_refresh = _gf_true; + } - LOCK(&inode->lock); - { - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret) - goto unlock; - - ctx->need_refresh = _gf_true; - } -unlock: - UNLOCK(&inode->lock); -out: - return ret; + return ret; } int -afr_inode_event_gen_reset (inode_t *inode, xlator_t *this) +afr_inode_need_refresh_set(inode_t *inode, xlator_t *this) { - int ret = -1; + int ret = -1; - GF_VALIDATE_OR_GOTO (this->name, inode, out); + GF_VALIDATE_OR_GOTO(this->name, inode, out); - LOCK(&inode->lock); - { - ret = __afr_inode_event_gen_reset (inode, this); - } - UNLOCK(&inode->lock); + LOCK(&inode->lock); + { + ret = __afr_inode_need_refresh_set(inode, this); + } + UNLOCK(&inode->lock); out: - return ret; + return ret; } int -afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode) +afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode) { - afr_inode_ctx_t *ctx = NULL; - int ret = -1; + afr_inode_ctx_t *ctx = NULL; + int ret = -1; - if (!inode) - return ret; + if (!inode) + return ret; - LOCK(&inode->lock); - { - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret < 0 || !ctx) { - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to cancel split-brain choice timer."); - goto out; - } - ctx->spb_choice = -1; - if (ctx->timer) { - gf_timer_call_cancel (this->ctx, ctx->timer); - ctx->timer = NULL; - } - ret = 0; + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); + if (ret < 0 || !ctx) { + UNLOCK(&inode->lock); + gf_msg(this->name, GF_LOG_WARNING, 0, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Failed to cancel split-brain choice timer."); + goto out; + } + ctx->spb_choice = -1; + if (ctx->timer) { + gf_timer_call_cancel(this->ctx, ctx->timer); + ctx->timer = NULL; } + ret = 0; + } + UNLOCK(&inode->lock); out: - UNLOCK(&inode->lock); - return ret; + return ret; } void -afr_set_split_brain_choice_cbk (void *data) +afr_set_split_brain_choice_cbk(void *data) { - inode_t *inode = data; - xlator_t *this = THIS; + inode_t *inode = data; + xlator_t *this = THIS; - afr_spb_choice_timeout_cancel (this, inode); - inode_invalidate (inode); - inode_unref (inode); - return; + afr_spb_choice_timeout_cancel(this, inode); + inode_invalidate(inode); + inode_unref(inode); + return; } - int -afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque) -{ - int op_errno = ENOMEM; - afr_private_t *priv = NULL; - afr_inode_ctx_t *ctx = NULL; - inode_t *inode = NULL; - loc_t *loc = NULL; - xlator_t *this = NULL; - afr_spbc_timeout_t *data = opaque; - struct timespec delta = {0, }; - gf_boolean_t timer_set = _gf_false; - gf_boolean_t timer_cancelled = _gf_false; - gf_boolean_t timer_reset = _gf_false; - gf_boolean_t need_invalidate = _gf_true; - int old_spb_choice = -1; - - frame = data->frame; - loc = data->loc; - this = frame->this; - priv = this->private; - +afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque) +{ + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_inode_ctx_t *ctx = NULL; + inode_t *inode = NULL; + loc_t *loc = NULL; + xlator_t *this = NULL; + afr_spbc_timeout_t *data = opaque; + struct timespec delta = { + 0, + }; + gf_boolean_t timer_set = _gf_false; + gf_boolean_t timer_cancelled = _gf_false; + gf_boolean_t timer_reset = _gf_false; + int old_spb_choice = -1; + + frame = data->frame; + loc = data->loc; + this = frame->this; + priv = this->private; + + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + delta.tv_sec = priv->spb_choice_timeout; + delta.tv_nsec = 0; + + if (!loc->inode) { + ret = -1; + op_errno = EINVAL; + goto out; + } + + if (!(data->d_spb || data->m_spb)) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Cannot set " + "replica.split-brain-choice on %s. File is" + " not in data/metadata split-brain.", + uuid_utoa(loc->gfid)); + ret = -1; + op_errno = EINVAL; + goto out; + } + + /* + * we're ref'ing the inode before LOCK like it is done elsewhere in the + * code. If we ref after LOCK, coverity complains of possible deadlocks. + */ + inode = inode_ref(loc->inode); + + LOCK(&inode->lock); + { + ret = __afr_inode_ctx_get(this, inode, &ctx); if (ret) { - op_errno = -ret; - ret = -1; - goto out; + UNLOCK(&inode->lock); + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, + "Failed to get inode_ctx for %s", loc->name); + goto post_unlock; } - delta.tv_sec = priv->spb_choice_timeout; - delta.tv_nsec = 0; - - if (!loc->inode) { - ret = -1; - op_errno = EINVAL; - goto out; - } - - if (!(data->d_spb || data->m_spb)) { - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, "Cannot set " - "replica.split-brain-choice on %s. File is" - " not in data/metadata split-brain.", - uuid_utoa (loc->gfid)); - ret = -1; - op_errno = EINVAL; - goto out; - } + old_spb_choice = ctx->spb_choice; + ctx->spb_choice = data->spb_child_index; - /* - * we're ref'ing the inode before LOCK like it is done elsewhere in the - * code. If we ref after LOCK, coverity complains of possible deadlocks. + /* Possible changes in spb-choice : + * valid to -1 : cancel timer and unref + * valid to valid : cancel timer and inject new one + * -1 to -1 : unref and do not do anything + * -1 to valid : inject timer */ - inode = inode_ref (loc->inode); - LOCK(&inode->lock); - { - ret = __afr_inode_ctx_get (this, inode, &ctx); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to get inode_ctx for %s", loc->name); - goto unlock; + /* ctx->timer is NULL iff previous value of + * ctx->spb_choice is -1 + */ + if (ctx->timer) { + if (ctx->spb_choice == -1) { + if (!gf_timer_call_cancel(this->ctx, ctx->timer)) { + ctx->timer = NULL; + timer_cancelled = _gf_true; } - - old_spb_choice = ctx->spb_choice; - ctx->spb_choice = data->spb_child_index; - - /* Possible changes in spb-choice : - * valid to -1 : cancel timer and unref - * valid to valid : cancel timer and inject new one - * -1 to -1 : unref and do not do anything - * -1 to valid : inject timer + /* If timer cancel failed here it means that the + * previous cbk will be executed which will set + * spb_choice to -1. So we can consider the + * 'valid to -1' case to be a success + * (i.e. ret = 0) and goto unlock. */ - - /* ctx->timer is NULL iff previous value of - * ctx->spb_choice is -1 - */ - if (ctx->timer) { - if (ctx->spb_choice == -1) { - if (!gf_timer_call_cancel (this->ctx, - ctx->timer)) { - ctx->timer = NULL; - timer_cancelled = _gf_true; - } - /* If timer cancel failed here it means that the - * previous cbk will be executed which will set - * spb_choice to -1. So we can consider the - * 'valid to -1' case to be a sucess - * (i.e. ret = 0) and goto unlock. - */ - goto unlock; - } - goto reset_timer; - } else { - if (ctx->spb_choice == -1) - goto unlock; - goto set_timer; - } - -reset_timer: - ret = gf_timer_call_cancel (this->ctx, ctx->timer); - if (ret != 0) { - /* We need to bail out now instead of launching a new - * timer. Otherwise the cbk of the previous timer event - * will cancel the new ctx->timer. - */ - ctx->spb_choice = old_spb_choice; - ret = -1; - op_errno = EAGAIN; - goto unlock; - } - ctx->timer = NULL; - timer_reset = _gf_true; - -set_timer: - ctx->timer = gf_timer_call_after (this->ctx, delta, - afr_set_split_brain_choice_cbk, - inode); - if (!ctx->timer) { - ctx->spb_choice = old_spb_choice; - ret = -1; - op_errno = ENOMEM; - } - if (!timer_reset && ctx->timer) - timer_set = _gf_true; - if (timer_reset && !ctx->timer) - timer_cancelled = _gf_true; - need_invalidate = _gf_false; + goto unlock; + } + goto reset_timer; + } else { + if (ctx->spb_choice == -1) + goto unlock; + goto set_timer; } + + reset_timer: + ret = gf_timer_call_cancel(this->ctx, ctx->timer); + if (ret != 0) { + /* We need to bail out now instead of launching a new + * timer. Otherwise the cbk of the previous timer event + * will cancel the new ctx->timer. + */ + ctx->spb_choice = old_spb_choice; + ret = -1; + op_errno = EAGAIN; + goto unlock; + } + ctx->timer = NULL; + timer_reset = _gf_true; + + set_timer: + ctx->timer = gf_timer_call_after(this->ctx, delta, + afr_set_split_brain_choice_cbk, inode); + if (!ctx->timer) { + ctx->spb_choice = old_spb_choice; + ret = -1; + op_errno = ENOMEM; + } + if (!timer_reset && ctx->timer) + timer_set = _gf_true; + if (timer_reset && !ctx->timer) + timer_cancelled = _gf_true; + } unlock: - UNLOCK(&inode->lock); - if (!timer_set) - inode_unref (inode); - if (timer_cancelled) - inode_unref (inode); - /* - * We need to invalidate the inode to prevent the kernel from serving - * reads from an older cached value despite a change in spb_choice to - * a new value. - */ - if (need_invalidate) - inode_invalidate (inode); + UNLOCK(&inode->lock); +post_unlock: + if (!timer_set) + inode_unref(inode); + if (timer_cancelled) + inode_unref(inode); + /* + * We need to invalidate the inode to prevent the kernel from serving + * reads from an older cached value despite a change in spb_choice to + * a new value. + */ + inode_invalidate(inode); out: - if (data) - GF_FREE (data); - AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); - return 0; + GF_FREE(data); + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + return 0; } int -afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused, - afr_transaction_type type) +afr_accused_fill(xlator_t *this, dict_t *xdata, unsigned char *accused, + afr_transaction_type type) { - afr_private_t *priv = NULL; - int i = 0; - int idx = afr_index_for_transaction_type (type); - void *pending_raw = NULL; - int pending[3]; - int ret = 0; + afr_private_t *priv = NULL; + int i = 0; + int idx = afr_index_for_transaction_type(type); + void *pending_raw = NULL; + int pending[3]; + int ret = 0; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - ret = dict_get_ptr (xdata, priv->pending_key[i], - &pending_raw); - if (ret) /* no pending flags */ - continue; - memcpy (pending, pending_raw, sizeof(pending)); + for (i = 0; i < priv->child_count; i++) { + ret = dict_get_ptr(xdata, priv->pending_key[i], &pending_raw); + if (ret) /* no pending flags */ + continue; + memcpy(pending, pending_raw, sizeof(pending)); - if (ntoh32 (pending[idx])) - accused[i] = 1; - } + if (ntoh32(pending[idx])) + accused[i] = 1; + } - return 0; + return 0; } int -afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies, - unsigned char *data_accused) +afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies, + unsigned char *data_accused) { - int i = 0; - afr_private_t *priv = NULL; - uint64_t maxsize = 0; + int i = 0; + afr_private_t *priv = NULL; + uint64_t maxsize = 0; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (replies[i].valid && replies[i].xdata && - dict_get (replies[i].xdata, GLUSTERFS_BAD_INODE)) - continue; - if (data_accused[i]) - continue; - if (replies[i].poststat.ia_size > maxsize) - maxsize = replies[i].poststat.ia_size; - } + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].xdata && + dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE)) + continue; + if (data_accused[i]) + continue; + if (replies[i].poststat.ia_size > maxsize) + maxsize = replies[i].poststat.ia_size; + } - for (i = 0; i < priv->child_count; i++) { - if (data_accused[i]) - continue; - if (AFR_IS_ARBITER_BRICK(priv, i)) - continue; - if (replies[i].poststat.ia_size < maxsize) - data_accused[i] = 1; - } + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) + continue; + if (AFR_IS_ARBITER_BRICK(priv, i)) + continue; + if (replies[i].poststat.ia_size < maxsize) + data_accused[i] = 1; + } - return 0; + return 0; } int -afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode, - gf_boolean_t *start_heal) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - struct afr_reply *replies = NULL; - int event_generation = 0; - int i = 0; - unsigned char *data_accused = NULL; - unsigned char *metadata_accused = NULL; - unsigned char *data_readable = NULL; - unsigned char *metadata_readable = NULL; - int ret = 0; - - local = frame->local; - priv = this->private; - replies = local->replies; - event_generation = local->event_generation; - - data_accused = alloca0 (priv->child_count); - data_readable = alloca0 (priv->child_count); - metadata_accused = alloca0 (priv->child_count); - metadata_readable = alloca0 (priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - data_readable[i] = 1; - metadata_readable[i] = 1; - } - if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) { - data_readable[ARBITER_BRICK_INDEX] = 0; - metadata_readable[ARBITER_BRICK_INDEX] = 0; - } - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) { - data_readable[i] = 0; - metadata_readable[i] = 0; - continue; - } - - if (replies[i].op_ret == -1) { - data_readable[i] = 0; - metadata_readable[i] = 0; - continue; - } - - if (replies[i].xdata && - dict_get (replies[i].xdata, GLUSTERFS_BAD_INODE)) { - data_readable[i] = 0; - metadata_readable[i] = 0; - continue; - } +afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *data_accused, unsigned char *metadata_accused, + unsigned char *data_readable, + unsigned char *metadata_readable, struct afr_reply *replies) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata = NULL; + int i = 0; + int ret = 0; + ia_type_t ia_type = IA_INVAL; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + data_readable[i] = 1; + metadata_readable[i] = 1; + } + if (AFR_IS_ARBITER_BRICK(priv, ARBITER_BRICK_INDEX)) { + data_readable[ARBITER_BRICK_INDEX] = 0; + metadata_readable[ARBITER_BRICK_INDEX] = 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (replies) { /* Lookup */ + if (!replies[i].valid || replies[i].op_ret == -1 || + (replies[i].xdata && + dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))) { + data_readable[i] = 0; + metadata_readable[i] = 0; + continue; + } - afr_accused_fill (this, replies[i].xdata, data_accused, - (replies[i].poststat.ia_type == IA_IFDIR) ? - AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION); - - afr_accused_fill (this, replies[i].xdata, - metadata_accused, AFR_METADATA_TRANSACTION); - - } - - if ((inode->ia_type != IA_IFDIR) && - /* We want to accuse small files only when we know for sure that - * there is no IO happening. Otherwise, the ia_sizes obtained in - * post-refresh replies may mismatch due to a race between inode- - * refresh and ongoing writes, causing spurious heal launches*/ - !afr_is_possibly_under_txn (AFR_DATA_TRANSACTION, local, this)) - afr_accuse_smallfiles (this, replies, data_accused); - - for (i = 0; i < priv->child_count; i++) { - if (data_accused[i]) { - data_readable[i] = 0; - ret = 1; - } - if (metadata_accused[i]) { - metadata_readable[i] = 0; - ret = 1; - } - } - - for (i = 0; i < priv->child_count; i++) { - if (start_heal && priv->child_up[i] && - (data_accused[i] || metadata_accused[i])) { - *start_heal = _gf_true; - break; - } + xdata = replies[i].xdata; + ia_type = replies[i].poststat.ia_type; + } else { /* pre-op xattrop */ + xdata = local->transaction.changelog_xdata[i]; + ia_type = inode->ia_type; } - afr_inode_read_subvol_set (inode, this, data_readable, - metadata_readable, event_generation); - return ret; -} + if (!xdata) + continue; /* mkdir_cbk sends NULL xdata_rsp. */ + afr_accused_fill(this, xdata, data_accused, + (ia_type == IA_IFDIR) ? AFR_ENTRY_TRANSACTION + : AFR_DATA_TRANSACTION); + + afr_accused_fill(this, xdata, metadata_accused, + AFR_METADATA_TRANSACTION); + } + if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR && + /* We want to accuse small files only when we know for + * sure that there is no IO happening. Otherwise, the + * ia_sizes obtained in post-refresh replies may + * mismatch due to a race between inode-refresh and + * ongoing writes, causing spurious heal launches*/ + !afr_is_possibly_under_txn(AFR_DATA_TRANSACTION, local, this)) { + afr_accuse_smallfiles(this, replies, data_accused); + } + + for (i = 0; i < priv->child_count; i++) { + if (data_accused[i]) { + data_readable[i] = 0; + ret = 1; + } + if (metadata_accused[i]) { + metadata_readable[i] = 0; + ret = 1; + } + } + return ret; +} + +int +afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_boolean_t *start_heal) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int event_generation = 0; + int i = 0; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + int ret = 0; + + local = frame->local; + priv = this->private; + replies = local->replies; + event_generation = local->event_generation; + + data_accused = alloca0(priv->child_count); + data_readable = alloca0(priv->child_count); + metadata_accused = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + + ret = afr_readables_fill(frame, this, inode, data_accused, metadata_accused, + data_readable, metadata_readable, replies); + + for (i = 0; i < priv->child_count; i++) { + if (start_heal && priv->child_up[i] && + (data_accused[i] || metadata_accused[i])) { + *start_heal = _gf_true; + break; + } + } + afr_inode_read_subvol_set(inode, this, data_readable, metadata_readable, + event_generation); + return ret; +} int -afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque) +afr_refresh_selfheal_done(int ret, call_frame_t *heal, void *opaque) { - if (heal) - AFR_STACK_DESTROY (heal); - return 0; + if (heal) + AFR_STACK_DESTROY(heal); + return 0; } int -afr_inode_refresh_err (call_frame_t *frame, xlator_t *this) +afr_inode_refresh_err(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int err = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int err = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].valid && !local->replies[i].op_ret) { - err = 0; - goto ret; - } - } + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && !local->replies[i].op_ret) { + err = 0; + goto ret; + } + } - err = afr_final_errno (local, priv); + err = afr_final_errno(local, priv); ret: - return -err; + return err; } gf_boolean_t -afr_selfheal_enabled (xlator_t *this) +afr_selfheal_enabled(const xlator_t *this) { - afr_private_t *priv = NULL; - gf_boolean_t data = _gf_false; - int ret = 0; - - priv = this->private; - - ret = gf_string2boolean (priv->data_self_heal, &data); - GF_ASSERT (!ret); + const afr_private_t *priv = this->private; - return data || priv->metadata_self_heal || priv->entry_self_heal; + return priv->data_self_heal || priv->metadata_self_heal || + priv->entry_self_heal; } - int -afr_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) -{ - - call_frame_t *heal_frame = NULL; - afr_local_t *heal_local = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - inode_t *inode = NULL; - int event_generation = 0; - int read_subvol = -1; - int op_errno = ENOMEM; - int ret = 0; - - local = frame->local; - inode = local->inode; - priv = this->private; - - if (err) - goto refresh_done; - - if (local->op == GF_FOP_LOOKUP) - goto refresh_done; - - ret = afr_inode_get_readable (frame, inode, this, local->readable, - &event_generation, - local->transaction.type); - - if (ret == -EIO || (local->is_read_txn && !event_generation)) { - /* No readable subvolume even after refresh ==> splitbrain.*/ - if (!priv->fav_child_policy) { - err = -EIO; - goto refresh_done; - } - read_subvol = afr_sh_get_fav_by_policy (this, local->replies, - inode, NULL); - if (read_subvol == -1) { - err = -EIO; - goto refresh_done; - } +afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) +{ + call_frame_t *heal_frame = NULL; + afr_local_t *heal_local = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; + int event_generation = 0; + int read_subvol = -1; + int ret = 0; + + local = frame->local; + inode = local->inode; + priv = this->private; + + if (err) + goto refresh_done; + + if (local->op == GF_FOP_LOOKUP) + goto refresh_done; + + ret = afr_inode_get_readable(frame, inode, this, local->readable, + &event_generation, local->transaction.type); + + if (ret == -EIO) { + /* No readable subvolume even after refresh ==> splitbrain.*/ + if (!priv->fav_child_policy) { + err = EIO; + goto refresh_done; + } + read_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode, + NULL); + if (read_subvol == -1) { + err = EIO; + goto refresh_done; + } - heal_frame = copy_frame (frame); - if (!heal_frame) { - err = -EIO; - goto refresh_done; - } - heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD; - heal_local = AFR_FRAME_INIT (heal_frame, op_errno); - if (!heal_local) { - err = -EIO; - AFR_STACK_DESTROY (heal_frame); - goto refresh_done; - } - heal_local->xdata_req = dict_new(); - if (!heal_local->xdata_req) { - err = -EIO; - AFR_STACK_DESTROY (heal_frame); - goto refresh_done; - } - heal_local->heal_frame = frame; - ret = synctask_new (this->ctx->env, - afr_fav_child_reset_sink_xattrs, - afr_fav_child_reset_sink_xattrs_cbk, - heal_frame, - heal_frame); - return 0; + heal_frame = afr_frame_create(this, NULL); + if (!heal_frame) { + err = EIO; + goto refresh_done; } + heal_local = heal_frame->local; + heal_local->xdata_req = dict_new(); + if (!heal_local->xdata_req) { + err = EIO; + AFR_STACK_DESTROY(heal_frame); + goto refresh_done; + } + heal_local->heal_frame = frame; + ret = synctask_new(this->ctx->env, afr_fav_child_reset_sink_xattrs, + afr_fav_child_reset_sink_xattrs_cbk, heal_frame, + heal_frame); + return 0; + } refresh_done: - afr_local_replies_wipe (local, this->private); - local->refreshfn (frame, this, err); + afr_local_replies_wipe(local, this->private); + local->refreshfn(frame, this, err); - return 0; + return 0; } int -afr_inode_refresh_done (call_frame_t *frame, xlator_t *this, int error) -{ - call_frame_t *heal_frame = NULL; - afr_local_t *local = NULL; - gf_boolean_t start_heal = _gf_false; - afr_local_t *heal_local = NULL; - int op_errno = ENOMEM; - int ret = 0; - int err = 0; - - if (error != 0) { - err = error; - goto refresh_done; - } - - local = frame->local; - - ret = afr_replies_interpret (frame, this, local->refreshinode, - &start_heal); - - err = afr_inode_refresh_err (frame, this); - - if (ret && afr_selfheal_enabled (this) && start_heal) { - heal_frame = copy_frame (frame); - if (!heal_frame) - goto refresh_done; - heal_frame->root->pid = GF_CLIENT_PID_SELF_HEALD; - heal_local = AFR_FRAME_INIT (heal_frame, op_errno); - if (!heal_local) { - AFR_STACK_DESTROY (heal_frame); - goto refresh_done; - } - heal_local->refreshinode = inode_ref (local->refreshinode); - heal_local->heal_frame = heal_frame; - if (!afr_throttled_selfheal (heal_frame, this)) { - AFR_STACK_DESTROY (heal_frame); - goto refresh_done; - } +afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error) +{ + call_frame_t *heal_frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + gf_boolean_t start_heal = _gf_false; + afr_local_t *heal_local = NULL; + unsigned char *success_replies = NULL; + int ret = 0; + + if (error != 0) { + goto refresh_done; + } + + local = frame->local; + priv = this->private; + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + + if (priv->thin_arbiter_count && local->is_read_txn && + AFR_COUNT(success_replies, priv->child_count) != priv->child_count) { + /* We need to query the good bricks and/or thin-arbiter.*/ + if (success_replies[0]) { + local->read_txn_query_child = AFR_CHILD_ZERO; + } else if (success_replies[1]) { + local->read_txn_query_child = AFR_CHILD_ONE; + } + error = EINVAL; + goto refresh_done; + } + + if (!afr_has_quorum(success_replies, this, frame)) { + error = afr_final_errno(frame->local, this->private); + if (!error) + error = afr_quorum_errno(priv); + goto refresh_done; + } + + ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal); + + if (ret && afr_selfheal_enabled(this) && start_heal) { + heal_frame = afr_frame_create(this, NULL); + if (!heal_frame) + goto refresh_done; + heal_local = heal_frame->local; + heal_local->refreshinode = inode_ref(local->refreshinode); + heal_local->heal_frame = heal_frame; + if (!afr_throttled_selfheal(heal_frame, this)) { + AFR_STACK_DESTROY(heal_frame); + goto refresh_done; } + } refresh_done: - afr_txn_refresh_done (frame, this, err); + afr_txn_refresh_done(frame, this, error); - return 0; + return 0; } void -afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *buf, - dict_t *xdata, struct iatt *par) -{ - afr_local_t *local = NULL; - int call_child = (long) cookie; - int8_t need_heal = 1; - int call_count = 0; - GF_UNUSED int ret = 0; +afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + dict_t *xdata, struct iatt *par) +{ + afr_local_t *local = NULL; + int call_child = (long)cookie; + int8_t need_heal = 1; + int call_count = 0; + int ret = 0; + + local = frame->local; + local->replies[call_child].valid = 1; + local->replies[call_child].op_ret = op_ret; + local->replies[call_child].op_errno = op_errno; + if (op_ret != -1) { + local->replies[call_child].poststat = *buf; + if (par) + local->replies[call_child].postparent = *par; + if (xdata) + local->replies[call_child].xdata = dict_ref(xdata); + } - local = frame->local; - local->replies[call_child].valid = 1; - local->replies[call_child].op_ret = op_ret; - local->replies[call_child].op_errno = op_errno; - if (op_ret != -1) { - local->replies[call_child].poststat = *buf; - if (par) - local->replies[call_child].postparent = *par; - if (xdata) - local->replies[call_child].xdata = dict_ref (xdata); - } - if (xdata) { - ret = dict_get_int8 (xdata, "link-count", &need_heal); - local->replies[call_child].need_heal = need_heal; - } else { - local->replies[call_child].need_heal = need_heal; + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to get link count"); } + } - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_need_heal (this, local); - afr_inode_refresh_done (frame, this, 0); + local->replies[call_child].need_heal = need_heal; + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + ret = afr_inode_refresh_err(frame, this); + if (ret) { + gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed"); } - + afr_inode_refresh_done(frame, this, ret); + } } int -afr_inode_refresh_subvol_with_lookup_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, - int op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, - struct iatt *par) +afr_inode_refresh_subvol_with_lookup_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, + int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, + struct iatt *par) { - afr_inode_refresh_subvol_cbk (frame, cookie, this, op_ret, op_errno, - buf, xdata, par); - return 0; + afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf, + xdata, par); + return 0; } - int -afr_inode_refresh_subvol_with_lookup (call_frame_t *frame, xlator_t *this, - int i, inode_t *inode, uuid_t gfid, - dict_t *xdata) +afr_inode_refresh_subvol_with_lookup(call_frame_t *frame, xlator_t *this, int i, + inode_t *inode, uuid_t gfid, dict_t *xdata) { - loc_t loc = {0, }; - afr_private_t *priv = NULL; + loc_t loc = { + 0, + }; + afr_private_t *priv = NULL; - priv = this->private; + priv = this->private; - loc.inode = inode; - if (gf_uuid_is_null (inode->gfid) && gfid) { - /* To handle setattr/setxattr on yet to be linked inode from - * dht */ - gf_uuid_copy (loc.gfid, gfid); - } else { - gf_uuid_copy (loc.gfid, inode->gfid); - } + loc.inode = inode; + if (gf_uuid_is_null(inode->gfid) && gfid) { + /* To handle setattr/setxattr on yet to be linked inode from + * dht */ + gf_uuid_copy(loc.gfid, gfid); + } else { + gf_uuid_copy(loc.gfid, inode->gfid); + } - STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_with_lookup_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->lookup, &loc, xdata); - return 0; + STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_lookup_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &loc, xdata); + return 0; } int -afr_inode_refresh_subvol_with_fstat_cbk (call_frame_t *frame, - void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) +afr_inode_refresh_subvol_with_fstat_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, + dict_t *xdata) { - afr_inode_refresh_subvol_cbk (frame, cookie, this, op_ret, op_errno, - buf, xdata, NULL); - return 0; + afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf, + xdata, NULL); + return 0; } int -afr_inode_refresh_subvol_with_fstat (call_frame_t *frame, xlator_t *this, int i, - dict_t *xdata) +afr_inode_refresh_subvol_with_fstat(call_frame_t *frame, xlator_t *this, int i, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_with_fstat_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->fstat, local->fd, xdata); - return 0; + STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_fstat_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->fstat, local->fd, xdata); + return 0; } int -afr_inode_refresh_do (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; - int ret = 0; - dict_t *xdata = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - unsigned char *wind_subvols = NULL; - - priv = this->private; - local = frame->local; - wind_subvols = alloca0 (priv->child_count); - - afr_local_replies_wipe (local, priv); - - if (local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - afr_inode_refresh_done (frame, this, EINVAL); - return 0; - } - } +afr_inode_refresh_do(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + int ret = 0; + dict_t *xdata = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *wind_subvols = NULL; - xdata = dict_new (); - if (!xdata) { - afr_inode_refresh_done (frame, this, ENOMEM); - return 0; - } + priv = this->private; + local = frame->local; + wind_subvols = alloca0(priv->child_count); - ret = afr_xattr_req_prepare (this, xdata); - if (ret != 0) { - dict_unref (xdata); - afr_inode_refresh_done (frame, this, -ret); - return 0; - } + afr_local_replies_wipe(local, priv); - ret = dict_set_str (xdata, "link-count", GF_XATTROP_INDEX_COUNT); - if (ret) { - gf_msg_debug (this->name, -ret, - "Unable to set link-count in dict "); + if (local->fd) { + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + afr_inode_refresh_done(frame, this, EINVAL); + return 0; } + } - ret = dict_set_str (xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name); - if (ret) { - gf_msg_debug (this->name, -ret, - "Unable to set inodelk-dom-count in dict "); + xdata = dict_new(); + if (!xdata) { + afr_inode_refresh_done(frame, this, ENOMEM); + return 0; + } - } + ret = afr_xattr_req_prepare(this, xdata); + if (ret != 0) { + dict_unref(xdata); + afr_inode_refresh_done(frame, this, -ret); + return 0; + } - if (local->fd) { - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i] && - fd_ctx->opened_on[i] == AFR_FD_OPENED) - wind_subvols[i] = 1; - } - } else { - memcpy (wind_subvols, local->child_up, - sizeof (*local->child_up) * priv->child_count); - } + ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } - local->call_count = AFR_COUNT (wind_subvols, priv->child_count); + ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name); + if (ret) { + gf_msg_debug(this->name, -ret, + "Unable to set inodelk-dom-count in dict "); + } - call_count = local->call_count; - if (!call_count) { - dict_unref (xdata); - if (local->fd && AFR_COUNT(local->child_up, priv->child_count)) - afr_inode_refresh_done (frame, this, EBADFD); - else - afr_inode_refresh_done (frame, this, ENOTCONN); - return 0; + if (local->fd) { + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i] && fd_ctx->opened_on[i] == AFR_FD_OPENED) + wind_subvols[i] = 1; } - for (i = 0; i < priv->child_count; i++) { - if (!wind_subvols[i]) - continue; + } else { + memcpy(wind_subvols, local->child_up, + sizeof(*local->child_up) * priv->child_count); + } - if (local->fd) - afr_inode_refresh_subvol_with_fstat (frame, this, i, - xdata); - else - afr_inode_refresh_subvol_with_lookup (frame, this, i, - local->refreshinode, - local->refreshgfid, xdata); + local->call_count = AFR_COUNT(wind_subvols, priv->child_count); - if (!--call_count) - break; - } + call_count = local->call_count; + if (!call_count) { + dict_unref(xdata); + if (local->fd && AFR_COUNT(local->child_up, priv->child_count)) + afr_inode_refresh_done(frame, this, EBADFD); + else + afr_inode_refresh_done(frame, this, ENOTCONN); + return 0; + } + for (i = 0; i < priv->child_count; i++) { + if (!wind_subvols[i]) + continue; - dict_unref (xdata); + if (local->fd) + afr_inode_refresh_subvol_with_fstat(frame, this, i, xdata); + else + afr_inode_refresh_subvol_with_lookup( + frame, this, i, local->refreshinode, local->refreshgfid, xdata); - return 0; -} + if (!--call_count) + break; + } + + dict_unref(xdata); + return 0; +} int -afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode, - uuid_t gfid, afr_inode_refresh_cbk_t refreshfn) +afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, afr_inode_refresh_cbk_t refreshfn) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - local->refreshfn = refreshfn; + local->refreshfn = refreshfn; - if (local->refreshinode) { - inode_unref (local->refreshinode); - local->refreshinode = NULL; - } + if (local->refreshinode) { + inode_unref(local->refreshinode); + local->refreshinode = NULL; + } - local->refreshinode = inode_ref (inode); + local->refreshinode = inode_ref(inode); - if (gfid) - gf_uuid_copy (local->refreshgfid, gfid); - else - gf_uuid_clear (local->refreshgfid); + if (gfid) + gf_uuid_copy(local->refreshgfid, gfid); + else + gf_uuid_clear(local->refreshgfid); - afr_inode_refresh_do (frame, this); + afr_inode_refresh_do(frame, this); - return 0; + return 0; } - int -afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req) +afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req) { - int i = 0; - afr_private_t *priv = NULL; - int ret = 0; - - priv = this->private; + int i = 0; + afr_private_t *priv = NULL; + int ret = 0; - for (i = 0; i < priv->child_count; i++) { - ret = dict_set_uint64 (xattr_req, priv->pending_key[i], - AFR_NUM_CHANGE_LOGS * sizeof(int)); - if (ret < 0) - gf_msg (this->name, GF_LOG_WARNING, - -ret, AFR_MSG_DICT_SET_FAILED, - "Unable to set dict value for %s", - priv->pending_key[i]); - /* 3 = data+metadata+entry */ - } - ret = dict_set_uint64 (xattr_req, AFR_DIRTY, - AFR_NUM_CHANGE_LOGS * sizeof(int)); - if (ret) { - gf_msg_debug (this->name, -ret, "failed to set dirty " - "query flag"); - } + priv = this->private; - ret = dict_set_int32 (xattr_req, "list-xattr", 1); - if (ret) { - gf_msg_debug (this->name, -ret, - "Unable to set list-xattr in dict "); - } + for (i = 0; i < priv->child_count; i++) { + ret = dict_set_uint64(xattr_req, priv->pending_key[i], + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret < 0) + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Unable to set dict value for %s", priv->pending_key[i]); + /* 3 = data+metadata+entry */ + } + ret = dict_set_uint64(xattr_req, AFR_DIRTY, + AFR_NUM_CHANGE_LOGS * sizeof(int)); + if (ret) { + gf_msg_debug(this->name, -ret, + "failed to set dirty " + "query flag"); + } + + ret = dict_set_int32_sizen(xattr_req, "list-xattr", 1); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set list-xattr in dict "); + } + + return ret; +} - return ret; +int +afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this, + dict_t *xattr_req, loc_t *loc) +{ + int ret = -ENOMEM; + + if (!local->xattr_req) + local->xattr_req = dict_new(); + + if (!local->xattr_req) + goto out; + + if (xattr_req && (xattr_req != local->xattr_req)) + dict_copy(xattr_req, local->xattr_req); + + ret = afr_xattr_req_prepare(this, local->xattr_req); + + ret = dict_set_uint64(local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_INODELK_COUNT); + } + ret = dict_set_uint64(local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_ENTRYLK_COUNT); + } + + ret = dict_set_uint32(local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "%s: Unable to set dict value for %s", loc->path, + GLUSTERFS_PARENT_ENTRYLK); + } + + ret = dict_set_sizen_str_sizen(local->xattr_req, "link-count", + GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } + + ret = 0; +out: + return ret; } int -afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this, - dict_t *xattr_req, loc_t *loc) +afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable) { - int ret = -ENOMEM; - - if (!local->xattr_req) - local->xattr_req = dict_new (); - - if (!local->xattr_req) - goto out; + int i = 0; + int child = -1; + int64_t read_iter = -1; + int64_t pending_read = -1; - if (xattr_req && (xattr_req != local->xattr_req)) - dict_copy (xattr_req, local->xattr_req); + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i]) + continue; + read_iter = GF_ATOMIC_GET(priv->pending_reads[i]); + if (child == -1 || read_iter < pending_read) { + pending_read = read_iter; + child = i; + } + } - ret = afr_xattr_req_prepare (this, local->xattr_req); + return child; +} - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0); - if (ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, - -ret, AFR_MSG_DICT_SET_FAILED, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_INODELK_COUNT); - } - ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0); - if (ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, - -ret, AFR_MSG_DICT_SET_FAILED, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_ENTRYLK_COUNT); - } +static int32_t +afr_least_latency_child(afr_private_t *priv, unsigned char *readable) +{ + int32_t i = 0; + int child = -1; - ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0); - if (ret < 0) { - gf_msg (this->name, GF_LOG_WARNING, - -ret, AFR_MSG_DICT_SET_FAILED, - "%s: Unable to set dict value for %s", - loc->path, GLUSTERFS_PARENT_ENTRYLK); - } + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; - ret = dict_set_str (local->xattr_req, "link-count", - GF_XATTROP_INDEX_COUNT); - if (ret) { - gf_msg_debug (this->name, -ret, - "Unable to set link-count in dict "); + if (child == -1 || + priv->child_latency[i] < priv->child_latency[child]) { + child = i; } - - ret = 0; -out: - return ret; + } + return child; } - -int -afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode) +static int32_t +afr_least_latency_times_pending_reads_child(afr_private_t *priv, + unsigned char *readable) { - uuid_t gfid_copy = {0,}; - pid_t pid; + int32_t i = 0; + int child = -1; + int64_t pending_read = 0; + int64_t latency = -1; + int64_t least_latency = -1; - if (!hashmode) { - return -1; - } + for (i = 0; i < priv->child_count; i++) { + if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] || + priv->child_latency[i] < 0) + continue; - gf_uuid_copy (gfid_copy, args->gfid); + pending_read = GF_ATOMIC_GET(priv->pending_reads[i]); + latency = (pending_read + 1) * priv->child_latency[i]; - if ((hashmode > 1) && (args->ia_type != IA_IFDIR)) { + if (child == -1 || latency < least_latency) { + least_latency = latency; + child = i; + } + } + return child; +} + +int +afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv, + unsigned char *readable) +{ + uuid_t gfid_copy = { + 0, + }; + pid_t pid; + int child = -1; + + switch (priv->hash_mode) { + case AFR_READ_POLICY_FIRST_UP: + break; + case AFR_READ_POLICY_GFID_HASH: + gf_uuid_copy(gfid_copy, args->gfid); + child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % + priv->child_count; + break; + case AFR_READ_POLICY_GFID_PID_HASH: + if (args->ia_type != IA_IFDIR) { /* * Why getpid? Because it's one of the cheapest calls - * available - faster than gethostname etc. - and returns a - * constant-length value that's sure to be shorter than a UUID. - * It's still very unlikely to be the same across clients, so - * it still provides good mixing. We're not trying for - * perfection here. All we need is a low probability that - * multiple clients won't converge on the same subvolume. + * available - faster than gethostname etc. - and + * returns a constant-length value that's sure to be + * shorter than a UUID. It's still very unlikely to be + * the same across clients, so it still provides good + * mixing. We're not trying for perfection here. All we + * need is a low probability that multiple clients + * won't converge on the same subvolume. */ + gf_uuid_copy(gfid_copy, args->gfid); pid = getpid(); - memcpy (gfid_copy, &pid, sizeof(pid)); - } - - return SuperFastHash((char *)gfid_copy, - sizeof(gfid_copy)) % child_count; + *(pid_t *)gfid_copy ^= pid; + } + child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) % + priv->child_count; + break; + case AFR_READ_POLICY_LESS_LOAD: + child = afr_least_pending_reads_child(priv, readable); + break; + case AFR_READ_POLICY_LEAST_LATENCY: + child = afr_least_latency_child(priv, readable); + break; + case AFR_READ_POLICY_LOAD_LATENCY_HYBRID: + child = afr_least_latency_times_pending_reads_child(priv, readable); + break; + } + + return child; } - int -afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this, - unsigned char *readable, - afr_read_subvol_args_t *args) +afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this, + unsigned char *readable, + afr_read_subvol_args_t *args) { - int i = 0; - int read_subvol = -1; - afr_private_t *priv = NULL; - afr_read_subvol_args_t local_args = {0,}; + int i = 0; + int read_subvol = -1; + afr_private_t *priv = NULL; + afr_read_subvol_args_t local_args = { + 0, + }; - priv = this->private; + priv = this->private; - /* first preference - explicitly specified or local subvolume */ - if (priv->read_child >= 0 && readable[priv->read_child]) - return priv->read_child; + /* first preference - explicitly specified or local subvolume */ + if (priv->read_child >= 0 && readable[priv->read_child]) + return priv->read_child; - if (inode_is_linked (inode)) { - gf_uuid_copy (local_args.gfid, inode->gfid); - local_args.ia_type = inode->ia_type; - } else if (args) { - local_args = *args; - } + if (inode_is_linked(inode)) { + gf_uuid_copy(local_args.gfid, inode->gfid); + local_args.ia_type = inode->ia_type; + } else if (args) { + local_args = *args; + } - /* second preference - use hashed mode */ - read_subvol = afr_hash_child (&local_args, priv->child_count, - priv->hash_mode); - if (read_subvol >= 0 && readable[read_subvol]) - return read_subvol; + /* second preference - use hashed mode */ + read_subvol = afr_hash_child(&local_args, priv, readable); + if (read_subvol >= 0 && readable[read_subvol]) + return read_subvol; - for (i = 0; i < priv->child_count; i++) { - if (readable[i]) - return i; - } + for (i = 0; i < priv->child_count; i++) { + if (readable[i]) + return i; + } - /* no readable subvolumes, either split brain or all subvols down */ + /* no readable subvolumes, either split brain or all subvols down */ - return -1; + return -1; } - int -afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this, - unsigned char *readable, int *event_p, - int type) +afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this, + unsigned char *readable, int *event_p, int type) { - int ret = -1; + int ret = -1; - if (type == AFR_METADATA_TRANSACTION) - ret = afr_inode_read_subvol_get (inode, this, 0, readable, - event_p); - else - ret = afr_inode_read_subvol_get (inode, this, readable, 0, - event_p); - return ret; + if (type == AFR_METADATA_TRANSACTION) + ret = afr_inode_read_subvol_get(inode, this, 0, readable, event_p); + else + ret = afr_inode_read_subvol_get(inode, this, readable, 0, event_p); + return ret; } +void +afr_readables_intersect_get(inode_t *inode, xlator_t *this, int *event, + unsigned char *intersection) +{ + afr_private_t *priv = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + unsigned char *intersect = NULL; + + priv = this->private; + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + intersect = alloca0(priv->child_count); -int -afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p, - unsigned char *readables, - int *event_p, afr_transaction_type type, - afr_read_subvol_args_t *args) -{ - afr_private_t *priv = NULL; - unsigned char *data_readable = NULL; - unsigned char *metadata_readable = NULL; - unsigned char *readable = NULL; - unsigned char *intersection = NULL; - int subvol = -1; - int event = 0; - - priv = this->private; - - readable = alloca0 (priv->child_count); - data_readable = alloca0 (priv->child_count); - metadata_readable = alloca0 (priv->child_count); - intersection = alloca0 (priv->child_count); - - afr_inode_read_subvol_type_get (inode, this, readable, &event, type); - - afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable, - &event); - - AFR_INTERSECT (intersection, data_readable, metadata_readable, - priv->child_count); - - if (AFR_COUNT (intersection, priv->child_count) > 0) - subvol = afr_read_subvol_select_by_policy (inode, this, - intersection, args); - else - subvol = afr_read_subvol_select_by_policy (inode, this, - readable, args); - if (subvol_p) - *subvol_p = subvol; - if (event_p) - *event_p = event; - if (readables) - memcpy (readables, readable, - sizeof (*readables) * priv->child_count); - return subvol; -} + afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable, + event); + AFR_INTERSECT(intersect, data_readable, metadata_readable, + priv->child_count); + if (intersection) + memcpy(intersection, intersect, + sizeof(*intersection) * priv->child_count); +} -void -afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) +int +afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p, + unsigned char *readables, int *event_p, + afr_transaction_type type, afr_read_subvol_args_t *args) { - afr_private_t *priv = NULL; - int i = 0; + afr_private_t *priv = NULL; + unsigned char *readable = NULL; + unsigned char *intersection = NULL; + int subvol = -1; + int event = 0; - priv = this->private; + priv = this->private; - afr_matrix_cleanup (local->pending, priv->child_count); + readable = alloca0(priv->child_count); + intersection = alloca0(priv->child_count); - GF_FREE (local->internal_lock.locked_nodes); + afr_inode_read_subvol_type_get(inode, this, readable, &event, type); - for (i = 0; local->internal_lock.inodelk[i].domain; i++) { - GF_FREE (local->internal_lock.inodelk[i].locked_nodes); - } + afr_readables_intersect_get(inode, this, &event, intersection); - GF_FREE (local->internal_lock.lower_locked_nodes); + if (AFR_COUNT(intersection, priv->child_count) > 0) + subvol = afr_read_subvol_select_by_policy(inode, this, intersection, + args); + else + subvol = afr_read_subvol_select_by_policy(inode, this, readable, args); + if (subvol_p) + *subvol_p = subvol; + if (event_p) + *event_p = event; + if (readables) + memcpy(readables, readable, sizeof(*readables) * priv->child_count); + return subvol; +} - afr_entry_lockee_cleanup (&local->internal_lock); +void +afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this) +{ + afr_private_t *priv = NULL; + int i = 0; - GF_FREE (local->transaction.pre_op); + priv = this->private; - GF_FREE (local->transaction.pre_op_sources); - if (local->transaction.pre_op_xdata) { - for (i = 0; i < priv->child_count; i++) { - if (!local->transaction.pre_op_xdata[i]) - continue; - dict_unref (local->transaction.pre_op_xdata[i]); - } - GF_FREE (local->transaction.pre_op_xdata); - } + afr_matrix_cleanup(local->pending, priv->child_count); - GF_FREE (local->transaction.eager_lock); - GF_FREE (local->transaction.failed_subvols); + GF_FREE(local->internal_lock.lower_locked_nodes); - GF_FREE (local->transaction.basename); - GF_FREE (local->transaction.new_basename); + afr_lockees_cleanup(&local->internal_lock); - loc_wipe (&local->transaction.parent_loc); - loc_wipe (&local->transaction.new_parent_loc); + GF_FREE(local->transaction.pre_op); -} - -void -afr_reply_wipe (struct afr_reply *reply) -{ - if (reply->xdata) { - dict_unref (reply->xdata); - reply->xdata = NULL; + GF_FREE(local->transaction.pre_op_sources); + if (local->transaction.changelog_xdata) { + for (i = 0; i < priv->child_count; i++) { + if (!local->transaction.changelog_xdata[i]) + continue; + dict_unref(local->transaction.changelog_xdata[i]); } + GF_FREE(local->transaction.changelog_xdata); + } - if (reply->xattr) { - dict_unref (reply->xattr); - reply->xattr = NULL; - } + GF_FREE(local->transaction.failed_subvols); + + GF_FREE(local->transaction.basename); + GF_FREE(local->transaction.new_basename); + + loc_wipe(&local->transaction.parent_loc); + loc_wipe(&local->transaction.new_parent_loc); } void -afr_replies_wipe (struct afr_reply *replies, int count) +afr_reply_wipe(struct afr_reply *reply) { - int i = 0; + if (reply->xdata) { + dict_unref(reply->xdata); + reply->xdata = NULL; + } - for (i = 0; i < count; i++) { - afr_reply_wipe (&replies[i]); - } + if (reply->xattr) { + dict_unref(reply->xattr); + reply->xattr = NULL; + } } void -afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv) +afr_replies_wipe(struct afr_reply *replies, int count) { + int i = 0; - if (!local->replies) - return; - - afr_replies_wipe (local->replies, priv->child_count); - - memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); + for (i = 0; i < count; i++) { + afr_reply_wipe(&replies[i]); + } } void -afr_remove_eager_lock_stub (afr_local_t *local) +afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv) { - LOCK (&local->fd->lock); - { - list_del_init (&local->transaction.eager_locked); - } - UNLOCK (&local->fd->lock); + if (!local->replies) + return; + + afr_replies_wipe(local->replies, priv->child_count); + + memset(local->replies, 0, sizeof(*local->replies) * priv->child_count); } static gf_boolean_t -afr_fop_lock_is_unlock (call_frame_t *frame) +afr_fop_lock_is_unlock(call_frame_t *frame) { - afr_local_t *local = frame->local; - switch (local->op) { + afr_local_t *local = frame->local; + switch (local->op) { case GF_FOP_INODELK: case GF_FOP_FINODELK: - if ((F_UNLCK == local->cont.inodelk.in_flock.l_type) && - (local->cont.inodelk.in_cmd == F_SETLKW || - local->cont.inodelk.in_cmd == F_SETLK)) - return _gf_true; - break; + if ((F_UNLCK == local->cont.inodelk.in_flock.l_type) && + (local->cont.inodelk.in_cmd == F_SETLKW || + local->cont.inodelk.in_cmd == F_SETLK)) + return _gf_true; + break; case GF_FOP_ENTRYLK: case GF_FOP_FENTRYLK: - if (ENTRYLK_UNLOCK == local->cont.entrylk.in_cmd) - return _gf_true; - break; + if (ENTRYLK_UNLOCK == local->cont.entrylk.in_cmd) + return _gf_true; + break; default: - return _gf_false; - } - return _gf_false; + return _gf_false; + } + return _gf_false; } static gf_boolean_t -afr_lk_is_unlock (int32_t cmd, struct gf_flock *flock) +afr_lk_is_unlock(int32_t cmd, struct gf_flock *flock) { - switch (cmd) { + switch (cmd) { case F_RESLK_UNLCK: - return _gf_true; - break; + return _gf_true; + break; #if F_SETLKW != F_SETLKW64 case F_SETLKW64: @@ -1837,514 +2548,573 @@ afr_lk_is_unlock (int32_t cmd, struct gf_flock *flock) case F_SETLK64: #endif case F_SETLK: - if (F_UNLCK == flock->l_type) - return _gf_true; - break; + if (F_UNLCK == flock->l_type) + return _gf_true; + break; default: - return _gf_false; - } - return _gf_false; + return _gf_false; + } + return _gf_false; } void -afr_handle_inconsistent_fop (call_frame_t *frame, int32_t *op_ret, - int32_t *op_errno) +afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret, + int32_t *op_errno) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - if (!frame || !frame->this || !frame->local || !frame->this->private) - return; + if (!frame || !frame->this || !frame->local || !frame->this->private) + return; - if (*op_ret < 0) - return; + if (*op_ret < 0) + return; - /* Failing inodelk/entrylk/lk here is not a good idea because we - * need to cleanup the locks on the other bricks if we choose to fail - * the fop here. The brick may go down just after unwind happens as well - * so anyways the fop will fail when the next fop is sent so leaving - * it like this for now.*/ - local = frame->local; - switch (local->op) { + /* Failing inodelk/entrylk/lk here is not a good idea because we + * need to cleanup the locks on the other bricks if we choose to fail + * the fop here. The brick may go down just after unwind happens as well + * so anyways the fop will fail when the next fop is sent so leaving + * it like this for now.*/ + local = frame->local; + switch (local->op) { case GF_FOP_LOOKUP: case GF_FOP_INODELK: case GF_FOP_FINODELK: case GF_FOP_ENTRYLK: case GF_FOP_FENTRYLK: case GF_FOP_LK: - return; + return; default: - break; - } + break; + } - priv = frame->this->private; - if (!priv->consistent_io) - return; + priv = frame->this->private; + if (!priv->consistent_io) + return; - if (local->event_generation && - (local->event_generation != priv->event_generation)) - goto inconsistent; + if (local->event_generation && + (local->event_generation != priv->event_generation)) + goto inconsistent; - return; + return; inconsistent: - *op_ret = -1; - *op_errno = ENOTCONN; + *op_ret = -1; + *op_errno = ENOTCONN; } void -afr_local_cleanup (afr_local_t *local, xlator_t *this) +afr_local_cleanup(afr_local_t *local, xlator_t *this) { - afr_private_t * priv = NULL; + afr_private_t *priv = NULL; - if (!local) - return; - - syncbarrier_destroy (&local->barrier); - - if (local->transaction.eager_lock_on && - !list_empty (&local->transaction.eager_locked)) - afr_remove_eager_lock_stub (local); - - afr_local_transaction_cleanup (local, this); - - priv = this->private; - - loc_wipe (&local->loc); - loc_wipe (&local->newloc); - - if (local->fd) - fd_unref (local->fd); + if (!local) + return; - if (local->xattr_req) - dict_unref (local->xattr_req); + syncbarrier_destroy(&local->barrier); - if (local->xattr_rsp) - dict_unref (local->xattr_rsp); + afr_local_transaction_cleanup(local, this); - if (local->dict) - dict_unref (local->dict); + priv = this->private; - afr_local_replies_wipe (local, priv); - GF_FREE(local->replies); + loc_wipe(&local->loc); + loc_wipe(&local->newloc); - GF_FREE (local->child_up); + if (local->fd) + fd_unref(local->fd); - GF_FREE (local->read_attempted); + if (local->xattr_req) + dict_unref(local->xattr_req); - GF_FREE (local->readable); - GF_FREE (local->readable2); + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); - if (local->inode) - inode_unref (local->inode); + if (local->dict) + dict_unref(local->dict); - if (local->parent) - inode_unref (local->parent); + afr_local_replies_wipe(local, priv); + GF_FREE(local->replies); - if (local->parent2) - inode_unref (local->parent2); + GF_FREE(local->child_up); - if (local->refreshinode) - inode_unref (local->refreshinode); + GF_FREE(local->read_attempted); + + GF_FREE(local->readable); + GF_FREE(local->readable2); + + if (local->inode) + inode_unref(local->inode); - { /* getxattr */ - GF_FREE (local->cont.getxattr.name); - } + if (local->parent) + inode_unref(local->parent); - { /* lk */ - GF_FREE (local->cont.lk.locked_nodes); - } + if (local->parent2) + inode_unref(local->parent2); - { /* create */ - if (local->cont.create.fd) - fd_unref (local->cont.create.fd); - if (local->cont.create.params) - dict_unref (local->cont.create.params); - } + if (local->refreshinode) + inode_unref(local->refreshinode); - { /* mknod */ - if (local->cont.mknod.params) - dict_unref (local->cont.mknod.params); - } + { /* getxattr */ + GF_FREE(local->cont.getxattr.name); + } - { /* mkdir */ - if (local->cont.mkdir.params) - dict_unref (local->cont.mkdir.params); - } + { /* lk */ + GF_FREE(local->cont.lk.locked_nodes); + GF_FREE(local->cont.lk.dom_locked_nodes); + GF_FREE(local->cont.lk.dom_lock_op_ret); + GF_FREE(local->cont.lk.dom_lock_op_errno); + } - { /* symlink */ - if (local->cont.symlink.params) - dict_unref (local->cont.symlink.params); - } + { /* create */ + if (local->cont.create.fd) + fd_unref(local->cont.create.fd); + if (local->cont.create.params) + dict_unref(local->cont.create.params); + } - { /* writev */ - GF_FREE (local->cont.writev.vector); - if (local->cont.writev.iobref) - iobref_unref (local->cont.writev.iobref); - } + { /* mknod */ + if (local->cont.mknod.params) + dict_unref(local->cont.mknod.params); + } - { /* setxattr */ - if (local->cont.setxattr.dict) - dict_unref (local->cont.setxattr.dict); - } + { /* mkdir */ + if (local->cont.mkdir.params) + dict_unref(local->cont.mkdir.params); + } - { /* fsetxattr */ - if (local->cont.fsetxattr.dict) - dict_unref (local->cont.fsetxattr.dict); - } + { /* symlink */ + if (local->cont.symlink.params) + dict_unref(local->cont.symlink.params); + } - { /* removexattr */ - GF_FREE (local->cont.removexattr.name); - } - { /* xattrop */ - if (local->cont.xattrop.xattr) - dict_unref (local->cont.xattrop.xattr); - } - { /* symlink */ - GF_FREE (local->cont.symlink.linkpath); - } + { /* writev */ + GF_FREE(local->cont.writev.vector); + if (local->cont.writev.iobref) + iobref_unref(local->cont.writev.iobref); + } - { /* opendir */ - GF_FREE (local->cont.opendir.checksum); - } + { /* setxattr */ + if (local->cont.setxattr.dict) + dict_unref(local->cont.setxattr.dict); + } - { /* open */ - if (local->cont.open.fd) - fd_unref (local->cont.open.fd); - } + { /* fsetxattr */ + if (local->cont.fsetxattr.dict) + dict_unref(local->cont.fsetxattr.dict); + } - { /* readdirp */ - if (local->cont.readdir.dict) - dict_unref (local->cont.readdir.dict); - } + { /* removexattr */ + GF_FREE(local->cont.removexattr.name); + } + { /* xattrop */ + if (local->cont.xattrop.xattr) + dict_unref(local->cont.xattrop.xattr); + } + { /* symlink */ + GF_FREE(local->cont.symlink.linkpath); + } - { /* inodelk */ - GF_FREE (local->cont.inodelk.volume); - if (local->cont.inodelk.xdata) - dict_unref (local->cont.inodelk.xdata); - } - - { /* entrylk */ - GF_FREE (local->cont.entrylk.volume); - GF_FREE (local->cont.entrylk.basename); - if (local->cont.entrylk.xdata) - dict_unref (local->cont.entrylk.xdata); - } - - if (local->xdata_req) - dict_unref (local->xdata_req); - - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); + { /* opendir */ + GF_FREE(local->cont.opendir.checksum); + } + + { /* open */ + if (local->cont.open.fd) + fd_unref(local->cont.open.fd); + } + + { /* readdirp */ + if (local->cont.readdir.dict) + dict_unref(local->cont.readdir.dict); + } + + { /* inodelk */ + GF_FREE(local->cont.inodelk.volume); + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + } + + { /* entrylk */ + GF_FREE(local->cont.entrylk.volume); + GF_FREE(local->cont.entrylk.basename); + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + } + + if (local->xdata_req) + dict_unref(local->xdata_req); + + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); } - int -afr_frame_return (call_frame_t *frame) +afr_frame_return(call_frame_t *frame) { - afr_local_t *local = NULL; - int call_count = 0; + afr_local_t *local = NULL; + int call_count = 0; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - call_count = --local->call_count; - } - UNLOCK (&frame->lock); + LOCK(&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - return call_count; + return call_count; } -static char *afr_ignore_xattrs[] = { - GF_SELINUX_XATTR_KEY, - QUOTA_SIZE_KEY, - NULL -}; +static char *afr_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL}; gf_boolean_t -afr_is_xattr_ignorable (char *key) +afr_is_xattr_ignorable(char *key) { - int i = 0; + int i = 0; - if (!strncmp (key, AFR_XATTR_PREFIX, strlen(AFR_XATTR_PREFIX))) - return _gf_true; - for (i = 0; afr_ignore_xattrs[i]; i++) { - if (!strcmp (key, afr_ignore_xattrs[i])) - return _gf_true; - } - return _gf_false; + if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) + return _gf_true; + for (i = 0; afr_ignore_xattrs[i]; i++) { + if (!strcmp(key, afr_ignore_xattrs[i])) + return _gf_true; + } + return _gf_false; } static gf_boolean_t -afr_xattr_match_needed (dict_t *this, char *key1, data_t *value1, void *data) +afr_xattr_match_needed(dict_t *this, char *key1, data_t *value1, void *data) { - /* Ignore all non-disk (i.e. virtual) xattrs right away. */ - if (!gf_is_valid_xattr_namespace (key1)) - return _gf_false; + /* Ignore all non-disk (i.e. virtual) xattrs right away. */ + if (!gf_is_valid_xattr_namespace(key1)) + return _gf_false; - /* Ignore on-disk xattrs that AFR doesn't need to heal. */ - if (!afr_is_xattr_ignorable (key1)) - return _gf_true; + /* Ignore on-disk xattrs that AFR doesn't need to heal. */ + if (!afr_is_xattr_ignorable(key1)) + return _gf_true; - return _gf_false; + return _gf_false; } gf_boolean_t -afr_xattrs_are_equal (dict_t *dict1, dict_t *dict2) +afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2) { - return are_dicts_equal (dict1, dict2, afr_xattr_match_needed, NULL); + return are_dicts_equal(dict1, dict2, afr_xattr_match_needed, NULL); } static int -afr_get_parent_read_subvol (xlator_t *this, inode_t *parent, - struct afr_reply *replies, unsigned char *readable) +afr_get_parent_read_subvol(xlator_t *this, inode_t *parent, + struct afr_reply *replies, unsigned char *readable) { - int i = 0; - int par_read_subvol = -1; - int par_read_subvol_iter = -1; - afr_private_t *priv = NULL; + int i = 0; + int par_read_subvol = -1; + int par_read_subvol_iter = -1; + afr_private_t *priv = NULL; - priv = this->private; - - if (parent) - par_read_subvol = afr_data_subvol_get (parent, this, NULL, NULL, - NULL, NULL); - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; + priv = this->private; - if (replies[i].op_ret < 0) - continue; + if (parent) + par_read_subvol = afr_data_subvol_get(parent, this, NULL, NULL, NULL, + NULL); - if (par_read_subvol_iter == -1) { - par_read_subvol_iter = i; - continue; - } + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - if ((par_read_subvol_iter != par_read_subvol) && readable[i]) - par_read_subvol_iter = i; + if (replies[i].op_ret < 0) + continue; - if (i == par_read_subvol) - par_read_subvol_iter = i; + if (par_read_subvol_iter == -1) { + par_read_subvol_iter = i; + continue; } - /* At the end of the for-loop, the only reason why @par_read_subvol_iter - * could be -1 is when this LOOKUP has failed on all sub-volumes. - * So it is okay to send an arbitrary subvolume (0 in this case) - * as parent read subvol. - */ - if (par_read_subvol_iter == -1) - par_read_subvol_iter = 0; - return par_read_subvol_iter; + if ((par_read_subvol_iter != par_read_subvol) && readable[i]) + par_read_subvol_iter = i; + + if (i == par_read_subvol) + par_read_subvol_iter = i; + } + /* At the end of the for-loop, the only reason why @par_read_subvol_iter + * could be -1 is when this LOOKUP has failed on all sub-volumes. + * So it is okay to send an arbitrary subvolume (0 in this case) + * as parent read subvol. + */ + if (par_read_subvol_iter == -1) + par_read_subvol_iter = 0; + return par_read_subvol_iter; } int -afr_read_subvol_decide (inode_t *inode, xlator_t *this, - afr_read_subvol_args_t *args) +afr_read_subvol_decide(inode_t *inode, xlator_t *this, + afr_read_subvol_args_t *args, unsigned char *readable) { - int data_subvol = -1; - int mdata_subvol = -1; + int event = 0; + afr_private_t *priv = NULL; + unsigned char *intersection = NULL; - data_subvol = afr_data_subvol_get (inode, this, NULL, NULL, NULL, args); - mdata_subvol = afr_metadata_subvol_get (inode, this, - NULL, NULL, NULL, args); - if (data_subvol == -1 || mdata_subvol == -1) - return -1; + priv = this->private; + intersection = alloca0(priv->child_count); + + afr_readables_intersect_get(inode, this, &event, intersection); + + if (AFR_COUNT(intersection, priv->child_count) <= 0) { + /* TODO: If we have one brick with valid data_readable and + * another with metadata_readable, try to send an iatt with + * valid bits from both.*/ + return -1; + } - return data_subvol; + memcpy(readable, intersection, sizeof(*readable) * priv->child_count); + + return afr_read_subvol_select_by_policy(inode, this, intersection, args); } static inline int -afr_first_up_child (call_frame_t *frame, xlator_t *this) +afr_first_up_child(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - for (i = 0; i < priv->child_count; i++) - if (local->replies[i].valid && - local->replies[i].op_ret == 0) - return i; - return 0; + for (i = 0; i < priv->child_count; i++) + if (local->replies[i].valid && local->replies[i].op_ret == 0) + return i; + return -1; } static void -afr_lookup_done (call_frame_t *frame, xlator_t *this) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = -1; - int op_errno = 0; - int read_subvol = 0; - int par_read_subvol = 0; - int ret = -1; - unsigned char *readable = NULL; - int event = 0; - struct afr_reply *replies = NULL; - uuid_t read_gfid = {0, }; - gf_boolean_t locked_entry = _gf_false; - gf_boolean_t can_interpret = _gf_true; - inode_t *parent = NULL; - int spb_choice = -1; - ia_type_t ia_type = IA_INVAL; - afr_read_subvol_args_t args = {0,}; - char *gfid_heal_msg = NULL; - - priv = this->private; - local = frame->local; - replies = local->replies; - parent = local->loc.parent; - - locked_entry = afr_is_possibly_under_txn (AFR_ENTRY_TRANSACTION, local, - this); - - readable = alloca0 (priv->child_count); - - afr_inode_read_subvol_get (parent, this, readable, NULL, &event); - - afr_inode_split_brain_choice_get (local->inode, this, - &spb_choice); - /* First, check if we have a gfid-change from somewhere, - If so, propagate that so that a fresh lookup can be - issued - */ - if (local->cont.lookup.needs_fresh_lookup) { - local->op_ret = -1; - local->op_errno = ESTALE; - goto unwind; - } - - op_errno = afr_final_errno (frame->local, this->private); - local->op_errno = op_errno; - - read_subvol = -1; - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; - - if (locked_entry && replies[i].op_ret == -1 && - replies[i].op_errno == ENOENT) { - /* Second, check entry is still - "underway" in creation */ - local->op_ret = -1; - local->op_errno = ENOENT; - goto unwind; - } - - if (replies[i].op_ret == -1) - continue; - - if (read_subvol == -1 || !readable[read_subvol]) { - read_subvol = i; - gf_uuid_copy (read_gfid, replies[i].poststat.ia_gfid); - ia_type = replies[i].poststat.ia_type; - local->op_ret = 0; - } - } - - if (read_subvol == -1) - goto unwind; - /* We now have a read_subvol, which is readable[] (if there - were any). Next we look for GFID mismatches. We don't - consider a GFID mismatch as an error if read_subvol is - readable[] but the mismatching GFID subvol is not. - */ - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid || replies[i].op_ret == -1) { - if (priv->child_up[i]) - can_interpret = _gf_false; - continue; - } - - if (!gf_uuid_compare (replies[i].poststat.ia_gfid, read_gfid)) - continue; - - can_interpret = _gf_false; - - if (locked_entry) - continue; - - /* Now GFIDs mismatch. It's OK as long as this subvol - is not readable[] but read_subvol is */ - if (readable[read_subvol] && !readable[i]) - continue; - - /* LOG ERROR */ - local->op_ret = -1; - local->op_errno = EIO; - goto unwind; - } - - /* Forth, for the finalized GFID, pick the best subvolume - to return stats from. - */ - if (can_interpret) { - /* It is safe to call afr_replies_interpret() because we have - a response from all the UP subvolumes and all of them resolved - to the same GFID - */ - gf_uuid_copy (args.gfid, read_gfid); - args.ia_type = ia_type; - if (afr_replies_interpret (frame, this, local->inode, NULL)) { - read_subvol = afr_read_subvol_decide (local->inode, - this, &args); - afr_inode_event_gen_reset (local->inode, this); - goto cant_interpret; - } else { - read_subvol = afr_data_subvol_get (local->inode, this, - NULL, NULL, NULL, &args); - } - } else { - cant_interpret: - if (read_subvol == -1) { - if (spb_choice >= 0) - read_subvol = spb_choice; - else - read_subvol = afr_first_up_child (frame, this); - } - dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY); - } +afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this, + unsigned char *success_replies, + unsigned char *data_readable, int *read_subvol) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int spb_subvol = -1; + int child_count = -1; - afr_handle_quota_size (frame, this); + if (*read_subvol != -1) + return; -unwind: - afr_set_need_heal (this, local); - if (read_subvol == -1) { - if (spb_choice >= 0) - read_subvol = spb_choice; - else - read_subvol = afr_first_up_child (frame, this); + priv = this->private; + local = frame->local; + child_count = priv->child_count; + + afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol); + if ((spb_subvol >= 0) && + (AFR_COUNT(success_replies, child_count) == child_count)) { + *read_subvol = spb_subvol; + } else if (!priv->quorum_count || + frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) { + *read_subvol = afr_first_up_child(frame, this); + } else if (priv->quorum_count && + afr_has_quorum(data_readable, this, NULL)) { + /* read_subvol is guaranteed to be valid if we hit this path. */ + *read_subvol = afr_first_up_child(frame, this); + } else { + /* If quorum is enabled and we do not have a + readable yet, it means all good copies are down. + */ + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR, + "no read " + "subvols for %s", + local->loc.path); + } + if (*read_subvol >= 0) + dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY); +} +static void +afr_lookup_done(call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = -1; + int op_errno = 0; + int read_subvol = 0; + int par_read_subvol = 0; + int ret = -1; + unsigned char *readable = NULL; + unsigned char *success_replies = NULL; + int event = 0; + struct afr_reply *replies = NULL; + uuid_t read_gfid = { + 0, + }; + gf_boolean_t locked_entry = _gf_false; + gf_boolean_t in_flight_create = _gf_false; + gf_boolean_t can_interpret = _gf_true; + inode_t *parent = NULL; + ia_type_t ia_type = IA_INVAL; + afr_read_subvol_args_t args = { + 0, + }; + char *gfid_heal_msg = NULL; + + priv = this->private; + local = frame->local; + replies = local->replies; + parent = local->loc.parent; + + locked_entry = afr_is_possibly_under_txn(AFR_ENTRY_TRANSACTION, local, + this); + + readable = alloca0(priv->child_count); + success_replies = alloca0(priv->child_count); + + afr_inode_read_subvol_get(parent, this, readable, NULL, &event); + par_read_subvol = afr_get_parent_read_subvol(this, parent, replies, + readable); + + /* First, check if we have a gfid-change from somewhere, + If so, propagate that so that a fresh lookup can be + issued + */ + if (local->cont.lookup.needs_fresh_lookup) { + local->op_ret = -1; + local->op_errno = ESTALE; + goto error; + } + + op_errno = afr_final_errno(frame->local, this->private); + local->op_errno = op_errno; + + read_subvol = -1; + afr_fill_success_replies(local, priv, success_replies); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == -1) { + if (locked_entry && replies[i].op_errno == ENOENT) { + in_flight_create = _gf_true; + } + continue; } - par_read_subvol = afr_get_parent_read_subvol (this, parent, replies, - readable); - if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) { - local->op_ret = -1; - local->op_errno = ENOTCONN; + + if (read_subvol == -1 || !readable[read_subvol]) { + read_subvol = i; + gf_uuid_copy(read_gfid, replies[i].poststat.ia_gfid); + ia_type = replies[i].poststat.ia_type; + local->op_ret = 0; } + } - ret = dict_get_str (local->xattr_req, "gfid-heal-msg", &gfid_heal_msg); - if (!ret) { - ret = dict_set_str (local->replies[read_subvol].xdata, - "gfid-heal-msg", gfid_heal_msg); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_DICT_SET_FAILED, - "Error setting gfid-heal-msg dict"); - local->op_ret = -1; - local->op_errno = ENOMEM; - } + if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) { + local->op_ret = -1; + local->op_errno = ENOENT; + goto error; + } + + if (read_subvol == -1) + goto error; + /* We now have a read_subvol, which is readable[] (if there + were any). Next we look for GFID mismatches. We don't + consider a GFID mismatch as an error if read_subvol is + readable[] but the mismatching GFID subvol is not. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { + continue; + } + + if (!gf_uuid_compare(replies[i].poststat.ia_gfid, read_gfid)) + continue; + + can_interpret = _gf_false; + + if (locked_entry) + continue; + + /* Now GFIDs mismatch. It's OK as long as this subvol + is not readable[] but read_subvol is */ + if (readable[read_subvol] && !readable[i]) + continue; + + /* If we were called from glfsheal and there is still a gfid + * mismatch, succeed the lookup and let glfsheal print the + * response via gfid-heal-msg.*/ + if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", + &gfid_heal_msg)) + goto cant_interpret; + + /* LOG ERROR */ + local->op_ret = -1; + local->op_errno = EIO; + goto error; + } + + /* Forth, for the finalized GFID, pick the best subvolume + to return stats from. + */ + read_subvol = -1; + memset(readable, 0, sizeof(*readable) * priv->child_count); + if (can_interpret) { + if (!afr_has_quorum(success_replies, this, NULL)) + goto cant_interpret; + /* It is safe to call afr_replies_interpret() because we have + a response from all the UP subvolumes and all of them resolved + to the same GFID + */ + gf_uuid_copy(args.gfid, read_gfid); + args.ia_type = ia_type; + ret = afr_replies_interpret(frame, this, local->inode, NULL); + read_subvol = afr_read_subvol_decide(local->inode, this, &args, + readable); + if (read_subvol == -1) + goto cant_interpret; + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY); + } + } else { + cant_interpret: + afr_attempt_readsubvol_set(frame, this, success_replies, readable, + &read_subvol); + if (read_subvol == -1) { + goto error; } + } - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->replies[read_subvol].poststat, - local->replies[read_subvol].xdata, - &local->replies[par_read_subvol].postparent); + afr_handle_quota_size(frame, this); + + afr_set_need_heal(this, local); + if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg_debug(this->name, 0, + "Arbiter cannot be a read subvol " + "for %s", + local->loc.path); + goto error; + } + + ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg); + if (!ret) { + ret = dict_set_str_sizen(local->replies[read_subvol].xdata, + "gfid-heal-msg", gfid_heal_msg); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "Error setting gfid-heal-msg dict"); + local->op_ret = -1; + local->op_errno = ENOMEM; + } + } + + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[par_read_subvol].postparent); + return; + +error: + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL, NULL); } /* @@ -2352,678 +3122,845 @@ unwind: * others in that they must be given higher priority while * returning to the user. * - * The hierarchy is ENODATA > ENOENT > ESTALE > others + * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others */ int -afr_higher_errno (int32_t old_errno, int32_t new_errno) +afr_higher_errno(int32_t old_errno, int32_t new_errno) { - if (old_errno == ENODATA || new_errno == ENODATA) - return ENODATA; - if (old_errno == ENOENT || new_errno == ENOENT) - return ENOENT; - if (old_errno == ESTALE || new_errno == ESTALE) - return ESTALE; + if (old_errno == ENODATA || new_errno == ENODATA) + return ENODATA; + if (old_errno == ENOENT || new_errno == ENOENT) + return ENOENT; + if (old_errno == ESTALE || new_errno == ESTALE) + return ESTALE; + if (old_errno == ENOSPC || new_errno == ENOSPC) + return ENOSPC; - return new_errno; + return new_errno; } - int -afr_final_errno (afr_local_t *local, afr_private_t *priv) +afr_final_errno(afr_local_t *local, afr_private_t *priv) { - int i = 0; - int op_errno = 0; - int tmp_errno = 0; + int i = 0; + int op_errno = 0; + int tmp_errno = 0; - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret >= 0) - continue; - tmp_errno = local->replies[i].op_errno; - op_errno = afr_higher_errno (op_errno, tmp_errno); - } + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret >= 0) + continue; + tmp_errno = local->replies[i].op_errno; + op_errno = afr_higher_errno(op_errno, tmp_errno); + } - return op_errno; + return op_errno; } static int32_t -afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *dict, - dict_t *xdata) -{ - int ret = 0; - char *pathinfo = NULL; - gf_boolean_t is_local = _gf_false; - afr_private_t *priv = NULL; - int32_t child_index = -1; - - if (op_ret != 0) { - goto out; - } - - priv = this->private; - child_index = (int32_t)(long)cookie; - - ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo); - if (ret != 0) { - goto out; - } - - ret = glusterfs_is_local_pathinfo (pathinfo, &is_local); - if (ret) { - goto out; - } - - /* - * Note that one local subvolume will override another here. The only - * way to avoid that would be to retain extra information about whether - * the previous read_child is local, and it's just not worth it. Even - * the slowest local subvolume is far preferable to a remote one. - */ - if (is_local) { - priv->local[child_index] = 1; - /* Don't set arbiter as read child. */ - if (AFR_IS_ARBITER_BRICK(priv, child_index)) - goto out; - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_LOCAL_CHILD, "selecting local read_child %s", - priv->children[child_index]->name); - - priv->read_child = child_index; - } +afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) +{ + int ret = 0; + char *pathinfo = NULL; + gf_boolean_t is_local = _gf_false; + afr_private_t *priv = NULL; + int32_t child_index = -1; + + if (op_ret != 0) { + goto out; + } + + priv = this->private; + child_index = (int32_t)(long)cookie; + + ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo); + if (ret != 0) { + goto out; + } + + ret = glusterfs_is_local_pathinfo(pathinfo, &is_local); + if (ret) { + goto out; + } + + /* + * Note that one local subvolume will override another here. The only + * way to avoid that would be to retain extra information about whether + * the previous read_child is local, and it's just not worth it. Even + * the slowest local subvolume is far preferable to a remote one. + */ + if (is_local) { + priv->local[child_index] = 1; + /* Don't set arbiter as read child. */ + if (AFR_IS_ARBITER_BRICK(priv, child_index)) + goto out; + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_LOCAL_CHILD, + "selecting local read_child %s", + priv->children[child_index]->name); + + priv->read_child = child_index; + } out: - STACK_DESTROY(frame->root); - return 0; + STACK_DESTROY(frame->root); + return 0; } static void -afr_attempt_local_discovery (xlator_t *this, int32_t child_index) +afr_attempt_local_discovery(xlator_t *this, int32_t child_index) { - call_frame_t *newframe = NULL; - loc_t tmploc = {0,}; - afr_private_t *priv = this->private; + call_frame_t *newframe = NULL; + loc_t tmploc = { + 0, + }; + afr_private_t *priv = this->private; - newframe = create_frame(this,this->ctx->pool); - if (!newframe) { - return; - } + newframe = create_frame(this, this->ctx->pool); + if (!newframe) { + return; + } - tmploc.gfid[sizeof(tmploc.gfid)-1] = 1; - STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk, - (void *)(long)child_index, - priv->children[child_index], - priv->children[child_index]->fops->getxattr, - &tmploc, GF_XATTR_PATHINFO_KEY, NULL); + tmploc.gfid[sizeof(tmploc.gfid) - 1] = 1; + STACK_WIND_COOKIE(newframe, afr_local_discovery_cbk, + (void *)(long)child_index, priv->children[child_index], + priv->children[child_index]->fops->getxattr, &tmploc, + GF_XATTR_PATHINFO_KEY, NULL); } int -afr_lookup_sh_metadata_wrap (void *opaque) -{ - call_frame_t *frame = opaque; - afr_local_t *local = NULL; - xlator_t *this = NULL; - inode_t *inode = NULL; - afr_private_t *priv = NULL; - struct afr_reply *replies = NULL; - int i= 0, first = -1; - int ret = -1; - dict_t *dict = NULL; +afr_lookup_sh_metadata_wrap(void *opaque) +{ + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0, first = -1; + int ret = -1; + dict_t *dict = NULL; + + local = frame->local; + this = frame->this; + priv = this->private; + replies = local->replies; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + first = i; + break; + } + if (first == -1) + goto out; + + if (afr_selfheal_metadata_by_stbuf(this, &replies[first].poststat)) + goto out; + + afr_local_replies_wipe(local, this->private); + + dict = dict_new(); + if (!dict) + goto out; + if (local->xattr_req) { + dict_copy(local->xattr_req, dict); + } + + ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT); + if (ret) { + gf_msg_debug(this->name, -ret, "Unable to set link-count in dict "); + } + + if (loc_is_nameless(&local->loc)) { + ret = afr_selfheal_unlocked_discover_on(frame, local->inode, + local->loc.gfid, local->replies, + local->child_up, dict); + } else { + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up, dict); + } + if (inode) + inode_unref(inode); +out: + if (loc_is_nameless(&local->loc)) + afr_discover_done(frame, this); + else + afr_lookup_done(frame, this); - local = frame->local; - this = frame->this; - priv = this->private; - replies = local->replies; - - for (i =0; i < priv->child_count; i++) { - if(!replies[i].valid || replies[i].op_ret == -1) - continue; - first = i; - break; - } - if (first == -1) - goto out; + if (dict) + dict_unref(dict); - if (afr_selfheal_metadata_by_stbuf (this, &replies[first].poststat)) - goto out; + return 0; +} - afr_local_replies_wipe (local, this->private); +gf_boolean_t +afr_is_pending_set(xlator_t *this, dict_t *xdata, int type) +{ + int idx = -1; + afr_private_t *priv = NULL; + void *pending_raw = NULL; + int *pending_int = NULL; + int i = 0; - dict = dict_new (); - if (!dict) - goto out; - ret = dict_set_str (dict, "link-count", GF_XATTROP_INDEX_COUNT); - if (ret) { - gf_msg_debug (this->name, -ret, - "Unable to set link-count in dict "); + priv = this->private; + idx = afr_index_for_transaction_type(type); + + if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw) == 0) { + if (pending_raw) { + pending_int = pending_raw; + + if (ntoh32(pending_int[idx])) + return _gf_true; } + } - if (loc_is_nameless (&local->loc)) { - ret = afr_selfheal_unlocked_discover_on (frame, local->inode, - local->loc.gfid, - local->replies, - local->child_up); - } else { - inode = afr_selfheal_unlocked_lookup_on (frame, - local->loc.parent, - local->loc.name, - local->replies, - local->child_up, dict); - } - if (inode) - inode_unref (inode); -out: - if (loc_is_nameless (&local->loc)) - afr_discover_done (frame, this); - else - afr_lookup_done (frame, this); + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; + if (!pending_raw) + continue; + pending_int = pending_raw; - if (dict) - dict_unref (dict); + if (ntoh32(pending_int[idx])) + return _gf_true; + } - return 0; + return _gf_false; } static gf_boolean_t afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - struct afr_reply *replies = NULL; - int i = 0, first = -1; - gf_boolean_t start = _gf_false; - struct iatt stbuf = {0, }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0, first = -1; + gf_boolean_t start = _gf_false; + struct iatt stbuf = { + 0, + }; - local = frame->local; - replies = local->replies; - priv = this->private; + local = frame->local; + replies = local->replies; + priv = this->private; - if (!priv->metadata_self_heal) - return _gf_false; + if (!priv->metadata_self_heal) + return _gf_false; - for (i = 0; i < priv->child_count; i++) { - if(!replies[i].valid || replies[i].op_ret == -1) - continue; - if (first == -1) { - first = i; - stbuf = replies[i].poststat; - continue; - } + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (first == -1) { + first = i; + stbuf = replies[i].poststat; + continue; + } - if (gf_uuid_compare (stbuf.ia_gfid, replies[i].poststat.ia_gfid)) { - start = _gf_false; - break; - } - if (!IA_EQUAL (stbuf, replies[i].poststat, type)) { - start = _gf_false; - break; - } + if (afr_is_pending_set(this, replies[i].xdata, + AFR_METADATA_TRANSACTION)) { + /* Let shd do the heal so that lookup is not blocked + * on getting metadata lock/doing the heal */ + start = _gf_false; + break; + } - /*Check if iattrs need heal*/ - if ((!IA_EQUAL (stbuf, replies[i].poststat, uid)) || - (!IA_EQUAL (stbuf, replies[i].poststat, gid)) || - (!IA_EQUAL (stbuf, replies[i].poststat, prot))) { - start = _gf_true; - continue; - } + if (gf_uuid_compare(stbuf.ia_gfid, replies[i].poststat.ia_gfid)) { + start = _gf_false; + break; + } + if (!IA_EQUAL(stbuf, replies[i].poststat, type)) { + start = _gf_false; + break; + } - /*Check if xattrs need heal*/ - if (!afr_xattrs_are_equal (replies[first].xdata, - replies[i].xdata)) - start = _gf_true; + /*Check if iattrs need heal*/ + if ((!IA_EQUAL(stbuf, replies[i].poststat, uid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, gid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, prot))) { + start = _gf_true; + continue; } - return start; + /*Check if xattrs need heal*/ + if (!afr_xattrs_are_equal(replies[first].xdata, replies[i].xdata)) + start = _gf_true; + } + + return start; } int -afr_lookup_metadata_heal_check (call_frame_t *frame, xlator_t *this) +afr_lookup_metadata_heal_check(call_frame_t *frame, xlator_t *this) { - call_frame_t *heal = NULL; - afr_local_t *local = NULL; - int ret = 0; + call_frame_t *heal = NULL; + afr_local_t *local = NULL; + int ret = 0; - local = frame->local; - if (!afr_can_start_metadata_self_heal (frame, this)) - goto out; + local = frame->local; + if (!afr_can_start_metadata_self_heal(frame, this)) + goto out; - heal = afr_frame_create (this, &ret); - if (!heal) { - ret = -ret; - goto out; - } + heal = afr_frame_create(this, &ret); + if (!heal) { + ret = -ret; + goto out; + } - ret = synctask_new (this->ctx->env, afr_lookup_sh_metadata_wrap, - afr_refresh_selfheal_done, heal, frame); - if (ret) - goto out; - return ret; + ret = synctask_new(this->ctx->env, afr_lookup_sh_metadata_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) + goto out; + return ret; out: - if (loc_is_nameless (&local->loc)) - afr_discover_done (frame, this); - else - afr_lookup_done (frame, this); - if (heal) - AFR_STACK_DESTROY (heal); - return ret; + if (loc_is_nameless(&local->loc)) + afr_discover_done(frame, this); + else + afr_lookup_done(frame, this); + if (heal) + AFR_STACK_DESTROY(heal); + return ret; } int -afr_lookup_selfheal_wrap (void *opaque) +afr_lookup_selfheal_wrap(void *opaque) { - int ret = 0; - call_frame_t *frame = opaque; - afr_local_t *local = NULL; - xlator_t *this = NULL; - inode_t *inode = NULL; - uuid_t pargfid = {0,}; + int ret = 0; + call_frame_t *frame = opaque; + afr_local_t *local = NULL; + xlator_t *this = NULL; + inode_t *inode = NULL; + uuid_t pargfid = { + 0, + }; - local = frame->local; - this = frame->this; - loc_pargfid (&local->loc, pargfid); + local = frame->local; + this = frame->this; + loc_pargfid(&local->loc, pargfid); - ret = afr_selfheal_name (frame->this, pargfid, local->loc.name, - &local->cont.lookup.gfid_req, local->xattr_req); - if (ret == -EIO) - goto unwind; + ret = afr_selfheal_name(frame->this, pargfid, local->loc.name, + &local->cont.lookup.gfid_req, local->xattr_req); + if (ret == -EIO) + goto unwind; - afr_local_replies_wipe (local, this->private); + afr_local_replies_wipe(local, this->private); - inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent, - local->loc.name, local->replies, - local->child_up, NULL); - if (inode) - inode_unref (inode); + inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent, + local->loc.name, local->replies, + local->child_up, local->xattr_req); + if (inode) + inode_unref(inode); - afr_lookup_metadata_heal_check(frame, this); - return 0; + afr_lookup_metadata_heal_check(frame, this); + return 0; unwind: - AFR_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL); + return 0; } int -afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - call_frame_t *heal = NULL; - int i = 0, first = -1; - gf_boolean_t need_heal = _gf_false; - struct afr_reply *replies = NULL; - int ret = 0; - - local = frame->local; - replies = local->replies; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; - - if ((replies[i].op_ret == -1) && - (replies[i].op_errno == ENODATA)) - need_heal = _gf_true; - - if (first == -1) { - first = i; - continue; - } - - if (replies[i].op_ret != replies[first].op_ret) { - need_heal = _gf_true; - break; - } - - if (gf_uuid_compare (replies[i].poststat.ia_gfid, - replies[first].poststat.ia_gfid)) { - need_heal = _gf_true; - break; - } - } - - if (need_heal) { - heal = afr_frame_create (this, NULL); - if (!heal) - goto metadata_heal; - - ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, - afr_refresh_selfheal_done, heal, frame); - if (ret) { - AFR_STACK_DESTROY (heal); - goto metadata_heal; - } - return ret; - } -metadata_heal: - ret = afr_lookup_metadata_heal_check (frame, this); +afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; + gf_boolean_t name_state_mismatch = _gf_false; + struct afr_reply *replies = NULL; + int ret = 0; + unsigned char *par_readables = NULL; + unsigned char *success = NULL; + int32_t op_errno = 0; + uuid_t gfid = {0}; + + local = frame->local; + replies = local->replies; + priv = this->private; + par_readables = alloca0(priv->child_count); + success = alloca0(priv->child_count); + + ret = afr_inode_read_subvol_get(local->loc.parent, this, par_readables, + NULL, NULL); + if (ret < 0 || AFR_COUNT(par_readables, priv->child_count) == 0) { + /* In this case set par_readables to all 1 so that name_heal + * need checks at the end of this function will flag missing + * entry when name state mismatches*/ + memset(par_readables, 1, priv->child_count); + } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + + if (replies[i].op_ret == 0) { + if (gf_uuid_is_null(gfid)) { + gf_uuid_copy(gfid, replies[i].poststat.ia_gfid); + } + success[i] = 1; + } else { + if ((replies[i].op_errno != ENOTCONN) && + (replies[i].op_errno != ENOENT) && + (replies[i].op_errno != ESTALE)) { + op_errno = replies[i].op_errno; + } + } - return ret; -} + /*gfid is missing, needs heal*/ + if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) { + goto name_heal; + } + if (first == -1) { + first = i; + continue; + } -int -afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, struct iatt *buf, - dict_t *xdata, struct iatt *postparent) -{ - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; - GF_UNUSED int ret = 0; - int8_t need_heal = 1; - - child_index = (long) cookie; - - local = frame->local; - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - /* - * On revalidate lookup if the gfid-changed, afr should unwind the fop - * with ESTALE so that a fresh lookup will be sent by the top xlator. - * So remember it. - */ - if (xdata && dict_get (xdata, "gfid-changed")) - local->cont.lookup.needs_fresh_lookup = _gf_true; + if (replies[i].op_ret != replies[first].op_ret) { + name_state_mismatch = _gf_true; + } - if (xdata) { - ret = dict_get_int8 (xdata, "link-count", &need_heal); - local->replies[child_index].need_heal = need_heal; - } else { - local->replies[child_index].need_heal = need_heal; + if (replies[i].op_ret == 0) { + /* Rename after this lookup may succeed if we don't do + * a name-heal and the destination may not have pending xattrs + * to indicate which name is good and which is bad so always do + * this heal*/ + if (gf_uuid_compare(replies[i].poststat.ia_gfid, gfid)) { + goto name_heal; + } } - if (op_ret != -1) { - local->replies[child_index].poststat = *buf; - local->replies[child_index].postparent = *postparent; - if (xdata) - local->replies[child_index].xdata = dict_ref (xdata); - } + } - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_need_heal (this, local); - afr_lookup_entry_heal (frame, this); + if (name_state_mismatch) { + if (!priv->quorum_count) + goto name_heal; + if (!afr_has_quorum(success, this, NULL)) + goto name_heal; + if (op_errno) + goto name_heal; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (par_readables[i] && replies[i].op_ret < 0 && + replies[i].op_errno != ENOTCONN) { + goto name_heal; + } } + } - return 0; + goto metadata_heal; + +name_heal: + heal = afr_frame_create(this, NULL); + if (!heal) + goto metadata_heal; + + ret = synctask_new(this->ctx->env, afr_lookup_selfheal_wrap, + afr_refresh_selfheal_done, heal, frame); + if (ret) { + AFR_STACK_DESTROY(heal); + goto metadata_heal; + } + return ret; + +metadata_heal: + ret = afr_lookup_metadata_heal_check(frame, this); + + return ret; } +int +afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; + GF_UNUSED int ret = 0; + int8_t need_heal = 1; + + child_index = (long)cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + /* + * On revalidate lookup if the gfid-changed, afr should unwind the fop + * with ESTALE so that a fresh lookup will be sent by the top xlator. + * So remember it. + */ + if (xdata && dict_get_sizen(xdata, "gfid-changed")) + local->cont.lookup.needs_fresh_lookup = _gf_true; + + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + local->replies[child_index].need_heal = need_heal; + } else { + local->replies[child_index].need_heal = need_heal; + } + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + afr_lookup_entry_heal(frame, this); + } + return 0; +} static void -afr_discover_done (call_frame_t *frame, xlator_t *this) +afr_discover_unwind(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = -1; - int op_errno = 0; - int spb_choice = -1; - int read_subvol = -1; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int read_subvol = -1; + int ret = 0; + unsigned char *data_readable = NULL; + unsigned char *success_replies = NULL; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; + data_readable = alloca0(priv->child_count); + success_replies = alloca0(priv->child_count); - afr_inode_split_brain_choice_get (local->inode, this, - &spb_choice); + afr_fill_success_replies(local, priv, success_replies); + if (AFR_COUNT(success_replies, priv->child_count) > 0) + local->op_ret = 0; - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret == 0) - local->op_ret = 0; - } + if (local->op_ret < 0) { + local->op_ret = -1; + local->op_errno = afr_final_errno(frame->local, this->private); + goto error; + } - op_errno = afr_final_errno (frame->local, this->private); + if (!afr_has_quorum(success_replies, this, frame)) + goto unwind; - if (local->op_ret < 0) { - local->op_errno = op_errno; - local->op_ret = -1; - goto unwind; - } + ret = afr_replies_interpret(frame, this, local->inode, NULL); + if (ret) { + afr_inode_need_refresh_set(local->inode, this); + } - afr_replies_interpret (frame, this, local->inode, NULL); + read_subvol = afr_read_subvol_decide(local->inode, this, NULL, + data_readable); - read_subvol = afr_read_subvol_decide (local->inode, this, NULL); - if (read_subvol == -1) { - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s", - local->loc.path); +unwind: + afr_attempt_readsubvol_set(frame, this, success_replies, data_readable, + &read_subvol); + if (read_subvol == -1) + goto error; - if (spb_choice >= 0) { - read_subvol = spb_choice; - } else { - read_subvol = afr_first_up_child (frame, this); - } - } + if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + gf_msg_debug(this->name, 0, + "Arbiter cannot be a read subvol " + "for %s", + local->loc.path); + } -unwind: - if (read_subvol == -1) { - if (spb_choice >= 0) - read_subvol = spb_choice; - else - read_subvol = afr_first_up_child (frame, this); - } - if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) { - local->op_ret = -1; - local->op_errno = ENOTCONN; - } + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, + local->inode, &local->replies[read_subvol].poststat, + local->replies[read_subvol].xdata, + &local->replies[read_subvol].postparent); + return; - AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno, - local->inode, &local->replies[read_subvol].poststat, - local->replies[read_subvol].xdata, - &local->replies[read_subvol].postparent); +error: + AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL, + NULL, NULL); } +static int +afr_ta_id_file_check(void *opaque) +{ + afr_private_t *priv = NULL; + xlator_t *this = NULL; + loc_t loc = { + 0, + }; + struct iatt stbuf = { + 0, + }; + dict_t *dict = NULL; + uuid_t gfid = { + 0, + }; + fd_t *fd = NULL; + int ret = 0; + + this = opaque; + priv = this->private; + + ret = afr_fill_ta_loc(this, &loc, _gf_false); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc.name); + goto out; + } + + ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, &stbuf, + 0, 0, 0); + if (ret == 0) { + goto out; + } else if (ret == -ENOENT) { + fd = fd_create(loc.inode, getpid()); + if (!fd) + goto out; + dict = dict_new(); + if (!dict) + goto out; + gf_uuid_generate(gfid); + ret = dict_set_gfuuid(dict, "gfid-req", gfid, true); + ret = syncop_create(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + O_RDWR, 0664, fd, &stbuf, dict, NULL); + } -int -afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, struct iatt *buf, - dict_t *xdata, struct iatt *postparent) -{ - afr_local_t * local = NULL; - int call_count = -1; - int child_index = -1; - GF_UNUSED int ret = 0; - int8_t need_heal = 1; - - child_index = (long) cookie; - - local = frame->local; - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - if (op_ret != -1) { - local->replies[child_index].poststat = *buf; - local->replies[child_index].postparent = *postparent; - if (xdata) - local->replies[child_index].xdata = dict_ref (xdata); - } - - if (local->do_discovery && (op_ret == 0)) - afr_attempt_local_discovery (this, child_index); - - if (xdata) { - ret = dict_get_int8 (xdata, "link-count", &need_heal); - local->replies[child_index].need_heal = need_heal; - } else { - local->replies[child_index].need_heal = need_heal; - } - - call_count = afr_frame_return (frame); - if (call_count == 0) { - afr_set_need_heal (this, local); - afr_lookup_metadata_heal_check (frame, this); - } +out: + if (ret == 0) { + gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid); + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to lookup/create thin-arbiter id file."); + } + if (dict) + dict_unref(dict); + if (fd) + fd_unref(fd); + loc_wipe(&loc); - return 0; + return 0; } +static int +afr_ta_id_file_check_cbk(int ret, call_frame_t *ta_frame, void *opaque) +{ + return 0; +} -int -afr_discover_do (call_frame_t *frame, xlator_t *this, int err) +static void +afr_discover_done(call_frame_t *frame, xlator_t *this) { - int ret = 0; - int i = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; + int ret = 0; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + priv = this->private; + if (!priv->thin_arbiter_count) + goto unwind; + if (!gf_uuid_is_null(priv->ta_gfid)) + goto unwind; - if (err) { - local->op_errno = -err; - ret = -1; - goto out; - } + ret = synctask_new(this->ctx->env, afr_ta_id_file_check, + afr_ta_id_file_check_cbk, NULL, this); + if (ret) + goto unwind; +unwind: + afr_discover_unwind(frame, this); +} - call_count = local->call_count = AFR_COUNT (local->child_up, - priv->child_count); +int +afr_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) +{ + afr_local_t *local = NULL; + int call_count = -1; + int child_index = -1; + GF_UNUSED int ret = 0; + int8_t need_heal = 1; + + child_index = (long)cookie; + + local = frame->local; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret != -1) { + local->replies[child_index].poststat = *buf; + local->replies[child_index].postparent = *postparent; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + } + + if (local->do_discovery && (op_ret == 0)) + afr_attempt_local_discovery(this, child_index); + + if (xdata) { + ret = dict_get_int8(xdata, "link-count", &need_heal); + local->replies[child_index].need_heal = need_heal; + } else { + local->replies[child_index].need_heal = need_heal; + } + + call_count = afr_frame_return(frame); + if (call_count == 0) { + afr_set_need_heal(this, local); + afr_lookup_metadata_heal_check(frame, this); + } - ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, - &local->loc); - if (ret) { - local->op_errno = -ret; - ret = -1; - goto out; - } + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_discover_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, local->xattr_req); - if (!--call_count) - break; - } +int +afr_discover_do(call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err) { + local->op_errno = err; + goto out; + } + + call_count = local->call_count = AFR_COUNT(local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE( + frame, afr_discover_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &local->loc, local->xattr_req); + if (!--call_count) + break; } + } - return 0; + return 0; out: - AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); - return 0; + AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } - int -afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) { - int op_errno = ENOMEM; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int event = 0; + int op_errno = ENOMEM; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int event = 0; - priv = this->private; + priv = this->private; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - if (!local->call_count) { - op_errno = ENOTCONN; - goto out; - } + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } - if (__is_root_gfid (loc->inode->gfid)) { - if (!this->itable) - this->itable = loc->inode->table; - if (!priv->root_inode) - priv->root_inode = inode_ref (loc->inode); + if (__is_root_gfid(loc->inode->gfid)) { + if (!priv->root_inode) + priv->root_inode = inode_ref(loc->inode); - if (priv->choose_local && !priv->did_discovery) { - /* Logic to detect which subvolumes of AFR are - local, in order to prefer them for reads - */ - local->do_discovery = _gf_true; - priv->did_discovery = _gf_true; - } - } + if (priv->choose_local && !priv->did_discovery) { + /* Logic to detect which subvolumes of AFR are + local, in order to prefer them for reads + */ + local->do_discovery = _gf_true; + priv->did_discovery = _gf_true; + } + } - local->op = GF_FOP_LOOKUP; + local->op = GF_FOP_LOOKUP; - loc_copy (&local->loc, loc); + loc_copy(&local->loc, loc); - local->inode = inode_ref (loc->inode); + local->inode = inode_ref(loc->inode); - if (xattr_req) - /* If xattr_req was null, afr_lookup_xattr_req_prepare() will - allocate one for us */ - local->xattr_req = dict_ref (xattr_req); + if (xattr_req) { + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto out; + } + } - if (gf_uuid_is_null (loc->inode->gfid)) { - afr_discover_do (frame, this, 0); - return 0; - } + if (gf_uuid_is_null(loc->inode->gfid)) { + afr_discover_do(frame, this, 0); + return 0; + } - afr_read_subvol_get (loc->inode, this, NULL, NULL, &event, - AFR_DATA_TRANSACTION, NULL); + afr_read_subvol_get(loc->inode, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd (loc->inode, this, event, - local->event_generation)) - afr_inode_refresh (frame, this, loc->inode, NULL, - afr_discover_do); - else - afr_discover_do (frame, this, 0); + afr_discover_do(frame, this, 0); - return 0; + return 0; out: - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + return 0; } - int -afr_lookup_do (call_frame_t *frame, xlator_t *this, int err) -{ - int ret = 0; - int i = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - - local = frame->local; - priv = this->private; - - if (err < 0) { - local->op_errno = -err; - ret = -1; - goto out; - } - - call_count = local->call_count = AFR_COUNT (local->child_up, - priv->child_count); - - ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req, - &local->loc); - if (ret) { - local->op_errno = -ret; - ret = -1; - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_lookup_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->lookup, - &local->loc, local->xattr_req); - if (!--call_count) - break; - } +afr_lookup_do(call_frame_t *frame, xlator_t *this, int err) +{ + int ret = 0; + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + if (err < 0) { + local->op_errno = err; + goto out; + } + + call_count = local->call_count = AFR_COUNT(local->child_up, + priv->child_count); + + ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req, + &local->loc); + if (ret) { + local->op_errno = -ret; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE( + frame, afr_lookup_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->lookup, &local->loc, local->xattr_req); + if (!--call_count) + break; } - return 0; + } + return 0; out: - AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0); - return 0; + AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0); + return 0; } /* @@ -3063,1633 +4000,1722 @@ out: */ int -afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) -{ - afr_local_t *local = NULL; - int32_t op_errno = 0; - int event = 0; - void *gfid_req = NULL; - int ret = 0; - - if (loc_is_nameless (loc)) { - if (xattr_req) - dict_del (xattr_req, "gfid-req"); - afr_discover (frame, this, loc, xattr_req); - return 0; - } - - if (__is_root_gfid (loc->parent->gfid)) { - if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) { - op_errno = EPERM; - goto out; - } - } - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - if (!local->call_count) { - op_errno = ENOTCONN; - goto out; - } +afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) +{ + afr_local_t *local = NULL; + int32_t op_errno = 0; + int event = 0; + int ret = 0; - local->op = GF_FOP_LOOKUP; + if (loc_is_nameless(loc)) { + if (xattr_req) + dict_del_sizen(xattr_req, "gfid-req"); + afr_discover(frame, this, loc, xattr_req); + return 0; + } - loc_copy (&local->loc, loc); + if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name, + frame->root->pid)) { + op_errno = EPERM; + goto out; + } - local->inode = inode_ref (loc->inode); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - if (xattr_req) { - /* If xattr_req was null, afr_lookup_xattr_req_prepare() will - allocate one for us */ - local->xattr_req = dict_copy_with_ref (xattr_req, NULL); - if (!local->xattr_req) { - op_errno = ENOMEM; - goto out; - } - ret = dict_get_ptr (local->xattr_req, "gfid-req", &gfid_req); - if (ret == 0) { - gf_uuid_copy (local->cont.lookup.gfid_req, gfid_req); - dict_del (local->xattr_req, "gfid-req"); - } + if (!local->call_count) { + op_errno = ENOTCONN; + goto out; + } + + local->op = GF_FOP_LOOKUP; + + loc_copy(&local->loc, loc); + + local->inode = inode_ref(loc->inode); + + if (xattr_req) { + /* If xattr_req was null, afr_lookup_xattr_req_prepare() will + allocate one for us */ + local->xattr_req = dict_copy_with_ref(xattr_req, NULL); + if (!local->xattr_req) { + op_errno = ENOMEM; + goto out; + } + ret = dict_get_gfuuid(local->xattr_req, "gfid-req", + &local->cont.lookup.gfid_req); + if (ret == 0) { + dict_del_sizen(local->xattr_req, "gfid-req"); } + } - afr_read_subvol_get (loc->parent, this, NULL, NULL, &event, - AFR_DATA_TRANSACTION, NULL); + afr_read_subvol_get(loc->parent, this, NULL, NULL, &event, + AFR_DATA_TRANSACTION, NULL); - if (afr_is_inode_refresh_reqd (loc->inode, this, event, - local->event_generation)) - afr_inode_refresh (frame, this, loc->parent, NULL, - afr_lookup_do); - else - afr_lookup_do (frame, this, 0); + afr_lookup_do(frame, this, 0); - return 0; + return 0; out: - AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); + AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL); - return 0; + return 0; } void -_afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx) +_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx) { - int i = 0; - - - for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) - GF_FREE (fd_ctx->pre_op_done[i]); - - GF_FREE (fd_ctx->opened_on); - - GF_FREE (fd_ctx->lock_piggyback); - - GF_FREE (fd_ctx->lock_acquired); + afr_private_t *priv = this->private; - pthread_mutex_destroy (&fd_ctx->delay_lock); - - GF_FREE (fd_ctx); - - return; + if (fd_ctx->lk_heal_info) { + LOCK(&priv->lock); + { + list_del(&fd_ctx->lk_heal_info->pos); + } + afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info); + fd_ctx->lk_heal_info = NULL; + } + GF_FREE(fd_ctx->opened_on); + GF_FREE(fd_ctx); + return; } int -afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) +afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd) { - uint64_t ctx = 0; - afr_fd_ctx_t *fd_ctx = NULL; - int ret = 0; - - ret = fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; - - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int ret = 0; - if (fd_ctx) { - /*no need to take any locks*/ - if (!list_empty (&fd_ctx->eager_locked)) - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_INVALID_DATA, "%s: Stale " - "Eager-lock stubs found", - uuid_utoa (fd->inode->gfid)); + ret = fd_ctx_get(fd, this, &ctx); + if (ret < 0) + goto out; - _afr_cleanup_fd_ctx (fd_ctx); + fd_ctx = (afr_fd_ctx_t *)(long)ctx; - } + if (fd_ctx) { + _afr_cleanup_fd_ctx(this, fd_ctx); + } out: - return 0; + return 0; } int -afr_release (xlator_t *this, fd_t *fd) +afr_release(xlator_t *this, fd_t *fd) { - afr_cleanup_fd_ctx (this, fd); + afr_cleanup_fd_ctx(this, fd); - return 0; + return 0; } afr_fd_ctx_t * -__afr_fd_ctx_get (fd_t *fd, xlator_t *this) +__afr_fd_ctx_get(fd_t *fd, xlator_t *this) { - uint64_t ctx = 0; - int ret = 0; - afr_fd_ctx_t *fd_ctx = NULL; + uint64_t ctx = 0; + int ret = 0; + afr_fd_ctx_t *fd_ctx = NULL; - ret = __fd_ctx_get (fd, this, &ctx); + ret = __fd_ctx_get(fd, this, &ctx); - if (ret < 0) { - ret = __afr_fd_ctx_set (this, fd); - if (ret < 0) - goto out; + if (ret < 0) { + ret = __afr_fd_ctx_set(this, fd); + if (ret < 0) + goto out; - ret = __fd_ctx_get (fd, this, &ctx); - if (ret < 0) - goto out; - } + ret = __fd_ctx_get(fd, this, &ctx); + if (ret < 0) + goto out; + } - fd_ctx = (afr_fd_ctx_t *)(long) ctx; + fd_ctx = (afr_fd_ctx_t *)(long)ctx; out: - return fd_ctx; + return fd_ctx; } - afr_fd_ctx_t * -afr_fd_ctx_get (fd_t *fd, xlator_t *this) +afr_fd_ctx_get(fd_t *fd, xlator_t *this) { - afr_fd_ctx_t *fd_ctx = NULL; + afr_fd_ctx_t *fd_ctx = NULL; - LOCK(&fd->lock); - { - fd_ctx = __afr_fd_ctx_get (fd, this); - } - UNLOCK(&fd->lock); + LOCK(&fd->lock); + { + fd_ctx = __afr_fd_ctx_get(fd, this); + } + UNLOCK(&fd->lock); - return fd_ctx; + return fd_ctx; } - int -__afr_fd_ctx_set (xlator_t *this, fd_t *fd) +__afr_fd_ctx_set(xlator_t *this, fd_t *fd) { - afr_private_t * priv = NULL; - int ret = -1; - uint64_t ctx = 0; - afr_fd_ctx_t * fd_ctx = NULL; - int i = 0; - - VALIDATE_OR_GOTO (this->private, out); - VALIDATE_OR_GOTO (fd, out); - - priv = this->private; - - ret = __fd_ctx_get (fd, this, &ctx); + afr_private_t *priv = NULL; + int ret = -1; + uint64_t ctx = 0; + afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; - if (ret == 0) - goto out; + VALIDATE_OR_GOTO(this->private, out); + VALIDATE_OR_GOTO(fd, out); - fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t), - gf_afr_mt_afr_fd_ctx_t); - if (!fd_ctx) { - ret = -ENOMEM; - goto out; - } + priv = this->private; - ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL); - if (ret) { - GF_FREE (fd_ctx); - fd_ctx = NULL; - goto out; - } + ret = __fd_ctx_get(fd, this, &ctx); - for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { - fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]), - priv->child_count, - gf_afr_mt_int32_t); - if (!fd_ctx->pre_op_done[i]) { - ret = -ENOMEM; - goto out; - } - } - - fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), - priv->child_count, - gf_afr_mt_int32_t); - if (!fd_ctx->opened_on) { - ret = -ENOMEM; - goto out; - } - - for (i = 0; i < priv->child_count; i++) { - if (fd_is_anonymous (fd)) - fd_ctx->opened_on[i] = AFR_FD_OPENED; - else - fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; - } + if (ret == 0) + goto out; - fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_piggyback) { - ret = -ENOMEM; - goto out; - } + fd_ctx = GF_CALLOC(1, sizeof(afr_fd_ctx_t), gf_afr_mt_afr_fd_ctx_t); + if (!fd_ctx) { + ret = -ENOMEM; + goto out; + } - fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), - priv->child_count, - gf_afr_mt_char); - if (!fd_ctx->lock_acquired) { - ret = -ENOMEM; - goto out; - } + fd_ctx->opened_on = GF_CALLOC(sizeof(*fd_ctx->opened_on), priv->child_count, + gf_afr_mt_int32_t); + if (!fd_ctx->opened_on) { + ret = -ENOMEM; + goto out; + } - fd_ctx->readdir_subvol = -1; + for (i = 0; i < priv->child_count; i++) { + if (fd_is_anonymous(fd)) + fd_ctx->opened_on[i] = AFR_FD_OPENED; + else + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } - INIT_LIST_HEAD (&fd_ctx->eager_locked); + fd_ctx->readdir_subvol = -1; + fd_ctx->lk_heal_info = NULL; - ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); - if (ret) - gf_msg_debug (this->name, 0, - "failed to set fd ctx (%p)", fd); + ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx); + if (ret) + gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd); out: - if (ret && fd_ctx) - _afr_cleanup_fd_ctx (fd_ctx); - return ret; + if (ret && fd_ctx) + _afr_cleanup_fd_ctx(this, fd_ctx); + return ret; } - /* {{{ flush */ int -afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + int call_count = -1; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (op_ret != -1) { - local->op_ret = op_ret; - if (!local->xdata_rsp && xdata) - local->xdata_rsp = dict_ref (xdata); - } else { - local->op_errno = op_errno; - } + LOCK(&frame->lock); + { + if (op_ret != -1) { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } else { + local->op_errno = op_errno; } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - if (call_count == 0) - AFR_STACK_UNWIND (flush, frame, local->op_ret, - local->op_errno, local->xdata_rsp); + if (call_count == 0) + AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno, + local->xdata_rsp); - return 0; + return 0; } static int -afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) -{ - int i = 0; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - - priv = this->private; - local = frame->local; - call_count = local->call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_flush_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->flush, - local->fd, xdata); - if (!--call_count) - break; - - } +afr_flush_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +{ + int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + + priv = this->private; + local = frame->local; + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_flush_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->flush, + local->fd, xdata); + if (!--call_count) + break; } + } - return 0; + return 0; } -int -afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) +afr_local_t * +afr_wakeup_same_fd_delayed_op(xlator_t *this, afr_lock_t *lock, fd_t *fd) { - afr_local_t *local = NULL; - call_stub_t *stub = NULL; - int op_errno = ENOMEM; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = GF_FOP_FLUSH; - if (!afr_is_consistent_io_possible (local, this->private, &op_errno)) - goto out; - - local->fd = fd_ref(fd); - - stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata); - if (!stub) - goto out; - - afr_delayed_changelog_wake_resume (this, fd, stub); - - return 0; -out: - AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL); - return 0; -} - -/* }}} */ - + afr_local_t *local = NULL; -/* {{{ fsync */ - -int -afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, - xdata); - return 0; -} - -int -afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = -1; - int child_index = (long) cookie; - int read_subvol = 0; - call_stub_t *stub = NULL; - - local = frame->local; - priv = this->private; - - LOCK (&frame->lock); - { - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - if (op_ret == 0) { - if (prebuf) - local->replies[child_index].prestat = *prebuf; - if (postbuf) - local->replies[child_index].poststat = *postbuf; - if (xdata) - local->replies[child_index].xdata = - dict_ref (xdata); - } - } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - local->op_ret = -1; - local->op_errno = afr_final_errno (local, priv); - read_subvol = afr_data_subvol_get (local->inode, this, NULL, - local->readable, NULL, NULL); - /* Pick a reply that is valid and readable, with a preference - * given to read_subvol. */ - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret != 0) - continue; - if (!local->readable[i]) - continue; - local->op_ret = local->replies[i].op_ret; - local->op_errno = local->replies[i].op_errno; - local->cont.inode_wfop.prebuf = - local->replies[i].prestat; - local->cont.inode_wfop.postbuf = - local->replies[i].poststat; - if (local->replies[i].xdata) { - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); - local->xdata_rsp = - dict_ref (local->replies[i].xdata); - } - if (i == read_subvol) - break; - } - - /* Make a stub out of the frame, and register it - with the waking up post-op. When the call-stub resumes, - we are guaranteed that there was no post-op pending - (i.e changelogs were unset in the server). This is an - essential "guarantee", that fsync() returns only after - completely finishing EVERYTHING, including the delayed - post-op. This guarantee is expected by FUSE graph switching - for example. - */ - stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, - local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - local->xdata_rsp); - if (!stub) { - AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); - return 0; - } - - /* If no new unstable writes happened between the - time we cleared the unstable write witness flag in afr_fsync - and now, calling afr_delayed_changelog_wake_up() should - wake up and skip over the fsync phase and go straight to - afr_changelog_post_op_now() - */ - afr_delayed_changelog_wake_resume (this, local->fd, stub); + if (lock->delay_timer) { + local = list_entry(lock->post_op.next, afr_local_t, + transaction.owner_list); + if (fd == local->fd) { + if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) { + local = NULL; + } else { + lock->delay_timer = NULL; + } + } else { + local = NULL; } + } - return 0; + return local; } +void +afr_delayed_changelog_wake_resume(xlator_t *this, inode_t *inode, + call_stub_t *stub) +{ + afr_inode_ctx_t *ctx = NULL; + afr_lock_t *lock = NULL; + afr_local_t *metadata_local = NULL; + afr_local_t *data_local = NULL; + LOCK(&inode->lock); + { + (void)__afr_inode_ctx_get(this, inode, &ctx); + lock = &ctx->lock[AFR_DATA_TRANSACTION]; + data_local = afr_wakeup_same_fd_delayed_op(this, lock, stub->args.fd); + lock = &ctx->lock[AFR_METADATA_TRANSACTION]; + metadata_local = afr_wakeup_same_fd_delayed_op(this, lock, + stub->args.fd); + } + UNLOCK(&inode->lock); + + if (data_local) { + data_local->transaction.resume_stub = stub; + } else if (metadata_local) { + metadata_local->transaction.resume_stub = stub; + } else { + call_resume(stub); + } + if (data_local) { + afr_delayed_changelog_wake_up_cbk(data_local); + } + if (metadata_local) { + afr_delayed_changelog_wake_up_cbk(metadata_local); + } +} int -afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, - dict_t *xdata) +afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = ENOMEM; - - priv = this->private; + afr_local_t *local = NULL; + call_stub_t *stub = NULL; + int op_errno = ENOMEM; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_FSYNC; - if (!afr_is_consistent_io_possible (local, priv, &op_errno)) - goto out; + local->op = GF_FOP_FLUSH; + if (!afr_is_consistent_io_possible(local, this->private, &op_errno)) + goto out; - local->fd = fd_ref (fd); + local->fd = fd_ref(fd); - if (afr_fd_has_witnessed_unstable_write (this, fd)) { - /* don't care. we only wanted to CLEAR the bit */ - } + stub = fop_flush_stub(frame, afr_flush_wrapper, fd, xdata); + if (!stub) + goto out; - local->inode = inode_ref (fd->inode); - - call_count = local->call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_fsync_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fsync, - fd, datasync, xdata); - if (!--call_count) - break; - } - } + afr_delayed_changelog_wake_resume(this, fd->inode, stub); - return 0; + return 0; out: - AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); - - return 0; + AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL); + return 0; } -/* }}} */ - -/* {{{ fsync */ - int -afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = -1; + afr_local_t *local = NULL; + int call_count = -1; - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - if (op_ret == 0) { - local->op_ret = 0; - if (!local->xdata_rsp && xdata) - local->xdata_rsp = dict_ref (xdata); - } else { - local->op_errno = op_errno; - } + LOCK(&frame->lock); + { + if (op_ret == 0) { + local->op_ret = 0; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } else { + local->op_errno = op_errno; } - UNLOCK (&frame->lock); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - call_count = afr_frame_return (frame); + if (call_count == 0) + AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno, + local->xdata_rsp); - if (call_count == 0) - AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret, - local->op_errno, local->xdata_rsp); - - return 0; + return 0; } - int -afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, - dict_t *xdata) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int32_t call_count = 0; - int32_t op_errno = ENOMEM; - - priv = this->private; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = GF_FOP_FSYNCDIR; - if (!afr_is_consistent_io_possible (local, priv, &op_errno)) - goto out; - - call_count = local->call_count; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND (frame, afr_fsyncdir_cbk, - priv->children[i], - priv->children[i]->fops->fsyncdir, - fd, datasync, xdata); - if (!--call_count) - break; - } +afr_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int32_t call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_FSYNCDIR; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + call_count = local->call_count; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND(frame, afr_fsyncdir_cbk, priv->children[i], + priv->children[i]->fops->fsyncdir, fd, datasync, xdata); + if (!--call_count) + break; } + } - return 0; + return 0; out: - AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL); - return 0; + return 0; } /* }}} */ static int -afr_serialized_lock_wind (call_frame_t *frame, xlator_t *this); +afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this); static gf_boolean_t -afr_is_conflicting_lock_present (int32_t op_ret, int32_t op_errno) +afr_is_conflicting_lock_present(int32_t op_ret, int32_t op_errno) { - if (op_ret == -1 && op_errno == EAGAIN) - return _gf_true; - return _gf_false; + if (op_ret == -1 && op_errno == EAGAIN) + return _gf_true; + return _gf_false; } static void -afr_fop_lock_unwind (call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +afr_fop_lock_unwind(call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret, + int32_t op_errno, dict_t *xdata) { - switch (op) { + switch (op) { case GF_FOP_INODELK: - AFR_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata); - break; + AFR_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata); + break; case GF_FOP_FINODELK: - AFR_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata); - break; + AFR_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata); + break; case GF_FOP_ENTRYLK: - AFR_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata); - break; + AFR_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata); + break; case GF_FOP_FENTRYLK: - AFR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, xdata); - break; + AFR_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata); + break; default: - break; - } + break; + } } static void -afr_fop_lock_wind (call_frame_t *frame, xlator_t *this, int child_index, - int32_t (*lock_cbk) (call_frame_t *, void *, xlator_t *, - int32_t, int32_t, dict_t *)) +afr_fop_lock_wind(call_frame_t *frame, xlator_t *this, int child_index, + int32_t (*lock_cbk)(call_frame_t *, void *, xlator_t *, + int32_t, int32_t, dict_t *)) { - afr_local_t *local = frame->local; - afr_private_t *priv = this->private; - int i = child_index; + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int i = child_index; - switch (local->op) { + switch (local->op) { case GF_FOP_INODELK: - STACK_WIND_COOKIE (frame, lock_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - (const char *)local->cont.inodelk.volume, - &local->loc, local->cont.inodelk.cmd, - &local->cont.inodelk.flock, - local->cont.inodelk.xdata); - break; + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->inodelk, + (const char *)local->cont.inodelk.volume, &local->loc, + local->cont.inodelk.cmd, &local->cont.inodelk.flock, + local->cont.inodelk.xdata); + break; case GF_FOP_FINODELK: - STACK_WIND_COOKIE (frame, lock_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->finodelk, - (const char *)local->cont.inodelk.volume, - local->fd, local->cont.inodelk.cmd, - &local->cont.inodelk.flock, - local->cont.inodelk.xdata); - break; + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->finodelk, + (const char *)local->cont.inodelk.volume, local->fd, + local->cont.inodelk.cmd, &local->cont.inodelk.flock, + local->cont.inodelk.xdata); + break; case GF_FOP_ENTRYLK: - STACK_WIND_COOKIE (frame, lock_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->entrylk, - local->cont.entrylk.volume, &local->loc, - local->cont.entrylk.basename, - local->cont.entrylk.cmd, - local->cont.entrylk.type, - local->cont.entrylk.xdata); - break; + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->entrylk, local->cont.entrylk.volume, + &local->loc, local->cont.entrylk.basename, + local->cont.entrylk.cmd, local->cont.entrylk.type, + local->cont.entrylk.xdata); + break; case GF_FOP_FENTRYLK: - STACK_WIND_COOKIE (frame, lock_cbk, (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fentrylk, - local->cont.entrylk.volume, local->fd, - local->cont.entrylk.basename, - local->cont.entrylk.cmd, - local->cont.entrylk.type, - local->cont.entrylk.xdata); - break; + STACK_WIND_COOKIE( + frame, lock_cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->fentrylk, local->cont.entrylk.volume, + local->fd, local->cont.entrylk.basename, + local->cont.entrylk.cmd, local->cont.entrylk.type, + local->cont.entrylk.xdata); + break; default: - break; - } + break; + } } void -afr_fop_lock_proceed (call_frame_t *frame) +afr_fop_lock_proceed(call_frame_t *frame) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = frame->this->private; - - if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) { - afr_fop_lock_unwind (frame, local->op, local->op_ret, - local->op_errno, local->xdata_rsp); - return; - } - /* At least one child is up */ - /* - * Non-blocking locks also need to be serialized. Otherwise there is - * a chance that both the mounts which issued same non-blocking inodelk - * may endup not acquiring the lock on any-brick. - * Ex: Mount1 and Mount2 - * request for full length lock on file f1. Mount1 afr may acquire the - * partial lock on brick-1 and may not acquire the lock on brick-2 - * because Mount2 already got the lock on brick-2, vice versa. Since - * both the mounts only got partial locks, afr treats them as failure in - * gaining the locks and unwinds with EAGAIN errno. - */ - local->op_ret = -1; - local->op_ret = EUCLEAN; - local->fop_lock_state = AFR_FOP_LOCK_SERIAL; - afr_local_replies_wipe (local, priv); - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); - local->xdata_rsp = NULL; - switch (local->op) { + local = frame->local; + priv = frame->this->private; + + if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) { + afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno, + local->xdata_rsp); + return; + } + /* At least one child is up */ + /* + * Non-blocking locks also need to be serialized. Otherwise there is + * a chance that both the mounts which issued same non-blocking inodelk + * may endup not acquiring the lock on any-brick. + * Ex: Mount1 and Mount2 + * request for full length lock on file f1. Mount1 afr may acquire the + * partial lock on brick-1 and may not acquire the lock on brick-2 + * because Mount2 already got the lock on brick-2, vice versa. Since + * both the mounts only got partial locks, afr treats them as failure in + * gaining the locks and unwinds with EAGAIN errno. + */ + local->op_ret = -1; + local->op_errno = EUCLEAN; + local->fop_lock_state = AFR_FOP_LOCK_SERIAL; + afr_local_replies_wipe(local, priv); + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + switch (local->op) { case GF_FOP_INODELK: case GF_FOP_FINODELK: - local->cont.inodelk.cmd = local->cont.inodelk.in_cmd; - local->cont.inodelk.flock = local->cont.inodelk.in_flock; - if (local->cont.inodelk.xdata) - dict_unref (local->cont.inodelk.xdata); - local->cont.inodelk.xdata = NULL; - if (local->xdata_req) - local->cont.inodelk.xdata = dict_ref (local->xdata_req); - break; + local->cont.inodelk.cmd = local->cont.inodelk.in_cmd; + local->cont.inodelk.flock = local->cont.inodelk.in_flock; + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + local->cont.inodelk.xdata = NULL; + if (local->xdata_req) + local->cont.inodelk.xdata = dict_ref(local->xdata_req); + break; case GF_FOP_ENTRYLK: case GF_FOP_FENTRYLK: - local->cont.entrylk.cmd = local->cont.entrylk.in_cmd; - if (local->cont.entrylk.xdata) - dict_unref (local->cont.entrylk.xdata); - local->cont.entrylk.xdata = NULL; - if (local->xdata_req) - local->cont.entrylk.xdata = dict_ref (local->xdata_req); - break; + local->cont.entrylk.cmd = local->cont.entrylk.in_cmd; + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + local->cont.entrylk.xdata = NULL; + if (local->xdata_req) + local->cont.entrylk.xdata = dict_ref(local->xdata_req); + break; default: - break; - } - afr_serialized_lock_wind (frame, frame->this); + break; + } + afr_serialized_lock_wind(frame, frame->this); } static int32_t -afr_unlock_partial_lock_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *xdata) +afr_unlock_partial_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = -1; - int child_index = (long)cookie; - uuid_t gfid = {0}; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = -1; + int child_index = (long)cookie; + uuid_t gfid = {0}; - local = frame->local; - priv = this->private; - - if (op_ret < 0 && op_errno != ENOTCONN) { - if (local->fd) - gf_uuid_copy (gfid, local->fd->inode->gfid); - else - loc_gfid (&local->loc, gfid); - gf_msg (this->name, GF_LOG_ERROR, op_errno, - AFR_MSG_UNLOCK_FAIL, - "%s: Failed to unlock %s on %s " - "with lk_owner: %s", uuid_utoa (gfid), - gf_fop_list[local->op], - priv->children[child_index]->name, - lkowner_utoa (&frame->root->lk_owner)); - } - - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_fop_lock_proceed (frame); + local = frame->local; + priv = this->private; - return 0; + if (op_ret < 0 && op_errno != ENOTCONN) { + if (local->fd) + gf_uuid_copy(gfid, local->fd->inode->gfid); + else + loc_gfid(&local->loc, gfid); + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "%s: Failed to unlock %s on %s " + "with lk_owner: %s", + uuid_utoa(gfid), gf_fop_list[local->op], + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_fop_lock_proceed(frame); + + return 0; } static int32_t -afr_unlock_locks_and_proceed (call_frame_t *frame, xlator_t *this, +afr_unlock_locks_and_proceed(call_frame_t *frame, xlator_t *this, int call_count) { - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; - if (call_count == 0) { - afr_fop_lock_proceed (frame); - goto out; - } + if (call_count == 0) { + afr_fop_lock_proceed(frame); + goto out; + } - local = frame->local; - priv = this->private; - local->call_count = call_count; - switch (local->op) { + local = frame->local; + priv = this->private; + local->call_count = call_count; + switch (local->op) { case GF_FOP_INODELK: case GF_FOP_FINODELK: - local->cont.inodelk.flock.l_type = F_UNLCK; - local->cont.inodelk.cmd = F_SETLK; - if (local->cont.inodelk.xdata) - dict_unref (local->cont.inodelk.xdata); - local->cont.inodelk.xdata = NULL; - break; + local->cont.inodelk.flock.l_type = F_UNLCK; + local->cont.inodelk.cmd = F_SETLK; + if (local->cont.inodelk.xdata) + dict_unref(local->cont.inodelk.xdata); + local->cont.inodelk.xdata = NULL; + break; case GF_FOP_ENTRYLK: case GF_FOP_FENTRYLK: - local->cont.entrylk.cmd = ENTRYLK_UNLOCK; - if (local->cont.entrylk.xdata) - dict_unref (local->cont.entrylk.xdata); - local->cont.entrylk.xdata = NULL; - break; + local->cont.entrylk.cmd = ENTRYLK_UNLOCK; + if (local->cont.entrylk.xdata) + dict_unref(local->cont.entrylk.xdata); + local->cont.entrylk.xdata = NULL; + break; default: - break; - } + break; + } - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - if (local->replies[i].op_ret == -1) - continue; + if (local->replies[i].op_ret == -1) + continue; - afr_fop_lock_wind (frame, this, i, afr_unlock_partial_lock_cbk); + afr_fop_lock_wind(frame, this, i, afr_unlock_partial_lock_cbk); - if (!--call_count) - break; - } + if (!--call_count) + break; + } out: - return 0; + return 0; } int32_t -afr_fop_lock_done (call_frame_t *frame, xlator_t *this) +afr_fop_lock_done(call_frame_t *frame, xlator_t *this) { - int i = 0; - int lock_count = 0; - unsigned char *success = NULL; - - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - success = alloca0(priv->child_count); + int i = 0; + int lock_count = 0; + unsigned char *success = NULL; - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (local->replies[i].op_ret == 0) { - lock_count++; - success[i] = 1; - } + local = frame->local; + priv = this->private; + success = alloca0(priv->child_count); - if (local->op_ret == -1 && local->op_errno == EAGAIN) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - if ((local->replies[i].op_ret == -1) && - (local->replies[i].op_errno == EAGAIN)) { - local->op_ret = -1; - local->op_errno = EAGAIN; - continue; - } + if (local->replies[i].op_ret == 0) { + lock_count++; + success[i] = 1; + } - if (local->replies[i].op_ret == 0) - local->op_ret = 0; + if (local->op_ret == -1 && local->op_errno == EAGAIN) + continue; - local->op_errno = local->replies[i].op_errno; + if ((local->replies[i].op_ret == -1) && + (local->replies[i].op_errno == EAGAIN)) { + local->op_ret = -1; + local->op_errno = EAGAIN; + continue; } - if (afr_fop_lock_is_unlock (frame)) - goto unwind; + if (local->replies[i].op_ret == 0) + local->op_ret = 0; - if (afr_is_conflicting_lock_present (local->op_ret, local->op_errno)) { - afr_unlock_locks_and_proceed (frame, this, lock_count); - } else if (priv->quorum_count && !afr_has_quorum (success, this)) { - local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED; - local->op_ret = -1; - local->op_errno = afr_final_errno (local, priv); - if (local->op_errno == 0) - local->op_errno = afr_quorum_errno (priv); - afr_unlock_locks_and_proceed (frame, this, lock_count); - } else { - goto unwind; - } + local->op_errno = local->replies[i].op_errno; + } - return 0; + if (afr_fop_lock_is_unlock(frame)) + goto unwind; + + if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) { + afr_unlock_locks_and_proceed(frame, this, lock_count); + } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) { + local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED; + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + if (local->op_errno == 0) + local->op_errno = afr_quorum_errno(priv); + afr_unlock_locks_and_proceed(frame, this, lock_count); + } else { + goto unwind; + } + + return 0; unwind: - afr_fop_lock_unwind (frame, local->op, local->op_ret, - local->op_errno, local->xdata_rsp); - return 0; + afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; } static int -afr_common_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_common_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - int child_index = (long)cookie; + afr_local_t *local = NULL; + int child_index = (long)cookie; - local = frame->local; + local = frame->local; - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - if (op_ret == 0 && xdata) { - local->replies[child_index].xdata = dict_ref (xdata); - LOCK (&frame->lock); - { - if (!local->xdata_rsp) - local->xdata_rsp = dict_ref (xdata); - } - UNLOCK (&frame->lock); + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (op_ret == 0 && xdata) { + local->replies[child_index].xdata = dict_ref(xdata); + LOCK(&frame->lock); + { + if (!local->xdata_rsp) + local->xdata_rsp = dict_ref(xdata); } - return 0; + UNLOCK(&frame->lock); + } + return 0; } static int32_t -afr_serialized_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_serialized_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int child_index = (long)cookie; - int next_child = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = (long)cookie; + int next_child = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - afr_common_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); - for (next_child = child_index + 1; next_child < priv->child_count; - next_child++) { - if (local->child_up[next_child]) - break; - } + for (next_child = child_index + 1; next_child < priv->child_count; + next_child++) { + if (local->child_up[next_child]) + break; + } - if (afr_is_conflicting_lock_present (op_ret, op_errno) || - (next_child == priv->child_count)) { - afr_fop_lock_done (frame, this); - } else { - afr_fop_lock_wind (frame, this, next_child, - afr_serialized_lock_cbk); - } + if (afr_is_conflicting_lock_present(op_ret, op_errno) || + (next_child == priv->child_count)) { + afr_fop_lock_done(frame, this); + } else { + afr_fop_lock_wind(frame, this, next_child, afr_serialized_lock_cbk); + } - return 0; + return 0; } static int -afr_serialized_lock_wind (call_frame_t *frame, xlator_t *this) +afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - afr_fop_lock_wind (frame, this, i, - afr_serialized_lock_cbk); - break; - } + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + afr_fop_lock_wind(frame, this, i, afr_serialized_lock_cbk); + break; } - return 0; + } + return 0; } static int32_t -afr_parallel_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_parallel_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - int call_count = 0; + int call_count = 0; - afr_common_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); - call_count = afr_frame_return (frame); - if (call_count == 0) - afr_fop_lock_done (frame, this); + call_count = afr_frame_return(frame); + if (call_count == 0) + afr_fop_lock_done(frame, this); - return 0; + return 0; } static int -afr_parallel_lock_wind (call_frame_t *frame, xlator_t *this) +afr_parallel_lock_wind(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int call_count = 0; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int i = 0; - priv = this->private; - local = frame->local; - call_count = local->call_count; + priv = this->private; + local = frame->local; + call_count = local->call_count; - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - afr_fop_lock_wind (frame, this, i, afr_parallel_lock_cbk); - if (!--call_count) - break; - } - return 0; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + afr_fop_lock_wind(frame, this, i, afr_parallel_lock_cbk); + if (!--call_count) + break; + } + return 0; } static int -afr_fop_handle_lock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = frame->local; - int op_errno = 0; - - if (!afr_fop_lock_is_unlock (frame)) { - if (!afr_is_consistent_io_possible (local, this->private, - &op_errno)) - goto out; - - switch (local->op) { - case GF_FOP_INODELK: - case GF_FOP_FINODELK: - local->cont.inodelk.cmd = F_SETLK; - break; - case GF_FOP_ENTRYLK: - case GF_FOP_FENTRYLK: - local->cont.entrylk.cmd = ENTRYLK_LOCK_NB; - break; - default: - break; - } +afr_fop_handle_lock(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + int op_errno = 0; + + if (!afr_fop_lock_is_unlock(frame)) { + if (!afr_is_consistent_io_possible(local, this->private, &op_errno)) + goto out; + + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.cmd = F_SETLK; + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.cmd = ENTRYLK_LOCK_NB; + break; + default: + break; } + } - if (local->xdata_req) { - switch (local->op) { - case GF_FOP_INODELK: - case GF_FOP_FINODELK: - local->cont.inodelk.xdata = dict_ref (local->xdata_req); - break; - case GF_FOP_ENTRYLK: - case GF_FOP_FENTRYLK: - local->cont.entrylk.xdata = dict_ref (local->xdata_req); - break; - default: - break; - } + if (local->xdata_req) { + switch (local->op) { + case GF_FOP_INODELK: + case GF_FOP_FINODELK: + local->cont.inodelk.xdata = dict_ref(local->xdata_req); + break; + case GF_FOP_ENTRYLK: + case GF_FOP_FENTRYLK: + local->cont.entrylk.xdata = dict_ref(local->xdata_req); + break; + default: + break; } + } - local->fop_lock_state = AFR_FOP_LOCK_PARALLEL; - afr_parallel_lock_wind (frame, this); + local->fop_lock_state = AFR_FOP_LOCK_PARALLEL; + afr_parallel_lock_wind(frame, this); out: - return -op_errno; + return -op_errno; } static int32_t -afr_handle_inodelk (call_frame_t *frame, glusterfs_fop_t fop, - const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, - struct gf_flock *flock, dict_t *xdata) -{ - afr_local_t *local = NULL; - int32_t op_errno = ENOMEM; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = fop; - if (loc) - loc_copy (&local->loc, loc); - if (fd) - local->fd = fd_ref (fd); - - local->cont.inodelk.volume = gf_strdup (volume); - if (!local->cont.inodelk.volume) { - op_errno = ENOMEM; - goto out; - } - - local->cont.inodelk.in_cmd = cmd; - local->cont.inodelk.cmd = cmd; - local->cont.inodelk.in_flock = *flock; - local->cont.inodelk.flock = *flock; - if (xdata) - local->xdata_req = dict_ref (xdata); - - op_errno = -afr_fop_handle_lock (frame, frame->this); - if (op_errno) - goto out; - return 0; +afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = fop; + if (loc) + loc_copy(&local->loc, loc); + if (fd && (flock->l_type != F_UNLCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local->fd = fd_ref(fd); + } + + local->cont.inodelk.volume = gf_strdup(volume); + if (!local->cont.inodelk.volume) { + op_errno = ENOMEM; + goto out; + } + + local->cont.inodelk.in_cmd = cmd; + local->cont.inodelk.cmd = cmd; + local->cont.inodelk.in_flock = *flock; + local->cont.inodelk.flock = *flock; + if (xdata) + local->xdata_req = dict_ref(xdata); + + op_errno = -afr_fop_handle_lock(frame, frame->this); + if (op_errno) + goto out; + return 0; out: - afr_fop_lock_unwind (frame, fop, -1, op_errno, NULL); + afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL); - return 0; + return 0; } int32_t -afr_inodelk (call_frame_t *frame, xlator_t *this, - const char *volume, loc_t *loc, int32_t cmd, - struct gf_flock *flock, dict_t *xdata) +afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk (frame, GF_FOP_INODELK, volume, loc, NULL, cmd, - flock, xdata); - return 0; + afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd, + flock, xdata); + return 0; } int32_t -afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - int32_t cmd, struct gf_flock *flock, dict_t *xdata) +afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int32_t cmd, struct gf_flock *flock, dict_t *xdata) { - afr_handle_inodelk (frame, GF_FOP_FINODELK, volume, NULL, fd, cmd, - flock, xdata); - return 0; + afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd, + flock, xdata); + return 0; } static int -afr_handle_entrylk (call_frame_t *frame, glusterfs_fop_t fop, - const char *volume, loc_t *loc, fd_t *fd, - const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop, + const char *volume, loc_t *loc, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t op_errno = ENOMEM; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = fop; - if (loc) - loc_copy (&local->loc, loc); - if (fd) - local->fd = fd_ref (fd); - local->cont.entrylk.cmd = cmd; - local->cont.entrylk.in_cmd = cmd; - local->cont.entrylk.type = type; - local->cont.entrylk.volume = gf_strdup (volume); - local->cont.entrylk.basename = gf_strdup (basename); - if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) { - op_errno = ENOMEM; - goto out; - } - if (xdata) - local->xdata_req = dict_ref (xdata); - op_errno = -afr_fop_handle_lock (frame, frame->this); - if (op_errno) - goto out; - - return 0; + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = fop; + if (loc) + loc_copy(&local->loc, loc); + if (fd && (cmd != ENTRYLK_UNLOCK)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local->fd = fd_ref(fd); + } + local->cont.entrylk.cmd = cmd; + local->cont.entrylk.in_cmd = cmd; + local->cont.entrylk.type = type; + local->cont.entrylk.volume = gf_strdup(volume); + local->cont.entrylk.basename = gf_strdup(basename); + if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) { + op_errno = ENOMEM; + goto out; + } + if (xdata) + local->xdata_req = dict_ref(xdata); + op_errno = -afr_fop_handle_lock(frame, frame->this); + if (op_errno) + goto out; + + return 0; out: - afr_fop_lock_unwind (frame, fop, -1, op_errno, NULL); - return 0; + afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL); + return 0; } int -afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume, - loc_t *loc, const char *basename, entrylk_cmd cmd, - entrylk_type type, dict_t *xdata) +afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - afr_handle_entrylk (frame, GF_FOP_ENTRYLK, volume, loc, NULL, basename, - cmd, type, xdata); - return 0; + afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename, + cmd, type, xdata); + return 0; } int -afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, - const char *basename, entrylk_cmd cmd, entrylk_type type, - dict_t *xdata) +afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) { - afr_handle_entrylk (frame, GF_FOP_FENTRYLK, volume, NULL, fd, basename, - cmd, type, xdata); - return 0; + afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename, + cmd, type, xdata); + return 0; } - int -afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, - int op_errno, struct statvfs *statvfs, dict_t *xdata) +afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, struct statvfs *statvfs, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = 0; - struct statvfs *buf = NULL; + afr_local_t *local = NULL; + int call_count = 0; + struct statvfs *buf = NULL; - LOCK (&frame->lock); - { - local = frame->local; - - if (op_ret != 0) { - local->op_errno = op_errno; - goto unlock; - } - - local->op_ret = op_ret; - - buf = &local->cont.statfs.buf; - if (local->cont.statfs.buf_set) { - if (statvfs->f_bavail < buf->f_bavail) { - *buf = *statvfs; - if (xdata) { - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); - local->xdata_rsp = dict_ref (xdata); - } - } - } else { - *buf = *statvfs; - local->cont.statfs.buf_set = 1; - if (xdata) - local->xdata_rsp = dict_ref (xdata); - } + local = frame->local; + + LOCK(&frame->lock); + { + if (op_ret != 0) { + local->op_errno = op_errno; + goto unlock; } -unlock: - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); + local->op_ret = op_ret; - if (call_count == 0) - AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno, - &local->cont.statfs.buf, local->xdata_rsp); + buf = &local->cont.statfs.buf; + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < buf->f_bavail) { + *buf = *statvfs; + if (xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(xdata); + } + } + } else { + *buf = *statvfs; + local->cont.statfs.buf_set = 1; + if (xdata) + local->xdata_rsp = dict_ref(xdata); + } + } +unlock: + call_count = --local->call_count; + UNLOCK(&frame->lock); - return 0; -} + if (call_count == 0) + AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf, local->xdata_rsp); + return 0; +} int -afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +afr_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_STATFS; + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX]) + local->call_count--; + call_count = local->call_count; + if (!call_count) { + op_errno = ENOTCONN; + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + if (AFR_IS_ARBITER_BRICK(priv, i)) + continue; + STACK_WIND(frame, afr_statfs_cbk, priv->children[i], + priv->children[i]->fops->statfs, loc, xdata); + if (!--call_count) + break; + } + } + + return 0; +out: + AFR_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) { - afr_local_t * local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int call_count = 0; - int32_t op_errno = ENOMEM; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + int call_count = -1; + int child_index = (long)cookie; - priv = this->private; + local = frame->local; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } - local->op = GF_FOP_STATFS; - if (!afr_is_consistent_io_possible (local, priv, &op_errno)) - goto out; + call_count = afr_frame_return(frame); + if (call_count == 0) { + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, + local->xdata_rsp); + } - if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX]) - local->call_count--; - call_count = local->call_count; - if (!call_count) { - op_errno = ENOTCONN; - goto out; - } + return 0; +} - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - if (AFR_IS_ARBITER_BRICK(priv, i)) - continue; - STACK_WIND (frame, afr_statfs_cbk, - priv->children[i], - priv->children[i]->fops->statfs, - loc, xdata); - if (!--call_count) - break; - } - } +int32_t +afr_lk_unlock(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; - return 0; -out: - AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL); + local = frame->local; + priv = this->private; + + call_count = afr_locked_nodes_count(local->cont.lk.locked_nodes, + priv->child_count); + if (call_count == 0) { + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL, + local->xdata_rsp); return 0; -} + } + local->call_count = call_count; + + local->cont.lk.user_flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND_COOKIE(frame, afr_lk_unlock_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, + NULL); + + if (!--call_count) + break; + } + } + + return 0; +} int32_t -afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) +afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_flock *lock, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = this->private; - int call_count = -1; - int child_index = (long)cookie; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = -1; - local = frame->local; + local = frame->local; + priv = this->private; - if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_msg (this->name, GF_LOG_ERROR, op_errno, - AFR_MSG_UNLOCK_FAIL, - "gfid=%s: unlock failed on subvolume %s " - "with lock owner %s", - uuid_utoa (local->fd->inode->gfid), - priv->children[child_index]->name, - lkowner_utoa (&frame->root->lk_owner)); - } + child_index = (long)cookie; - call_count = afr_frame_return (frame); - if (call_count == 0) - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - NULL, local->xdata_rsp); + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret < 0 && op_errno == EAGAIN) { + local->op_ret = -1; + local->op_errno = EAGAIN; + afr_lk_unlock(frame, this); return 0; -} + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + + child_index++; + + if (child_index < priv->child_count) { + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + } else if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_lk_unlock(frame, this); + } else { + if (local->op_ret < 0) + local->op_errno = afr_final_errno(local, priv); -int32_t -afr_lk_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t * local = NULL; - afr_private_t * priv = NULL; - int i = 0; - int call_count = 0; + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } - local = frame->local; - priv = this->private; + return 0; +} + +int +afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque) +{ + return 0; +} - call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, - priv->child_count); +int +afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = -1; + + local = frame->local; + child_index = (long)cookie; + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.locked_nodes[child_index] = 1; + local->cont.lk.ret_flock = *lock; + } + syncbarrier_wake(&local->barrier); + return 0; +} - if (call_count == 0) { - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - NULL, local->xdata_rsp); - return 0; +int +afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_flock *lock, + dict_t *xdata) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + int child_index = (long)cookie; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL, + "gfid=%s: unlock failed on subvolume %s " + "with lock owner %s", + uuid_utoa(local->fd->inode->gfid), + priv->children[child_index]->name, + lkowner_utoa(&frame->root->lk_owner)); + } + return 0; +} +int +afr_lk_transaction(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *wind_on = NULL; + int op_errno = 0; + int i = 0; + int ret = 0; + + frame = (call_frame_t *)opaque; + local = frame->local; + this = frame->this; + priv = this->private; + wind_on = alloca0(priv->child_count); + + if (priv->arbiter_count || priv->child_count != 3) { + op_errno = ENOTSUP; + gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM, + "%s: Lock healing supported only for replica 3 volumes.", + uuid_utoa(local->fd->inode->gfid)); + goto err; + } + + op_errno = -afr_dom_lock_acquire(frame); // Released during + // AFR_STACK_UNWIND + if (op_errno != 0) { + goto err; + } + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) { + op_errno = afr_final_errno(local, priv); + goto err; + } + + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i]) + wind_on[i] = 1; + } + AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd, + local->cont.lk.cmd, &local->cont.lk.user_flock, + local->xdata_req); + + if (priv->quorum_count && + !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + goto unlock; + } else { + if (local->cont.lk.user_flock.l_type == F_UNLCK) + ret = afr_remove_lock_from_saved_locks(local, this); + else + ret = afr_add_lock_to_saved_locks(frame, this); + if (ret) { + local->op_ret = -1; + local->op_errno = -ret; + goto unlock; } + AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, + &local->cont.lk.ret_flock, local->xdata_rsp); + } - local->call_count = call_count; + return 0; - local->cont.lk.user_flock.l_type = F_UNLCK; +unlock: + local->cont.lk.user_flock.l_type = F_UNLCK; + AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk, + local->fd, F_SETLK, &local->cont.lk.user_flock, NULL); +err: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + return -1; +} - for (i = 0; i < priv->child_count; i++) { - if (local->cont.lk.locked_nodes[i]) { - STACK_WIND_COOKIE (frame, afr_lk_unlock_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->lk, - local->fd, F_SETLK, - &local->cont.lk.user_flock, NULL); - - if (!--call_count) - break; - } +int +afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd, + struct gf_flock *flock, dict_t *xdata) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = 0; + int i = 0; + int32_t op_errno = ENOMEM; + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_LK; + if (!afr_lk_is_unlock(cmd, flock)) { + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + } + + local->cont.lk.locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lk.locked_nodes), + gf_afr_mt_char); + + if (!local->cont.lk.locked_nodes) { + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref(fd); + local->cont.lk.cmd = cmd; + local->cont.lk.user_flock = *flock; + local->cont.lk.ret_flock = *flock; + if (xdata) + local->xdata_req = dict_ref(xdata); + + if (afr_is_lock_mode_mandatory(xdata)) { + ret = synctask_new(this->ctx->env, afr_lk_transaction, + afr_lk_transaction_cbk, frame, frame); + if (ret) { + op_errno = ENOMEM; + goto out; } - return 0; + } + + STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i], + priv->children[i]->fops->lk, fd, cmd, flock, + local->xdata_req); + + return 0; +out: + AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL); + + return 0; } +int32_t +afr_lease_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct gf_lease *lease, + dict_t *xdata) +{ + afr_local_t *local = NULL; + int call_count = -1; + + local = frame->local; + call_count = afr_frame_return(frame); + + if (call_count == 0) + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, lease, + xdata); + + return 0; +} int32_t -afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct gf_flock *lock, - dict_t *xdata) +afr_lease_unlock(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int child_index = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + local = frame->local; + priv = this->private; - local = frame->local; - priv = this->private; + call_count = afr_locked_nodes_count(local->cont.lease.locked_nodes, + priv->child_count); - child_index = (long) cookie; + if (call_count == 0) { + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, + &local->cont.lease.ret_lease, NULL); + return 0; + } - afr_common_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); - if (op_ret < 0 && op_errno == EAGAIN) { - local->op_ret = -1; - local->op_errno = EAGAIN; + local->call_count = call_count; - afr_lk_unlock (frame, this); - return 0; - } + local->cont.lease.user_lease.cmd = GF_UNLK_LEASE; - if (op_ret == 0) { - local->op_ret = 0; - local->op_errno = 0; - local->cont.lk.locked_nodes[child_index] = 1; - local->cont.lk.ret_flock = *lock; - } - - child_index++; - - if (child_index < priv->child_count) { - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->lk, - local->fd, local->cont.lk.cmd, - &local->cont.lk.user_flock, - local->xdata_req); - } else if (priv->quorum_count && - !afr_has_quorum (local->cont.lk.locked_nodes, this)) { - local->op_ret = -1; - local->op_errno = afr_final_errno (local, priv); - - afr_lk_unlock (frame, this); - } else { - if (local->op_ret < 0) - local->op_errno = afr_final_errno (local, priv); + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lease.locked_nodes[i]) { + STACK_WIND(frame, afr_lease_unlock_cbk, priv->children[i], + priv->children[i]->fops->lease, &local->loc, + &local->cont.lease.user_lease, NULL); - AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno, - &local->cont.lk.ret_flock, local->xdata_rsp); + if (!--call_count) + break; } + } + + return 0; +} +int32_t +afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct gf_lease *lease, dict_t *xdata) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long)cookie; + + afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata); + if (op_ret < 0 && op_errno == EAGAIN) { + local->op_ret = -1; + local->op_errno = EAGAIN; + + afr_lease_unlock(frame, this); return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lease.locked_nodes[child_index] = 1; + local->cont.lease.ret_lease = *lease; + } + + child_index++; + if (child_index < priv->child_count) { + STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)child_index, + priv->children[child_index], + priv->children[child_index]->fops->lease, &local->loc, + &local->cont.lease.user_lease, xdata); + } else if (priv->quorum_count && + !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + + afr_lease_unlock(frame, this); + } else { + if (local->op_ret < 0) + local->op_errno = afr_final_errno(local, priv); + AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, + &local->cont.lease.ret_lease, NULL); + } + + return 0; } int -afr_lk (call_frame_t *frame, xlator_t *this, - fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata) +afr_lease(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct gf_lease *lease, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int32_t op_errno = ENOMEM; - - priv = this->private; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int32_t op_errno = ENOMEM; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + priv = this->private; - local->op = GF_FOP_LK; - if (!afr_lk_is_unlock (cmd, flock) && - !afr_is_consistent_io_possible (local, priv, &op_errno)) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count, - sizeof (*local->cont.lk.locked_nodes), - gf_afr_mt_char); + local->op = GF_FOP_LEASE; + local->cont.lease.locked_nodes = GF_CALLOC( + priv->child_count, sizeof(*local->cont.lease.locked_nodes), + gf_afr_mt_char); - if (!local->cont.lk.locked_nodes) { - op_errno = ENOMEM; - goto out; - } + if (!local->cont.lease.locked_nodes) { + op_errno = ENOMEM; + goto out; + } - local->fd = fd_ref (fd); - local->cont.lk.cmd = cmd; - local->cont.lk.user_flock = *flock; - local->cont.lk.ret_flock = *flock; - if (xdata) - local->xdata_req = dict_ref (xdata); + loc_copy(&local->loc, loc); + local->cont.lease.user_lease = *lease; + local->cont.lease.ret_lease = *lease; - STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, - priv->children[i], - priv->children[i]->fops->lk, - fd, cmd, flock, local->xdata_req); + STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)0, priv->children[0], + priv->children[0]->fops->lease, loc, lease, xdata); - return 0; + return 0; out: - AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } int -afr_ipc_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - int child_index = (long)cookie; - int call_count = 0; - gf_boolean_t failed = _gf_false; - gf_boolean_t succeded = _gf_false; - int i = 0; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - if (xdata) - local->replies[child_index].xdata = dict_ref (xdata); - - call_count = afr_frame_return (frame); - if (call_count) - goto out; - /* If any of the subvolumes failed with other than ENOTCONN - * return error else return success unless all the subvolumes - * failed. - * TODO: In case of failure, we need to unregister the xattrs - * from the other subvolumes where it succeded (once upcall - * fixes the Bz-1371622)*/ - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret < 0 && - local->replies[i].op_errno != ENOTCONN) { - local->op_ret = local->replies[i].op_ret; - local->op_errno = local->replies[i].op_errno; - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); - local->xdata_rsp = NULL; - if (local->replies[i].xdata) { - local->xdata_rsp = - dict_ref (local->replies[i].xdata); - } - failed = _gf_true; - break; - } - if (local->replies[i].op_ret == 0) { - succeded = _gf_true; - local->op_ret = 0; - local->op_errno = 0; - if (!local->xdata_rsp && local->replies[i].xdata) { - local->xdata_rsp = - dict_ref (local->replies[i].xdata); - } - } +afr_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = 0; + gf_boolean_t failed = _gf_false; + gf_boolean_t succeeded = _gf_false; + int i = 0; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + + call_count = afr_frame_return(frame); + if (call_count) + goto out; + /* If any of the subvolumes failed with other than ENOTCONN + * return error else return success unless all the subvolumes + * failed. + * TODO: In case of failure, we need to unregister the xattrs + * from the other subvolumes where it succeeded (once upcall + * fixes the Bz-1371622)*/ + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno != ENOTCONN) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + if (local->replies[i].xdata) { + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + failed = _gf_true; + break; + } + if (local->replies[i].op_ret == 0) { + succeeded = _gf_true; + local->op_ret = 0; + local->op_errno = 0; + if (!local->xdata_rsp && local->replies[i].xdata) { + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } } + } - if (!succeded && !failed) { - local->op_ret = -1; - local->op_errno = ENOTCONN; - } + if (!succeeded && !failed) { + local->op_ret = -1; + local->op_errno = ENOTCONN; + } - AFR_STACK_UNWIND (ipc, frame, local->op_ret, local->op_errno, - local->xdata_rsp); + AFR_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, + local->xdata_rsp); out: - return 0; + return 0; } int -afr_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) +afr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata) { - afr_local_t *local = NULL; - int32_t op_errno = -1; - afr_private_t *priv = NULL; - int i = 0; - int call_cnt = -1; + afr_local_t *local = NULL; + int32_t op_errno = -1; + afr_private_t *priv = NULL; + int i = 0; + int call_cnt = -1; - VALIDATE_OR_GOTO (frame, err); - VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO(frame, err); + VALIDATE_OR_GOTO(this, err); - if (op != GF_IPC_TARGET_UPCALL) - goto wind_default; + if (op != GF_IPC_TARGET_UPCALL) + goto wind_default; - VALIDATE_OR_GOTO (this->private, err); - priv = this->private; + VALIDATE_OR_GOTO(this->private, err); + priv = this->private; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto err; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto err; - call_cnt = local->call_count; + call_cnt = local->call_count; - if (xdata) { - for (i = 0; i < priv->child_count; i++) { - if (dict_set_int8 (xdata, priv->pending_key[i], 0) < 0) - goto err; - } + if (xdata) { + for (i = 0; i < priv->child_count; i++) { + if (dict_set_int8(xdata, priv->pending_key[i], 0) < 0) + goto err; } + } - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; - STACK_WIND_COOKIE (frame, afr_ipc_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->ipc, - op, xdata); - if (!--call_cnt) - break; - } - return 0; + STACK_WIND_COOKIE(frame, afr_ipc_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->ipc, op, + xdata); + if (!--call_cnt) + break; + } + return 0; err: - if (op_errno == -1) - op_errno = errno; - AFR_STACK_UNWIND (ipc, frame, -1, op_errno, NULL); + if (op_errno == -1) + op_errno = errno; + AFR_STACK_UNWIND(ipc, frame, -1, op_errno, NULL); - return 0; + return 0; wind_default: - STACK_WIND (frame, default_ipc_cbk, FIRST_CHILD (this), - FIRST_CHILD (this)->fops->ipc, op, xdata); - return 0; + STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ipc, op, xdata); + return 0; } int -afr_forget (xlator_t *this, inode_t *inode) +afr_forget(xlator_t *this, inode_t *inode) { - uint64_t ctx_int = 0; - afr_inode_ctx_t *ctx = NULL; + uint64_t ctx_int = 0; + afr_inode_ctx_t *ctx = NULL; - afr_spb_choice_timeout_cancel (this, inode); - inode_ctx_del (inode, this, &ctx_int); - if (!ctx_int) - return 0; - - ctx = (afr_inode_ctx_t *)ctx_int; - GF_FREE (ctx); + afr_spb_choice_timeout_cancel(this, inode); + inode_ctx_del(inode, this, &ctx_int); + if (!ctx_int) return 0; + + ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int; + afr_inode_ctx_destroy(ctx); + return 0; } int -afr_priv_dump (xlator_t *this) -{ - afr_private_t *priv = NULL; - char key_prefix[GF_DUMP_MAX_BUF_LEN]; - char key[GF_DUMP_MAX_BUF_LEN]; - int i = 0; - - - GF_ASSERT (this); - priv = this->private; - - GF_ASSERT (priv); - snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); - gf_proc_dump_add_section(key_prefix); - gf_proc_dump_write("child_count", "%u", priv->child_count); - for (i = 0; i < priv->child_count; i++) { - sprintf (key, "child_up[%d]", i); - gf_proc_dump_write(key, "%d", priv->child_up[i]); - sprintf (key, "pending_key[%d]", i); - gf_proc_dump_write(key, "%s", priv->pending_key[i]); - } - gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); - gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); - gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); - gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); - gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); - gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); - gf_proc_dump_write("read_child", "%d", priv->read_child); - gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); - gf_proc_dump_write("wait_count", "%u", priv->wait_count); - gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads); - gf_proc_dump_write("heal-wait-queue-length", "%d", - priv->heal_wait_qlen); - gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters); - gf_proc_dump_write("background-self-heal-count", "%d", - priv->background_self_heal_count); - gf_proc_dump_write("healers", "%d", priv->healers); - if (priv->quorum_count == AFR_QUORUM_AUTO) { - gf_proc_dump_write ("quorum-type", "auto"); - } else if (priv->quorum_count == 0) { - gf_proc_dump_write ("quorum-type", "none"); - } else { - gf_proc_dump_write("quorum-type", "fixed"); - gf_proc_dump_write("quorum-count", "%d", priv->quorum_count); - } - - return 0; +afr_priv_dump(xlator_t *this) +{ + afr_private_t *priv = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN]; + char key[GF_DUMP_MAX_BUF_LEN]; + int i = 0; + + GF_ASSERT(this); + priv = this->private; + + GF_ASSERT(priv); + snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name); + gf_proc_dump_add_section("%s", key_prefix); + gf_proc_dump_write("child_count", "%u", priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sprintf(key, "child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->child_up[i]); + sprintf(key, "pending_key[%d]", i); + gf_proc_dump_write(key, "%s", priv->pending_key[i]); + sprintf(key, "pending_reads[%d]", i); + gf_proc_dump_write(key, "%" PRId64, + GF_ATOMIC_GET(priv->pending_reads[i])); + sprintf(key, "child_latency[%d]", i); + gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]); + sprintf(key, "halo_child_up[%d]", i); + gf_proc_dump_write(key, "%d", priv->halo_child_up[i]); + } + gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("wait_count", "%u", priv->wait_count); + gf_proc_dump_write("heal-wait-queue-length", "%d", priv->heal_wait_qlen); + gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters); + gf_proc_dump_write("background-self-heal-count", "%d", + priv->background_self_heal_count); + gf_proc_dump_write("healers", "%d", priv->healers); + gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode); + gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode); + if (priv->quorum_count == AFR_QUORUM_AUTO) { + gf_proc_dump_write("quorum-type", "auto"); + } else if (priv->quorum_count == 0) { + gf_proc_dump_write("quorum-type", "none"); + } else { + gf_proc_dump_write("quorum-type", "fixed"); + gf_proc_dump_write("quorum-count", "%d", priv->quorum_count); + } + gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL)); + if (priv->thin_arbiter_count) { + gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up); + gf_proc_dump_write("ta_bad_child_index", "%d", + priv->ta_bad_child_index); + gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64, + priv->ta_notify_dom_lock_offset); + } + + return 0; } - /** * find_child_index - find the child's index in the array of subvolumes * @this: AFR @@ -4697,1791 +5723,2156 @@ afr_priv_dump (xlator_t *this) */ static int -find_child_index (xlator_t *this, xlator_t *child) +afr_find_child_index(xlator_t *this, xlator_t *child) { - afr_private_t *priv = NULL; - int i = -1; + afr_private_t *priv = NULL; + int child_count = -1; + int i = -1; - priv = this->private; + priv = this->private; + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } - for (i = 0; i < priv->child_count; i++) { - if ((xlator_t *) child == priv->children[i]) - break; - } + for (i = 0; i < child_count; i++) { + if ((xlator_t *)child == priv->children[i]) + break; + } - return i; + return i; } -static int -__afr_get_up_children_count (afr_private_t *priv) +int +__afr_get_up_children_count(afr_private_t *priv) { - int up_children = 0; - int i = 0; + int up_children = 0; + int i = 0; - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 1) - up_children++; + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 1) + up_children++; - return up_children; + return up_children; } -glusterfs_event_t -__afr_transform_event_from_state (afr_private_t *priv) +static int +__get_heard_from_all_status(xlator_t *this) { - int i = 0; - int up_children = 0; - - if (AFR_COUNT (priv->last_event, priv->child_count) == - priv->child_count) - /* have_heard_from_all. Let afr_notify() do the propagation. */ - return GF_EVENT_MAXVAL; + afr_private_t *priv = this->private; + int i; - up_children = __afr_get_up_children_count (priv); - /* Treat the children with pending notification, as having sent a - * GF_EVENT_CHILD_DOWN. i.e. set the event as GF_EVENT_SOME_DESCENDENT_DOWN, - * as done in afr_notify() */ - for (i = 0; i < priv->child_count; i++) { - if (priv->last_event[i]) - continue; - priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN; - priv->child_up[i] = 0; + for (i = 0; i < priv->child_count; i++) { + if (!priv->last_event[i]) { + return 0; } + } + if (priv->thin_arbiter_count && !priv->ta_child_up) { + return 0; + } + return 1; +} - if (up_children) - /* We received at least one child up */ - return GF_EVENT_CHILD_UP; - else - return GF_EVENT_CHILD_DOWN; +glusterfs_event_t +__afr_transform_event_from_state(xlator_t *this) +{ + int i = 0; + int up_children = 0; + afr_private_t *priv = this->private; + if (__get_heard_from_all_status(this)) + /* have_heard_from_all. Let afr_notify() do the propagation. */ return GF_EVENT_MAXVAL; -} -static void -afr_notify_cbk (void *data) -{ - xlator_t *this = data; - afr_private_t *priv = this->private; - glusterfs_event_t event = GF_EVENT_MAXVAL; - gf_boolean_t propagate = _gf_false; + up_children = __afr_get_up_children_count(priv); + /* Treat the children with pending notification, as having sent a + * GF_EVENT_CHILD_DOWN. i.e. set the event as GF_EVENT_SOME_DESCENDENT_DOWN, + * as done in afr_notify() */ + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i]) + continue; + priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN; + priv->child_up[i] = 0; + } - LOCK (&priv->lock); - { - if (!priv->timer) { - /* - * Either child_up/child_down is already sent to parent. - * This is a spurious wake up. - */ - goto unlock; - } - priv->timer = NULL; - event = __afr_transform_event_from_state (priv); - if (event != GF_EVENT_MAXVAL) - propagate = _gf_true; - } -unlock: - UNLOCK (&priv->lock); - if (propagate) - default_notify (this, event, NULL); + if (up_children) + /* We received at least one child up */ + return GF_EVENT_CHILD_UP; + else + return GF_EVENT_CHILD_DOWN; + + return GF_EVENT_MAXVAL; } static void -__afr_launch_notify_timer (xlator_t *this, afr_private_t *priv) -{ - - struct timespec delay = {0, }; - - gf_msg_debug (this->name, 0, "Initiating child-down timer"); - delay.tv_sec = 10; - delay.tv_nsec = 0; - priv->timer = gf_timer_call_after (this->ctx, delay, - afr_notify_cbk, this); - if (priv->timer == NULL) { - gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL, - "Cannot create timer for delayed initialization"); - } +afr_notify_cbk(void *data) +{ + xlator_t *this = data; + afr_private_t *priv = this->private; + glusterfs_event_t event = GF_EVENT_MAXVAL; + gf_boolean_t propagate = _gf_false; + + LOCK(&priv->lock); + { + if (!priv->timer) { + /* + * Either child_up/child_down is already sent to parent. + * This is a spurious wake up. + */ + goto unlock; + } + priv->timer = NULL; + event = __afr_transform_event_from_state(this); + if (event != GF_EVENT_MAXVAL) + propagate = _gf_true; + } +unlock: + UNLOCK(&priv->lock); + if (propagate) + default_notify(this, event, NULL); } -int -__get_heard_from_all_status (xlator_t *this) +static void +__afr_launch_notify_timer(xlator_t *this, afr_private_t *priv) { - afr_private_t *priv = this->private; - int heard_from_all = 1; - int i = 0; + struct timespec delay = { + 0, + }; - for (i = 0; i < priv->child_count; i++) { - if (!priv->last_event[i]) { - heard_from_all = 0; - break; - } - } - return heard_from_all; + gf_msg_debug(this->name, 0, "Initiating child-down timer"); + delay.tv_sec = 10; + delay.tv_nsec = 0; + priv->timer = gf_timer_call_after(this->ctx, delay, afr_notify_cbk, this); + if (priv->timer == NULL) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL, + "Cannot create timer for delayed initialization"); + } } static int -find_best_down_child (xlator_t *this) +find_best_down_child(xlator_t *this) { - afr_private_t *priv = NULL; - int i = -1; - int32_t best_child = -1; - int64_t best_latency = INT64_MAX; + afr_private_t *priv = NULL; + int i = -1; + int32_t best_child = -1; + int64_t best_latency = INT64_MAX; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i] && - priv->child_latency[i] >= 0 && - priv->child_latency[i] < best_latency) { - best_child = i; - best_latency = priv->child_latency[i]; - } + for (i = 0; i < priv->child_count; i++) { + if (!priv->child_up[i] && priv->child_latency[i] >= 0 && + priv->child_latency[i] < best_latency) { + best_child = i; + best_latency = priv->child_latency[i]; } - if (best_child >= 0) { - gf_msg_debug (this->name, 0, "Found best down child (%d) " - "@ %ld ms latency", best_child, best_latency); - } - return best_child; + } + if (best_child >= 0) { + gf_msg_debug(this->name, 0, + "Found best down child (%d) @ %" PRId64 " ms latency", + best_child, best_latency); + } + return best_child; } int -find_worst_up_child (xlator_t *this) +find_worst_up_child(xlator_t *this) { - afr_private_t *priv = NULL; - int i = -1; - int32_t worst_child = -1; - int64_t worst_latency = INT64_MIN; + afr_private_t *priv = NULL; + int i = -1; + int32_t worst_child = -1; + int64_t worst_latency = INT64_MIN; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (priv->child_up[i] && - priv->child_latency[i] >= 0 && - priv->child_latency[i] > worst_latency) { - worst_child = i; - worst_latency = priv->child_latency[i]; - } + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i] && priv->child_latency[i] >= 0 && + priv->child_latency[i] > worst_latency) { + worst_child = i; + worst_latency = priv->child_latency[i]; } - if (worst_child >= 0) { - gf_msg_debug (this->name, 0, "Found worst up child (%d)" - " @ %ld ms latency", worst_child, worst_latency); - } - return worst_child; + } + if (worst_child >= 0) { + gf_msg_debug(this->name, 0, + "Found worst up child (%d) @ %" PRId64 " ms latency", + worst_child, worst_latency); + } + return worst_child; } void -__afr_handle_ping_event (xlator_t *this, xlator_t *child_xlator, - const int idx, int64_t halo_max_latency_msec, int32_t *event, - int64_t child_latency_msec) +__afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx, + int64_t halo_max_latency_msec, int32_t *event, + int64_t child_latency_msec) { - afr_private_t *priv = NULL; - int up_children = 0; + afr_private_t *priv = NULL; + int up_children = 0; - priv = this->private; + priv = this->private; - priv->child_latency[idx] = child_latency_msec; - gf_msg_debug (child_xlator->name, 0, "Client ping @ %ld ms", - child_latency_msec); - - up_children = __afr_get_up_children_count (priv); - - if (child_latency_msec > halo_max_latency_msec && - priv->child_up[idx] == 1 && - up_children > priv->halo_min_replicas) { - if ((up_children - 1) < - priv->halo_min_replicas) { - gf_log (child_xlator->name, GF_LOG_INFO, - "Overriding halo threshold, " - "min replicas: %d", - priv->halo_min_replicas); - } else { - gf_log (child_xlator->name, GF_LOG_INFO, - "Child latency (%ld ms) " - "exceeds halo threshold (%ld), " - "marking child down.", - child_latency_msec, - halo_max_latency_msec); - *event = GF_EVENT_CHILD_DOWN; - } - } else if (child_latency_msec < halo_max_latency_msec && - priv->child_up[idx] == 0) { - if (up_children < priv->halo_max_replicas) { - gf_log (child_xlator->name, GF_LOG_INFO, - "Child latency (%ld ms) " - "below halo threshold (%ld), " - "marking child up.", - child_latency_msec, - halo_max_latency_msec); - *event = GF_EVENT_CHILD_UP; - } else { - gf_log (child_xlator->name, GF_LOG_INFO, - "Not marking child %d up, " - "max replicas (%d) reached.", idx, - priv->halo_max_replicas); - } + priv->child_latency[idx] = child_latency_msec; + gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms", + child_latency_msec); + if (priv->shd.iamshd) + return; + + up_children = __afr_get_up_children_count(priv); + + if (child_latency_msec > halo_max_latency_msec && + priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) { + if ((up_children - 1) < priv->halo_min_replicas) { + gf_log(child_xlator->name, GF_LOG_INFO, + "Overriding halo threshold, " + "min replicas: %d", + priv->halo_min_replicas); + } else { + gf_log(child_xlator->name, GF_LOG_INFO, + "Child latency (%" PRId64 + " ms) " + "exceeds halo threshold (%" PRId64 + "), " + "marking child down.", + child_latency_msec, halo_max_latency_msec); + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_DOWN; + } + } + } else if (child_latency_msec < halo_max_latency_msec && + priv->child_up[idx] == 0) { + if (up_children < priv->halo_max_replicas) { + gf_log(child_xlator->name, GF_LOG_INFO, + "Child latency (%" PRId64 + " ms) " + "below halo threshold (%" PRId64 + "), " + "marking child up.", + child_latency_msec, halo_max_latency_msec); + if (priv->halo_child_up[idx]) { + *event = GF_EVENT_CHILD_UP; + } + } else { + gf_log(child_xlator->name, GF_LOG_INFO, + "Not marking child %d up, " + "max replicas (%d) reached.", + idx, priv->halo_max_replicas); } + } } -void -__afr_handle_child_up_event (xlator_t *this, xlator_t *child_xlator, - const int idx, int64_t halo_max_latency_msec, - int32_t *event, int32_t *call_psh, int32_t *up_child) +static int64_t +afr_get_halo_latency(xlator_t *this) { - afr_private_t *priv = NULL; - int up_children = 0; - int worst_up_child = -1; - - priv = this->private; + afr_private_t *priv = NULL; + int64_t halo_max_latency_msec = 0; - /* - * This only really counts if the child was never up - * (value = -1) or had been down (value = 0). See - * comment at GF_EVENT_CHILD_DOWN for a more detailed - * explanation. - */ - if (priv->child_up[idx] != 1) { - priv->event_generation++; - } - priv->child_up[idx] = 1; + priv = this->private; - *call_psh = 1; - *up_child = idx; - up_children = __afr_get_up_children_count (priv); + if (priv->shd.iamshd) { + halo_max_latency_msec = priv->shd.halo_max_latency_msec; + } else if (priv->nfsd.iamnfsd) { + halo_max_latency_msec = priv->nfsd.halo_max_latency_msec; + } else { + halo_max_latency_msec = priv->halo_max_latency_msec; + } + gf_msg_debug(this->name, 0, "Using halo latency %" PRId64, + halo_max_latency_msec); + return halo_max_latency_msec; +} - /* - * Handle the edge case where we exceed - * halo_min_replicas and we've got a child which is - * marked up as it was helping to satisfy the - * halo_min_replicas even though it's latency exceeds - * halo_max_latency_msec. - */ - if (up_children > priv->halo_min_replicas) { - worst_up_child = find_worst_up_child (this); - if (worst_up_child >= 0 && - priv->child_latency[worst_up_child] > - halo_max_latency_msec) { - gf_msg_debug (this->name, 0, "Marking child %d down, " - "doesn't meet halo threshold (%ld), and > " - "halo_min_replicas (%d)", - worst_up_child, halo_max_latency_msec, - priv->halo_min_replicas); - priv->child_up[worst_up_child] = 0; - up_children--; - } - } - if (up_children > priv->halo_max_replicas && - !priv->shd.iamshd) { - worst_up_child = find_worst_up_child (this); - if (worst_up_child < 0) { - worst_up_child = idx; - } - priv->child_up[worst_up_child] = 0; - up_children--; - gf_msg_debug (this->name, 0, "Marking child %d down, " - "up_children (%d) > halo_max_replicas (%d)", - worst_up_child, up_children, priv->halo_max_replicas); - } - - if (up_children == 1) { - gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, - "Subvolume '%s' came back up; " - "going online.", - child_xlator->name); - gf_event (EVENT_AFR_SUBVOL_UP, "subvol=%s", this->name); - } else { - *event = GF_EVENT_SOME_DESCENDENT_UP; - } +void +__afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator, + const int idx, int64_t child_latency_msec, + int32_t *event, int32_t *call_psh, + int32_t *up_child) +{ + afr_private_t *priv = NULL; + int up_children = 0; + int worst_up_child = -1; + int64_t halo_max_latency_msec = afr_get_halo_latency(this); + + priv = this->private; + + /* + * This only really counts if the child was never up + * (value = -1) or had been down (value = 0). See + * comment at GF_EVENT_CHILD_DOWN for a more detailed + * explanation. + */ + if (priv->child_up[idx] != 1) { + priv->event_generation++; + } + priv->child_up[idx] = 1; + + *call_psh = 1; + *up_child = idx; + up_children = __afr_get_up_children_count(priv); + /* + * If this is an _actual_ CHILD_UP event, we + * want to set the child_latency to MAX to indicate + * the child needs ping data to be available before doing child-up + */ + if (!priv->halo_enabled) + goto out; + + if (child_latency_msec < 0) { + /*set to INT64_MAX-1 so that it is found for best_down_child*/ + priv->halo_child_up[idx] = 1; + if (priv->child_latency[idx] < 0) { + priv->child_latency[idx] = AFR_HALO_MAX_LATENCY; + } + } + + /* + * Handle the edge case where we exceed + * halo_min_replicas and we've got a child which is + * marked up as it was helping to satisfy the + * halo_min_replicas even though it's latency exceeds + * halo_max_latency_msec. + */ + if (up_children > priv->halo_min_replicas) { + worst_up_child = find_worst_up_child(this); + if (worst_up_child >= 0 && + priv->child_latency[worst_up_child] > halo_max_latency_msec) { + gf_msg_debug(this->name, 0, + "Marking child %d down, " + "doesn't meet halo threshold (%" PRId64 + "), and > " + "halo_min_replicas (%d)", + worst_up_child, halo_max_latency_msec, + priv->halo_min_replicas); + priv->child_up[worst_up_child] = 0; + up_children--; + } + } + + if (up_children > priv->halo_max_replicas && !priv->shd.iamshd) { + worst_up_child = find_worst_up_child(this); + if (worst_up_child < 0) { + worst_up_child = idx; + } + priv->child_up[worst_up_child] = 0; + up_children--; + gf_msg_debug(this->name, 0, + "Marking child %d down, " + "up_children (%d) > halo_max_replicas (%d)", + worst_up_child, up_children, priv->halo_max_replicas); + } +out: + if (up_children == 1) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP, + "Subvolume '%s' came back up; " + "going online.", + child_xlator->name); + gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_UP; + } - priv->last_event[idx] = *event; + priv->last_event[idx] = *event; } void -__afr_handle_child_down_event (xlator_t *this, xlator_t *child_xlator, - int idx, int64_t child_latency_msec, int32_t *event, - int32_t *call_psh, int32_t *up_child) -{ - afr_private_t *priv = NULL; - int i = 0; - int up_children = 0; - int down_children = 0; - int best_down_child = -1; - - priv = this->private; - - /* - * If a brick is down when we start, we'll get a - * CHILD_DOWN to indicate its initial state. There - * was never a CHILD_UP in this case, so if we - * increment "down_count" the difference between than - * and "up_count" will no longer be the number of - * children that are currently up. This has serious - * implications e.g. for quorum enforcement, so we - * don't increment these values unless the event - * represents an actual state transition between "up" - * (value = 1) and anything else. - */ - if (priv->child_up[idx] == 1) { - priv->event_generation++; - } +__afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx, + int64_t child_latency_msec, int32_t *event, + int32_t *call_psh, int32_t *up_child) +{ + afr_private_t *priv = NULL; + int i = 0; + int up_children = 0; + int down_children = 0; + int best_down_child = -1; + + priv = this->private; + + /* + * If a brick is down when we start, we'll get a + * CHILD_DOWN to indicate its initial state. There + * was never a CHILD_UP in this case, so if we + * increment "down_count" the difference between than + * and "up_count" will no longer be the number of + * children that are currently up. This has serious + * implications e.g. for quorum enforcement, so we + * don't increment these values unless the event + * represents an actual state transition between "up" + * (value = 1) and anything else. + */ + if (priv->child_up[idx] == 1) { + priv->event_generation++; + } + + /* + * If this is an _actual_ CHILD_DOWN event, we + * want to set the child_latency to < 0 to indicate + * the child is really disconnected. + */ + if (child_latency_msec < 0) { + priv->child_latency[idx] = child_latency_msec; + priv->halo_child_up[idx] = 0; + } + priv->child_up[idx] = 0; + + up_children = __afr_get_up_children_count(priv); + /* + * Handle the edge case where we need to find the + * next best child (to mark up) as marking this child + * down would cause us to fall below halo_min_replicas. + * We will also force the SHD to heal this child _now_ + * as we want it to be up to date if we are going to + * begin using it synchronously. + */ + if (priv->halo_enabled && up_children < priv->halo_min_replicas) { + best_down_child = find_best_down_child(this); + if (best_down_child >= 0) { + gf_msg_debug(this->name, 0, + "Swapping out child %d for " + "child %d to satisfy halo_min_replicas (%d).", + idx, best_down_child, priv->halo_min_replicas); + priv->child_up[best_down_child] = 1; + *call_psh = 1; + *up_child = best_down_child; + } + } + for (i = 0; i < priv->child_count; i++) + if (priv->child_up[i] == 0) + down_children++; + if (down_children == priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN, + "All subvolumes are down. Going " + "offline until at least one of them " + "comes back up."); + gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } else { + *event = GF_EVENT_SOME_DESCENDENT_DOWN; + } + priv->last_event[idx] = *event; +} - /* - * If this is an _actual_ CHILD_DOWN event, we - * want to set the child_latency to < 0 to indicate - * the child is really disconnected. - */ - if (child_latency_msec < 0) { - priv->child_latency[idx] = child_latency_msec; - } - priv->child_up[idx] = 0; +void +afr_ta_lock_release_synctask(xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + int ret = 0; - up_children = __afr_get_up_children_count (priv); - /* - * Handle the edge case where we need to find the - * next best child (to mark up) as marking this child - * down would cause us to fall below halo_min_replicas. - * We will also force the SHD to heal this child _now_ - * as we want it to be up to date if we are going to - * begin using it synchronously. - */ - if (up_children < priv->halo_min_replicas) { - best_down_child = find_best_down_child (this); - if (best_down_child >= 0) { - gf_msg_debug (this->name, 0, - "Swapping out child %d for " - "child %d to satisfy halo_min_replicas (%d).", - idx, best_down_child, priv->halo_min_replicas); - priv->child_up[best_down_child] = 1; - *call_psh = 1; - *up_child = best_down_child; - } - } + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + return; + } - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i] == 0) - down_children++; - if (down_children == priv->child_count) { - gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN, - "All subvolumes are down. Going " - "offline until atleast one of them " - "comes back up."); - gf_event (EVENT_AFR_SUBVOLS_DOWN, "subvol=%s", this->name); - } else { - *event = GF_EVENT_SOME_DESCENDENT_DOWN; - } - priv->last_event[idx] = *event; + ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta, + afr_ta_lock_release_done, ta_frame, this); + if (ret) { + STACK_DESTROY(ta_frame->root); + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to release " + "AFR_TA_DOM_NOTIFY lock."); + } } -static int64_t -afr_get_halo_latency (xlator_t *this) +static void +afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall) { - afr_private_t *priv = NULL; - int64_t halo_max_latency_msec = 0; + struct gf_upcall_inodelk_contention *lc = NULL; + unsigned int inmem_count = 0; + unsigned int onwire_count = 0; + afr_private_t *priv = this->private; - priv = this->private; + lc = upcall->data; - if (priv->shd.iamshd) { - halo_max_latency_msec = priv->shd.halo_max_latency_msec; - } else if (priv->nfsd.iamnfsd) { - halo_max_latency_msec = - priv->nfsd.halo_max_latency_msec; - } else { - halo_max_latency_msec = priv->halo_max_latency_msec; - } - gf_msg_debug (this->name, 0, "Using halo latency %ld", - halo_max_latency_msec); - return halo_max_latency_msec; -} + if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0) + return; + if (priv->shd.iamshd) { + /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */ + return; + } + LOCK(&priv->lock); + { + if (priv->release_ta_notify_dom_lock == _gf_true) { + /* Ignore multiple release requests from shds.*/ + UNLOCK(&priv->lock); + return; + } + priv->release_ta_notify_dom_lock = _gf_true; + inmem_count = priv->ta_in_mem_txn_count; + onwire_count = priv->ta_on_wire_txn_count; + } + UNLOCK(&priv->lock); + if (inmem_count || onwire_count) + /* lock release will happen in txn code path after + * in-memory or on-wire txns are over.*/ + return; -int32_t -afr_notify (xlator_t *this, int32_t event, - void *data, void *data2) -{ - afr_private_t *priv = NULL; - xlator_t *child_xlator = NULL; - int i = -1; - int propagate = 0; - int had_heard_from_all = 0; - int have_heard_from_all = 0; - int idx = -1; - int ret = -1; - int call_psh = 0; - int up_child = -1; - dict_t *input = NULL; - dict_t *output = NULL; - gf_boolean_t had_quorum = _gf_false; - gf_boolean_t has_quorum = _gf_false; - struct gf_upcall *up_data = NULL; - struct gf_upcall_cache_invalidation *up_ci = NULL; - inode_table_t *itable = NULL; - inode_t *inode = NULL; - int64_t halo_max_latency_msec = 0; - int64_t child_latency_msec = -1; - - child_xlator = (xlator_t *)data; - - priv = this->private; - - if (!priv) - return 0; - - /* - * We need to reset this in case children come up in "staggered" - * fashion, so that we discover a late-arriving local subvolume. Note - * that we could end up issuing N lookups to the first subvolume, and - * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. - */ - priv->did_discovery = _gf_false; + afr_ta_lock_release_synctask(this); +} +static void +afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall) +{ + struct gf_upcall_cache_invalidation *up_ci = NULL; + afr_private_t *priv = this->private; + inode_t *inode = NULL; + inode_table_t *itable = NULL; + int i = 0; + + switch (upcall->event_type) { + case GF_UPCALL_INODELK_CONTENTION: + afr_handle_inodelk_contention(this, upcall); + break; + case GF_UPCALL_CACHE_INVALIDATION: + up_ci = (struct gf_upcall_cache_invalidation *)upcall->data; + + /* Since md-cache will be aggressively filtering + * lookups, the stale read issue will be more + * pronounced. Hence when a pending xattr is set notify + * all the md-cache clients to invalidate the existing + * stat cache and send the lookup next time */ + if (!up_ci->dict) + break; + for (i = 0; i < priv->child_count; i++) { + if (!dict_get(up_ci->dict, priv->pending_key[i])) + continue; + up_ci->flags |= UP_INVAL_ATTR; + itable = ((xlator_t *)this->graph->top)->itable; + /*Internal processes may not have itable for + *top xlator*/ + if (itable) + inode = inode_find(itable, upcall->gfid); + if (inode) + afr_inode_need_refresh_set(inode, this); + break; + } + break; + default: + break; + } +} - /* parent xlators dont need to know about every child_up, child_down - * because of afr ha. If all subvolumes go down, child_down has - * to be triggered. In that state when 1 subvolume comes up child_up - * needs to be triggered. dht optimizes revalidate lookup by sending - * it only to one of its subvolumes. When child up/down happens - * for afr's subvolumes dht should be notified by child_modified. The - * subsequent revalidate lookup happens on all the dht's subvolumes - * which triggers afr self-heals if any. - */ - idx = find_child_index (this, child_xlator); - if (idx < 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, - "Received child_up from invalid subvolume"); - goto out; - } +int32_t +afr_notify(xlator_t *this, int32_t event, void *data, void *data2) +{ + afr_private_t *priv = NULL; + xlator_t *child_xlator = NULL; + int i = -1; + int propagate = 0; + int had_heard_from_all = 0; + int have_heard_from_all = 0; + int idx = -1; + int ret = -1; + int call_psh = 0; + int up_child = -1; + dict_t *input = NULL; + dict_t *output = NULL; + gf_boolean_t had_quorum = _gf_false; + gf_boolean_t has_quorum = _gf_false; + int64_t halo_max_latency_msec = 0; + int64_t child_latency_msec = -1; + + child_xlator = (xlator_t *)data; + + priv = this->private; + + if (!priv) + return 0; - had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up, - this); + /* + * We need to reset this in case children come up in "staggered" + * fashion, so that we discover a late-arriving local subvolume. Note + * that we could end up issuing N lookups to the first subvolume, and + * O(N^2) overall, but N is small for AFR so it shouldn't be an issue. + */ + priv->did_discovery = _gf_false; + + /* parent xlators don't need to know about every child_up, child_down + * because of afr ha. If all subvolumes go down, child_down has + * to be triggered. In that state when 1 subvolume comes up child_up + * needs to be triggered. dht optimizes revalidate lookup by sending + * it only to one of its subvolumes. When child up/down happens + * for afr's subvolumes dht should be notified by child_modified. The + * subsequent revalidate lookup happens on all the dht's subvolumes + * which triggers afr self-heals if any. + */ + idx = afr_find_child_index(this, child_xlator); + if (idx < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP, + "Received child_up from invalid subvolume"); + goto out; + } + + had_quorum = priv->quorum_count && + afr_has_quorum(priv->child_up, this, NULL); + if (event == GF_EVENT_CHILD_PING) { + child_latency_msec = (int64_t)(uintptr_t)data2; if (priv->halo_enabled) { - halo_max_latency_msec = afr_get_halo_latency (this); - - if (event == GF_EVENT_CHILD_PING) { - /* Calculates the child latency and sets event - */ - child_latency_msec = (int64_t)(uintptr_t)data2; - LOCK (&priv->lock); - { - __afr_handle_ping_event (this, child_xlator, - idx, halo_max_latency_msec, &event, + halo_max_latency_msec = afr_get_halo_latency(this); + + /* Calculates the child latency and sets event + */ + LOCK(&priv->lock); + { + __afr_handle_ping_event(this, child_xlator, idx, + halo_max_latency_msec, &event, child_latency_msec); - } - UNLOCK (&priv->lock); - } - } - - if (event == GF_EVENT_CHILD_PING) { - /* This is the only xlator that handles PING, no reason to - * propagate. - */ - goto out; } - - if (event == GF_EVENT_TRANSLATOR_OP) { - LOCK (&priv->lock); - { - had_heard_from_all = __get_heard_from_all_status (this); - } - UNLOCK (&priv->lock); - - if (!had_heard_from_all) { - ret = -1; - } else { - input = data; - output = data2; - ret = afr_xl_op (this, input, output); - } - goto out; + UNLOCK(&priv->lock); + } else { + LOCK(&priv->lock); + { + priv->child_latency[idx] = child_latency_msec; + } + UNLOCK(&priv->lock); } + } + + if (event == GF_EVENT_CHILD_PING) { + /* This is the only xlator that handles PING, no reason to + * propagate. + */ + goto out; + } - LOCK (&priv->lock); + if (event == GF_EVENT_TRANSLATOR_OP) { + LOCK(&priv->lock); { - had_heard_from_all = __get_heard_from_all_status (this); - switch (event) { - case GF_EVENT_PARENT_UP: - __afr_launch_notify_timer (this, priv); - propagate = 1; - break; - case GF_EVENT_CHILD_UP: - __afr_handle_child_up_event (this, child_xlator, - idx, halo_max_latency_msec, &event, &call_psh, - &up_child); - break; - - case GF_EVENT_CHILD_DOWN: - __afr_handle_child_down_event (this, child_xlator, idx, - child_latency_msec, &event, &call_psh, - &up_child); - break; - - case GF_EVENT_CHILD_CONNECTING: - priv->last_event[idx] = event; - - break; - - case GF_EVENT_SOME_DESCENDENT_DOWN: - priv->last_event[idx] = event; - break; - case GF_EVENT_UPCALL: - up_data = (struct gf_upcall *)data; - if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION) - break; - up_ci = (struct gf_upcall_cache_invalidation *)up_data->data; - - /* Since md-cache will be aggressively filtering - * lookups, the stale read issue will be more - * pronounced. Hence when a pending xattr is set notify - * all the md-cache clients to invalidate the existing - * stat cache and send the lookup next time */ - if (!up_ci->dict) - break; - for (i = 0; i < priv->child_count; i++) { - if (dict_get (up_ci->dict, priv->pending_key[i])) { - up_ci->flags |= UP_INVAL_ATTR; - itable = ((xlator_t *)this->graph->top)->itable; - /*Internal processes may not have itable for top xlator*/ - if (itable) - inode = inode_find (itable, up_data->gfid); - if (inode) - afr_inode_need_refresh_set (inode, this); - - break; - } - } - break; - default: - propagate = 1; - break; - } - have_heard_from_all = __get_heard_from_all_status (this); - if (!had_heard_from_all && have_heard_from_all) { - if (priv->timer) { - gf_timer_call_cancel (this->ctx, priv->timer); - priv->timer = NULL; - } - /* This is the first event which completes aggregation - of events from all subvolumes. If at least one subvol - had come up, propagate CHILD_UP, but only this time - */ - event = GF_EVENT_CHILD_DOWN; - for (i = 0; i < priv->child_count; i++) { - if (priv->last_event[i] == GF_EVENT_CHILD_UP) { - event = GF_EVENT_CHILD_UP; - break; - } - - if (priv->last_event[i] == - GF_EVENT_CHILD_CONNECTING) { - event = GF_EVENT_CHILD_CONNECTING; - /* continue to check other events for CHILD_UP */ - } - } - } + had_heard_from_all = __get_heard_from_all_status(this); } - UNLOCK (&priv->lock); + UNLOCK(&priv->lock); - if (priv->quorum_count) { - has_quorum = afr_has_quorum (priv->child_up, this); - if (!had_quorum && has_quorum) { - gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET, - "Client-quorum is met"); - gf_event (EVENT_AFR_QUORUM_MET, - "subvol=%s", this->name); + if (!had_heard_from_all) { + ret = -1; + } else { + input = data; + output = data2; + ret = afr_xl_op(this, input, output); + } + goto out; + } + + if (event == GF_EVENT_UPCALL) { + afr_handle_upcall_event(this, data); + } + + LOCK(&priv->lock); + { + had_heard_from_all = __get_heard_from_all_status(this); + switch (event) { + case GF_EVENT_PARENT_UP: + __afr_launch_notify_timer(this, priv); + propagate = 1; + break; + case GF_EVENT_CHILD_UP: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 1; + priv->ta_event_gen++; + break; } - if (had_quorum && !has_quorum) { - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_QUORUM_FAIL, - "Client-quorum is not met"); - gf_event (EVENT_AFR_QUORUM_FAIL, "subvol=%s", - this->name); + __afr_handle_child_up_event(this, child_xlator, idx, + child_latency_msec, &event, + &call_psh, &up_child); + __afr_lock_heal_synctask(this, priv, idx); + break; + + case GF_EVENT_CHILD_DOWN: + if (priv->thin_arbiter_count && + (idx == AFR_CHILD_THIN_ARBITER)) { + priv->ta_child_up = 0; + priv->ta_event_gen++; + afr_ta_locked_priv_invalidate(priv); + break; } - } + __afr_handle_child_down_event(this, child_xlator, idx, + child_latency_msec, &event, + &call_psh, &up_child); + __afr_mark_pending_lk_heal(this, priv, idx); + break; - /* if all subvols have reported status, no need to hide anything - or wait for anything else. Just propagate blindly */ - if (have_heard_from_all) + case GF_EVENT_CHILD_CONNECTING: + priv->last_event[idx] = event; + + break; + + case GF_EVENT_SOME_DESCENDENT_DOWN: + priv->last_event[idx] = event; + break; + default: propagate = 1; + break; + } + have_heard_from_all = __get_heard_from_all_status(this); + if (!had_heard_from_all && have_heard_from_all) { + if (priv->timer) { + gf_timer_call_cancel(this->ctx, priv->timer); + priv->timer = NULL; + } + /* This is the first event which completes aggregation + of events from all subvolumes. If at least one subvol + had come up, propagate CHILD_UP, but only this time + */ + event = GF_EVENT_CHILD_DOWN; + for (i = 0; i < priv->child_count; i++) { + if (priv->last_event[i] == GF_EVENT_CHILD_UP) { + event = GF_EVENT_CHILD_UP; + break; + } - ret = 0; - if (propagate) - ret = default_notify (this, event, data); - - if ((!had_heard_from_all) || call_psh) { - /* Launch self-heal on all local subvolumes if: - * a) We have_heard_from_all for the first time - * b) Already heard from everyone, but we now got a child-up - * event. - */ - if (have_heard_from_all && priv->shd.iamshd) { - for (i = 0; i < priv->child_count; i++) - if (priv->child_up[i]) - afr_selfheal_childup (this, i); + if (priv->last_event[i] == GF_EVENT_CHILD_CONNECTING) { + event = GF_EVENT_CHILD_CONNECTING; + /* continue to check other events for CHILD_UP */ } + } } + } + UNLOCK(&priv->lock); + + if (priv->quorum_count) { + has_quorum = afr_has_quorum(priv->child_up, this, NULL); + if (!had_quorum && has_quorum) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET, + "Client-quorum is met"); + gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } + if (had_quorum && !has_quorum) { + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL, + "Client-quorum is not met"); + gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s", + this->ctx->cmd_args.client_pid, this->name); + } + } + + /* if all subvols have reported status, no need to hide anything + or wait for anything else. Just propagate blindly */ + if (have_heard_from_all) + propagate = 1; + + ret = 0; + if (propagate) + ret = default_notify(this, event, data); + + if ((!had_heard_from_all) || call_psh) { + /* Launch self-heal on all local subvolumes if: + * a) We have_heard_from_all for the first time + * b) Already heard from everyone, but we now got a child-up + * event. + */ + if (have_heard_from_all) { + afr_selfheal_childup(this, priv); + } + } out: - return ret; + return ret; } int -afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) +afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno) { - local->op_ret = -1; - local->op_errno = EUCLEAN; + int __ret = -1; + local->op_ret = -1; + local->op_errno = EUCLEAN; - syncbarrier_init (&local->barrier); - - local->child_up = GF_CALLOC (priv->child_count, - sizeof (*local->child_up), - gf_afr_mt_char); - if (!local->child_up) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } - - memcpy (local->child_up, priv->child_up, - sizeof (*local->child_up) * priv->child_count); - local->call_count = AFR_COUNT (local->child_up, priv->child_count); - if (local->call_count == 0) { - gf_msg (THIS->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOLS_DOWN, "no subvolumes up"); - if (op_errno) - *op_errno = ENOTCONN; - goto out; - } - - local->event_generation = priv->event_generation; + __ret = syncbarrier_init(&local->barrier); + if (__ret) { + if (op_errno) + *op_errno = __ret; + goto out; + } - local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char), - gf_afr_mt_char); - if (!local->read_attempted) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up), + gf_afr_mt_char); + if (!local->child_up) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + memcpy(local->child_up, priv->child_up, + sizeof(*local->child_up) * priv->child_count); + local->call_count = AFR_COUNT(local->child_up, priv->child_count); + if (local->call_count == 0) { + gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN, + "no subvolumes up"); + if (op_errno) + *op_errno = ENOTCONN; + goto out; + } - local->readable = GF_CALLOC (priv->child_count, sizeof (char), - gf_afr_mt_char); - if (!local->readable) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->event_generation = priv->event_generation; - local->readable2 = GF_CALLOC (priv->child_count, sizeof (char), + local->read_attempted = GF_CALLOC(priv->child_count, sizeof(char), gf_afr_mt_char); - if (!local->readable2) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + if (!local->read_attempted) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->readable = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->readable) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), - gf_afr_mt_reply_t); - if (!local->replies) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } + local->readable2 = GF_CALLOC(priv->child_count, sizeof(char), + gf_afr_mt_char); + if (!local->readable2) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } - local->need_full_crawl = _gf_false; + local->read_subvol = -1; - local->compound = _gf_false; - INIT_LIST_HEAD (&local->healer); - return 0; + local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies), + gf_afr_mt_reply_t); + if (!local->replies) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + local->need_full_crawl = _gf_false; + if (priv->thin_arbiter_count) { + local->ta_child_up = priv->ta_child_up; + local->ta_failed_subvol = AFR_CHILD_UNKNOWN; + local->read_txn_query_child = AFR_CHILD_UNKNOWN; + local->ta_event_gen = priv->ta_event_gen; + local->fop_state = TA_SUCCESS; + } + local->is_new_entry = _gf_false; + + INIT_LIST_HEAD(&local->healer); + return 0; out: - return -1; + return -1; } int -afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, - transaction_lk_type_t lk_type) +afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count) { - int ret = -ENOMEM; + int ret = -ENOMEM; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->locked_nodes) - goto out; - - lk->lower_locked_nodes = GF_CALLOC (sizeof (*lk->lower_locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->lower_locked_nodes) - goto out; + lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes), + child_count, gf_afr_mt_char); + if (NULL == lk->lower_locked_nodes) + goto out; - lk->lock_op_ret = -1; - lk->lock_op_errno = EUCLEAN; - lk->transaction_lk_type = lk_type; + lk->lock_op_ret = -1; + lk->lock_op_errno = EUCLEAN; - ret = 0; + ret = 0; out: - return ret; + return ret; } void -afr_matrix_cleanup (int32_t **matrix, unsigned int m) +afr_matrix_cleanup(int32_t **matrix, unsigned int m) { - int i = 0; + int i = 0; - if (!matrix) - goto out; - for (i = 0; i < m; i++) { - GF_FREE (matrix[i]); - } + if (!matrix) + goto out; + for (i = 0; i < m; i++) { + GF_FREE(matrix[i]); + } - GF_FREE (matrix); + GF_FREE(matrix); out: - return; + return; } -int32_t** -afr_matrix_create (unsigned int m, unsigned int n) +int32_t ** +afr_matrix_create(unsigned int m, unsigned int n) { - int32_t **matrix = NULL; - int i = 0; + int32_t **matrix = NULL; + int i = 0; - matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t); - if (!matrix) - goto out; + matrix = GF_CALLOC(sizeof(*matrix), m, gf_afr_mt_int32_t); + if (!matrix) + goto out; - for (i = 0; i < m; i++) { - matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n, - gf_afr_mt_int32_t); - if (!matrix[i]) - goto out; - } - return matrix; + for (i = 0; i < m; i++) { + matrix[i] = GF_CALLOC(sizeof(*matrix[i]), n, gf_afr_mt_int32_t); + if (!matrix[i]) + goto out; + } + return matrix; out: - afr_matrix_cleanup (matrix, m); - return NULL; + afr_matrix_cleanup(matrix, m); + return NULL; } int -afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) -{ - int ret = -ENOMEM; - - lk->domain = dom; - lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), - child_count, gf_afr_mt_char); - if (NULL == lk->locked_nodes) - goto out; - ret = 0; +afr_transaction_local_init(afr_local_t *local, xlator_t *this) +{ + int ret = -ENOMEM; + afr_private_t *priv = NULL; + + priv = this->private; + INIT_LIST_HEAD(&local->transaction.wait_list); + INIT_LIST_HEAD(&local->transaction.owner_list); + INIT_LIST_HEAD(&local->ta_waitq); + INIT_LIST_HEAD(&local->ta_onwireq); + ret = afr_internal_lock_init(&local->internal_lock, priv->child_count); + if (ret < 0) + goto out; + + ret = -ENOMEM; + local->pre_op_compat = priv->pre_op_compat; + + local->transaction.pre_op = GF_CALLOC(sizeof(*local->transaction.pre_op), + priv->child_count, gf_afr_mt_char); + if (!local->transaction.pre_op) + goto out; + + local->transaction.changelog_xdata = GF_CALLOC( + sizeof(*local->transaction.changelog_xdata), priv->child_count, + gf_afr_mt_dict_t); + if (!local->transaction.changelog_xdata) + goto out; + + if (priv->arbiter_count == 1) { + local->transaction.pre_op_sources = GF_CALLOC( + sizeof(*local->transaction.pre_op_sources), priv->child_count, + gf_afr_mt_char); + if (!local->transaction.pre_op_sources) + goto out; + } + + local->transaction.failed_subvols = GF_CALLOC( + sizeof(*local->transaction.failed_subvols), priv->child_count, + gf_afr_mt_char); + if (!local->transaction.failed_subvols) + goto out; + + local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + ret = 0; out: - return ret; + return ret; } -int -afr_transaction_local_init (afr_local_t *local, xlator_t *this) +void +afr_set_low_priority(call_frame_t *frame) { - int ret = -ENOMEM; - afr_private_t *priv = NULL; - - priv = this->private; - ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, - AFR_TRANSACTION_LK); - if (ret < 0) - goto out; - - if ((local->transaction.type == AFR_DATA_TRANSACTION) || - (local->transaction.type == AFR_METADATA_TRANSACTION)) { - ret = afr_inodelk_init (&local->internal_lock.inodelk[0], - this->name, priv->child_count); - if (ret < 0) - goto out; - } - - ret = -ENOMEM; - local->pre_op_compat = priv->pre_op_compat; - - local->transaction.eager_lock = - GF_CALLOC (sizeof (*local->transaction.eager_lock), - priv->child_count, - gf_afr_mt_int32_t); - - if (!local->transaction.eager_lock) - goto out; - - local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), - priv->child_count, - gf_afr_mt_char); - if (!local->transaction.pre_op) - goto out; - - if (priv->arbiter_count == 1) { - local->transaction.pre_op_xdata = - GF_CALLOC (sizeof (*local->transaction.pre_op_xdata), - priv->child_count, gf_afr_mt_dict_t); - if (!local->transaction.pre_op_xdata) - goto out; - - local->transaction.pre_op_sources = - GF_CALLOC (sizeof (*local->transaction.pre_op_sources), - priv->child_count, gf_afr_mt_char); - if (!local->transaction.pre_op_sources) - goto out; - } - - local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols), - priv->child_count, - gf_afr_mt_char); - if (!local->transaction.failed_subvols) - goto out; - - local->pending = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->pending) - goto out; - - local->compound = _gf_false; - INIT_LIST_HEAD (&local->transaction.eager_locked); + frame->root->pid = LOW_PRIO_PROC_PID; +} - ret = 0; +void +afr_priv_destroy(afr_private_t *priv) +{ + int i = 0; + int child_count = -1; + + if (!priv) + goto out; + + GF_FREE(priv->sh_domain); + GF_FREE(priv->last_event); + + child_count = priv->child_count; + if (priv->thin_arbiter_count) { + child_count++; + } + if (priv->pending_key) { + for (i = 0; i < child_count; i++) + GF_FREE(priv->pending_key[i]); + } + + GF_FREE(priv->pending_reads); + GF_FREE(priv->local); + GF_FREE(priv->pending_key); + GF_FREE(priv->children); + GF_FREE(priv->anon_inode); + GF_FREE(priv->child_up); + GF_FREE(priv->halo_child_up); + GF_FREE(priv->child_latency); + LOCK_DESTROY(&priv->lock); + + GF_FREE(priv); out: - return ret; + return; } - -void -afr_set_low_priority (call_frame_t *frame) +int ** +afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending, + dict_t *xattr, ia_type_t iat) { - frame->root->pid = LOW_PRIO_PROC_PID; -} + int i = 0; + int **changelog = NULL; + int idx = -1; + int m_idx = 0; + int d_idx = 0; + int ret = 0; + m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); -void -afr_priv_destroy (afr_private_t *priv) -{ - int i = 0; + idx = afr_index_from_ia_type(iat); - if (!priv) - goto out; - GF_FREE (priv->last_event); - if (priv->pending_key) { - for (i = 0; i < priv->child_count; i++) - GF_FREE (priv->pending_key[i]); - } - GF_FREE (priv->pending_key); - GF_FREE (priv->children); - GF_FREE (priv->child_up); - LOCK_DESTROY (&priv->lock); + changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!changelog) + goto out; - GF_FREE (priv); + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + continue; + + changelog[i][m_idx] = hton32(1); + if (idx != -1) + changelog[i][idx] = hton32(1); + /* If the newentry marking is on a newly created directory, + * then mark it with the full-heal indicator. + */ + if ((IA_ISDIR(iat)) && (priv->esh_granular)) + changelog[i][d_idx] = hton32(1); + } + ret = afr_set_pending_dict(priv, xattr, changelog); + if (ret < 0) { + afr_matrix_cleanup(changelog, priv->child_count); + return NULL; + } out: - return; + return changelog; } -void -afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +static dict_t * +afr_set_heal_info(char *status) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - - if (!local->fd) - return; + dict_t *dict = NULL; + int ret = -1; - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) - return; + dict = dict_new(); + if (!dict) { + ret = -ENOMEM; + goto out; + } + + ret = dict_set_dynstr_sizen(dict, "heal-info", status); + if (ret) + gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Failed to set heal-info key to " + "%s", + status); +out: + /* Any error other than EINVAL, dict_set_dynstr frees status */ + if (ret == -ENOMEM || ret == -EINVAL) { + GF_FREE(status); + } - fd_ctx->open_fd_count = local->open_fd_count; + if (ret && dict) { + dict_unref(dict); + dict = NULL; + } + return dict; } -int** -afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending, - dict_t *xattr, ia_type_t iat) +static gf_boolean_t +afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type) { - int i = 0; - int **changelog = NULL; - int idx = -1; - int m_idx = 0; - int d_idx = 0; - int ret = 0; + afr_private_t *priv = this->private; + int *dirty = alloca0(priv->child_count * sizeof(int)); + int i = 0; - m_idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION); - d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); + afr_selfheal_extract_xattr(this, replies, type, dirty, NULL); + for (i = 0; i < priv->child_count; i++) { + if (dirty[i] > 1) + return _gf_true; + } - idx = afr_index_from_ia_type (iat); + return _gf_false; +} - changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS); - if (!changelog) - goto out; +static gf_boolean_t +afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type) +{ + gf_boolean_t data_chk = _gf_false; + gf_boolean_t mdata_chk = _gf_false; + gf_boolean_t entry_chk = _gf_false; + + switch (ia_type) { + case IA_IFDIR: + mdata_chk = _gf_true; + entry_chk = _gf_true; + break; + case IA_IFREG: + mdata_chk = _gf_true; + data_chk = _gf_true; + break; + default: + /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/ + mdata_chk = _gf_true; + break; + } - for (i = 0; i < priv->child_count; i++) { - if (!pending[i]) - continue; + if (data_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_DATA_TRANSACTION)) { + return _gf_true; + } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_METADATA_TRANSACTION)) { + return _gf_true; + } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn( + this, replies, AFR_ENTRY_TRANSACTION)) { + return _gf_true; + } - changelog[i][m_idx] = hton32(1); - if (idx != -1) - changelog[i][idx] = hton32(1); - /* If the newentry marking is on a newly created directory, - * then mark it with the full-heal indicator. - */ - if ((IA_ISDIR (iat)) && (priv->esh_granular)) - changelog[i][d_idx] = hton32(1); - } - ret = afr_set_pending_dict (priv, xattr, changelog); - if (ret < 0) { - afr_matrix_cleanup (changelog, priv->child_count); - return NULL; - } -out: - return changelog; + return _gf_false; } -gf_boolean_t -afr_decide_heal_info (afr_private_t *priv, unsigned char *sources, int source) -{ - int sources_count = 0; - - if (source < 0) - goto out; - - sources_count = AFR_COUNT (sources, priv->child_count); - if (sources_count == priv->child_count) - return _gf_false; -out: - return _gf_true; +static int +afr_update_heal_status(xlator_t *this, struct afr_reply *replies, + ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh, + gf_boolean_t *msh, unsigned char pending) +{ + int ret = -1; + GF_UNUSED int ret1 = 0; + int i = 0; + int io_domain_lk_count = 0; + int shd_domain_lk_count = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = NULL; + + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); + + for (i = 0; i < priv->child_count; i++) { + if ((replies[i].valid != 1) || (replies[i].op_ret != 0)) + continue; + if (!io_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count); + } + if (!shd_domain_lk_count) { + ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count); + } + } + + if (!pending) { + if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) || + (!io_domain_lk_count)) { + /* Needs heal. */ + ret = 0; + } else { + /* No heal needed. */ + *dsh = *esh = *msh = 0; + } + } else { + if (shd_domain_lk_count) { + ret = -EAGAIN; /*For 'possibly-healing'. */ + } else { + ret = 0; /*needs heal. Just set a non -ve value so that it is + assumed as the source index.*/ + } + } + return ret; } +/*return EIO, EAGAIN or pending*/ int -afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this, - inode_t *inode, gf_boolean_t *msh, - gf_boolean_t *pending) -{ - int ret = -1; - unsigned char *locked_on = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *healed_sinks = NULL; - unsigned char *undid_pending = NULL; - struct afr_reply *locked_replies = NULL; - - afr_private_t *priv = this->private; - - locked_on = alloca0 (priv->child_count); - sources = alloca0 (priv->child_count); - sinks = alloca0 (priv->child_count); - healed_sinks = alloca0 (priv->child_count); - undid_pending = alloca0 (priv->child_count); - - locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); - - ret = afr_selfheal_inodelk (frame, this, inode, this->name, - LLONG_MAX - 1, 0, locked_on); - { - if (ret == 0) { - /* Not a single lock */ - ret = -afr_final_errno (frame->local, priv); - if (ret == 0) - ret = -ENOTCONN;/* all invalid responses */ - goto out; - } - ret = __afr_selfheal_metadata_prepare (frame, this, inode, - locked_on, sources, - sinks, healed_sinks, - undid_pending, - locked_replies, - pending); - *msh = afr_decide_heal_info (priv, sources, ret); - } - afr_selfheal_uninodelk (frame, this, inode, this->name, - LLONG_MAX - 1, 0, locked_on); +afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **inode, gf_boolean_t *entry_selfheal, + gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, unsigned char *pending) +{ + int ret = -1; + int i = 0; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + gf_boolean_t dsh = _gf_false; + gf_boolean_t msh = _gf_false; + gf_boolean_t esh = _gf_false; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *valid_on = NULL; + uint64_t *witness = NULL; + + priv = this->private; + replies = alloca0(sizeof(*replies) * priv->child_count); + sources = alloca0(sizeof(*sources) * priv->child_count); + sinks = alloca0(sizeof(*sinks) * priv->child_count); + witness = alloca0(sizeof(*witness) * priv->child_count); + valid_on = alloca0(sizeof(*valid_on) * priv->child_count); + + ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh, + &esh, replies); + if (ret) + goto out; + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid && replies[i].op_ret == 0) { + valid_on[i] = 1; + } + } + if (msh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_METADATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } + if (dsh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } + if (esh) { + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_ENTRY_TRANSACTION, valid_on, + sources, sinks, witness, pending); + if (*pending & PFLAG_SBRAIN) + ret = -EIO; + if (ret) + goto out; + } + + ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh, + &msh, *pending); out: - if (locked_replies) - afr_replies_wipe (locked_replies, priv->child_count); - return ret; + *data_selfheal = dsh; + *entry_selfheal = esh; + *metadata_selfheal = msh; + if (replies) + afr_replies_wipe(replies, priv->child_count); + return ret; } int -afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, - inode_t *inode, gf_boolean_t *dsh, - gf_boolean_t *pflag) -{ - int ret = -1; - unsigned char *data_lock = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *healed_sinks = NULL; - unsigned char *undid_pending = NULL; - afr_private_t *priv = NULL; - fd_t *fd = NULL; - struct afr_reply *locked_replies = NULL; - - priv = this->private; - data_lock = alloca0 (priv->child_count); - sources = alloca0 (priv->child_count); - sinks = alloca0 (priv->child_count); - healed_sinks = alloca0 (priv->child_count); - undid_pending = alloca0 (priv->child_count); - - /* Heal-info does an open() on the file being examined so that the - * current eager-lock holding client, if present, at some point sees - * open-fd count being > 1 and releases the eager-lock so that heal-info - * doesn't remain blocked forever until IO completes. - */ - ret = afr_selfheal_data_open (this, inode, &fd); +afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + unsigned char pending = 0; + dict_t *dict = NULL; + int ret = -1; + int op_errno = ENOMEM; + inode_t *inode = NULL; + char *substr = NULL; + char *status = NULL; + call_frame_t *heal_frame = NULL; + afr_local_t *heal_local = NULL; + + /*Use frame with lk-owner set*/ + heal_frame = afr_frame_create(frame->this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; + + ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode, + &entry_selfheal, &data_selfheal, + &metadata_selfheal, &pending); + + if (ret == -ENOMEM) { + ret = -1; + goto out; + } + + if (pending & PFLAG_PENDING) { + gf_asprintf(&substr, "-pending"); + if (!substr) + goto out; + } + + if (ret == -EIO) { + ret = gf_asprintf(&status, "split-brain%s", substr ? substr : ""); if (ret < 0) { - gf_msg_debug (this->name, -ret, "%s: Failed to open", - uuid_utoa (inode->gfid)); + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else if (ret == -EAGAIN) { + ret = gf_asprintf(&status, "possibly-healing%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } else if (ret >= 0) { + /* value of ret = source index + * so ret >= 0 and at least one of the 3 booleans set to + * true means a source is identified; heal is required. + */ + if (!data_selfheal && !entry_selfheal && !metadata_selfheal) { + status = gf_strdup("no-heal"); + if (!status) { + ret = -1; + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; goto out; + } + } else { + ret = gf_asprintf(&status, "heal%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } + } + } else if (ret < 0) { + /* Apart from above checked -ve ret values, there are + * other possible ret values like ENOTCONN + * (returned when number of valid replies received are + * less than 2) + * in which case heal is required when one of the + * selfheal booleans is set. + */ + if (data_selfheal || entry_selfheal || metadata_selfheal) { + ret = gf_asprintf(&status, "heal%s", substr ? substr : ""); + if (ret < 0) { + goto out; + } + dict = afr_set_heal_info(status); + if (!dict) { + ret = -1; + goto out; + } } + } - locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + ret = 0; + op_errno = 0; - ret = afr_selfheal_inodelk (frame, this, inode, this->name, - 0, 0, data_lock); - { - if (ret == 0) { - ret = -afr_final_errno (frame->local, priv); - if (ret == 0) - ret = -ENOTCONN; /* all invalid responses */ - goto out; - } - ret = __afr_selfheal_data_prepare (frame, this, inode, - data_lock, sources, sinks, - healed_sinks, undid_pending, - locked_replies, pflag); - *dsh = afr_decide_heal_info (priv, sources, ret); - } - afr_selfheal_uninodelk (frame, this, inode, this->name, 0, 0, - data_lock); out: - if (locked_replies) - afr_replies_wipe (locked_replies, priv->child_count); - if (fd) - fd_unref (fd); - return ret; + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) + inode_unref(inode); + GF_FREE(substr); + return ret; } int -afr_selfheal_locked_entry_inspect (call_frame_t *frame, xlator_t *this, - inode_t *inode, - gf_boolean_t *esh, gf_boolean_t *pflag) -{ - int ret = -1; - int source = -1; - afr_private_t *priv = NULL; - unsigned char *locked_on = NULL; - unsigned char *data_lock = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *healed_sinks = NULL; - struct afr_reply *locked_replies = NULL; - gf_boolean_t granular_locks = _gf_false; - - priv = this->private; - if (strcmp ("granular", priv->locking_scheme) == 0) - granular_locks = _gf_true; - locked_on = alloca0 (priv->child_count); - data_lock = alloca0 (priv->child_count); - sources = alloca0 (priv->child_count); - sinks = alloca0 (priv->child_count); - healed_sinks = alloca0 (priv->child_count); - - locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); - - if (!granular_locks) { - ret = afr_selfheal_tryentrylk (frame, this, inode, - priv->sh_domain, NULL, locked_on); - } - { - if (!granular_locks && ret == 0) { - ret = -afr_final_errno (frame->local, priv); - if (ret == 0) - ret = -ENOTCONN;/* all invalid responses */ - goto out; - } - - ret = afr_selfheal_entrylk (frame, this, inode, this->name, - NULL, data_lock); - { - if (ret == 0) { - ret = -afr_final_errno (frame->local, priv); - if (ret == 0) - ret = -ENOTCONN; - /* all invalid responses */ - goto unlock; - } - ret = __afr_selfheal_entry_prepare (frame, this, inode, - data_lock, sources, - sinks, healed_sinks, - locked_replies, - &source, pflag); - if ((ret == 0) && source < 0) - ret = -EIO; - *esh = afr_decide_heal_info (priv, sources, ret); - } - afr_selfheal_unentrylk (frame, this, inode, this->name, NULL, - data_lock, NULL); - } -unlock: - if (!granular_locks) - afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, - NULL, locked_on, NULL); -out: - if (locked_replies) - afr_replies_wipe (locked_replies, priv->child_count); +_afr_is_split_brain(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, afr_transaction_type type, + gf_boolean_t *spb) +{ + afr_private_t *priv = NULL; + uint64_t *witness = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + int sources_count = 0; + int ret = 0; + + priv = this->private; + + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + witness = alloca0(priv->child_count * sizeof(*witness)); + + ret = afr_selfheal_find_direction(frame, this, replies, type, + priv->child_up, sources, sinks, witness, + NULL); + if (ret) return ret; + + sources_count = AFR_COUNT(sources, priv->child_count); + if (!sources_count) + *spb = _gf_true; + + return ret; } int -afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, - inode_t **inode, - gf_boolean_t *entry_selfheal, - gf_boolean_t *data_selfheal, - gf_boolean_t *metadata_selfheal, - gf_boolean_t *pending) - +afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode, + uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb) { - int ret = -1; - gf_boolean_t dsh = _gf_false; - gf_boolean_t msh = _gf_false; - gf_boolean_t esh = _gf_false; + int ret = -1; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; - ret = afr_selfheal_unlocked_inspect (frame, this, gfid, inode, - &dsh, &msh, &esh); - if (ret) - goto out; + priv = this->private; - /* For every heal type hold locks and check if it indeed needs heal */ + replies = alloca0(sizeof(*replies) * priv->child_count); - if (msh) { - ret = afr_selfheal_locked_metadata_inspect (frame, this, - *inode, &msh, - pending); - if (ret == -EIO) - goto out; - } + ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies); + if (ret) + goto out; - if (dsh) { - ret = afr_selfheal_locked_data_inspect (frame, this, *inode, - &dsh, pending); - if (ret == -EIO || (ret == -EAGAIN)) - goto out; - } + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + ret = -EAGAIN; + goto out; + } - if (esh) { - ret = afr_selfheal_locked_entry_inspect (frame, this, *inode, - &esh, pending); - } + ret = _afr_is_split_brain(frame, this, replies, AFR_DATA_TRANSACTION, + d_spb); + if (ret) + goto out; + ret = _afr_is_split_brain(frame, this, replies, AFR_METADATA_TRANSACTION, + m_spb); out: - *data_selfheal = dsh; - *entry_selfheal = esh; - *metadata_selfheal = msh; - return ret; + if (replies) { + afr_replies_wipe(replies, priv->child_count); + replies = NULL; + } + return ret; } -dict_t* -afr_set_heal_info (char *status) +int +afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque) { - dict_t *dict = NULL; - int ret = -1; - - dict = dict_new (); - if (!dict) { - ret = -ENOMEM; - goto out; - } - - ret = dict_set_str (dict, "heal-info", status); - if (ret) - gf_msg ("", GF_LOG_WARNING, -ret, - AFR_MSG_DICT_SET_FAILED, - "Failed to set heal-info key to " - "%s", status); -out: - return dict; + GF_FREE(opaque); + return 0; } int -afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc) -{ - gf_boolean_t data_selfheal = _gf_false; - gf_boolean_t metadata_selfheal = _gf_false; - gf_boolean_t entry_selfheal = _gf_false; - gf_boolean_t pending = _gf_false; - dict_t *dict = NULL; - int ret = -1; - int op_errno = 0; - int size = 0; - inode_t *inode = NULL; - char *substr = NULL; - char *status = NULL; - - ret = afr_selfheal_locked_inspect (frame, this, loc->gfid, &inode, - &entry_selfheal, - &data_selfheal, &metadata_selfheal, - &pending); - - if (ret == -ENOMEM) { - op_errno = -ret; - ret = -1; - goto out; +afr_get_split_brain_status(void *opaque) +{ + gf_boolean_t d_spb = _gf_false; + gf_boolean_t m_spb = _gf_false; + int ret = -1; + int op_errno = 0; + int i = 0; + char *choices = NULL; + char *status = NULL; + dict_t *dict = NULL; + inode_t *inode = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + afr_spb_status_t *data = NULL; + + data = opaque; + frame = data->frame; + this = frame->this; + loc = data->loc; + priv = this->private; + children = priv->children; + + inode = afr_inode_find(this, loc->gfid); + if (!inode) + goto out; + + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + /* Calculation for string length : + * (child_count X length of child-name) + SLEN(" Choices :") + * child-name consists of : + * a) 251 = max characters for volname according to GD_VOLUME_NAME_MAX + * b) strlen("-client-00,") assuming 16 replicas + */ + choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) + + SLEN(" Choices:")); + + ret = afr_is_split_brain(frame, this, inode, loc->gfid, &d_spb, &m_spb); + if (ret) { + op_errno = -ret; + if (ret == -EAGAIN) { + ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS, + SBRAIN_HEAL_NO_GO_MSG); + if (ret) { + gf_msg(this->name, GF_LOG_WARNING, -ret, + AFR_MSG_DICT_SET_FAILED, + "Failed to set GF_AFR_SBRAIN_STATUS in dict"); + } } + ret = -1; + goto out; + } - if (pending) { - size = strlen ("-pending") + 1; - gf_asprintf (&substr, "-pending"); - if (!substr) - goto out; + if (d_spb || m_spb) { + sprintf(choices, " Choices:"); + for (i = 0; i < priv->child_count; i++) { + strcat(choices, children[i]->name); + strcat(choices, ","); } + choices[strlen(choices) - 1] = '\0'; - if (ret == -EIO) { - size += strlen ("split-brain") + 1; - ret = gf_asprintf (&status, "split-brain%s", - substr? substr : ""); - if (ret < 0) - goto out; - dict = afr_set_heal_info (status); - } else if (ret == -EAGAIN) { - size += strlen ("possibly-healing") + 1; - ret = gf_asprintf (&status, "possibly-healing%s", - substr? substr : ""); - if (ret < 0) - goto out; - dict = afr_set_heal_info (status); - } else if (ret >= 0) { - /* value of ret = source index - * so ret >= 0 and at least one of the 3 booleans set to - * true means a source is identified; heal is required. - */ - if (!data_selfheal && !entry_selfheal && - !metadata_selfheal) { - dict = afr_set_heal_info ("no-heal"); - } else { - size += strlen ("heal") + 1; - ret = gf_asprintf (&status, "heal%s", - substr? substr : ""); - if (ret < 0) - goto out; - dict = afr_set_heal_info (status); - } - } else if (ret < 0) { - /* Apart from above checked -ve ret values, there are - * other possible ret values like ENOTCONN - * (returned when number of valid replies received are - * less than 2) - * in which case heal is required when one of the - * selfheal booleans is set. - */ - if (data_selfheal || entry_selfheal || - metadata_selfheal) { - size += strlen ("heal") + 1; - ret = gf_asprintf (&status, "heal%s", - substr? substr : ""); - if (ret < 0) - goto out; - dict = afr_set_heal_info (status); - } + ret = gf_asprintf(&status, + "data-split-brain:%s " + "metadata-split-brain:%s%s", + (d_spb) ? "yes" : "no", (m_spb) ? "yes" : "no", + choices); + + if (-1 == ret) { + op_errno = ENOMEM; + goto out; } - ret = 0; + ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + } else { + ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS, + SFILE_NOT_UNDER_DATA); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + } + ret = 0; out: - AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); - if (dict) - dict_unref (dict); - if (inode) - inode_unref (inode); - GF_FREE (substr); - return ret; + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + if (dict) + dict_unref(dict); + if (inode) + inode_unref(inode); + return ret; } -int -_afr_is_split_brain (call_frame_t *frame, xlator_t *this, - struct afr_reply *replies, - afr_transaction_type type, - gf_boolean_t *spb) -{ - afr_private_t *priv = NULL; - uint64_t *witness = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - int sources_count = 0; - int ret = 0; - - priv = this->private; - - sources = alloca0 (priv->child_count); - sinks = alloca0 (priv->child_count); - witness = alloca0(priv->child_count * sizeof (*witness)); - - ret = afr_selfheal_find_direction (frame, this, replies, - type, priv->child_up, sources, - sinks, witness, NULL); +int32_t +afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + int ret = 0; + int op_errno = 0; + dict_t *dict = NULL; + afr_local_t *local = NULL; + afr_local_t *heal_local = NULL; + call_frame_t *heal_frame = NULL; + + local = frame->local; + dict = dict_new(); + if (!dict) { + op_errno = ENOMEM; + ret = -1; + goto out; + } + + heal_frame = afr_frame_create(this, &op_errno); + if (!heal_frame) { + ret = -1; + goto out; + } + heal_local = heal_frame->local; + heal_frame->local = frame->local; + /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk + * work correctly*/ + ret = afr_selfheal_do(heal_frame, this, loc->gfid); + + if (ret == 1 || ret == 2) { + ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg", + SFILE_NOT_IN_SPLIT_BRAIN); if (ret) - return ret; - - sources_count = AFR_COUNT (sources, priv->child_count); - if (!sources_count) - *spb = _gf_true; + gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED, + "Failed to set sh-fail-msg in dict"); + ret = 0; + goto out; + } else { + if (local->xdata_rsp) { + /* 'sh-fail-msg' has been set in the dict during self-heal.*/ + dict_copy(local->xdata_rsp, dict); + ret = 0; + } else if (ret < 0) { + op_errno = -ret; + ret = -1; + } + } - return ret; +out: + if (heal_frame) { + heal_frame->local = heal_local; + AFR_STACK_DESTROY(heal_frame); + } + if (local->op == GF_FOP_GETXATTR) + AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL); + else if (local->op == GF_FOP_SETXATTR) + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + if (dict) + dict_unref(dict); + return ret; } int -afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode, - uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb) +afr_get_child_index_from_name(xlator_t *this, char *name) { - int ret = -1; - afr_private_t *priv = NULL; - struct afr_reply *replies = NULL; + afr_private_t *priv = this->private; + int index = -1; - priv = this->private; + for (index = 0; index < priv->child_count; index++) { + if (!strcmp(priv->children[index]->name, name)) + goto out; + } + index = -1; +out: + return index; +} - replies = alloca0 (sizeof (*replies) * priv->child_count); +void +afr_priv_need_heal_set(afr_private_t *priv, gf_boolean_t need_heal) +{ + LOCK(&priv->lock); + { + priv->need_heal = need_heal; + } + UNLOCK(&priv->lock); +} - ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); - if (ret) - goto out; +void +afr_set_need_heal(xlator_t *this, afr_local_t *local) +{ + int i = 0; + afr_private_t *priv = this->private; + gf_boolean_t need_heal = _gf_false; - if (!afr_can_decide_split_brain_source_sinks (replies, - priv->child_count)) { - ret = -EAGAIN; - goto out; + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].need_heal) { + need_heal = _gf_true; + break; } + } + afr_priv_need_heal_set(priv, need_heal); + return; +} - ret = _afr_is_split_brain (frame, this, replies, - AFR_DATA_TRANSACTION, d_spb); - if (ret) - goto out; +gf_boolean_t +afr_get_need_heal(xlator_t *this) +{ + afr_private_t *priv = this->private; + gf_boolean_t need_heal = _gf_true; - ret = _afr_is_split_brain (frame, this, replies, - AFR_METADATA_TRANSACTION, m_spb); -out: - if (replies) { - afr_replies_wipe (replies, priv->child_count); - replies = NULL; - } - return ret; + LOCK(&priv->lock); + { + need_heal = priv->need_heal; + } + UNLOCK(&priv->lock); + return need_heal; } int -afr_get_split_brain_status_cbk (int ret, call_frame_t *frame, void *opaque) +afr_get_msg_id(char *op_type) { - GF_FREE (opaque); - return 0; + if (!strcmp(op_type, GF_AFR_REPLACE_BRICK)) + return AFR_MSG_REPLACE_BRICK_STATUS; + else if (!strcmp(op_type, GF_AFR_ADD_BRICK)) + return AFR_MSG_ADD_BRICK_STATUS; + return -1; } int -afr_get_split_brain_status (void *opaque) -{ - gf_boolean_t d_spb = _gf_false; - gf_boolean_t m_spb = _gf_false; - int ret = -1; - int op_errno = 0; - int i = 0; - char *choices = NULL; - char *status = NULL; - dict_t *dict = NULL; - inode_t *inode = NULL; - afr_private_t *priv = NULL; - xlator_t **children = NULL; - call_frame_t *frame = NULL; - xlator_t *this = NULL; - loc_t *loc = NULL; - afr_spb_status_t *data = NULL; - - data = opaque; - frame = data->frame; - this = frame->this; - loc = data->loc; - priv = this->private; - children = priv->children; - - inode = afr_inode_find (this, loc->gfid); - if (!inode) - goto out; +afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *heal_frame, + void *opaque) +{ + call_frame_t *txn_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *heal_local = NULL; + xlator_t *this = NULL; - dict = dict_new (); - if (!dict) { - op_errno = ENOMEM; - ret = -1; - goto out; - } + heal_local = heal_frame->local; + txn_frame = heal_local->heal_frame; + local = txn_frame->local; + this = txn_frame->this; - /* Calculation for string length : - * (child_count X length of child-name) + strlen (" Choices :") - * child-name consists of : - * a) 256 = max characters for volname according to GD_VOLUME_NAME_MAX - * b) strlen ("-client-00,") assuming 16 replicas - */ - choices = alloca0 (priv->child_count * (256 + strlen ("-client-00,")) + - strlen (" Choices:")); + /* Refresh the inode agan and proceed with the transaction.*/ + afr_inode_refresh(txn_frame, this, local->inode, NULL, local->refreshfn); - ret = afr_is_split_brain (frame, this, inode, loc->gfid, &d_spb, - &m_spb); - if (ret) { - op_errno = -ret; - if (ret == -EAGAIN) { - ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS, - SBRAIN_HEAL_NO_GO_MSG); - if (ret) { - gf_msg (this->name, GF_LOG_WARNING, - -ret, AFR_MSG_DICT_SET_FAILED, - "Failed to set GF_AFR_SBRAIN_STATUS in dict"); - } - } - ret = -1; - goto out; - } + AFR_STACK_DESTROY(heal_frame); - if (d_spb || m_spb) { - sprintf (choices, " Choices:"); - for (i = 0; i < priv->child_count; i++) { - strcat (choices, children[i]->name); - strcat (choices, ","); - } - choices[strlen (choices) - 1] = '\0'; + return 0; +} + +int +afr_fav_child_reset_sink_xattrs(void *opaque) +{ + call_frame_t *heal_frame = NULL; + call_frame_t *txn_frame = NULL; + xlator_t *this = NULL; + gf_boolean_t d_spb = _gf_false; + gf_boolean_t m_spb = _gf_false; + afr_local_t *heal_local = NULL; + afr_local_t *txn_local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; + unsigned char *locked_on = NULL; + unsigned char *sources = NULL; + unsigned char *sinks = NULL; + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + struct afr_reply *locked_replies = NULL; + int ret = 0; + + heal_frame = (call_frame_t *)opaque; + heal_local = heal_frame->local; + txn_frame = heal_local->heal_frame; + txn_local = txn_frame->local; + this = txn_frame->this; + inode = txn_local->inode; + priv = this->private; + locked_on = alloca0(priv->child_count); + sources = alloca0(priv->child_count); + sinks = alloca0(priv->child_count); + healed_sinks = alloca0(priv->child_count); + undid_pending = alloca0(priv->child_count); + locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count); + + ret = _afr_is_split_brain(txn_frame, this, txn_local->replies, + AFR_DATA_TRANSACTION, &d_spb); + + ret = _afr_is_split_brain(txn_frame, this, txn_local->replies, + AFR_METADATA_TRANSACTION, &m_spb); + + /* Take appropriate locks and reset sink xattrs. */ + if (d_spb) { + ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0, + locked_on); + { + if (ret < priv->child_count) + goto data_unlock; + ret = __afr_selfheal_data_prepare( + heal_frame, this, inode, locked_on, sources, sinks, + healed_sinks, undid_pending, locked_replies, NULL); + } + data_unlock: + afr_selfheal_uninodelk(heal_frame, this, inode, this->name, 0, 0, + locked_on); + } + + if (m_spb) { + memset(locked_on, 0, sizeof(*locked_on) * priv->child_count); + memset(undid_pending, 0, sizeof(*undid_pending) * priv->child_count); + ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, + LLONG_MAX - 1, 0, locked_on); + { + if (ret < priv->child_count) + goto mdata_unlock; + ret = __afr_selfheal_metadata_prepare( + heal_frame, this, inode, locked_on, sources, sinks, + healed_sinks, undid_pending, locked_replies, NULL); + } + mdata_unlock: + afr_selfheal_uninodelk(heal_frame, this, inode, this->name, + LLONG_MAX - 1, 0, locked_on); + } - ret = gf_asprintf (&status, "data-split-brain:%s " - "metadata-split-brain:%s%s", - (d_spb) ? "yes" : "no", - (m_spb) ? "yes" : "no", choices); + return ret; +} - if (-1 == ret) { - op_errno = ENOMEM; - goto out; - } - ret = dict_set_dynstr (dict, GF_AFR_SBRAIN_STATUS, status); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } +/* + * Concatenates the xattrs in local->replies separated by a delimiter. + */ +int +afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this, + char *buf, const char *default_str, + int32_t *serz_len, char delimiter) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + char *xattr = NULL; + int i = 0; + int len = 0; + int keylen = 0; + size_t str_len = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + + keylen = strlen(local->cont.getxattr.name); + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid || local->replies[i].op_ret) { + str_len = strlen(default_str); + buf = strncat(buf, default_str, str_len); + len += str_len; + buf[len++] = delimiter; + buf[len] = '\0'; } else { - ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS, - "The file is not under data or" - " metadata split-brain"); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - } + ret = dict_get_strn(local->replies[i].xattr, + local->cont.getxattr.name, keylen, &xattr); + if (ret) { + gf_msg("TEST", GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED, + "Failed to get the node_uuid of brick " + "%d", + i); + goto out; + } + str_len = strlen(xattr); + buf = strncat(buf, xattr, str_len); + len += str_len; + buf[len++] = delimiter; + buf[len] = '\0'; + } + } + buf[--len] = '\0'; /*remove the last delimiter*/ + if (serz_len) + *serz_len = ++len; + ret = 0; - ret = 0; out: - AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); - if (dict) - dict_unref (dict); - if (inode) - inode_unref (inode); - return ret; + return ret; } -int32_t -afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc) +uint64_t +afr_write_subvol_get(call_frame_t *frame, xlator_t *this) { - int ret = 0; - int op_errno = 0; - dict_t *dict = NULL; - afr_local_t *local = NULL; + afr_local_t *local = NULL; + uint64_t write_subvol = 0; - local = frame->local; - dict = dict_new (); - if (!dict) { - op_errno = ENOMEM; - ret = -1; - goto out; - } - - ret = afr_selfheal_do (frame, this, loc->gfid); + local = frame->local; + LOCK(&local->inode->lock); + write_subvol = local->inode_ctx->write_subvol; + UNLOCK(&local->inode->lock); - if (ret == 1 || ret == 2) { - ret = dict_set_str (dict, "sh-fail-msg", - "File not in split-brain"); - if (ret) - gf_msg (this->name, GF_LOG_WARNING, - -ret, AFR_MSG_DICT_SET_FAILED, - "Failed to set sh-fail-msg in dict"); - ret = 0; - goto out; - } else { - if (local->xdata_rsp) { - /* 'sh-fail-msg' has been set in the dict during self-heal.*/ - dict_copy (local->xdata_rsp, dict); - ret = 0; - } else if (ret < 0) { - op_errno = -ret; - ret = -1; - } - } + return write_subvol; +} -out: - if (local->op == GF_FOP_GETXATTR) - AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL); - else if (local->op == GF_FOP_SETXATTR) - AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); - if (dict) - dict_unref(dict); - return ret; +int +afr_write_subvol_set(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *data_accused = NULL; + unsigned char *metadata_accused = NULL; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + uint16_t datamap = 0; + uint16_t metadatamap = 0; + uint64_t val = 0; + int event = 0; + int i = 0; + + local = frame->local; + priv = this->private; + data_accused = alloca0(priv->child_count); + metadata_accused = alloca0(priv->child_count); + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + event = local->event_generation; + + afr_readables_fill(frame, this, local->inode, data_accused, + metadata_accused, data_readable, metadata_readable, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (data_readable[i]) + datamap |= (1 << i); + if (metadata_readable[i]) + metadatamap |= (1 << i); + } + + val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) | + (((uint64_t)event) << 32); + + LOCK(&local->inode->lock); + { + if (local->inode_ctx->write_subvol == 0 && + local->transaction.type == AFR_DATA_TRANSACTION) { + local->inode_ctx->write_subvol = val; + } + } + UNLOCK(&local->inode->lock); + + return 0; } int -afr_get_child_index_from_name (xlator_t *this, char *name) +afr_write_subvol_reset(call_frame_t *frame, xlator_t *this) { - afr_private_t *priv = this->private; - int index = -1; + afr_local_t *local = NULL; - for (index = 0; index < priv->child_count; index++) { - if (!strcmp (priv->children[index]->name, name)) - goto out; - } - index = -1; -out: - return index; -} + local = frame->local; + LOCK(&local->inode->lock); + { + GF_ASSERT(local->inode_ctx->lock_count > 0); + local->inode_ctx->lock_count--; -void -afr_priv_need_heal_set (afr_private_t *priv, gf_boolean_t need_heal) -{ - LOCK (&priv->lock); - { - priv->need_heal = need_heal; - } - UNLOCK (&priv->lock); + if (!local->inode_ctx->lock_count) + local->inode_ctx->write_subvol = 0; + } + UNLOCK(&local->inode->lock); + + return 0; } -void -afr_set_need_heal (xlator_t *this, afr_local_t *local) +int +afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode) { - int i = 0; - afr_private_t *priv = this->private; - gf_boolean_t need_heal = _gf_false; + int ret = 0; - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].valid && local->replies[i].need_heal) { - need_heal = _gf_true; - break; - } - } - afr_priv_need_heal_set (priv, need_heal); - return; + local->inode = inode_ref(inode); + LOCK(&local->inode->lock); + { + ret = __afr_inode_ctx_get(this, local->inode, &local->inode_ctx); + } + UNLOCK(&local->inode->lock); + if (ret < 0) { + gf_msg_callingfn( + this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_INODE_CTX_GET_FAILED, + "Error getting inode ctx %s", uuid_utoa(local->inode->gfid)); + } + return ret; } gf_boolean_t -afr_get_need_heal (xlator_t *this) +afr_ta_is_fop_called_from_synctask(xlator_t *this) { - afr_private_t *priv = this->private; - gf_boolean_t need_heal = _gf_true; + struct synctask *task = NULL; + gf_lkowner_t tmp_owner = { + 0, + }; - LOCK (&priv->lock); - { - need_heal = priv->need_heal; - } - UNLOCK (&priv->lock); - return need_heal; + task = synctask_get(); + if (!task) + return _gf_false; + + set_lk_owner_from_ptr(&tmp_owner, (void *)this); + + if (!is_same_lkowner(&tmp_owner, &task->frame->root->lk_owner)) + return _gf_false; + + return _gf_true; } int -afr_get_msg_id (char *op_type) -{ +afr_ta_post_op_lock(xlator_t *this, loc_t *loc) +{ + int ret = 0; + uuid_t gfid = { + 0, + }; + afr_private_t *priv = this->private; + gf_boolean_t locked = _gf_false; + struct gf_flock flock1 = { + 0, + }; + struct gf_flock flock2 = { + 0, + }; + int32_t cmd = 0; + + /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock + * has been released in afr_notify due to upcall notification from shd. + */ + GF_ASSERT(priv->ta_notify_dom_lock_offset == 0); + + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + flock1.l_type = F_WRLCK; + + while (!locked) { + if (priv->shd.iamshd) { + cmd = F_SETLKW; + flock1.l_start = 0; + flock1.l_len = 0; + } else { + cmd = F_SETLK; + gf_uuid_generate(gfid); + flock1.l_start = gfid_to_ino(gfid); + if (flock1.l_start < 0) + flock1.l_start = -flock1.l_start; + flock1.l_len = 1; + } + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, cmd, &flock1, NULL, NULL); + if (!ret) { + locked = _gf_true; + priv->ta_notify_dom_lock_offset = flock1.l_start; + } else if (ret == -EAGAIN) { + continue; + } else { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to get " + "AFR_TA_DOM_NOTIFY lock on %s.", + loc->name); + goto out; + } + } + + flock2.l_type = F_WRLCK; + flock2.l_start = 0; + flock2.l_len = 0; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name); + flock1.l_type = F_UNLCK; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL, + NULL); + } +out: + return ret; +} - if (!strcmp (op_type, GF_AFR_REPLACE_BRICK)) - return AFR_MSG_REPLACE_BRICK_STATUS; - else if (!strcmp (op_type, GF_AFR_ADD_BRICK)) - return AFR_MSG_ADD_BRICK_STATUS; - return -1; +int +afr_ta_post_op_unlock(xlator_t *this, loc_t *loc) +{ + afr_private_t *priv = this->private; + struct gf_flock flock = { + 0, + }; + int ret = 0; + + if (!priv->shd.iamshd) + GF_ASSERT(afr_ta_is_fop_called_from_synctask(this)); + flock.l_type = F_UNLCK; + flock.l_start = 0; + flock.l_len = 0; + + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_MODIFY lock."); + goto out; + } + + if (!priv->shd.iamshd) + /* Mounts (clients) will not release the AFR_TA_DOM_NOTIFY lock + * in post-op as they use it as a notification mechanism. When + * shd sends a lock request on TA during heal, the clients will + * receive a lock-contention upcall notification upon which they + * will release the AFR_TA_DOM_NOTIFY lock after completing the + * in flight I/O.*/ + goto out; + + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to unlock AFR_TA_DOM_NOTIFY lock."); + } +out: + return ret; } -gf_boolean_t -afr_can_compound_pre_op_and_op (afr_private_t *priv, glusterfs_fop_t fop) +call_frame_t * +afr_ta_frame_create(xlator_t *this) { - if (priv->arbiter_count != 0) - return _gf_false; - - if (!priv->use_compound_fops) - return _gf_false; + call_frame_t *frame = NULL; + void *lk_owner = NULL; - switch (fop) { - case GF_FOP_WRITE: - return _gf_true; - default: - return _gf_false; - } + frame = create_frame(this, this->ctx->pool); + if (!frame) + return NULL; + lk_owner = (void *)this; + afr_set_lk_owner(frame, this, lk_owner); + return frame; } -afr_compound_cbk_t -afr_pack_fop_args (call_frame_t *frame, compound_args_t *args, - glusterfs_fop_t fop, int index) +gf_boolean_t +afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local) { - afr_local_t *local = frame->local; + int data_count = 0; - switch (fop) { - case GF_FOP_WRITE: - COMPOUND_PACK_ARGS (writev, GF_FOP_WRITE, - args, index, - local->fd, local->cont.writev.vector, - local->cont.writev.count, - local->cont.writev.offset, - local->cont.writev.flags, - local->cont.writev.iobref, - local->xdata_req); - return afr_pre_op_writev_cbk; - default: - break; - } - return NULL; + data_count = AFR_COUNT(local->child_up, priv->child_count); + if (data_count == 2) { + return _gf_true; + } else if (data_count == 1 && local->ta_child_up) { + return _gf_true; + } + + return _gf_false; } -int -afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *heal_frame, - void *opaque) +static gf_boolean_t +afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame) { + afr_local_t *local = NULL; - call_frame_t *txn_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *heal_local = NULL; - xlator_t *this = NULL; + if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) + return _gf_false; - heal_local = heal_frame->local; - txn_frame = heal_local->heal_frame; - local = txn_frame->local; - this = txn_frame->this; + local = frame->local; - /* Refresh the inode agan and proceed with the transaction.*/ - afr_inode_refresh (txn_frame, this, local->inode, NULL, - local->refreshfn); + if (local->op != GF_FOP_LOOKUP) + /* TODO:If the replica count is being increased on a plain distribute + * volume that was never mounted, we need to allow setxattr on '/' with + * GF_CLIENT_PID_NO_ROOT_SQUASH to accomodate for DHT layout setting */ + return _gf_false; - if (heal_frame) - AFR_STACK_DESTROY (heal_frame); + if (local->inode == NULL) + return _gf_false; - return 0; + if (!__is_root_gfid(local->inode->gfid)) + return _gf_false; + + return _gf_true; } -int -afr_fav_child_reset_sink_xattrs (void *opaque) -{ - call_frame_t *heal_frame = NULL; - call_frame_t *txn_frame = NULL; - xlator_t *this = NULL; - gf_boolean_t d_spb = _gf_false; - gf_boolean_t m_spb = _gf_false; - afr_local_t *heal_local = NULL; - afr_local_t *txn_local = NULL; - afr_private_t *priv = NULL; - inode_t *inode = NULL; - unsigned char *locked_on = NULL; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *healed_sinks = NULL; - unsigned char *undid_pending = NULL; - struct afr_reply *locked_replies = NULL; - int ret = 0; - - heal_frame = (call_frame_t *) opaque; - heal_local = heal_frame->local; - txn_frame = heal_local->heal_frame; - txn_local = txn_frame->local; - this = txn_frame->this; - inode = txn_local->inode; - priv = this->private; - locked_on = alloca0 (priv->child_count); - sources = alloca0 (priv->child_count); - sinks = alloca0 (priv->child_count); - healed_sinks = alloca0 (priv->child_count); - undid_pending = alloca0 (priv->child_count); - locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); - - ret = _afr_is_split_brain (txn_frame, this, txn_local->replies, - AFR_DATA_TRANSACTION, &d_spb); - - ret = _afr_is_split_brain (txn_frame, this, txn_local->replies, - AFR_METADATA_TRANSACTION, &m_spb); - - /* Take appropriate locks and reset sink xattrs. */ - if (d_spb) { - ret = afr_selfheal_inodelk (heal_frame, this, inode, this->name, - 0, 0, locked_on); - { - if (ret < AFR_SH_MIN_PARTICIPANTS) - goto data_unlock; - ret = __afr_selfheal_data_prepare (heal_frame, this, - inode, locked_on, - sources, sinks, - healed_sinks, - undid_pending, - locked_replies, - NULL); - } -data_unlock: - afr_selfheal_uninodelk (heal_frame, this, inode, this->name, - 0, 0, locked_on); - } - - if (m_spb) { - memset (locked_on, 0, sizeof (*locked_on) * priv->child_count); - memset (undid_pending, 0, - sizeof (*undid_pending) * priv->child_count); - ret = afr_selfheal_inodelk (heal_frame, this, inode, this->name, - LLONG_MAX-1, 0, locked_on); - { - if (ret < AFR_SH_MIN_PARTICIPANTS) - goto mdata_unlock; - ret = __afr_selfheal_metadata_prepare (heal_frame, this, - inode, locked_on, - sources, sinks, - healed_sinks, - undid_pending, - locked_replies, - NULL); +gf_boolean_t +afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count) +{ + if (frame && (up_children_count > 0) && + afr_is_add_replica_mount_lookup_on_root(frame)) + return _gf_true; - } -mdata_unlock: - afr_selfheal_uninodelk (heal_frame, this, inode, this->name, - LLONG_MAX-1, 0, locked_on); - } + return _gf_false; +} - return ret; +void +afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + unsigned char *success_replies = NULL; -} + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); -/* - * Concatenates the xattrs in local->replies separated by a delimiter. - */ -int -afr_serialize_xattrs_with_delimiter (call_frame_t *frame, xlator_t *this, - char *buf, const char *default_str, - int32_t *serz_len, char delimiter) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - char *xattr = NULL; - int i = 0; - int len = 0; - int ret = -1; - - priv = this->private; - local = frame->local; + if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) { + local->op_errno = afr_final_errno(local, priv); + if (!local->op_errno) + local->op_errno = afr_quorum_errno(priv); + local->op_ret = -1; + } +} - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid || local->replies[i].op_ret) { - buf = strncat (buf, default_str, strlen (default_str)); - len += strlen (default_str); - buf[len++] = delimiter; - buf[len] = '\0'; - } else { - ret = dict_get_str (local->replies[i].xattr, - local->cont.getxattr.name, &xattr); - if (ret) { - gf_msg ("TEST", GF_LOG_ERROR, -ret, - AFR_MSG_DICT_GET_FAILED, - "Failed to get the node_uuid of brick " - "%d", i); - goto out; - } - buf = strncat (buf, xattr, strlen (xattr)); - len += strlen (xattr); - buf[len++] = delimiter; - buf[len] = '\0'; - } +gf_boolean_t +afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child) +{ + int *pending = NULL; + int ret = 0; + int i = 0; + + ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending); + if (ret == 0) { + for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { + /* Not doing a ntoh32(pending) as we just want to check + * if it is non-zero or not. */ + if (pending[i]) { + return _gf_true; + } } - buf[--len] = '\0'; /*remove the last delimiter*/ - if (serz_len) - *serz_len = ++len; - ret = 0; + } -out: - return ret; + return _gf_false; } diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c index 5218d386c79..f8bf8340dab 100644 --- a/xlators/cluster/afr/src/afr-dir-read.c +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -8,348 +8,339 @@ cases as published by the Free Software Foundation. */ - #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> #include <string.h> -#include "glusterfs.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "checksum.h" +#include <glusterfs/glusterfs.h> +#include <glusterfs/dict.h> +#include <glusterfs/list.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> #include "afr.h" #include "afr-transaction.h" - int32_t -afr_opendir_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, dict_t *xdata) +afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = -1; - int32_t child_index = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - fd_ctx = local->fd_ctx; - child_index = (long) cookie; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; - } else { - local->op_ret = op_ret; - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - if (!local->xdata_rsp && xdata) - local->xdata_rsp = dict_ref (xdata); - } + afr_local_t *local = NULL; + int call_count = -1; + int32_t child_index = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + child_index = (long)cookie; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } - UNLOCK (&frame->lock); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); - call_count = afr_frame_return (frame); + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); + AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno, + local->fd, NULL); + } - if (call_count == 0) - AFR_STACK_UNWIND (opendir, frame, local->op_ret, - local->op_errno, local->fd, NULL); - return 0; + return 0; } - int -afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata) { - afr_private_t * priv = NULL; - afr_local_t * local = NULL; - int i = 0; - int call_count = -1; - int32_t op_errno = ENOMEM; - afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = -1; + int32_t op_errno = ENOMEM; + afr_fd_ctx_t *fd_ctx = NULL; - priv = this->private; + priv = this->private; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_OPENDIR; - if (!afr_is_consistent_io_possible (local, priv, &op_errno)) - goto out; + local->op = GF_FOP_OPENDIR; - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } - loc_copy (&local->loc, loc); + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; - local->fd = fd_ref (fd); - local->fd_ctx = fd_ctx; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + goto out; - call_count = local->call_count; + loc_copy(&local->loc, loc); - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_opendir_cbk, - (void*) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - loc, fd, NULL); + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; - if (!--call_count) - break; - } + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->opendir, loc, fd, NULL); + + if (!--call_count) + break; } + } - return 0; + return 0; out: - AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL); - return 0; + AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL); + return 0; } static int -afr_validate_read_subvol (inode_t *inode, xlator_t *this, int par_read_subvol) +afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol) { - int gen = 0; - int entry_read_subvol = 0; - unsigned char *data_readable = NULL; - unsigned char *metadata_readable = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - data_readable = alloca0 (priv->child_count); - metadata_readable = alloca0 (priv->child_count); - - afr_inode_read_subvol_get (inode, this, data_readable, - metadata_readable, &gen); - - if (gen != priv->event_generation || - !data_readable[par_read_subvol] || - !metadata_readable[par_read_subvol]) - return -1; - - /* Once the control reaches the following statement, it means that the - * parent's read subvol is perfectly readable. So calling - * either afr_data_subvol_get() or afr_metadata_subvol_get() would - * yield the same result. Hence, choosing afr_data_subvol_get() below. - */ - - if (!priv->consistent_metadata) - return 0; - - /* For an inode fetched through readdirp which is yet to be linked, - * inode ctx would not be initialised (yet). So this function returns - * -1 above due to gen being 0, which is why it is OK to pass NULL for - * read_subvol_args here. - */ - entry_read_subvol = afr_data_subvol_get (inode, this, NULL, NULL, - NULL, NULL); - if (entry_read_subvol != par_read_subvol) - return -1; - + int gen = 0; + int entry_read_subvol = 0; + unsigned char *data_readable = NULL; + unsigned char *metadata_readable = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + data_readable = alloca0(priv->child_count); + metadata_readable = alloca0(priv->child_count); + + afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable, + &gen); + + if (gen != priv->event_generation || !data_readable[par_read_subvol] || + !metadata_readable[par_read_subvol]) + return -1; + + /* Once the control reaches the following statement, it means that the + * parent's read subvol is perfectly readable. So calling + * either afr_data_subvol_get() or afr_metadata_subvol_get() would + * yield the same result. Hence, choosing afr_data_subvol_get() below. + */ + + if (!priv->consistent_metadata) return 0; + /* For an inode fetched through readdirp which is yet to be linked, + * inode ctx would not be initialised (yet). So this function returns + * -1 above due to gen being 0, which is why it is OK to pass NULL for + * read_subvol_args here. + */ + entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL, + NULL); + if (entry_read_subvol != par_read_subvol) + return -1; + + return 0; } static void -afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol, - gf_dirent_t *entries, fd_t *fd) +afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries, + int subvol, gf_dirent_t *entries, fd_t *fd) { - int ret = -1; - gf_dirent_t *entry = NULL; - gf_dirent_t *tmp = NULL; - xlator_t *this = NULL; - afr_private_t *priv = NULL; - gf_boolean_t need_heal = _gf_false; - gf_boolean_t validate_subvol = _gf_false; - - this = THIS; - priv = this->private; - - need_heal = afr_get_need_heal (this); - validate_subvol = need_heal | priv->consistent_metadata; - - list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) { - if (__is_root_gfid (fd->inode->gfid) && - !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) { - continue; - } - - list_del_init (&entry->list); - list_add_tail (&entry->list, &entries->list); - - if (!validate_subvol) - continue; - - if (entry->inode) { - ret = afr_validate_read_subvol (entry->inode, this, - subvol); - if (ret == -1) { - inode_unref (entry->inode); - entry->inode = NULL; - continue; - } - } + int ret = -1; + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + xlator_t *this = NULL; + afr_private_t *priv = NULL; + gf_boolean_t need_heal = _gf_false; + gf_boolean_t validate_subvol = _gf_false; + + this = THIS; + priv = this->private; + + need_heal = afr_get_need_heal(this); + validate_subvol = need_heal | priv->consistent_metadata; + + list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list) + { + if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name, + frame->root->pid)) { + continue; } -} + list_del_init(&entry->list); + list_add_tail(&entry->list, &entries->list); + + if (!validate_subvol) + continue; + + if (entry->inode) { + ret = afr_validate_read_subvol(entry->inode, this, subvol); + if (ret == -1) { + inode_unref(entry->inode); + entry->inode = NULL; + continue; + } + } + } +} int32_t -afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, - dict_t *xdata) +afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries, + dict_t *xdata) { - afr_local_t *local = NULL; - gf_dirent_t entries; + afr_local_t *local = NULL; + gf_dirent_t entries; - INIT_LIST_HEAD (&entries.list); + INIT_LIST_HEAD(&entries.list); - local = frame->local; + local = frame->local; - if (op_ret < 0 && !local->cont.readdir.offset) { - /* failover only if this was first readdir, detected - by offset == 0 */ - local->op_ret = op_ret; - local->op_errno = op_errno; + if (op_ret < 0 && !local->cont.readdir.offset) { + /* failover only if this was first readdir, detected + by offset == 0 */ + local->op_ret = op_ret; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - if (op_ret >= 0) - afr_readdir_transform_entries (subvol_entries, (long) cookie, - &entries, local->fd); + if (op_ret >= 0) + afr_readdir_transform_entries(frame, subvol_entries, (long)cookie, + &entries, local->fd); - AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata); + AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata); - gf_dirent_free (&entries); + gf_dirent_free(&entries); - return 0; + return 0; } - int -afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - priv = this->private; - local = frame->local; - fd_ctx = afr_fd_ctx_get (local->fd, this); - - if (subvol == -1) { - AFR_STACK_UNWIND (readdir, frame, local->op_ret, - local->op_errno, 0, 0); - return 0; - } - - fd_ctx->readdir_subvol = subvol; - - if (local->op == GF_FOP_READDIR) - STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->readdir, - local->fd, local->cont.readdir.size, - local->cont.readdir.offset, - local->xdata_req); - else - STACK_WIND_COOKIE (frame, afr_readdir_cbk, - (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->readdirp, - local->fd, local->cont.readdir.size, - local->cont.readdir.offset, - local->xdata_req); - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + priv = this->private; + local = frame->local; + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + local->op_errno = EINVAL; + local->op_ret = -1; + } + + if (subvol == -1 || !fd_ctx) { + AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0); + return 0; + } + + fd_ctx->readdir_subvol = subvol; + + if (local->op == GF_FOP_READDIR) + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdir, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + else + STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readdirp, local->fd, + local->cont.readdir.size, local->cont.readdir.offset, + local->xdata_req); + return 0; } - int -afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, int whichop, dict_t *dict) +afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, int whichop, dict_t *dict) { - afr_local_t *local = NULL; - int32_t op_errno = 0; - int subvol = -1; - afr_fd_ctx_t *fd_ctx = NULL; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - op_errno = EINVAL; - goto out; - } - - local->op = whichop; - local->fd = fd_ref (fd); - local->cont.readdir.size = size; - local->cont.readdir.offset = offset; - local->xdata_req = (dict)? dict_ref (dict) : NULL; - - subvol = fd_ctx->readdir_subvol; - - if (offset == 0 || subvol == -1) { - /* First readdir has option of failing over and selecting - an appropriate read subvolume */ - afr_read_txn (frame, this, fd->inode, afr_readdir_wind, - AFR_DATA_TRANSACTION); - } else { - /* But continued readdirs MUST stick to the same subvolume - without an option to failover */ - afr_readdir_wind (frame, this, subvol); - } - - return 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; + int subvol = -1; + afr_fd_ctx_t *fd_ctx = NULL; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) { + op_errno = EINVAL; + goto out; + } + + local->op = whichop; + local->fd = fd_ref(fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; + local->xdata_req = (dict) ? dict_ref(dict) : NULL; + + subvol = fd_ctx->readdir_subvol; + + if (offset == 0 || subvol == -1) { + /* First readdir has option of failing over and selecting + an appropriate read subvolume */ + afr_read_txn(frame, this, fd->inode, afr_readdir_wind, + AFR_DATA_TRANSACTION); + } else { + /* But continued readdirs MUST stick to the same subvolume + without an option to failover */ + afr_readdir_wind(frame, this, subvol); + } + + return 0; out: - AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL); - return 0; + AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); + return 0; } - int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *xdata) +afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata); + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata); - return 0; + return 0; } - int32_t -afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, dict_t *dict) +afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict) { - afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict); + afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict); - return 0; + return 0; } - int32_t -afr_releasedir (xlator_t *this, fd_t *fd) +afr_releasedir(xlator_t *this, fd_t *fd) { - afr_cleanup_fd_ctx (this, fd); + afr_cleanup_fd_ctx(this, fd); - return 0; + return 0; } diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h index 09456d15949..773e925ec6c 100644 --- a/xlators/cluster/afr/src/afr-dir-read.h +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -11,26 +11,23 @@ #ifndef __DIR_READ_H__ #define __DIR_READ_H__ - int32_t -afr_opendir (call_frame_t *frame, xlator_t *this, - loc_t *loc, fd_t *fd, dict_t *xdata); +afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + dict_t *xdata); int32_t -afr_releasedir (xlator_t *this, fd_t *fd); +afr_releasedir(xlator_t *this, fd_t *fd); int32_t -afr_readdir (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, dict_t *xdata); - +afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *xdata); int32_t -afr_readdirp (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, dict_t *dict); +afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, dict_t *dict); int32_t -afr_checksum (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, dict_t *xdata); - +afr_checksum(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + dict_t *xdata); #endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c index 1afd5d39670..b7cceb79158 100644 --- a/xlators/cluster/afr/src/afr-dir-write.c +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -8,529 +8,493 @@ cases as published by the Free Software Foundation. */ - #include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> #include "afr.h" #include "afr-transaction.h" void -afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this); +afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this); int -afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno) +afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno) { - int ret = -1; - char *child_path = NULL; - - if (!child->parent) { - if (op_errno) - *op_errno = EINVAL; - goto out; - } - - child_path = gf_strdup (child->path); - if (!child_path) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } - - parent->path = gf_strdup (dirname (child_path)); - if (!parent->path) { - if (op_errno) - *op_errno = ENOMEM; - goto out; - } - - parent->inode = inode_ref (child->parent); - gf_uuid_copy (parent->gfid, child->pargfid); - - ret = 0; + int ret = -1; + char *child_path = NULL; + + if (!child->parent) { + if (op_errno) + *op_errno = EINVAL; + goto out; + } + + child_path = gf_strdup(child->path); + if (!child_path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + parent->path = gf_strdup(dirname(child_path)); + if (!parent->path) { + if (op_errno) + *op_errno = ENOMEM; + goto out; + } + + parent->inode = inode_ref(child->parent); + gf_uuid_copy(parent->gfid, child->pargfid); + + ret = 0; out: - GF_FREE (child_path); + GF_FREE(child_path); - return ret; + return ret; } - static void -__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this) +__afr_dir_write_finalize(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int inode_read_subvol = -1; - int parent_read_subvol = -1; - int parent2_read_subvol = -1; - int i = 0; - afr_read_subvol_args_t args = {0,}; - - local = frame->local; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret == -1) - continue; - gf_uuid_copy (args.gfid, local->replies[i].poststat.ia_gfid); - args.ia_type = local->replies[i].poststat.ia_type; - break; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int inode_read_subvol = -1; + int parent_read_subvol = -1; + int parent2_read_subvol = -1; + int i = 0; + afr_read_subvol_args_t args = { + 0, + }; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == -1) + continue; + gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid); + args.ia_type = local->replies[i].poststat.ia_type; + break; + } + + if (local->inode) { + if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK) + afr_replies_interpret(frame, this, local->inode, NULL); + + inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL, + NULL, &args); + } + + if (local->parent) + parent_read_subvol = afr_data_subvol_get(local->parent, this, NULL, + local->readable, NULL, NULL); + + if (local->parent2) + parent2_read_subvol = afr_data_subvol_get(local->parent2, this, NULL, + local->readable2, NULL, NULL); + + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_pick_error_xdata(local, priv, local->parent, local->readable, + local->parent2, local->readable2); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) { + if (local->inode) + afr_inode_need_refresh_set(local->inode, this); + if (local->parent) + afr_inode_need_refresh_set(local->parent, this); + if (local->parent2) + afr_inode_need_refresh_set(local->parent2, this); + continue; } - if (local->inode) { - afr_replies_interpret (frame, this, local->inode, NULL); - inode_read_subvol = afr_data_subvol_get (local->inode, this, - NULL, NULL, NULL, &args); - } - - if (local->parent) - parent_read_subvol = afr_data_subvol_get (local->parent, this, - NULL, local->readable, NULL, NULL); - - if (local->parent2) - parent2_read_subvol = afr_data_subvol_get (local->parent2, this, - NULL, local->readable2, NULL, NULL); - - local->op_ret = -1; - local->op_errno = afr_final_errno (local, priv); - afr_pick_error_xdata (local, priv, local->parent, local->readable, - local->parent2, local->readable2); - - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret < 0) { - if (local->inode) - afr_inode_event_gen_reset (local->inode, this); - if (local->parent) - afr_inode_event_gen_reset (local->parent, - this); - if (local->parent2) - afr_inode_event_gen_reset (local->parent2, - this); - continue; - } - - if (local->op_ret == -1) { - local->op_ret = local->replies[i].op_ret; - local->op_errno = local->replies[i].op_errno; - - local->cont.dir_fop.buf = - local->replies[i].poststat; - local->cont.dir_fop.preparent = - local->replies[i].preparent; - local->cont.dir_fop.postparent = - local->replies[i].postparent; - local->cont.dir_fop.prenewparent = - local->replies[i].preparent2; - local->cont.dir_fop.postnewparent = - local->replies[i].postparent2; - if (local->xdata_rsp) { - dict_unref (local->xdata_rsp); - local->xdata_rsp = NULL; - } - - if (local->replies[i].xdata) - local->xdata_rsp = - dict_ref (local->replies[i].xdata); - continue; - } - - if (i == inode_read_subvol) { - local->cont.dir_fop.buf = - local->replies[i].poststat; - if (local->replies[i].xdata) { - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); - local->xdata_rsp = - dict_ref (local->replies[i].xdata); - } - } - - if (i == parent_read_subvol) { - local->cont.dir_fop.preparent = - local->replies[i].preparent; - local->cont.dir_fop.postparent = - local->replies[i].postparent; - } - - if (i == parent2_read_subvol) { - local->cont.dir_fop.prenewparent = - local->replies[i].preparent2; - local->cont.dir_fop.postnewparent = - local->replies[i].postparent2; - } - } + if (local->op_ret == -1) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.dir_fop.buf = local->replies[i].poststat; + local->cont.dir_fop.preparent = local->replies[i].preparent; + local->cont.dir_fop.postparent = local->replies[i].postparent; + local->cont.dir_fop.prenewparent = local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = local->replies[i].postparent2; + if (local->xdata_rsp) { + dict_unref(local->xdata_rsp); + local->xdata_rsp = NULL; + } + + if (local->replies[i].xdata) + local->xdata_rsp = dict_ref(local->replies[i].xdata); + continue; + } -} + if (i == inode_read_subvol) { + local->cont.dir_fop.buf = local->replies[i].poststat; + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + } + if (i == parent_read_subvol) { + local->cont.dir_fop.preparent = local->replies[i].preparent; + local->cont.dir_fop.postparent = local->replies[i].postparent; + } + + if (i == parent2_read_subvol) { + local->cont.dir_fop.prenewparent = local->replies[i].preparent2; + local->cont.dir_fop.postnewparent = local->replies[i].postparent2; + } + } +} static void -__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index, - int op_ret, int op_errno, struct iatt *poststat, - struct iatt *preparent, struct iatt *postparent, - struct iatt *preparent2, struct iatt *postparent2, - dict_t *xdata) +__afr_dir_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *poststat, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - fd_ctx = local->fd_ctx; - - local->replies[child_index].valid = 1; - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - if (xdata) - local->replies[child_index].xdata = dict_ref (xdata); - - - if (op_ret >= 0) { - if (poststat) - local->replies[child_index].poststat = *poststat; - if (preparent) - local->replies[child_index].preparent = *preparent; - if (postparent) - local->replies[child_index].postparent = *postparent; - if (preparent2) - local->replies[child_index].preparent2 = *preparent2; - if (postparent2) - local->replies[child_index].postparent2 = *postparent2; - if (fd_ctx) - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - } else { - if (op_errno != ENOTEMPTY) - afr_transaction_fop_failed (frame, this, child_index); - if (fd_ctx) - fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; - } - - return; + afr_local_t *local = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); + + if (op_ret >= 0) { + if (poststat) + local->replies[child_index].poststat = *poststat; + if (preparent) + local->replies[child_index].preparent = *preparent; + if (postparent) + local->replies[child_index].postparent = *postparent; + if (preparent2) + local->replies[child_index].preparent2 = *preparent2; + if (postparent2) + local->replies[child_index].postparent2 = *postparent2; + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + } else { + if (op_errno != ENOTEMPTY) + afr_transaction_fop_failed(frame, this, child_index); + if (fd_ctx) + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } + + return; } - static int -__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent, - struct iatt *preparent2, struct iatt *postparent2, - dict_t *xdata) +__afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, + struct iatt *preparent2, struct iatt *postparent2, + dict_t *xdata) { - afr_local_t *local = NULL; - int child_index = (long) cookie; - int call_count = -1; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - __afr_dir_write_fill (frame, this, child_index, op_ret, - op_errno, buf, preparent, postparent, - preparent2, postparent2, xdata); - } - UNLOCK (&frame->lock); - call_count = afr_frame_return (frame); - - if (call_count == 0) { - __afr_dir_write_finalize (frame, this); - - if (afr_txn_nothing_failed (frame, this)) { - /*if it did pre-op, it will do post-op changing ctime*/ - if (priv->consistent_metadata && - afr_needs_changelog_update (local)) - afr_zero_fill_stat (local); - local->transaction.unwind (frame, this); - } - - afr_mark_entry_pending_changelog (frame, this); - - local->transaction.resume (frame, this); + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + LOCK(&frame->lock); + { + __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf, + preparent, postparent, preparent2, postparent2, + xdata); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + __afr_dir_write_finalize(frame, this); + + if (afr_txn_nothing_failed(frame, this)) { + /*if it did pre-op, it will do post-op changing ctime*/ + if (priv->consistent_metadata && afr_needs_changelog_update(local)) + afr_zero_fill_stat(local); + local->transaction.unwind(frame, this); } - return 0; -} + afr_mark_entry_pending_changelog(frame, this); + + afr_transaction_resume(frame, this); + } + return 0; +} int -afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - dict_t *xattr, dict_t *xdata) +afr_mark_new_entry_changelog_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - int call_count = 0; + int call_count = 0; - call_count = afr_frame_return (frame); + call_count = afr_frame_return(frame); - if (call_count == 0) - AFR_STACK_DESTROY (frame); + if (call_count == 0) + AFR_STACK_DESTROY(frame); - return 0; + return 0; } - void -afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this) +afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this) { - call_frame_t *new_frame = NULL; - afr_local_t *local = NULL; - afr_local_t *new_local = NULL; - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - int32_t **changelog = NULL; - int i = 0; - int op_errno = ENOMEM; - unsigned char *pending = NULL; - int call_count = 0; - - local = frame->local; - priv = this->private; - - new_frame = copy_frame (frame); - if (!new_frame) - goto out; - - new_local = AFR_FRAME_INIT (new_frame, op_errno); - if (!new_local) - goto out; - - xattr = dict_new (); - if (!xattr) - goto out; - - pending = alloca0 (priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (local->transaction.pre_op[i] && - !local->transaction.failed_subvols[i]) { - call_count ++; - continue; - } - pending[i] = 1; - } - - changelog = afr_mark_pending_changelog (priv, pending, xattr, - local->cont.dir_fop.buf.ia_type); - if (!changelog) - goto out; - - new_local->pending = changelog; - gf_uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); - new_local->loc.inode = inode_ref (local->inode); - - new_local->call_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (pending[i]) - continue; - - STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->xattrop, - &new_local->loc, GF_XATTROP_ADD_ARRAY, - xattr, NULL); - if (!--call_count) - break; + call_frame_t *new_frame = NULL; + afr_local_t *local = NULL; + afr_local_t *new_local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int32_t **changelog = NULL; + int i = 0; + int op_errno = ENOMEM; + unsigned char *pending = NULL; + int call_count = 0; + + local = frame->local; + priv = this->private; + + new_frame = copy_frame(frame); + if (!new_frame) + goto out; + + new_local = AFR_FRAME_INIT(new_frame, op_errno); + if (!new_local) + goto out; + + xattr = dict_new(); + if (!xattr) + goto out; + + pending = alloca0(priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] && + !local->transaction.failed_subvols[i]) { + call_count++; + continue; } + pending[i] = 1; + } + + changelog = afr_mark_pending_changelog(priv, pending, xattr, + local->cont.dir_fop.buf.ia_type); + if (!changelog) + goto out; + + new_local->pending = changelog; + gf_uuid_copy(new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid); + new_local->loc.inode = inode_ref(local->inode); - new_frame = NULL; + new_local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (pending[i]) + continue; + + STACK_WIND_COOKIE(new_frame, afr_mark_new_entry_changelog_cbk, + (void *)(long)i, priv->children[i], + priv->children[i]->fops->xattrop, &new_local->loc, + GF_XATTROP_ADD_ARRAY, xattr, NULL); + if (!--call_count) + break; + } + + new_frame = NULL; out: - if (new_frame) - AFR_STACK_DESTROY (new_frame); - if (xattr) - dict_unref (xattr); - return; + if (new_frame) + AFR_STACK_DESTROY(new_frame); + if (xattr) + dict_unref(xattr); + return; } - void -afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this) +afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int pre_op_count = 0; - int failed_count = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_count = 0; + int failed_count = 0; + unsigned char *success_replies = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - if (local->op_ret < 0) - return; + if (local->op_ret < 0) + return; - if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD && - local->op != GF_FOP_MKDIR) - return; + if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD && + local->op != GF_FOP_MKDIR) + return; - pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count); - failed_count = AFR_COUNT (local->transaction.failed_subvols, - priv->child_count); + pre_op_count = AFR_COUNT(local->transaction.pre_op, priv->child_count); + failed_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); - if (pre_op_count == priv->child_count && !failed_count) - return; + /* FOP succeeded on all bricks. */ + if (pre_op_count == priv->child_count && !failed_count) + return; - afr_mark_new_entry_changelog (frame, this); + /* FOP did not suceed on quorum no. of bricks. */ + success_replies = alloca0(priv->child_count); + afr_fill_success_replies(local, priv, success_replies); + if (!afr_has_quorum(success_replies, this, NULL)) + return; + if (priv->thin_arbiter_count) { + /*Mark new entry using ta file*/ + local->is_new_entry = _gf_true; return; -} + } + afr_mark_new_entry_changelog(frame, this); + + return; +} /* {{{ create */ int -afr_create_unwind (call_frame_t *frame, xlator_t *this) +afr_create_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); + local = frame->local; - if (!main_frame) - return 0; + main_frame = afr_transaction_detach_fop_frame(frame); - AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno, - local->cont.create.fd, local->inode, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, local->xdata_rsp); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(create, main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, local->inode, + &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - fd_t *fd, inode_t *inode, struct iatt *buf, - struct iatt *preparent, struct iatt *postparent, - dict_t *xdata) +afr_create_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, - preparent, postparent, NULL, NULL, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_create_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->create, - &local->loc, local->cont.create.flags, - local->cont.create.mode, local->umask, - local->cont.create.fd, local->xdata_req); - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_create_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->create, &local->loc, + local->cont.create.flags, local->cont.create.mode, + local->umask, local->cont.create.fd, local->xdata_req); + return 0; } - int -afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, loc); - - local->fd_ctx = afr_fd_ctx_get (fd, this); - if (!local->fd_ctx) - goto out; - - local->inode = inode_ref (loc->inode); - local->parent = inode_ref (loc->parent); - - local->op = GF_FOP_CREATE; - local->cont.create.flags = flags; - local->fd_ctx->flags = flags; - local->cont.create.mode = mode; - local->cont.create.fd = fd_ref (fd); - local->umask = umask; - - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); - - if (!local->xdata_req) - goto out; - - local->transaction.wind = afr_create_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_create_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + + local->fd_ctx = afr_fd_ctx_get(fd, this); + if (!local->fd_ctx) + goto out; + + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->op = GF_FOP_CREATE; + local->cont.create.flags = flags; + local->fd_ctx->flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref(fd); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_create_wind; + local->transaction.unwind = afr_create_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL, NULL); - return 0; + AFR_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -538,524 +502,436 @@ out: /* {{{ mknod */ int -afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +afr_mknod_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; - - AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno, - local->inode, &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(mknod, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_mknod_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, - preparent, postparent, NULL, NULL, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_mknod_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->mknod, - &local->loc, local->cont.mknod.mode, - local->cont.mknod.dev, local->umask, - local->xdata_req); - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_mknod_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->mknod, &local->loc, + local->cont.mknod.mode, local->cont.mknod.dev, + local->umask, local->xdata_req); + return 0; } int -afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - dev_t dev, mode_t umask, dict_t *xdata) +afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); - local->parent = inode_ref (loc->parent); - - local->op = GF_FOP_MKNOD; - local->cont.mknod.mode = mode; - local->cont.mknod.dev = dev; - local->umask = umask; - - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); - - if (!local->xdata_req) - goto out; - - local->transaction.wind = afr_mknod_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_mknod_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->op = GF_FOP_MKNOD; + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->transaction.wind = afr_mknod_wind; + local->transaction.unwind = afr_mknod_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL); - return 0; + AFR_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ mkdir */ - int -afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +afr_mkdir_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno, - local->inode, &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(mkdir, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_mkdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, - preparent, postparent, NULL, NULL, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_mkdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->mkdir, &local->loc, - local->cont.mkdir.mode, local->umask, - local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_mkdir_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->mkdir, &local->loc, + local->cont.mkdir.mode, local->umask, local->xdata_req); + return 0; } - int -afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, - mode_t umask, dict_t *xdata) +afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); - local->parent = inode_ref (loc->parent); - - local->cont.mkdir.mode = mode; - local->umask = umask; - - if (!xdata || !dict_get (xdata, "gfid-req")) { - op_errno = EPERM; - gf_msg_callingfn (this->name, GF_LOG_WARNING, op_errno, - AFR_MSG_GFID_NULL, "mkdir: %s is received " - "without gfid-req %p", loc->path, xdata); - goto out; - } - - local->xdata_req = dict_copy_with_ref (xdata, NULL); - if (!local->xdata_req) { - op_errno = ENOMEM; - goto out; - } - - local->op = GF_FOP_MKDIR; - local->transaction.wind = afr_mkdir_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_mkdir_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.mkdir.mode = mode; + local->umask = umask; + + if (!xdata || !dict_get_sizen(xdata, "gfid-req")) { + op_errno = EPERM; + gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_GFID_NULL, + "mkdir: %s is received " + "without gfid-req %p", + loc->path, xdata); + goto out; + } + + local->xdata_req = dict_copy_with_ref(xdata, NULL); + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + local->op = GF_FOP_MKDIR; + local->transaction.wind = afr_mkdir_wind; + local->transaction.unwind = afr_mkdir_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL); - return 0; + AFR_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ link */ - int -afr_link_unwind (call_frame_t *frame, xlator_t *this) +afr_link_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno, - local->inode, &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(link, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_link_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, - preparent, postparent, NULL, NULL, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_link_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->link, - &local->loc, &local->newloc, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_link_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->link, &local->loc, + &local->newloc, local->xdata_req); + return 0; } - int -afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) +afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - - local->inode = inode_ref (oldloc->inode); - local->parent = inode_ref (newloc->parent); - - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); - - if (!local->xdata_req) - goto out; - - local->op = GF_FOP_LINK; - - local->transaction.wind = afr_link_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_link_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (newloc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, oldloc); + loc_copy(&local->newloc, newloc); - return 0; + local->inode = inode_ref(oldloc->inode); + local->parent = inode_ref(newloc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_LINK; + + local->transaction.wind = afr_link_wind; + local->transaction.unwind = afr_link_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(newloc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL); - return 0; + AFR_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ symlink */ - int -afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +afr_symlink_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno, - local->inode, &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(symlink, main_frame, local->op_ret, local->op_errno, + local->inode, &local->cont.dir_fop.buf, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, inode_t *inode, - struct iatt *buf, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_symlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, - preparent, postparent, NULL, NULL, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preparent, postparent, NULL, NULL, xdata); } - int -afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_symlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->symlink, - local->cont.symlink.linkpath, &local->loc, - local->umask, local->xdata_req); - return 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_symlink_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->symlink, + local->cont.symlink.linkpath, &local->loc, local->umask, + local->xdata_req); + return 0; } - int -afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, - loc_t *loc, mode_t umask, dict_t *xdata) +afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *loc, mode_t umask, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); - local->parent = inode_ref (loc->parent); - - local->cont.symlink.linkpath = gf_strdup (linkpath); - local->umask = umask; - - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); - - if (!local->xdata_req) - goto out; - - local->op = GF_FOP_SYMLINK; - local->transaction.wind = afr_symlink_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_symlink_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.symlink.linkpath = gf_strdup(linkpath); + local->umask = umask; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_SYMLINK; + local->transaction.wind = afr_symlink_wind; + local->transaction.unwind = afr_symlink_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL, - NULL, NULL); - return 0; + AFR_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -1063,161 +939,118 @@ out: /* {{{ rename */ int -afr_rename_unwind (call_frame_t *frame, xlator_t *this) +afr_rename_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno, - &local->cont.dir_fop.buf, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, - &local->cont.dir_fop.prenewparent, - &local->cont.dir_fop.postnewparent, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(rename, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, + &local->cont.dir_fop.prenewparent, + &local->cont.dir_fop.postnewparent, local->xdata_rsp); + return 0; +} int -afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - struct iatt *preoldparent, struct iatt *postoldparent, - struct iatt *prenewparent, struct iatt *postnewparent, - dict_t *xdata) +afr_rename_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *buf, + struct iatt *preoldparent, struct iatt *postoldparent, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf, - preoldparent, postoldparent, prenewparent, - postnewparent, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf, + preoldparent, postoldparent, prenewparent, + postnewparent, xdata); } - int -afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_rename_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->rename, - &local->loc, &local->newloc, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_rename_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->rename, &local->loc, + &local->newloc, local->xdata_req); + return 0; } - int -afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, - dict_t *xdata) +afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - int nlockee = 0; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) { - op_errno = ENOMEM; - goto out; - } - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, oldloc); - loc_copy (&local->newloc, newloc); - - local->inode = inode_ref (oldloc->inode); - local->parent = inode_ref (oldloc->parent); - local->parent2 = inode_ref (newloc->parent); - - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); - - if (!local->xdata_req) - goto out; - - local->op = GF_FOP_RENAME; - local->transaction.wind = afr_rename_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_rename_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, - &op_errno); - if (ret) - goto out; - ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (oldloc->path); - local->transaction.new_basename = AFR_BASENAME (newloc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = nlockee = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->transaction.new_parent_loc, - local->transaction.new_basename, - priv->child_count); - if (ret) - goto out; - - nlockee++; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - nlockee++; - if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) { - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->newloc, - NULL, - priv->child_count); - if (ret) - goto out; - - nlockee++; - } - qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), - afr_entry_lockee_cmp); - int_lock->lockee_count = nlockee; - - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) { + op_errno = ENOMEM; + goto out; + } + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, oldloc); + loc_copy(&local->newloc, newloc); + + local->inode = inode_ref(oldloc->inode); + local->parent = inode_ref(oldloc->parent); + local->parent2 = inode_ref(newloc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_RENAME; + local->transaction.wind = afr_rename_wind; + local->transaction.unwind = afr_rename_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, oldloc, + &op_errno); + if (ret) + goto out; + ret = afr_build_parent_loc(&local->transaction.new_parent_loc, newloc, + &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(oldloc->path); + local->transaction.new_basename = AFR_BASENAME(newloc->path); + ret = afr_transaction(transaction_frame, this, + AFR_ENTRY_RENAME_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, - NULL, NULL); - return 0; + AFR_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, + NULL); + return 0; } /* }}} */ @@ -1225,263 +1058,205 @@ out: /* {{{ unlink */ int -afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +afr_unlink_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; - - local = frame->local; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(unlink, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_unlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, - preparent, postparent, NULL, NULL, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } - int -afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_unlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->unlink, - &local->loc, local->xflag, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_unlink_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->unlink, &local->loc, + local->xflag, local->xdata_req); + return 0; } - int -afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, - dict_t *xdata) +afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, loc); - local->xflag = xflag; - - local->inode = inode_ref (loc->inode); - local->parent = inode_ref (loc->parent); - - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); - - if (!local->xdata_req) - goto out; - - local->op = GF_FOP_UNLINK; - local->transaction.wind = afr_unlink_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_unlink_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[0], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - int_lock->lockee_count++; - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->xflag = xflag; + + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_UNLINK; + local->transaction.wind = afr_unlink_wind; + local->transaction.unwind = afr_unlink_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ /* {{{ rmdir */ - - int -afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +afr_rmdir_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *main_frame = NULL; - afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; - - AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno, - &local->cont.dir_fop.preparent, - &local->cont.dir_fop.postparent, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno, + &local->cont.dir_fop.preparent, + &local->cont.dir_fop.postparent, local->xdata_rsp); + return 0; +} int -afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *preparent, - struct iatt *postparent, dict_t *xdata) +afr_rmdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) { - return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL, - preparent, postparent, NULL, NULL, xdata); + return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + preparent, postparent, NULL, NULL, xdata); } - int -afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_rmdir_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->rmdir, - &local->loc, local->cont.rmdir.flags, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_rmdir_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->rmdir, &local->loc, + local->cont.rmdir.flags, local->xdata_req); + return 0; } - int -afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, - dict_t *xdata) +afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; - int nlockee = 0; - - priv = this->private; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; - - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; - - - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); - local->parent = inode_ref (loc->parent); - - local->cont.rmdir.flags = flags; - - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); - - if (!local->xdata_req) - goto out; - - local->op = GF_FOP_RMDIR; - local->transaction.wind = afr_rmdir_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_rmdir_unwind; - - ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, - &op_errno); - if (ret) - goto out; - - local->transaction.main_frame = frame; - local->transaction.basename = AFR_BASENAME (loc->path); - int_lock = &local->internal_lock; - - int_lock->lockee_count = nlockee = 0; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->transaction.parent_loc, - local->transaction.basename, - priv->child_count); - if (ret) - goto out; - - nlockee++; - ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local, - &local->loc, - NULL, - priv->child_count); - if (ret) - goto out; - - nlockee++; - qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee), - afr_entry_lockee_cmp); - int_lock->lockee_count = nlockee; - - ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; + + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, loc); + local->inode = inode_ref(loc->inode); + local->parent = inode_ref(loc->parent); + + local->cont.rmdir.flags = flags; + + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); + + if (!local->xdata_req) + goto out; + + local->op = GF_FOP_RMDIR; + local->transaction.wind = afr_rmdir_wind; + local->transaction.unwind = afr_rmdir_unwind; + + ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno); + if (ret) + goto out; + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME(loc->path); + ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h index 02f0a3682d9..1d88c3b9b26 100644 --- a/xlators/cluster/afr/src/afr-dir-write.h +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -12,36 +12,35 @@ #define __DIR_WRITE_H__ int32_t -afr_create (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t flags, mode_t mode, - mode_t umask, fd_t *fd, dict_t *xdata); +afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata); int32_t -afr_mknod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata); +afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t dev, mode_t umask, dict_t *xdata); int32_t -afr_mkdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata); +afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + mode_t umask, dict_t *xdata); int32_t -afr_unlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, int xflag, dict_t *xdata); +afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata); int32_t -afr_rmdir (call_frame_t *frame, xlator_t *this, - loc_t *loc, int flags, dict_t *xdata); +afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + dict_t *xdata); int32_t -afr_link (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata); +afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); int32_t -afr_rename (call_frame_t *frame, xlator_t *this, - loc_t *oldloc, loc_t *newloc, dict_t *xdata); +afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata); int -afr_symlink (call_frame_t *frame, xlator_t *this, - const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params); +afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath, + loc_t *oldloc, mode_t umask, dict_t *params); #endif /* __DIR_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c index 75b2bf8e22c..c5521704de2 100644 --- a/xlators/cluster/afr/src/afr-inode-read.c +++ b/xlators/cluster/afr/src/afr-inode-read.c @@ -8,7 +8,6 @@ cases as published by the Free Software Foundation. */ - #include <libgen.h> #include <unistd.h> #include <fnmatch.h> @@ -16,20 +15,17 @@ #include <stdlib.h> #include <signal.h> -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "list.h" -#include "call-stub.h" -#include "byte-order.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "quota-common-utils.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/list.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/quota-common-utils.h> #include "afr-transaction.h" #include "afr-messages.h" @@ -44,146 +40,146 @@ * */ int -afr_handle_quota_size (call_frame_t *frame, xlator_t *this) +afr_handle_quota_size(call_frame_t *frame, xlator_t *this) { - unsigned char *readable = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - struct afr_reply *replies = NULL; - int i = 0; - int ret = 0; - quota_meta_t size = {0, }; - quota_meta_t max_size = {0, }; - int readable_cnt = 0; - int read_subvol = -1; - - local = frame->local; - priv = this->private; - replies = local->replies; - - readable = alloca0 (priv->child_count); - - afr_inode_read_subvol_get (local->inode, this, readable, 0, 0); - - readable_cnt = AFR_COUNT (readable, priv->child_count); - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid || replies[i].op_ret == -1) - continue; - if (readable_cnt && !readable[i]) - continue; - if (!replies[i].xdata) - continue; - ret = quota_dict_get_meta (replies[i].xdata, QUOTA_SIZE_KEY, - &size); - if (ret == -1) - continue; - if (read_subvol == -1) - read_subvol = i; - if (size.size > max_size.size || - (size.file_count + size.dir_count) > - (max_size.file_count + max_size.dir_count)) - read_subvol = i; - - if (size.size > max_size.size) - max_size.size = size.size; - if (size.file_count > max_size.file_count) - max_size.file_count = size.file_count; - if (size.dir_count > max_size.dir_count) - max_size.dir_count = size.dir_count; - } - - if (max_size.size == 0 && max_size.file_count == 0 && - max_size.dir_count == 0) - return read_subvol; - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid || replies[i].op_ret == -1) - continue; - if (readable_cnt && !readable[i]) - continue; - if (!replies[i].xdata) - continue; - quota_dict_set_meta (replies[i].xdata, QUOTA_SIZE_KEY, - &max_size, IA_IFDIR); - } - + unsigned char *readable = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct afr_reply *replies = NULL; + int i = 0; + int ret = 0; + quota_meta_t size = { + 0, + }; + quota_meta_t max_size = { + 0, + }; + int readable_cnt = 0; + int read_subvol = -1; + + local = frame->local; + priv = this->private; + replies = local->replies; + + readable = alloca0(priv->child_count); + + afr_inode_read_subvol_get(local->inode, this, readable, 0, 0); + + readable_cnt = AFR_COUNT(readable, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY, + SLEN(QUOTA_SIZE_KEY), &size); + if (ret == -1) + continue; + if (read_subvol == -1) + read_subvol = i; + if (size.size > max_size.size || + (size.file_count + size.dir_count) > + (max_size.file_count + max_size.dir_count)) + read_subvol = i; + + if (size.size > max_size.size) + max_size.size = size.size; + if (size.file_count > max_size.file_count) + max_size.file_count = size.file_count; + if (size.dir_count > max_size.dir_count) + max_size.dir_count = size.dir_count; + } + + if (max_size.size == 0 && max_size.file_count == 0 && + max_size.dir_count == 0) return read_subvol; -} + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (readable_cnt && !readable[i]) + continue; + if (!replies[i].xdata) + continue; + quota_dict_set_meta(replies[i].xdata, QUOTA_SIZE_KEY, &max_size, + IA_IFDIR); + } + + return read_subvol; +} /* {{{ access */ int -afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +afr_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0) { - local->op_ret = op_ret; - local->op_errno = op_errno; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata); + AFR_STACK_UNWIND(access, frame, op_ret, op_errno, xdata); - return 0; + return 0; } - int -afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_access_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (subvol == -1) { - AFR_STACK_UNWIND (access, frame, local->op_ret, - local->op_errno, 0); - return 0; - } - - STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->access, - &local->loc, local->cont.access.mask, - local->xdata_req); - return 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND(access, frame, local->op_ret, local->op_errno, 0); + return 0; + } + + STACK_WIND_COOKIE(frame, afr_access_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->access, &local->loc, + local->cont.access.mask, local->xdata_req); + return 0; } int -afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, - int mask, dict_t *xdata) +afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask, + dict_t *xdata) { - afr_local_t *local = NULL; - int op_errno = 0; + afr_local_t *local = NULL; + int op_errno = 0; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_ACCESS; - loc_copy (&local->loc, loc); - local->cont.access.mask = mask; - if (xdata) - local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_ACCESS; + loc_copy(&local->loc, loc); + local->cont.access.mask = mask; + if (xdata) + local->xdata_req = dict_ref(xdata); - afr_read_txn (frame, this, loc->inode, afr_access_wind, - AFR_METADATA_TRANSACTION); + afr_read_txn(frame, this, loc->inode, afr_access_wind, + AFR_METADATA_TRANSACTION); - return 0; + return 0; out: - AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND(access, frame, -1, op_errno, NULL); - return 0; + return 0; } /* }}} */ @@ -191,152 +187,140 @@ out: /* {{{ stat */ int -afr_stat_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iatt *buf, dict_t *xdata) +afr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0) { - local->op_ret = op_ret; - local->op_errno = op_errno; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata); + AFR_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata); - return 0; + return 0; } - int -afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_stat_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (subvol == -1) { - AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno, - 0, 0); - return 0; - } - - STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->stat, - &local->loc, local->xdata_req); - return 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + + if (subvol == -1) { + AFR_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, 0, 0); + return 0; + } + + STACK_WIND_COOKIE( + frame, afr_stat_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->stat, &local->loc, local->xdata_req); + return 0; } int -afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) +afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { - afr_local_t *local = NULL; - int op_errno = 0; + afr_local_t *local = NULL; + int op_errno = 0; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_STAT; - loc_copy (&local->loc, loc); - if (xdata) - local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_STAT; + loc_copy(&local->loc, loc); + if (xdata) + local->xdata_req = dict_ref(xdata); - afr_read_txn (frame, this, loc->inode, afr_stat_wind, - AFR_DATA_TRANSACTION); + afr_read_txn(frame, this, loc->inode, afr_stat_wind, AFR_DATA_TRANSACTION); - return 0; + return 0; out: - AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - /* }}} */ /* {{{ fstat */ int -afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *buf, - dict_t *xdata) +afr_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *buf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0) { - local->op_ret = op_ret; - local->op_errno = op_errno; + if (op_ret < 0) { + local->op_ret = op_ret; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata); + AFR_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata); - return 0; + return 0; } - int -afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_fstat_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - - priv = this->private; - local = frame->local; - - if (subvol == -1) { - AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno, - 0, 0); - return 0; - } - - STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->fstat, - local->fd, local->xdata_req); - return 0; -} + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + priv = this->private; + local = frame->local; + if (subvol == -1) { + AFR_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, 0, 0); + return 0; + } + + STACK_WIND_COOKIE( + frame, afr_fstat_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->fstat, local->fd, local->xdata_req); + return 0; +} int32_t -afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xdata) +afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - int op_errno = 0; + afr_local_t *local = NULL; + int op_errno = 0; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_FSTAT; - local->fd = fd_ref (fd); - if (xdata) - local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_FSTAT; + local->fd = fd_ref(fd); + if (xdata) + local->xdata_req = dict_ref(xdata); - afr_fix_open (fd, this); + afr_fix_open(fd, this); - afr_read_txn (frame, this, fd->inode, afr_fstat_wind, - AFR_DATA_TRANSACTION); + afr_read_txn(frame, this, fd->inode, afr_fstat_wind, AFR_DATA_TRANSACTION); - return 0; + return 0; out: - AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } /* }}} */ @@ -344,1555 +328,1493 @@ out: /* {{{ readlink */ int -afr_readlink_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - const char *buf, struct iatt *sbuf, dict_t *xdata) +afr_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, const char *buf, + struct iatt *sbuf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = op_errno; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, - buf, sbuf, xdata); - return 0; + AFR_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, sbuf, xdata); + return 0; } int -afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_readlink_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (subvol == -1) { - AFR_STACK_UNWIND (readlink, frame, local->op_ret, - local->op_errno, 0, 0, 0); - return 0; - } - - STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->readlink, - &local->loc, local->cont.readlink.size, - local->xdata_req); - return 0; -} + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND(readlink, frame, local->op_ret, local->op_errno, 0, 0, + 0); + return 0; + } + + STACK_WIND_COOKIE(frame, afr_readlink_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->readlink, &local->loc, + local->cont.readlink.size, local->xdata_req); + return 0; +} int -afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size, dict_t *xdata) +afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata) { - afr_local_t * local = NULL; - int32_t op_errno = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_READLINK; - loc_copy (&local->loc, loc); - local->cont.readlink.size = size; - if (xdata) - local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_READLINK; + loc_copy(&local->loc, loc); + local->cont.readlink.size = size; + if (xdata) + local->xdata_req = dict_ref(xdata); - afr_read_txn (frame, this, loc->inode, afr_readlink_wind, - AFR_DATA_TRANSACTION); + afr_read_txn(frame, this, loc->inode, afr_readlink_wind, + AFR_DATA_TRANSACTION); - return 0; + return 0; out: - AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); + AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0); - return 0; + return 0; } - /* }}} */ /* {{{ getxattr */ struct _xattr_key { - char *key; - struct list_head list; + char *key; + struct list_head list; }; - int -__gather_xattr_keys (dict_t *dict, char *key, data_t *value, - void *data) +__gather_xattr_keys(dict_t *dict, char *key, data_t *value, void *data) { - struct list_head * list = data; - struct _xattr_key * xkey = NULL; - - if (!strncmp (key, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { + struct list_head *list = data; + struct _xattr_key *xkey = NULL; - xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key); - if (!xkey) - return -1; + if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) { + xkey = GF_MALLOC(sizeof(*xkey), gf_afr_mt_xattr_key); + if (!xkey) + return -1; - xkey->key = key; - INIT_LIST_HEAD (&xkey->list); + xkey->key = key; + INIT_LIST_HEAD(&xkey->list); - list_add_tail (&xkey->list, list); - } - return 0; + list_add_tail(&xkey->list, list); + } + return 0; } - void -afr_filter_xattrs (dict_t *dict) +afr_filter_xattrs(dict_t *dict) { - struct list_head keys = {0,}; - struct _xattr_key *key = NULL; - struct _xattr_key *tmp = NULL; + struct list_head keys = { + 0, + }; + struct _xattr_key *key = NULL; + struct _xattr_key *tmp = NULL; - INIT_LIST_HEAD (&keys); + INIT_LIST_HEAD(&keys); - dict_foreach (dict, __gather_xattr_keys, - (void *) &keys); + dict_foreach(dict, __gather_xattr_keys, (void *)&keys); - list_for_each_entry_safe (key, tmp, &keys, list) { - dict_del (dict, key->key); + list_for_each_entry_safe(key, tmp, &keys, list) + { + dict_del(dict, key->key); - list_del_init (&key->list); + list_del_init(&key->list); - GF_FREE (key); - } + GF_FREE(key); + } } -static -gf_boolean_t -afr_getxattr_ignorable_errnos (int32_t op_errno) +static gf_boolean_t +afr_getxattr_ignorable_errnos(int32_t op_errno) { - if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE || - op_errno == ENAMETOOLONG) - return _gf_true; + if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE || + op_errno == ENAMETOOLONG) + return _gf_true; - return _gf_false; + return _gf_false; } int -afr_getxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) { - local->op_ret = op_ret; - local->op_errno = op_errno; + if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) { + local->op_ret = op_ret; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - if (dict) - afr_filter_xattrs (dict); + if (dict) + afr_filter_xattrs(dict); - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + return 0; } - int -afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_getxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (subvol == -1) { - AFR_STACK_UNWIND (getxattr, frame, local->op_ret, - local->op_errno, NULL, NULL); - return 0; - } - - STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->getxattr, - &local->loc, local->cont.getxattr.name, - local->xdata_req); - return 0; -} + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, NULL, + NULL); + return 0; + } + + STACK_WIND_COOKIE(frame, afr_getxattr_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->getxattr, &local->loc, + local->cont.getxattr.name, local->xdata_req); + return 0; +} int32_t -afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict, + dict_t *xdata) { - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + return 0; } int32_t -afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - xlator_t **children = NULL; - dict_t *xattr = NULL; - char *tmp_report = NULL; - char lk_summary[1024] = {0,}; - int serz_len = 0; - int32_t callcnt = 0; - long int cky = 0; - int ret = 0; - - priv = this->private; - children = priv->children; - - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) - local->replies[cky].op_errno = op_errno; - - if (!local->dict) - local->dict = dict_new (); - if (local->dict) { - ret = dict_get_str (dict, local->cont.getxattr.name, - &tmp_report); - if (ret) - goto unlock; - ret = dict_set_dynstr (local->dict, - children[cky]->name, - gf_strdup (tmp_report)); - if (ret) - goto unlock; - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = { + 0, + }; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + int keylen = 0; + int children_keylen = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + children_keylen = strlen(children[cky]->name); + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new(); + if (local->dict) { + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstrn(local->dict, children[cky]->name, + children_keylen, gf_strdup(tmp_report)); + if (ret) + goto unlock; } + } unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - xattr = dict_new (); - if (!xattr) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - ret = dict_serialize_value_with_delim (local->dict, - lk_summary, - &serz_len, '\n'); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - if (serz_len == -1) - snprintf (lk_summary, sizeof (lk_summary), - "No locks cleared."); - ret = dict_set_dynstr (xattr, local->cont.getxattr.name, - gf_strdup (lk_summary)); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - gf_msg (this->name, GF_LOG_ERROR, - ENOMEM, AFR_MSG_DICT_SET_FAILED, - "Error setting dictionary"); - goto unwind; - } + UNLOCK(&frame->lock); + + if (!callcnt) { + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim(local->dict, lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + if (serz_len == -1) + snprintf(lk_summary, sizeof(lk_summary), "No locks cleared."); + ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen, + gf_strdup(lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED, + "Error setting dictionary"); + goto unwind; + } - op_errno = afr_final_errno (local, priv); + op_errno = afr_final_errno(local, priv); -unwind: - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr, - xdata); - if (xattr) - dict_unref (xattr); - } + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata); + if (xattr) + dict_unref(xattr); + } - return ret; + return ret; } int32_t -afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - xlator_t **children = NULL; - dict_t *xattr = NULL; - char *tmp_report = NULL; - char lk_summary[1024] = {0,}; - int serz_len = 0; - int32_t callcnt = 0; - long int cky = 0; - int ret = 0; - - priv = this->private; - children = priv->children; - - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - if (op_ret == -1) - local->replies[cky].op_errno = op_errno; - - if (!local->dict) - local->dict = dict_new (); - if (local->dict) { - ret = dict_get_str (dict, local->cont.getxattr.name, - &tmp_report); - if (ret) - goto unlock; - ret = dict_set_dynstr (local->dict, - children[cky]->name, - gf_strdup (tmp_report)); - if (ret) - goto unlock; - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + xlator_t **children = NULL; + dict_t *xattr = NULL; + char *tmp_report = NULL; + char lk_summary[1024] = { + 0, + }; + int serz_len = 0; + int32_t callcnt = 0; + long int cky = 0; + int ret = 0; + int keylen = 0; + int children_keylen = 0; + + priv = this->private; + children = priv->children; + + local = frame->local; + cky = (long)cookie; + + keylen = strlen(local->cont.getxattr.name); + children_keylen = strlen(children[cky]->name); + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) + local->replies[cky].op_errno = op_errno; + + if (!local->dict) + local->dict = dict_new(); + if (local->dict) { + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, + &tmp_report); + if (ret) + goto unlock; + ret = dict_set_dynstrn(local->dict, children[cky]->name, + children_keylen, gf_strdup(tmp_report)); + if (ret) + goto unlock; } + } unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - xattr = dict_new (); - if (!xattr) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - ret = dict_serialize_value_with_delim (local->dict, - lk_summary, - &serz_len, '\n'); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - goto unwind; - } - if (serz_len == -1) - snprintf (lk_summary, sizeof (lk_summary), - "No locks cleared."); - ret = dict_set_dynstr (xattr, local->cont.getxattr.name, - gf_strdup (lk_summary)); - if (ret) { - op_ret = -1; - op_errno = ENOMEM; - gf_msg (this->name, GF_LOG_ERROR, - ENOMEM, AFR_MSG_DICT_SET_FAILED, - "Error setting dictionary"); - goto unwind; - } + UNLOCK(&frame->lock); + + if (!callcnt) { + xattr = dict_new(); + if (!xattr) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + ret = dict_serialize_value_with_delim(local->dict, lk_summary, + &serz_len, '\n'); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + if (serz_len == -1) + snprintf(lk_summary, sizeof(lk_summary), "No locks cleared."); + ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen, + gf_strdup(lk_summary)); + if (ret) { + op_ret = -1; + op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED, + "Error setting dictionary"); + goto unwind; + } - op_errno = afr_final_errno (local, priv); + op_errno = afr_final_errno(local, priv); -unwind: - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata); + unwind: + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata); - if (xattr) - dict_unref (xattr); - } + if (xattr) + dict_unref(xattr); + } - return ret; + return ret; } /** * node-uuid cbk uses next child querying mechanism */ int32_t -afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_node_uuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int unwind = 1; - int curr_call_child = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int unwind = 1; + int curr_call_child = 0; - priv = this->private; - children = priv->children; + priv = this->private; + children = priv->children; - local = frame->local; + local = frame->local; - if (op_ret == -1) { /** query the _next_ child */ - - /** - * _current_ becomes _next_ - * If done with all childs and yet no success; give up ! - */ - curr_call_child = (int) ((long)cookie); - if (++curr_call_child == priv->child_count) - goto unwind; - - gf_msg_debug (this->name, op_errno, - "op_ret (-1): Re-querying afr-child (%d/%d)", - curr_call_child, priv->child_count); - - unwind = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, - (void *) (long) curr_call_child, - children[curr_call_child], - children[curr_call_child]->fops->getxattr, - &local->loc, - local->cont.getxattr.name, - local->xdata_req); - } + if (op_ret == -1) { /** query the _next_ child */ - unwind: - if (unwind) - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, - xdata); + /** + * _current_ becomes _next_ + * If done with all children and yet no success; give up ! + */ + curr_call_child = (int)((long)cookie); + if (++curr_call_child == priv->child_count) + goto unwind; + + gf_msg_debug(this->name, op_errno, + "op_ret (-1): Re-querying afr-child (%d/%d)", + curr_call_child, priv->child_count); + + unwind = 0; + STACK_WIND_COOKIE( + frame, afr_getxattr_node_uuid_cbk, (void *)(long)curr_call_child, + children[curr_call_child], + children[curr_call_child]->fops->getxattr, &local->loc, + local->cont.getxattr.name, local->xdata_req); + } - return 0; +unwind: + if (unwind) + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + + return 0; } /** * list-node-uuids cbk returns the list of node_uuids for the subvolume. */ int32_t -afr_getxattr_list_node_uuids_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, - int32_t op_errno, dict_t *dict, dict_t *xdata) +afr_getxattr_list_node_uuids_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t callcnt = 0; - int ret = 0; - char *xattr_serz = NULL; - long cky = 0; - int32_t tlen = 0; - - local = frame->local; - priv = this->private; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - local->replies[cky].valid = 1; - local->replies[cky].op_ret = op_ret; - local->replies[cky].op_errno = op_errno; - - if (op_ret < 0) - goto unlock; - - local->op_ret = 0; - - if (!local->xdata_rsp && xdata) - local->xdata_rsp = dict_ref (xdata); - local->replies[cky].xattr = dict_ref (dict); - } + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr_serz = NULL; + long cky = 0; + int32_t tlen = 0; + + local = frame->local; + priv = this->private; + cky = (long)cookie; + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + local->replies[cky].valid = 1; + local->replies[cky].op_ret = op_ret; + local->replies[cky].op_errno = op_errno; + + if (op_ret < 0) + goto unlock; + + local->op_ret = 0; + + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + local->replies[cky].xattr = dict_ref(dict); + } unlock: - UNLOCK (&frame->lock); + UNLOCK(&frame->lock); - if (!callcnt) { - - if (local->op_ret != 0) { - /* All bricks gave an error. */ - local->op_errno = afr_final_errno (local, priv); - goto unwind; - } - - /*Since we store the UUID0_STR as node uuid for down bricks and - *for non zero op_ret, assigning length to priv->child_count - *number of uuids*/ - local->cont.getxattr.xattr_len = (strlen (UUID0_STR) + 2) * - priv->child_count; - - if (!local->dict) - local->dict = dict_new (); - if (!local->dict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + if (!callcnt) { + if (local->op_ret != 0) { + /* All bricks gave an error. */ + local->op_errno = afr_final_errno(local, priv); + goto unwind; + } - xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, - sizeof (char), gf_common_mt_char); + /*Since we store the UUID0_STR as node uuid for down bricks and + *for non zero op_ret, assigning length to priv->child_count + *number of uuids*/ + local->cont.getxattr.xattr_len = (SLEN(UUID0_STR) + 2) * + priv->child_count; + + if (!local->dict) + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - if (!xattr_serz) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + xattr_serz = GF_CALLOC(local->cont.getxattr.xattr_len, sizeof(char), + gf_common_mt_char); - ret = afr_serialize_xattrs_with_delimiter (frame, this, - xattr_serz, - UUID0_STR, &tlen, - ' '); - if (ret) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - ret = dict_set_dynstr (local->dict, - GF_XATTR_LIST_NODE_UUIDS_KEY, - xattr_serz); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, - -ret, AFR_MSG_DICT_SET_FAILED, - "Cannot set node_uuid key in dict"); - local->op_ret = -1; - local->op_errno = ENOMEM; - } else { - local->op_ret = local->cont.getxattr.xattr_len - 1; - local->op_errno = 0; - } + if (!xattr_serz) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } -unwind: - AFR_STACK_UNWIND (getxattr, frame, local->op_ret, - local->op_errno, local->dict, - local->xdata_rsp); + ret = afr_serialize_xattrs_with_delimiter(frame, this, xattr_serz, + UUID0_STR, &tlen, ' '); + if (ret) { + local->op_ret = -1; + local->op_errno = ENOMEM; + GF_FREE(xattr_serz); + goto unwind; + } + ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set node_uuid key in dict"); + local->op_ret = -1; + local->op_errno = ENOMEM; + if (ret == -EINVAL) + GF_FREE(xattr_serz); + } else { + local->op_ret = local->cont.getxattr.xattr_len - 1; + local->op_errno = 0; } - return ret; -} + unwind: + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->dict, local->xdata_rsp); + } + return ret; +} int32_t -afr_getxattr_quota_size_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - int idx = (long) cookie; - int call_count = 0; - afr_local_t *local = frame->local; - int read_subvol = -1; - - local->replies[idx].valid = 1; - local->replies[idx].op_ret = op_ret; - local->replies[idx].op_errno = op_errno; - if (dict) - local->replies[idx].xdata = dict_ref (dict); - call_count = afr_frame_return (frame); - if (call_count == 0) { - local->inode = inode_ref (local->loc.inode); - read_subvol = afr_handle_quota_size (frame, this); - if (read_subvol != -1) { - op_ret = local->replies[read_subvol].op_ret; - op_errno = local->replies[read_subvol].op_errno; - dict = local->replies[read_subvol].xdata; - } - AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, - xdata); + int idx = (long)cookie; + int call_count = 0; + afr_local_t *local = frame->local; + int read_subvol = -1; + + local->replies[idx].valid = 1; + local->replies[idx].op_ret = op_ret; + local->replies[idx].op_errno = op_errno; + if (dict) + local->replies[idx].xdata = dict_ref(dict); + call_count = afr_frame_return(frame); + if (call_count == 0) { + local->inode = inode_ref(local->loc.inode); + read_subvol = afr_handle_quota_size(frame, this); + if (read_subvol != -1) { + op_ret = local->replies[read_subvol].op_ret; + op_errno = local->replies[read_subvol].op_errno; + dict = local->replies[read_subvol].xdata; } + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata); + } - return 0; + return 0; } int32_t -afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - int call_cnt = 0, len = 0; - char *lockinfo_buf = NULL; - dict_t *lockinfo = NULL, *newdict = NULL; - afr_local_t *local = NULL; - - LOCK (&frame->lock); - { - local = frame->local; + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; - call_cnt = --local->call_count; + LOCK(&frame->lock); + { + local = frame->local; - if ((op_ret < 0) || (!dict && !xdata)) { - goto unlock; - } + call_cnt = --local->call_count; - if (xdata) { - if (!local->xdata_rsp) { - local->xdata_rsp = dict_new (); - if (!local->xdata_rsp) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } - if (!dict) { - goto unlock; + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; } + } + } - op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, - (void **)&lockinfo_buf, &len); + if (!dict) { + goto unlock; + } - if (!lockinfo_buf) { - goto unlock; - } + op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - if (!local->dict) { - local->dict = dict_new (); - if (!local->dict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } -unlock: - UNLOCK (&frame->lock); - - if (lockinfo_buf != NULL) { - lockinfo = dict_new (); - if (lockinfo == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - } else { - op_ret = dict_unserialize (lockinfo_buf, len, - &lockinfo); - - if (lockinfo && local->dict) { - dict_copy (lockinfo, local->dict); - } - } + if (!lockinfo_buf) { + goto unlock; } - if (xdata && local->xdata_rsp) { - dict_copy (xdata, local->xdata_rsp); + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } } + } +unlock: + UNLOCK(&frame->lock); - if (!call_cnt) { - newdict = dict_new (); - if (!newdict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - len = dict_serialized_length (local->dict); - if (len <= 0) { - goto unwind; - } - - lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); - if (!lockinfo_buf) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + if (lockinfo_buf != NULL) { + lockinfo = dict_new(); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); - op_ret = dict_serialize (local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } + if (lockinfo && local->dict) { + dict_copy(lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy(xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new(); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, - (void *)lockinfo_buf, len); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - goto unwind; - } + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { + local->op_ret = -1; + goto unwind; + } - unwind: - AFR_STACK_UNWIND (getxattr, frame, op_ret, - op_errno, newdict, - local->xdata_rsp); + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; } - dict_unref (lockinfo); + unwind: + AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict, + local->xdata_rsp); + } - return 0; + dict_unref(lockinfo); + + return 0; } int32_t -afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - int call_cnt = 0, len = 0; - char *lockinfo_buf = NULL; - dict_t *lockinfo = NULL, *newdict = NULL; - afr_local_t *local = NULL; + int call_cnt = 0, len = 0; + char *lockinfo_buf = NULL; + dict_t *lockinfo = NULL, *newdict = NULL; + afr_local_t *local = NULL; - LOCK (&frame->lock); - { - local = frame->local; + LOCK(&frame->lock); + { + local = frame->local; - call_cnt = --local->call_count; + call_cnt = --local->call_count; - if ((op_ret < 0) || (!dict && !xdata)) { - goto unlock; - } - - if (xdata) { - if (!local->xdata_rsp) { - local->xdata_rsp = dict_new (); - if (!local->xdata_rsp) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } + if ((op_ret < 0) || (!dict && !xdata)) { + goto unlock; + } - if (!dict) { - goto unlock; + if (xdata) { + if (!local->xdata_rsp) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; } + } + } - op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY, - (void **)&lockinfo_buf, &len); + if (!dict) { + goto unlock; + } - if (!lockinfo_buf) { - goto unlock; - } + op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY, + (void **)&lockinfo_buf, &len); - if (!local->dict) { - local->dict = dict_new (); - if (!local->dict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unlock; - } - } - } -unlock: - UNLOCK (&frame->lock); - - if (lockinfo_buf != NULL) { - lockinfo = dict_new (); - if (lockinfo == NULL) { - local->op_ret = -1; - local->op_errno = ENOMEM; - } else { - op_ret = dict_unserialize (lockinfo_buf, len, - &lockinfo); - - if (lockinfo && local->dict) { - dict_copy (lockinfo, local->dict); - } - } + if (!lockinfo_buf) { + goto unlock; } - if (xdata && local->xdata_rsp) { - dict_copy (xdata, local->xdata_rsp); + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unlock; + } } + } +unlock: + UNLOCK(&frame->lock); - if (!call_cnt) { - newdict = dict_new (); - if (!newdict) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } - - len = dict_serialized_length (local->dict); - if (len <= 0) { - goto unwind; - } - - lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char); - if (!lockinfo_buf) { - local->op_ret = -1; - local->op_errno = ENOMEM; - goto unwind; - } + if (lockinfo_buf != NULL) { + lockinfo = dict_new(); + if (lockinfo == NULL) { + local->op_ret = -1; + local->op_errno = ENOMEM; + } else { + op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo); - op_ret = dict_serialize (local->dict, lockinfo_buf); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - } + if (lockinfo && local->dict) { + dict_copy(lockinfo, local->dict); + } + } + } + + if (xdata && local->xdata_rsp) { + dict_copy(xdata, local->xdata_rsp); + } + + if (!call_cnt) { + newdict = dict_new(); + if (!newdict) { + local->op_ret = -1; + local->op_errno = ENOMEM; + goto unwind; + } - op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY, - (void *)lockinfo_buf, len); - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = -op_ret; - goto unwind; - } + op_ret = dict_allocate_and_serialize( + local->dict, (char **)&lockinfo_buf, (unsigned int *)&len); + if (op_ret != 0) { + local->op_ret = -1; + goto unwind; + } - unwind: - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, - op_errno, newdict, - local->xdata_rsp); + op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY, + (void *)lockinfo_buf, len); + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = -op_ret; + goto unwind; } - dict_unref (lockinfo); + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict, + local->xdata_rsp); + } - return 0; + dict_unref(lockinfo); + + return 0; } int32_t -afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; - int ret = 0; - char *xattr = NULL; - char *xattr_serz = NULL; - char xattr_cky[1024] = {0,}; - dict_t *nxattr = NULL; - long cky = 0; - int32_t padding = 0; - int32_t tlen = 0; - - if (!frame || !frame->local || !this) { - gf_msg ("", GF_LOG_ERROR, 0, - AFR_MSG_INVALID_ARG, "possible NULL deref"); - goto out; + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + int keylen = 0; + char xattr_cky[1024] = { + 0, + }; + int xattr_cky_len = 0; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld", + local->cont.getxattr.name, cky); + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret < 0) { - local->op_errno = op_errno; - } else { - local->op_ret = op_ret; - if (!local->xdata_rsp && xdata) - local->xdata_rsp = dict_ref (xdata); - } + if (!dict || (op_ret < 0)) + goto unlock; - if (!dict || (op_ret < 0)) - goto unlock; - - if (!local->dict) - local->dict = dict_new (); - - if (local->dict) { - ret = dict_get_str (dict, - local->cont.getxattr.name, - &xattr); - if (ret) - goto unlock; - - xattr = gf_strdup (xattr); - - (void)snprintf (xattr_cky, 1024, "%s-%ld", - local->cont.getxattr.name, cky); - ret = dict_set_dynstr (local->dict, - xattr_cky, xattr); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, - -ret, AFR_MSG_DICT_SET_FAILED, - "Cannot set xattr cookie key"); - goto unlock; - } - - local->cont.getxattr.xattr_len - += strlen (xattr) + 1; - } + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) + goto unlock; + } + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr); + if (ret) + goto unlock; + + xattr = gf_strdup(xattr); + + ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr); + if (ret) { + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set xattr cookie key"); + goto post_unlock; } -unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->cont.getxattr.xattr_len) - goto unwind; - - nxattr = dict_new (); - if (!nxattr) - goto unwind; - - /* extra bytes for decorations (brackets and <>'s) */ - padding += strlen (this->name) - + strlen (AFR_PATHINFO_HEADER) + 4; - local->cont.getxattr.xattr_len += (padding + 2); - - xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, - sizeof (char), gf_common_mt_char); - - if (!xattr_serz) - goto unwind; - - /* the xlator info */ - (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", - this->name); - - /* actual series of pathinfo */ - ret = dict_serialize_value_with_delim (local->dict, - xattr_serz - + strlen (xattr_serz), - &tlen, ' '); - if (ret) { - goto unwind; - } - /* closing part */ - *(xattr_serz + padding + tlen) = ')'; - *(xattr_serz + padding + tlen + 1) = '\0'; + local->cont.getxattr.xattr_len += strlen(xattr) + 1; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new(); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len, + gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + int xattr_serz_len = sprintf( + xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim( + local->dict, xattr_serz + xattr_serz_len, &tlen, ' '); + if (ret) { + GF_FREE(xattr_serz); + goto unwind; + } - ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, - xattr_serz); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, - -ret, AFR_MSG_DICT_SET_FAILED, - "Cannot set pathinfo key in dict"); + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set pathinfo key in dict"); + if (ret == -EINVAL) + GF_FREE(xattr_serz); + } - unwind: - AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, - local->op_errno, nxattr, local->xdata_rsp); + unwind: + AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, + nxattr, local->xdata_rsp); - if (nxattr) - dict_unref (nxattr); - } + if (nxattr) + dict_unref(nxattr); + } out: - return ret; + return ret; } int32_t -afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; - int ret = 0; - char *xattr = NULL; - char *xattr_serz = NULL; - char xattr_cky[1024] = {0,}; - dict_t *nxattr = NULL; - long cky = 0; - int32_t padding = 0; - int32_t tlen = 0; - - if (!frame || !frame->local || !this) { - gf_msg ("", GF_LOG_ERROR, 0, - AFR_MSG_INVALID_ARG, "possible NULL deref"); - goto out; - } - - local = frame->local; - cky = (long) cookie; - - LOCK (&frame->lock); - { - callcnt = --local->call_count; - - if (op_ret < 0) { - local->op_errno = op_errno; - } else { - local->op_ret = op_ret; - if (!local->xdata_rsp && xdata) - local->xdata_rsp = dict_ref (xdata); - } - - if (!dict || (op_ret < 0)) - goto unlock; - - if (!local->dict) - local->dict = dict_new (); - - if (local->dict) { - ret = dict_get_str (dict, - local->cont.getxattr.name, - &xattr); - if (ret) - goto unlock; - - xattr = gf_strdup (xattr); - - (void)snprintf (xattr_cky, 1024, "%s-%ld", - local->cont.getxattr.name, cky); - ret = dict_set_dynstr (local->dict, - xattr_cky, xattr); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, - -ret, - AFR_MSG_DICT_SET_FAILED, - "Cannot set xattr " - "cookie key"); - goto unlock; - } - - local->cont.getxattr.xattr_len += strlen (xattr) + 1; - } - } -unlock: - UNLOCK (&frame->lock); - - if (!callcnt) { - if (!local->cont.getxattr.xattr_len) - goto unwind; - - nxattr = dict_new (); - if (!nxattr) - goto unwind; - - /* extra bytes for decorations (brackets and <>'s) */ - padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4; - local->cont.getxattr.xattr_len += (padding + 2); + afr_local_t *local = NULL; + int32_t callcnt = 0; + int ret = 0; + char *xattr = NULL; + char *xattr_serz = NULL; + char xattr_cky[1024] = { + 0, + }; + int keylen = 0; + int xattr_cky_len = 0; + dict_t *nxattr = NULL; + long cky = 0; + int32_t padding = 0; + int32_t tlen = 0; + + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } + + local = frame->local; + cky = (long)cookie; + keylen = strlen(local->cont.getxattr.name); + xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld", + local->cont.getxattr.name, cky); + LOCK(&frame->lock); + { + callcnt = --local->call_count; - xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len, - sizeof (char), gf_common_mt_char); - - if (!xattr_serz) - goto unwind; + if (op_ret < 0) { + local->op_errno = op_errno; + } else { + local->op_ret = op_ret; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); + } - /* the xlator info */ - (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ", - this->name); + if (!dict || (op_ret < 0)) + goto unlock; - /* actual series of pathinfo */ - ret = dict_serialize_value_with_delim (local->dict, - xattr_serz + strlen (xattr_serz), - &tlen, ' '); - if (ret) { - goto unwind; - } + if (!local->dict) { + local->dict = dict_new(); + if (!local->dict) + goto unlock; + } + ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr); + if (ret) + goto unlock; + + xattr = gf_strdup(xattr); + + ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr); + if (ret) { + UNLOCK(&frame->lock); + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set xattr cookie key"); + goto post_unlock; + } - /* closing part */ - *(xattr_serz + padding + tlen) = ')'; - *(xattr_serz + padding + tlen + 1) = '\0'; + local->cont.getxattr.xattr_len += strlen(xattr) + 1; + } +unlock: + UNLOCK(&frame->lock); +post_unlock: + if (!callcnt) { + if (!local->cont.getxattr.xattr_len) + goto unwind; + + nxattr = dict_new(); + if (!nxattr) + goto unwind; + + /* extra bytes for decorations (brackets and <>'s) */ + padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4; + local->cont.getxattr.xattr_len += (padding + 2); + + xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len, + gf_common_mt_char); + + if (!xattr_serz) + goto unwind; + + /* the xlator info */ + int xattr_serz_len = sprintf( + xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name); + + /* actual series of pathinfo */ + ret = dict_serialize_value_with_delim( + local->dict, xattr_serz + xattr_serz_len, &tlen, ' '); + if (ret) { + GF_FREE(xattr_serz); + goto unwind; + } - ret = dict_set_dynstr (nxattr, local->cont.getxattr.name, - xattr_serz); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, - -ret, AFR_MSG_DICT_SET_FAILED, - "Cannot set pathinfo key in dict"); + /* closing part */ + *(xattr_serz + padding + tlen) = ')'; + *(xattr_serz + padding + tlen + 1) = '\0'; + + ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen, + xattr_serz); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, + "Cannot set pathinfo key in dict"); + if (ret == -EINVAL) + GF_FREE(xattr_serz); + } - unwind: - AFR_STACK_UNWIND (getxattr, frame, local->op_ret, - local->op_errno, nxattr, local->xdata_rsp); + unwind: + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + nxattr, local->xdata_rsp); - if (nxattr) - dict_unref (nxattr); - } + if (nxattr) + dict_unref(nxattr); + } out: - return ret; + return ret; } static int -afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data) +afr_aggregate_stime_xattr(dict_t *this, char *key, data_t *value, void *data) { - int ret = 0; + int ret = 0; - if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) - ret = gf_get_max_stime (THIS, data, key, value); + if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) + ret = gf_get_max_stime(THIS, data, key, value); - return ret; + return ret; } int32_t -afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_common_getxattr_stime_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t callcnt = 0; + afr_local_t *local = NULL; + int32_t callcnt = 0; - if (!frame || !frame->local || !this) { - gf_msg ("", GF_LOG_ERROR, 0, - AFR_MSG_INVALID_ARG, "possible NULL deref"); - goto out; - } + if (!frame || !frame->local || !this) { + gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref"); + goto out; + } - local = frame->local; + local = frame->local; - LOCK (&frame->lock); - { - callcnt = --local->call_count; + LOCK(&frame->lock); + { + callcnt = --local->call_count; - if (!dict || (op_ret < 0)) { - local->op_errno = op_errno; - goto cleanup; - } - - if (!local->dict) - local->dict = dict_copy_with_ref (dict, NULL); - else - dict_foreach (dict, afr_aggregate_stime_xattr, - local->dict); - local->op_ret = 0; + if (!dict || (op_ret < 0)) { + local->op_errno = op_errno; + goto cleanup; } + if (!local->dict) + local->dict = dict_copy_with_ref(dict, NULL); + else + dict_foreach(dict, afr_aggregate_stime_xattr, local->dict); + local->op_ret = 0; + } + cleanup: - UNLOCK (&frame->lock); + UNLOCK(&frame->lock); - if (!callcnt) { - AFR_STACK_UNWIND (getxattr, frame, local->op_ret, - local->op_errno, local->dict, xdata); - } + if (!callcnt) { + AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, + local->dict, xdata); + } out: - return 0; + return 0; } - static gf_boolean_t -afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk, - gf_boolean_t is_fgetxattr) +afr_is_special_xattr(const char *name, fop_getxattr_cbk_t *cbk, + gf_boolean_t is_fgetxattr) { - gf_boolean_t is_spl = _gf_true; - - GF_ASSERT (cbk); - if (!cbk || !name) { - is_spl = _gf_false; - goto out; + gf_boolean_t is_spl = _gf_true; + + GF_ASSERT(cbk); + if (!cbk || !name) { + is_spl = _gf_false; + goto out; + } + + if (!strcmp(name, GF_XATTR_PATHINFO_KEY) || + !strcmp(name, GF_XATTR_USER_PATHINFO_KEY)) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_pathinfo_cbk; + } else { + *cbk = afr_getxattr_pathinfo_cbk; } - - if (!strcmp (name, GF_XATTR_PATHINFO_KEY) || - !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) { - if (is_fgetxattr) { - *cbk = afr_fgetxattr_pathinfo_cbk; - } else { - *cbk = afr_getxattr_pathinfo_cbk; - } - } else if (!strncmp (name, GF_XATTR_CLRLK_CMD, - strlen (GF_XATTR_CLRLK_CMD))) { - if (is_fgetxattr) { - *cbk = afr_fgetxattr_clrlk_cbk; - } else { - *cbk = afr_getxattr_clrlk_cbk; - } - } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY, - strlen (GF_XATTR_LOCKINFO_KEY))) { - if (is_fgetxattr) { - *cbk = afr_fgetxattr_lockinfo_cbk; - } else { - *cbk = afr_getxattr_lockinfo_cbk; - } - } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { - *cbk = afr_common_getxattr_stime_cbk; - } else if (strcmp (name, QUOTA_SIZE_KEY) == 0) { - *cbk = afr_getxattr_quota_size_cbk; - } else if (!strcmp (name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { - *cbk = afr_getxattr_list_node_uuids_cbk; + } else if (!strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_clrlk_cbk; } else { - is_spl = _gf_false; + *cbk = afr_getxattr_clrlk_cbk; } + } else if (!strncmp(name, GF_XATTR_LOCKINFO_KEY, + SLEN(GF_XATTR_LOCKINFO_KEY))) { + if (is_fgetxattr) { + *cbk = afr_fgetxattr_lockinfo_cbk; + } else { + *cbk = afr_getxattr_lockinfo_cbk; + } + } else if (fnmatch(GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) { + *cbk = afr_common_getxattr_stime_cbk; + } else if (strcmp(name, QUOTA_SIZE_KEY) == 0) { + *cbk = afr_getxattr_quota_size_cbk; + } else if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { + *cbk = afr_getxattr_list_node_uuids_cbk; + } else { + is_spl = _gf_false; + } out: - return is_spl; + return is_spl; } static void -afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame, - const char *name, loc_t *loc, - fop_getxattr_cbk_t cbk) +afr_getxattr_all_subvols(xlator_t *this, call_frame_t *frame, const char *name, + loc_t *loc, fop_getxattr_cbk_t cbk) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; - - priv = this->private; - - local = frame->local; - //local->call_count set in afr_local_init - call_count = local->call_count; - - if (!strcmp (name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { - GF_FREE (local->cont.getxattr.name); - local->cont.getxattr.name = gf_strdup (GF_XATTR_NODE_UUID_KEY); + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + // local->call_count set in afr_local_init + call_count = local->call_count; + + if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) { + GF_FREE(local->cont.getxattr.name); + local->cont.getxattr.name = gf_strdup(GF_XATTR_NODE_UUID_KEY); + } + + // If up-children count is 0, afr_local_init would have failed already + // and the call would have unwound so not handling it here. + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->getxattr, loc, + local->cont.getxattr.name, NULL); + if (!--call_count) + break; } - - //If up-children count is 0, afr_local_init would have failed already - //and the call would have unwound so not handling it here. - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, cbk, - (void *) (long) i, priv->children[i], - priv->children[i]->fops->getxattr, - loc, local->cont.getxattr.name, - NULL); - if (!--call_count) - break; - } - } - return; + } + return; } int -afr_marker_populate_args (call_frame_t *frame, int type, int *gauge, - xlator_t **subvols) +afr_marker_populate_args(call_frame_t *frame, int type, int *gauge, + xlator_t **subvols) { - xlator_t *this = frame->this; - afr_private_t *priv = this->private; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; - memcpy (subvols, priv->children, sizeof (*subvols) * priv->child_count); + memcpy(subvols, priv->children, sizeof(*subvols) * priv->child_count); - if (type == MARKER_XTIME_TYPE) { - /*Don't error out on ENOENT/ENOTCONN */ - gauge[MCNT_NOTFOUND] = 0; - gauge[MCNT_ENOTCONN] = 0; - } - return priv->child_count; + if (type == MARKER_XTIME_TYPE) { + /*Don't error out on ENOENT/ENOTCONN */ + gauge[MCNT_NOTFOUND] = 0; + gauge[MCNT_ENOTCONN] = 0; + } + return priv->child_count; } static int -afr_handle_heal_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc, - const char *heal_op) +afr_handle_heal_xattrs(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *heal_op) { - int ret = -1; - afr_spb_status_t *data = NULL; + int ret = -1; + afr_spb_status_t *data = NULL; - if (!strcmp (heal_op, GF_HEAL_INFO)) { - afr_get_heal_info (frame, this, loc); - ret = 0; - goto out; - } + if (!strcmp(heal_op, GF_HEAL_INFO)) { + afr_get_heal_info(frame, this, loc); + ret = 0; + goto out; + } - if (!strcmp (heal_op, GF_AFR_HEAL_SBRAIN)) { - afr_heal_splitbrain_file (frame, this, loc); - ret = 0; - goto out; + if (!strcmp(heal_op, GF_AFR_HEAL_SBRAIN)) { + afr_heal_splitbrain_file(frame, this, loc); + ret = 0; + goto out; + } + + if (!strcmp(heal_op, GF_AFR_SBRAIN_STATUS)) { + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spb_status_t); + if (!data) { + ret = 1; + goto out; } - - if (!strcmp (heal_op, GF_AFR_SBRAIN_STATUS)) { - data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spb_status_t); - if (!data) { - ret = 1; - goto out; - } - data->frame = frame; - data->loc = loc; - ret = synctask_new (this->ctx->env, - afr_get_split_brain_status, - afr_get_split_brain_status_cbk, - NULL, data); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN_STATUS, - "Failed to create" - " synctask. Unable to fetch split-brain status" - " for %s.", loc->name); - ret = 1; - goto out; - } - goto out; + data->frame = frame; + data->loc = loc; + ret = synctask_new(this->ctx->env, afr_get_split_brain_status, + afr_get_split_brain_status_cbk, NULL, data); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "Failed to create" + " synctask. Unable to fetch split-brain status" + " for %s.", + loc->name); + ret = 1; + goto out; } + goto out; + } out: - if (ret == 1) { - AFR_STACK_UNWIND (getxattr, frame, -1, ENOMEM, NULL, NULL); - if (data) - GF_FREE (data); - ret = 0; - } - return ret; + if (ret == 1) { + AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL); + if (data) + GF_FREE(data); + ret = 0; + } + return ret; } int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - xlator_t **children = NULL; - int i = 0; - int32_t op_errno = 0; - int ret = -1; - fop_getxattr_cbk_t cbk = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + xlator_t **children = NULL; + int i = 0; + int32_t op_errno = 0; + int ret = -1; + fop_getxattr_cbk_t cbk = NULL; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + priv = this->private; - priv = this->private; + children = priv->children; - children = priv->children; + loc_copy(&local->loc, loc); - loc_copy (&local->loc, loc); + local->op = GF_FOP_GETXATTR; - local->op = GF_FOP_GETXATTR; + if (xdata) + local->xdata_req = dict_ref(xdata); - if (xdata) - local->xdata_req = dict_ref (xdata); + if (!name) + goto no_name; - if (!name) - goto no_name; + local->cont.getxattr.name = gf_strdup(name); - local->cont.getxattr.name = gf_strdup (name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; + } - if (!local->cont.getxattr.name) { - op_errno = ENOMEM; - goto out; - } + if (!strncmp(name, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) { + op_errno = ENODATA; + goto out; + } - if (!strncmp (name, AFR_XATTR_PREFIX, - strlen (AFR_XATTR_PREFIX))) { - op_errno = ENODATA; - goto out; - } - - if (cluster_handle_marker_getxattr (frame, loc, name, priv->vol_uuid, - afr_getxattr_unwind, - afr_marker_populate_args) == 0) - return 0; + if (cluster_handle_marker_getxattr(frame, loc, name, priv->vol_uuid, + afr_getxattr_unwind, + afr_marker_populate_args) == 0) + return 0; - ret = afr_handle_heal_xattrs (frame, this, &local->loc, name); - if (ret == 0) - return 0; + ret = afr_handle_heal_xattrs(frame, this, &local->loc, name); + if (ret == 0) + return 0; - /* - * Special xattrs which need responses from all subvols - */ - if (afr_is_special_xattr (name, &cbk, 0)) { - afr_getxattr_all_subvols (this, frame, name, loc, cbk); - return 0; - } + /* + * Heal daemons don't have IO threads ... and as a result they + * send this getxattr down and eventually crash :( + */ + op_errno = -1; + GF_CHECK_XATTR_KEY_AND_GOTO(name, IO_THREADS_QUEUE_SIZE_KEY, op_errno, out); + + /* + * Special xattrs which need responses from all subvols + */ + if (afr_is_special_xattr(name, &cbk, 0)) { + afr_getxattr_all_subvols(this, frame, name, loc, cbk); + return 0; + } - if (XATTR_IS_NODE_UUID (name)) { - i = 0; - STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk, - (void *) (long) i, - children[i], - children[i]->fops->getxattr, - loc, name, xdata); - return 0; - } + if (XATTR_IS_NODE_UUID(name)) { + i = 0; + STACK_WIND_COOKIE(frame, afr_getxattr_node_uuid_cbk, (void *)(long)i, + children[i], children[i]->fops->getxattr, loc, name, + xdata); + return 0; + } no_name: - afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind, - AFR_METADATA_TRANSACTION); + afr_read_txn(frame, this, local->loc.inode, afr_getxattr_wind, + AFR_METADATA_TRANSACTION); - ret = 0; + ret = 0; out: - if (ret < 0) - AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); - return 0; + if (ret < 0) + AFR_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL); + return 0; } /* {{{ fgetxattr */ - int32_t -afr_fgetxattr_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *dict, dict_t *xdata) +afr_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = op_errno; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - if (dict) - afr_filter_xattrs (dict); + if (dict) + afr_filter_xattrs(dict); - AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata); + AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata); - return 0; + return 0; } int -afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_fgetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (subvol == -1) { - AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret, - local->op_errno, NULL, NULL); - return 0; - } - - STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->fgetxattr, - local->fd, local->cont.getxattr.name, - local->xdata_req); - return 0; -} + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + local = frame->local; + priv = this->private; -static void -afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame, - fop_fgetxattr_cbk_t cbk) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int call_count = 0; + if (subvol == -1) { + AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, NULL, + NULL); + return 0; + } - priv = this->private; + STACK_WIND_COOKIE(frame, afr_fgetxattr_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fgetxattr, local->fd, + local->cont.getxattr.name, local->xdata_req); + return 0; +} - local = frame->local; - //local->call_count set in afr_local_init - call_count = local->call_count; - - //If up-children count is 0, afr_local_init would have failed already - //and the call would have unwound so not handling it here. - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->fgetxattr, - local->fd, local->cont.getxattr.name, - NULL); - if (!--call_count) - break; - } +static void +afr_fgetxattr_all_subvols(xlator_t *this, call_frame_t *frame, + fop_fgetxattr_cbk_t cbk) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + + priv = this->private; + + local = frame->local; + // local->call_count set in afr_local_init + call_count = local->call_count; + + // If up-children count is 0, afr_local_init would have failed already + // and the call would have unwound so not handling it here. + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i], + priv->children[i]->fops->fgetxattr, local->fd, + local->cont.getxattr.name, NULL); + if (!--call_count) + break; } + } - return; + return; } - int -afr_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata) +afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) { - afr_local_t *local = NULL; - int32_t op_errno = 0; - fop_fgetxattr_cbk_t cbk = NULL; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = GF_FOP_FGETXATTR; - local->fd = fd_ref (fd); - if (name) { - local->cont.getxattr.name = gf_strdup (name); - if (!local->cont.getxattr.name) { - op_errno = ENOMEM; - goto out; - } - } - if (xdata) - local->xdata_req = dict_ref (xdata); - - /* pathinfo gets handled only in getxattr(), but we need to handle - * lockinfo. - * If we are doing fgetxattr with lockinfo as the key then we - * collect information from all children. - */ - if (afr_is_special_xattr (name, &cbk, 1)) { - afr_fgetxattr_all_subvols (this, frame, cbk); - return 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; + fop_fgetxattr_cbk_t cbk = NULL; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_FGETXATTR; + local->fd = fd_ref(fd); + if (name) { + local->cont.getxattr.name = gf_strdup(name); + if (!local->cont.getxattr.name) { + op_errno = ENOMEM; + goto out; } + } + if (xdata) + local->xdata_req = dict_ref(xdata); + + /* pathinfo gets handled only in getxattr(), but we need to handle + * lockinfo. + * If we are doing fgetxattr with lockinfo as the key then we + * collect information from all children. + */ + if (afr_is_special_xattr(name, &cbk, 1)) { + afr_fgetxattr_all_subvols(this, frame, cbk); + return 0; + } - afr_fix_open (fd, this); + afr_fix_open(fd, this); - afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind, - AFR_METADATA_TRANSACTION); + afr_read_txn(frame, this, fd->inode, afr_fgetxattr_wind, + AFR_METADATA_TRANSACTION); - return 0; + return 0; out: - AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); + AFR_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL); - return 0; + return 0; } - /* }}} */ /* {{{ readv */ int -afr_readv_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - struct iovec *vector, int32_t count, struct iatt *buf, - struct iobref *iobref, dict_t *xdata) +afr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, int32_t count, + struct iatt *buf, struct iobref *iobref, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = op_errno; + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + afr_read_txn_continue(frame, this, (long)cookie); + return 0; + } - AFR_STACK_UNWIND (readv, frame, op_ret, op_errno, - vector, count, buf, iobref, xdata); - return 0; + AFR_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, iobref, + xdata); + return 0; } - int -afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_readv_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; - - if (subvol == -1) { - AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno, - 0, 0, 0, 0, 0); - return 0; - } - - STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->readv, - local->fd, local->cont.readv.size, - local->cont.readv.offset, local->cont.readv.flags, - local->xdata_req); - return 0; -} + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + local = frame->local; + priv = this->private; + + if (subvol == -1) { + AFR_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, 0, 0, 0, + 0, 0); + return 0; + } + + STACK_WIND_COOKIE( + frame, afr_readv_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->readv, local->fd, local->cont.readv.size, + local->cont.readv.offset, local->cont.readv.flags, local->xdata_req); + return 0; +} int -afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, - off_t offset, uint32_t flags, dict_t *xdata) +afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) { - afr_local_t * local = NULL; - int32_t op_errno = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_READ; - local->fd = fd_ref (fd); - local->cont.readv.size = size; - local->cont.readv.offset = offset; - local->cont.readv.flags = flags; - if (xdata) - local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_READ; + local->fd = fd_ref(fd); + local->cont.readv.size = size; + local->cont.readv.offset = offset; + local->cont.readv.flags = flags; + if (xdata) + local->xdata_req = dict_ref(xdata); - afr_fix_open (fd, this); + afr_fix_open(fd, this); - afr_read_txn (frame, this, fd->inode, afr_readv_wind, - AFR_DATA_TRANSACTION); + afr_read_txn(frame, this, fd->inode, afr_readv_wind, AFR_DATA_TRANSACTION); - return 0; + return 0; out: - AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); + AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0); - return 0; + return 0; } /* }}} */ @@ -1900,77 +1822,73 @@ out: /* {{{ seek */ int -afr_seek_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata) +afr_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, off_t offset, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret < 0) { - local->op_ret = -1; - local->op_errno = op_errno; - - afr_read_txn_continue (frame, this, (long) cookie); - return 0; - } + if (op_ret < 0) { + local->op_ret = -1; + local->op_errno = op_errno; - AFR_STACK_UNWIND (seek, frame, op_ret, op_errno, offset, xdata); + afr_read_txn_continue(frame, this, (long)cookie); return 0; -} + } + AFR_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata); + return 0; +} int -afr_seek_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_seek_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - local = frame->local; - priv = this->private; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - if (subvol == -1) { - AFR_STACK_UNWIND (seek, frame, local->op_ret, local->op_errno, - 0, NULL); - return 0; - } + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_seek_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->seek, - local->fd, local->cont.seek.offset, - local->cont.seek.what, local->xdata_req); + if (subvol == -1) { + AFR_STACK_UNWIND(seek, frame, local->op_ret, local->op_errno, 0, NULL); return 0; -} + } + STACK_WIND_COOKIE( + frame, afr_seek_cbk, (void *)(long)subvol, priv->children[subvol], + priv->children[subvol]->fops->seek, local->fd, local->cont.seek.offset, + local->cont.seek.what, local->xdata_req); + return 0; +} int -afr_seek (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - gf_seek_what_t what, dict_t *xdata) +afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) { - afr_local_t *local = NULL; - int32_t op_errno = 0; + afr_local_t *local = NULL; + int32_t op_errno = 0; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->op = GF_FOP_SEEK; - local->fd = fd_ref (fd); - local->cont.seek.offset = offset; - local->cont.seek.what = what; - if (xdata) - local->xdata_req = dict_ref (xdata); + local->op = GF_FOP_SEEK; + local->fd = fd_ref(fd); + local->cont.seek.offset = offset; + local->cont.seek.what = what; + if (xdata) + local->xdata_req = dict_ref(xdata); - afr_fix_open (fd, this); + afr_fix_open(fd, this); - afr_read_txn (frame, this, fd->inode, afr_seek_wind, - AFR_DATA_TRANSACTION); + afr_read_txn(frame, this, fd->inode, afr_seek_wind, AFR_DATA_TRANSACTION); - return 0; + return 0; out: - AFR_STACK_UNWIND (seek, frame, -1, op_errno, 0, NULL); + AFR_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL); - return 0; + return 0; } /* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h index d128134ef2a..8c982bc7e6f 100644 --- a/xlators/cluster/afr/src/afr-inode-read.h +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -12,34 +12,34 @@ #define __INODE_READ_H__ int32_t -afr_access (call_frame_t *frame, xlator_t *this, - loc_t *loc, int32_t mask, dict_t *xdata); +afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, + dict_t *xdata); int32_t -afr_stat (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *xdata); +afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); int32_t -afr_fstat (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *xdata); +afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata); int32_t -afr_readlink (call_frame_t *frame, xlator_t *this, - loc_t *loc, size_t size, dict_t *xdata); +afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, + dict_t *xdata); int32_t -afr_readv (call_frame_t *frame, xlator_t *this, - fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata); +afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata); int32_t -afr_getxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata); +afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name, + dict_t *xdata); int32_t -afr_fgetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata); - +afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata); int -afr_handle_quota_size (call_frame_t *frame, xlator_t *this); +afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata); +int +afr_handle_quota_size(call_frame_t *frame, xlator_t *this); #endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c index 14114699594..1d6e4f3570a 100644 --- a/xlators/cluster/afr/src/afr-inode-write.c +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -8,796 +8,779 @@ cases as published by the Free Software Foundation. */ - -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> #include "protocol-common.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> #include "afr-transaction.h" #include "afr-self-heal.h" #include "afr-messages.h" static void -__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this) -{ - int i = 0; - int ret = 0; - int read_subvol = 0; - struct iatt *stbuf = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_read_subvol_args_t args = {0,}; - - local = frame->local; - priv = this->private; - - /*This code needs to stay till DHT sends fops on linked - * inodes*/ - if (local->inode && !inode_is_linked (local->inode)) { - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret == -1) - continue; - if (!gf_uuid_is_null - (local->replies[i].poststat.ia_gfid)) { - gf_uuid_copy (args.gfid, - local->replies[i].poststat.ia_gfid); - args.ia_type = - local->replies[i].poststat.ia_type; - break; - } else { - ret = dict_get_bin (local->replies[i].xdata, - DHT_IATT_IN_XDATA_KEY, - (void **) &stbuf); - if (ret) - continue; - gf_uuid_copy (args.gfid, stbuf->ia_gfid); - args.ia_type = stbuf->ia_type; - break; - } - } +__afr_inode_write_finalize(call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int ret = 0; + int read_subvol = 0; + struct iatt *stbuf = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_read_subvol_args_t args = { + 0, + }; + + local = frame->local; + priv = this->private; + GF_VALIDATE_OR_GOTO(this->name, local->inode, out); + + /*This code needs to stay till DHT sends fops on linked + * inodes*/ + if (!inode_is_linked(local->inode)) { + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret == -1) + continue; + if (!gf_uuid_is_null(local->replies[i].poststat.ia_gfid)) { + gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid); + args.ia_type = local->replies[i].poststat.ia_type; + break; + } else { + ret = dict_get_bin(local->replies[i].xdata, + DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); + if (ret) + continue; + gf_uuid_copy(args.gfid, stbuf->ia_gfid); + args.ia_type = stbuf->ia_type; + break; + } + } + } + + if (local->transaction.type == AFR_METADATA_TRANSACTION) { + read_subvol = afr_metadata_subvol_get(local->inode, this, NULL, + local->readable, NULL, &args); + } else { + read_subvol = afr_data_subvol_get(local->inode, this, NULL, + local->readable, NULL, &args); + } + + local->op_ret = -1; + local->op_errno = afr_final_errno(local, priv); + afr_pick_error_xdata(local, priv, local->inode, local->readable, NULL, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; + if (local->replies[i].op_ret < 0) + continue; + + /* Order of checks in the compound conditional + below is important. + + - Highest precedence: largest op_ret + - Next precedence: if all op_rets are equal, read subvol + - Least precedence: any succeeded subvol + */ + if ((local->op_ret < local->replies[i].op_ret) || + ((local->op_ret == local->replies[i].op_ret) && + (i == read_subvol))) { + local->op_ret = local->replies[i].op_ret; + local->op_errno = local->replies[i].op_errno; + + local->cont.inode_wfop.prebuf = local->replies[i].prestat; + local->cont.inode_wfop.postbuf = local->replies[i].poststat; + + if (local->replies[i].xdata) { + if (local->xdata_rsp) + dict_unref(local->xdata_rsp); + local->xdata_rsp = dict_ref(local->replies[i].xdata); + } + if (local->replies[i].xattr) { + if (local->xattr_rsp) + dict_unref(local->xattr_rsp); + local->xattr_rsp = dict_ref(local->replies[i].xattr); + } } + } - if (local->inode) { - if (local->transaction.type == AFR_METADATA_TRANSACTION) - read_subvol = afr_metadata_subvol_get (local->inode, - this, NULL, local->readable, NULL, &args); - else - read_subvol = afr_data_subvol_get (local->inode, this, - NULL, local->readable, NULL, &args); - } - - local->op_ret = -1; - local->op_errno = afr_final_errno (local, priv); - afr_pick_error_xdata (local, priv, local->inode, local->readable, NULL, - NULL); - - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - if (local->replies[i].op_ret < 0) - continue; - - /* Order of checks in the compound conditional - below is important. - - - Highest precedence: largest op_ret - - Next precendence: if all op_rets are equal, read subvol - - Least precedence: any succeeded subvol - */ - if ((local->op_ret < local->replies[i].op_ret) || - ((local->op_ret == local->replies[i].op_ret) && - (i == read_subvol))) { - - local->op_ret = local->replies[i].op_ret; - local->op_errno = local->replies[i].op_errno; - - local->cont.inode_wfop.prebuf = - local->replies[i].prestat; - local->cont.inode_wfop.postbuf = - local->replies[i].poststat; - - if (local->replies[i].xdata) { - if (local->xdata_rsp) - dict_unref (local->xdata_rsp); - local->xdata_rsp = - dict_ref (local->replies[i].xdata); - } - if (local->replies[i].xattr) { - if (local->xattr_rsp) - dict_unref (local->xattr_rsp); - local->xattr_rsp = - dict_ref (local->replies[i].xattr); - } - } - } - - afr_set_in_flight_sb_status (this, frame, local->inode); + afr_set_in_flight_sb_status(this, frame, local->inode); +out: + return; } - static void -__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, - int op_ret, int op_errno, - struct iatt *prebuf, struct iatt *postbuf, - dict_t *xattr, dict_t *xdata) +__afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, + int op_ret, int op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - local->replies[child_index].valid = 1; + local->replies[child_index].valid = 1; - if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1) - op_ret = iov_length (local->cont.writev.vector, - local->cont.writev.count); + if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1) + op_ret = iov_length(local->cont.writev.vector, + local->cont.writev.count); - local->replies[child_index].op_ret = op_ret; - local->replies[child_index].op_errno = op_errno; - if (xdata) - local->replies[child_index].xdata = dict_ref (xdata); + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + if (xdata) + local->replies[child_index].xdata = dict_ref(xdata); - if (op_ret >= 0) { - if (prebuf) - local->replies[child_index].prestat = *prebuf; - if (postbuf) - local->replies[child_index].poststat = *postbuf; - if (xattr) - local->replies[child_index].xattr = dict_ref (xattr); - } else { - afr_transaction_fop_failed (frame, this, child_index); - } + if (op_ret >= 0) { + if (prebuf) + local->replies[child_index].prestat = *prebuf; + if (postbuf) + local->replies[child_index].poststat = *postbuf; + if (xattr) + local->replies[child_index].xattr = dict_ref(xattr); + } else { + afr_transaction_fop_failed(frame, this, child_index); + } - return; + return; } - static int -__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xattr, dict_t *xdata) -{ - afr_local_t *local = NULL; - int child_index = (long) cookie; - int call_count = -1; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - - LOCK (&frame->lock); - { - __afr_inode_write_fill (frame, this, child_index, op_ret, - op_errno, prebuf, postbuf, xattr, - xdata); +__afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xattr, dict_t *xdata) +{ + afr_local_t *local = NULL; + int child_index = (long)cookie; + int call_count = -1; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + LOCK(&frame->lock); + { + __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, xattr, xdata); + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + __afr_inode_write_finalize(frame, this); + + if (afr_txn_nothing_failed(frame, this)) { + /*if it did pre-op, it will do post-op changing ctime*/ + if (priv->consistent_metadata && afr_needs_changelog_update(local)) + afr_zero_fill_stat(local); + local->transaction.unwind(frame, this); } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - if (call_count == 0) { - __afr_inode_write_finalize (frame, this); - - if (afr_txn_nothing_failed (frame, this)) { - /*if it did pre-op, it will do post-op changing ctime*/ - if (priv->consistent_metadata && - afr_needs_changelog_update (local)) - afr_zero_fill_stat (local); - local->transaction.unwind (frame, this); - } - - local->transaction.resume (frame, this); - } + afr_transaction_resume(frame, this); + } - return 0; + return 0; } /* {{{ writev */ void -afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame) +afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame) { - afr_local_t *src_local = NULL; - afr_local_t *dst_local = NULL; + afr_local_t *src_local = NULL; + afr_local_t *dst_local = NULL; - src_local = src_frame->local; - dst_local = dst_frame->local; + src_local = src_frame->local; + dst_local = dst_frame->local; - dst_local->op_ret = src_local->op_ret; - dst_local->op_errno = src_local->op_errno; - dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; - dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; - if (src_local->xdata_rsp) - dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp); + dst_local->op_ret = src_local->op_ret; + dst_local->op_errno = src_local->op_errno; + dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf; + dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf; + if (src_local->xdata_rsp) + dst_local->xdata_rsp = dict_ref(src_local->xdata_rsp); } void -afr_writev_unwind (call_frame_t *frame, xlator_t *this) +afr_writev_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - afr_private_t *priv = this->private; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; - local = frame->local; + local = frame->local; - if (priv->consistent_metadata) - afr_zero_fill_stat (local); + if (priv->consistent_metadata) + afr_zero_fill_stat(local); - AFR_STACK_UNWIND (writev, frame, - local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - local->xdata_rsp); + AFR_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); } - int -afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this) +afr_transaction_writev_unwind(call_frame_t *frame, xlator_t *this) { - call_frame_t *fop_frame = NULL; + call_frame_t *fop_frame = NULL; - fop_frame = afr_transaction_detach_fop_frame (frame); + fop_frame = afr_transaction_detach_fop_frame(frame); - if (fop_frame) { - afr_writev_copy_outvars (frame, fop_frame); - afr_writev_unwind (fop_frame, this); - } - return 0; + if (fop_frame) { + afr_writev_copy_outvars(frame, fop_frame); + afr_writev_unwind(fop_frame, this); + } + return 0; } static void -afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - priv = this->private; - /* - * We already have the best case result of the writev calls staged - * as the return value. Any writev that returns some value less - * than the best case is now out of sync, so mark the fop as - * failed. Note that fops that have returned with errors have - * already been marked as failed. - */ - for (i = 0; i < priv->child_count; i++) { - if ((!local->replies[i].valid) || - (local->replies[i].op_ret == -1)) - continue; - - if (local->replies[i].op_ret < local->op_ret) - afr_transaction_fop_failed (frame, this, i); - } +afr_writev_handle_short_writes(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + /* + * We already have the best case result of the writev calls staged + * as the return value. Any writev that returns some value less + * than the best case is now out of sync, so mark the fop as + * failed. Note that fops that have returned with errors have + * already been marked as failed. + */ + for (i = 0; i < priv->child_count; i++) { + if ((!local->replies[i].valid) || (local->replies[i].op_ret == -1)) + continue; + + if (local->replies[i].op_ret < local->op_ret) + afr_transaction_fop_failed(frame, this, i); + } } void -afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, +afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - int ret = 0; - afr_local_t *local = frame->local; - uint32_t open_fd_count = 0; - uint32_t write_is_append = 0; - - LOCK (&frame->lock); - { - __afr_inode_write_fill (frame, this, child_index, op_ret, - op_errno, prebuf, postbuf, NULL, xdata); - if (op_ret == -1 || !xdata) - goto unlock; - - write_is_append = 0; - ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND, - &write_is_append); - if (ret || !write_is_append) - local->append_write = _gf_false; - - ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, - &open_fd_count); - if (ret == -1) - goto unlock; - if (open_fd_count > local->open_fd_count) { - local->open_fd_count = open_fd_count; - local->update_open_fd_count = _gf_true; - } + int ret = 0; + afr_local_t *local = frame->local; + uint32_t open_fd_count = 0; + uint32_t write_is_append = 0; + int32_t num_inodelks = 0; + + LOCK(&frame->lock); + { + __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, NULL, xdata); + if (op_ret == -1 || !xdata) + goto unlock; + + write_is_append = 0; + ret = dict_get_uint32(xdata, GLUSTERFS_WRITE_IS_APPEND, + &write_is_append); + if (ret || !write_is_append) + local->append_write = _gf_false; + + ret = dict_get_uint32(xdata, GLUSTERFS_ACTIVE_FD_COUNT, &open_fd_count); + if (ret < 0) + goto unlock; + if (open_fd_count > local->open_fd_count) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; } + + ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT, + &num_inodelks); + if (ret < 0) + goto unlock; + if (num_inodelks > local->num_inodelks) { + local->num_inodelks = num_inodelks; + local->update_num_inodelks = _gf_true; + } + } unlock: - UNLOCK (&frame->lock); + UNLOCK(&frame->lock); } void -afr_process_post_writev (call_frame_t *frame, xlator_t *this) +afr_process_post_writev(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - - local = frame->local; + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; - if (!local->stable_write && !local->append_write) - /* An appended write removes the necessity to - fsync() the file. This is because self-heal - has the logic to check for larger file when - the xattrs are not reliably pointing at - a stale file. - */ - afr_fd_report_unstable_write (this, local->fd); + local = frame->local; - __afr_inode_write_finalize (frame, this); + if (!local->stable_write && !local->append_write) + /* An appended write removes the necessity to + fsync() the file. This is because self-heal + has the logic to check for larger file when + the xattrs are not reliably pointing at + a stale file. + */ + afr_fd_report_unstable_write(this, local); - afr_writev_handle_short_writes (frame, this); + __afr_inode_write_finalize(frame, this); - if (local->update_open_fd_count) - afr_handle_open_fd_count (frame, this); + afr_writev_handle_short_writes(frame, this); + if (local->update_open_fd_count) + local->inode_ctx->open_fd_count = local->open_fd_count; + if (local->update_num_inodelks && + local->transaction.type == AFR_DATA_TRANSACTION) { + lock = &local->inode_ctx->lock[local->transaction.type]; + lock->num_inodelks = local->num_inodelks; + } } int -afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +afr_writev_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *fop_frame = NULL; - int child_index = (long) cookie; - int call_count = -1; - - local = frame->local; - - afr_inode_write_fill (frame, this, child_index, op_ret, op_errno, - prebuf, postbuf, xdata); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - afr_process_post_writev (frame, this); - - if (!afr_txn_nothing_failed (frame, this)) { - //Don't unwind until post-op is complete - local->transaction.resume (frame, this); - } else { - /* - * Generally inode-write fops do transaction.unwind then - * transaction.resume, but writev needs to make sure that - * delayed post-op frame is placed in fdctx before unwind - * happens. This prevents the race of flush doing the - * changelog wakeup first in fuse thread and then this - * writev placing its delayed post-op frame in fdctx. - * This helps flush make sure all the delayed post-ops are - * completed. - */ - - fop_frame = afr_transaction_detach_fop_frame (frame); - afr_writev_copy_outvars (frame, fop_frame); - local->transaction.resume (frame, this); - afr_writev_unwind (fop_frame, this); - } + call_frame_t *fop_frame = NULL; + int child_index = (long)cookie; + int call_count = -1; + + afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf, + postbuf, xdata); + + call_count = afr_frame_return(frame); + + if (call_count == 0) { + afr_process_post_writev(frame, this); + + if (!afr_txn_nothing_failed(frame, this)) { + // Don't unwind until post-op is complete + afr_transaction_resume(frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then + * transaction.resume, but writev needs to make sure that + * delayed post-op frame is placed in fdctx before unwind + * happens. This prevents the race of flush doing the + * changelog wakeup first in fuse thread and then this + * writev placing its delayed post-op frame in fdctx. + * This helps flush make sure all the delayed post-ops are + * completed. + */ + + fop_frame = afr_transaction_detach_fop_frame(frame); + afr_writev_copy_outvars(frame, fop_frame); + afr_transaction_resume(frame, this); + afr_writev_unwind(fop_frame, this); } - return 0; + } + return 0; } static int -afr_arbiter_writev_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_arbiter_writev_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = frame->local; - afr_private_t *priv = this->private; - static char byte = 0xFF; - static struct iovec vector = {&byte, 1}; - int32_t count = 1; + afr_local_t *local = frame->local; + afr_private_t *priv = this->private; + static char byte = 0xFF; + static struct iovec vector = {&byte, 1}; + int32_t count = 1; - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->writev, - local->fd, &vector, count, local->cont.writev.offset, - local->cont.writev.flags, local->cont.writev.iobref, - local->xdata_req); + STACK_WIND_COOKIE( + frame, afr_writev_wind_cbk, (void *)(long)subvol, + priv->children[subvol], priv->children[subvol]->fops->writev, local->fd, + &vector, count, local->cont.writev.offset, local->cont.writev.flags, + local->cont.writev.iobref, local->xdata_req); - return 0; + return 0; } int -afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_writev_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; - - if (AFR_IS_ARBITER_BRICK(priv, subvol)) { - afr_arbiter_writev_wind (frame, this, subvol); - return 0; - } + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->writev, - local->fd, local->cont.writev.vector, - local->cont.writev.count, local->cont.writev.offset, - local->cont.writev.flags, local->cont.writev.iobref, - local->xdata_req); + if (AFR_IS_ARBITER_BRICK(priv, subvol)) { + afr_arbiter_writev_wind(frame, this, subvol); return 0; -} + } + STACK_WIND_COOKIE(frame, afr_writev_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->writev, local->fd, + local->cont.writev.vector, local->cont.writev.count, + local->cont.writev.offset, local->cont.writev.flags, + local->cont.writev.iobref, local->xdata_req); + return 0; +} int -afr_do_writev (call_frame_t *frame, xlator_t *this) +afr_do_writev(call_frame_t *frame, xlator_t *this) { - call_frame_t *transaction_frame = NULL; - afr_local_t *local = NULL; - int ret = -1; - int op_errno = ENOMEM; - - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - local = frame->local; - transaction_frame->local = local; - frame->local = NULL; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - if (!AFR_FRAME_INIT (frame, op_errno)) - goto out; + local = frame->local; + transaction_frame->local = local; + frame->local = NULL; - local->op = GF_FOP_WRITE; + if (!AFR_FRAME_INIT(frame, op_errno)) + goto out; - local->transaction.wind = afr_writev_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_transaction_writev_unwind; + local->op = GF_FOP_WRITE; - local->transaction.main_frame = frame; + local->transaction.wind = afr_writev_wind; + local->transaction.unwind = afr_transaction_writev_unwind; - if (local->fd->flags & O_APPEND) { - /* - * Backend vfs ignores the 'offset' for append mode fd so - * locking just the region provided for the writev does not - * give consistency guarantee. The actual write may happen at a - * completely different range than the one provided by the - * offset, len in the fop. So lock the entire file. - */ - local->transaction.start = 0; - local->transaction.len = 0; - } else { - local->transaction.start = local->cont.writev.offset; - local->transaction.len = iov_length (local->cont.writev.vector, - local->cont.writev.count); - } + local->transaction.main_frame = frame; - ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } - - return 0; + if (local->fd->flags & O_APPEND) { + /* + * Backend vfs ignores the 'offset' for append mode fd so + * locking just the region provided for the writev does not + * give consistency guarantee. The actual write may happen at a + * completely different range than the one provided by the + * offset, len in the fop. So lock the entire file. + */ + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = local->cont.writev.offset; + local->transaction.len = iov_length(local->cont.writev.vector, + local->cont.writev.count); + } + + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - int -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata) +afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) { - afr_local_t *local = NULL; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + int op_errno = ENOMEM; + int ret = -1; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + local->cont.writev.vector = iov_dup(vector, count); + if (!local->cont.writev.vector) + goto out; + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.flags = flags; + local->cont.writev.iobref = iobref_ref(iobref); - local->cont.writev.vector = iov_dup (vector, count); - if (!local->cont.writev.vector) - goto out; - local->cont.writev.count = count; - local->cont.writev.offset = offset; - local->cont.writev.flags = flags; - local->cont.writev.iobref = iobref_ref (iobref); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + if (!local->xdata_req) + goto out; - if (!local->xdata_req) - goto out; + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + if (dict_set_uint32(local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) { + op_errno = ENOMEM; + goto out; + } - if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { - op_errno = ENOMEM; - goto out; - } + if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT, + this->name)) { + op_errno = ENOMEM; + goto out; + } - if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { - op_errno = ENOMEM; - goto out; - } + if (dict_set_uint32(local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; + goto out; + } - /* Set append_write to be true speculatively. If on any - server it turns not be true, we unset it in the - callback. - */ - local->append_write = _gf_true; + /* Set append_write to be true speculatively. If on any + server it turns not be true, we unset it in the + callback. + */ + local->append_write = _gf_true; - /* detect here, but set it in writev_wind_cbk *after* the unstable - write is performed - */ - local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC)); + /* detect here, but set it in writev_wind_cbk *after* the unstable + write is performed + */ + local->stable_write = !!((fd->flags | flags) & (O_SYNC | O_DSYNC)); - afr_fix_open (fd, this); + afr_fix_open(fd, this); - afr_do_writev (frame, this); + afr_do_writev(frame, this); - return 0; + return 0; out: - AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } - /* }}} */ /* {{{ truncate */ int -afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +afr_truncate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(truncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +afr_truncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_truncate_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->truncate, - &local->loc, local->cont.truncate.offset, - local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_truncate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->truncate, &local->loc, + local->cont.truncate.offset, local->xdata_req); + return 0; } - int -afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset, dict_t *xdata) +afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata) { - afr_local_t * local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.truncate.offset = offset; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + local->cont.truncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->transaction.wind = afr_truncate_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_truncate_unwind; + local->transaction.wind = afr_truncate_wind; + local->transaction.unwind = afr_truncate_unwind; - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->op = GF_FOP_TRUNCATE; + local->op = GF_FOP_TRUNCATE; - local->transaction.main_frame = frame; - local->transaction.start = offset; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = offset; + local->transaction.len = 0; - /* Set it true speculatively, will get reset in afr_truncate_wind_cbk - if truncate was not a NOP */ - local->stable_write = _gf_true; + /* Set it true speculatively, will get reset in afr_truncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - /* }}} */ /* {{{ ftruncate */ - int -afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +afr_ftruncate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; - - AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(ftruncate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +afr_ftruncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) - local->stable_write = _gf_false; + if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size) + local->stable_write = _gf_false; - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_ftruncate_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->ftruncate, - local->fd, local->cont.ftruncate.offset, - local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_ftruncate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->ftruncate, local->fd, + local->cont.ftruncate.offset, local->xdata_req); + return 0; } - int -afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - dict_t *xdata) +afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.ftruncate.offset = offset; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + local->cont.ftruncate.offset = offset; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->op = GF_FOP_FTRUNCATE; + local->op = GF_FOP_FTRUNCATE; - local->transaction.wind = afr_ftruncate_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_ftruncate_unwind; + local->transaction.wind = afr_ftruncate_wind; + local->transaction.unwind = afr_ftruncate_unwind; - local->transaction.main_frame = frame; + local->transaction.main_frame = frame; - local->transaction.start = local->cont.ftruncate.offset; - local->transaction.len = 0; + local->transaction.start = local->cont.ftruncate.offset; + local->transaction.len = 0; - afr_fix_open (fd, this); + afr_fix_open(fd, this); - /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk - if truncate was not a NOP */ - local->stable_write = _gf_true; + /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk + if truncate was not a NOP */ + local->stable_write = _gf_true; - ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); + AFR_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + return 0; } /* }}} */ @@ -805,1707 +788,1778 @@ out: /* {{{ setattr */ int -afr_setattr_unwind (call_frame_t *frame, xlator_t *this) +afr_setattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; - - AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, - local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(setattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) +afr_setattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - preop, postop, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop, + postop, NULL, xdata); } - int -afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_setattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->setattr, - &local->loc, &local->cont.setattr.in_buf, - local->cont.setattr.valid, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_setattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->setattr, &local->loc, + &local->cont.setattr.in_buf, local->cont.setattr.valid, + local->xdata_req); + return 0; } - int -afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, - int32_t valid, dict_t *xdata) +afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.setattr.in_buf = *buf; - local->cont.setattr.valid = valid; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + local->cont.setattr.in_buf = *buf; + local->cont.setattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->transaction.wind = afr_setattr_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_setattr_unwind; + local->transaction.wind = afr_setattr_wind; + local->transaction.unwind = afr_setattr_unwind; - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->op = GF_FOP_SETATTR; + local->op = GF_FOP_SETATTR; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* {{{ fsetattr */ int -afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fsetattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fsetattr, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, - struct iatt *preop, struct iatt *postop, dict_t *xdata) +afr_fsetattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *preop, + struct iatt *postop, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - preop, postop, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop, + postop, NULL, xdata); } - int -afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_fsetattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->fsetattr, - local->fd, &local->cont.fsetattr.in_buf, - local->cont.fsetattr.valid, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_fsetattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetattr, local->fd, + &local->cont.fsetattr.in_buf, local->cont.fsetattr.valid, + local->xdata_req); + return 0; } - int -afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata) +afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int32_t valid, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.fsetattr.in_buf = *buf; - local->cont.fsetattr.valid = valid; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + local->cont.fsetattr.in_buf = *buf; + local->cont.fsetattr.valid = valid; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->transaction.wind = afr_fsetattr_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_fsetattr_unwind; + local->transaction.wind = afr_fsetattr_wind; + local->transaction.unwind = afr_fsetattr_unwind; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->op = GF_FOP_FSETATTR; + local->op = GF_FOP_FSETATTR; - afr_fix_open (fd, this); + afr_fix_open(fd, this); - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - /* {{{ setxattr */ - int -afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +afr_setxattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno, - local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(setxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} int -afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_setxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - NULL, NULL, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); } - int -afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_setxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->setxattr, - &local->loc, local->cont.setxattr.dict, - local->cont.setxattr.flags, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_setxattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->setxattr, &local->loc, + local->cont.setxattr.dict, local->cont.setxattr.flags, + local->xdata_req); + return 0; } int -afr_emptyb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int op_ret, int op_errno, - dict_t *xattr, dict_t *xdata) +afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i, ret = 0; - char *op_type = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i, ret = 0; + char *op_type = NULL; - local = frame->local; - priv = this->private; - i = (long) cookie; + local = frame->local; + priv = this->private; + i = (long)cookie; - local->replies[i].valid = 1; - local->replies[i].op_ret = op_ret; - local->replies[i].op_errno = op_errno; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - ret = dict_get_str (local->xdata_req, "replicate-brick-op", &op_type); - if (ret) - goto out; + ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type); + if (ret) + goto out; - gf_msg (this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, - op_ret ? op_errno : 0, - afr_get_msg_id (op_type), - "Set of pending xattr %s on" - " %s.", op_ret ? "failed" : "succeeded", - priv->children[i]->name); + gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO, + op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s", + priv->children[i]->name, "op_ret=%s", + op_ret ? "failed" : "succeeded", NULL); out: - syncbarrier_wake (&local->barrier); - return 0; + syncbarrier_wake(&local->barrier); + return 0; } int -afr_emptyb_set_pending_changelog (call_frame_t *frame, xlator_t *this, - unsigned char *locked_nodes) +afr_emptyb_set_pending_changelog(call_frame_t *frame, xlator_t *this, + unsigned char *locked_nodes) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int ret = 0, i = 0; - - local = frame->local; - priv = this->private; - - AFR_ONLIST (locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk, - xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, - local->xattr_req, NULL); - - /* It is sufficient if xattrop was successful on one child */ - for (i = 0; i < priv->child_count; i++) { - if (!local->replies[i].valid) - continue; - - if (local->replies[i].op_ret == 0) { - ret = 0; - goto out; - } else { - ret = afr_higher_errno (ret, - local->replies[i].op_errno); - } - } -out: - return -ret; -} - -int -_afr_handle_empty_brick_type (xlator_t *this, call_frame_t *frame, - loc_t *loc, int empty_index, - afr_transaction_type type, - char *op_type) -{ - int count = 0; - int ret = -ENOMEM; - int idx = -1; - int d_idx = -1; - unsigned char *locked_nodes = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - - locked_nodes = alloca0 (priv->child_count); - - idx = afr_index_for_transaction_type (type); - d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - - local->pending = afr_matrix_create (priv->child_count, - AFR_NUM_CHANGE_LOGS); - if (!local->pending) - goto out; - - local->pending[empty_index][idx] = hton32 (1); - - if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION)) - local->pending[empty_index][d_idx] = hton32 (1); + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int ret = 0, i = 0; - local->xdata_req = dict_new (); - if (!local->xdata_req) - goto out; - - ret = dict_set_str (local->xdata_req, "replicate-brick-op", op_type); - if (ret) - goto out; + local = frame->local; + priv = this->private; - local->xattr_req = dict_new (); - if (!local->xattr_req) - goto out; + AFR_ONLIST(locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk, + xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, local->xattr_req, + NULL); - ret = afr_set_pending_dict (priv, local->xattr_req, local->pending); - if (ret < 0) - goto out; + /* It is sufficient if xattrop was successful on one child */ + for (i = 0; i < priv->child_count; i++) { + if (!local->replies[i].valid) + continue; - if (AFR_ENTRY_TRANSACTION == type) { - count = afr_selfheal_entrylk (frame, this, loc->inode, - this->name, NULL, locked_nodes); + if (local->replies[i].op_ret == 0) { + ret = 0; + goto out; } else { - count = afr_selfheal_inodelk (frame, this, loc->inode, - this->name, LLONG_MAX - 1, 0, - locked_nodes); - } - - if (!count) { - gf_msg (this->name, GF_LOG_ERROR, EAGAIN, - AFR_MSG_REPLACE_BRICK_STATUS, "Couldn't acquire lock on" - " any child."); - ret = -EAGAIN; - goto unlock; + ret = afr_higher_errno(ret, local->replies[i].op_errno); } + } +out: + return -ret; +} - ret = afr_emptyb_set_pending_changelog (frame, this, locked_nodes); - if (ret) - goto unlock; - ret = 0; +static int +_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc, + int empty_index, afr_transaction_type type, + char *op_type, const int op_type_len) +{ + int count = 0; + int ret = -ENOMEM; + int idx = -1; + int d_idx = -1; + unsigned char *locked_nodes = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + locked_nodes = alloca0(priv->child_count); + + idx = afr_index_for_transaction_type(type); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!local->pending) + goto out; + + local->pending[empty_index][idx] = hton32(1); + + if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION)) + local->pending[empty_index][d_idx] = hton32(1); + + local->xdata_req = dict_new(); + if (!local->xdata_req) + goto out; + + ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op", + SLEN("replicate-brick-op"), op_type, op_type_len); + if (ret) + goto out; + + local->xattr_req = dict_new(); + if (!local->xattr_req) + goto out; + + ret = afr_set_pending_dict(priv, local->xattr_req, local->pending); + if (ret < 0) + goto out; + + if (AFR_ENTRY_TRANSACTION == type) { + count = afr_selfheal_entrylk(frame, this, loc->inode, this->name, NULL, + locked_nodes); + } else { + count = afr_selfheal_inodelk(frame, this, loc->inode, this->name, + LLONG_MAX - 1, 0, locked_nodes); + } + + if (!count) { + gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS, + NULL); + ret = -EAGAIN; + goto unlock; + } + + ret = afr_emptyb_set_pending_changelog(frame, this, locked_nodes); + if (ret) + goto unlock; + ret = 0; unlock: - if (AFR_ENTRY_TRANSACTION == type) { - afr_selfheal_unentrylk (frame, this, loc->inode, this->name, - NULL, locked_nodes, NULL); - } else { - afr_selfheal_uninodelk (frame, this, loc->inode, this->name, - LLONG_MAX - 1, 0, locked_nodes); - } + if (AFR_ENTRY_TRANSACTION == type) { + afr_selfheal_unentrylk(frame, this, loc->inode, this->name, NULL, + locked_nodes, NULL); + } else { + afr_selfheal_uninodelk(frame, this, loc->inode, this->name, + LLONG_MAX - 1, 0, locked_nodes); + } out: - return ret; + return ret; } void -afr_brick_args_cleanup (void *opaque) -{ - afr_empty_brick_args_t *data = NULL; - - data = opaque; - loc_wipe (&data->loc); - GF_FREE (data); -} - -int -_afr_handle_empty_brick_cbk (int ret, call_frame_t *frame, void *opaque) -{ - afr_brick_args_cleanup (opaque); - return 0; -} - -int -_afr_handle_empty_brick (void *opaque) -{ - - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int empty_index = -1; - int ret = -1; - int op_errno = ENOMEM; - call_frame_t *frame = NULL; - xlator_t *this = NULL; - char *op_type = NULL; - afr_empty_brick_args_t *data = NULL; - - data = opaque; - frame = data->frame; - empty_index = data->empty_index; - op_type = data->op_type; - this = frame->this; - priv = this->private; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - loc_copy (&local->loc, &data->loc); - - gf_msg (this->name, GF_LOG_INFO, 0, 0, "New brick is : %s", - priv->children[empty_index]->name); - - ret = _afr_handle_empty_brick_type (this, frame, &local->loc, empty_index, - AFR_METADATA_TRANSACTION, op_type); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - - dict_unref (local->xdata_req); - dict_unref (local->xattr_req); - afr_matrix_cleanup (local->pending, priv->child_count); - local->pending = NULL; - local->xattr_req = NULL; - local->xdata_req = NULL; - - ret = _afr_handle_empty_brick_type (this, frame, &local->loc, empty_index, - AFR_ENTRY_TRANSACTION, op_type); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - ret = 0; +afr_brick_args_cleanup(void *opaque) +{ + afr_empty_brick_args_t *data = NULL; + + data = opaque; + loc_wipe(&data->loc); + GF_FREE(data); +} + +int +_afr_handle_empty_brick_cbk(int ret, call_frame_t *frame, void *opaque) +{ + afr_brick_args_cleanup(opaque); + return 0; +} + +int +_afr_handle_empty_brick(void *opaque) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int empty_index = -1; + int ret = -1; + int op_errno = ENOMEM; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + char *op_type = NULL; + int op_type_len = 0; + afr_empty_brick_args_t *data = NULL; + call_frame_t *op_frame = NULL; + + data = opaque; + frame = data->frame; + empty_index = data->empty_index; + if (!data->op_type) + goto out; + + op_frame = copy_frame(frame); + if (!op_frame) { + ret = -1; + op_errno = ENOMEM; + goto out; + } + + op_type = data->op_type; + op_type_len = strlen(op_type); + this = op_frame->this; + priv = this->private; + + afr_set_lk_owner(op_frame, this, op_frame->root); + local = AFR_FRAME_INIT(op_frame, op_errno); + if (!local) + goto out; + + loc_copy(&local->loc, &data->loc); + + gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s", + priv->children[empty_index]->name, NULL); + + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, + AFR_METADATA_TRANSACTION, op_type, + op_type_len); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + + dict_unref(local->xdata_req); + dict_unref(local->xattr_req); + afr_matrix_cleanup(local->pending, priv->child_count); + local->pending = NULL; + local->xattr_req = NULL; + local->xdata_req = NULL; + + ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index, + AFR_ENTRY_TRANSACTION, op_type, + op_type_len); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = 0; out: - AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); - return 0; + if (op_frame) { + AFR_STACK_DESTROY(op_frame); + } + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + return 0; +} + +int +afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc, + char *data) +{ + afr_local_t *local = NULL; + int ret = -1; + int op_errno = EINVAL; + + local = frame->local; + local->xdata_req = dict_new(); + + if (!local->xdata_req) { + op_errno = ENOMEM; + goto out; + } + + ret = dict_set_int32_sizen(local->xdata_req, "heal-op", + GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + ret = dict_set_str_sizen(local->xdata_req, "child-name", data); + if (ret) { + op_errno = -ret; + ret = -1; + goto out; + } + /* set spb choice to -1 whether heal succeeds or not: + * If heal succeeds : spb-choice should be set to -1 as + * it is no longer valid; file is not + * in split-brain anymore. + * If heal doesn't succeed: + * spb-choice should be set to -1 + * otherwise reads will be served + * from spb-choice which is misleading. + */ + ret = afr_inode_split_brain_choice_set(loc->inode, this, -1); + if (ret) + gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + NULL); + afr_heal_splitbrain_file(frame, this, loc); + ret = 0; +out: + if (ret < 0) + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + return 0; } - int -afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc, - char *data) +afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len) { - afr_local_t *local = NULL; - int ret = -1; - int op_errno = EINVAL; + int spb_child_index = -1; + char *spb_child_str = NULL; - local = frame->local; - local->xdata_req = dict_new (); + spb_child_str = alloca0(len + 1); + memcpy(spb_child_str, value, len); - if (!local->xdata_req) { - op_errno = ENOMEM; - goto out; - } + if (!strcmp(spb_child_str, "none")) + return -2; - ret = dict_set_int32 (local->xdata_req, "heal-op", - GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - ret = dict_set_str (local->xdata_req, "child-name", data); - if (ret) { - op_errno = -ret; - ret = -1; - goto out; - } - /* set spb choice to -1 whether heal succeeds or not: - * If heal succeeds : spb-choice should be set to -1 as - * it is no longer valid; file is not - * in split-brain anymore. - * If heal doesn't succeed: - * spb-choice should be set to -1 - * otherwise reads will be served - * from spb-choice which is misleading. - */ - ret = afr_inode_split_brain_choice_set (loc->inode, this, -1); - if (ret) - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, "Failed to set" - "split-brain choice to -1"); - afr_heal_splitbrain_file (frame, this, loc); - ret = 0; -out: - if (ret < 0) - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - return 0; + spb_child_index = afr_get_child_index_from_name(this, spb_child_str); + if (spb_child_index < 0) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL, + "subvol=%s", spb_child_str, NULL); + } + return spb_child_index; } int -afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len) +afr_can_set_split_brain_choice(void *opaque) { - int spb_child_index = -1; - char *spb_child_str = NULL; + afr_spbc_timeout_t *data = opaque; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + loc_t *loc = NULL; + int ret = -1; - spb_child_str = alloca0 (len + 1); - memcpy (spb_child_str, value, len); + frame = data->frame; + loc = data->loc; + this = frame->this; - if (!strcmp (spb_child_str, "none")) - return -2; + ret = afr_is_split_brain(frame, this, loc->inode, loc->gfid, &data->d_spb, + &data->m_spb); - spb_child_index = afr_get_child_index_from_name (this, - spb_child_str); - if (spb_child_index < 0) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_INVALID_SUBVOL, "Invalid subvol: %s", - spb_child_str); - } - return spb_child_index; + if (ret) + gf_smsg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s", + uuid_utoa(loc->gfid), NULL); + return ret; } int -afr_can_set_split_brain_choice (void *opaque) +afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) { - afr_spbc_timeout_t *data = opaque; - call_frame_t *frame = NULL; - xlator_t *this = NULL; - loc_t *loc = NULL; - int ret = -1; + void *choice_value = NULL; + void *resolve_value = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_spbc_timeout_t *data = NULL; + int len = 0; + int spb_child_index = -1; + int ret = -1; + int op_errno = EINVAL; - frame = data->frame; - loc = data->loc; - this = frame->this; + priv = this->private; - ret = afr_is_split_brain (frame, this, loc->inode, loc->gfid, - &data->d_spb, &data->m_spb); + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len); + ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value, + &len); + if (!choice_value && !resolve_value) { + ret = -1; + goto out; + } - if (ret) - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to determine if %s" - " is in split-brain. " - "Aborting split-brain-choice set.", - uuid_utoa (loc->gfid)); - return ret; -} + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) { + ret = 1; + goto out; + } -int -afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame, - loc_t *loc, dict_t *dict) -{ - void *value = NULL; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - afr_spbc_timeout_t *data = NULL; - int len = 0; - int spb_child_index = -1; - int ret = -1; - int op_errno = EINVAL; + local->op = GF_FOP_SETXATTR; - priv = this->private; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) { + if (choice_value) { + spb_child_index = afr_get_split_brain_child_index(this, choice_value, + len); + if (spb_child_index < 0) { + /* Case where value was "none" */ + if (spb_child_index == -2) + spb_child_index = -1; + else { ret = 1; + op_errno = EINVAL; goto out; + } } - local->op = GF_FOP_SETXATTR; - - ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value, - &len); - if (value) { - spb_child_index = afr_get_split_brain_child_index (this, value, - len); - if (spb_child_index < 0) { - /* Case where value was "none" */ - if (spb_child_index == -2) - spb_child_index = -1; - else { - ret = 1; - op_errno = EINVAL; - goto out; - } - } - - data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spbc_timeout_t); - if (!data) { - ret = 1; - goto out; - } - data->spb_child_index = spb_child_index; - data->frame = frame; - loc_copy (&local->loc, loc); - data->loc = &local->loc; - ret = synctask_new (this->ctx->env, - afr_can_set_split_brain_choice, - afr_set_split_brain_choice, NULL, data); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, - "Failed to create" - " synctask. Aborting split-brain choice set" - " for %s", loc->name); - ret = 1; - op_errno = ENOMEM; - goto out; - } - ret = 0; - goto out; + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spbc_timeout_t); + if (!data) { + ret = 1; + goto out; + } + data->spb_child_index = spb_child_index; + data->frame = frame; + loc_copy(&local->loc, loc); + data->loc = &local->loc; + ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice, + afr_set_split_brain_choice, NULL, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + "name=%s", loc->name, NULL); + ret = 1; + op_errno = ENOMEM; + goto out; } + ret = 0; + goto out; + } - ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_RESOLVE, &value, &len); - if (value) { - spb_child_index = afr_get_split_brain_child_index (this, value, - len); - if (spb_child_index < 0) { - ret = 1; - goto out; - } - - afr_split_brain_resolve_do (frame, this, loc, - priv->children[spb_child_index]->name); - ret = 0; + if (resolve_value) { + spb_child_index = afr_get_split_brain_child_index(this, resolve_value, + len); + if (spb_child_index < 0) { + ret = 1; + goto out; } + + afr_split_brain_resolve_do(frame, this, loc, + priv->children[spb_child_index]->name); + ret = 0; + } out: - /* key was correct but value was invalid when ret == 1 */ - if (ret == 1) { - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - if (data) - GF_FREE (data); - ret = 0; - } - return ret; + /* key was correct but value was invalid when ret == 1 */ + if (ret == 1) { + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + if (data) + GF_FREE(data); + ret = 0; + } + return ret; } int -afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame, - dict_t *dict) +afr_handle_spb_choice_timeout(xlator_t *this, call_frame_t *frame, dict_t *dict) { - int ret = -1; - int op_errno = 0; - uint64_t timeout = 0; - afr_private_t *priv = NULL; + int ret = -1; + int op_errno = 0; + uint64_t timeout = 0; + afr_private_t *priv = NULL; - priv = this->private; + priv = this->private; - ret = dict_get_uint64 (dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout); - if (!ret) { - priv->spb_choice_timeout = timeout * 60; - AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL); - } + ret = dict_get_uint64(dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout); + if (!ret) { + priv->spb_choice_timeout = timeout * 60; + AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL); + } - return ret; + return ret; } int -afr_handle_empty_brick (xlator_t *this, call_frame_t *frame, loc_t *loc, - dict_t *dict) +afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) { - int ret = -1; - int ab_ret = -1; - int empty_index = -1; - int op_errno = EPERM; - char *empty_brick = NULL; - char *op_type = NULL; - afr_empty_brick_args_t *data = NULL; + int ret = -1; + int ab_ret = -1; + int empty_index = -1; + int op_errno = EPERM; + char *empty_brick = NULL; + char *op_type = NULL; + afr_empty_brick_args_t *data = NULL; - ret = dict_get_str (dict, GF_AFR_REPLACE_BRICK, &empty_brick); - if (!ret) - op_type = GF_AFR_REPLACE_BRICK; + ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick); + if (!ret) + op_type = GF_AFR_REPLACE_BRICK; - ab_ret = dict_get_str (dict, GF_AFR_ADD_BRICK, &empty_brick); - if (!ab_ret) - op_type = GF_AFR_ADD_BRICK; + ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick); + if (!ab_ret) + op_type = GF_AFR_ADD_BRICK; - if (ret && ab_ret) - goto out; + if (ret && ab_ret) + goto out; - if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) { - gf_msg (this->name, GF_LOG_ERROR, EPERM, - afr_get_msg_id (op_type), - "'%s' is an internal extended attribute.", - op_type); - ret = 1; - goto out; + if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) { + gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR, + "op_type=%s", op_type, NULL); + ret = 1; + goto out; + } + empty_index = afr_get_child_index_from_name(this, empty_brick); + + if (empty_index < 0) { + /* Didn't belong to this replica pair + * Just do a no-op + */ + AFR_STACK_UNWIND(setxattr, frame, 0, 0, NULL); + return 0; + } else { + data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_empty_brick_t); + if (!data) { + ret = 1; + op_errno = ENOMEM; + goto out; } - empty_index = afr_get_child_index_from_name (this, empty_brick); - - if (empty_index < 0) { - /* Didn't belong to this replica pair - * Just do a no-op - */ - AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL); - return 0; - } else { - data = GF_CALLOC (1, sizeof (*data), - gf_afr_mt_empty_brick_t); - if (!data) { - ret = 1; - op_errno = ENOMEM; - goto out; - } - data->frame = frame; - loc_copy (&data->loc, loc); - data->empty_index = empty_index; - data->op_type = op_type; - ret = synctask_new (this->ctx->env, - _afr_handle_empty_brick, - _afr_handle_empty_brick_cbk, - NULL, data); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - afr_get_msg_id (op_type), - "Failed to create synctask."); - ret = 1; - op_errno = ENOMEM; - afr_brick_args_cleanup (data); - goto out; - } + data->frame = frame; + loc_copy(&data->loc, loc); + data->empty_index = empty_index; + data->op_type = op_type; + ret = synctask_new(this->ctx->env, _afr_handle_empty_brick, + _afr_handle_empty_brick_cbk, NULL, data); + if (ret) { + gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS, + NULL); + ret = 1; + op_errno = ENOMEM; + afr_brick_args_cleanup(data); + goto out; } - ret = 0; + } + ret = 0; out: - if (ret == 1) { - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); - ret = 0; - } - return ret; + if (ret == 1) { + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); + ret = 0; + } + return ret; } static int -afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc, - dict_t *dict) +afr_handle_special_xattr(xlator_t *this, call_frame_t *frame, loc_t *loc, + dict_t *dict) { - int ret = -1; + int ret = -1; - ret = afr_handle_split_brain_commands (this, frame, loc, dict); - if (ret == 0) - goto out; + ret = afr_handle_split_brain_commands(this, frame, loc, dict); + if (ret == 0) + goto out; - ret = afr_handle_spb_choice_timeout (this, frame, dict); - if (ret == 0) - goto out; + ret = afr_handle_spb_choice_timeout(this, frame, dict); + if (ret == 0) + goto out; - /* Applicable for replace-brick and add-brick commands */ - ret = afr_handle_empty_brick (this, frame, loc, dict); + /* Applicable for replace-brick and add-brick commands */ + ret = afr_handle_empty_brick(this, frame, loc, dict); out: - return ret; + return ret; } int -afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, - int32_t flags, dict_t *xdata) +afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = EINVAL; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = EINVAL; - GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, - op_errno, out); + GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out); - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, - op_errno, out); + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); - ret = afr_handle_special_xattr (this, frame, loc, dict); - if (ret == 0) - return 0; + ret = afr_handle_special_xattr(this, frame, loc, dict); + if (ret == 0) + return 0; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.setxattr.dict = dict_ref (dict); - local->cont.setxattr.flags = flags; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + local->cont.setxattr.dict = dict_ref(dict); + local->cont.setxattr.flags = flags; + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->transaction.wind = afr_setxattr_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_setxattr_unwind; + local->transaction.wind = afr_setxattr_wind; + local->transaction.unwind = afr_setxattr_unwind; - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - local->op = GF_FOP_SETXATTR; + local->op = GF_FOP_SETXATTR; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL); - return 0; + return 0; } /* {{{ fsetxattr */ - int -afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fsetxattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno, - local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fsetxattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} int -afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_fsetxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - NULL, NULL, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); } - int -afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_fsetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->fsetxattr, - local->fd, local->cont.fsetxattr.dict, - local->cont.fsetxattr.flags, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_fsetxattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsetxattr, local->fd, + local->cont.fsetxattr.dict, local->cont.fsetxattr.flags, + local->xdata_req); + return 0; } - int -afr_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata) +afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict, - op_errno, out); + GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out); - GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict, - op_errno, out); + GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out); - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.fsetxattr.dict = dict_ref (dict); - local->cont.fsetxattr.flags = flags; + local->cont.fsetxattr.dict = dict_ref(dict); + local->cont.fsetxattr.flags = flags; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->transaction.wind = afr_fsetxattr_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_fsetxattr_unwind; + local->transaction.wind = afr_fsetxattr_wind; + local->transaction.unwind = afr_fsetxattr_unwind; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->op = GF_FOP_FSETXATTR; + local->op = GF_FOP_FSETXATTR; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); - return 0; + AFR_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL); + return 0; } /* }}} */ - /* {{{ removexattr */ - int -afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +afr_removexattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; - - AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno, - local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(removexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} int -afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_removexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - NULL, NULL, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); } - int -afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_removexattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->removexattr, - &local->loc, local->cont.removexattr.name, - local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_removexattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->removexattr, &local->loc, + local->cont.removexattr.name, local->xdata_req); + return 0; } - int -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata) +afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", - name, op_errno, out); + GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out); - GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", - name, op_errno, out); + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.removexattr.name = gf_strdup (name); + local->cont.removexattr.name = gf_strdup(name); - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->transaction.wind = afr_removexattr_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_removexattr_unwind; + local->transaction.wind = afr_removexattr_wind; + local->transaction.unwind = afr_removexattr_unwind; - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->op = GF_FOP_REMOVEXATTR; + local->op = GF_FOP_REMOVEXATTR; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); - return 0; + AFR_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL); + return 0; } /* ffremovexattr */ int -afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this) +afr_fremovexattr_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno, - local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fremovexattr, main_frame, local->op_ret, local->op_errno, + local->xdata_rsp); + return 0; +} int -afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +afr_fremovexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - NULL, NULL, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, NULL, xdata); } - int -afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_fremovexattr_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->fremovexattr, - local->fd, local->cont.removexattr.name, - local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_fremovexattr_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fremovexattr, local->fd, + local->cont.removexattr.name, local->xdata_req); + return 0; } - int -afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, - const char *name, dict_t *xdata) +afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*", - name, op_errno, out); + GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out); - GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*", - name, op_errno, out); + GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out); - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.removexattr.name = gf_strdup (name); - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + local->cont.removexattr.name = gf_strdup(name); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->transaction.wind = afr_fremovexattr_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_fremovexattr_unwind; + local->transaction.wind = afr_fremovexattr_wind; + local->transaction.unwind = afr_fremovexattr_unwind; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->op = GF_FOP_FREMOVEXATTR; + local->op = GF_FOP_FREMOVEXATTR; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); + AFR_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL); - return 0; + return 0; } - int -afr_fallocate_unwind (call_frame_t *frame, xlator_t *this) +afr_fallocate_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - local = frame->local; + local = frame->local; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; - - AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(fallocate, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +afr_fallocate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_fallocate_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->fallocate, - local->fd, local->cont.fallocate.mode, - local->cont.fallocate.offset, - local->cont.fallocate.len, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_fallocate_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fallocate, local->fd, + local->cont.fallocate.mode, local->cont.fallocate.offset, + local->cont.fallocate.len, local->xdata_req); + return 0; } - int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata) +afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) { - call_frame_t *transaction_frame = NULL; - afr_local_t *local = NULL; - int ret = -1; - int op_errno = ENOMEM; + call_frame_t *transaction_frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.fallocate.mode = mode; - local->cont.fallocate.offset = offset; - local->cont.fallocate.len = len; + local->cont.fallocate.mode = mode; + local->cont.fallocate.offset = offset; + local->cont.fallocate.len = len; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->op = GF_FOP_FALLOCATE; + local->op = GF_FOP_FALLOCATE; - local->transaction.wind = afr_fallocate_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_fallocate_unwind; + local->transaction.wind = afr_fallocate_wind; + local->transaction.unwind = afr_fallocate_unwind; - local->transaction.main_frame = frame; + local->transaction.main_frame = frame; - local->transaction.start = local->cont.fallocate.offset; - local->transaction.len = 0; + local->transaction.start = local->cont.fallocate.offset; + local->transaction.len = 0; - afr_fix_open (fd, this); + afr_fix_open(fd, this); - ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - /* }}} */ /* {{{ discard */ int -afr_discard_unwind (call_frame_t *frame, xlator_t *this) +afr_discard_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +afr_discard_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_discard_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->discard, - local->fd, local->cont.discard.offset, - local->cont.discard.len, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_discard_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->discard, local->fd, + local->cont.discard.offset, local->cont.discard.len, + local->xdata_req); + return 0; } - int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata) +afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.discard.offset = offset; - local->cont.discard.len = len; + local->cont.discard.offset = offset; + local->cont.discard.len = len; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->op = GF_FOP_DISCARD; + local->op = GF_FOP_DISCARD; - local->transaction.wind = afr_discard_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_discard_unwind; + local->transaction.wind = afr_discard_wind; + local->transaction.unwind = afr_discard_unwind; - local->transaction.main_frame = frame; + local->transaction.main_frame = frame; - local->transaction.start = local->cont.discard.offset; - local->transaction.len = 0; + local->transaction.start = local->cont.discard.offset; + local->transaction.len = 0; - afr_fix_open (fd, this); + afr_fix_open(fd, this); - ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } - /* {{{ zerofill */ int -afr_zerofill_unwind (call_frame_t *frame, xlator_t *this) +afr_zerofill_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t * local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno, - &local->cont.inode_wfop.prebuf, - &local->cont.inode_wfop.postbuf, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; -} + AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + return 0; +} int -afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - prebuf, postbuf, NULL, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); } - int -afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_zerofill_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->zerofill, - local->fd, local->cont.zerofill.offset, - local->cont.zerofill.len, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_zerofill_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->zerofill, local->fd, + local->cont.zerofill.offset, local->cont.zerofill.len, + local->xdata_req); + return 0; } int -afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, +afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, size_t len, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.zerofill.offset = offset; - local->cont.zerofill.len = len; + local->cont.zerofill.offset = offset; + local->cont.zerofill.len = len; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - if (xdata) - local->xdata_req = dict_copy_with_ref (xdata, NULL); - else - local->xdata_req = dict_new (); + if (xdata) + local->xdata_req = dict_copy_with_ref(xdata, NULL); + else + local->xdata_req = dict_new(); - if (!local->xdata_req) - goto out; + if (!local->xdata_req) + goto out; - local->op = GF_FOP_ZEROFILL; + local->op = GF_FOP_ZEROFILL; - local->transaction.wind = afr_zerofill_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_zerofill_unwind; + local->transaction.wind = afr_zerofill_wind; + local->transaction.unwind = afr_zerofill_unwind; - local->transaction.main_frame = frame; + local->transaction.main_frame = frame; - local->transaction.start = local->cont.discard.offset; - local->transaction.len = len; + local->transaction.start = local->cont.zerofill.offset; + local->transaction.len = len; - afr_fix_open (fd, this); + afr_fix_open(fd, this); - ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL); - return 0; + AFR_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL); + return 0; } /* }}} */ int32_t -afr_xattrop_wind_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) +afr_xattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - NULL, NULL, xattr, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, xattr, xdata); } int -afr_xattrop_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_xattrop_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_xattrop_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->xattrop, - &local->loc, local->cont.xattrop.optype, - local->cont.xattrop.xattr, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_xattrop_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->xattrop, &local->loc, + local->cont.xattrop.optype, local->cont.xattrop.xattr, + local->xdata_req); + return 0; } int -afr_xattrop_unwind (call_frame_t *frame, xlator_t *this) +afr_xattrop_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (xattrop, main_frame, local->op_ret, local->op_errno, - local->xattr_rsp, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(xattrop, main_frame, local->op_ret, local->op_errno, + local->xattr_rsp, local->xdata_rsp); + return 0; } int32_t -afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.xattrop.xattr = dict_ref (xattr); - local->cont.xattrop.optype = optype; - if (xdata) - local->xdata_req = dict_ref (xdata); + local->cont.xattrop.xattr = dict_ref(xattr); + local->cont.xattrop.optype = optype; + if (xdata) + local->xdata_req = dict_ref(xdata); - local->transaction.wind = afr_xattrop_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_xattrop_unwind; + local->transaction.wind = afr_xattrop_wind; + local->transaction.unwind = afr_xattrop_unwind; - loc_copy (&local->loc, loc); - local->inode = inode_ref (loc->inode); + loc_copy(&local->loc, loc); + ret = afr_set_inode_local(this, local, loc->inode); + if (ret) + goto out; - local->op = GF_FOP_XATTROP; + local->op = GF_FOP_XATTROP; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); - AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL); - return 0; + AFR_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL); + return 0; } int32_t -afr_fxattrop_wind_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - dict_t *xattr, dict_t *xdata) +afr_fxattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr, + dict_t *xdata) { - return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, - NULL, NULL, xattr, xdata); + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL, + NULL, xattr, xdata); } int -afr_fxattrop_wind (call_frame_t *frame, xlator_t *this, int subvol) +afr_fxattrop_wind(call_frame_t *frame, xlator_t *this, int subvol) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - STACK_WIND_COOKIE (frame, afr_fxattrop_wind_cbk, (void *) (long) subvol, - priv->children[subvol], - priv->children[subvol]->fops->fxattrop, - local->fd, local->cont.xattrop.optype, - local->cont.xattrop.xattr, local->xdata_req); - return 0; + STACK_WIND_COOKIE(frame, afr_fxattrop_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fxattrop, local->fd, + local->cont.xattrop.optype, local->cont.xattrop.xattr, + local->xdata_req); + return 0; } int -afr_fxattrop_unwind (call_frame_t *frame, xlator_t *this) +afr_fxattrop_unwind(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - call_frame_t *main_frame = NULL; - - local = frame->local; + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - main_frame = afr_transaction_detach_fop_frame (frame); - if (!main_frame) - return 0; + local = frame->local; - AFR_STACK_UNWIND (fxattrop, main_frame, local->op_ret, local->op_errno, - local->xattr_rsp, local->xdata_rsp); + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(fxattrop, main_frame, local->op_ret, local->op_errno, + local->xattr_rsp, local->xdata_rsp); + return 0; } int32_t -afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) +afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; - call_frame_t *transaction_frame = NULL; - int ret = -1; - int op_errno = ENOMEM; + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int op_errno = ENOMEM; - transaction_frame = copy_frame (frame); - if (!transaction_frame) - goto out; + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; - local = AFR_FRAME_INIT (transaction_frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; - local->cont.xattrop.xattr = dict_ref (xattr); - local->cont.xattrop.optype = optype; - if (xdata) - local->xdata_req = dict_ref (xdata); + local->cont.xattrop.xattr = dict_ref(xattr); + local->cont.xattrop.optype = optype; + if (xdata) + local->xdata_req = dict_ref(xdata); - local->transaction.wind = afr_fxattrop_wind; - local->transaction.fop = __afr_txn_write_fop; - local->transaction.done = __afr_txn_write_done; - local->transaction.unwind = afr_fxattrop_unwind; + local->transaction.wind = afr_fxattrop_wind; + local->transaction.unwind = afr_fxattrop_unwind; - local->fd = fd_ref (fd); - local->inode = inode_ref (fd->inode); + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; - local->op = GF_FOP_FXATTROP; + local->op = GF_FOP_FXATTROP; - local->transaction.main_frame = frame; - local->transaction.start = LLONG_MAX - 1; - local->transaction.len = 0; + local->transaction.main_frame = frame; + local->transaction.start = LLONG_MAX - 1; + local->transaction.len = 0; - ret = afr_transaction (transaction_frame, this, - AFR_METADATA_TRANSACTION); - if (ret < 0) { - op_errno = -ret; - goto out; - } + ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } - return 0; + return 0; out: - if (transaction_frame) - AFR_STACK_DESTROY (transaction_frame); + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +afr_fsync_unwind(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + call_frame_t *main_frame = NULL; - AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + local = frame->local; + + main_frame = afr_transaction_detach_fop_frame(frame); + if (!main_frame) return 0; + + AFR_STACK_UNWIND(fsync, main_frame, local->op_ret, local->op_errno, + &local->cont.inode_wfop.prebuf, + &local->cont.inode_wfop.postbuf, local->xdata_rsp); + + return 0; +} + +int +afr_fsync_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf, + postbuf, NULL, xdata); +} + +int +afr_fsync_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + STACK_WIND_COOKIE(frame, afr_fsync_wind_cbk, (void *)(long)subvol, + priv->children[subvol], + priv->children[subvol]->fops->fsync, local->fd, + local->cont.fsync.datasync, local->xdata_req); + return 0; +} + +int +afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) +{ + afr_local_t *local = NULL; + call_frame_t *transaction_frame = NULL; + int ret = -1; + int32_t op_errno = ENOMEM; + int8_t last_fsync = 0; + + AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out); + transaction_frame = copy_frame(frame); + if (!transaction_frame) + goto out; + + local = AFR_FRAME_INIT(transaction_frame, op_errno); + if (!local) + goto out; + + if (xdata) { + local->xdata_req = dict_copy_with_ref(xdata, NULL); + if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) { + if (last_fsync) { + local->transaction.disable_delayed_post_op = _gf_true; + } + } + } else { + local->xdata_req = dict_new(); + } + + if (!local->xdata_req) + goto out; + + local->fd = fd_ref(fd); + ret = afr_set_inode_local(this, local, fd->inode); + if (ret) + goto out; + + local->op = GF_FOP_FSYNC; + local->cont.fsync.datasync = datasync; + + if (afr_fd_has_witnessed_unstable_write(this, fd->inode)) { + /* don't care. we only wanted to CLEAR the bit */ + } + + local->transaction.wind = afr_fsync_wind; + local->transaction.unwind = afr_fsync_unwind; + + local->transaction.main_frame = frame; + + ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + return 0; +out: + if (transaction_frame) + AFR_STACK_DESTROY(transaction_frame); + + AFR_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL); + + return 0; } diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h index e174cc2d610..a787069b7a1 100644 --- a/xlators/cluster/afr/src/afr-inode-write.h +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -12,79 +12,83 @@ #define __INODE_WRITE_H__ int32_t -afr_chmod (call_frame_t *frame, xlator_t *this, - loc_t *loc, mode_t mode, dict_t *xdata); +afr_chmod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dict_t *xdata); int32_t -afr_chown (call_frame_t *frame, xlator_t *this, - loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata); +afr_chown(call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, gid_t gid, + dict_t *xdata); int -afr_fchown (call_frame_t *frame, xlator_t *this, - fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata); +afr_fchown(call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid, + dict_t *xdata); int32_t -afr_fchmod (call_frame_t *frame, xlator_t *this, - fd_t *fd, mode_t mode, dict_t *xdata); +afr_fchmod(call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode, + dict_t *xdata); int32_t -afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, - struct iovec *vector, int32_t count, off_t offset, - uint32_t flags, struct iobref *iobref, dict_t *xdata); +afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, + int32_t count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata); int32_t -afr_truncate (call_frame_t *frame, xlator_t *this, - loc_t *loc, off_t offset, dict_t *xdata); +afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + dict_t *xdata); int32_t -afr_ftruncate (call_frame_t *frame, xlator_t *this, - fd_t *fd, off_t offset, dict_t *xdata); +afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata); int32_t -afr_utimens (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct timespec tv[2], dict_t *xdata); +afr_utimens(call_frame_t *frame, xlator_t *this, loc_t *loc, + struct timespec tv[2], dict_t *xdata); int -afr_setattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata); +afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + int32_t valid, dict_t *xdata); int -afr_fsetattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata); +afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf, + int32_t valid, dict_t *xdata); int32_t -afr_setxattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata); +afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata); int32_t -afr_fsetxattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata); +afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + int32_t flags, dict_t *xdata); int32_t -afr_removexattr (call_frame_t *frame, xlator_t *this, - loc_t *loc, const char *name, dict_t *xdata); +afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *name, dict_t *xdata); int32_t -afr_fremovexattr (call_frame_t *frame, xlator_t *this, - fd_t *fd, const char *name, dict_t *xdata); +afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *name, dict_t *xdata); int -afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, - size_t len, dict_t *xdata); +afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata); int -afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, - off_t offset, size_t len, dict_t *xdata); +afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata); int afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, off_t len, dict_t *xdata); int32_t -afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); +afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); int32_t -afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, - gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); +afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); + +int +afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata); #endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c index 1f2a11755bf..bc8eabe0f43 100644 --- a/xlators/cluster/afr/src/afr-lk-common.c +++ b/xlators/cluster/afr/src/afr-lk-common.c @@ -8,9 +8,9 @@ cases as published by the Free Software Foundation. */ -#include "dict.h" -#include "byte-order.h" -#include "common-utils.h" +#include <glusterfs/dict.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/common-utils.h> #include "afr.h" #include "afr-transaction.h" @@ -18,1755 +18,774 @@ #include <signal.h> - -#define LOCKED_NO 0x0 /* no lock held */ -#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ -#define LOCKED_LOWER 0x2 /* for lower path */ - -#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->inodelk_trace) \ - break; \ - afr_trace_inodelk_in (frame, this, params); \ - } while (0); - -#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->inodelk_trace) \ - break; \ - afr_trace_inodelk_out (frame, this, params); \ - } while (0); - -#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->entrylk_trace) \ - break; \ - afr_trace_entrylk_in (frame, this, params); \ - } while (0); - -#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \ - do { \ - afr_private_t *_priv = this->private; \ - if (!_priv->entrylk_trace) \ - break; \ - afr_trace_entrylk_out (frame, this, params); \ - } while (0); - -int -afr_entry_lockee_cmp (const void *l1, const void *l2) -{ - const afr_entry_lockee_t *r1 = l1; - const afr_entry_lockee_t *r2 = l2; - int ret = 0; - uuid_t gfid1 = {0}; - uuid_t gfid2 = {0}; - - loc_gfid ((loc_t*)&r1->loc, gfid1); - loc_gfid ((loc_t*)&r2->loc, gfid2); - ret = gf_uuid_compare (gfid1, gfid2); - /*Entrylks with NULL basename are the 'smallest'*/ - if (ret == 0) { - if (!r1->basename) - return -1; - if (!r2->basename) - return 1; - ret = strcmp (r1->basename, r2->basename); - } - - if (ret <= 0) - return -1; - else - return 1; -} - -int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); - -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); - -static uint64_t afr_lock_number = 1; - -static uint64_t -get_afr_lock_number () -{ - return (++afr_lock_number); -} - -int -afr_set_lock_number (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->lock_number = get_afr_lock_number (); - - return 0; -} +#define LOCKED_NO 0x0 /* no lock held */ +#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ +#define LOCKED_LOWER 0x2 /* for lower path */ void -afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) +afr_lockee_cleanup(afr_lockee_t *lockee) { - gf_msg_trace (this->name, 0, - "Setting lk-owner=%llu", - (unsigned long long) (unsigned long)lk_owner); + if (lockee->fd) { + fd_unref(lockee->fd); + lockee->fd = NULL; + } else { + loc_wipe(&lockee->loc); + } - set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); -} + GF_FREE(lockee->basename); + lockee->basename = NULL; + GF_FREE(lockee->locked_nodes); + lockee->locked_nodes = NULL; -static int -is_afr_lock_selfheal (afr_local_t *local) -{ - afr_internal_lock_t *int_lock = NULL; - int ret = -1; - - int_lock = &local->internal_lock; - - switch (int_lock->selfheal_lk_type) { - case AFR_DATA_SELF_HEAL_LK: - case AFR_METADATA_SELF_HEAL_LK: - ret = 1; - break; - case AFR_ENTRY_SELF_HEAL_LK: - ret = 0; - break; - } - - return ret; - -} - -int32_t -internal_lock_count (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int32_t call_count = 0; - int i = 0; - - local = frame->local; - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) - ++call_count; - } - - return call_count; -} - -static void -afr_print_inodelk (char *str, int size, int cmd, - struct gf_flock *flock, gf_lkowner_t *owner) -{ - char *cmd_str = NULL; - char *type_str = NULL; - - switch (cmd) { -#if F_GETLK != F_GETLK64 - case F_GETLK64: -#endif - case F_GETLK: - cmd_str = "GETLK"; - break; - -#if F_SETLK != F_SETLK64 - case F_SETLK64: -#endif - case F_SETLK: - cmd_str = "SETLK"; - break; - -#if F_SETLKW != F_SETLKW64 - case F_SETLKW64: -#endif - case F_SETLKW: - cmd_str = "SETLKW"; - break; - - default: - cmd_str = "<null>"; - break; - } - - switch (flock->l_type) { - case F_RDLCK: - type_str = "READ"; - break; - case F_WRLCK: - type_str = "WRITE"; - break; - case F_UNLCK: - type_str = "UNLOCK"; - break; - default: - type_str = "UNKNOWN"; - break; - } - - snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " - "start=%llu, len=%llu, pid=%llu, lk-owner=%s", - cmd_str, type_str, (unsigned long long) flock->l_start, - (unsigned long long) flock->l_len, - (unsigned long long) flock->l_pid, - lkowner_utoa (owner)); - -} - -static void -afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, - int child_index) -{ - snprintf (str, size, "path=%s, fd=%p, child=%d", - loc->path ? loc->path : "<nul>", - fd ? fd : NULL, - child_index); + return; } void -afr_print_entrylk (char *str, int size, const char *basename, - gf_lkowner_t *owner) -{ - snprintf (str, size, "Basename=%s, lk-owner=%s", - basename ? basename : "<nul>", - lkowner_utoa (owner)); -} - -static void -afr_print_verdict (int op_ret, int op_errno, char *str) +afr_lockees_cleanup(afr_internal_lock_t *int_lock) { - if (op_ret < 0) { - if (op_errno == EAGAIN) - strcpy (str, "EAGAIN"); - else - strcpy (str, "FAILED"); - } - else - strcpy (str, "GRANTED"); -} + int i = 0; -static void -afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, - char *lock_call_type_str, - afr_internal_lock_t *int_lock) -{ - switch (lock_call_type) { - case AFR_INODELK_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL"); - break; - case AFR_INODELK_NB_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL"); - break; - case AFR_ENTRYLK_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL"); - break; - case AFR_ENTRYLK_NB_TRANSACTION: - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) - strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION"); - else - strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL"); - break; - default: - strcpy (lock_call_type_str, "UNKNOWN"); - break; - } + for (i = 0; i < int_lock->lockee_count; i++) { + afr_lockee_cleanup(&int_lock->lockee[i]); + } + return; } - -static void -afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, struct gf_flock *flock, - int op_ret, int op_errno, int32_t child_index) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - - char lockee[256]; - char lock_call_type_str[256]; - char verdict[16]; - - local = frame->local; - int_lock = &local->internal_lock; - - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - afr_print_verdict (op_ret, op_errno, verdict); - - gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, - "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, lkowner_utoa (&frame->root->lk_owner), lockee, - (unsigned long long) int_lock->lock_number); - -} - -static void -afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, struct gf_flock *flock, - int32_t cmd, int32_t child_index) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - - local = frame->local; - int_lock = &local->internal_lock; - - afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); - afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", - lock, lockee, - (unsigned long long) int_lock->lock_number); - +int +afr_entry_lockee_cmp(const void *l1, const void *l2) +{ + const afr_lockee_t *r1 = l1; + const afr_lockee_t *r2 = l2; + int ret = 0; + uuid_t gfid1 = {0}; + uuid_t gfid2 = {0}; + + loc_gfid((loc_t *)&r1->loc, gfid1); + loc_gfid((loc_t *)&r2->loc, gfid2); + ret = gf_uuid_compare(gfid1, gfid2); + /*Entrylks with NULL basename are the 'smallest'*/ + if (ret == 0) { + if (!r1->basename) + return -1; + if (!r2->basename) + return 1; + ret = strcmp(r1->basename, r2->basename); + } + + if (ret <= 0) + return -1; + else + return 1; } -static void -afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, - int32_t cookie) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - int child_index = 0; - int lockee_no = 0; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->entrylk_trace) { - return; - } - lockee_no = cookie / priv->child_count; - child_index = cookie % priv->child_count; - - afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); - afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, - child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, - "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", - lock, lockee, - (unsigned long long) int_lock->lock_number, - cookie); -} +int +afr_lock_blocking(call_frame_t *frame, xlator_t *this, int child_index); -static void -afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, - afr_lock_call_type_t lock_call_type, - afr_lock_op_type_t lk_op_type, const char *basename, - int op_ret, int op_errno, int32_t cookie) +void +afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int lockee_no = 0; - int child_index = 0; - - char lock[256]; - char lockee[256]; - char lock_call_type_str[256]; - char verdict[16]; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - if (!priv->entrylk_trace) { - return; - } - lockee_no = cookie / priv->child_count; - child_index = cookie % priv->child_count; - - afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); - afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, - child_index); - - afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); - - afr_print_verdict (op_ret, op_errno, verdict); - - gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, - "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", - lock_call_type_str, - lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY", - verdict, - lock, lockee, - (unsigned long long) int_lock->lock_number, - cookie); + gf_msg_trace(this->name, 0, "Setting lk-owner=%llu", + (unsigned long long)(unsigned long)lk_owner); + set_lk_owner_from_ptr(&frame->root->lk_owner, lk_owner); } -static int -transaction_lk_op (afr_local_t *local) +int32_t +internal_lock_count(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - int ret = -1; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int32_t call_count = 0; + int i = 0; - int_lock = &local->internal_lock; + local = frame->local; + priv = this->private; - if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) { - gf_msg_debug (THIS->name, 0, - "lk op is for a transaction"); - ret = 1; - } - else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) { - gf_msg_debug (THIS->name, 0, - "lk op is for a self heal"); - - ret = 0; - } - - if (ret == -1) - gf_msg_debug (THIS->name, 0, - "lk op is not set"); - - return ret; + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) + ++call_count; + } + return call_count; } int -afr_is_inodelk_transaction(afr_local_t *local) +afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename, + int child_count) { - int ret = 0; + int ret = -ENOMEM; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count]; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - ret = 1; - break; + GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX); + loc_copy(&lockee->loc, loc); + lockee->basename = (basename) ? gf_strdup(basename) : NULL; + if (basename && !lockee->basename) + goto out; - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - ret = 0; - break; + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes), + gf_afr_mt_afr_node_character); - } + if (!lockee->locked_nodes) + goto out; - return ret; + ret = 0; + int_lock->lockee_count++; +out: + if (ret) { + afr_lockee_cleanup(lockee); + } + return ret; } int -afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local, - loc_t *loc, char *basename, int child_count) +afr_add_inode_lockee(afr_local_t *local, int child_count) { - int ret = -1; + int ret = -ENOMEM; + afr_internal_lock_t *int_lock = &local->internal_lock; + afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count]; - loc_copy (&lockee->loc, loc); - lockee->basename = (basename)? gf_strdup (basename): NULL; - if (basename && !lockee->basename) - goto out; + if (local->fd) { + lockee->fd = fd_ref(local->fd); + } else { + loc_copy(&lockee->loc, &local->loc); + } - lockee->locked_count = 0; - lockee->locked_nodes = GF_CALLOC (child_count, - sizeof (*lockee->locked_nodes), - gf_afr_mt_afr_node_character); + lockee->locked_count = 0; + lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes), + gf_afr_mt_afr_node_character); - if (!lockee->locked_nodes) - goto out; + if (!lockee->locked_nodes) + goto out; - ret = 0; + ret = 0; + int_lock->lockee_count++; out: - return ret; - -} - -void -afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock) -{ - int i = 0; - - for (i = 0; i < int_lock->lockee_count; i++) { - loc_wipe (&int_lock->lockee[i].loc); - if (int_lock->lockee[i].basename) - GF_FREE (int_lock->lockee[i].basename); - if (int_lock->lockee[i].locked_nodes) - GF_FREE (int_lock->lockee[i].locked_nodes); - } - - return; + if (ret) { + afr_lockee_cleanup(lockee); + } + return ret; } static int -initialize_entrylk_variables (call_frame_t *frame, xlator_t *this) +initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - - int i = 0; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - int_lock->entrylk_lock_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; - - for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { - if (!int_lock->lockee[i].locked_nodes) - break; - int_lock->lockee[i].locked_count = 0; - memset (int_lock->lockee[i].locked_nodes, 0, - sizeof (*int_lock->lockee[i].locked_nodes) * - priv->child_count); - } + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; - return 0; -} + int i = 0; -static int -initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - afr_private_t *priv = NULL; - afr_inodelk_t *inodelk = NULL; + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + int_lock->lock_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + int_lock->lk_attempted_count = 0; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); + for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) { + if (!int_lock->lockee[i].locked_nodes) + break; + int_lock->lockee[i].locked_count = 0; + memset(int_lock->lockee[i].locked_nodes, 0, + sizeof(*int_lock->lockee[i].locked_nodes) * priv->child_count); + } - inodelk->lock_count = 0; - int_lock->lk_attempted_count = 0; - int_lock->lock_op_ret = -1; - int_lock->lock_op_errno = 0; - - memset (inodelk->locked_nodes, 0, - sizeof (*inodelk->locked_nodes) * priv->child_count); - memset (int_lock->locked_nodes, 0, - sizeof (*int_lock->locked_nodes) * priv->child_count); - - return 0; + return 0; } int -afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock) +afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock) { - int call_count = 0; - int i = 0; + int call_count = 0; + int i = 0; - for (i = 0; i < int_lock->lockee_count; i++) - call_count += int_lock->lockee[i].locked_count; + for (i = 0; i < int_lock->lockee_count; i++) + call_count += int_lock->lockee[i].locked_count; - return call_count; + return call_count; } int -afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) - -{ - int i = 0; - int call_count = 0; +afr_locked_nodes_count(unsigned char *locked_nodes, int child_count) - for (i = 0; i < child_count; i++) { - if (locked_nodes[i] & LOCKED_YES) - call_count++; - } - - return call_count; -} - -/* FIXME: What if UNLOCK fails */ -static int32_t -afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - int call_count = 0; - - local = frame->local; - int_lock = &local->internal_lock; + int i = 0; + int call_count = 0; - LOCK (&frame->lock); - { - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - gf_msg_trace (this->name, 0, - "All internal locks unlocked"); - - int_lock->lock_cbk (frame, this); - } + for (i = 0; i < child_count; i++) { + if (locked_nodes[i] & LOCKED_YES) + call_count++; + } - return 0; + return call_count; } -void -afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, - int32_t child_index) +static void +afr_log_locks_failure(call_frame_t *frame, char *where, char *what, + int op_errno) { - afr_inodelk_t *inodelk = NULL; + xlator_t *this = frame->this; + gf_lkowner_t *lk_owner = &frame->root->lk_owner; + afr_local_t *local = frame->local; + const char *fop = NULL; + char *gfid = NULL; + const char *name = NULL; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - inodelk->locked_nodes[child_index] &= LOCKED_NO; - if (local->transaction.eager_lock) - local->transaction.eager_lock[child_index] = 0; + fop = gf_fop_list[local->op]; + switch (local->transaction.type) { + case AFR_ENTRY_RENAME_TRANSACTION: + case AFR_ENTRY_TRANSACTION: + switch (local->op) { + case GF_FOP_LINK: + gfid = uuid_utoa(local->newloc.pargfid); + name = local->newloc.name; + break; + default: + gfid = uuid_utoa(local->loc.pargfid); + name = local->loc.name; + break; + } + gf_msg(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_INTERNAL_LKS_FAILED, + "Unable to do entry %s with lk-owner:%s on %s " + "while attempting %s on {pgfid:%s, name:%s}.", + what, lkowner_utoa(lk_owner), where, fop, gfid, name); + break; + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + gfid = uuid_utoa(local->inode->gfid); + gf_msg(this->name, GF_LOG_WARNING, op_errno, + AFR_MSG_INTERNAL_LKS_FAILED, + "Unable to do inode %s with lk-owner:%s on %s " + "while attempting %s on gfid:%s.", + what, lkowner_utoa(lk_owner), where, fop, gfid); + break; + } } static int32_t -afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_internal_lock_t *int_lock = NULL; - int32_t child_index = (long)cookie; - afr_private_t *priv = NULL; - - local = frame->local; - int_lock = &local->internal_lock; - - AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, NULL, op_ret, - op_errno, child_index); - - priv = this->private; - - if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { - gf_msg (this->name, GF_LOG_ERROR, op_errno, - AFR_MSG_UNLOCK_FAIL, - "path=%s gfid=%s: unlock failed on subvolume %s " - "with lock owner %s", local->loc.path, - loc_gfid_utoa (&(local->loc)), - priv->children[child_index]->name, - lkowner_utoa (&frame->root->lk_owner)); - } - - afr_update_uninodelk (local, int_lock, child_index); - - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata); - - return 0; - -} - -static int -afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) +afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = NULL; - int call_count = 0; - int i = 0; - int piggyback = 0; - afr_fd_ctx_t *fd_ctx = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int lockee_num = 0; + int call_count = 0; + int child_index = 0; + int ret = 0; + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + lockee_num = (int)((long)cookie) / priv->child_count; + child_index = (int)((long)cookie) % priv->child_count; - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { + afr_log_locks_failure(frame, priv->children[child_index]->name, + "unlock", op_errno); + } - inodelk = afr_get_inodelk (int_lock, int_lock->domain); + int_lock->lockee[lockee_num].locked_nodes[child_index] &= LOCKED_NO; + if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1) + ret = afr_write_subvol_reset(frame, this); - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = F_UNLCK; + LOCK(&frame->lock); + { + call_count = --int_lock->lk_call_count; + } + UNLOCK(&frame->lock); - full_flock.l_type = F_UNLCK; - call_count = afr_locked_nodes_count (inodelk->locked_nodes, - priv->child_count); + if (call_count == 0) { + int_lock->lock_cbk(frame, this); + } - int_lock->lk_call_count = call_count; - - if (!call_count) { - gf_msg_trace (this->name, 0, - "No internal locks unlocked"); - - int_lock->lock_cbk (frame, this); - goto out; - } - - if (local->fd) - fd_ctx = afr_fd_ctx_get (local->fd, this); - - for (i = 0; i < priv->child_count; i++) { - if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) - continue; - - if (local->fd) { - flock_use = &flock; - if (!local->transaction.eager_lock[i]) { - goto wind; - } - - piggyback = 0; - - LOCK (&local->fd->lock); - { - if (fd_ctx->lock_piggyback[i]) { - fd_ctx->lock_piggyback[i]--; - piggyback = 1; - } else { - fd_ctx->lock_acquired[i]--; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback) { - afr_unlock_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0, NULL); - if (!--call_count) - break; - continue; - } - - flock_use = &full_flock; - wind: - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, flock_use, F_SETLK, - i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->finodelk, - int_lock->domain, local->fd, - F_SETLK, flock_use, NULL); - - if (!--call_count) - break; - - } else { - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_UNLOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, - (void *) (long)i, - priv->children[i], - priv->children[i]->fops->inodelk, - int_lock->domain, &local->loc, - F_SETLK, &flock, NULL); - - if (!--call_count) - break; - } - } -out: - return 0; + return ret; } -static int32_t -afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; - int32_t child_index = 0; - int lockee_no = 0; - - priv = this->private; - lockee_no = (int)((long) cookie) / priv->child_count; - child_index = (int) ((long) cookie) % priv->child_count; - - local = frame->local; - int_lock = &local->internal_lock; - - AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_UNLOCK_OP, - int_lock->lockee[lockee_no].basename, op_ret, - op_errno, (int) ((long)cookie)); - - if (op_ret < 0) { - gf_msg (this->name, GF_LOG_ERROR, op_errno, - AFR_MSG_ENTRY_UNLOCK_FAIL, - "%s: unlock failed on %s", local->loc.path, - priv->children[child_index]->name); - } - - int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO; - afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL); +void +afr_internal_lock_wind(call_frame_t *frame, + int32_t (*cbk)(call_frame_t *, void *, xlator_t *, + int32_t, int32_t, dict_t *), + void *cookie, int child, int lockee_num, + gf_boolean_t blocking, gf_boolean_t unlock) +{ + afr_local_t *local = frame->local; + xlator_t *this = frame->this; + afr_private_t *priv = this->private; + afr_internal_lock_t *int_lock = &local->internal_lock; + entrylk_cmd cmd = ENTRYLK_LOCK_NB; + int32_t cmd1 = F_SETLK; + struct gf_flock flock = { + 0, + }; + + switch (local->transaction.type) { + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + if (unlock) { + cmd = ENTRYLK_UNLOCK; + } else if (blocking) { /*Doesn't make sense to have blocking + unlock*/ + cmd = ENTRYLK_LOCK; + } + + if (local->fd) { + STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->fentrylk, + int_lock->domain, + int_lock->lockee[lockee_num].fd, + int_lock->lockee[lockee_num].basename, cmd, + ENTRYLK_WRLCK, NULL); + } else { + STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->entrylk, + int_lock->domain, + &int_lock->lockee[lockee_num].loc, + int_lock->lockee[lockee_num].basename, cmd, + ENTRYLK_WRLCK, NULL); + } + break; - return 0; + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + flock = int_lock->lockee[lockee_num].flock; + if (unlock) { + flock.l_type = F_UNLCK; + } else if (blocking) { /*Doesn't make sense to have blocking + unlock*/ + cmd1 = F_SETLKW; + } + + if (local->fd) { + STACK_WIND_COOKIE( + frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->finodelk, int_lock->domain, + int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL); + } else { + STACK_WIND_COOKIE( + frame, cbk, cookie, priv->children[child], + priv->children[child]->fops->inodelk, int_lock->domain, + &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL); + } + break; + } } static int -afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int index = 0; - int lockee_no = 0; - int copies = 0; - int i = -1; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - copies = priv->child_count; - - call_count = afr_lockee_locked_nodes_count (int_lock); - - int_lock->lk_call_count = call_count; - - if (!call_count){ - gf_msg_trace (this->name, 0, - "No internal locks unlocked"); - int_lock->lock_cbk (frame, this); - goto out; - } - - for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { - lockee_no = i / copies; - index = i % copies; - if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, - AFR_UNLOCK_OP, - int_lock->lockee[lockee_no].basename, - i); - - STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, - (void *) (long) i, - priv->children[index], - priv->children[index]->fops->entrylk, - int_lock->domain, - &int_lock->lockee[lockee_no].loc, - int_lock->lockee[lockee_no].basename, - ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL); - - if (!--call_count) - break; - } +afr_unlock_now(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + int lockee_num = 0; + int i = -1; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + call_count = afr_lockee_locked_nodes_count(int_lock); + + int_lock->lk_call_count = call_count; + + if (!call_count) { + gf_msg_trace(this->name, 0, "No internal locks unlocked"); + int_lock->lock_cbk(frame, this); + goto out; + } + + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + lockee_num = i / priv->child_count; + child_index = i % priv->child_count; + if (int_lock->lockee[lockee_num].locked_nodes[child_index] & + LOCKED_YES) { + afr_internal_lock_wind(frame, afr_unlock_common_cbk, + (void *)(long)i, child_index, lockee_num, + _gf_false, _gf_true); + if (!--call_count) + break; } + } out: - return 0; - + return 0; } static int32_t -afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int cky = (long) cookie; - int child_index = 0; - int lockee_no = 0; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - child_index = ((int)cky) % priv->child_count; - lockee_no = ((int)cky) / priv->child_count; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_msg (this->name, GF_LOG_ERROR, ENOSYS, - AFR_MSG_LOCK_XLATOR_NOT_LOADED, - "subvolume does not support locking. " - "please load features/locks xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - } - - local->op_errno = op_errno; - int_lock->lock_op_errno = op_errno; - } - - int_lock->lk_attempted_count++; - } - UNLOCK (&frame->lock); - - if ((op_ret == -1) && - (op_errno == ENOSYS)) { - afr_unlock (frame, this); - } else { - if (op_ret == 0) { - if (local->transaction.type == AFR_ENTRY_TRANSACTION || - local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { - int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES; - int_lock->lockee[lockee_no].locked_count++; - int_lock->entrylk_lock_count++; - } else { - int_lock->locked_nodes[child_index] |= LOCKED_YES; - int_lock->lock_count++; - } +afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xdata) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int cky = (long)cookie; + int child_index = 0; + int lockee_num = 0; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + + child_index = ((int)cky) % priv->child_count; + lockee_num = ((int)cky) / priv->child_count; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_msg(this->name, GF_LOG_ERROR, ENOSYS, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, + "subvolume does not support locking. " + "please load features/locks xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + } + + local->op_errno = op_errno; + int_lock->lock_op_errno = op_errno; + } + + int_lock->lk_attempted_count++; + } + UNLOCK(&frame->lock); + + if ((op_ret == -1) && (op_errno == ENOSYS)) { + afr_unlock_now(frame, this); + } else { + if (op_ret == 0) { + int_lock->lockee[lockee_num] + .locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_num].locked_count++; + int_lock->lock_count++; + if (local->transaction.type == AFR_DATA_TRANSACTION) { + LOCK(&local->inode->lock); + { + local->inode_ctx->lock_count++; } - afr_lock_blocking (frame, this, cky + 1); - } - - return 0; -} - -static int32_t -afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); - - afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); - return 0; - -} - -static int32_t -afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long)cookie); - - afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); - return 0; -} - -static int -afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; - - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - memcpy (inodelk->locked_nodes, int_lock->locked_nodes, - sizeof (*inodelk->locked_nodes) * priv->child_count); - inodelk->lock_count = int_lock->lock_count; - break; - - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - /*entrylk_count is being used in both non-blocking and blocking - * modes */ - break; - } - - return 0; - -} - -static gf_boolean_t -afr_is_entrylk (afr_internal_lock_t *int_lock, - afr_transaction_type trans_type) -{ - gf_boolean_t is_entrylk = _gf_false; - - if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && - int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { - - is_entrylk = _gf_true; - - } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) && - (trans_type == AFR_ENTRY_TRANSACTION || - trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { - - is_entrylk = _gf_true; - - } else { - is_entrylk = _gf_false; + UNLOCK(&local->inode->lock); + } } + afr_lock_blocking(frame, this, cky + 1); + } - return is_entrylk; + return 0; } static gf_boolean_t -_is_lock_wind_needed (afr_local_t *local, int child_index) -{ - if (!local->child_up[child_index]) - return _gf_false; - - return _gf_true; -} - -static void -afr_log_entry_locks_failure(xlator_t *this, afr_local_t *local, - afr_internal_lock_t *int_lock) +_is_lock_wind_needed(afr_local_t *local, int child_index) { - const char *fop = NULL; - char *pargfid = NULL; - const char *name = NULL; + if (!local->child_up[child_index]) + return _gf_false; - fop = gf_fop_list[local->op]; - - switch (local->op) { - case GF_FOP_LINK: - pargfid = uuid_utoa(local->newloc.pargfid); - name = local->newloc.name; - break; - default: - pargfid = uuid_utoa(local->loc.pargfid); - name = local->loc.name; - break; - } - - gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_BLOCKING_LKS_FAILED, - "Unable to obtain sufficient blocking entry locks on at least " - "one child while attempting %s on {pgfid:%s, name:%s}.", fop, - pargfid, name); + return _gf_true; } static gf_boolean_t -is_blocking_locks_count_sufficient (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_internal_lock_t *int_lock = NULL; - gf_boolean_t is_entrylk = _gf_false; - int child = 0; - int nlockee = 0; - int lockee_count = 0; - gf_boolean_t ret = _gf_true; - - local = frame->local; - priv = this->private; - int_lock = &local->internal_lock; - lockee_count = int_lock->lockee_count; - is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - - if (!is_entrylk) { - if (int_lock->lock_count == 0) { - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_BLOCKING_LKS_FAILED, "Unable to obtain " - "blocking inode lock on even one child for " - "gfid:%s.", uuid_utoa (local->inode->gfid)); - return _gf_false; - } else { - /*inodelk succeded on atleast one child. */ - return _gf_true; - } - - } else { - if (int_lock->entrylk_lock_count == 0) { - afr_log_entry_locks_failure (this, local, int_lock); - return _gf_false; - } - /* For FOPS that take multiple sets of locks (mkdir, rename), - * there must be atleast one brick on which the locks from - * all lock sets were successful. */ - for (child = 0; child < priv->child_count; child++) { - ret = _gf_true; - for (nlockee = 0; nlockee < lockee_count; nlockee++) { - if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES)) - ret = _gf_false; - } - if (ret) - return ret; - } - if (!ret) - afr_log_entry_locks_failure (this, local, int_lock); - } - - return ret; - +is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + int child = 0; + int nlockee = 0; + int lockee_count = 0; + gf_boolean_t ret = _gf_true; + + local = frame->local; + priv = this->private; + int_lock = &local->internal_lock; + lockee_count = int_lock->lockee_count; + + if (int_lock->lock_count == 0) { + afr_log_locks_failure(frame, "any subvolume", "lock", + int_lock->lock_op_errno); + return _gf_false; + } + /* For FOPS that take multiple sets of locks (mkdir, rename), + * there must be at least one brick on which the locks from + * all lock sets were successful. */ + for (child = 0; child < priv->child_count; child++) { + ret = _gf_true; + for (nlockee = 0; nlockee < lockee_count; nlockee++) { + if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES)) + ret = _gf_false; + } + if (ret) + return ret; + } + if (!ret) + afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno); + + return ret; } int -afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) +afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - struct gf_flock flock = {0,}; - uint64_t ctx = 0; - int ret = 0; - int child_index = 0; - int lockee_no = 0; - gf_boolean_t is_entrylk = _gf_false; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - child_index = cookie % priv->child_count; - lockee_no = cookie / priv->child_count; - is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); - - - if (!is_entrylk) { - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = inodelk->flock.l_type; - } - - if (local->fd) { - ret = fd_ctx_get (local->fd, this, &ctx); - - if (ret < 0) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_FD_CTX_GET_FAILED, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - - afr_copy_locked_nodes (frame, this); - - afr_unlock (frame, this); + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + uint64_t ctx = 0; + int ret = 0; + int child_index = 0; + int lockee_num = 0; - return 0; - } - } + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + child_index = cookie % priv->child_count; + lockee_num = cookie / priv->child_count; - if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { - if (!is_blocking_locks_count_sufficient (frame, this)) { + if (local->fd) { + ret = fd_ctx_get(local->fd, this, &ctx); - local->op_ret = -1; - int_lock->lock_op_ret = -1; + if (ret < 0) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED, + "unable to get fd ctx for fd=%p", local->fd); - afr_copy_locked_nodes (frame, this); + local->op_ret = -1; + int_lock->lock_op_ret = -1; - afr_unlock(frame, this); + afr_unlock_now(frame, this); - return 0; - } + return 0; } + } - if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { - /* we're done locking */ + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + if (!is_blocking_locks_count_sufficient(frame, this)) { + local->op_ret = -1; + int_lock->lock_op_ret = -1; - gf_msg_debug (this->name, 0, - "we're done locking"); + afr_unlock_now(frame, this); - afr_copy_locked_nodes (frame, this); - - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - return 0; + return 0; } + } - if (!_is_lock_wind_needed (local, child_index)) { - afr_lock_blocking (frame, this, cookie + 1); - return 0; - } + if (int_lock->lk_expected_count == int_lock->lk_attempted_count) { + /* we're done locking */ - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - - if (local->fd) { - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLKW, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->finodelk, - int_lock->domain, local->fd, - F_SETLKW, &flock, NULL); - - } else { - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLKW, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, - (void *) (long) child_index, - priv->children[child_index], - priv->children[child_index]->fops->inodelk, - int_lock->domain, &local->loc, - F_SETLKW, &flock, NULL); - } + gf_msg_debug(this->name, 0, "we're done locking"); - break; + int_lock->lock_op_ret = 0; + int_lock->lock_cbk(frame, this); + return 0; + } - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - /*Accounting for child_index increments on 'down' - *and 'fd-less' children */ - - if (local->fd) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, - cookie); - - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) cookie, - priv->children[child_index], - priv->children[child_index]->fops->fentrylk, - int_lock->domain, local->fd, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - } else { - AFR_TRACE_ENTRYLK_IN (frame, this, - AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, local->transaction.basename, - child_index); - - STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, - (void *) (long) cookie, - priv->children[child_index], - priv->children[child_index]->fops->entrylk, - int_lock->domain, - &int_lock->lockee[lockee_no].loc, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - } + if (!_is_lock_wind_needed(local, child_index)) { + afr_lock_blocking(frame, this, cookie + 1); + return 0; + } - break; - } + afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie, + child_index, lockee_num, _gf_true, _gf_false); - return 0; + return 0; } int32_t -afr_blocking_lock (call_frame_t *frame, xlator_t *this) +afr_blocking_lock(call_frame_t *frame, xlator_t *this) { - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int up_count = 0; - - priv = this->private; - local = frame->local; - int_lock = &local->internal_lock; + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int up_count = 0; - switch (local->transaction.type) { - case AFR_DATA_TRANSACTION: - case AFR_METADATA_TRANSACTION: - initialize_inodelk_variables (frame, this); - break; + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; - case AFR_ENTRY_RENAME_TRANSACTION: - case AFR_ENTRY_TRANSACTION: - up_count = AFR_COUNT (local->child_up, priv->child_count); - int_lock->lk_call_count = int_lock->lk_expected_count - = (int_lock->lockee_count * - up_count); - initialize_entrylk_variables (frame, this); - break; - } + up_count = AFR_COUNT(local->child_up, priv->child_count); + int_lock->lk_call_count = int_lock->lk_expected_count = + (int_lock->lockee_count * up_count); + initialize_internal_lock_variables(frame, this); - afr_lock_blocking (frame, this, 0); + afr_lock_blocking(frame, this, 0); - return 0; + return 0; } static int32_t -afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - int call_count = 0; - int child_index = (long) cookie; - int copies = 0; - int index = 0; - int lockee_no = 0; - afr_private_t *priv = NULL; - - priv = this->private; - - copies = priv->child_count; - index = child_index % copies; - lockee_no = child_index / copies; - - local = frame->local; - int_lock = &local->internal_lock; - - AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, op_ret, - op_errno, (long) cookie); - - LOCK (&frame->lock); - { - if (op_ret < 0 ) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_msg (this->name, GF_LOG_ERROR, - ENOSYS, AFR_MSG_LOCK_XLATOR_NOT_LOADED, - "subvolume does not support " - "locking. please load features/locks" - " xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - } else if (op_ret == 0) { - int_lock->lockee[lockee_no].locked_nodes[index] |= \ - LOCKED_YES; - int_lock->lockee[lockee_no].locked_count++; - int_lock->entrylk_lock_count++; - } - - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - gf_msg_trace (this->name, 0, - "Last locking reply received"); - /* all locks successful. Proceed to call FOP */ - if (int_lock->entrylk_lock_count == - int_lock->lk_expected_count) { - gf_msg_trace (this->name, 0, - "All servers locked. Calling the cbk"); - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - } - /* Not all locks were successful. Unlock and try locking - again, this time with serially blocking locks */ - else { - gf_msg_trace (this->name, 0, - "%d servers locked. Trying again " - "with blocking calls", - int_lock->lock_count); - - afr_unlock(frame, this); - } - } - - return 0; -} - -int -afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int copies = 0; - int index = 0; - int lockee_no = 0; - int32_t call_count = 0; - int i = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - copies = priv->child_count; - initialize_entrylk_variables (frame, this); - - if (local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_FD_CTX_GET_FAILED, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; - - afr_unlock (frame, this); - return -1; - } - - call_count = int_lock->lockee_count * internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - if (!call_count) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_INFO_COMMON, - "fd not open on any subvolumes. aborting."); - afr_unlock (frame, this); - goto out; - } - - /* Send non-blocking entrylk calls only on up children - and where the fd has been opened */ - for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { - index = i%copies; - lockee_no = i/copies; - if (local->child_up[index]) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, - i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, - (void *) (long) i, - priv->children[index], - priv->children[index]->fops->fentrylk, - this->name, local->fd, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, - NULL); - if (!--call_count) - break; - } - } - } else { - call_count = int_lock->lockee_count * internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) { - index = i%copies; - lockee_no = i/copies; - if (local->child_up[index]) { - AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, - AFR_LOCK_OP, - int_lock->lockee[lockee_no].basename, - i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, - (void *) (long) i, - priv->children[index], - priv->children[index]->fops->entrylk, - this->name, &int_lock->lockee[lockee_no].loc, - int_lock->lockee[lockee_no].basename, - ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, - NULL); - - if (!--call_count) - break; - } - } - } -out: - return 0; -} - -int32_t -afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, dict_t *xdata) +afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) { - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - int call_count = 0; - int child_index = (long) cookie; - afr_fd_ctx_t *fd_ctx = NULL; - + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + int call_count = 0; + int child_index = 0; + int lockee_num = 0; + afr_private_t *priv = NULL; - local = frame->local; - int_lock = &local->internal_lock; - inodelk = afr_get_inodelk (int_lock, int_lock->domain); + priv = this->private; - AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, NULL, op_ret, - op_errno, (long) cookie); + child_index = ((long)cookie) % priv->child_count; + lockee_num = ((long)cookie) / priv->child_count; - if (local->fd) - fd_ctx = afr_fd_ctx_get (local->fd, this); + local = frame->local; + int_lock = &local->internal_lock; - LOCK (&frame->lock); + if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) { + LOCK(&local->inode->lock); { - if (op_ret < 0) { - if (op_errno == ENOSYS) { - /* return ENOTSUP */ - gf_msg (this->name, GF_LOG_ERROR, ENOSYS, - AFR_MSG_LOCK_XLATOR_NOT_LOADED, - "subvolume does not support " - "locking. please load features/locks" - " xlator on server"); - local->op_ret = op_ret; - int_lock->lock_op_ret = op_ret; - int_lock->lock_op_errno = op_errno; - local->op_errno = op_errno; - } - if (local->transaction.eager_lock) - local->transaction.eager_lock[child_index] = 0; - } else { - inodelk->locked_nodes[child_index] |= LOCKED_YES; - inodelk->lock_count++; - - if (local->transaction.eager_lock && - local->transaction.eager_lock[child_index] && - local->fd) { - /* piggybacked */ - if (op_ret == 1) { - /* piggybacked */ - } else if (op_ret == 0) { - /* lock acquired from server */ - fd_ctx->lock_acquired[child_index]++; - } - } - } - - call_count = --int_lock->lk_call_count; - } - UNLOCK (&frame->lock); - - if (call_count == 0) { - gf_msg_trace (this->name, 0, - "Last inode locking reply received"); - /* all locks successful. Proceed to call FOP */ - if (inodelk->lock_count == int_lock->lk_expected_count) { - gf_msg_trace (this->name, 0, - "All servers locked. Calling the cbk"); - int_lock->lock_op_ret = 0; - int_lock->lock_cbk (frame, this); - } - /* Not all locks were successful. Unlock and try locking - again, this time with serially blocking locks */ - else { - gf_msg_trace (this->name, 0, - "%d servers locked. " - "Trying again with blocking calls", - int_lock->lock_count); - - afr_unlock(frame, this); - } + local->inode_ctx->lock_count++; } + UNLOCK(&local->inode->lock); + } - return 0; + LOCK(&frame->lock); + { + if (op_ret < 0) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_msg(this->name, GF_LOG_ERROR, ENOSYS, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, + "subvolume does not support " + "locking. please load features/locks" + " xlator on server"); + local->op_ret = op_ret; + int_lock->lock_op_ret = op_ret; + + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } + } else if (op_ret == 0) { + int_lock->lockee[lockee_num] + .locked_nodes[child_index] |= LOCKED_YES; + int_lock->lockee[lockee_num].locked_count++; + int_lock->lock_count++; + } + + call_count = --int_lock->lk_call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + gf_msg_trace(this->name, 0, "Last locking reply received"); + /* all locks successful. Proceed to call FOP */ + if (int_lock->lock_count == int_lock->lk_expected_count) { + gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk"); + int_lock->lock_op_ret = 0; + int_lock->lock_cbk(frame, this); + } + /* Not all locks were successful. Unlock and try locking + again, this time with serially blocking locks */ + else { + gf_msg_trace(this->name, 0, + "%d servers locked. Trying again " + "with blocking calls", + int_lock->lock_count); + + afr_unlock_now(frame, this); + } + } + + return 0; } int -afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) -{ - afr_internal_lock_t *int_lock = NULL; - afr_inodelk_t *inodelk = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int32_t call_count = 0; - int i = 0; - int ret = 0; - struct gf_flock flock = {0,}; - struct gf_flock full_flock = {0,}; - struct gf_flock *flock_use = NULL; - int piggyback = 0; - - local = frame->local; - int_lock = &local->internal_lock; - priv = this->private; - - inodelk = afr_get_inodelk (int_lock, int_lock->domain); - - flock.l_start = inodelk->flock.l_start; - flock.l_len = inodelk->flock.l_len; - flock.l_type = inodelk->flock.l_type; - - full_flock.l_type = inodelk->flock.l_type; - - initialize_inodelk_variables (frame, this); - - if (local->fd) { - fd_ctx = afr_fd_ctx_get (local->fd, this); - if (!fd_ctx) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_FD_CTX_GET_FAILED, - "unable to get fd ctx for fd=%p", - local->fd); - - local->op_ret = -1; - int_lock->lock_op_ret = -1; - local->op_errno = EINVAL; - int_lock->lock_op_errno = EINVAL; - - afr_unlock (frame, this); - ret = -1; - goto out; - } - - call_count = internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - if (!call_count) { - gf_msg (this->name, GF_LOG_INFO, 0, - AFR_MSG_SUBVOLS_DOWN, - "All bricks are down, aborting."); - afr_unlock (frame, this); - goto out; - } - - /* Send non-blocking inodelk calls only on up children - and where the fd has been opened */ - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - - flock_use = &flock; - if (!local->transaction.eager_lock_on) { - goto wind; - } - - piggyback = 0; - local->transaction.eager_lock[i] = 1; - - afr_set_delayed_post_op (frame, this); - - LOCK (&local->fd->lock); - { - if (fd_ctx->lock_acquired[i]) { - fd_ctx->lock_piggyback[i]++; - piggyback = 1; - } - } - UNLOCK (&local->fd->lock); - - if (piggyback) { - /* (op_ret == 1) => indicate piggybacked lock */ - afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, - this, 1, 0, NULL); - if (!--call_count) - break; - continue; - } - flock_use = &full_flock; - wind: - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, flock_use, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->finodelk, - int_lock->domain, local->fd, - F_SETLK, flock_use, NULL); - - if (!--call_count) - break; - } - } else { - call_count = internal_lock_count (frame, this); - int_lock->lk_call_count = call_count; - int_lock->lk_expected_count = call_count; - - for (i = 0; i < priv->child_count; i++) { - if (!local->child_up[i]) - continue; - AFR_TRACE_INODELK_IN (frame, this, - AFR_INODELK_NB_TRANSACTION, - AFR_LOCK_OP, &flock, F_SETLK, i); - - STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, - (void *) (long) i, - priv->children[i], - priv->children[i]->fops->inodelk, - int_lock->domain, &local->loc, - F_SETLK, &flock, NULL); - - if (!--call_count) - break; - } +afr_lock_nonblocking(call_frame_t *frame, xlator_t *this) +{ + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int child = 0; + int lockee_num = 0; + int32_t call_count = 0; + int i = 0; + int ret = 0; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + + initialize_internal_lock_variables(frame, this); + + if (local->fd) { + fd_ctx = afr_fd_ctx_get(local->fd, this); + if (!fd_ctx) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED, + "unable to get fd ctx for fd=%p", local->fd); + + local->op_ret = -1; + int_lock->lock_op_ret = -1; + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + + afr_unlock_now(frame, this); + ret = -1; + goto out; + } + } + + call_count = int_lock->lockee_count * internal_lock_count(frame, this); + int_lock->lk_call_count = call_count; + int_lock->lk_expected_count = call_count; + + if (!call_count) { + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON, + "fd not open on any subvolumes. aborting."); + afr_unlock_now(frame, this); + goto out; + } + + /* Send non-blocking lock calls only on up children + and where the fd has been opened */ + for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) { + child = i % priv->child_count; + lockee_num = i / priv->child_count; + if (local->child_up[child]) { + afr_internal_lock_wind(frame, afr_nb_internal_lock_cbk, + (void *)(long)i, child, lockee_num, + _gf_false, _gf_false); + if (!--call_count) + break; } + } out: - return ret; + return ret; } int32_t -afr_unlock (call_frame_t *frame, xlator_t *this) -{ - afr_local_t *local = NULL; - - local = frame->local; - - if (transaction_lk_op (local)) { - if (afr_is_inodelk_transaction(local)) - afr_unlock_inodelk (frame, this); - else - afr_unlock_entrylk (frame, this); - - } else { - if (is_afr_lock_selfheal (local)) - afr_unlock_inodelk (frame, this); - else - afr_unlock_entrylk (frame, this); - } - +afr_unlock(call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_lock_t *lock = NULL; + + local = frame->local; + + if (!local->transaction.eager_lock_on) + goto out; + lock = &local->inode_ctx->lock[local->transaction.type]; + LOCK(&local->inode->lock); + { + list_del_init(&local->transaction.owner_list); + if (list_empty(&lock->owners) && list_empty(&lock->post_op)) { + local->transaction.do_eager_unlock = _gf_true; + /*TODO: Need to get metadata use on_disk and inherit/uninherit + *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]); + *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]); + */ + GF_ASSERT(lock->release); + } + } + UNLOCK(&local->inode->lock); + if (!local->transaction.do_eager_unlock) { + local->internal_lock.lock_cbk(frame, this); return 0; -} - -int -afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, - unsigned int child_count) -{ - afr_local_t *dst_local = NULL; - afr_local_t *src_local = NULL; - afr_internal_lock_t *dst_lock = NULL; - afr_internal_lock_t *src_lock = NULL; - afr_inodelk_t *dst_inodelk = NULL; - afr_inodelk_t *src_inodelk = NULL; - int ret = -1; - - src_local = src->local; - src_lock = &src_local->internal_lock; - src_inodelk = afr_get_inodelk (src_lock, dom); - dst_local = dst->local; - dst_lock = &dst_local->internal_lock; - dst_inodelk = afr_get_inodelk (dst_lock, dom); - if (!dst_inodelk || !src_inodelk) - goto out; - if (src_inodelk->locked_nodes) { - memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes, - sizeof (*dst_inodelk->locked_nodes) * child_count); - memset (src_inodelk->locked_nodes, 0, - sizeof (*src_inodelk->locked_nodes) * child_count); - } + } - dst_lock->transaction_lk_type = src_lock->transaction_lk_type; - dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; - dst_inodelk->lock_count = src_inodelk->lock_count; - src_inodelk->lock_count = 0; - ret = 0; out: - return ret; + afr_unlock_now(frame, this); + return 0; } diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h index c7d6261b110..816065fb57a 100644 --- a/xlators/cluster/afr/src/afr-mem-types.h +++ b/xlators/cluster/afr/src/afr-mem-types.h @@ -8,46 +8,31 @@ cases as published by the Free Software Foundation. */ - #ifndef __AFR_MEM_TYPES_H__ #define __AFR_MEM_TYPES_H__ -#include "mem-types.h" +#include <glusterfs/mem-types.h> enum gf_afr_mem_types_ { - gf_afr_mt_iovec = gf_common_mt_end + 1, - gf_afr_mt_afr_fd_ctx_t, - gf_afr_mt_afr_private_t, - gf_afr_mt_int32_t, - gf_afr_mt_char, - gf_afr_mt_xattr_key, - gf_afr_mt_dict_t, - gf_afr_mt_xlator_t, - gf_afr_mt_iatt, - gf_afr_mt_int, - gf_afr_mt_afr_node_character, - gf_afr_mt_sh_diff_loop_state, - gf_afr_mt_uint8_t, - gf_afr_mt_loc_t, - gf_afr_mt_entry_name, - gf_afr_mt_pump_priv, - gf_afr_mt_locked_fd, - gf_afr_mt_inode_ctx_t, - gf_afr_fd_paused_call_t, - gf_afr_mt_crawl_data_t, - gf_afr_mt_brick_pos_t, - gf_afr_mt_shd_bool_t, - gf_afr_mt_shd_timer_t, - gf_afr_mt_shd_event_t, - gf_afr_mt_time_t, - gf_afr_mt_pos_data_t, - gf_afr_mt_reply_t, - gf_afr_mt_subvol_healer_t, - gf_afr_mt_spbc_timeout_t, - gf_afr_mt_spb_status_t, - gf_afr_mt_empty_brick_t, - gf_afr_mt_child_latency_t, + gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1, + gf_afr_mt_afr_private_t, + gf_afr_mt_int32_t, + gf_afr_mt_char, + gf_afr_mt_xattr_key, + gf_afr_mt_dict_t, + gf_afr_mt_xlator_t, + gf_afr_mt_afr_node_character, + gf_afr_mt_inode_ctx_t, + gf_afr_mt_shd_event_t, + gf_afr_mt_reply_t, + gf_afr_mt_subvol_healer_t, + gf_afr_mt_spbc_timeout_t, + gf_afr_mt_spb_status_t, + gf_afr_mt_empty_brick_t, + gf_afr_mt_child_latency_t, + gf_afr_mt_atomic_t, + gf_afr_mt_lk_heal_info_t, + gf_afr_mt_gf_lock, gf_afr_mt_end }; #endif - diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h index 02eb206fd08..e73fd997765 100644 --- a/xlators/cluster/afr/src/afr-messages.h +++ b/xlators/cluster/afr/src/afr-messages.h @@ -11,363 +11,157 @@ #ifndef _AFR_MESSAGES_H_ #define _AFR_MESSAGES_H_ -#include "glfs-message-id.h" - -/*! \file afr-messages.h - * \brief AFR log-message IDs and their descriptions. - */ - -/* NOTE: Rules for message additions - * 1) Each instance of a message is _better_ left with a unique message ID, even - * if the message format is the same. Reasoning is that, if the message - * format needs to change in one instance, the other instances are not - * impacted or the new change does not change the ID of the instance being - * modified. - * 2) Addition of a message, - * - Should increment the GLFS_NUM_MESSAGES - * - Append to the list of messages defined, towards the end - * - Retain macro naming as glfs_msg_X (for redability across developers) - * NOTE: Rules for message format modifications - * 3) Check acorss the code if the message ID macro in question is reused - * anywhere. If reused then then the modifications should ensure correctness - * everywhere, or needs a new message ID as (1) above was not adhered to. If - * not used anywhere, proceed with the required modification. - * NOTE: Rules for message deletion - * 4) Check (3) and if used anywhere else, then cannot be deleted. If not used - * anywhere, then can be deleted, but will leave a hole by design, as - * addition rules specify modification to the end of the list and not filling - * holes. +#include <glusterfs/glfs-message-id.h> + +/* To add new message IDs, append new identifiers at the end of the list. + * + * Never remove a message ID. If it's not used anymore, you can rename it or + * leave it as it is, but not delete it. This is to prevent reutilization of + * IDs by other messages. + * + * The component name must match one of the entries defined in + * glfs-message-id.h. */ -#define GLFS_COMP_BASE_AFR GLFS_MSGID_COMP_AFR -#define GLFS_NUM_MESSAGES 42 -#define GLFS_MSGID_END (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1) - -#define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages" - -/*! - * @messageid 108001 - * @diagnosis Client quorum is not met due to which file modification - * operations are disallowed. - * @recommendedaction Some brick processes are down/ not visible from the - * client. Ensure that the bricks are up/ network traffic is not blocked. - */ -#define AFR_MSG_QUORUM_FAIL (GLFS_COMP_BASE_AFR + 1) - - -/*! - * @messageid 108002 - * @diagnosis The bricks that were down are now up and quorum is restored. - * @recommendedaction Possibly check why the bricks went down to begin with. - */ -#define AFR_MSG_QUORUM_MET (GLFS_COMP_BASE_AFR + 2) - - -/*! - * @messageid 108003 - * @diagnosis Client quorum-type was set to auto due to which the quorum-count - * option is no longer valid. - * @recommendedaction None. - */ -#define AFR_MSG_QUORUM_OVERRIDE (GLFS_COMP_BASE_AFR + 3) - - -/*! - * @messageid 108004 - * @diagnosis Replication sub volume witnessed a connection notification - * from a brick which does not belong to its replica set. - * @recommendedaction None. This is a safety check in code. - */ -#define AFR_MSG_INVALID_CHILD_UP (GLFS_COMP_BASE_AFR + 4) - - -/*! - * @messageid 108005 - * @diagnosis A replica set that was inaccessible because all its bricks were - * down is now accessible because at least one of its bricks came back up. - * @recommendedaction Possibly check why all the bricks of that replica set - * went down to begin with. - */ -#define AFR_MSG_SUBVOL_UP (GLFS_COMP_BASE_AFR + 5) - - -/*! - * @messageid 108006 - * @diagnosis bricks of a replica set are down. Data residing in that - * replica cannot be accessed until one of the bricks come back up. - * @recommendedaction Ensure that the bricks are up. - */ -#define AFR_MSG_SUBVOLS_DOWN (GLFS_COMP_BASE_AFR + 6) - - -/*! - * @messageid 108007 - * @diagnosis Entry unlocks failed on a brick. - * @recommendedaction Error number in the log should give the reason why it - * failed. Also observe brick logs for more information. -*/ -#define AFR_MSG_ENTRY_UNLOCK_FAIL (GLFS_COMP_BASE_AFR + 7) - - -/*! - * @messageid 108008 - * @diagnosis There is an inconsistency in the file's data/metadata/gfid - * amongst the bricks of a replica set. - * @recommendedaction Resolve the split brain by clearing the AFR changelog - * attributes from the appropriate brick and trigger self-heal. - */ -#define AFR_MSG_SPLIT_BRAIN (GLFS_COMP_BASE_AFR + 8) - - -/*! - * @messageid 108009 - * @diagnosis open/opendir failed on a brick. - * @recommendedaction Error number in the log should give the reason why it - * failed. Also observe brick logs for more information. - */ -#define AFR_MSG_OPEN_FAIL (GLFS_COMP_BASE_AFR + 9) - - -/*! - * @messageid 108010 - * @diagnosis unlocks failed on a brick. - * @recommendedaction Error number in the log should give the reason why it - * failed. Also observe brick logs for more information. -*/ -#define AFR_MSG_UNLOCK_FAIL (GLFS_COMP_BASE_AFR + 10) - -/*! - * @messageid 108011 - * @diagnosis Setting of pending xattrs succeeded/failed during replace-brick - * operation. - * @recommendedaction In case of failure, error number in the log should give - * the reason why it failed. Also observe brick logs for more information. -*/ -#define AFR_MSG_REPLACE_BRICK_STATUS (GLFS_COMP_BASE_AFR + 11) - -/*! - * @messageid 108012 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_GFID_NULL (GLFS_COMP_BASE_AFR + 12) - -/*! - * @messageid 108013 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_FD_CREATE_FAILED (GLFS_COMP_BASE_AFR + 13) - -/*! - * @messageid 108014 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_DICT_SET_FAILED (GLFS_COMP_BASE_AFR + 14) - -/*! - * @messageid 108015 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_EXPUNGING_FILE_OR_DIR (GLFS_COMP_BASE_AFR + 15) - -/*! - * @messageid 108016 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_MIGRATION_IN_PROGRESS (GLFS_COMP_BASE_AFR + 16) - -/*! - * @messageid 108017 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_CHILD_MISCONFIGURED (GLFS_COMP_BASE_AFR + 17) - -/*! - * @messageid 108018 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_VOL_MISCONFIGURED (GLFS_COMP_BASE_AFR + 18) - -/*! - * @messageid 108019 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_BLOCKING_LKS_FAILED (GLFS_COMP_BASE_AFR + 19) - -/*! - * @messageid 108020 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_INVALID_FD (GLFS_COMP_BASE_AFR + 20) - -/*! - * @messageid 108021 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_LOCK_INFO (GLFS_COMP_BASE_AFR + 21) - -/*! - * @messageid 108022 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_LOCK_XLATOR_NOT_LOADED (GLFS_COMP_BASE_AFR + 22) - -/*! - * @messageid 108023 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_FD_CTX_GET_FAILED (GLFS_COMP_BASE_AFR + 23) - -/*! - * @messageid 108024 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_INVALID_SUBVOL (GLFS_COMP_BASE_AFR + 24) - -/*! - * @messageid 108025 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_PUMP_XLATOR_ERROR (GLFS_COMP_BASE_AFR + 25) - -/*! - * @messageid 108026 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_SELF_HEAL_INFO (GLFS_COMP_BASE_AFR + 26) - -/*! - * @messageid 108027 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_READ_SUBVOL_ERROR (GLFS_COMP_BASE_AFR + 27) - -/*! - * @messageid 108028 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_DICT_GET_FAILED (GLFS_COMP_BASE_AFR + 28) - - -/*! - * @messageid 108029 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_INFO_COMMON (GLFS_COMP_BASE_AFR + 29) - -/*! - * @messageid 108030 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR (GLFS_COMP_BASE_AFR + 30) - -/*! - * @messageid 108031 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_LOCAL_CHILD (GLFS_COMP_BASE_AFR + 31) - -/*! - * @messageid 108032 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_INVALID_DATA (GLFS_COMP_BASE_AFR + 32) - -/*! - * @messageid 108033 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_INVALID_ARG (GLFS_COMP_BASE_AFR + 33) - -/*! - * @messageid 108034 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_INDEX_DIR_GET_FAILED (GLFS_COMP_BASE_AFR + 34) - -/*! - * @messageid 108035 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_FSYNC_FAILED (GLFS_COMP_BASE_AFR + 35) - -/*! - * @messageid 108036 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_FAVORITE_CHILD (GLFS_COMP_BASE_AFR + 36) -/*! - * @messageid 108037 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_SELF_HEAL_FAILED (GLFS_COMP_BASE_AFR + 37) - -/*! - * @messageid 108038 - * @diagnosis - * @recommendedaction -*/ -#define AFR_MSG_SPLIT_BRAIN_STATUS (GLFS_COMP_BASE_AFR + 38) - -/*! - * @messageid 108039 - * @diagnosis Setting of pending xattrs succeeded/failed during add-brick - * operation. - * @recommendedaction In case of failure, error number in the log should give - * the reason why it failed. Also observe brick logs for more information. -*/ -#define AFR_MSG_ADD_BRICK_STATUS (GLFS_COMP_BASE_AFR + 39) - - -/*! - * @messageid 108040 - * @diagnosis AFR was unable to be loaded because the pending-changelog xattrs - * were not found in the volfile. - * @recommendedaction Please ensure cluster op-version is atleast 30707 and the - * volfiles are regenerated. -*/ -#define AFR_MSG_NO_CHANGELOG (GLFS_COMP_BASE_AFR + 40) - -/*! - * @messageid 108041 - * @diagnosis Unable to create timer thread for delayed initialization. - * @recommendedaction Possibly check process's log file for messages from - * timer infra. -*/ -#define AFR_MSG_TIMER_CREATE_FAIL (GLFS_COMP_BASE_AFR + 41) - -/*! - * @messageid 108042 - * @diagnosis Log messages relating to automated resolution of split-brain files - * based on favorite child policies. - * @recommendedaction -*/ -#define AFR_MSG_SBRAIN_FAV_CHILD_POLICY (GLFS_COMP_BASE_AFR + 42) - -#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" +GLFS_MSGID( + AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE, + AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN, + AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL, + AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL, + AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED, + AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS, + AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED, + AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO, + AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED, + AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO, + AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON, + AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA, + AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED, + AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED, + AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG, + AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB, + AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED, + AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET, + AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START, + AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT, + AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED, + AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG, + AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED, + AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH, + AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED, + AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA, + SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE, + SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED, + AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED, + AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED, + AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED, + AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN, + AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN, + AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP, + AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED, + AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED, + AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR, + AFR_MSG_INTERNAL_ATTR); + +#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed" +#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed" +#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed" +#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed" +#define AFR_MSG_INVALID_ARG_STR "Invalid argument" +#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on " +#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file" +#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict." +#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed." +#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \ + "Failed to populate loc for thin-arbiter" +#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending" +#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping." +#define AFR_MSG_UNKNOWN_SET_STR "Unknown set" +#define AFR_MSG_NO_XL_ID_STR "xl does not have id" +#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on" +#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on" +#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter." +#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output" +#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time" +#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict" +#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \ + "No majority to resolve gfid split brain" +#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch dectected" +#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal" +#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch" +#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \ + "Size policy is not applicable to directories." +#define AFR_MSG_NO_CHILD_SELECTED_STR \ + "No child selected by favorite-child policy" +#define AFR_MSG_INVALID_CHILD_STR "Invalid child" +#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \ + "selected as authentic to resolve conflicting data" +#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick" +#define SNO_DIFF_IN_MTIME_STR "No difference in mtime" +#define SNO_BIGGER_FILE_STR "No bigger file" +#define SALL_BRICKS_UP_TO_RESOLVE_STR \ + "All the bricks should be up to resolve the gfid split brain" +#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock" +#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed" +#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame" +#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop" +#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed" +#define AFR_MSG_FSYNC_FAILED_STR "fsync failed" +#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met" +#define AFR_MSG_FOP_FAILED_STR "Failing Fop" +#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume" +#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling" +#define AFR_MSG_CHILD_MISCONFIGURED_STR \ + "replicate translator needs more than one subvolume defined" +#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads" +#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count" +#define AFR_MSG_UNABLE_TO_FETCH_STR \ + "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \ + "using client translator names" +#define AFR_MSG_NULL_DEREF_STR "possible NULL deref" +#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key" +#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask" +#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up" +#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \ + "Failed to cancel split-brain choice" +#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \ + "Cannot set replica. File is not in data/metadata split-brain" +#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx" +#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols" +#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child" +#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file" +#define AFR_MSG_TIMER_CREATE_FAIL_STR \ + "Cannot create timer for delayed initialization" +#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online" +#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \ + "All subvolumes are down. Going offline until atleast one of them is up" +#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock" +#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume" +#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met" +#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir" +#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid" +#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative mergeon the file" +#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain" +#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down" +#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask" +#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \ + "read subvolume in this generation is not up" +#define AFR_MSG_INTERNAL_LKS_FAILED_STR \ + "Unable to work with lk-owner while attempting fop" +#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \ + "subvolume does not support locking. please load features/locks xlator " \ + "on server." +#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx" +#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting." +#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child." +#define AFR_MSG_NEW_BRICK_STR "New brick" +#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \ + "Failed to set split-brain choice to -1" +#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \ + "Failed to determine split-brain. Aborting split-brain-choice set" +#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume" +#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr" +#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute" #endif /* !_AFR_MESSAGES_H_ */ diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c index de125296bb3..64856042b65 100644 --- a/xlators/cluster/afr/src/afr-open.c +++ b/xlators/cluster/afr/src/afr-open.c @@ -8,364 +8,346 @@ cases as published by the Free Software Foundation. */ -#include <libgen.h> #include <unistd.h> -#include <fnmatch.h> #include <sys/time.h> #include <stdlib.h> #include <signal.h> -#include "glusterfs.h" +#include <glusterfs/glusterfs.h> #include "afr.h" -#include "dict.h" -#include "xlator.h" -#include "hashfn.h" -#include "logging.h" -#include "list.h" -#include "call-stub.h" -#include "defaults.h" -#include "common-utils.h" -#include "compat-errno.h" -#include "compat.h" -#include "byte-order.h" -#include "statedump.h" - -#include "afr-inode-read.h" -#include "afr-inode-write.h" -#include "afr-dir-read.h" -#include "afr-dir-write.h" -#include "afr-transaction.h" +#include <glusterfs/dict.h> +#include <glusterfs/logging.h> +#include <glusterfs/defaults.h> +#include <glusterfs/common-utils.h> +#include <glusterfs/compat-errno.h> +#include <glusterfs/compat.h> +#include <glusterfs/byte-order.h> +#include <glusterfs/statedump.h> +#include "afr-transaction.h" gf_boolean_t -afr_is_fd_fixable (fd_t *fd) +afr_is_fd_fixable(fd_t *fd) { - if (!fd || !fd->inode) - return _gf_false; - else if (fd_is_anonymous (fd)) - return _gf_false; - else if (gf_uuid_is_null (fd->inode->gfid)) - return _gf_false; - - return _gf_true; + if (!fd || !fd->inode) + return _gf_false; + else if (fd_is_anonymous(fd)) + return _gf_false; + else if (gf_uuid_is_null(fd->inode->gfid)) + return _gf_false; + + return _gf_true; } - int -afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, struct iatt *prebuf, - struct iatt *postbuf, dict_t *xdata) +afr_open_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) { - afr_local_t * local = frame->local; + afr_local_t *local = frame->local; - AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno, - local->fd, xdata); - return 0; + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, + local->cont.open.fd, xdata); + return 0; } int -afr_open_cbk (call_frame_t *frame, void *cookie, - xlator_t *this, int32_t op_ret, int32_t op_errno, - fd_t *fd, dict_t *xdata) +afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, fd_t *fd, dict_t *xdata) { - afr_local_t *local = NULL; - int call_count = -1; - int child_index = (long) cookie; - afr_fd_ctx_t *fd_ctx = NULL; - - local = frame->local; - fd_ctx = local->fd_ctx; - - LOCK (&frame->lock); - { - if (op_ret == -1) { - local->op_errno = op_errno; - fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; - } else { - local->op_ret = op_ret; - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - if (!local->xdata_rsp && xdata) - local->xdata_rsp = dict_ref (xdata); - } + afr_local_t *local = NULL; + int call_count = -1; + int child_index = (long)cookie; + afr_fd_ctx_t *fd_ctx = NULL; + + local = frame->local; + fd_ctx = local->fd_ctx; + + local->replies[child_index].valid = 1; + local->replies[child_index].op_ret = op_ret; + local->replies[child_index].op_errno = op_errno; + + LOCK(&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; + } else { + local->op_ret = op_ret; + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; + if (!local->xdata_rsp && xdata) + local->xdata_rsp = dict_ref(xdata); } - UNLOCK (&frame->lock); - - call_count = afr_frame_return (frame); - - if (call_count == 0) { - if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) { - STACK_WIND (frame, afr_open_ftruncate_cbk, - this, this->fops->ftruncate, - fd, 0, NULL); - } else { - AFR_STACK_UNWIND (open, frame, local->op_ret, - local->op_errno, local->cont.open.fd, - local->xdata_rsp); - } + call_count = --local->call_count; + } + UNLOCK(&frame->lock); + + if (call_count == 0) { + afr_handle_replies_quorum(frame, this); + if (local->op_ret == -1) { + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL, + NULL); + } else if (fd_ctx->flags & O_TRUNC) { + STACK_WIND(frame, afr_open_ftruncate_cbk, this, + this->fops->ftruncate, fd, 0, NULL); + } else { + AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, + local->cont.open.fd, local->xdata_rsp); } + } - return 0; + return 0; } - int -afr_open_continue (call_frame_t *frame, xlator_t *this, int err) +afr_open_continue(call_frame_t *frame, xlator_t *this, int err) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int call_count = 0; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - if (err) { - AFR_STACK_UNWIND (open, frame, -1, -err, NULL, NULL); - } else { - local->call_count = AFR_COUNT (local->child_up, - priv->child_count); - call_count = local->call_count; - - for (i = 0; i < priv->child_count; i++) { - if (local->child_up[i]) { - STACK_WIND_COOKIE (frame, afr_open_cbk, - (void *)(long)i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - (local->cont.open.flags & ~O_TRUNC), - local->cont.open.fd, - local->xdata_req); - if (!--call_count) - break; - } - } + if (err) { + AFR_STACK_UNWIND(open, frame, -1, err, NULL, NULL); + } else { + local->call_count = AFR_COUNT(local->child_up, priv->child_count); + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE(frame, afr_open_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->open, &local->loc, + (local->cont.open.flags & ~O_TRUNC), + local->cont.open.fd, local->xdata_req); + if (!--call_count) + break; + } } - return 0; + } + return 0; } int -afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, - fd_t *fd, dict_t *xdata) +afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int spb_choice = 0; - int event_generation = 0; - int ret = 0; - int32_t op_errno = 0; - afr_fd_ctx_t *fd_ctx = NULL; - - //We can't let truncation to happen outside transaction. - - priv = this->private; - - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; - - local->op = GF_FOP_OPEN; - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) { - op_errno = ENOMEM; - goto out; - } - - if (!afr_is_consistent_io_possible (local, priv, &op_errno)) - goto out; - - local->inode = inode_ref (loc->inode); - loc_copy (&local->loc, loc); - local->fd_ctx = fd_ctx; - fd_ctx->flags = flags; - if (xdata) - local->xdata_req = dict_ref (xdata); - - local->cont.open.flags = flags; - local->cont.open.fd = fd_ref (fd); - - ret = afr_inode_get_readable (frame, local->inode, this, - NULL, &event_generation, - AFR_DATA_TRANSACTION); - if ((ret < 0) && - (afr_inode_split_brain_choice_get (local->inode, - this, &spb_choice) == 0) && - spb_choice < 0) { - afr_inode_refresh (frame, this, local->inode, - local->inode->gfid, afr_open_continue); - } else { - afr_open_continue (frame, this, 0); - } - - return 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int spb_subvol = 0; + int event_generation = 0; + int ret = 0; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + + // We can't let truncation to happen outside transaction. + + priv = this->private; + + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; + + local->op = GF_FOP_OPEN; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) { + op_errno = ENOMEM; + goto out; + } + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + op_errno = afr_quorum_errno(priv); + goto out; + } + + if (!afr_is_consistent_io_possible(local, priv, &op_errno)) + goto out; + + local->inode = inode_ref(loc->inode); + loc_copy(&local->loc, loc); + local->fd_ctx = fd_ctx; + fd_ctx->flags = flags; + if (xdata) + local->xdata_req = dict_ref(xdata); + + local->cont.open.flags = flags; + local->cont.open.fd = fd_ref(fd); + + ret = afr_inode_get_readable(frame, local->inode, this, NULL, + &event_generation, AFR_DATA_TRANSACTION); + if ((ret < 0) && + (afr_split_brain_read_subvol_get(local->inode, this, NULL, + &spb_subvol) == 0) && + spb_subvol < 0) { + afr_inode_refresh(frame, this, local->inode, local->inode->gfid, + afr_open_continue); + } else { + afr_open_continue(frame, this, 0); + } + + return 0; out: - AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL); + AFR_STACK_UNWIND(open, frame, -1, op_errno, fd, NULL); - return 0; + return 0; } int -afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int32_t op_ret, int32_t op_errno, fd_t *fd, - dict_t *xdata) +afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd, + dict_t *xdata) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - afr_fd_ctx_t *fd_ctx = NULL; - int call_count = 0; - int child_index = (long) cookie; - - priv = this->private; - local = frame->local; - + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long)cookie; + + priv = this->private; + local = frame->local; + + if (op_ret >= 0) { + gf_msg_debug(this->name, 0, + "fd for %s opened " + "successfully on subvolume %s", + local->loc.path, priv->children[child_index]->name); + } else { + gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno, + AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s", + priv->children[child_index]->name, NULL); + } + + fd_ctx = local->fd_ctx; + + LOCK(&local->fd->lock); + { if (op_ret >= 0) { - gf_msg_debug (this->name, 0, "fd for %s opened " - "successfully on subvolume %s", local->loc.path, - priv->children[child_index]->name); + fd_ctx->opened_on[child_index] = AFR_FD_OPENED; } else { - gf_msg (this->name, fop_log_level (GF_FOP_OPEN, op_errno), - op_errno, AFR_MSG_OPEN_FAIL, "Failed to open %s on " - "subvolume %s", local->loc.path, - priv->children[child_index]->name); + fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; } + } + UNLOCK(&local->fd->lock); - fd_ctx = local->fd_ctx; + call_count = afr_frame_return(frame); + if (call_count == 0) + AFR_STACK_DESTROY(frame); - LOCK (&local->fd->lock); - { - if (op_ret >= 0) { - fd_ctx->opened_on[child_index] = AFR_FD_OPENED; - } else { - fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED; - } - } - UNLOCK (&local->fd->lock); + return 0; +} - call_count = afr_frame_return (frame); - if (call_count == 0) - AFR_STACK_DESTROY (frame); +static int +afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open) +{ + afr_fd_ctx_t *fd_ctx = NULL; + afr_private_t *priv = NULL; + int i = 0; + int count = 0; + priv = this->private; + + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) return 0; -} + LOCK(&fd->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && + priv->child_up[i]) { + fd_ctx->opened_on[i] = AFR_FD_OPENING; + need_open[i] = 1; + count++; + } else { + need_open[i] = 0; + } + } + } + UNLOCK(&fd->lock); -static int -afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open) -{ - afr_fd_ctx_t *fd_ctx = NULL; - afr_private_t *priv = NULL; - int i = 0; - int count = 0; - - priv = this->private; - - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - return 0; - - LOCK (&fd->lock); - { - for (i = 0; i < priv->child_count; i++) { - if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED && - priv->child_up[i]) { - fd_ctx->opened_on[i] = AFR_FD_OPENING; - need_open[i] = 1; - count++; - } else { - need_open[i] = 0; - } - } - } - UNLOCK (&fd->lock); - - return count; + return count; } - void -afr_fix_open (fd_t *fd, xlator_t *this) +afr_fix_open(fd_t *fd, xlator_t *this) { - afr_private_t *priv = NULL; - int i = 0; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - int ret = -1; - int32_t op_errno = 0; - afr_fd_ctx_t *fd_ctx = NULL; - unsigned char *need_open = NULL; - int call_count = 0; + afr_private_t *priv = NULL; + int i = 0; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + int ret = -1; + int32_t op_errno = 0; + afr_fd_ctx_t *fd_ctx = NULL; + unsigned char *need_open = NULL; + int call_count = 0; - priv = this->private; + priv = this->private; - if (!afr_is_fd_fixable (fd)) - goto out; + if (!afr_is_fd_fixable(fd)) + goto out; - fd_ctx = afr_fd_ctx_get (fd, this); - if (!fd_ctx) - goto out; + fd_ctx = afr_fd_ctx_get(fd, this); + if (!fd_ctx) + goto out; - need_open = alloca0 (priv->child_count); + need_open = alloca0(priv->child_count); - call_count = afr_fd_ctx_need_open (fd, this, need_open); - if (!call_count) - goto out; + call_count = afr_fd_ctx_need_open(fd, this, need_open); + if (!call_count) + goto out; - frame = create_frame (this, this->ctx->pool); - if (!frame) - goto out; + frame = create_frame(this, this->ctx->pool); + if (!frame) + goto out; - local = AFR_FRAME_INIT (frame, op_errno); - if (!local) - goto out; + local = AFR_FRAME_INIT(frame, op_errno); + if (!local) + goto out; - local->loc.inode = inode_ref (fd->inode); - ret = loc_path (&local->loc, NULL); - if (ret < 0) - goto out; + local->loc.inode = inode_ref(fd->inode); + ret = loc_path(&local->loc, NULL); + if (ret < 0) + goto out; - local->fd = fd_ref (fd); - local->fd_ctx = fd_ctx; + local->fd = fd_ref(fd); + local->fd_ctx = fd_ctx; - local->call_count = call_count; + local->call_count = call_count; - gf_msg_debug (this->name, 0, "need open count: %d", - call_count); + gf_msg_debug(this->name, 0, "need open count: %d", call_count); - for (i = 0; i < priv->child_count; i++) { - if (!need_open[i]) - continue; - - if (IA_IFDIR == fd->inode->ia_type) { - gf_msg_debug (this->name, 0, - "opening fd for dir %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, - (void*) (long) i, - priv->children[i], - priv->children[i]->fops->opendir, - &local->loc, local->fd, - NULL); - } else { - gf_msg_debug (this->name, 0, - "opening fd for file %s on subvolume %s", - local->loc.path, priv->children[i]->name); - - STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk, - (void *)(long) i, - priv->children[i], - priv->children[i]->fops->open, - &local->loc, - fd_ctx->flags & (~O_TRUNC), - local->fd, NULL); - } - - if (!--call_count) - break; + for (i = 0; i < priv->child_count; i++) { + if (!need_open[i]) + continue; + + if (IA_IFDIR == fd->inode->ia_type) { + gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, + priv->children[i], + priv->children[i]->fops->opendir, &local->loc, + local->fd, NULL); + } else { + gf_msg_debug(this->name, 0, + "opening fd for file %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i, + priv->children[i], priv->children[i]->fops->open, + &local->loc, fd_ctx->flags & (~O_TRUNC), + local->fd, NULL); } - return; + if (!--call_count) + break; + } + + return; out: - if (frame) - AFR_STACK_DESTROY (frame); + if (frame) + AFR_STACK_DESTROY(frame); } diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c index 50e8040d33e..6fc2c75145c 100644 --- a/xlators/cluster/afr/src/afr-read-txn.c +++ b/xlators/cluster/afr/src/afr-read-txn.c @@ -12,107 +12,327 @@ #include "afr-transaction.h" #include "afr-messages.h" -int -afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this) +void +afr_pending_read_increment(afr_private_t *priv, int child_index) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - int subvol = -1; - - local = frame->local; - priv = this->private; - - - for (i = 0; i < priv->child_count; i++) { - if (!local->readable[i]) { - /* don't even bother trying here. - just mark as attempted and move on. */ - local->read_attempted[i] = 1; - continue; - } - - if (!local->read_attempted[i]) { - subvol = i; - break; - } - } - - /* If no more subvols were available for reading, we leave - @subvol as -1, which is an indication we have run out of - readable subvols. */ - if (subvol != -1) - local->read_attempted[subvol] = 1; - local->readfn (frame, this, subvol); - - return 0; + if (child_index < 0 || child_index > priv->child_count) + return; + + GF_ATOMIC_INC(priv->pending_reads[child_index]); +} + +void +afr_pending_read_decrement(afr_private_t *priv, int child_index) +{ + if (child_index < 0 || child_index > priv->child_count) + return; + + GF_ATOMIC_DEC(priv->pending_reads[child_index]); +} + +void +afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = this->private; + + afr_pending_read_decrement(priv, local->read_subvol); + local->read_subvol = subvol; + afr_pending_read_increment(priv, subvol); + local->readfn(frame, this, subvol); } int -afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) +afr_read_txn_next_subvol(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - int read_subvol = 0; - inode_t *inode = NULL; - int ret = -1; - int spb_choice = -1; - - local = frame->local; - inode = local->inode; - - if (err) { - read_subvol = -1; - goto readfn; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + int subvol = -1; + + local = frame->local; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (!local->readable[i]) { + /* don't even bother trying here. + just mark as attempted and move on. */ + local->read_attempted[i] = 1; + continue; } - read_subvol = afr_read_subvol_select_by_policy (inode, this, - local->readable, NULL); - if (read_subvol == -1) { - err = -EIO; - goto readfn; + if (!local->read_attempted[i]) { + subvol = i; + break; } + } - if (local->read_attempted[read_subvol]) { - afr_read_txn_next_subvol (frame, this); - return 0; - } + /* If no more subvols were available for reading, we leave + @subvol as -1, which is an indication we have run out of + readable subvols. */ + if (subvol != -1) + local->read_attempted[subvol] = 1; + afr_read_txn_wind(frame, this, subvol); - local->read_attempted[read_subvol] = 1; -readfn: - if (read_subvol == -1) { - ret = afr_inode_split_brain_choice_get (inode, this, - &spb_choice); - if ((ret == 0) && spb_choice >= 0) - read_subvol = spb_choice; - } + return 0; +} - if (read_subvol == -1) { - AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN (-1, -err); - } - local->readfn (frame, this, read_subvol); +static int +afr_ta_read_txn_done(int ret, call_frame_t *ta_frame, void *opaque) +{ + STACK_DESTROY(ta_frame->root); + return 0; +} - return 0; +static int +afr_ta_read_txn(void *opaque) +{ + call_frame_t *frame = NULL; + xlator_t *this = NULL; + int read_subvol = -1; + int query_child = AFR_CHILD_UNKNOWN; + int possible_bad_child = AFR_CHILD_UNKNOWN; + int ret = 0; + int op_errno = ENOMEM; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = { + 0, + }; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int **pending = NULL; + loc_t loc = { + 0, + }; + + frame = (call_frame_t *)opaque; + this = frame->this; + local = frame->local; + priv = this->private; + query_child = local->read_txn_query_child; + + if (query_child == AFR_CHILD_ZERO) { + possible_bad_child = AFR_CHILD_ONE; + } else if (query_child == AFR_CHILD_ONE) { + possible_bad_child = AFR_CHILD_ZERO; + } else { + /*read_txn_query_child is AFR_CHILD_UNKNOWN*/ + goto out; + } + + /* Ask the query_child to see if it blames the possibly bad one. */ + xdata_req = dict_new(); + if (!xdata_req) + goto out; + + pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS); + if (!pending) + goto out; + + ret = afr_set_pending_dict(priv, xdata_req, pending); + if (ret < 0) + goto out; + + if (local->fd) { + ret = syncop_fxattrop(priv->children[query_child], local->fd, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, + NULL); + } else { + ret = syncop_xattrop(priv->children[query_child], &local->loc, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, + NULL); + } + if (ret || !xdata_rsp) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed xattrop for gfid %s on %s", + uuid_utoa(local->inode->gfid), + priv->children[query_child]->name); + op_errno = -ret; + goto out; + } + + if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, + possible_bad_child)) { + read_subvol = query_child; + goto out; + } + dict_unref(xdata_rsp); + xdata_rsp = NULL; + + /* It doesn't. So query thin-arbiter to see if it blames any data brick. */ + ret = afr_fill_ta_loc(this, &loc, _gf_true); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "Failed to populate thin-arbiter loc for: %s.", loc.name); + goto out; + } + flock.l_type = F_WRLCK; /*start and length are already zero. */ + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.", + uuid_utoa(local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + op_errno = -ret; + goto out; + } + + ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, + GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp, + NULL); + if (ret || !xdata_rsp) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed xattrop on %s.", uuid_utoa(local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + op_errno = -ret; + goto unlock; + } + + if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) { + read_subvol = query_child; + } else { + gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB, + "Failing read for gfid %s since good brick %s is down", + uuid_utoa(local->inode->gfid), + priv->children[possible_bad_child]->name); + op_errno = EIO; + } + +unlock: + flock.l_type = F_UNLCK; + ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX], + AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, NULL, NULL); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB, + "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on " + "%s.", + uuid_utoa(local->inode->gfid), + priv->pending_key[THIN_ARBITER_BRICK_INDEX]); + } +out: + if (xdata_req) + dict_unref(xdata_req); + if (xdata_rsp) + dict_unref(xdata_rsp); + if (pending) + afr_matrix_cleanup(pending, priv->child_count); + loc_wipe(&loc); + + if (read_subvol == -1) { + local->op_ret = -1; + local->op_errno = op_errno; + } + afr_read_txn_wind(frame, this, read_subvol); + return ret; } +void +afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this) +{ + call_frame_t *ta_frame = NULL; + afr_local_t *local = NULL; + int ret = 0; + + local = frame->local; + ta_frame = afr_ta_frame_create(this); + if (!ta_frame) { + local->op_ret = -1; + local->op_errno = ENOMEM; + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to create ta_frame"); + goto out; + } + ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done, + ta_frame, frame); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB, + "Failed to launch " + "afr_ta_read_txn synctask for gfid %s.", + uuid_utoa(local->inode->gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + STACK_DESTROY(ta_frame->root); + goto out; + } + return; +out: + afr_read_txn_wind(frame, this, -1); +} int -afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) +afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err) { - afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int read_subvol = -1; + inode_t *inode = NULL; + int ret = -1; + int spb_subvol = -1; + + local = frame->local; + inode = local->inode; + priv = this->private; + + if (err) { + if (!priv->thin_arbiter_count) + goto readfn; + if (err != EINVAL) + goto readfn; + /* We need to query the good bricks and/or thin-arbiter.*/ + afr_ta_read_txn_synctask(frame, this); + return 0; + } + + read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable, + NULL); + if (read_subvol == -1) { + err = EIO; + goto readfn; + } + + if (local->read_attempted[read_subvol]) { + afr_read_txn_next_subvol(frame, this); + return 0; + } + + local->read_attempted[read_subvol] = 1; +readfn: + if (read_subvol == -1) { + ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol); + if ((ret == 0) && spb_subvol >= 0) + read_subvol = spb_subvol; + } + + if (read_subvol == -1) { + AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err); + } + afr_read_txn_wind(frame, this, read_subvol); + + return 0; +} - local = frame->local; +int +afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol) +{ + afr_local_t *local = NULL; - if (!local->refreshed) { - local->refreshed = _gf_true; - afr_inode_refresh (frame, this, local->inode, NULL, - afr_read_txn_refresh_done); - } else { - afr_read_txn_next_subvol (frame, this); - } + local = frame->local; - return 0; -} + if (!local->refreshed) { + local->refreshed = _gf_true; + afr_inode_refresh(frame, this, local->inode, NULL, + afr_read_txn_refresh_done); + } else { + afr_read_txn_next_subvol(frame, this); + } + return 0; +} /* afr_read_txn_wipe: @@ -122,27 +342,26 @@ afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol) */ void -afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) +afr_read_txn_wipe(call_frame_t *frame, xlator_t *this) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - local->readfn = NULL; + local->readfn = NULL; - if (local->inode) - inode_unref (local->inode); + if (local->inode) + inode_unref(local->inode); - for (i = 0; i < priv->child_count; i++) { - local->read_attempted[i] = 0; - local->readable[i] = 0; - } + for (i = 0; i < priv->child_count; i++) { + local->read_attempted[i] = 0; + local->readable[i] = 0; + } } - /* afr_read_txn: @@ -171,88 +390,105 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this) */ int -afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, - afr_read_txn_wind_t readfn, afr_transaction_type type) +afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode, + afr_read_txn_wind_t readfn, afr_transaction_type type) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - unsigned char *data = NULL; - unsigned char *metadata = NULL; - int read_subvol = -1; - int event_generation = 0; - int ret = -1; - - priv = this->private; - local = frame->local; - data = alloca0 (priv->child_count); - metadata = alloca0 (priv->child_count); - - afr_read_txn_wipe (frame, this); - - local->readfn = readfn; - local->inode = inode_ref (inode); - local->is_read_txn = _gf_true; - - if (priv->quorum_reads && - priv->quorum_count && !afr_has_quorum (priv->child_up, this)) { - local->op_ret = -1; - local->op_errno = ENOTCONN; - read_subvol = -1; - goto read; - } - - if (!afr_is_consistent_io_possible (local, priv, &local->op_errno)) { - local->op_ret = -1; - read_subvol = -1; - goto read; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + unsigned char *data = NULL; + unsigned char *metadata = NULL; + int read_subvol = -1; + int event_generation = 0; + int ret = -1; + + priv = this->private; + local = frame->local; + data = alloca0(priv->child_count); + metadata = alloca0(priv->child_count); + + afr_read_txn_wipe(frame, this); + + local->readfn = readfn; + local->inode = inode_ref(inode); + local->is_read_txn = _gf_true; + local->transaction.type = type; + + if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) { + local->op_ret = -1; + local->op_errno = afr_quorum_errno(priv); + goto read; + } + + if (!afr_is_consistent_io_possible(local, priv, &local->op_errno)) { + local->op_ret = -1; + goto read; + } + + if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) { + local->op_ret = -1; + local->op_errno = -afr_quorum_errno(priv); + goto read; + } + + if (priv->thin_arbiter_count && + AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) { + if (local->child_up[0]) { + local->read_txn_query_child = AFR_CHILD_ZERO; + } else if (local->child_up[1]) { + local->read_txn_query_child = AFR_CHILD_ONE; } - - local->transaction.type = type; - ret = afr_inode_read_subvol_get (inode, this, data, metadata, - &event_generation); - if (ret == -1) - /* very first transaction on this inode */ - goto refresh; - AFR_INTERSECT (local->readable, data, metadata, priv->child_count); - - gf_msg_debug (this->name, 0, "%s: generation now vs cached: %d, " - "%d", uuid_utoa (inode->gfid), local->event_generation, - event_generation); - if (afr_is_inode_refresh_reqd (inode, this, local->event_generation, - event_generation)) - /* servers have disconnected / reconnected, and possibly - rebooted, very likely changing the state of freshness - of copies */ - goto refresh; - - read_subvol = afr_read_subvol_select_by_policy (inode, this, - local->readable, NULL); - - if (read_subvol < 0 || read_subvol > priv->child_count) { - gf_msg_debug (this->name, 0, "Unreadable subvolume %d found " - "with event generation %d for gfid %s.", - read_subvol, event_generation, uuid_utoa(inode->gfid)); - goto refresh; - } - - if (!local->child_up[read_subvol]) { - /* should never happen, just in case */ - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_READ_SUBVOL_ERROR, "subvolume %d is the " - "read subvolume in this generation, but is not up", - read_subvol); - goto refresh; - } - - local->read_attempted[read_subvol] = 1; + afr_ta_read_txn_synctask(frame, this); + return 0; + } + + ret = afr_inode_read_subvol_get(inode, this, data, metadata, + &event_generation); + if (ret == -1) + /* very first transaction on this inode */ + goto refresh; + AFR_INTERSECT(local->readable, data, metadata, priv->child_count); + + gf_msg_debug(this->name, 0, + "%s: generation now vs cached: %d, " + "%d", + uuid_utoa(inode->gfid), local->event_generation, + event_generation); + if (afr_is_inode_refresh_reqd(inode, this, local->event_generation, + event_generation)) + /* servers have disconnected / reconnected, and possibly + rebooted, very likely changing the state of freshness + of copies */ + goto refresh; + + read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable, + NULL); + + if (read_subvol < 0 || read_subvol > priv->child_count) { + gf_msg_debug(this->name, 0, + "Unreadable subvolume %d found " + "with event generation %d for gfid %s.", + read_subvol, event_generation, uuid_utoa(inode->gfid)); + goto refresh; + } + + if (!local->child_up[read_subvol]) { + /* should never happen, just in case */ + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR, + "subvolume %d is the " + "read subvolume in this generation, but is not up", + read_subvol); + goto refresh; + } + + local->read_attempted[read_subvol] = 1; read: - local->readfn (frame, this, read_subvol); + afr_read_txn_wind(frame, this, read_subvol); - return 0; + return 0; refresh: - afr_inode_refresh (frame, this, inode, NULL, afr_read_txn_refresh_done); + afr_inode_refresh(frame, this, inode, NULL, afr_read_txn_refresh_done); - return 0; + return 0; } diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c index 26d3860b234..a580a1584cc 100644 --- a/xlators/cluster/afr/src/afr-self-heal-common.c +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -8,743 +8,808 @@ cases as published by the Free Software Foundation. */ - #include "afr.h" #include "afr-self-heal.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> #include "protocol-common.h" #include "afr-messages.h" -#include "events.h" +#include <glusterfs/events.h> void -afr_heal_synctask (xlator_t *this, afr_local_t *local); +afr_heal_synctask(xlator_t *this, afr_local_t *local); int -afr_lookup_and_heal_gfid (xlator_t *this, inode_t *parent, const char *name, - inode_t *inode, struct afr_reply *replies, - int source, void *gfid) -{ - afr_private_t *priv = NULL; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - unsigned char *wind_on = NULL; - ia_type_t ia_type = IA_INVAL; - dict_t *xdata = NULL; - loc_t loc = {0, }; - int ret = 0; - int i = 0; - - priv = this->private; - wind_on = alloca0 (priv->child_count); +afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name, + inode_t *inode, struct afr_reply *replies, int source, + unsigned char *sources, void *gfid, int *gfid_idx) +{ + afr_private_t *priv = NULL; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + unsigned char *wind_on = NULL; + ia_type_t ia_type = IA_INVAL; + dict_t *xdata = NULL; + loc_t loc = { + 0, + }; + int ret = 0; + int i = 0; + + priv = this->private; + wind_on = alloca0(priv->child_count); + if (source >= 0 && replies[source].valid && replies[source].op_ret == 0) ia_type = replies[source].poststat.ia_type; - /* gfid heal on those subvolumes that do not have gfid associated - * with the inode and update those replies. - */ - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid || replies[i].op_ret != 0) - continue; - if (!gf_uuid_is_null (replies[i].poststat.ia_gfid) || - replies[i].poststat.ia_type != ia_type) - continue; - - wind_on[i] = 1; + if (ia_type != IA_INVAL) + goto heal; + + /* If ia_type is still invalid, it means either + * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain + * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks + * are sources) and the 'source' we selected earlier might be the one where + * the file is not actually present. + * + * In both cases, let us pick a brick with a successful reply and use its + * ia_type. + * */ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + /* case (a) above. */ + if (replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } + } else { + /* case (b) above. */ + if (i == source) + continue; + if (sources[i] && replies[i].valid && replies[i].op_ret == 0 && + replies[i].poststat.ia_type != IA_INVAL) { + ia_type = replies[i].poststat.ia_type; + break; + } } + } - if (AFR_COUNT(wind_on, priv->child_count) == 0) - return 0; +heal: + /* gfid heal on those subvolumes that do not have gfid associated + * with the inode and update those replies. + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; - xdata = dict_new (); - if (!xdata) { - ret = -ENOMEM; - goto out; - } + if (gf_uuid_is_null(gfid) && + !gf_uuid_is_null(replies[i].poststat.ia_gfid) && + replies[i].poststat.ia_type == ia_type) + gfid = replies[i].poststat.ia_gfid; - ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16); - if (ret) { - ret = -ENOMEM; - goto out; - } + if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) || + replies[i].poststat.ia_type != ia_type) + continue; - frame = afr_frame_create (this, &ret); - if (!frame) { - ret = -ret; - goto out; - } + wind_on[i] = 1; + } - local = frame->local; - loc.parent = inode_ref (parent); - gf_uuid_copy (loc.pargfid, parent->gfid); - loc.name = name; - loc.inode = inode_ref (inode); + if (AFR_COUNT(wind_on, priv->child_count) == 0) + return 0; + + xdata = dict_new(); + if (!xdata) { + ret = -ENOMEM; + goto out; + } - AFR_ONLIST (wind_on, frame, afr_selfheal_discover_cbk, lookup, - &loc, xdata); + ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true); + if (ret) { + ret = -ENOMEM; + goto out; + } + frame = afr_frame_create(this, &ret); + if (!frame) { + ret = -ret; + goto out; + } + + local = frame->local; + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); + + AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!wind_on[i]) + continue; + afr_reply_wipe(&replies[i]); + afr_reply_copy(&replies[i], &local->replies[i]); + } + if (gfid_idx && (*gfid_idx == -1)) { + /*Pick a brick where the gifd heal was successful.*/ for (i = 0; i < priv->child_count; i++) { - if (!wind_on[i]) - continue; - afr_reply_wipe (&replies[i]); - afr_reply_copy (&replies[i], &local->replies[i]); + if (!wind_on[i]) + continue; + if (replies[i].valid && replies[i].op_ret == 0 && + !gf_uuid_is_null(replies[i].poststat.ia_gfid)) { + *gfid_idx = i; + break; + } } + } out: - loc_wipe (&loc); - if (frame) - AFR_STACK_DESTROY (frame); - if (xdata) - dict_unref (xdata); + if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) { + ret = -afr_final_errno(local, priv); + } + loc_wipe(&loc); + if (frame) + AFR_STACK_DESTROY(frame); + if (xdata) + dict_unref(xdata); - return ret; + return ret; } int -afr_gfid_sbrain_source_from_src_brick (xlator_t *this, - struct afr_reply *replies, - char *src_brick) +afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies, + char *src_brick) { - int i = 0; - afr_private_t *priv = NULL; + int i = 0; + afr_private_t *priv = NULL; - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid || replies[i].op_ret == -1) - continue; - if (strcmp (priv->children[i]->name, src_brick) == 0) - return i; - } - return -1; + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (strcmp(priv->children[i]->name, src_brick) == 0) + return i; + } + return -1; } int -afr_selfheal_gfid_mismatch_by_majority (struct afr_reply *replies, - int child_count) -{ - int j = 0; - int i = 0; - int src = -1; - int votes[child_count]; - - for (i = 0; i < child_count; i++) { - if (!replies[i].valid || replies[i].op_ret == -1) - continue; - - votes[i] = 1; - for (j = i+1; j < child_count; j++) { - if ((!gf_uuid_compare (replies[i].poststat.ia_gfid, - replies[j].poststat.ia_gfid))) - votes[i]++; - if (votes[i] > child_count / 2) { - src = i; - goto out; - } - } +afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies, + int child_count) +{ + int j = 0; + int i = 0; + int votes; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + + votes = 1; + for (j = i + 1; j < child_count; j++) { + if ((!gf_uuid_compare(replies[i].poststat.ia_gfid, + replies[j].poststat.ia_gfid))) + votes++; + if (votes > child_count / 2) + return i; } + } -out: - return src; + return -1; } -int afr_gfid_sbrain_source_from_bigger_file (struct afr_reply *replies, - int child_count) +int +afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies, + int child_count) { - int i = 0; - int src = -1; - uint64_t size = 0; - - for (i = 0; i < child_count; i++) { - if (!replies[i].valid || replies[i].op_ret == -1) - continue; - if (size < replies[i].poststat.ia_size) { - src = i; - size = replies[i].poststat.ia_size; - } else if (replies[i].poststat.ia_size == size) { - src = -1; - } - } - return src; -} - -int afr_gfid_sbrain_source_from_latest_mtime (struct afr_reply *replies, - int child_count) -{ - int i = 0; - int src = -1; - uint32_t mtime = 0; - uint32_t mtime_nsec = 0; - - for (i = 0; i < child_count; i++) { - if (!replies[i].valid || replies[i].op_ret != 0) - continue; - if ((mtime < replies[i].poststat.ia_mtime) || - ((mtime == replies[i].poststat.ia_mtime) && - (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { - src = i; - mtime = replies[i].poststat.ia_mtime; - mtime_nsec = replies[i].poststat.ia_mtime_nsec; - } else if ((mtime == replies[i].poststat.ia_mtime) && - (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) { - src = -1; - } + int i = 0; + int src = -1; + uint64_t size = 0; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) + continue; + if (size < replies[i].poststat.ia_size) { + src = i; + size = replies[i].poststat.ia_size; + } else if (replies[i].poststat.ia_size == size) { + src = -1; } - return src; + } + return src; } int -afr_gfid_split_brain_source (xlator_t *this, struct afr_reply *replies, - inode_t *inode, uuid_t pargfid, const char *bname, - int src_idx, int child_idx, - unsigned char *locked_on, int *src, dict_t *xdata) -{ - afr_private_t *priv = NULL; - char g1[64] = {0,}; - char g2[64] = {0,}; - int up_count = 0; - int heal_op = -1; - int ret = -1; - char *src_brick = NULL; - - *src = -1; - priv = this->private; - up_count = AFR_COUNT (locked_on, priv->child_count); - if (up_count != priv->child_count) { - gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, - "All the bricks should be up to resolve the gfid split " - "barin"); - if (xdata) { - ret = dict_set_str (xdata, "gfid-heal-msg", "All the " - "bricks should be up to resolve the" - " gfid split barin"); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_DICT_SET_FAILED, "Error setting" - " gfid-heal-msg dict"); - } - goto out; +afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies, + int child_count) +{ + int i = 0; + int src = -1; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + for (i = 0; i < child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; + if ((mtime < replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + src = i; + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } else if ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) { + src = -1; } + } + return src; +} +int +afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies, + inode_t *inode, uuid_t pargfid, const char *bname, + int src_idx, int child_idx, + unsigned char *locked_on, int *src, dict_t *xdata) +{ + afr_private_t *priv = NULL; + char g1[64] = { + 0, + }; + char g2[64] = { + 0, + }; + int up_count = 0; + int heal_op = -1; + int ret = -1; + char *src_brick = NULL; + + *src = -1; + priv = this->private; + up_count = AFR_COUNT(locked_on, priv->child_count); + if (up_count != priv->child_count) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "All the bricks should be up to resolve the gfid split " + "barin"); if (xdata) { - ret = dict_get_int32 (xdata, "heal-op", &heal_op); - if (ret) - goto fav_child; - } else { - goto fav_child; + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SALL_BRICKS_UP_TO_RESOLVE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED, + "Error setting" + " gfid-heal-msg dict"); } + goto out; + } + + if (xdata) { + ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op); + if (ret) + goto fav_child; + } else { + goto fav_child; + } - switch (heal_op) { + switch (heal_op) { case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: - *src = afr_gfid_sbrain_source_from_bigger_file (replies, - priv->child_count); - if (*src == -1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN, "No bigger file"); - if (xdata) { - ret = dict_set_str (xdata, "gfid-heal-msg", - "No bigger file"); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_DICT_SET_FAILED, "Error" - " setting gfid-heal-msg dict"); - } + *src = afr_gfid_sbrain_source_from_bigger_file(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_BIGGER_FILE); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_BIGGER_FILE); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); } - break; + } + break; case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: - *src = afr_gfid_sbrain_source_from_latest_mtime (replies, - priv->child_count); - if (*src == -1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN, "No difference in mtime"); - if (xdata) { - ret = dict_set_str (xdata, "gfid-heal-msg", - "No difference in mtime"); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_DICT_SET_FAILED, "Error" - "setting gfid-heal-msg dict"); - } + *src = afr_gfid_sbrain_source_from_latest_mtime(replies, + priv->child_count); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SNO_DIFF_IN_MTIME); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SNO_DIFF_IN_MTIME); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + "setting gfid-heal-msg dict"); } - break; + } + break; case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: - ret = dict_get_str (xdata, "child-name", &src_brick); - if (ret) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN, "Error getting the source " - "brick"); - break; - } - *src = afr_gfid_sbrain_source_from_src_brick (this, replies, - src_brick); - if (*src == -1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN, "Error getting the source " - "brick"); - if (xdata) { - ret = dict_set_str (xdata, "gfid-heal-msg", - "Error getting the source " - "brick"); - if (ret) - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_DICT_SET_FAILED, "Error" - " setting gfid-heal-msg dict"); - } - } + ret = dict_get_str_sizen(xdata, "child-name", &src_brick); + if (ret) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Error getting the source " + "brick"); break; + } + *src = afr_gfid_sbrain_source_from_src_brick(this, replies, + src_brick); + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + SERROR_GETTING_SRC_BRICK); + if (xdata) { + ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg", + SERROR_GETTING_SRC_BRICK); + if (ret) + gf_msg(this->name, GF_LOG_ERROR, 0, + AFR_MSG_DICT_SET_FAILED, + "Error" + " setting gfid-heal-msg dict"); + } + } + break; default: - break; - } - goto out; + break; + } + goto out; fav_child: - switch (priv->fav_child_policy) { + switch (priv->fav_child_policy) { case AFR_FAV_CHILD_BY_SIZE: - *src = afr_sh_fav_by_size (this, replies, inode); - break; + *src = afr_sh_fav_by_size(this, replies, inode); + break; case AFR_FAV_CHILD_BY_MTIME: - *src = afr_sh_fav_by_mtime (this, replies, inode); - break; + *src = afr_sh_fav_by_mtime(this, replies, inode); + break; case AFR_FAV_CHILD_BY_CTIME: - *src = afr_sh_fav_by_ctime(this, replies, inode); - break; + *src = afr_sh_fav_by_ctime(this, replies, inode); + break; case AFR_FAV_CHILD_BY_MAJORITY: - if (priv->child_count != 2) - *src = afr_selfheal_gfid_mismatch_by_majority (replies, - priv->child_count); - else - *src = -1; - - if (*src == -1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN, "No majority to resolve " - "gfid split brain"); - } - break; + if (priv->child_count != 2) + *src = afr_selfheal_gfid_mismatch_by_majority( + replies, priv->child_count); + else + *src = -1; + + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "No majority to resolve " + "gfid split brain"); + } + break; default: - break; - } + break; + } out: - if (*src == -1) { - gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, - "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and" - " %s on %s.", uuid_utoa (pargfid), bname, - uuid_utoa_r (replies[child_idx].poststat.ia_gfid, g1), - priv->children[child_idx]->name, - uuid_utoa_r (replies[src_idx].poststat.ia_gfid, g2), - priv->children[src_idx]->name); - gf_event (EVENT_AFR_SPLIT_BRAIN, "subvol=%s;type=gfid;file=" - "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;" - "child-%d=%s;gfid-%d=%s", this->name, - uuid_utoa (pargfid), bname, child_idx, - priv->children[child_idx]->name, child_idx, - uuid_utoa_r (replies[child_idx].poststat.ia_gfid, g1), - src_idx, priv->children[src_idx]->name, src_idx, - uuid_utoa_r (replies[src_idx].poststat.ia_gfid, g2)); - return -1; - } - return 0; + if (*src == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and" + " %s on %s.", + uuid_utoa(pargfid), bname, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), + priv->children[child_idx]->name, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2), + priv->children[src_idx]->name); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=gfid;file=" + "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;" + "child-%d=%s;gfid-%d=%s", + this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid), + bname, child_idx, priv->children[child_idx]->name, child_idx, + uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx, + priv->children[src_idx]->name, src_idx, + uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2)); + return -1; + } + return 0; } - int -afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) +afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr, dict_t *xdata) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - local->op_ret = op_ret; - local->op_errno = op_errno; - syncbarrier_wake (&local->barrier); + local->op_ret = op_ret; + local->op_errno = op_errno; + syncbarrier_wake(&local->barrier); - return 0; + return 0; } - int -afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode, - int subvol, dict_t *xattr, dict_t *xdata) +afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode, + int subvol, dict_t *xattr, dict_t *xdata) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - loc_t loc = {0, }; - int ret = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + loc_t loc = { + 0, + }; + int ret = 0; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - local->op_ret = 0; + local->op_ret = 0; - STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol], - priv->children[subvol]->fops->xattrop, &loc, - GF_XATTROP_ADD_ARRAY, xattr, xdata); + STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol], + priv->children[subvol]->fops->xattrop, &loc, + GF_XATTROP_ADD_ARRAY, xattr, xdata); - syncbarrier_wait (&local->barrier, 1); - if (local->op_ret < 0) - ret = -local->op_errno; + syncbarrier_wait(&local->barrier, 1); + if (local->op_ret < 0) + ret = -local->op_errno; - loc_wipe (&loc); - local->op_ret = 0; + loc_wipe(&loc); + local->op_ret = 0; - return ret; + return ret; } int -afr_check_stale_error (struct afr_reply *replies, afr_private_t *priv) -{ - int i = 0; - int op_errno = 0; - int tmp_errno = 0; - int stale_count = 0; - - for (i = 0; i < priv->child_count; i++) { - tmp_errno = replies[i].op_errno; - if (tmp_errno == ENOENT || tmp_errno == ESTALE) { - op_errno = afr_higher_errno (op_errno, tmp_errno); - stale_count++; - } +afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv) +{ + int i = 0; + int op_errno = 0; + int tmp_errno = 0; + int stale_count = 0; + + for (i = 0; i < priv->child_count; i++) { + tmp_errno = replies[i].op_errno; + if (tmp_errno == ENOENT || tmp_errno == ESTALE) { + op_errno = afr_higher_errno(op_errno, tmp_errno); + stale_count++; } - if (stale_count != priv->child_count) - return -ENOTCONN; - else - return -op_errno; + } + if (stale_count != priv->child_count) + return -ENOTCONN; + else + return -op_errno; } int -afr_sh_generic_fop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, struct iatt *pre, struct iatt *post, - dict_t *xdata) +afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct iatt *pre, + struct iatt *post, dict_t *xdata) { - int i = (long) cookie; - afr_local_t *local = NULL; + int i = (long)cookie; + afr_local_t *local = NULL; - local = frame->local; + local = frame->local; - local->replies[i].valid = 1; - local->replies[i].op_ret = op_ret; - local->replies[i].op_errno = op_errno; - if (pre) - local->replies[i].prestat = *pre; - if (post) - local->replies[i].poststat = *post; - if (xdata) - local->replies[i].xdata = dict_ref (xdata); + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (pre) + local->replies[i].prestat = *pre; + if (post) + local->replies[i].poststat = *post; + if (xdata) + local->replies[i].xdata = dict_ref(xdata); - syncbarrier_wake (&local->barrier); + syncbarrier_wake(&local->barrier); - return 0; + return 0; } int -afr_selfheal_restore_time (call_frame_t *frame, xlator_t *this, inode_t *inode, - int source, unsigned char *healed_sinks, - struct afr_reply *replies) +afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, unsigned char *healed_sinks, + struct afr_reply *replies) { - loc_t loc = {0, }; + loc_t loc = { + 0, + }; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - AFR_ONLIST (healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc, - &replies[source].poststat, - (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL); + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc, + &replies[source].poststat, + (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME), + NULL); - loc_wipe (&loc); + loc_wipe(&loc); - return 0; + return 0; } dict_t * -afr_selfheal_output_xattr (xlator_t *this, gf_boolean_t is_full_crawl, - afr_transaction_type type, int *output_dirty, - int **output_matrix, int subvol, - int **full_heal_mtx_out) -{ - int j = 0; - int idx = 0; - int d_idx = 0; - int ret = 0; - int *raw = 0; - dict_t *xattr = NULL; - afr_private_t *priv = NULL; - - priv = this->private; - idx = afr_index_for_transaction_type (type); - d_idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION); - - xattr = dict_new (); - if (!xattr) - return NULL; - - /* clear dirty */ - raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); - if (!raw) - goto err; - - raw[idx] = hton32 (output_dirty[subvol]); - ret = dict_set_bin (xattr, AFR_DIRTY, raw, - sizeof(int) * AFR_NUM_CHANGE_LOGS); - if (ret) { - GF_FREE (raw); - goto err; - } +afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl, + afr_transaction_type type, int *output_dirty, + int **output_matrix, int subvol, + int **full_heal_mtx_out) +{ + int j = 0; + int idx = 0; + int d_idx = 0; + int ret = 0; + int *raw = 0; + dict_t *xattr = NULL; + afr_private_t *priv = NULL; + + priv = this->private; + idx = afr_index_for_transaction_type(type); + d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION); + + xattr = dict_new(); + if (!xattr) + return NULL; - /* clear/set pending */ - for (j = 0; j < priv->child_count; j++) { - raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, - gf_afr_mt_int32_t); - if (!raw) - goto err; - - raw[idx] = hton32 (output_matrix[subvol][j]); - if (is_full_crawl) - raw[d_idx] = hton32 (full_heal_mtx_out[subvol][j]); - - ret = dict_set_bin (xattr, priv->pending_key[j], - raw, sizeof(int) * AFR_NUM_CHANGE_LOGS); - if (ret) { - GF_FREE (raw); - goto err; - } - } + /* clear dirty */ + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_dirty[subvol]); + ret = dict_set_bin(xattr, AFR_DIRTY, raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; + } + + /* clear/set pending */ + for (j = 0; j < priv->child_count; j++) { + raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t); + if (!raw) + goto err; + + raw[idx] = hton32(output_matrix[subvol][j]); + if (is_full_crawl) + raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]); + + ret = dict_set_bin(xattr, priv->pending_key[j], raw, + sizeof(int) * AFR_NUM_CHANGE_LOGS); + if (ret) { + GF_FREE(raw); + goto err; + } + } - return xattr; + return xattr; err: - if (xattr) - dict_unref (xattr); - return NULL; + if (xattr) + dict_unref(xattr); + return NULL; } - int -afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode, - unsigned char *sources, unsigned char *sinks, - unsigned char *healed_sinks, - unsigned char *undid_pending, - afr_transaction_type type, struct afr_reply *replies, - unsigned char *locked_on) -{ - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int i = 0; - int j = 0; - unsigned char *pending = NULL; - int *input_dirty = NULL; - int **input_matrix = NULL; - int **full_heal_mtx_in = NULL; - int **full_heal_mtx_out = NULL; - int *output_dirty = NULL; - int **output_matrix = NULL; - dict_t *xattr = NULL; - dict_t *xdata = NULL; - - priv = this->private; - local = frame->local; - - pending = alloca0 (priv->child_count); - - input_dirty = alloca0 (priv->child_count * sizeof (int)); - input_matrix = ALLOC_MATRIX (priv->child_count, int); - full_heal_mtx_in = ALLOC_MATRIX (priv->child_count, int); - full_heal_mtx_out = ALLOC_MATRIX (priv->child_count, int); - output_dirty = alloca0 (priv->child_count * sizeof (int)); - output_matrix = ALLOC_MATRIX (priv->child_count, int); - - xdata = dict_new (); - if (!xdata) - return -1; +afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *sources, unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, struct afr_reply *replies, + unsigned char *locked_on) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int j = 0; + unsigned char *pending = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int **full_heal_mtx_in = NULL; + int **full_heal_mtx_out = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + + priv = this->private; + local = frame->local; + + pending = alloca0(priv->child_count); + + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int); + full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); + + xdata = dict_new(); + if (!xdata) + return -1; - afr_selfheal_extract_xattr (this, replies, type, input_dirty, - input_matrix); - - if (local->need_full_crawl) - afr_selfheal_extract_xattr (this, replies, AFR_DATA_TRANSACTION, - NULL, full_heal_mtx_in); - - for (i = 0; i < priv->child_count; i++) - if (sinks[i] && !healed_sinks[i]) - pending[i] = 1; - - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - if (pending[j]) { - output_matrix[i][j] = 1; - if (type == AFR_ENTRY_TRANSACTION) - full_heal_mtx_out[i][j] = 1; - } else if (locked_on[j]) { - output_matrix[i][j] = -input_matrix[i][j]; - if (type == AFR_ENTRY_TRANSACTION) - full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j]; - } - } - } - - for (i = 0; i < priv->child_count; i++) { - if (!pending[i]) - output_dirty[i] = -input_dirty[i]; - } - - for (i = 0; i < priv->child_count; i++) { - if (!locked_on[i]) - /* perform post-op only on subvols we had locked - and inspected on. - */ - continue; - if (undid_pending[i]) - /* We already unset the pending xattrs in - * _afr_fav_child_reset_sink_xattrs(). */ - continue; - - xattr = afr_selfheal_output_xattr (this, local->need_full_crawl, - type, output_dirty, - output_matrix, i, - full_heal_mtx_out); - if (!xattr) { - continue; - } - - if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) { - if (xdata && - dict_set_int8 (xdata, GF_XATTROP_PURGE_INDEX, 1)) - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_DICT_SET_FAILED, "Failed to set" - " dict value for %s", - GF_XATTROP_PURGE_INDEX); - } + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); + + if (local->need_full_crawl) + afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL, + full_heal_mtx_in); + + for (i = 0; i < priv->child_count; i++) + if (sinks[i] && !healed_sinks[i]) + pending[i] = 1; + + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (pending[j]) { + output_matrix[i][j] = 1; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = 1; + } else if (locked_on[j]) { + output_matrix[i][j] = -input_matrix[i][j]; + if (type == AFR_ENTRY_TRANSACTION) + full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j]; + } + } + } + + for (i = 0; i < priv->child_count; i++) { + if (!pending[i]) + output_dirty[i] = -input_dirty[i]; + } + + for (i = 0; i < priv->child_count; i++) { + if (!locked_on[i]) + /* perform post-op only on subvols we had locked + and inspected on. + */ + continue; + if (undid_pending[i]) + /* We already unset the pending xattrs in + * _afr_fav_child_reset_sink_xattrs(). */ + continue; + + xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type, + output_dirty, output_matrix, i, + full_heal_mtx_out); + if (!xattr) { + continue; + } + + if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) { + if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1)) + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED, + "Failed to set" + " dict value for %s", + GF_XATTROP_PURGE_INDEX); + } - afr_selfheal_post_op (frame, this, inode, i, xattr, xdata); - dict_unref (xattr); - } + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); + dict_unref(xattr); + } - if (xdata) - dict_unref (xdata); + if (xdata) + dict_unref(xdata); - return 0; + return 0; } void -afr_reply_copy (struct afr_reply *dst, struct afr_reply *src) -{ - dict_t *xdata = NULL; - - dst->valid = src->valid; - dst->op_ret = src->op_ret; - dst->op_errno = src->op_errno; - dst->prestat = src->prestat; - dst->poststat = src->poststat; - dst->preparent = src->preparent; - dst->postparent = src->postparent; - dst->preparent2 = src->preparent2; - dst->postparent2 = src->postparent2; - if (src->xdata) - xdata = dict_ref (src->xdata); - else - xdata = NULL; - if (dst->xdata) - dict_unref (dst->xdata); - dst->xdata = xdata; - memcpy (dst->checksum, src->checksum, MD5_DIGEST_LENGTH); +afr_reply_copy(struct afr_reply *dst, struct afr_reply *src) +{ + dict_t *xdata = NULL; + + dst->valid = src->valid; + dst->op_ret = src->op_ret; + dst->op_errno = src->op_errno; + dst->prestat = src->prestat; + dst->poststat = src->poststat; + dst->preparent = src->preparent; + dst->postparent = src->postparent; + dst->preparent2 = src->preparent2; + dst->postparent2 = src->postparent2; + if (src->xdata) + xdata = dict_ref(src->xdata); + else + xdata = NULL; + if (dst->xdata) + dict_unref(dst->xdata); + dst->xdata = xdata; + if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum", + _gf_false) == _gf_true) { + memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH); + } else { + memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH); + } + dst->fips_mode_rchecksum = src->fips_mode_rchecksum; } void -afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count) +afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count) { - int i = 0; + int i = 0; - if (dst == src) - return; + if (dst == src) + return; - for (i = 0; i < count; i++) { - afr_reply_copy (&dst[i], &src[i]); - } + for (i = 0; i < count; i++) { + afr_reply_copy(&dst[i], &src[i]); + } } int -afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol, - int idx, dict_t *xdata) +afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx, + dict_t *xdata) { - void *pending_raw = NULL; - int pending[3] = {0, }; + void *pending_raw = NULL; + int pending[3] = { + 0, + }; - if (!dirty) - return 0; + if (!dirty) + return 0; - if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw)) - return -1; + if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw)) + return -1; - if (!pending_raw) - return -1; + if (!pending_raw) + return -1; - memcpy (pending, pending_raw, sizeof(pending)); + memcpy(pending, pending_raw, sizeof(pending)); - dirty[subvol] = ntoh32 (pending[idx]); + dirty[subvol] = ntoh32(pending[idx]); - return 0; + return 0; } - int -afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, - int idx, dict_t *xdata) +afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx, + dict_t *xdata) { - int i = 0; - void *pending_raw = NULL; - int pending[3] = {0, }; - afr_private_t *priv = NULL; + int i = 0; + void *pending_raw = NULL; + int pending[3] = { + 0, + }; + afr_private_t *priv = NULL; - priv = this->private; + priv = this->private; - if (!matrix) - return 0; + if (!matrix) + return 0; - for (i = 0; i < priv->child_count; i++) { - if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw)) - continue; + for (i = 0; i < priv->child_count; i++) { + if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw)) + continue; - if (!pending_raw) - continue; + if (!pending_raw) + continue; - memcpy (pending, pending_raw, sizeof(pending)); + memcpy(pending, pending_raw, sizeof(pending)); - matrix[subvol][i] = ntoh32 (pending[idx]); - } + matrix[subvol][i] = ntoh32(pending[idx]); + } - return 0; + return 0; } - int -afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, - afr_transaction_type type, int *dirty, int **matrix) +afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies, + afr_transaction_type type, int *dirty, int **matrix) { - afr_private_t *priv = NULL; - int i = 0; - dict_t *xdata = NULL; - int idx = -1; + afr_private_t *priv = NULL; + int i = 0; + dict_t *xdata = NULL; + int idx = -1; - idx = afr_index_for_transaction_type (type); + idx = afr_index_for_transaction_type(type); - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid || replies[i].op_ret != 0) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret != 0) + continue; - if (!replies[i].xdata) - continue; + if (!replies[i].xdata) + continue; - xdata = replies[i].xdata; + xdata = replies[i].xdata; - afr_selfheal_fill_dirty (this, dirty, i, idx, xdata); - afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); - } + afr_selfheal_fill_dirty(this, dirty, i, idx, xdata); + afr_selfheal_fill_matrix(this, matrix, i, idx, xdata); + } - return 0; + return 0; } /* @@ -754,560 +819,566 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies, * This can happen if data was directly modified in the backend or for snapshots */ void -afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources, - struct afr_reply *replies) -{ - int i = 0; - afr_private_t *priv = NULL; - uint64_t size = 0; - - /* Find source with biggest file size */ - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (!replies[i].valid || replies[i].op_ret != 0) { - sources[i] = 0; - continue; - } - if (size <= replies[i].poststat.ia_size) { - size = replies[i].poststat.ia_size; - } +afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint64_t size = 0; + + /* Find source with biggest file size */ + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; } - - /* Mark sources with less size as not source */ - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (size > replies[i].poststat.ia_size) - sources[i] = 0; + if (size <= replies[i].poststat.ia_size) { + size = replies[i].poststat.ia_size; } + } + + /* Mark sources with less size as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (size > replies[i].poststat.ia_size) + sources[i] = 0; + } } void -afr_mark_latest_mtime_file_as_source (xlator_t *this, unsigned char *sources, - struct afr_reply *replies) -{ - int i = 0; - afr_private_t *priv = NULL; - uint32_t mtime = 0; - uint32_t mtime_nsec = 0; - - priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (!replies[i].valid || replies[i].op_ret != 0) { - sources[i] = 0; - continue; - } - if ((mtime < replies[i].poststat.ia_mtime) || - ((mtime == replies[i].poststat.ia_mtime) && - (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { - mtime = replies[i].poststat.ia_mtime; - mtime_nsec = replies[i].poststat.ia_mtime_nsec; - } +afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) +{ + int i = 0; + afr_private_t *priv = NULL; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (!replies[i].valid || replies[i].op_ret != 0) { + sources[i] = 0; + continue; } - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if ((mtime > replies[i].poststat.ia_mtime) || - ((mtime == replies[i].poststat.ia_mtime) && - (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) { - sources[i] = 0; - } + if ((mtime < replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) { + mtime = replies[i].poststat.ia_mtime; + mtime_nsec = replies[i].poststat.ia_mtime_nsec; + } + } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if ((mtime > replies[i].poststat.ia_mtime) || + ((mtime == replies[i].poststat.ia_mtime) && + (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) { + sources[i] = 0; } + } } void -afr_mark_active_sinks (xlator_t *this, unsigned char *sources, - unsigned char *locked_on, unsigned char *sinks) +afr_mark_active_sinks(xlator_t *this, unsigned char *sources, + unsigned char *locked_on, unsigned char *sinks) { - int i = 0; - afr_private_t *priv = NULL; + int i = 0; + afr_private_t *priv = NULL; - priv = this->private; + priv = this->private; - memset (sinks, 0, sizeof (*sinks) * priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (!sources[i] && locked_on[i]) - sinks[i] = 1; - } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i] && locked_on[i]) + sinks[i] = 1; + else + sinks[i] = 0; + } } gf_boolean_t -afr_dict_contains_heal_op (call_frame_t *frame) +afr_dict_contains_heal_op(call_frame_t *frame) { - afr_local_t *local = NULL; - dict_t *xdata_req = NULL; - int ret = 0; - int heal_op = -1; + afr_local_t *local = NULL; + dict_t *xdata_req = NULL; + int ret = 0; + int heal_op = -1; - local = frame->local; - xdata_req = local->xdata_req; - ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); - if (ret) - return _gf_false; - if (local->xdata_rsp == NULL) { - local->xdata_rsp = dict_new(); - if (!local->xdata_rsp) - return _gf_true; - } - ret = dict_set_str (local->xdata_rsp, "sh-fail-msg", - "File not in split-brain"); + local = frame->local; + xdata_req = local->xdata_req; + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + return _gf_false; + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) + return _gf_true; + } + ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg", + SFILE_NOT_IN_SPLIT_BRAIN); - return _gf_true; + return _gf_true; } gf_boolean_t -afr_can_decide_split_brain_source_sinks (struct afr_reply *replies, - int child_count) +afr_can_decide_split_brain_source_sinks(struct afr_reply *replies, + int child_count) { - int i = 0; + int i = 0; - for (i = 0; i < child_count; i++) - if (replies[i].valid != 1 || replies[i].op_ret != 0) - return _gf_false; + for (i = 0; i < child_count; i++) + if (replies[i].valid != 1 || replies[i].op_ret != 0) + return _gf_false; - return _gf_true; + return _gf_true; } int -afr_mark_split_brain_source_sinks_by_heal_op (call_frame_t *frame, - xlator_t *this, unsigned char *sources, - unsigned char *sinks, - unsigned char *healed_sinks, - unsigned char *locked_on, - struct afr_reply *replies, - afr_transaction_type type, int heal_op) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *xdata_req = NULL; - dict_t *xdata_rsp = NULL; - int ret = 0; - int i = 0; - char *name = NULL; - int source = -1; - - local = frame->local; - priv = this->private; - xdata_req = local->xdata_req; - - for (i = 0; i < priv->child_count; i++) { - if (locked_on[i]) - if (sources[i] || !sinks[i] || !healed_sinks[i]) { - ret = -1; - goto out; - } - } - if (local->xdata_rsp == NULL) { - local->xdata_rsp = dict_new(); - if (!local->xdata_rsp) { - ret = -1; - goto out; - } - } - xdata_rsp = local->xdata_rsp; - - if (!afr_can_decide_split_brain_source_sinks (replies, - priv->child_count)) { - ret = dict_set_str (xdata_rsp, "sh-fail-msg", - SBRAIN_HEAL_NO_GO_MSG); +afr_mark_split_brain_source_sinks_by_heal_op( + call_frame_t *frame, xlator_t *this, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type, int heal_op) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + dict_t *xdata_rsp = NULL; + int ret = 0; + int i = 0; + char *name = NULL; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + for (i = 0; i < priv->child_count; i++) { + if (locked_on[i]) + if (sources[i] || !sinks[i] || !healed_sinks[i]) { ret = -1; goto out; + } + } + if (local->xdata_rsp == NULL) { + local->xdata_rsp = dict_new(); + if (!local->xdata_rsp) { + ret = -1; + goto out; } + } + xdata_rsp = local->xdata_rsp; + + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRAIN_HEAL_NO_GO_MSG); + ret = -1; + goto out; + } - for (i = 0 ; i < priv->child_count; i++) - if (locked_on[i]) - sources[i] = 1; - switch (heal_op) { + for (i = 0; i < priv->child_count; i++) + if (locked_on[i]) + sources[i] = 1; + switch (heal_op) { case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE: - if (type == AFR_METADATA_TRANSACTION) { - ret = dict_set_str (xdata_rsp, "sh-fail-msg", - "Use source-brick option to" - " heal metadata split-brain"); - if (!ret) - ret = -1; - goto out; - } - afr_mark_largest_file_as_source (this, sources, replies); - if (AFR_COUNT (sources, priv->child_count) != 1) { - ret = dict_set_str (xdata_rsp, "sh-fail-msg", - "No bigger file"); - if (!ret) - ret = -1; - goto out; - } - break; + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if (!ret) + ret = -1; + goto out; + } + afr_mark_largest_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_BIGGER_FILE); + if (!ret) + ret = -1; + goto out; + } + break; case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME: - if (type == AFR_METADATA_TRANSACTION) { - ret = dict_set_str (xdata_rsp, "sh-fail-msg", - "Use source-brick option to" - " heal metadata split-brain"); - if (!ret) - ret = -1; - goto out; - } - afr_mark_latest_mtime_file_as_source (this, sources, replies); - if (AFR_COUNT (sources, priv->child_count) != 1) { - ret = dict_set_str (xdata_rsp, "sh-fail-msg", - "No difference in mtime"); - if (!ret) - ret = -1; - goto out; - } - break; + if (type == AFR_METADATA_TRANSACTION) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SUSE_SOURCE_BRICK_TO_HEAL); + if (!ret) + ret = -1; + goto out; + } + afr_mark_latest_mtime_file_as_source(this, sources, replies); + if (AFR_COUNT(sources, priv->child_count) != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SNO_DIFF_IN_MTIME); + if (!ret) + ret = -1; + goto out; + } + break; case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK: - ret = dict_get_str (xdata_req, "child-name", &name); - if (ret) - goto out; - source = afr_get_child_index_from_name (this, name); - if (source < 0) { - ret = dict_set_str (xdata_rsp, "sh-fail-msg", - "Invalid brick name"); - if (!ret) - ret = -1; - goto out; - } - if (locked_on[source] != 1) { - ret = dict_set_str (xdata_rsp, "sh-fail-msg", - "Brick is not up"); - if (!ret) - ret = -1; - goto out; - } - memset (sources, 0, sizeof (*sources) * priv->child_count); - sources[source] = 1; - break; - default: - ret = -1; + ret = dict_get_str_sizen(xdata_req, "child-name", &name); + if (ret) goto out; + source = afr_get_child_index_from_name(this, name); + if (source < 0) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SINVALID_BRICK_NAME); + if (!ret) + ret = -1; + goto out; + } + if (locked_on[source] != 1) { + ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg", + SBRICK_IS_NOT_UP); + if (!ret) + ret = -1; + goto out; + } + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; + break; + default: + ret = -1; + goto out; + } + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + break; } - for (i = 0 ; i < priv->child_count; i++) { - if (sources[i]) { - source = i; - break; - } - } - sinks[source] = 0; - healed_sinks[source] = 0; - ret = source; + } + sinks[source] = 0; + healed_sinks[source] = 0; + ret = source; out: - if (ret < 0) - memset (sources, 0, sizeof (*sources) * priv->child_count); - return ret; - + if (ret < 0) + memset(sources, 0, sizeof(*sources) * priv->child_count); + return ret; } int -afr_sh_fav_by_majority (xlator_t *this, struct afr_reply *replies, - inode_t *inode) -{ - afr_private_t *priv; - int vote_count = -1; - int fav_child = -1; - int i = 0; - int k = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (replies[i].valid == 1) { - gf_msg_debug (this->name, 0, "Child:%s " - "mtime_sec = %d, size = %lu for gfid %s", - priv->children[i]->name, - replies[i].poststat.ia_mtime, - replies[i].poststat.ia_size, - uuid_utoa (inode->gfid)); - vote_count = 0; - for (k = 0; k < priv->child_count; k++) { - if ((replies[k].poststat.ia_mtime == - replies[i].poststat.ia_mtime) && - (replies[k].poststat.ia_size == - replies[i].poststat.ia_size) - ) { - vote_count++; - } - } - if (vote_count > priv->child_count/2) { - fav_child = i; - break; - } +afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies, + inode_t *inode) +{ + afr_private_t *priv; + int vote_count = -1; + int fav_child = -1; + int i = 0; + int k = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64 + " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_size, uuid_utoa(inode->gfid)); + vote_count = 0; + for (k = 0; k < priv->child_count; k++) { + if ((replies[k].poststat.ia_mtime == + replies[i].poststat.ia_mtime) && + (replies[k].poststat.ia_size == + replies[i].poststat.ia_size)) { + vote_count++; } + } + if (vote_count > priv->child_count / 2) { + fav_child = i; + break; + } } - return fav_child; + } + return fav_child; } /* * afr_sh_fav_by_mtime: Choose favorite child by mtime. */ int -afr_sh_fav_by_mtime (xlator_t *this, struct afr_reply *replies, inode_t *inode) -{ - afr_private_t *priv; - int fav_child = -1; - int i = 0; - uint32_t cmp_mtime = 0; - uint32_t cmp_mtime_nsec = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (replies[i].valid == 1) { - gf_msg_debug (this->name, 0, "Child:%s " - "mtime = %d, mtime_nsec = %d for gfid %s", - priv->children[i]->name, - replies[i].poststat.ia_mtime, - replies[i].poststat.ia_mtime_nsec, - uuid_utoa (inode->gfid)); - if (replies[i].poststat.ia_mtime > cmp_mtime) { - cmp_mtime = replies[i].poststat.ia_mtime; - cmp_mtime_nsec = - replies[i].poststat.ia_mtime_nsec; - fav_child = i; - } else if ((replies[i].poststat.ia_mtime == cmp_mtime) - && (replies[i].poststat.ia_mtime_nsec > - cmp_mtime_nsec)) { - cmp_mtime = replies[i].poststat.ia_mtime; - cmp_mtime_nsec = - replies[i].poststat.ia_mtime_nsec; - fav_child = i; - } - } +afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_mtime = 0; + uint32_t cmp_mtime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s mtime = %" PRId64 + ", mtime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_mtime, + replies[i].poststat.ia_mtime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_mtime > cmp_mtime) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_mtime == cmp_mtime) && + (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) { + cmp_mtime = replies[i].poststat.ia_mtime; + cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec; + fav_child = i; + } } - return fav_child; + } + return fav_child; } /* * afr_sh_fav_by_ctime: Choose favorite child by ctime. */ int -afr_sh_fav_by_ctime (xlator_t *this, struct afr_reply *replies, inode_t *inode) -{ - afr_private_t *priv; - int fav_child = -1; - int i = 0; - uint32_t cmp_ctime = 0; - uint32_t cmp_ctime_nsec = 0; - - priv = this->private; - - for (i = 0; i < priv->child_count; i++) { - if (replies[i].valid == 1) { - gf_msg_debug (this->name, 0, "Child:%s " - "ctime = %d, ctime_nsec = %d for gfid %s", - priv->children[i]->name, - replies[i].poststat.ia_ctime, - replies[i].poststat.ia_ctime_nsec, - uuid_utoa (inode->gfid)); - if (replies[i].poststat.ia_ctime > cmp_ctime) { - cmp_ctime = replies[i].poststat.ia_ctime; - cmp_ctime_nsec = - replies[i].poststat.ia_ctime_nsec; - fav_child = i; - } else if ((replies[i].poststat.ia_ctime == cmp_ctime) - && (replies[i].poststat.ia_ctime_nsec > - cmp_ctime_nsec)) { - cmp_ctime = replies[i].poststat.ia_ctime; - cmp_ctime_nsec = - replies[i].poststat.ia_ctime_nsec; - fav_child = i; - } - } +afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode) +{ + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint32_t cmp_ctime = 0; + uint32_t cmp_ctime_nsec = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (replies[i].valid == 1) { + gf_msg_debug(this->name, 0, + "Child:%s ctime = %" PRId64 + ", ctime_nsec = %d for " + "gfid %s", + priv->children[i]->name, replies[i].poststat.ia_ctime, + replies[i].poststat.ia_ctime_nsec, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_ctime > cmp_ctime) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } else if ((replies[i].poststat.ia_ctime == cmp_ctime) && + (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) { + cmp_ctime = replies[i].poststat.ia_ctime; + cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec; + fav_child = i; + } } - return fav_child; + } + return fav_child; } /* - * afr_sh_fav_by_size: Choose favorite child by size. + * afr_sh_fav_by_size: Choose favorite child by size + * when not all files are of zero size. */ int -afr_sh_fav_by_size (xlator_t *this, struct afr_reply *replies, inode_t *inode) +afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode) { - afr_private_t *priv; - int fav_child = -1; - int i = 0; - uint64_t cmp_sz = 0; - - priv = this->private; + afr_private_t *priv; + int fav_child = -1; + int i = 0; + uint64_t cmp_sz = 0; - for (i = 0; i < priv->child_count; i++) { - if (replies[i].valid == 1) { - gf_msg_debug (this->name, 0, "Child:%s " - "file size = %lu for gfid %s", - priv->children[i]->name, - replies[i].poststat.ia_size, - uuid_utoa (inode->gfid)); - if (replies[i].poststat.ia_size > cmp_sz) { - cmp_sz = replies[i].poststat.ia_size; - fav_child = i; - } - } + priv = this->private; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) { + continue; } - return fav_child; + gf_msg_debug(this->name, 0, + "Child:%s file size = %" PRIu64 " for gfid %s", + priv->children[i]->name, replies[i].poststat.ia_size, + uuid_utoa(inode->gfid)); + if (replies[i].poststat.ia_type == IA_IFDIR) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Cannot perform selfheal on %s. " + "Size policy is not applicable to directories.", + uuid_utoa(inode->gfid)); + break; + } + if (replies[i].poststat.ia_size > cmp_sz) { + cmp_sz = replies[i].poststat.ia_size; + fav_child = i; + } else if (replies[i].poststat.ia_size == cmp_sz) { + fav_child = -1; + } + } + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "No bigger file"); + } + return fav_child; } int -afr_sh_get_fav_by_policy (xlator_t *this, struct afr_reply *replies, - inode_t *inode, char **policy_str) +afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies, + inode_t *inode, char **policy_str) { - afr_private_t *priv = NULL; - int fav_child = -1; + afr_private_t *priv = NULL; + int fav_child = -1; - priv = this->private; - if (!afr_can_decide_split_brain_source_sinks (replies, - priv->child_count)) { - return -1; - } + priv = this->private; + if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) { + return -1; + } - switch (priv->fav_child_policy) { + switch (priv->fav_child_policy) { case AFR_FAV_CHILD_BY_SIZE: - fav_child = afr_sh_fav_by_size (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "SIZE"; - } - break; + fav_child = afr_sh_fav_by_size(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "SIZE"; + } + break; case AFR_FAV_CHILD_BY_CTIME: - fav_child = afr_sh_fav_by_ctime (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "CTIME"; - } - break; + fav_child = afr_sh_fav_by_ctime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "CTIME"; + } + break; case AFR_FAV_CHILD_BY_MTIME: - fav_child = afr_sh_fav_by_mtime (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "MTIME"; - } - break; + fav_child = afr_sh_fav_by_mtime(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MTIME"; + } + break; case AFR_FAV_CHILD_BY_MAJORITY: - fav_child = afr_sh_fav_by_majority (this, replies, inode); - if (policy_str && fav_child >= 0) { - *policy_str = "MAJORITY"; - } - break; + fav_child = afr_sh_fav_by_majority(this, replies, inode); + if (policy_str && fav_child >= 0) { + *policy_str = "MAJORITY"; + } + break; case AFR_FAV_CHILD_NONE: default: - break; - } + break; + } - return fav_child; + return fav_child; } int -afr_mark_split_brain_source_sinks_by_policy (call_frame_t *frame, - xlator_t *this, - inode_t *inode, - unsigned char *sources, - unsigned char *sinks, - unsigned char *healed_sinks, - unsigned char *locked_on, - struct afr_reply *replies, - afr_transaction_type type) -{ - afr_private_t *priv = NULL; - int fav_child = -1; - char mtime_str[256]; - char ctime_str[256]; - char *policy_str = NULL; - struct tm *tm_ptr; - time_t time; - - priv = this->private; - - fav_child = afr_sh_get_fav_by_policy (this, replies, inode, - &policy_str); - if (fav_child > priv->child_count - 1) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Invalid child (%d) " - "selected by policy %s.", fav_child, policy_str); - } else if (fav_child >= 0) { - time = replies[fav_child].poststat.ia_mtime; - tm_ptr = localtime (&time); - strftime (mtime_str, sizeof (mtime_str), "%Y-%m-%d %H:%M:%S", - tm_ptr); - time = replies[fav_child].poststat.ia_ctime; - tm_ptr = localtime (&time); - strftime (ctime_str, sizeof (ctime_str), "%Y-%m-%d %H:%M:%S", - tm_ptr); - - gf_msg (this->name, GF_LOG_WARNING, 0, - AFR_MSG_SBRAIN_FAV_CHILD_POLICY, "Source %s " - "selected as authentic to resolve conflicting " - "data in file (gfid:%s) by %s (%lu bytes @ %s mtime, " - "%s ctime).", - priv->children[fav_child]->name, - uuid_utoa (inode->gfid), - policy_str, - replies[fav_child].poststat.ia_size, - mtime_str, - ctime_str); - - sources[fav_child] = 1; - sinks[fav_child] = 0; - healed_sinks[fav_child] = 0; - } - return fav_child; +afr_mark_split_brain_source_sinks_by_policy( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) +{ + afr_private_t *priv = NULL; + int fav_child = -1; + char mtime_str[256]; + char ctime_str[256]; + char *policy_str = NULL; + struct tm *tm_ptr; + time_t time; + + priv = this->private; + + fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str); + if (fav_child == -1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "No child selected by favorite-child policy."); + } else if (fav_child > priv->child_count - 1) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Invalid child (%d) " + "selected by policy %s.", + fav_child, policy_str); + } else if (fav_child >= 0) { + time = replies[fav_child].poststat.ia_mtime; + tm_ptr = localtime(&time); + strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + time = replies[fav_child].poststat.ia_ctime; + tm_ptr = localtime(&time); + strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr); + + gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY, + "Source %s selected as authentic to resolve conflicting data " + "in file (gfid:%s) by %s (%" PRIu64 + " bytes @ %s mtime, %s " + "ctime).", + priv->children[fav_child]->name, uuid_utoa(inode->gfid), + policy_str, replies[fav_child].poststat.ia_size, mtime_str, + ctime_str); + + sources[fav_child] = 1; + sinks[fav_child] = 0; + healed_sinks[fav_child] = 0; + } + return fav_child; } -int -afr_mark_source_sinks_if_file_empty (xlator_t *this, unsigned char *sources, - unsigned char *sinks, - unsigned char *healed_sinks, - unsigned char *locked_on, - struct afr_reply *replies, - afr_transaction_type type) -{ - int source = -1; - int i = 0; - afr_private_t *priv = this->private; - struct iatt stbuf = {0, }; - - if ((AFR_COUNT (locked_on, priv->child_count) < priv->child_count) || - (afr_success_count(replies, priv->child_count) < priv->child_count)) - return -1; +gf_boolean_t +afr_is_file_empty_on_all_children(afr_private_t *priv, + struct afr_reply *replies) +{ + int i = 0; - if (type == AFR_DATA_TRANSACTION) { - for (i = 0; i < priv->child_count; i++) { - if (replies[i].poststat.ia_size != 0) - return -1; - } + for (i = 0; i < priv->child_count; i++) { + if ((!replies[i].valid) || (replies[i].op_ret != 0) || + (replies[i].poststat.ia_size != 0)) + return _gf_false; + } - goto mark; - } + return _gf_true; +} - /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/ - stbuf = replies[0].poststat; - for (i = 1; i < priv->child_count; i++) { - if ((!IA_EQUAL (stbuf, replies[i].poststat, type)) || - (!IA_EQUAL (stbuf, replies[i].poststat, uid)) || - (!IA_EQUAL (stbuf, replies[i].poststat, gid)) || - (!IA_EQUAL (stbuf, replies[i].poststat, prot))) - return -1; - } - for (i = 1; i < priv->child_count; i++) { - if (!afr_xattrs_are_equal (replies[0].xdata, - replies[i].xdata)) - return -1; - } +int +afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources, + unsigned char *sinks, + unsigned char *healed_sinks, + unsigned char *locked_on, + struct afr_reply *replies, + afr_transaction_type type) +{ + int source = -1; + int i = 0; + afr_private_t *priv = this->private; + struct iatt stbuf = { + 0, + }; + + if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) || + (afr_success_count(replies, priv->child_count) < priv->child_count)) + return -1; + + if (type == AFR_DATA_TRANSACTION) { + if (!afr_is_file_empty_on_all_children(priv, replies)) + return -1; + goto mark; + } + + /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/ + stbuf = replies[0].poststat; + for (i = 1; i < priv->child_count; i++) { + if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) || + (!IA_EQUAL(stbuf, replies[i].poststat, uid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, gid)) || + (!IA_EQUAL(stbuf, replies[i].poststat, prot))) + return -1; + } + for (i = 1; i < priv->child_count; i++) { + if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata)) + return -1; + } mark: - /* data/metadata is same on all bricks. Pick one of them as source. Rest - * are sinks.*/ - for (i = 0 ; i < priv->child_count; i++) { - if (source == -1) { - source = i; - sources[i] = 1; - sinks[i] = 0; - healed_sinks[i] = 0; - continue; - } - sources[i] = 0; - sinks[i] = 1; - healed_sinks[i] = 1; + /* data/metadata is same on all bricks. Pick one of them as source. Rest + * are sinks.*/ + for (i = 0; i < priv->child_count; i++) { + if (source == -1) { + source = i; + sources[i] = 1; + sinks[i] = 0; + healed_sinks[i] = 0; + continue; } + sources[i] = 0; + sinks[i] = 1; + healed_sinks[i] = 1; + } - return source; + return source; } /* Return a source depending on the type of heal_op, and set sources[source], @@ -1318,141 +1389,156 @@ mark: * sinks[node] are 1. This should be the case if the file is in split-brain. */ int -afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this, - inode_t *inode, - unsigned char *sources, - unsigned char *sinks, - unsigned char *healed_sinks, - unsigned char *locked_on, - struct afr_reply *replies, - afr_transaction_type type) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - dict_t *xdata_req = NULL; - int heal_op = -1; - int ret = -1; - int source = -1; - - local = frame->local; - priv = this->private; - xdata_req = local->xdata_req; - - source = afr_mark_source_sinks_if_file_empty (this, sources, sinks, - healed_sinks, locked_on, - replies, type); - if (source >= 0) - return source; - - ret = dict_get_int32 (xdata_req, "heal-op", &heal_op); - if (ret) - goto autoheal; - - source = afr_mark_split_brain_source_sinks_by_heal_op (frame, this, - sources, sinks, - healed_sinks, - locked_on, replies, - type, heal_op); +afr_mark_split_brain_source_sinks( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + struct afr_reply *replies, afr_transaction_type type) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xdata_req = NULL; + int heal_op = -1; + int ret = -1; + int source = -1; + + local = frame->local; + priv = this->private; + xdata_req = local->xdata_req; + + source = afr_mark_source_sinks_if_file_empty( + this, sources, sinks, healed_sinks, locked_on, replies, type); + if (source >= 0) return source; + ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op); + if (ret) + goto autoheal; + + source = afr_mark_split_brain_source_sinks_by_heal_op( + frame, this, sources, sinks, healed_sinks, locked_on, replies, type, + heal_op); + return source; + autoheal: - /* Automatically heal if fav_child_policy is set. */ - if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) { - source = afr_mark_split_brain_source_sinks_by_policy (frame, - this, - inode, - sources, - sinks, - healed_sinks, - locked_on, - replies, - type); - if (source != -1) { - ret = dict_set_int32 (xdata_req, "fav-child-policy", 1); - if (ret) - return -1; - } + /* Automatically heal if fav_child_policy is set. */ + if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) { + source = afr_mark_split_brain_source_sinks_by_policy( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, type); + if (source != -1) { + ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1); + if (ret) + return -1; } + } - return source; + return source; } int -_afr_fav_child_reset_sink_xattrs (call_frame_t *frame, xlator_t *this, - inode_t *inode, int source, - unsigned char *healed_sinks, - unsigned char *undid_pending, - afr_transaction_type type, - unsigned char *locked_on, - struct afr_reply *replies) +_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this, + inode_t *inode, int source, + unsigned char *healed_sinks, + unsigned char *undid_pending, + afr_transaction_type type, + unsigned char *locked_on, + struct afr_reply *replies) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int *input_dirty = NULL; - int **input_matrix = NULL; - int *output_dirty = NULL; - int **output_matrix = NULL; - dict_t *xattr = NULL; - dict_t *xdata = NULL; - int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int *input_dirty = NULL; + int **input_matrix = NULL; + int *output_dirty = NULL; + int **output_matrix = NULL; + dict_t *xattr = NULL; + dict_t *xdata = NULL; + int i = 0; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - if (!dict_get (local->xdata_req, "fav-child-policy")) - return 0; + if (!dict_get_sizen(local->xdata_req, "fav-child-policy")) + return 0; - xdata = dict_new(); - if (!xdata) - return -1; + xdata = dict_new(); + if (!xdata) + return -1; - input_dirty = alloca0 (priv->child_count * sizeof (int)); - input_matrix = ALLOC_MATRIX (priv->child_count, int); - output_dirty = alloca0 (priv->child_count * sizeof (int)); - output_matrix = ALLOC_MATRIX (priv->child_count, int); + input_dirty = alloca0(priv->child_count * sizeof(int)); + input_matrix = ALLOC_MATRIX(priv->child_count, int); + output_dirty = alloca0(priv->child_count * sizeof(int)); + output_matrix = ALLOC_MATRIX(priv->child_count, int); - afr_selfheal_extract_xattr (this, replies, type, input_dirty, - input_matrix); + afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix); - for (i = 0; i < priv->child_count; i++) { - if (i == source || !healed_sinks[i]) - continue; - output_dirty[i] = -input_dirty[i]; - output_matrix[i][source] = -input_matrix[i][source]; - } + for (i = 0; i < priv->child_count; i++) { + if (i == source || !healed_sinks[i]) + continue; + output_dirty[i] = -input_dirty[i]; + output_matrix[i][source] = -input_matrix[i][source]; + } - for (i = 0; i < priv->child_count; i++) { - if (!healed_sinks[i] || !locked_on[i]) - continue; - xattr = afr_selfheal_output_xattr (this, _gf_false, type, - output_dirty, output_matrix, - i, NULL); + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] || !locked_on[i]) + continue; + xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty, + output_matrix, i, NULL); - afr_selfheal_post_op (frame, this, inode, i, xattr, xdata); + afr_selfheal_post_op(frame, this, inode, i, xattr, xdata); - undid_pending[i] = 1; - dict_unref (xattr); - } + undid_pending[i] = 1; + dict_unref(xattr); + } - if (xdata) - dict_unref (xdata); + if (xdata) + dict_unref(xdata); - return 0; + return 0; } gf_boolean_t -afr_does_witness_exist (xlator_t *this, uint64_t *witness) +afr_does_witness_exist(xlator_t *this, uint64_t *witness) { - int i = 0; - afr_private_t *priv = NULL; + int i = 0; + afr_private_t *priv = NULL; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (witness[i]) - return _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (witness[i]) + return _gf_true; + } + return _gf_false; +} + +unsigned int +afr_get_quorum_count(afr_private_t *priv) +{ + if (priv->quorum_count == AFR_QUORUM_AUTO) { + return priv->child_count / 2 + 1; + } else { + return priv->quorum_count; + } +} + +void +afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused, + unsigned char *sources, + unsigned char *locked_on) +{ + int i = 0; + unsigned int quorum_count = 0; + + if (AFR_COUNT(sources, priv->child_count) != 0) + return; + + quorum_count = afr_get_quorum_count(priv); + for (i = 0; i < priv->child_count; i++) { + if ((accused[i] < quorum_count) && locked_on[i]) { + sources[i] = 1; } - return _gf_false; + } + return; } /* @@ -1475,691 +1561,711 @@ afr_does_witness_exist (xlator_t *this, uint64_t *witness) */ int -afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, - struct afr_reply *replies, - afr_transaction_type type, - unsigned char *locked_on, unsigned char *sources, - unsigned char *sinks, uint64_t *witness, - gf_boolean_t *pflag) -{ - afr_private_t *priv = NULL; - int i = 0; - int j = 0; - int *dirty = NULL; /* Denotes if dirty xattr is set */ - int **matrix = NULL;/* Changelog matrix */ - char *accused = NULL;/* Accused others without any self-accusal */ - char *pending = NULL;/* Have pending operations on others */ - char *self_accused = NULL; /* Accused itself */ - - priv = this->private; - - dirty = alloca0 (priv->child_count * sizeof (int)); - accused = alloca0 (priv->child_count); - pending = alloca0 (priv->child_count); - self_accused = alloca0 (priv->child_count); - matrix = ALLOC_MATRIX(priv->child_count, int); - memset (witness, 0, sizeof (*witness) * priv->child_count); - - /* First construct the pending matrix for further analysis */ - afr_selfheal_extract_xattr (this, replies, type, dirty, matrix); - - if (pflag) { - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) - if (matrix[i][j]) - *pflag = _gf_true; - if (*pflag) - break; - } +afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this, + struct afr_reply *replies, + afr_transaction_type type, unsigned char *locked_on, + unsigned char *sources, unsigned char *sinks, + uint64_t *witness, unsigned char *pflag) +{ + afr_private_t *priv = NULL; + int i = 0; + int j = 0; + int *dirty = NULL; /* Denotes if dirty xattr is set */ + int **matrix = NULL; /* Changelog matrix */ + char *accused = NULL; /* Accused others without any self-accusal */ + char *pending = NULL; /* Have pending operations on others */ + char *self_accused = NULL; /* Accused itself */ + + priv = this->private; + + dirty = alloca0(priv->child_count * sizeof(int)); + accused = alloca0(priv->child_count); + pending = alloca0(priv->child_count); + self_accused = alloca0(priv->child_count); + matrix = ALLOC_MATRIX(priv->child_count, int); + memset(witness, 0, sizeof(*witness) * priv->child_count); + + /* First construct the pending matrix for further analysis */ + afr_selfheal_extract_xattr(this, replies, type, dirty, matrix); + + if (pflag) { + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j]) + *pflag |= PFLAG_PENDING; + if (*pflag) + break; } - - if (afr_success_count (replies, - priv->child_count) < AFR_SH_MIN_PARTICIPANTS) { - /* Treat this just like locks not being acquired */ - return -ENOTCONN; + } + + if (afr_success_count(replies, priv->child_count) < priv->child_count) { + /* Treat this just like locks not being acquired */ + return -ENOTCONN; + } + + /* short list all self-accused */ + for (i = 0; i < priv->child_count; i++) { + if (matrix[i][i]) + self_accused[i] = 1; + } + + /* Next short list all accused to exclude them from being sources */ + /* Self-accused can't accuse others as they are FOOLs */ + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) { + if (!self_accused[i]) + accused[j] += 1; + if (i != j) + pending[i] += 1; + } } + } - /* short list all self-accused */ - for (i = 0; i < priv->child_count; i++) { - if (matrix[i][i]) - self_accused[i] = 1; + /* Short list all non-accused as sources */ + for (i = 0; i < priv->child_count; i++) { + if (!accused[i] && locked_on[i]) + sources[i] = 1; + else + sources[i] = 0; + } + + /* Everyone accused by non-self-accused sources are sinks */ + memset(sinks, 0, priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) + sinks[j] = 1; } - - /* Next short list all accused to exclude them from being sources */ - /* Self-accused can't accuse others as they are FOOLs */ - for (i = 0; i < priv->child_count; i++) { - for (j = 0; j < priv->child_count; j++) { - if (matrix[i][j]) { - if (!self_accused[i]) - accused[j] = 1; - - if (i != j) - pending[i] = 1; - } - } - } - - /* Short list all non-accused as sources */ - memset (sources, 0, priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (!accused[i] && locked_on[i]) - sources[i] = 1; - } - - /* Everyone accused by non-self-accused sources are sinks */ - memset (sinks, 0, priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (self_accused[i]) - continue; - for (j = 0; j < priv->child_count; j++) { - if (matrix[i][j]) - sinks[j] = 1; - } + } + + /* For breaking ties provide with number of fops they witnessed */ + + /* + * count the pending fops witnessed from itself to others when it is + * self-accused + */ + for (i = 0; i < priv->child_count; i++) { + if (!self_accused[i]) + continue; + for (j = 0; j < priv->child_count; j++) { + if (i == j) + continue; + witness[i] += matrix[i][j]; } + } - /* For breaking ties provide with number of fops they witnessed */ + if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) + afr_selfheal_post_op_failure_accounting(priv, accused, sources, + locked_on); - /* - * count the pending fops witnessed from itself to others when it is - * self-accused - */ + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT(sources, priv->child_count) == 0) { for (i = 0; i < priv->child_count; i++) { - if (!self_accused[i]) - continue; - for (j = 0; j < priv->child_count; j++) { - if (i == j) - continue; - witness[i] += matrix[i][j]; - } + if (locked_on[i]) + sinks[i] = 1; } - - /* If no sources, all locked nodes are sinks - split brain */ - if (AFR_COUNT (sources, priv->child_count) == 0) { - for (i = 0; i < priv->child_count; i++) { - if (locked_on[i]) - sinks[i] = 1; - } + if (pflag) + *pflag |= PFLAG_SBRAIN; + } + + /* One more class of witness similar to dirty in v2 is where no pending + * exists but we have self-accusing markers. This can happen in afr-v1 + * if the brick crashes just after doing xattrop on self but + * before xattrop on the other xattrs on the brick in pre-op. */ + if (AFR_COUNT(pending, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { + if (self_accused[i]) + witness[i] += matrix[i][i]; } - - /* One more class of witness similar to dirty in v2 is where no pending - * exists but we have self-accusing markers. This can happen in afr-v1 - * if the brick crashes just after doing xattrop on self but - * before xattrop on the other xattrs on the brick in pre-op. */ - if (AFR_COUNT (pending, priv->child_count) == 0) { - for (i = 0; i < priv->child_count; i++) { - if (self_accused[i]) - witness[i] += matrix[i][i]; - } - } else { - /* In afr-v1 if a file is self-accused and has pending - * operations on others then it is similar to 'dirty' in afr-v2. - * Consider such cases as witness. - */ - for (i = 0; i < priv->child_count; i++) { - if (self_accused[i] && pending[i]) - witness[i] += matrix[i][i]; - } + } else { + /* In afr-v1 if a file is self-accused and has pending + * operations on others then it is similar to 'dirty' in afr-v2. + * Consider such cases as witness. + */ + for (i = 0; i < priv->child_count; i++) { + if (self_accused[i] && pending[i]) + witness[i] += matrix[i][i]; } + } + /* count the number of dirty fops witnessed */ + for (i = 0; i < priv->child_count; i++) + witness[i] += dirty[i]; - /* count the number of dirty fops witnessed */ - for (i = 0; i < priv->child_count; i++) - witness[i] += dirty[i]; - - return 0; + return 0; } void -afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type, - int source, unsigned char *sources, - unsigned char *healed_sinks) -{ - char *status = NULL; - char *sinks_str = NULL; - char *p = NULL; - char *sources_str = NULL; - char *q = NULL; - afr_private_t *priv = NULL; - gf_loglevel_t loglevel = GF_LOG_NONE; - int i = 0; - - priv = this->private; - sinks_str = alloca0 (priv->child_count * 8); - p = sinks_str; - sources_str = alloca0 (priv->child_count * 8); - q = sources_str; - for (i = 0; i < priv->child_count; i++) { - if (healed_sinks[i]) - p += sprintf (p, "%d ", i); - if (sources[i]) { - if (source == i) { - q += sprintf (q, "[%d] ", i); - } else { - q += sprintf (q, "%d ", i); - } - } +afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source, + unsigned char *sources, unsigned char *healed_sinks) +{ + char *status = NULL; + char *sinks_str = NULL; + char *p = NULL; + char *sources_str = NULL; + char *q = NULL; + afr_private_t *priv = NULL; + gf_loglevel_t loglevel = GF_LOG_NONE; + int i = 0; + + priv = this->private; + sinks_str = alloca0(priv->child_count * 8); + p = sinks_str; + sources_str = alloca0(priv->child_count * 8); + q = sources_str; + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i]) + p += sprintf(p, "%d ", i); + if (sources[i]) { + if (source == i) { + q += sprintf(q, "[%d] ", i); + } else { + q += sprintf(q, "%d ", i); + } } + } - if (ret < 0) { - status = "Failed"; - loglevel = GF_LOG_DEBUG; - } else { - status = "Completed"; - loglevel = GF_LOG_INFO; - } + if (ret < 0) { + status = "Failed"; + loglevel = GF_LOG_DEBUG; + } else { + status = "Completed"; + loglevel = GF_LOG_INFO; + } - gf_msg (this->name, loglevel, 0, - AFR_MSG_SELF_HEAL_INFO, "%s %s selfheal on %s. " - "sources=%s sinks=%s", status, type, uuid_utoa (gfid), - sources_str, sinks_str); + gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO, + "%s %s selfheal on %s. " + "sources=%s sinks=%s", + status, type, uuid_utoa(gfid), sources_str, sinks_str); } int -afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, inode_t *inode, - struct iatt *buf, dict_t *xdata, struct iatt *parbuf) -{ - afr_local_t *local = NULL; - int i = -1; - GF_UNUSED int ret = -1; - int8_t need_heal = 1; - - local = frame->local; - i = (long) cookie; - - local->replies[i].valid = 1; - local->replies[i].op_ret = op_ret; - local->replies[i].op_errno = op_errno; - if (buf) - local->replies[i].poststat = *buf; - if (parbuf) - local->replies[i].postparent = *parbuf; - if (xdata) { - local->replies[i].xdata = dict_ref (xdata); - ret = dict_get_int8 (xdata, "link-count", &need_heal); - local->replies[i].need_heal = need_heal; - } else { - local->replies[i].need_heal = need_heal; - } - - syncbarrier_wake (&local->barrier); +afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *parbuf) +{ + afr_local_t *local = NULL; + int i = -1; + GF_UNUSED int ret = -1; + int8_t need_heal = 1; - return 0; -} + local = frame->local; + i = (long)cookie; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (buf) + local->replies[i].poststat = *buf; + if (parbuf) + local->replies[i].postparent = *parbuf; + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + ret = dict_get_int8(xdata, "link-count", &need_heal); + } -inode_t * -afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent, - const char *name, struct afr_reply *replies, - unsigned char *lookup_on, dict_t *xattr) -{ - loc_t loc = {0, }; - dict_t *xattr_req = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - inode_t *inode = NULL; + local->replies[i].need_heal = need_heal; + syncbarrier_wake(&local->barrier); - local = frame->local; - priv = frame->this->private; + return 0; +} - xattr_req = dict_new (); - if (!xattr_req) - return NULL; +inode_t * +afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent, + const char *name, struct afr_reply *replies, + unsigned char *lookup_on, dict_t *xattr) +{ + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + inode_t *inode = NULL; + + local = frame->local; + priv = frame->this->private; + + xattr_req = dict_new(); + if (!xattr_req) + return NULL; - if (xattr) - dict_copy (xattr, xattr_req); + if (xattr) + dict_copy(xattr, xattr_req); - if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { - dict_unref (xattr_req); - return NULL; - } + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return NULL; + } - inode = inode_new (parent->table); - if (!inode) { - dict_unref (xattr_req); - return NULL; - } + inode = inode_new(parent->table); + if (!inode) { + dict_unref(xattr_req); + return NULL; + } - loc.parent = inode_ref (parent); - gf_uuid_copy (loc.pargfid, parent->gfid); - loc.name = name; - loc.inode = inode_ref (inode); + loc.parent = inode_ref(parent); + gf_uuid_copy(loc.pargfid, parent->gfid); + loc.name = name; + loc.inode = inode_ref(inode); - AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, - xattr_req); + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - afr_replies_copy (replies, local->replies, priv->child_count); + afr_replies_copy(replies, local->replies, priv->child_count); - loc_wipe (&loc); - dict_unref (xattr_req); + loc_wipe(&loc); + dict_unref(xattr_req); - return inode; + return inode; } -int -afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies, - unsigned char *discover_on) +static int +afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict) { - loc_t loc = {0, }; - dict_t *xattr_req = NULL; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; + int ret = 0; + afr_private_t *priv = NULL; + char *key1 = NULL; + char *key2 = NULL; + + priv = this->private; + key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(this->name)); + key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 + + strlen(priv->sh_domain)); - local = frame->local; - priv = frame->this->private; + ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1); + if (ret) + return ret; - xattr_req = dict_new (); - if (!xattr_req) - return -ENOMEM; + sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name); + ret = dict_set_uint32(dict, key1, 1); + if (ret) + return ret; - if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) { - dict_unref (xattr_req); - return -ENOMEM; - } + sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain); + ret = dict_set_uint32(dict, key2, 1); + if (ret) + return ret; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, gfid); + return 0; +} + +int +afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode, + uuid_t gfid, struct afr_reply *replies, + unsigned char *discover_on, dict_t *dict) +{ + loc_t loc = { + 0, + }; + dict_t *xattr_req = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + priv = frame->this->private; + + xattr_req = dict_new(); + if (!xattr_req) + return -ENOMEM; + if (dict) + dict_copy(dict, xattr_req); + + if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) { + dict_unref(xattr_req); + return -ENOMEM; + } + + if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) { + dict_unref(xattr_req); + return -1; + } - AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, - xattr_req); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, gfid); - afr_replies_copy (replies, local->replies, priv->child_count); + AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xattr_req); - loc_wipe (&loc); - dict_unref (xattr_req); + afr_replies_copy(replies, local->replies, priv->child_count); - return 0; + loc_wipe(&loc); + dict_unref(xattr_req); + + return 0; } int -afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode, - uuid_t gfid, struct afr_reply *replies) +afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid, + struct afr_reply *replies) { - afr_private_t *priv = NULL; + afr_local_t *local = NULL; + dict_t *dict = NULL; + + local = frame->local; - priv = frame->this->private; + if (local->xattr_req) + dict = local->xattr_req; - return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies, - priv->child_up); + return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies, + local->child_up, dict); } unsigned int -afr_success_count (struct afr_reply *replies, unsigned int count) +afr_success_count(struct afr_reply *replies, unsigned int count) { - int i = 0; - unsigned int success = 0; + int i = 0; + unsigned int success = 0; - for (i = 0; i < count; i++) - if (replies[i].valid && replies[i].op_ret == 0) - success++; - return success; + for (i = 0; i < count; i++) + if (replies[i].valid && replies[i].op_ret == 0) + success++; + return success; } int -afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, dict_t *xdata) +afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xdata) { - afr_local_t *local = NULL; - int i = 0; + afr_local_t *local = NULL; + int i = 0; - local = frame->local; - i = (long) cookie; + local = frame->local; + i = (long)cookie; - local->replies[i].valid = 1; - local->replies[i].op_ret = op_ret; - local->replies[i].op_errno = op_errno; + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; - syncbarrier_wake (&local->barrier); + syncbarrier_wake(&local->barrier); - return 0; + return 0; } - int -afr_locked_fill (call_frame_t *frame, xlator_t *this, - unsigned char *locked_on) +afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on) { - int i = 0; - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - int count = 0; + int i = 0; + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int count = 0; - local = frame->local; - priv = this->private; + local = frame->local; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].valid && local->replies[i].op_ret == 0) { - locked_on[i] = 1; - count++; - } else { - locked_on[i] = 0; - } - } + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].valid && local->replies[i].op_ret == 0) { + locked_on[i] = 1; + count++; + } else { + locked_on[i] = 0; + } + } - return count; + return count; } - int -afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, off_t off, size_t size, - unsigned char *locked_on) +afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) { - loc_t loc = {0,}; - struct gf_flock flock = {0, }; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - flock.l_type = F_WRLCK; - flock.l_start = off; - flock.l_len = size; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, - &loc, F_SETLK, &flock, NULL); + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - loc_wipe (&loc); + loc_wipe(&loc); - return afr_locked_fill (frame, this, locked_on); + return afr_locked_fill(frame, this, locked_on); } - int -afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, off_t off, size_t size, - unsigned char *locked_on) -{ - loc_t loc = {0,}; - struct gf_flock flock = {0, }; - afr_local_t *local = NULL; - int i = 0; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); - - flock.l_type = F_WRLCK; - flock.l_start = off; - flock.l_len = size; - - AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, - &loc, F_SETLK, &flock, NULL); - - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].op_ret == -1 && - local->replies[i].op_errno == EAGAIN) { - afr_locked_fill (frame, this, locked_on); - afr_selfheal_uninodelk (frame, this, inode, dom, off, - size, locked_on); - - AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, - &loc, F_SETLKW, &flock, NULL); - break; - } - } +afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; + + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, + locked_on); + + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + break; + } + } - loc_wipe (&loc); + loc_wipe(&loc); - return afr_locked_fill (frame, this, locked_on); + return afr_locked_fill(frame, this, locked_on); } static void -afr_get_lock_and_eagain_counts (afr_private_t *priv, struct afr_reply *replies, - int *lock_count, int *eagain_count) -{ - int i = 0; - - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; - if (replies[i].op_ret == 0) { - (*lock_count)++; - } else if (replies[i].op_ret == -1 && - replies[i].op_errno == EAGAIN) { - (*eagain_count)++; - } - } +afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies, + int *lock_count, int *eagain_count) +{ + int i = 0; + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == 0) { + (*lock_count)++; + } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) { + (*eagain_count)++; + } + } } /*Do blocking locks if number of locks acquired is majority and there were some * EAGAINs. Useful for odd-way replication*/ int -afr_selfheal_tie_breaker_inodelk (call_frame_t *frame, xlator_t *this, - inode_t *inode, char *dom, off_t off, - size_t size, unsigned char *locked_on) +afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, off_t off, + size_t size, unsigned char *locked_on) { - loc_t loc = {0,}; - struct gf_flock flock = {0, }; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int lock_count = 0; - int eagain_count = 0; + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - flock.l_type = F_WRLCK; - flock.l_start = off; - flock.l_len = size; + flock.l_type = F_WRLCK; + flock.l_start = off; + flock.l_len = size; - AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom, - &loc, F_SETLK, &flock, NULL); + AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock, + NULL); - afr_get_lock_and_eagain_counts (priv, local->replies, &lock_count, - &eagain_count); + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); - if (lock_count > priv->child_count/2 && eagain_count) { - afr_locked_fill (frame, this, locked_on); - afr_selfheal_uninodelk (frame, this, inode, dom, off, - size, locked_on); + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on); - AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom, - &loc, F_SETLKW, &flock, NULL); - } + AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW, + &flock, NULL); + } - loc_wipe (&loc); + loc_wipe(&loc); - return afr_locked_fill (frame, this, locked_on); + return afr_locked_fill(frame, this, locked_on); } int -afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, off_t off, size_t size, - const unsigned char *locked_on) +afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, off_t off, size_t size, + const unsigned char *locked_on) { - loc_t loc = {0,}; - struct gf_flock flock = {0, }; - + loc_t loc = { + 0, + }; + struct gf_flock flock = { + 0, + }; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - flock.l_type = F_UNLCK; - flock.l_start = off; - flock.l_len = size; + flock.l_type = F_UNLCK; + flock.l_start = off; + flock.l_len = size; - AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk, - dom, &loc, F_SETLK, &flock, NULL); + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, + F_SETLK, &flock, NULL); - loc_wipe (&loc); + loc_wipe(&loc); - return 0; + return 0; } - int -afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, const char *name, unsigned char *locked_on) +afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) { - loc_t loc = {0,}; + loc_t loc = { + 0, + }; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, - &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - loc_wipe (&loc); + loc_wipe(&loc); - return afr_locked_fill (frame, this, locked_on); + return afr_locked_fill(frame, this, locked_on); } - int -afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, const char *name, unsigned char *locked_on) -{ - loc_t loc = {0,}; - afr_local_t *local = NULL; - int i = 0; - afr_private_t *priv = NULL; - - priv = this->private; - local = frame->local; - - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); - - AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, - name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - - for (i = 0; i < priv->child_count; i++) { - if (local->replies[i].op_ret == -1 && - local->replies[i].op_errno == EAGAIN) { - afr_locked_fill (frame, this, locked_on); - afr_selfheal_unentrylk (frame, this, inode, dom, name, - locked_on, NULL); - - AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, - &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - break; - } - } +afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on) +{ + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + local = frame->local; + + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); + + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + + for (i = 0; i < priv->child_count; i++) { + if (local->replies[i].op_ret == -1 && + local->replies[i].op_errno == EAGAIN) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, + NULL); + + AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + break; + } + } - loc_wipe (&loc); + loc_wipe(&loc); - return afr_locked_fill (frame, this, locked_on); + return afr_locked_fill(frame, this, locked_on); } int -afr_selfheal_tie_breaker_entrylk (call_frame_t *frame, xlator_t *this, - inode_t *inode, char *dom, const char *name, - unsigned char *locked_on) +afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this, + inode_t *inode, char *dom, const char *name, + unsigned char *locked_on) { - loc_t loc = {0,}; - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int lock_count = 0; - int eagain_count = 0; + loc_t loc = { + 0, + }; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int lock_count = 0; + int eagain_count = 0; - priv = this->private; - local = frame->local; + priv = this->private; + local = frame->local; - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, - name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); + AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL); - afr_get_lock_and_eagain_counts (priv, local->replies, &lock_count, - &eagain_count); + afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count, + &eagain_count); - if (lock_count > priv->child_count/2 && eagain_count) { - afr_locked_fill (frame, this, locked_on); - afr_selfheal_unentrylk (frame, this, inode, dom, name, - locked_on, NULL); + if (lock_count > priv->child_count / 2 && eagain_count) { + afr_locked_fill(frame, this, locked_on); + afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL); - AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom, - &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); - } + AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } - loc_wipe (&loc); + loc_wipe(&loc); - return afr_locked_fill (frame, this, locked_on); + return afr_locked_fill(frame, this, locked_on); } - int -afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, - char *dom, const char *name, unsigned char *locked_on, - dict_t *xdata) -{ - loc_t loc = {0,}; - - loc.inode = inode_ref (inode); - gf_uuid_copy (loc.gfid, inode->gfid); - - AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk, - dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); - - loc_wipe (&loc); - - return 0; -} - - -gf_boolean_t -afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) +afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode, + char *dom, const char *name, unsigned char *locked_on, + dict_t *xdata) { - int idx = -1; - afr_private_t *priv = NULL; - void *pending_raw = NULL; - int *pending_int = NULL; - int i = 0; + loc_t loc = { + 0, + }; - priv = this->private; - idx = afr_index_for_transaction_type (type); + loc.inode = inode_ref(inode); + gf_uuid_copy(loc.gfid, inode->gfid); - if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { - if (pending_raw) { - pending_int = pending_raw; + AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, + name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata); - if (ntoh32 (pending_int[idx])) - return _gf_true; - } - } + loc_wipe(&loc); - for (i = 0; i < priv->child_count; i++) { - if (dict_get_ptr (xdata, priv->pending_key[i], - &pending_raw)) - continue; - if (!pending_raw) - continue; - pending_int = pending_raw; - - if (ntoh32 (pending_int[idx])) - return _gf_true; - } - - return _gf_false; + return 0; } - gf_boolean_t -afr_is_data_set (xlator_t *this, dict_t *xdata) +afr_is_data_set(xlator_t *this, dict_t *xdata) { - return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION); + return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION); } gf_boolean_t -afr_is_metadata_set (xlator_t *this, dict_t *xdata) +afr_is_metadata_set(xlator_t *this, dict_t *xdata) { - return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION); + return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION); } gf_boolean_t -afr_is_entry_set (xlator_t *this, dict_t *xdata) +afr_is_entry_set(xlator_t *this, dict_t *xdata) { - return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION); + return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION); } /* @@ -2172,306 +2278,310 @@ afr_is_entry_set (xlator_t *this, dict_t *xdata) */ int -afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this, - uuid_t gfid, inode_t **link_inode, - gf_boolean_t *data_selfheal, - gf_boolean_t *metadata_selfheal, - gf_boolean_t *entry_selfheal) -{ - afr_private_t *priv = NULL; - inode_t *inode = NULL; - int i = 0; - int valid_cnt = 0; - struct iatt first = {0, }; - int first_idx = 0; - struct afr_reply *replies = NULL; - int ret = -1; - - priv = this->private; - - inode = afr_inode_find (this, gfid); - if (!inode) - goto out; +afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid, + inode_t **link_inode, gf_boolean_t *data_selfheal, + gf_boolean_t *metadata_selfheal, + gf_boolean_t *entry_selfheal, + struct afr_reply *replies_dst) +{ + afr_private_t *priv = NULL; + inode_t *inode = NULL; + int i = 0; + int valid_cnt = 0; + struct iatt first = { + 0, + }; + int first_idx = 0; + struct afr_reply *replies = NULL; + int ret = -1; + + priv = this->private; + + inode = afr_inode_find(this, gfid); + if (!inode) + goto out; - replies = alloca0 (sizeof (*replies) * priv->child_count); + replies = alloca0(sizeof(*replies) * priv->child_count); - ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies); - if (ret) - goto out; + ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies); + if (ret) + goto out; - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; - if (replies[i].op_ret == -1) - continue; - - /* The data segment of the changelog can be non-zero to indicate - * the directory needs a full heal. So the check below ensures - * it's not a directory before setting the data_selfheal boolean. - */ - if (data_selfheal && !IA_ISDIR (replies[i].poststat.ia_type) && - afr_is_data_set (this, replies[i].xdata)) - *data_selfheal = _gf_true; - - if (metadata_selfheal && - afr_is_metadata_set (this, replies[i].xdata)) - *metadata_selfheal = _gf_true; - - if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata)) - *entry_selfheal = _gf_true; - - valid_cnt++; - if (valid_cnt == 1) { - first = replies[i].poststat; - first_idx = i; - continue; - } - - if (!IA_EQUAL (first, replies[i].poststat, type)) { - gf_msg (this->name, GF_LOG_ERROR, 0, - AFR_MSG_SPLIT_BRAIN, - "TYPE mismatch %d vs %d on %s for gfid:%s", - (int) first.ia_type, - (int) replies[i].poststat.ia_type, - priv->children[i]->name, - uuid_utoa (replies[i].poststat.ia_gfid)); - gf_event (EVENT_AFR_SPLIT_BRAIN, "subvol=%s;" - "type=file;gfid=%s;" - "ia_type-%d=%s;ia_type-%d=%s", - this->name, - uuid_utoa (replies[i].poststat.ia_gfid), - first_idx, - gf_inode_type_to_str (first.ia_type), i, - gf_inode_type_to_str (replies[i].poststat.ia_type)); - ret = -EIO; - goto out; - } - - if (!IA_EQUAL (first, replies[i].poststat, uid)) { - gf_msg_debug (this->name, 0, - "UID mismatch " - "%d vs %d on %s for gfid:%s", - (int) first.ia_uid, - (int) replies[i].poststat.ia_uid, - priv->children[i]->name, - uuid_utoa (replies[i].poststat.ia_gfid)); - - if (metadata_selfheal) - *metadata_selfheal = _gf_true; - } - - if (!IA_EQUAL (first, replies[i].poststat, gid)) { - gf_msg_debug (this->name, 0, - "GID mismatch " - "%d vs %d on %s for gfid:%s", - (int) first.ia_uid, - (int) replies[i].poststat.ia_uid, - priv->children[i]->name, - uuid_utoa (replies[i].poststat.ia_gfid)); - - if (metadata_selfheal) - *metadata_selfheal = _gf_true; - } - - if (!IA_EQUAL (first, replies[i].poststat, prot)) { - gf_msg_debug (this->name, 0, - "MODE mismatch " - "%d vs %d on %s for gfid:%s", - (int) st_mode_from_ia (first.ia_prot, 0), - (int) st_mode_from_ia - (replies[i].poststat.ia_prot, 0), - priv->children[i]->name, - uuid_utoa (replies[i].poststat.ia_gfid)); - - if (metadata_selfheal) - *metadata_selfheal = _gf_true; - } - - if (IA_ISREG(first.ia_type) && - !IA_EQUAL (first, replies[i].poststat, size)) { - gf_msg_debug (this->name, 0, - "SIZE mismatch " - "%lld vs %lld on %s for gfid:%s", - (long long) first.ia_size, - (long long) replies[i].poststat.ia_size, - priv->children[i]->name, - uuid_utoa (replies[i].poststat.ia_gfid)); - - if (data_selfheal) - *data_selfheal = _gf_true; - } - } - - if (valid_cnt > 0 && link_inode) { - *link_inode = inode_link (inode, NULL, NULL, &first); - if (!*link_inode) { - ret = -EINVAL; - goto out; - } - } else if (valid_cnt < 2) { - ret = afr_check_stale_error (replies, priv); - goto out; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + if (replies[i].op_ret == -1) + continue; + + /* The data segment of the changelog can be non-zero to indicate + * the directory needs a full heal. So the check below ensures + * it's not a directory before setting the data_selfheal boolean. + */ + if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) && + afr_is_data_set(this, replies[i].xdata)) + *data_selfheal = _gf_true; + + if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata)) + *metadata_selfheal = _gf_true; + + if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata)) + *entry_selfheal = _gf_true; + + valid_cnt++; + if (valid_cnt == 1) { + first = replies[i].poststat; + first_idx = i; + continue; } - ret = 0; + if (!IA_EQUAL(first, replies[i].poststat, type)) { + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN, + "TYPE mismatch %d vs %d on %s for gfid:%s", + (int)first.ia_type, (int)replies[i].poststat.ia_type, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;" + "type=file;gfid=%s;" + "ia_type-%d=%s;ia_type-%d=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(replies[i].poststat.ia_gfid), first_idx, + gf_inode_type_to_str(first.ia_type), i, + gf_inode_type_to_str(replies[i].poststat.ia_type)); + ret = -EIO; + goto out; + } + + if (!IA_EQUAL(first, replies[i].poststat, uid)) { + gf_msg_debug(this->name, 0, + "UID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (!IA_EQUAL(first, replies[i].poststat, gid)) { + gf_msg_debug(this->name, 0, + "GID mismatch " + "%d vs %d on %s for gfid:%s", + (int)first.ia_uid, (int)replies[i].poststat.ia_uid, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (!IA_EQUAL(first, replies[i].poststat, prot)) { + gf_msg_debug(this->name, 0, + "MODE mismatch " + "%d vs %d on %s for gfid:%s", + (int)st_mode_from_ia(first.ia_prot, 0), + (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0), + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (metadata_selfheal) + *metadata_selfheal = _gf_true; + } + + if (IA_ISREG(first.ia_type) && + !IA_EQUAL(first, replies[i].poststat, size)) { + gf_msg_debug(this->name, 0, + "SIZE mismatch " + "%lld vs %lld on %s for gfid:%s", + (long long)first.ia_size, + (long long)replies[i].poststat.ia_size, + priv->children[i]->name, + uuid_utoa(replies[i].poststat.ia_gfid)); + + if (data_selfheal) + *data_selfheal = _gf_true; + } + } + + if (valid_cnt > 0 && link_inode) { + *link_inode = inode_link(inode, NULL, NULL, &first); + if (!*link_inode) { + ret = -EINVAL; + goto out; + } + } else if (valid_cnt < 2) { + ret = afr_check_stale_error(replies, priv); + goto out; + } + + ret = 0; out: - if (inode) - inode_unref (inode); - if (replies) - afr_replies_wipe (replies, priv->child_count); + if (replies && replies_dst) + afr_replies_copy(replies_dst, replies, priv->child_count); + if (inode) + inode_unref(inode); + if (replies) + afr_replies_wipe(replies, priv->child_count); - return ret; + return ret; } - inode_t * -afr_inode_find (xlator_t *this, uuid_t gfid) +afr_inode_find(xlator_t *this, uuid_t gfid) { - inode_table_t *table = NULL; - inode_t *inode = NULL; + inode_table_t *table = NULL; + inode_t *inode = NULL; - table = this->itable; - if (!table) - return NULL; + table = this->itable; + if (!table) + return NULL; - inode = inode_find (table, gfid); - if (inode) - return inode; + inode = inode_find(table, gfid); + if (inode) + return inode; - inode = inode_new (table); - if (!inode) - return NULL; + inode = inode_new(table); + if (!inode) + return NULL; - gf_uuid_copy (inode->gfid, gfid); + gf_uuid_copy(inode->gfid, gfid); - return inode; + return inode; } - call_frame_t * -afr_frame_create (xlator_t *this, int32_t *op_errno) +afr_frame_create(xlator_t *this, int32_t *op_errno) { - call_frame_t *frame = NULL; - afr_local_t *local = NULL; - pid_t pid = GF_CLIENT_PID_SELF_HEALD; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + pid_t pid = GF_CLIENT_PID_SELF_HEALD; - frame = create_frame (this, this->ctx->pool); - if (!frame) - return NULL; + frame = create_frame(this, this->ctx->pool); + if (!frame) { + if (op_errno) + *op_errno = ENOMEM; + return NULL; + } - local = AFR_FRAME_INIT (frame, (*op_errno)); - if (!local) { - STACK_DESTROY (frame->root); - return NULL; - } + local = AFR_FRAME_INIT(frame, (*op_errno)); + if (!local) { + STACK_DESTROY(frame->root); + return NULL; + } - syncopctx_setfspid (&pid); + syncopctx_setfspid(&pid); - frame->root->pid = pid; + frame->root->pid = pid; - afr_set_lk_owner (frame, this, frame->root); + afr_set_lk_owner(frame, this, frame->root); - return frame; + return frame; } int -afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode, - int source, struct afr_reply *replies, - unsigned char *sources, unsigned char *newentry) +afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode, + int source, struct afr_reply *replies, + unsigned char *sources, unsigned char *newentry) { - int ret = 0; - int i = 0; - afr_private_t *priv = NULL; - dict_t *xattr = NULL; - int **changelog = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + dict_t *xattr = NULL; + int **changelog = NULL; - priv = this->private; + priv = this->private; - gf_uuid_copy (inode->gfid, replies[source].poststat.ia_gfid); + gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid); - xattr = dict_new(); - if (!xattr) - return -ENOMEM; + xattr = dict_new(); + if (!xattr) + return -ENOMEM; - changelog = afr_mark_pending_changelog (priv, newentry, xattr, - replies[source].poststat.ia_type); + changelog = afr_mark_pending_changelog(priv, newentry, xattr, + replies[source].poststat.ia_type); - if (!changelog) { - ret = -ENOMEM; - goto out; - } + if (!changelog) { + ret = -ENOMEM; + goto out; + } - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - ret |= afr_selfheal_post_op (frame, this, inode, i, xattr, - NULL); - } + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL); + } out: - if (changelog) - afr_matrix_cleanup (changelog, priv->child_count); - if (xattr) - dict_unref (xattr); - return ret; + if (changelog) + afr_matrix_cleanup(changelog, priv->child_count); + if (xattr) + dict_unref(xattr); + return ret; } int -afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) -{ - int ret = -1; - int entry_ret = 1; - int metadata_ret = 1; - int data_ret = 1; - int or_ret = 0; - inode_t *inode = NULL; - gf_boolean_t data_selfheal = _gf_false; - gf_boolean_t metadata_selfheal = _gf_false; - gf_boolean_t entry_selfheal = _gf_false; - afr_private_t *priv = NULL; - gf_boolean_t dataheal_enabled = _gf_false; - - priv = this->private; - - ret = gf_string2boolean (priv->data_self_heal, &dataheal_enabled); - if (ret) - goto out; +afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid) +{ + int ret = -1; + int entry_ret = 1; + int metadata_ret = 1; + int data_ret = 1; + int or_ret = 0; + inode_t *inode = NULL; + fd_t *fd = NULL; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode, + &data_selfheal, &metadata_selfheal, + &entry_selfheal, NULL); + if (ret) + goto out; - ret = afr_selfheal_unlocked_inspect (frame, this, gfid, &inode, - &data_selfheal, - &metadata_selfheal, - &entry_selfheal); - if (ret) - goto out; + if (!(data_selfheal || metadata_selfheal || entry_selfheal)) { + ret = 2; + goto out; + } - if (!(data_selfheal || metadata_selfheal || entry_selfheal)) { - ret = 2; - goto out; + if (inode->ia_type == IA_IFREG) { + ret = afr_selfheal_data_open(this, inode, &fd); + if (!fd) { + ret = -EIO; + goto out; } + } - if (data_selfheal && dataheal_enabled) - data_ret = afr_selfheal_data (frame, this, inode); + if (data_selfheal && priv->data_self_heal) + data_ret = afr_selfheal_data(frame, this, fd); - if (metadata_selfheal && priv->metadata_self_heal) - metadata_ret = afr_selfheal_metadata (frame, this, inode); + if (metadata_selfheal && priv->metadata_self_heal) + metadata_ret = afr_selfheal_metadata(frame, this, inode); - if (entry_selfheal && priv->entry_self_heal) - entry_ret = afr_selfheal_entry (frame, this, inode); + if (entry_selfheal && priv->entry_self_heal) + entry_ret = afr_selfheal_entry(frame, this, inode); - or_ret = (data_ret | metadata_ret | entry_ret); + or_ret = (data_ret | metadata_ret | entry_ret); - if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO) - ret = -EIO; - else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1) - ret = 1; - else if (or_ret < 0) - ret = or_ret; - else - ret = 0; + if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO) + ret = -EIO; + else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1) + ret = 1; + else if (or_ret < 0) + ret = or_ret; + else + ret = 0; out: - if (inode) - inode_unref (inode); - return ret; + if (inode) + inode_unref(inode); + if (fd) + fd_unref(fd); + return ret; } /* * This is the entry point for healing a given GFID. The return values for this @@ -2483,161 +2593,342 @@ out: */ int -afr_selfheal (xlator_t *this, uuid_t gfid) +afr_selfheal(xlator_t *this, uuid_t gfid) { - int ret = -1; - call_frame_t *frame = NULL; - afr_local_t *local = NULL; + int ret = -1; + call_frame_t *frame = NULL; + afr_local_t *local = NULL; - frame = afr_frame_create (this, NULL); - if (!frame) - return ret; + frame = afr_frame_create(this, NULL); + if (!frame) + return ret; - local = frame->local; - local->xdata_req = dict_new(); + local = frame->local; + local->xdata_req = dict_new(); - ret = afr_selfheal_do (frame, this, gfid); + ret = afr_selfheal_do(frame, this, gfid); - if (frame) - AFR_STACK_DESTROY (frame); + if (frame) + AFR_STACK_DESTROY(frame); - return ret; + return ret; } -afr_local_t* -__afr_dequeue_heals (afr_private_t *priv) +afr_local_t * +__afr_dequeue_heals(afr_private_t *priv) { - afr_local_t *local = NULL; + afr_local_t *local = NULL; - if (list_empty (&priv->heal_waiting)) - goto none; - if ((priv->background_self_heal_count > 0) && - (priv->healers >= priv->background_self_heal_count)) - goto none; + if (list_empty(&priv->heal_waiting)) + goto none; + if ((priv->background_self_heal_count > 0) && + (priv->healers >= priv->background_self_heal_count)) + goto none; - local = list_entry (priv->heal_waiting.next, afr_local_t, healer); - priv->heal_waiters--; - GF_ASSERT (priv->heal_waiters >= 0); - list_del_init(&local->healer); - list_add(&local->healer, &priv->healing); - priv->healers++; - return local; + local = list_entry(priv->heal_waiting.next, afr_local_t, healer); + priv->heal_waiters--; + GF_ASSERT(priv->heal_waiters >= 0); + list_del_init(&local->healer); + list_add(&local->healer, &priv->healing); + priv->healers++; + return local; none: - gf_msg_debug (THIS->name, 0, "Nothing dequeued. " - "Num healers: %d, Num Waiters: %d", - priv->healers, priv->heal_waiters); - return NULL; + gf_msg_debug(THIS->name, 0, + "Nothing dequeued. " + "Num healers: %d, Num Waiters: %d", + priv->healers, priv->heal_waiters); + return NULL; } int -afr_refresh_selfheal_wrap (void *opaque) +afr_refresh_selfheal_wrap(void *opaque) { - call_frame_t *heal_frame = opaque; - afr_local_t *local = heal_frame->local; - int ret = 0; + call_frame_t *heal_frame = opaque; + afr_local_t *local = heal_frame->local; + int ret = 0; - ret = afr_selfheal (heal_frame->this, local->refreshinode->gfid); - return ret; + ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid); + return ret; } int -afr_refresh_heal_done (int ret, call_frame_t *frame, void *opaque) -{ - call_frame_t *heal_frame = opaque; - xlator_t *this = heal_frame->this; - afr_private_t *priv = this->private; - afr_local_t *local = heal_frame->local; - - LOCK (&priv->lock); - { - list_del_init(&local->healer); - priv->healers--; - GF_ASSERT (priv->healers >= 0); - local = __afr_dequeue_heals (priv); - } - UNLOCK (&priv->lock); +afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque) +{ + call_frame_t *heal_frame = opaque; + xlator_t *this = heal_frame->this; + afr_private_t *priv = this->private; + afr_local_t *local = heal_frame->local; - if (heal_frame) - AFR_STACK_DESTROY (heal_frame); + LOCK(&priv->lock); + { + list_del_init(&local->healer); + priv->healers--; + GF_ASSERT(priv->healers >= 0); + local = __afr_dequeue_heals(priv); + } + UNLOCK(&priv->lock); - if (local) - afr_heal_synctask (this, local); - return 0; + AFR_STACK_DESTROY(heal_frame); + if (local) + afr_heal_synctask(this, local); + return 0; } void -afr_heal_synctask (xlator_t *this, afr_local_t *local) +afr_heal_synctask(xlator_t *this, afr_local_t *local) { - int ret = 0; - call_frame_t *heal_frame = NULL; + int ret = 0; + call_frame_t *heal_frame = NULL; - heal_frame = local->heal_frame; - ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap, - afr_refresh_heal_done, heal_frame, heal_frame); - if (ret < 0) - /* Heal not launched. Will be queued when the next inode - * refresh happens and shd hasn't healed it yet. */ - afr_refresh_heal_done (ret, heal_frame, heal_frame); + heal_frame = local->heal_frame; + ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap, + afr_refresh_heal_done, heal_frame, heal_frame); + if (ret < 0) + /* Heal not launched. Will be queued when the next inode + * refresh happens and shd hasn't healed it yet. */ + afr_refresh_heal_done(ret, heal_frame, heal_frame); } gf_boolean_t -afr_throttled_selfheal (call_frame_t *frame, xlator_t *this) -{ - gf_boolean_t can_heal = _gf_true; - afr_private_t *priv = this->private; - afr_local_t *local = frame->local; - - LOCK (&priv->lock); - { - if ((priv->background_self_heal_count > 0) && - (priv->heal_wait_qlen + priv->background_self_heal_count) > - (priv->heal_waiters + priv->healers)) { - list_add_tail(&local->healer, &priv->heal_waiting); - priv->heal_waiters++; - local = __afr_dequeue_heals (priv); - } else { - can_heal = _gf_false; - } - } - UNLOCK (&priv->lock); - - if (can_heal) { - if (local) - afr_heal_synctask (this, local); - else - gf_msg_debug (this->name, 0, "Max number of heals are " - "pending, background self-heal rejected."); +afr_throttled_selfheal(call_frame_t *frame, xlator_t *this) +{ + gf_boolean_t can_heal = _gf_true; + afr_private_t *priv = this->private; + afr_local_t *local = frame->local; + + LOCK(&priv->lock); + { + if ((priv->background_self_heal_count > 0) && + (priv->heal_wait_qlen + priv->background_self_heal_count) > + (priv->heal_waiters + priv->healers)) { + list_add_tail(&local->healer, &priv->heal_waiting); + priv->heal_waiters++; + local = __afr_dequeue_heals(priv); + } else { + can_heal = _gf_false; } + } + UNLOCK(&priv->lock); - return can_heal; + if (can_heal) { + if (local) + afr_heal_synctask(this, local); + else + gf_msg_debug(this->name, 0, + "Max number of heals are " + "pending, background self-heal rejected."); + } + + return can_heal; } int -afr_choose_source_by_policy (afr_private_t *priv, unsigned char *sources, - afr_transaction_type type) +afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources, + afr_transaction_type type) { - int source = -1; - int i = 0; + int source = -1; + int i = 0; - /* Give preference to local child to save on bandwidth */ - for (i = 0; i < priv->child_count; i++) { - if (priv->local[i] && sources[i]) { - if ((type == AFR_DATA_TRANSACTION) && - AFR_IS_ARBITER_BRICK (priv, i)) - continue; + /* Give preference to local child to save on bandwidth */ + for (i = 0; i < priv->child_count; i++) { + if (priv->local[i] && sources[i]) { + if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i)) + continue; - source = i; - goto out; - } + source = i; + goto out; } + } - for (i = 0; i < priv->child_count; i++) { - if (sources[i]) { - source = i; - goto out; - } + for (i = 0; i < priv->child_count; i++) { + if (sources[i]) { + source = i; + goto out; } + } out: - return source; + return source; +} + +static int +afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, struct iatt *preparent, + struct iatt *postparent, dict_t *xdata) +{ + afr_local_t *local = frame->local; + int i = (long)cookie; + + local->replies[i].valid = 1; + local->replies[i].op_ret = op_ret; + local->replies[i].op_errno = op_errno; + if (op_ret == 0) { + local->op_ret = 0; + local->replies[i].poststat = *buf; + local->replies[i].preparent = *preparent; + local->replies[i].postparent = *postparent; + } + if (xdata) { + local->replies[i].xdata = dict_ref(xdata); + } + + syncbarrier_wake(&local->barrier); + return 0; +} + +int +afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode) +{ + call_frame_t *frame = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = this->private; + unsigned char *mkdir_on = alloca0(priv->child_count); + unsigned char *lookup_on = alloca0(priv->child_count); + loc_t loc = {0}; + int32_t op_errno = 0; + int32_t child_op_errno = 0; + struct iatt iatt = {0}; + dict_t *xdata = NULL; + uuid_t anon_inode_gfid = {0}; + int mkdir_count = 0; + int i = 0; + + /*Try to mkdir everywhere and return success if the dir exists on 'child' + */ + + if (!priv->use_anon_inode) { + op_errno = EINVAL; + goto out; + } + + frame = afr_frame_create(this, &op_errno); + if (op_errno) { + goto out; + } + local = frame->local; + if (!local->child_up[child]) { + /*Other bricks may need mkdir so don't error out yet*/ + child_op_errno = ENOTCONN; + } + gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid); + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; + + if (priv->anon_inode[i]) { + mkdir_on[i] = 0; + } else { + mkdir_on[i] = 1; + mkdir_count++; + } + } + + if (mkdir_count == 0) { + *linked_inode = inode_find(this->itable, anon_inode_gfid); + if (*linked_inode) { + op_errno = 0; + goto out; + } + } + + loc.parent = inode_ref(this->itable->root); + loc.name = priv->anon_inode_name; + loc.inode = inode_new(this->itable); + if (!loc.inode) { + op_errno = ENOMEM; + goto out; + } + + xdata = dict_new(); + if (!xdata) { + op_errno = ENOMEM; + goto out; + } + + op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true); + if (op_errno) { + goto out; + } + + if (mkdir_count == 0) { + memcpy(lookup_on, local->child_up, priv->child_count); + goto lookup; + } + + AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0, + xdata); + + for (i = 0; i < priv->child_count; i++) { + if (!mkdir_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else if (local->replies[i].op_ret < 0 && + local->replies[i].op_errno == EEXIST) { + lookup_on[i] = 1; + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } + + if (AFR_COUNT(lookup_on, priv->child_count) == 0) { + goto link; + } + +lookup: + AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc, + xdata); + for (i = 0; i < priv->child_count; i++) { + if (!lookup_on[i]) { + continue; + } + + if (local->replies[i].op_ret == 0) { + if (gf_uuid_compare(anon_inode_gfid, + local->replies[i].poststat.ia_gfid) == 0) { + priv->anon_inode[i] = 1; + iatt = local->replies[i].poststat; + } else { + if (i == child) + child_op_errno = EINVAL; + gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA, + "%s has gfid: %s", priv->anon_inode_name, + uuid_utoa(local->replies[i].poststat.ia_gfid)); + } + } else if (i == child) { + child_op_errno = local->replies[i].op_errno; + } + } +link: + if (!gf_uuid_is_null(iatt.ia_gfid)) { + *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt); + if (*linked_inode) { + op_errno = 0; + inode_lookup(*linked_inode); + } else { + op_errno = ENOMEM; + } + goto out; + } + +out: + if (xdata) + dict_unref(xdata); + loc_wipe(&loc); + /*child_op_errno takes precedence*/ + if (child_op_errno == 0) { + child_op_errno = op_errno; + } + + if (child_op_errno && *linked_inode) { + inode_unref(*linked_inode); + *linked_inode = NULL; + } + if (frame) + AFR_STACK_DESTROY(frame); + return -child_op_errno; } diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c index 8cf43f2807b..37bcc2b3f9e 100644 --- a/xlators/cluster/afr/src/afr-self-heal-data.c +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -8,592 +8,592 @@ cases as published by the Free Software Foundation. */ - #include "afr.h" #include "afr-self-heal.h" -#include "byte-order.h" +#include <glusterfs/byte-order.h> #include "protocol-common.h" #include "afr-messages.h" -#include "events.h" - -enum { - AFR_SELFHEAL_DATA_FULL = 0, - AFR_SELFHEAL_DATA_DIFF, -}; - +#include <glusterfs/events.h> #define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size)) static int -__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this, - int op_ret, int op_errno, uint32_t weak, uint8_t *strong, - dict_t *xdata) +__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, + int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata) { - afr_local_t *local = NULL; - struct afr_reply *replies = NULL; - int i = (long) cookie; - - local = frame->local; - replies = local->replies; - - replies[i].valid = 1; - replies[i].op_ret = op_ret; - replies[i].op_errno = op_errno; - if (xdata) - replies[i].buf_has_zeroes = dict_get_str_boolean (xdata, - "buf-has-zeroes", _gf_false); - if (strong) - memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); - - syncbarrier_wake (&local->barrier); - return 0; + afr_local_t *local = NULL; + struct afr_reply *replies = NULL; + int i = (long)cookie; + + local = frame->local; + replies = local->replies; + + replies[i].valid = 1; + replies[i].op_ret = op_ret; + replies[i].op_errno = op_errno; + if (xdata) { + replies[i].buf_has_zeroes = dict_get_str_boolean( + xdata, "buf-has-zeroes", _gf_false); + replies[i].fips_mode_rchecksum = dict_get_str_boolean( + xdata, "fips-mode-rchecksum", _gf_false); + } + if (strong) { + if (replies[i].fips_mode_rchecksum) { + memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH); + } else { + memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH); + } + } + + syncbarrier_wake(&local->barrier); + return 0; } static gf_boolean_t -__afr_can_skip_data_block_heal (call_frame_t *frame, xlator_t *this, fd_t *fd, - int source, unsigned char *healed_sinks, - off_t offset, size_t size, - struct iatt *poststat) +__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, struct iatt *poststat) { - afr_private_t *priv = NULL; - afr_local_t *local = NULL; - unsigned char *wind_subvols = NULL; - gf_boolean_t checksum_match = _gf_true; - dict_t *xdata = NULL; - int i = 0; - - priv = this->private; - local = frame->local; - - xdata = dict_new(); - if (!xdata) - goto out; - if (dict_set_int32 (xdata, "check-zero-filled", 1)) { - dict_unref (xdata); - goto out; - } + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + unsigned char *wind_subvols = NULL; + gf_boolean_t checksum_match = _gf_true; + struct afr_reply *replies = NULL; + dict_t *xdata = NULL; + int i = 0; + + priv = this->private; + local = frame->local; + replies = local->replies; + + xdata = dict_new(); + if (!xdata) + goto out; + if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) { + dict_unref(xdata); + goto out; + } + + wind_subvols = alloca0(priv->child_count); + for (i = 0; i < priv->child_count; i++) { + if (i == source || healed_sinks[i]) + wind_subvols[i] = 1; + } + + AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size, + xdata); + if (xdata) + dict_unref(xdata); + + if (!replies[source].valid || replies[source].op_ret != 0) + return _gf_false; - wind_subvols = alloca0 (priv->child_count); - for (i = 0; i < priv->child_count; i++) { - if (i == source || healed_sinks[i]) - wind_subvols[i] = 1; - } - - AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd, - offset, size, xdata); - if (xdata) - dict_unref (xdata); - - if (!local->replies[source].valid || local->replies[source].op_ret != 0) - return _gf_false; - - for (i = 0; i < priv->child_count; i++) { - if (i == source) - continue; - if (local->replies[i].valid) { - if (memcmp (local->replies[source].checksum, - local->replies[i].checksum, - MD5_DIGEST_LENGTH)) { - checksum_match = _gf_false; - break; - } - } - } - - if (checksum_match) { - if (HAS_HOLES (poststat)) - return _gf_true; - - /* For non-sparse files, we might be better off writing the - * zeroes to sinks to avoid mismatch of disk-usage in bricks. */ - if (local->replies[source].buf_has_zeroes) - return _gf_false; - else - return _gf_true; + for (i = 0; i < priv->child_count; i++) { + if (i == source) + continue; + if (replies[i].valid) { + if (memcmp(replies[source].checksum, replies[i].checksum, + replies[source].fips_mode_rchecksum + ? SHA256_DIGEST_LENGTH + : MD5_DIGEST_LENGTH)) { + checksum_match = _gf_false; + break; + } } + } + + if (checksum_match) { + if (HAS_HOLES(poststat)) + return _gf_true; + + /* For non-sparse files, we might be better off writing the + * zeroes to sinks to avoid mismatch of disk-usage in bricks. */ + if (local->replies[source].buf_has_zeroes) + return _gf_false; + else + return _gf_true; + } out: - return _gf_false; + return _gf_false; } - static gf_boolean_t -__afr_is_sink_zero_filled (xlator_t *this, fd_t *fd, size_t size, - off_t offset, int sink) +__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset, + int sink) { - afr_private_t *priv = NULL; - struct iobref *iobref = NULL; - struct iovec *iovec = NULL; - int count = 0; - int ret = 0; - gf_boolean_t zero_filled = _gf_false; - - priv = this->private; - ret = syncop_readv (priv->children[sink], fd, size, offset, 0, &iovec, - &count, &iobref, NULL, NULL); - if (ret < 0) - goto out; - ret = iov_0filled (iovec, count); - if (!ret) - zero_filled = _gf_true; + afr_private_t *priv = NULL; + struct iobref *iobref = NULL; + struct iovec *iovec = NULL; + int count = 0; + int ret = 0; + gf_boolean_t zero_filled = _gf_false; + + priv = this->private; + ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = iov_0filled(iovec, count); + if (!ret) + zero_filled = _gf_true; out: - if (iovec) - GF_FREE (iovec); - if (iobref) - iobref_unref (iobref); - return zero_filled; + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); + return zero_filled; } static int -__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd, - int source, unsigned char *healed_sinks, - off_t offset, size_t size, - struct afr_reply *replies, int type) +__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, + off_t offset, size_t size, + struct afr_reply *replies, int type) { - struct iovec *iovec = NULL; - int count = 0; - struct iobref *iobref = NULL; - int ret = 0; - int i = 0; - afr_private_t *priv = NULL; - - priv = this->private; - - ret = syncop_readv (priv->children[source], fd, size, offset, 0, - &iovec, &count, &iobref, NULL, NULL); - if (ret <= 0) - return ret; - - for (i = 0; i < priv->child_count; i++) { - if (!healed_sinks[i]) - continue; - - /* - * TODO: Use fiemap() and discard() to heal holes - * in the future. - * - * For now, - * - * - if the source had any holes at all, - * AND - * - if we are writing past the original file size - * of the sink - * AND - * - is NOT the last block of the source file. if - * the block contains EOF, it has to be written - * in order to set the file size even if the - * last block is 0-filled. - * AND - * - if the read buffer is filled with only 0's - * - * then, skip writing to this source. We don't depend - * on the write to happen to update the size as we - * have performed an ftruncate() upfront anyways. - */ -#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b))) - if (HAS_HOLES ((&replies[source].poststat)) && - offset >= replies[i].poststat.ia_size && - !is_last_block (offset, size, - replies[source].poststat.ia_size) && - (iov_0filled (iovec, count) == 0)) - continue; - - /* Avoid filling up sparse regions of the sink with 0-filled - * writes.*/ - if (type == AFR_SELFHEAL_DATA_FULL && - HAS_HOLES ((&replies[source].poststat)) && - ((offset + size) <= replies[i].poststat.ia_size) && - (iov_0filled (iovec, count) == 0) && - __afr_is_sink_zero_filled (this, fd, size, offset, i)) { - continue; - } - - ret = syncop_writev (priv->children[i], fd, iovec, count, - offset, iobref, 0, NULL, NULL); - if (ret != iov_length (iovec, count)) { - /* write() failed on this sink. unset the corresponding - member in sinks[] (which is healed_sinks[] in the - caller) so that this server does NOT get considered - as successfully healed. - */ - healed_sinks[i] = 0; - } - } - if (iovec) - GF_FREE (iovec); - if (iobref) - iobref_unref (iobref); - - return ret; + struct iovec *iovec = NULL; + int count = 0; + struct iobref *iobref = NULL; + int ret = 0; + int i = 0; + afr_private_t *priv = NULL; + + priv = this->private; + + ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec, + &count, &iobref, NULL, NULL, NULL); + if (ret <= 0) + return ret; + + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i]) + continue; + + /* + * TODO: Use fiemap() and discard() to heal holes + * in the future. + * + * For now, + * + * - if the source had any holes at all, + * AND + * - if we are writing past the original file size + * of the sink + * AND + * - is NOT the last block of the source file. if + * the block contains EOF, it has to be written + * in order to set the file size even if the + * last block is 0-filled. + * AND + * - if the read buffer is filled with only 0's + * + * then, skip writing to this source. We don't depend + * on the write to happen to update the size as we + * have performed an ftruncate() upfront anyways. + */ +#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b))) + if (HAS_HOLES((&replies[source].poststat)) && + offset >= replies[i].poststat.ia_size && + !is_last_block(offset, size, replies[source].poststat.ia_size) && + (iov_0filled(iovec, count) == 0)) + continue; + + /* Avoid filling up sparse regions of the sink with 0-filled + * writes.*/ + if (type == AFR_SELFHEAL_DATA_FULL && + HAS_HOLES((&replies[source].poststat)) && + ((offset + size) <= replies[i].poststat.ia_size) && + (iov_0filled(iovec, count) == 0) && + __afr_is_sink_zero_filled(this, fd, size, offset, i)) { + continue; + } + + ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref, + 0, NULL, NULL, NULL, NULL); + if (ret != iov_length(iovec, count)) { + /* write() failed on this sink. unset the corresponding + member in sinks[] (which is healed_sinks[] in the + caller) so that this server does NOT get considered + as successfully healed. + */ + healed_sinks[i] = 0; + } + } + if (iovec) + GF_FREE(iovec); + if (iobref) + iobref_unref(iobref); + + return ret; } -static int -afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd, - int source, unsigned char *healed_sinks, off_t offset, - size_t size, int type, struct afr_reply *replies) +static gf_boolean_t +afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source, + unsigned char *healed_sinks) { - int ret = -1; - int sink_count = 0; - afr_private_t *priv = NULL; - unsigned char *data_lock = NULL; - - priv = this->private; - sink_count = AFR_COUNT (healed_sinks, priv->child_count); - data_lock = alloca0 (priv->child_count); - - ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, - offset, size, data_lock); - { - if (ret < sink_count) { - ret = -ENOTCONN; - goto unlock; - } - - if (type == AFR_SELFHEAL_DATA_DIFF && - __afr_can_skip_data_block_heal (frame, this, fd, source, - healed_sinks, offset, size, - &replies[source].poststat)) { - ret = 0; - goto unlock; - } - - ret = __afr_selfheal_data_read_write (frame, this, fd, source, - healed_sinks, offset, size, - replies, type); - } -unlock: - afr_selfheal_uninodelk (frame, this, fd->inode, this->name, - offset, size, data_lock); - return ret; -} + afr_private_t *priv = this->private; + int i = 0; + if (!locked_on[source]) + return _gf_false; + for (i = 0; i < priv->child_count; i++) { + if (healed_sinks[i] && locked_on[i]) + return _gf_true; + } -static int -afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, - unsigned char *healed_sinks) -{ - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - int i = 0; - - local = frame->local; - priv = this->private; - - if (!priv->ensure_durability) - return 0; - - AFR_ONLIST (healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, - NULL); - - for (i = 0; i < priv->child_count; i++) - if (healed_sinks[i] && local->replies[i].op_ret != 0) - /* fsync() failed. Do NOT consider this server - as successfully healed. Mark it so. - */ - healed_sinks[i] = 0; - return 0; + return _gf_false; } static int -afr_data_self_heal_type_get (afr_private_t *priv, unsigned char *healed_sinks, - int source, struct afr_reply *replies) +afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd, + int source, unsigned char *healed_sinks, off_t offset, + size_t size, int type, struct afr_reply *replies) { - int type = AFR_SELFHEAL_DATA_FULL; - int i = 0; - - if (priv->data_self_heal_algorithm == NULL) { - type = AFR_SELFHEAL_DATA_FULL; - for (i = 0; i < priv->child_count; i++) { - if (!healed_sinks[i] && i != source) - continue; - if (replies[i].poststat.ia_size) { - type = AFR_SELFHEAL_DATA_DIFF; - break; - } - } - } else if (strcmp (priv->data_self_heal_algorithm, "full") == 0) { - type = AFR_SELFHEAL_DATA_FULL; - } else if (strcmp (priv->data_self_heal_algorithm, "diff") == 0) { - type = AFR_SELFHEAL_DATA_DIFF; + int ret = -1; + afr_private_t *priv = NULL; + unsigned char *data_lock = NULL; + + priv = this->private; + data_lock = alloca0(priv->child_count); + + ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + { + if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) { + ret = -ENOTCONN; + goto unlock; } - return type; -} -static int -afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd, - int source, unsigned char *healed_sinks, - struct afr_reply *replies) -{ - afr_private_t *priv = NULL; - off_t off = 0; - size_t block = 0; - int type = AFR_SELFHEAL_DATA_FULL; - int ret = -1; - call_frame_t *iter_frame = NULL; - unsigned char arbiter_sink_status = 0; - - priv = this->private; - if (priv->arbiter_count) { - arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; - healed_sinks[ARBITER_BRICK_INDEX] = 0; + if (type == AFR_SELFHEAL_DATA_DIFF && + __afr_can_skip_data_block_heal(frame, this, fd, source, + healed_sinks, offset, size, + &replies[source].poststat)) { + ret = 0; + goto unlock; } - block = 128 * 1024 * priv->data_self_heal_window_size; - - type = afr_data_self_heal_type_get (priv, healed_sinks, source, - replies); + ret = __afr_selfheal_data_read_write( + frame, this, fd, source, healed_sinks, offset, size, replies, type); + } +unlock: + afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size, + data_lock); + return ret; +} - iter_frame = afr_copy_frame (frame); - if (!iter_frame) { - ret = -ENOMEM; - goto out; - } +static int +afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; - for (off = 0; off < replies[source].poststat.ia_size; off += block) { - if (AFR_COUNT (healed_sinks, priv->child_count) == 0) { - ret = -ENOTCONN; - goto out; - } + local = frame->local; + priv = this->private; - ret = afr_selfheal_data_block (iter_frame, this, fd, source, - healed_sinks, off, block, type, - replies); - if (ret < 0) - goto out; + if (!priv->ensure_durability) + return 0; - AFR_STACK_RESET (iter_frame); - if (iter_frame->local == NULL) { - ret = -ENOTCONN; - goto out; - } - } + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL); - ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks); + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret != 0) + /* fsync() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; + return 0; +} -out: - if (arbiter_sink_status) - healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status; +static int +afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks, + int source, struct afr_reply *replies) +{ + int type = AFR_SELFHEAL_DATA_FULL; + int i = 0; - if (iter_frame) - AFR_STACK_DESTROY (iter_frame); - return ret; + if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) { + type = AFR_SELFHEAL_DATA_FULL; + for (i = 0; i < priv->child_count; i++) { + if (!healed_sinks[i] && i != source) + continue; + if (replies[i].poststat.ia_size) { + type = AFR_SELFHEAL_DATA_DIFF; + break; + } + } + } else { + type = priv->data_self_heal_algorithm; + } + return type; } - static int -__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this, - fd_t *fd, unsigned char *healed_sinks, - uint64_t size) +afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source, + unsigned char *healed_sinks, struct afr_reply *replies) { - afr_local_t *local = NULL; - afr_private_t *priv = NULL; - unsigned char arbiter_sink_status = 0; - int i = 0; + afr_private_t *priv = NULL; + off_t off = 0; + size_t block = 0; + int type = AFR_SELFHEAL_DATA_FULL; + int ret = -1; + call_frame_t *iter_frame = NULL; + unsigned char arbiter_sink_status = 0; + + gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO, + "performing data selfheal on %s", uuid_utoa(fd->inode->gfid)); + + priv = this->private; + if (priv->arbiter_count) { + arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; + healed_sinks[ARBITER_BRICK_INDEX] = 0; + } + + block = 128 * 1024 * priv->data_self_heal_window_size; + + type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies); + + iter_frame = afr_copy_frame(frame); + if (!iter_frame) { + ret = -ENOMEM; + goto out; + } + + for (off = 0; off < replies[source].poststat.ia_size; off += block) { + if (AFR_COUNT(healed_sinks, priv->child_count) == 0) { + ret = -ENOTCONN; + goto out; + } - local = frame->local; - priv = this->private; + ret = afr_selfheal_data_block(iter_frame, this, fd, source, + healed_sinks, off, block, type, replies); + if (ret < 0) + goto out; - if (priv->arbiter_count) { - arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX]; - healed_sinks[ARBITER_BRICK_INDEX] = 0; + AFR_STACK_RESET(iter_frame); + if (iter_frame->local == NULL) { + ret = -ENOTCONN; + goto out; } + } - AFR_ONLIST (healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, - size, NULL); + ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks); - for (i = 0; i < priv->child_count; i++) - if (healed_sinks[i] && local->replies[i].op_ret == -1) - /* truncate() failed. Do NOT consider this server - as successfully healed. Mark it so. - */ - healed_sinks[i] = 0; +out: + if (arbiter_sink_status) + healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status; - if (arbiter_sink_status) - healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status; - return 0; + if (iter_frame) + AFR_STACK_DESTROY(iter_frame); + return ret; +} + +static int +__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *healed_sinks, uint64_t size) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + priv = this->private; + + /* This will send truncate on the arbiter brick as well if it is marked as + * sink. If changelog is enabled on the volume it captures truncate as a + * data transactions on the arbiter brick. This will help geo-rep to + * properly sync the data from master to slave if arbiter is the ACTIVE + * brick during syncing and which had got some entries healed for data as + * part of self heal. + */ + AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size, + NULL); + + for (i = 0; i < priv->child_count; i++) + if (healed_sinks[i] && local->replies[i].op_ret == -1) + /* truncate() failed. Do NOT consider this server + as successfully healed. Mark it so. + */ + healed_sinks[i] = 0; + + return 0; } gf_boolean_t -afr_has_source_witnesses (xlator_t *this, unsigned char *sources, - uint64_t *witness) +afr_has_source_witnesses(xlator_t *this, unsigned char *sources, + uint64_t *witness) { - int i = 0; - afr_private_t *priv = NULL; + int i = 0; + afr_private_t *priv = NULL; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (sources[i] && witness[i]) - return _gf_true; - } - return _gf_false; + for (i = 0; i < priv->child_count; i++) { + if (sources[i] && witness[i]) + return _gf_true; + } + return _gf_false; } static gf_boolean_t -afr_does_size_mismatch (xlator_t *this, unsigned char *sources, - struct afr_reply *replies) +afr_does_size_mismatch(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - int i = 0; - afr_private_t *priv = NULL; - struct iatt *min = NULL; - struct iatt *max = NULL; + int i = 0; + afr_private_t *priv = NULL; + struct iatt *min = NULL; + struct iatt *max = NULL; - priv = this->private; + priv = this->private; - for (i = 0; i < priv->child_count; i++) { - if (!replies[i].valid) - continue; + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; - if (replies[i].op_ret < 0) - continue; + if (replies[i].op_ret < 0) + continue; - if (!sources[i]) - continue; + if (!sources[i]) + continue; - if (AFR_IS_ARBITER_BRICK (priv, i) && - (replies[i].poststat.ia_size == 0)) - continue; + if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0)) + continue; - if (!min) - min = &replies[i].poststat; + if (!min) + min = &replies[i].poststat; - if (!max) - max = &replies[i].poststat; + if (!max) + max = &replies[i].poststat; - if (min->ia_size > replies[i].poststat.ia_size) - min = &replies[i].poststat; + if (min->ia_size > replies[i].poststat.ia_size) + min = &replies[i].poststat; - if (max->ia_size < replies[i].poststat.ia_size) - max = &replies[i].poststat; - } + if (max->ia_size < replies[i].poststat.ia_size) + max = &replies[i].poststat; + } - if (min && max) { - if (min->ia_size != max->ia_size) - return _gf_true; - } + if (min && max) { + if (min->ia_size != max->ia_size) + return _gf_true; + } - return _gf_false; + return _gf_false; } static void -afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources, - uint64_t *witness) +afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources, + uint64_t *witness) { - int i = 0; - afr_private_t *priv = NULL; - uint64_t biggest_witness = 0; - - priv = this->private; - /* Find source with biggest witness count */ - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (biggest_witness < witness[i]) - biggest_witness = witness[i]; - } - - /* Mark files with less witness count as not source */ - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - if (witness[i] < biggest_witness) - sources[i] = 0; - } - - return; + int i = 0; + afr_private_t *priv = NULL; + uint64_t biggest_witness = 0; + + priv = this->private; + /* Find source with biggest witness count */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (biggest_witness < witness[i]) + biggest_witness = witness[i]; + } + + /* Mark files with less witness count as not source */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + if (witness[i] < biggest_witness) + sources[i] = 0; + } + + return; } /* This is a tie breaker function. Only one source be assigned here */ static void -afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources, - struct afr_reply *replies) +afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources, + struct afr_reply *replies) { - int i = 0; - afr_private_t *priv = NULL; - int source = -1; - uint32_t max_ctime = 0; - - priv = this->private; - /* Find source with latest ctime */ - for (i = 0; i < priv->child_count; i++) { - if (!sources[i]) - continue; - - if (max_ctime <= replies[i].poststat.ia_ctime) { - source = i; - max_ctime = replies[i].poststat.ia_ctime; - } + int i = 0; + afr_private_t *priv = NULL; + int source = -1; + uint32_t max_ctime = 0; + + priv = this->private; + /* Find source with latest ctime */ + for (i = 0; i < priv->child_count; i++) { + if (!sources[i]) + continue; + + if (max_ctime <= replies[i].poststat.ia_ctime) { + source = i; + max_ctime = replies[i].poststat.ia_ctime; } + } - /* Only mark one of the files as source to break ties */ - memset (sources, 0, sizeof (*sources) * priv->child_count); - sources[source] = 1; + /* Only mark one of the files as source to break ties */ + memset(sources, 0, sizeof(*sources) * priv->child_count); + sources[source] = 1; } static int -__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this, - inode_t *inode, - unsigned char *sources, - unsigned char *sinks, - unsigned char *healed_sinks, - unsigned char *locked_on, - unsigned char *undid_pending, - struct afr_reply *replies, - uint64_t *witness) +__afr_selfheal_data_finalize_source( + call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on, + unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness) { - afr_private_t *priv = NULL; - int source = -1; - int sources_count = 0; - priv = this->private; - - sources_count = AFR_COUNT (sources, priv->child_count); - - if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0) - || !sources_count) { - /* split brain */ - source = afr_mark_split_brain_source_sinks (frame, this, inode, - sources, sinks, - healed_sinks, - locked_on, replies, - AFR_DATA_TRANSACTION); - if (source < 0) { - gf_event (EVENT_AFR_SPLIT_BRAIN, "subvol=%s;type=data;" - "file=%s", this->name, uuid_utoa(inode->gfid)); - return -EIO; - } - - _afr_fav_child_reset_sink_xattrs (frame, this, inode, source, - healed_sinks, undid_pending, - AFR_DATA_TRANSACTION, - locked_on, replies); - goto out; - } - - /* No split brain at this point. If we were called from - * afr_heal_splitbrain_file(), abort.*/ - if (afr_dict_contains_heal_op(frame)) - return -EIO; - - /* If there are no witnesses/size-mismatches on sources we are done*/ - if (!afr_does_size_mismatch (this, sources, replies) && - !afr_has_source_witnesses (this, sources, witness)) - goto out; - - afr_mark_largest_file_as_source (this, sources, replies); - afr_mark_biggest_witness_as_source (this, sources, witness); - afr_mark_newest_file_as_source (this, sources, replies); - if (priv->arbiter_count) - /* Choose non-arbiter brick as source for empty files. */ - afr_mark_source_sinks_if_file_empty (this, sources, sinks, - healed_sinks, locked_on, - replies, - AFR_DATA_TRANSACTION); + afr_private_t *priv = NULL; + int source = -1; + int sources_count = 0; + priv = this->private; + + sources_count = AFR_COUNT(sources, priv->child_count); + + if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) || + !sources_count) { + /* split brain */ + source = afr_mark_split_brain_source_sinks( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + replies, AFR_DATA_TRANSACTION); + if (source < 0) { + gf_event(EVENT_AFR_SPLIT_BRAIN, + "client-pid=%d;" + "subvol=%s;type=data;" + "file=%s", + this->ctx->cmd_args.client_pid, this->name, + uuid_utoa(inode->gfid)); + return -EIO; + } + + _afr_fav_child_reset_sink_xattrs( + frame, this, inode, source, healed_sinks, undid_pending, + AFR_DATA_TRANSACTION, locked_on, replies); + goto out; + } + + /* No split brain at this point. If we were called from + * afr_heal_splitbrain_file(), abort.*/ + if (afr_dict_contains_heal_op(frame)) + return -EIO; + + /* If there are no witnesses/size-mismatches on sources we are done*/ + if (!afr_does_size_mismatch(this, sources, replies) && + !afr_has_source_witnesses(this, sources, witness)) + goto out; + + afr_mark_largest_file_as_source(this, sources, replies); + afr_mark_biggest_witness_as_source(this, sources, witness); + afr_mark_newest_file_as_source(this, sources, replies); + if (priv->arbiter_count) + /* Choose non-arbiter brick as source for empty files. */ + afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks, + locked_on, replies, + AFR_DATA_TRANSACTION); out: - afr_mark_active_sinks (this, sources, locked_on, healed_sinks); - source = afr_choose_source_by_policy (priv, sources, - AFR_DATA_TRANSACTION); + afr_mark_active_sinks(this, sources, locked_on, healed_sinks); + source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION); - return source; + return source; } /* @@ -606,300 +606,286 @@ out: * for self-healing, or -1 if no healing is necessary/split brain. */ int -__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, - inode_t *inode, unsigned char *locked_on, - unsigned char *sources, unsigned char *sinks, - unsigned char *healed_sinks, - unsigned char *undid_pending, - struct afr_reply *replies, gf_boolean_t *pflag) +__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode, + unsigned char *locked_on, unsigned char *sources, + unsigned char *sinks, unsigned char *healed_sinks, + unsigned char *undid_pending, + struct afr_reply *replies, unsigned char *pflag) { - int ret = -1; - int source = -1; - afr_private_t *priv = NULL; - uint64_t *witness = NULL; - - priv = this->private; - - ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid, - replies); - - if (ret) - return ret; - - witness = alloca0(priv->child_count * sizeof (*witness)); - ret = afr_selfheal_find_direction (frame, this, replies, - AFR_DATA_TRANSACTION, - locked_on, sources, sinks, witness, - pflag); - if (ret) - return ret; - - /* Initialize the healed_sinks[] array optimistically to - the intersection of to-be-healed (i.e sinks[]) and - the list of servers which are up (i.e locked_on[]). - As we encounter failures in the healing process, we - will unmark the respective servers in the healed_sinks[] - array. - */ - AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count); - - source = __afr_selfheal_data_finalize_source (frame, this, inode, - sources, sinks, - healed_sinks, - locked_on, undid_pending, - replies, witness); - if (source < 0) - return -EIO; - - return source; + int ret = -1; + int source = -1; + afr_private_t *priv = NULL; + uint64_t *witness = NULL; + + priv = this->private; + + ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies); + + if (ret) + return ret; + + witness = alloca0(priv->child_count * sizeof(*witness)); + ret = afr_selfheal_find_direction(frame, this, replies, + AFR_DATA_TRANSACTION, locked_on, sources, + sinks, witness, pflag); + if (ret) + return ret; + + /* Initialize the healed_sinks[] array optimistically to + the intersection of to-be-healed (i.e sinks[]) and + the list of servers which are up (i.e locked_on[]). + As we encounter failures in the healing process, we + will unmark the respective servers in the healed_sinks[] + array. + */ + AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count); + + source = __afr_selfheal_data_finalize_source( + frame, this, inode, sources, sinks, healed_sinks, locked_on, + undid_pending, replies, witness); + if (source < 0) + return -EIO; + + return source; } - static int -__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, - unsigned char *locked_on) +__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd, + unsigned char *locked_on) { - afr_private_t *priv = NULL; - int ret = -1; - unsigned char *sources = NULL; - unsigned char *sinks = NULL; - unsigned char *data_lock = NULL; - unsigned char *healed_sinks = NULL; - unsigned char *undid_pending = NULL; - struct afr_reply *locked_replies = NULL; - int source = -1; - gf_boolean_t did_sh = _gf_true; - gf_boolean_t is_arbiter_the_only_sink = _gf_false; - - priv = this->private; - - sources = alloca0 (priv->child_count); - sinks = alloca0 (priv->child_count); - healed_sinks = alloca0 (priv->child_count); - data_lock = alloca0 (priv->child_count); - undid_pending = alloca0 (priv->child_count); - - locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); - - ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, - data_lock); - { - if (ret < AFR_SH_MIN_PARTICIPANTS) { - gf_msg_debug (this->name, 0, "%s: Skipping " - "self-heal as only %d number " - "of subvolumes " - "could be locked", - uuid_utoa (fd->inode->gfid), - ret); - ret = -ENOTCONN; - goto unlock; - } - - ret = __afr_selfheal_data_prepare (frame, this, fd->inode, - data_lock, sources, sinks, - healed_sinks, undid_pending, - locked_replies, NULL); - if (ret < 0) |
