summary | refs | log | tree | commit | diff | stats
path: root/extras
diff options
context:
space:
mode:
author: Niels de Vos <ndevos@redhat.com> 2013-04-21 11:10:06 +0200
committer: Anand Avati <avati@redhat.com> 2013-04-29 17:07:26 -0700
commit: ee9984882e3b28e76fb3af5491a8d2bf22e432a3 (patch)
tree: 8754437fc40aecc8ddd24b01e34408514454b915 /extras
parent: ddad856d371b1cbc80e1a195e9d81edb1be31371 (diff)
extras: include Fedora changes in init.d/glusterd
The changes in the .spec file from Fedora have largely been merged into
the glusterfs.spec.in. It seems that some dependencies have been missed,
most importantly some additions to the init-script that are called while
(un)installing or updating RPMs.

These changes come from the downstream Fedora package that carries its
own glusterd.init script. In future, Fedora/EPEL should be able to drop
that file and use the Gluster project version.

Change-Id: Iac25854b0c559b93fa1dd452a04663bd95ea3378
BUG: 954149
URL: http://lists.nongnu.org/archive/html/gluster-devel/2013-04/msg00077.html
CC: Fedora GlusterFS Packagers <glusterfs-owner@fedoraproject.org>
Signed-off-by: Niels de Vos <ndevos@redhat.com>
Reviewed-on: http://review.gluster.org/4864
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Kaleb KEITHLEY <kkeithle@redhat.com>
Reviewed-by: Anand Avati <avati@redhat.com>
Diffstat (limited to 'extras')
-rwxr-xr-x extras/init.d/glusterd-Redhat.in 96
1 files changed, 73 insertions, 23 deletions
diff --git a/extras/init.d/glusterd-Redhat.in b/extras/init.d/glusterd-Redhat.in
index 858f82245..e1e5e859c 100755
--- a/extras/init.d/glusterd-Redhat.in
+++ b/extras/init.d/glusterd-Redhat.in
@@ -1,20 +1,39 @@
#!/bin/bash
#
-# chkconfig: 35 20 80
-# description: Gluster File System service for volume management
+# glusterd Startup script for the glusterfs server
#
+# chkconfig: - 20 80
+# description: Clustered file-system server
-# Get function from functions library
+### BEGIN INIT INFO
+# Provides: glusterd
+# Required-Start: $local_fs $network
+# Required-Stop: $local_fs $network
+# Should-Start:
+# Should-Stop:
+# Default-Start:
+# Default-Stop: 0 1 2 3 4 5 6
+# Short-Description: glusterfs server
+# Description: Clustered file-system server
+### END INIT INFO
+#
+
+# Source function library.
. /etc/rc.d/init.d/functions
BASE=glusterd
-PIDFILE=/var/run/$BASE.pid
+
+# Fedora File System Layout dictates /run
+[ -e /run ] && RUNDIR="/run"
+PIDFILE="${RUNDIR:-/var/run}/${BASE}.pid"
+
PID=`test -f $PIDFILE && cat $PIDFILE`
# Overwriteable from sysconfig
LOG_LEVEL=''
LOG_FILE=''
GLUSTERD_OPTIONS=''
+GLUSTERD_NOFILE='65536'
[ -f /etc/sysconfig/${BASE} ] && . /etc/sysconfig/${BASE}
@@ -31,57 +50,88 @@ RETVAL=0
# Start the service $BASE
start()
{
- pidofproc -p $PIDFILE $GLUSTERD_BIN &> /dev/null
- status=$?
- if [ $status -eq 0 ]; then
+ if pidofproc -p $PIDFILE $GLUSTERD_BIN &> /dev/null; then
echo "glusterd service is already running with pid $PID"
- exit 0
+ return 0
else
+ ulimit -n $GLUSTERD_NOFILE
echo -n $"Starting $BASE:"
daemon $GLUSTERD
RETVAL=$?
echo
- [ $RETVAL -ne 0 ] && exit $RETVAL
+ return $RETVAL
fi
-
}
# Stop the service $BASE
stop()
{
echo -n $"Stopping $BASE:"
- pidofproc -p $PIDFILE $GLUSTERD_BIN &> /dev/null
- status=$?
- if [ $status -eq 0 ]; then
+ if pidofproc -p $PIDFILE $GLUSTERD_BIN &> /dev/null; then
killproc -p $PIDFILE $BASE
[ -w $PIDFILE ] && rm -f $PIDFILE
else
killproc $BASE
fi
+}
+
+restart()
+{
+ stop
+ start
+}
+
+reload()
+{
+ restart
+}
+force_reload()
+{
+ restart
+}
+
+rh_status()
+{
+ status $BASE
+}
+
+rh_status_q()
+{
+ rh_status &>/dev/null
}
### service arguments ###
case $1 in
start)
- start
+ rh_status_q && exit 0
+ $1
;;
stop)
- stop
- RETVAL=$?
+ rh_status_q || exit 0
+ $1
+ ;;
+ restart)
+ $1
+ ;;
+ reload)
+ rh_status_q || exit 7
+ $1
+ ;;
+ force-reload)
+ force_reload
;;
status)
- status $BASE
- RETVAL=$?
+ rh_status
;;
- restart)
- $0 stop
- $0 start
+ condrestart|try-restart)
+ rh_status_q || exit 0
+ restart
;;
*)
- echo $"Usage: $0 {start|stop|status|restart}."
+ echo $"Usage: $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}"
exit 1
esac
-exit $RETVAL
+exit $?
xlators/cluster/afr/src/afr-self-heal-common.c3052
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-common.h136
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-data.c1844
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-entry.c2717
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-metadata.c829
-rw-r--r--xlators/cluster/afr/src/afr-self-heal-name.c457
-rw-r--r--xlators/cluster/afr/src/afr-self-heal.h162
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.c2105
-rw-r--r--xlators/cluster/afr/src/afr-self-heald.h86
-rw-r--r--xlators/cluster/afr/src/afr-transaction.c1829
-rw-r--r--xlators/cluster/afr/src/afr-transaction.h30
-rw-r--r--xlators/cluster/afr/src/afr.c170
-rw-r--r--xlators/cluster/afr/src/afr.h822
-rw-r--r--xlators/cluster/afr/src/pump.c659
-rw-r--r--xlators/cluster/afr/src/pump.h3
-rw-r--r--xlators/cluster/dht/src/Makefile.am1
-rw-r--r--xlators/cluster/dht/src/dht-common.c340
-rw-r--r--xlators/cluster/dht/src/dht-common.h43
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c75
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c2
-rw-r--r--xlators/cluster/dht/src/dht-helper.c279
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c83
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c455
-rw-r--r--xlators/cluster/dht/src/dht-layout.c52
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c72
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c402
-rw-r--r--xlators/cluster/dht/src/dht-rename.c203
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c71
-rw-r--r--xlators/cluster/dht/src/dht-shared.c82
-rw-r--r--xlators/cluster/dht/src/dht.c3
-rw-r--r--xlators/cluster/dht/src/nufa.c190
-rw-r--r--xlators/cluster/dht/src/switch.c8
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_mock.c63
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_unittest.c124
-rw-r--r--xlators/cluster/nsr-client/Makefile.am3
-rw-r--r--xlators/cluster/nsr-client/src/Makefile.am33
-rw-r--r--xlators/cluster/nsr-client/src/fop-template.c113
-rwxr-xr-xxlators/cluster/nsr-client/src/gen-fops.py57
-rw-r--r--xlators/cluster/nsr-client/src/nsrc.c243
-rw-r--r--xlators/cluster/nsr-recon/Makefile.am3
-rw-r--r--xlators/cluster/nsr-recon/src/Makefile.am23
-rw-r--r--xlators/cluster/nsr-recon/src/recon_driver.c3130
-rw-r--r--xlators/cluster/nsr-recon/src/recon_driver.h325
-rw-r--r--xlators/cluster/nsr-recon/src/recon_xlator.c1010
-rw-r--r--xlators/cluster/nsr-recon/src/recon_xlator.h92
-rw-r--r--xlators/cluster/nsr-server/Makefile.am3
-rw-r--r--xlators/cluster/nsr-server/src/Makefile.am43
-rw-r--r--xlators/cluster/nsr-server/src/all-templates.c345
-rwxr-xr-xxlators/cluster/nsr-server/src/codegen.py174
-rw-r--r--xlators/cluster/nsr-server/src/etcd-api.c831
-rw-r--r--xlators/cluster/nsr-server/src/etcd-api.h214
-rw-r--r--xlators/cluster/nsr-server/src/etcd-sim.c280
-rwxr-xr-xxlators/cluster/nsr-server/src/gen-fops.py120
-rw-r--r--xlators/cluster/nsr-server/src/leader.c138
-rw-r--r--xlators/cluster/nsr-server/src/nsr-internal.h101
-rw-r--r--xlators/cluster/nsr-server/src/nsr.c812
-rw-r--r--xlators/cluster/nsr-server/src/recon_notify.c389
-rw-r--r--xlators/cluster/nsr-server/src/yajl.c175
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_common.h75
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_gen.h157
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_parse.h226
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_tree.h177
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_version.h23
-rw-r--r--xlators/cluster/nsr-server/src/yajl_alloc.c49
-rw-r--r--xlators/cluster/nsr-server/src/yajl_alloc.h34
-rw-r--r--xlators/cluster/nsr-server/src/yajl_buf.c103
-rw-r--r--xlators/cluster/nsr-server/src/yajl_buf.h57
-rw-r--r--xlators/cluster/nsr-server/src/yajl_bytestack.h69
-rw-r--r--xlators/cluster/nsr-server/src/yajl_encode.c220
-rw-r--r--xlators/cluster/nsr-server/src/yajl_encode.h34
-rw-r--r--xlators/cluster/nsr-server/src/yajl_gen.c350
-rw-r--r--xlators/cluster/nsr-server/src/yajl_lex.c763
-rw-r--r--xlators/cluster/nsr-server/src/yajl_lex.h117
-rw-r--r--xlators/cluster/nsr-server/src/yajl_parser.c492
-rw-r--r--xlators/cluster/nsr-server/src/yajl_parser.h78
-rw-r--r--xlators/cluster/nsr-server/src/yajl_tree.c501
-rw-r--r--xlators/cluster/nsr-server/src/yajl_version.c7
-rw-r--r--xlators/cluster/stripe/src/Makefile.am1
-rw-r--r--xlators/cluster/stripe/src/stripe-helpers.c12
-rw-r--r--xlators/cluster/stripe/src/stripe.c599
-rw-r--r--xlators/debug/io-stats/src/Makefile.am4
-rw-r--r--xlators/debug/io-stats/src/io-stats.c269
-rw-r--r--xlators/debug/trace/src/trace.c40
-rw-r--r--xlators/debug/trace/src/trace.h37
-rw-r--r--xlators/encryption/Makefile.am2
-rw-r--r--xlators/encryption/crypt/Makefile.am3
-rw-r--r--xlators/encryption/crypt/src/Makefile.am24
-rw-r--r--xlators/encryption/crypt/src/atom.c962
-rw-r--r--xlators/encryption/crypt/src/crypt-common.h141
-rw-r--r--xlators/encryption/crypt/src/crypt-mem-types.h44
-rw-r--r--xlators/encryption/crypt/src/crypt.c4522
-rw-r--r--xlators/encryption/crypt/src/crypt.h908
-rw-r--r--xlators/encryption/crypt/src/data.c769
-rw-r--r--xlators/encryption/crypt/src/keys.c302
-rw-r--r--xlators/encryption/crypt/src/metadata.c605
-rw-r--r--xlators/encryption/crypt/src/metadata.h74
-rw-r--r--xlators/encryption/rot-13/src/rot-13.c2
-rw-r--r--xlators/features/Makefile.am4
-rw-r--r--xlators/features/barrier/Makefile.am (renamed from xlators/storage/bd_map/Makefile.am)0
-rw-r--r--xlators/features/barrier/src/Makefile.am16
-rw-r--r--xlators/features/barrier/src/barrier-mem-types.h20
-rw-r--r--xlators/features/barrier/src/barrier.c658
-rw-r--r--xlators/features/barrier/src/barrier.h91
-rw-r--r--xlators/features/changelog/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes.c87
-rw-r--r--xlators/features/changelog/lib/examples/python/changes.py32
-rw-r--r--xlators/features/changelog/lib/examples/python/libgfchangelog.py64
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am38
-rw-r--r--xlators/features/changelog/lib/src/changelog.h31
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.c180
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.h102
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-process.c571
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog.c571
-rw-r--r--xlators/features/changelog/lib/src/gf-history-changelog.c274
-rw-r--r--xlators/features/changelog/src/Makefile.am21
-rw-r--r--xlators/features/changelog/src/changelog-default-fops.c561
-rw-r--r--xlators/features/changelog/src/changelog-encoders.c182
-rw-r--r--xlators/features/changelog/src/changelog-encoders.h48
-rw-r--r--xlators/features/changelog/src/changelog-fops.h157
-rw-r--r--xlators/features/changelog/src/changelog-helpers.c719
-rw-r--r--xlators/features/changelog/src/changelog-helpers.h578
-rw-r--r--xlators/features/changelog/src/changelog-mem-types.h30
-rw-r--r--xlators/features/changelog/src/changelog-misc.h107
-rw-r--r--xlators/features/changelog/src/changelog-notifier.c314
-rw-r--r--xlators/features/changelog/src/changelog-notifier.h19
-rw-r--r--xlators/features/changelog/src/changelog-rt.c83
-rw-r--r--xlators/features/changelog/src/changelog-rt.h40
-rw-r--r--xlators/features/changelog/src/changelog.c1389
-rw-r--r--xlators/features/changelog/src/policy/changelog-policy-default.c45
-rw-r--r--xlators/features/changelog/src/policy/changelog-policy-replication.c1374
-rw-r--r--xlators/features/changelog/src/policy/changelog-policy.h41
-rw-r--r--xlators/features/compress/Makefile.am3
-rw-r--r--xlators/features/compress/src/Makefile.am17
-rw-r--r--xlators/features/compress/src/cdc-helper.c547
-rw-r--r--xlators/features/compress/src/cdc-mem-types.h23
-rw-r--r--xlators/features/compress/src/cdc.c361
-rw-r--r--xlators/features/compress/src/cdc.h107
-rw-r--r--xlators/features/gfid-access/Makefile.am (renamed from xlators/bindings/python/Makefile.am)0
-rw-r--r--xlators/features/gfid-access/src/Makefile.am15
-rw-r--r--xlators/features/gfid-access/src/gfid-access-mem-types.h23
-rw-r--r--xlators/features/gfid-access/src/gfid-access.c1299
-rw-r--r--xlators/features/gfid-access/src/gfid-access.h134
-rw-r--r--xlators/features/glupy/Makefile.am2
-rw-r--r--xlators/features/glupy/examples/Makefile.am5
-rw-r--r--xlators/features/glupy/examples/debug-trace.py (renamed from xlators/features/glupy/src/debug-trace.py)11
-rw-r--r--xlators/features/glupy/examples/helloworld.py (renamed from xlators/features/glupy/src/helloworld.py)2
-rw-r--r--xlators/features/glupy/examples/negative.py (renamed from xlators/features/glupy/src/negative.py)3
-rw-r--r--xlators/features/glupy/src/Makefile.am15
-rw-r--r--xlators/features/glupy/src/glupy.c24
-rw-r--r--xlators/features/glupy/src/glupy.py (renamed from xlators/features/glupy/src/gluster.py)0
-rw-r--r--xlators/features/glupy/src/setup.py.in24
-rw-r--r--xlators/features/index/src/index.c121
-rw-r--r--xlators/features/locks/src/Makefile.am1
-rw-r--r--xlators/features/locks/src/clear.c14
-rw-r--r--xlators/features/locks/src/common.c34
-rw-r--r--xlators/features/locks/src/common.h28
-rw-r--r--xlators/features/locks/src/entrylk.c551
-rw-r--r--xlators/features/locks/src/inodelk.c374
-rw-r--r--xlators/features/locks/src/locks.h59
-rw-r--r--xlators/features/locks/src/posix.c198
-rw-r--r--xlators/features/mac-compat/src/Makefile.am5
-rw-r--r--xlators/features/mac-compat/src/mac-compat.c246
-rw-r--r--xlators/features/mac-compat/src/mac-compat.h41
-rw-r--r--xlators/features/marker/Makefile.am2
-rw-r--r--xlators/features/marker/src/marker-quota-helper.c31
-rw-r--r--xlators/features/marker/src/marker-quota-helper.h2
-rw-r--r--xlators/features/marker/src/marker-quota.c291
-rw-r--r--xlators/features/marker/src/marker-quota.h23
-rw-r--r--xlators/features/marker/src/marker.c674
-rw-r--r--xlators/features/marker/src/marker.h7
-rw-r--r--xlators/features/marker/utils/Makefile.am3
-rw-r--r--xlators/features/marker/utils/src/Makefile.am26
-rw-r--r--xlators/features/marker/utils/src/gsyncd.c367
-rw-r--r--xlators/features/marker/utils/src/procdiggy.c121
-rw-r--r--xlators/features/marker/utils/syncdaemon/Makefile.am6
-rw-r--r--xlators/features/marker/utils/syncdaemon/README.md81
-rw-r--r--xlators/features/marker/utils/syncdaemon/__codecheck.py46
-rw-r--r--xlators/features/marker/utils/syncdaemon/__init__.py0
-rw-r--r--xlators/features/marker/utils/syncdaemon/configinterface.py224
-rw-r--r--xlators/features/marker/utils/syncdaemon/gconf.py20
-rw-r--r--xlators/features/marker/utils/syncdaemon/gsyncd.py419
-rw-r--r--xlators/features/marker/utils/syncdaemon/libcxattr.py72
-rw-r--r--xlators/features/marker/utils/syncdaemon/master.py961
-rw-r--r--xlators/features/marker/utils/syncdaemon/monitor.py129
-rw-r--r--xlators/features/marker/utils/syncdaemon/repce.py225
-rw-r--r--xlators/features/marker/utils/syncdaemon/resource.py972
-rw-r--r--xlators/features/marker/utils/syncdaemon/syncdutils.py288
-rw-r--r--xlators/features/protect/src/Makefile.am6
-rw-r--r--xlators/features/protect/src/prot_client.c2
-rw-r--r--xlators/features/qemu-block/Makefile.am1
-rw-r--r--xlators/features/qemu-block/src/Makefile.am155
-rw-r--r--xlators/features/qemu-block/src/bdrv-xlator.c389
-rw-r--r--xlators/features/qemu-block/src/bh-syncop.c48
-rw-r--r--xlators/features/qemu-block/src/clock-timer.c60
-rw-r--r--xlators/features/qemu-block/src/coroutine-synctask.c116
-rw-r--r--xlators/features/qemu-block/src/monitor-logging.c50
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.c667
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.h30
-rw-r--r--xlators/features/qemu-block/src/qemu-block-memory-types.h25
-rw-r--r--xlators/features/qemu-block/src/qemu-block.c1140
-rw-r--r--xlators/features/qemu-block/src/qemu-block.h109
-rw-r--r--xlators/features/quiesce/src/quiesce.c8
-rw-r--r--xlators/features/quota/src/Makefile.am15
-rw-r--r--xlators/features/quota/src/quota-enforcer-client.c403
-rw-r--r--xlators/features/quota/src/quota-mem-types.h3
-rw-r--r--xlators/features/quota/src/quota.c3064
-rw-r--r--xlators/features/quota/src/quota.h145
-rw-r--r--xlators/features/quota/src/quotad-aggregator.c423
-rw-r--r--xlators/features/quota/src/quotad-aggregator.h37
-rw-r--r--xlators/features/quota/src/quotad-helpers.c113
-rw-r--r--xlators/features/quota/src/quotad-helpers.h24
-rw-r--r--xlators/features/quota/src/quotad.c210
-rw-r--r--xlators/lib/src/libxlator.c224
-rw-r--r--xlators/lib/src/libxlator.h89
-rw-r--r--xlators/mgmt/glusterd/src/Makefile.am9
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-brick-ops.c604
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-etcd.c87
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-etcd.h (renamed from xlators/features/marker/utils/src/procdiggy.h)19
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c3715
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c1290
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c257
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.c44
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.c656
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.h51
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-log-ops.c9
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mem-types.h5
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c936
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.c1899
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.h45
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.c1
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c1594
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.h49
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.c29
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.c1426
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c145
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-replace-brick.c64
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rpc-ops.c456
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.c55
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.h19
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c5787
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.c2744
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.h126
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.c829
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.h63
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c5478
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h264
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c1229
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h37
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c506
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c1225
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.c422
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h525
-rw-r--r--xlators/mount/fuse/src/Makefile.am4
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c606
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.h34
-rw-r--r--xlators/mount/fuse/src/fuse-helpers.c34
-rw-r--r--xlators/mount/fuse/src/fuse-resolve.c36
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in695
-rwxr-xr-xxlators/mount/fuse/utils/mount_glusterfs.in600
-rw-r--r--xlators/nfs/server/src/Makefile.am2
-rw-r--r--xlators/nfs/server/src/acl3.c458
-rw-r--r--xlators/nfs/server/src/acl3.h26
-rw-r--r--xlators/nfs/server/src/mount3.c1036
-rw-r--r--xlators/nfs/server/src/mount3.h39
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c17
-rw-r--r--xlators/nfs/server/src/nfs-common.c19
-rw-r--r--xlators/nfs/server/src/nfs-common.h17
-rw-r--r--xlators/nfs/server/src/nfs-fops.c84
-rw-r--r--xlators/nfs/server/src/nfs-fops.h17
-rw-r--r--xlators/nfs/server/src/nfs-generics.c17
-rw-r--r--xlators/nfs/server/src/nfs-generics.h17
-rw-r--r--xlators/nfs/server/src/nfs-inodes.c21
-rw-r--r--xlators/nfs/server/src/nfs-inodes.h17
-rw-r--r--xlators/nfs/server/src/nfs-mem-types.h19
-rw-r--r--xlators/nfs/server/src/nfs.c858
-rw-r--r--xlators/nfs/server/src/nfs.h24
-rw-r--r--xlators/nfs/server/src/nfs3-fh.c29
-rw-r--r--xlators/nfs/server/src/nfs3-fh.h24
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.c170
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.h23
-rw-r--r--xlators/nfs/server/src/nfs3.c397
-rw-r--r--xlators/nfs/server/src/nfs3.h79
-rw-r--r--xlators/nfs/server/src/nlm4.c208
-rw-r--r--xlators/nfs/server/src/nlm4.h52
-rw-r--r--xlators/nfs/server/src/nlmcbk_svc.c28
-rw-r--r--xlators/performance/Makefile.am2
-rw-r--r--xlators/performance/io-cache/src/io-cache.c103
-rw-r--r--xlators/performance/io-cache/src/page.c17
-rw-r--r--xlators/performance/io-threads/src/io-threads.c1862
-rw-r--r--xlators/performance/md-cache/src/md-cache.c342
-rw-r--r--xlators/performance/open-behind/src/open-behind.c98
-rw-r--r--xlators/performance/quick-read/src/quick-read.c57
-rw-r--r--xlators/performance/read-ahead/src/page.c7
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c107
-rw-r--r--xlators/performance/readdir-ahead/Makefile.am3
-rw-r--r--xlators/performance/readdir-ahead/src/Makefile.am15
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h24
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.c560
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.h46
-rw-r--r--xlators/performance/write-behind/src/write-behind.c236
-rw-r--r--xlators/playground/Makefile.am2
-rw-r--r--xlators/playground/template/Makefile.am2
-rw-r--r--xlators/playground/template/src/Makefile.am16
-rw-r--r--xlators/playground/template/src/template.c49
-rw-r--r--xlators/playground/template/src/template.h24
-rw-r--r--xlators/protocol/auth/addr/src/addr.c18
-rw-r--r--xlators/protocol/auth/login/src/login.c17
-rw-r--r--xlators/protocol/client/src/client-handshake.c50
-rw-r--r--xlators/protocol/client/src/client-lk.c2
-rw-r--r--xlators/protocol/client/src/client-rpc-fops.c424
-rw-r--r--xlators/protocol/client/src/client.c178
-rw-r--r--xlators/protocol/client/src/client.h11
-rw-r--r--xlators/protocol/server/src/Makefile.am6
-rw-r--r--xlators/protocol/server/src/server-handshake.c115
-rw-r--r--xlators/protocol/server/src/server-helpers.c1274
-rw-r--r--xlators/protocol/server/src/server-helpers.h57
-rw-r--r--xlators/protocol/server/src/server-resolve.c37
-rw-r--r--xlators/protocol/server/src/server-rpc-fops.c2080
-rw-r--r--xlators/protocol/server/src/server.c636
-rw-r--r--xlators/protocol/server/src/server.h107
-rw-r--r--xlators/storage/Makefile.am3
-rw-r--r--xlators/storage/bd/Makefile.am3
-rw-r--r--xlators/storage/bd/src/Makefile.am (renamed from xlators/storage/bd_map/src/Makefile.am)11
-rw-r--r--xlators/storage/bd/src/bd-aio.c528
-rw-r--r--xlators/storage/bd/src/bd-aio.h41
-rw-r--r--xlators/storage/bd/src/bd-helper.c1021
-rw-r--r--xlators/storage/bd/src/bd-mem-types.h27
-rw-r--r--xlators/storage/bd/src/bd.c2450
-rw-r--r--xlators/storage/bd/src/bd.h173
-rw-r--r--xlators/storage/bd_map/src/bd_map.c2580
-rw-r--r--xlators/storage/bd_map/src/bd_map.h76
-rw-r--r--xlators/storage/bd_map/src/bd_map_help.c501
-rw-r--r--xlators/storage/bd_map/src/bd_map_help.h69
-rw-r--r--xlators/storage/posix/src/posix-aio.c10
-rw-r--r--xlators/storage/posix/src/posix-handle.c205
-rw-r--r--xlators/storage/posix/src/posix-handle.h95
-rw-r--r--xlators/storage/posix/src/posix-helpers.c580
-rw-r--r--xlators/storage/posix/src/posix.c1674
-rw-r--r--xlators/storage/posix/src/posix.h50
-rw-r--r--xlators/system/posix-acl/src/Makefile.am2
-rw-r--r--xlators/system/posix-acl/src/posix-acl-mem-types.h24
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.c10
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.h16
-rw-r--r--xlators/system/posix-acl/src/posix-acl.c86
-rw-r--r--xlators/system/posix-acl/src/posix-acl.h49
368 files changed, 96161 insertions, 37838 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
index b1643d26c..f60fa85ce 100644
--- a/xlators/Makefile.am
+++ b/xlators/Makefile.am
@@ -1,3 +1,4 @@
-SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system
+SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system \
+ playground
CLEANFILES =
diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am
deleted file mode 100644
index f77665802..000000000
--- a/xlators/bindings/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS = $(BINDINGS_SUBDIRS)
diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am
deleted file mode 100644
index 90370d861..000000000
--- a/xlators/bindings/python/src/Makefile.am
+++ /dev/null
@@ -1,19 +0,0 @@
-
-xlator_PROGRAMS = python.so
-
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings
-
-python_PYTHON = gluster.py glustertypes.py glusterstack.py
-
-pythondir = $(xlatordir)/python
-
-python_so_SOURCES = python.c
-
-AM_CFLAGS = -fPIC $(GF_CPPFLAGS) -Wall \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \
- $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\"
-
-AM_LDFLAGS = $(PYTHON_LDFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py
deleted file mode 100644
index 337c983ec..000000000
--- a/xlators/bindings/python/src/gluster.py
+++ /dev/null
@@ -1,40 +0,0 @@
-
-# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
-# This file is part of GlusterFS.
-#
-# This file is licensed to you under your choice of the GNU Lesser
-# General Public License, version 3 or any later version (LGPLv3 or
-# later), or the GNU General Public License, version 2 (GPLv2), in all
-# cases as published by the Free Software Foundation.
-
-from ctypes import *
-from glustertypes import *
-from glusterstack import *
-import sys
-import inspect
-
-libglusterfs = CDLL("libglusterfs.so")
-_gf_log = libglusterfs._gf_log
-_gf_log.restype = c_int32
-_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p]
-
-gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel")
-
-GF_LOG_NONE = 0
-GF_LOG_CRITICAL = 1
-GF_LOG_ERROR = 2
-GF_LOG_WARNING = 3
-GF_LOG_DEBUG = 4
-
-def gf_log(module, level, fmt, *params):
- if level <= gf_log_loglevel:
- frame = sys._getframe(1)
- _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name,
- frame.f_lineno, level, fmt, *params)
-
-class ComplexTranslator(object):
- def __init__(self, xlator):
- self.xlator = xlator_t.from_address(xlator)
-
- def __getattr__(self, item):
- return getattr(self.xlator, item)
diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py
deleted file mode 100644
index 0c071ae98..000000000
--- a/xlators/bindings/python/src/glusterstack.py
+++ /dev/null
@@ -1,48 +0,0 @@
-
-# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
-# This file is part of GlusterFS.
-#
-# This file is licensed to you under your choice of the GNU Lesser
-# General Public License, version 3 or any later version (LGPLv3 or
-# later), or the GNU General Public License, version 2 (GPLv2), in all
-# cases as published by the Free Software Foundation.
-
-from ctypes import *
-from glustertypes import *
-
-libc = CDLL("libc.so.6")
-calloc = libc.calloc
-calloc.argtypes = [c_int, c_int]
-calloc.restype = c_void_p
-
-# TODO: Can these be done in C somehow?
-def stack_wind(frame, rfn, obj, fn, *params):
- """Frame is a frame object"""
- _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t))
- _new[0].root = frame.root
- _new[0].next = frame.root[0].frames.next
- _new[0].prev = pointer(frame.root[0].frames)
- if frame.root[0].frames.next:
- frame.root[0].frames.next[0].prev = _new
- frame.root[0].frames.next = _new
- _new[0].this = obj
- # TODO: Type checking like tmp_cbk?
- _new[0].ret = rfn
- _new[0].parent = pointer(frame)
- _new[0].cookie = cast(_new, c_void_p)
- # TODO: Initialize lock
- #_new.lock.init()
- frame.ref_count += 1
- fn(_new, obj, *params)
-
-def stack_unwind(frame, *params):
- """Frame is a frame object"""
- fn = frame[0].ret
- parent = frame[0].parent[0]
- parent.ref_count -= 1
-
- op_ret = params[0]
- op_err = params[1]
- params = params[2:]
- fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this,
- op_ret, op_err, *params)
diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py
deleted file mode 100644
index 98437d22e..000000000
--- a/xlators/bindings/python/src/glustertypes.py
+++ /dev/null
@@ -1,160 +0,0 @@
-
-# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
-# This file is part of GlusterFS.
-#
-# This file is licensed to you under your choice of the GNU Lesser
-# General Public License, version 3 or any later version (LGPLv3 or
-# later), or the GNU General Public License, version 2 (GPLv2), in all
-# cases as published by the Free Software Foundation.
-
-from ctypes import *
-import collections
-
-#
-# Forward declaration of some gluster types
-#
-class call_frame_t(Structure):
- pass
-
-class call_ctx_t(Structure):
- pass
-
-class call_pool_t(Structure):
- pass
-
-class xlator_t(Structure):
- def _getFirstChild(self):
- return self.children[0].xlator
- firstChild = property(_getFirstChild)
-
-class xlator_list_t(Structure):
- pass
-
-class xlator_fops(Structure):
- pass
-
-class xlator_mops(Structure):
- pass
-
-class glusterfs_ctx_t(Structure):
- pass
-
-class list_head(Structure):
- pass
-
-class dict_t(Structure):
- pass
-
-class inode_table_t(Structure):
- pass
-
-class fd_t(Structure):
- pass
-
-class iovec(Structure):
- _fields_ = [
- ("iov_base", c_void_p),
- ("iov_len", c_size_t),
- ]
-
- def __init__(self, s):
- self.iov_base = cast(c_char_p(s), c_void_p)
- self.iov_len = len(s)
-
- def getBytes(self):
- return string_at(self.iov_base, self.iov_len)
-
-# This is a pthread_spinlock_t
-# TODO: what happens to volatile-ness?
-gf_lock_t = c_int
-
-uid_t = c_uint32
-gid_t = c_uint32
-pid_t = c_int32
-
-off_t = c_int64
-
-#
-# Function pointer types
-#
-ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t),
- POINTER(xlator_t), c_int32, c_int32)
-
-fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t))
-init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t))
-event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p)
-
-list_head._fields_ = [
- ("next", POINTER(list_head)),
- ("prev", POINTER(list_head)),
- ]
-
-call_frame_t._fields_ = [
- ("root", POINTER(call_ctx_t)),
- ("parent", POINTER(call_frame_t)),
- ("next", POINTER(call_frame_t)),
- ("prev", POINTER(call_frame_t)),
- ("local", c_void_p),
- ("this", POINTER(xlator_t)),
- ("ret", ret_fn_t),
- ("ref_count", c_int32),
- ("lock", gf_lock_t),
- ("cookie", c_void_p),
- ("op", c_int32),
- ("type", c_int8),
- ]
-
-call_ctx_t._fields_ = [
- ("all_frames", list_head),
- ("trans", c_void_p),
- ("pool", call_pool_t),
- ("unique", c_uint64),
- ("state", c_void_p),
- ("uid", uid_t),
- ("gid", gid_t),
- ("pid", pid_t),
- ("frames", call_frame_t),
- ("req_refs", POINTER(dict_t)),
- ("rsp_refs", POINTER(dict_t)),
- ]
-
-xlator_t._fields_ = [
- ("name", c_char_p),
- ("type", c_char_p),
- ("next", POINTER(xlator_t)),
- ("prev", POINTER(xlator_t)),
- ("parent", POINTER(xlator_t)),
- ("children", POINTER(xlator_list_t)),
- ("fops", POINTER(xlator_fops)),
- ("mops", POINTER(xlator_mops)),
- ("fini", fini_fn_t),
- ("init", init_fn_t),
- ("notify", event_notify_fn_t),
- ("options", POINTER(dict_t)),
- ("ctx", POINTER(glusterfs_ctx_t)),
- ("itable", POINTER(inode_table_t)),
- ("ready", c_char),
- ("private", c_void_p),
- ]
-
-xlator_list_t._fields_ = [
- ("xlator", POINTER(xlator_t)),
- ("next", POINTER(xlator_list_t)),
- ]
-
-fop_functions = collections.defaultdict(lambda: c_void_p)
-fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod',
- 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access',
- 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink',
- 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush',
- 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir',
- 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir',
- # TODO: Call backs?
- ]
-
-fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t),
- POINTER(fd_t), POINTER(iovec), c_int32,
- off_t)
-
-fop_functions['writev'] = fop_writev_t
-xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names]
diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c
deleted file mode 100644
index 9b96790de..000000000
--- a/xlators/bindings/python/src/python.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-#include <Python.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-
-typedef struct
-{
- char *scriptname;
- PyObject *pXlator;
- PyObject *pScriptModule;
- PyObject *pGlusterModule;
- PyThreadState *pInterp;
-
- PyObject *pFrameType, *pVectorType, *pFdType;
-} python_private_t;
-
-int32_t
-python_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t offset)
-{
- python_private_t *priv = (python_private_t *)this->private;
- gf_log("python", GF_LOG_DEBUG, "In writev");
- if (PyObject_HasAttrString(priv->pXlator, "writev"))
- {
-
- PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev",
- "O O O i l",
- PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame),
- PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd),
- PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector),
- count,
- offset);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- }
- Py_XDECREF(retval);
- }
- else
- {
- return default_writev(frame, this, fd, vector, count, offset);
- }
- return 0;
-}
-
-struct xlator_fops fops = {
- .writev = python_writev
-};
-
-static PyObject *
-AnonModule_FromFile (const char* fname)
-{
- // Get the builtins
- PyThreadState* pThread = PyThreadState_Get();
- PyObject *pBuiltins = pThread->interp->builtins;
-
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return NULL;
- }
-
- // Create a new dictionary for running code in
- PyObject *pModuleDict = PyDict_New();
- PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins);
- Py_INCREF(pBuiltins);
-
- // Run the file in the new context
- FILE* fp = fopen(fname, "r");
- PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict);
- fclose(fp);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
-
- // Create an object to hold the new context
- PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
- PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_XDECREF(pModule);
- return NULL;
- }
-
- // Set the new context's dictionary to the one we used to run the code
- // inside
- PyObject_SetAttrString(pModule, "__dict__", pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_DECREF(pModule);
- return NULL;
- }
-
- return pModule;
-}
-
-int32_t
-init (xlator_t *this)
-{
- // This is ok to call more than once per process
- Py_InitializeEx(0);
-
- if (!this->children) {
- gf_log ("python", GF_LOG_ERROR,
- "FATAL: python should have exactly one child");
- return -1;
- }
-
- python_private_t *priv = CALLOC (sizeof (python_private_t), 1);
- ERR_ABORT (priv);
-
- data_t *scriptname = dict_get (this->options, "scriptname");
- if (scriptname) {
- priv->scriptname = data_to_str(scriptname);
- } else {
- gf_log("python", GF_LOG_ERROR,
- "FATAL: python requires the scriptname parameter");
- return -1;
- }
-
- priv->pInterp = Py_NewInterpreter();
-
- // Adjust python's path
- PyObject *syspath = PySys_GetObject("path");
- PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH);
- PyList_Append(syspath, path);
- Py_DECREF(path);
-
- gf_log("python", GF_LOG_DEBUG,
- "Loading gluster module");
-
- priv->pGlusterModule = PyImport_ImportModule("gluster");
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return -1;
- }
-
- priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t");
- priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t");
- priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec");
-
- gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname);
-
- priv->pScriptModule = AnonModule_FromFile(priv->scriptname);
- if (!priv->pScriptModule || PyErr_Occurred())
- {
- gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname);
- PyErr_Print();
- return -1;
- }
-
- if (!PyObject_HasAttrString(priv->pScriptModule, "xlator"))
- {
- gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname);
- return -1;
- }
- gf_log("python", GF_LOG_DEBUG, "Instantiating translator");
- priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&",
- PyLong_FromVoidPtr, this);
- if (PyErr_Occurred() || !priv->pXlator)
- {
- PyErr_Print();
- return -1;
- }
-
- this->private = priv;
-
- gf_log ("python", GF_LOG_DEBUG, "python xlator loaded");
- return 0;
-}
-
-void
-fini (xlator_t *this)
-{
- python_private_t *priv = (python_private_t*)(this->private);
- Py_DECREF(priv->pXlator);
- Py_DECREF(priv->pScriptModule);
- Py_DECREF(priv->pGlusterModule);
- Py_DECREF(priv->pFrameType);
- Py_DECREF(priv->pFdType);
- Py_DECREF(priv->pVectorType);
- Py_EndInterpreter(priv->pInterp);
- return;
-}
diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py
deleted file mode 100644
index 59a991dca..000000000
--- a/xlators/bindings/python/src/testxlator.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
- Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-"""
-
-"""
-This is a test translator written in python.
-
-Important things to note:
- This file must be import-able from glusterfsd. This probably means
- setting PYTHONPATH to where this file is located.
-
- This file must have a top-level xlator class object that will be
- used to instantiate individual translators.
-"""
-from gluster import *
-
-class MyXlator(ComplexTranslator):
- name = "MyXlator"
- def writev_cbk(self, frame, cookie, op_ret, op_errno, buf):
- stack_unwind(frame, op_ret, op_errno, buf)
- return 0
-
- def writev(self, frame, fd, vector, count, offset):
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
- # TODO: Use cookie to pass this to writev_cbk
- old_count = vector.iov_len
-
- data = vector.getBytes().encode("zlib")
-
- vector = iovec(data)
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
-
- @ret_fn_t
- def rfn(frame, prev, this, op_ret, op_errno, *params):
- if len(params) == 0:
- params = [0]
- return self.writev_cbk(frame, prev, old_count, op_errno, *params)
-
- stack_wind(frame, rfn, self.firstChild,
- self.firstChild[0].fops[0].writev, fd, vector, count, offset)
- return 0
-
-xlator = MyXlator
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 0990822a7..6e883e565 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht
+SUBDIRS = stripe afr dht nsr-server nsr-recon nsr-client
CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index 35d18a6c0..ea5a90abb 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -2,24 +2,26 @@ xlator_LTLIBRARIES = afr.la pump.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
- afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \
- afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \
- afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \
+ afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \
+ afr-read-txn.c \
$(top_builddir)/xlators/lib/src/libxlator.c
+AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \
+ afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \
+ afr-self-heal-name.c
+
afr_la_LDFLAGS = -module -avoid-version
-afr_la_SOURCES = $(afr_common_source) afr.c
+afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
pump_la_LDFLAGS = -module -avoid-version
-pump_la_SOURCES = $(afr_common_source) pump.c
+pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c
pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \
- afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \
- afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \
- afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \
- $(top_builddir)/glusterfsd/src/glusterfsd.h
+ afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \
+ afr-common.c afr-self-heald.h pump.h \
+ $(top_builddir)/xlators/lib/src/libxlator.h
AM_CPPFLAGS = $(GF_CPPFLAGS) \
-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
@@ -31,7 +33,6 @@ CLEANFILES =
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/replicate.so
- rm -f $(DESTDIR)$(xlatordir)/pump.so
install-data-hook:
ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index a9acb4094..164a651ba 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,798 +45,815 @@
#include "afr-dir-write.h"
#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heald.h"
-#include "pump.h"
-#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000100000000ULL
-#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL
-int
-afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
- gf_boolean_t fail_conflict);
-void
-afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++)
- dst[i] = src[i];
-}
-
-void
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path)
+call_frame_t *
+afr_copy_frame (call_frame_t *base)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int op_errno = 0;
- priv = this->private;
+ frame = copy_frame (base);
+ if (!frame)
+ return NULL;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local) {
+ AFR_STACK_DESTROY (frame);
+ return NULL;
+ }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
- 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- path, priv->pending_key[i]);
- /* 3 = data+metadata+entry */
- }
- ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless "
- "lookup", path);
- }
+ return frame;
}
+/*
+ * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
+ *
+ * |<---------- 64bit ------------>|
+ * 63 32 31 16 15 0
+ * | EVENT_GEN | DATA | METADATA |
+ *
+ *
+ * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which
+ * metadata can be attempted to be read.
+ *
+ * bit-0 => priv->subvolumes[0]
+ * bit-1 => priv->subvolumes[1]
+ * ... etc. till bit-15
+ *
+ * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data
+ * can be attempted to be read.
+ *
+ * bit-16 => priv->subvolumes[0]
+ * bit-17 => priv->subvolumes[1]
+ * ... etc. till bit-31
+ *
+ * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation)
+ * when DATA and METADATA was last updated.
+ *
+ * If EVENT_GEN is < priv->event_generation,
+ * or is 0, it means afr_inode_refresh() needs
+ * to be called to recalculate the bitmaps.
+ */
+
int
-afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
- dict_t *xattr_req, loc_t *loc, void **gfid_req)
+__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
{
- int ret = -ENOMEM;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ int i = 0;
- GF_ASSERT (gfid_req);
+ priv = this->private;
- *gfid_req = NULL;
- local->xattr_req = dict_new ();
- if (!local->xattr_req)
- goto out;
- if (xattr_req)
- dict_copy (xattr_req, local->xattr_req);
+ ret = __inode_ctx_get (inode, this, &val);
+ if (ret < 0)
+ return ret;
- afr_xattr_req_prepare (this, local->xattr_req, loc->path);
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_INODELK_COUNT);
- }
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_ENTRYLK_COUNT);
- }
+ metadatamap = (val & 0x000000000000ffff);
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
- ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_PARENT_ENTRYLK);
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (metadata)
+ metadata[i] = (metadatamap >> i) & 1;
+ if (data)
+ data[i] = (datamap >> i) & 1;
+ }
- ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: failed to get the gfid from dict", loc->path);
- *gfid_req = NULL;
- } else {
- if (loc->parent != NULL)
- dict_del (local->xattr_req, "gfid-req");
- }
- ret = 0;
-out:
- return ret;
+ if (event_p)
+ *event_p = event;
+ return ret;
}
-void
-afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc)
-{
- inode_t *inode = NULL;
-
- inode = loc->inode;
- if (inode && !uuid_is_null (inode->gfid))
- uuid_copy (dst, inode->gfid);
- else if (!uuid_is_null (loc->gfid))
- uuid_copy (dst, loc->gfid);
- else if (new && !uuid_is_null (new))
- uuid_copy (dst, new);
-}
int
-afr_errno_count (int32_t *children, int *child_errno,
- unsigned int child_count, int32_t op_errno)
-{
- int i = 0;
- int errno_count = 0;
- int child = 0;
+__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int event)
+{
+ afr_private_t *priv = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int i = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data[i])
+ datamap |= (1 << i);
+ if (metadata[i])
+ metadatamap |= (1 << i);
+ }
- for (i = 0; i < child_count; i++) {
- if (children) {
- child = children[i];
- if (child == -1)
- break;
- } else {
- child = i;
- }
- if (child_errno[child] == op_errno)
- errno_count++;
- }
- return errno_count;
-}
+ val = ((uint64_t) metadatamap) |
+ (((uint64_t) datamap) << 16) |
+ (((uint64_t) event) << 32);
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
-{
- int ret = 0;
- uuid_t *pgfid = NULL;
+ return __inode_ctx_set (inode, this, &val);
+}
- GF_ASSERT (gfid);
- pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
- if (!pgfid) {
- ret = -1;
- goto out;
- }
+int
+__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
+{
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
- uuid_copy (*pgfid, gfid);
+ ret = __inode_ctx_get (inode, this, &val);
+ (void) ret;
- ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t));
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed");
+ metadatamap = (val & 0x000000000000ffff) >> 0;
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = 0;
-out:
- if (ret && pgfid)
- GF_FREE (pgfid);
+ val = ((uint64_t) metadatamap) |
+ (((uint64_t) datamap) << 16) |
+ (((uint64_t) event) << 32);
- return ret;
+ return __inode_ctx_set (inode, this, &val);
}
-void
-afr_inode_ctx_destroy (afr_inode_ctx_t *ctx)
-{
- if (!ctx)
- return;
- GF_FREE (ctx->fresh_children);
- GF_FREE (ctx);
-}
-afr_inode_ctx_t*
-__afr_inode_ctx_get (inode_t *inode, xlator_t *this)
+int
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
{
- int ret = 0;
- uint64_t ctx_addr = 0;
- afr_inode_ctx_t *ctx = NULL;
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
- priv = this->private;
- ret = __inode_ctx_get (inode, this, &ctx_addr);
- if (ret < 0)
- ctx_addr = 0;
- if (ctx_addr != 0) {
- ctx = (afr_inode_ctx_t*) (long) ctx_addr;
- goto out;
- }
- ctx = GF_CALLOC (1, sizeof (*ctx),
- gf_afr_mt_inode_ctx_t);
- if (!ctx)
- goto fail;
- ctx->fresh_children = GF_CALLOC (priv->child_count,
- sizeof (*ctx->fresh_children),
- gf_afr_mt_int32_t);
- if (!ctx->fresh_children)
- goto fail;
- ret = __inode_ctx_put (inode, this, (uint64_t)ctx);
- if (ret) {
- gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to "
- "set the inode ctx (%s)",
- uuid_utoa (inode->gfid));
- goto fail;
- }
+ priv = this->private;
-out:
- return ctx;
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_get_small (inode, this, data,
+ metadata, event_p);
+ else
+ /* TBD: allocate structure with array and read from it */
+ ret = -1;
-fail:
- afr_inode_ctx_destroy (ctx);
- return NULL;
+ return ret;
}
-afr_inode_ctx_t*
-afr_inode_ctx_get (inode_t *inode, xlator_t *this)
+
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
{
- afr_inode_ctx_t *ctx = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
- LOCK (&inode->lock);
- {
- ctx = __afr_inode_ctx_get (inode, this);
- }
- UNLOCK (&inode->lock);
- return ctx;
+ priv = this->private;
+
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_set_small (inode, this, data,
+ metadata, event);
+ else
+ ret = -1;
+
+ return ret;
}
-void
-afr_inode_get_ctx_params (xlator_t *this, inode_t *inode,
- afr_inode_params_t *params)
+
+int
+__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
{
- GF_ASSERT (inode);
- GF_ASSERT (params);
+ afr_private_t *priv = NULL;
+ int ret = -1;
- afr_inode_ctx_t *ctx = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
+ priv = this->private;
- priv = this->private;
- LOCK (&inode->lock);
- {
- ctx = __afr_inode_ctx_get (inode, this);
- if (!ctx)
- goto unlock;
- switch (params->op) {
- case AFR_INODE_GET_READ_CTX:
- fresh_children = params->u.read_ctx.children;
- read_child = (int32_t)(ctx->masks &
- AFR_ICTX_READ_CHILD_MASK);
- params->u.read_ctx.read_child = read_child;
- if (!fresh_children)
- goto unlock;
- for (i = 0; i < priv->child_count; i++)
- fresh_children[i] = ctx->fresh_children[i];
- break;
- case AFR_INODE_GET_OPENDIR_DONE:
- params->u.value = _gf_false;
- if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK)
- params->u.value = _gf_true;
- break;
- default:
- GF_ASSERT (0);
- break;
- }
- }
-unlock:
- UNLOCK (&inode->lock);
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_reset_small (inode, this);
+ else
+ ret = -1;
+
+ return ret;
}
-gf_boolean_t
-afr_is_split_brain (xlator_t *this, inode_t *inode)
+
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
{
- afr_inode_ctx_t *ctx = NULL;
- gf_boolean_t spb = _gf_false;
+ int ret = -1;
- ctx = afr_inode_ctx_get (inode, this);
- if (!ctx)
- goto out;
- if ((ctx->mdata_spb == SPB) || (ctx->data_spb == SPB))
- spb = _gf_true;
-out:
- return spb;
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_get (inode, this, data,
+ metadata, event_p);
+ }
+ UNLOCK(&inode->lock);
+
+ return ret;
}
-gf_boolean_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode)
+
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
{
- afr_inode_params_t params = {0};
+ int ret = -1;
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_set (inode, this, data, metadata,
+ event);
+ }
+ UNLOCK(&inode->lock);
- params.op = AFR_INODE_GET_OPENDIR_DONE;
- afr_inode_get_ctx_params (this, inode, &params);
- return params.u.value;
+ return ret;
}
-int32_t
-afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children)
+
+int
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
{
- afr_inode_params_t params = {0};
+ int ret = -1;
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_reset (inode, this);
+ }
+ UNLOCK(&inode->lock);
- params.op = AFR_INODE_GET_READ_CTX;
- params.u.read_ctx.children = fresh_children;
- afr_inode_get_ctx_params (this, inode, &params);
- return params.u.read_ctx.read_child;
+ return ret;
}
-void
-afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child)
-{
- uint64_t remaining_mask = 0;
- uint64_t mask = 0;
- remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks);
- mask = (AFR_ICTX_READ_CHILD_MASK & read_child);
- ctx->masks = remaining_mask | mask;
-}
+int
+afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused,
+ afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int idx = afr_index_for_transaction_type (type);
+ void *pending_raw = NULL;
+ int pending[3];
+ int ret = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_get_ptr (xdata, priv->pending_key[i],
+ &pending_raw);
+ if (ret) /* no pending flags */
+ continue;
+ memcpy (pending, pending_raw, sizeof(pending));
+
+ if (ntoh32 (pending[idx]))
+ accused[i] = 1;
+ }
-void
-afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child,
- int32_t *fresh_children, int32_t child_count)
-{
- int i = 0;
-
- afr_inode_ctx_set_read_child (ctx, read_child);
- for (i = 0; i < child_count; i++) {
- if (fresh_children)
- ctx->fresh_children[i] = fresh_children[i];
- else
- ctx->fresh_children[i] = -1;
- }
+ return 0;
}
-void
-afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children,
- int32_t child_count)
+
+int
+afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
+ unsigned char *data_accused)
{
- int i = 0;
- int32_t read_child = -1;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t maxsize = 0;
- GF_ASSERT (stale_children);
- for (i = 0; i < child_count; i++) {
- if (stale_children[i] == -1)
- break;
- afr_children_rm_child (ctx->fresh_children,
- stale_children[i], child_count);
- }
- read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK);
- if (!afr_is_child_present (ctx->fresh_children, child_count,
- read_child))
- afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]);
-}
+ priv = this->private;
-void
-afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx)
-{
- uint64_t remaining_mask = 0;
- uint64_t mask = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size > maxsize)
+ maxsize = replies[i].poststat.ia_size;
+ }
- remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks);
- mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK);
- ctx->masks = remaining_mask | mask;
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size < maxsize)
+ data_accused[i] = 1;
+ }
+
+ return 0;
}
-void
-afr_inode_set_ctx_params (xlator_t *this, inode_t *inode,
- afr_inode_params_t *params)
-{
- GF_ASSERT (inode);
- GF_ASSERT (params);
- afr_inode_ctx_t *ctx = NULL;
- afr_private_t *priv = NULL;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
- int32_t *stale_children = NULL;
+int
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int event_generation = 0;
+ int i = 0;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int ret = 0;
- priv = this->private;
- LOCK (&inode->lock);
- {
- ctx = __afr_inode_ctx_get (inode, this);
- if (!ctx)
- goto unlock;
- switch (params->op) {
- case AFR_INODE_SET_READ_CTX:
- read_child = params->u.read_ctx.read_child;
- fresh_children = params->u.read_ctx.children;
- afr_inode_ctx_set_read_ctx (ctx, read_child,
- fresh_children,
- priv->child_count);
- break;
- case AFR_INODE_RM_STALE_CHILDREN:
- stale_children = params->u.read_ctx.children;
- afr_inode_ctx_rm_stale_children (ctx,
- stale_children,
- priv->child_count);
- break;
- case AFR_INODE_SET_OPENDIR_DONE:
- afr_inode_ctx_set_opendir_done (ctx);
- break;
- default:
- GF_ASSERT (0);
- break;
- }
- }
-unlock:
- UNLOCK (&inode->lock);
-}
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+ event_generation = local->event_generation;
+
+ data_accused = alloca0 (priv->child_count);
+ data_readable = alloca0 (priv->child_count);
+ metadata_accused = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ data_readable[i] = 1;
+ metadata_readable[i] = 1;
+ }
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
- afr_spb_state_t data_spb)
-{
- afr_inode_ctx_t *ctx = NULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ if (replies[i].op_ret == -1) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ afr_accused_fill (this, replies[i].xdata, data_accused,
+ (inode->ia_type == IA_IFDIR) ?
+ AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
+
+ afr_accused_fill (this, replies[i].xdata,
+ metadata_accused, AFR_METADATA_TRANSACTION);
+
+ }
- ctx = afr_inode_ctx_get (inode, this);
- if (mdata_spb != DONT_KNOW)
- ctx->mdata_spb = mdata_spb;
- if (data_spb != DONT_KNOW)
- ctx->data_spb = data_spb;
+ if (inode->ia_type != IA_IFDIR)
+ afr_accuse_smallfiles (this, replies, data_accused);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i]) {
+ data_readable[i] = 0;
+ ret = 1;
+ }
+ if (metadata_accused[i]) {
+ metadata_readable[i] = 0;
+ ret = 1;
+ }
+ }
+
+ afr_inode_read_subvol_set (inode, this, data_readable,
+ metadata_readable, event_generation);
+ return ret;
}
-void
-afr_set_opendir_done (xlator_t *this, inode_t *inode)
-{
- afr_inode_params_t params = {0};
- params.op = AFR_INODE_SET_OPENDIR_DONE;
- afr_inode_set_ctx_params (this, inode, &params);
+
+int
+afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque)
+{
+ if (heal)
+ STACK_DESTROY (heal->root);
+ return 0;
}
-void
-afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,
- int32_t *fresh_children)
+int
+afr_inode_refresh_err (call_frame_t *frame, xlator_t *this)
{
- afr_inode_params_t params = {0};
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int err = 0;
- priv = this->private;
- GF_ASSERT (read_child >= 0);
- GF_ASSERT (fresh_children);
- GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count,
- read_child));
-
- params.op = AFR_INODE_SET_READ_CTX;
- params.u.read_ctx.read_child = read_child;
- params.u.read_ctx.children = fresh_children;
- afr_inode_set_ctx_params (this, inode, &params);
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && !local->replies[i].op_ret) {
+ err = 0;
+ goto ret;
+ }
+ }
+
+ err = afr_final_errno (local, priv);
+ret:
+ return -err;
}
-void
-afr_inode_rm_stale_children (xlator_t *this, inode_t *inode,
- int32_t *stale_children)
+
+int
+afr_refresh_selfheal_wrap (void *opaque)
{
- afr_inode_params_t params = {0};
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ int err = 0;
+
+ local = frame->local;
+ this = frame->this;
- GF_ASSERT (stale_children);
+ afr_selfheal (frame->this, local->refreshinode->gfid);
- params.op = AFR_INODE_RM_STALE_CHILDREN;
- params.u.read_ctx.children = stale_children;
- afr_inode_set_ctx_params (this, inode, &params);
+ afr_selfheal_unlocked_discover (frame, local->refreshinode,
+ local->refreshinode->gfid,
+ local->replies);
+
+ afr_replies_interpret (frame, this, local->refreshinode);
+
+ err = afr_inode_refresh_err (frame, this);
+
+ afr_replies_wipe (local, this->private);
+
+ local->refreshfn (frame, this, err);
+
+ return 0;
}
+
gf_boolean_t
-afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child)
+afr_selfheal_enabled (xlator_t *this)
{
- gf_boolean_t source_xattrs = _gf_false;
+ afr_private_t *priv = NULL;
+ gf_boolean_t data = _gf_false;
- GF_ASSERT (child < child_count);
+ priv = this->private;
- if ((child >= 0) && (child < child_count) &&
- sources[child]) {
- source_xattrs = _gf_true;
- }
- return source_xattrs;
+ gf_string2boolean (priv->data_self_heal, &data);
+
+ return data || priv->metadata_self_heal || priv->entry_self_heal;
}
-gf_boolean_t
-afr_is_child_present (int32_t *success_children, int32_t child_count,
- int32_t child)
+
+
+int
+afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
{
- gf_boolean_t success_child = _gf_false;
- int i = 0;
+ call_frame_t *heal = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ int err = 0;
- GF_ASSERT (child < child_count);
+ local = frame->local;
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- if (child == success_children[i]) {
- success_child = _gf_true;
- break;
- }
- }
- return success_child;
+ ret = afr_replies_interpret (frame, this, local->refreshinode);
+
+ err = afr_inode_refresh_err (frame, this);
+
+ afr_replies_wipe (local, this->private);
+
+ if (ret && afr_selfheal_enabled (this)) {
+ heal = copy_frame (frame);
+ if (heal)
+ heal->root->pid = -1;
+ ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto refresh_done;
+ } else {
+ refresh_done:
+ local->refreshfn (frame, this, err);
+ }
+
+ return 0;
}
-gf_boolean_t
-afr_is_read_child (int32_t *success_children, int32_t *sources,
- int32_t child_count, int32_t child)
+
+int
+afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *par)
{
- gf_boolean_t success_child = _gf_false;
- gf_boolean_t source = _gf_false;
+ afr_local_t *local = NULL;
+ int call_child = (long) cookie;
+ int call_count = 0;
- if (child < 0) {
- return _gf_false;
- }
+ local = frame->local;
- GF_ASSERT (success_children);
- GF_ASSERT (child_count > 0);
+ local->replies[call_child].valid = 1;
+ local->replies[call_child].op_ret = op_ret;
+ local->replies[call_child].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[call_child].poststat = *buf;
+ local->replies[call_child].postparent = *par;
+ local->replies[call_child].xdata = dict_ref (xdata);
+ }
- success_child = afr_is_child_present (success_children, child_count,
- child);
- if (!success_child)
- goto out;
- if (NULL == sources) {
- source = _gf_true;
- goto out;
- }
- source = afr_is_source_child (sources, child_count, child);
-out:
- return (success_child && source);
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_inode_refresh_done (frame, this);
+
+ return 0;
}
-int32_t
-afr_hash_child (int32_t *success_children, int32_t child_count,
- unsigned int hmode, uuid_t gfid)
+
+int
+afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i,
+ inode_t *inode, dict_t *xdata)
{
- uuid_t gfid_copy = {0,};
- pid_t pid;
+ loc_t loc = {0, };
+ afr_private_t *priv = NULL;
- if (!hmode) {
- return -1;
- }
+ priv = this->private;
- if (gfid) {
- uuid_copy(gfid_copy,gfid);
- }
- if (hmode > 1) {
- /*
- * Why getpid? Because it's one of the cheapest calls
- * available - faster than gethostname etc. - and returns a
- * constant-length value that's sure to be shorter than a UUID.
- * It's still very unlikely to be the same across clients, so
- * it still provides good mixing. We're not trying for
- * perfection here. All we need is a low probability that
- * multiple clients won't converge on the same subvolume.
- */
- pid = getpid();
- memcpy (gfid_copy, &pid, sizeof(pid));
- }
+ loc.inode = inode;
+ uuid_copy (loc.gfid, inode->gfid);
- return SuperFastHash((char *)gfid_copy,
- sizeof(gfid_copy)) % child_count;
+ STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->lookup, &loc, xdata);
+ return 0;
}
-/* If sources is NULL the xattrs are assumed to be of source for all
- * success_children.
- */
+
int
-afr_select_read_child_from_policy (int32_t *success_children,
- int32_t child_count, int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources,
- unsigned int hmode, uuid_t gfid)
+afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)
{
- int32_t read_child = -1;
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t *xdata = NULL;
- GF_ASSERT (success_children);
+ priv = this->private;
+ local = frame->local;
- read_child = config_read_child;
- if (afr_is_read_child (success_children, sources, child_count,
- read_child))
- goto out;
+ afr_replies_wipe (local, priv);
- read_child = prev_read_child;
- if (afr_is_read_child (success_children, sources, child_count,
- read_child))
- goto out;
+ xdata = dict_new ();
+ if (!xdata) {
+ afr_inode_refresh_done (frame, this);
+ return 0;
+ }
- read_child = afr_hash_child (success_children, child_count,
- hmode, gfid);
- if (afr_is_read_child (success_children, sources, child_count,
- read_child)) {
- goto out;
- }
+ if (afr_xattr_req_prepare (this, xdata) != 0) {
+ dict_unref (xdata);
+ afr_inode_refresh_done (frame, this);
+ return 0;
+ }
- for (i = 0; i < child_count; i++) {
- read_child = success_children[i];
- if (read_child < 0)
- break;
- if (afr_is_read_child (success_children, sources, child_count,
- read_child))
- goto out;
- }
- read_child = -1;
+ local->call_count = AFR_COUNT (local->child_up, priv->child_count);
-out:
- return read_child;
+ call_count = local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ afr_inode_refresh_subvol (frame, this, i, local->refreshinode,
+ xdata);
+
+ if (!--call_count)
+ break;
+ }
+
+ dict_unref (xdata);
+
+ return 0;
}
-/* This function should be used when all the success_children are sources
- */
-void
-afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
- int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child, uuid_t gfid)
+
+int
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t refreshfn)
{
- int read_child = -1;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- priv = this->private;
- read_child = afr_select_read_child_from_policy (fresh_children,
- priv->child_count,
- prev_read_child,
- config_read_child,
- NULL,
- priv->hash_mode, gfid);
- if (read_child >= 0)
- afr_inode_set_read_ctx (this, inode, read_child,
- fresh_children);
+ local = frame->local;
+
+ local->refreshfn = refreshfn;
+
+ if (local->refreshinode) {
+ inode_unref (local->refreshinode);
+ local->refreshinode = NULL;
+ }
+
+ local->refreshinode = inode_ref (inode);
+
+ afr_inode_refresh_do (frame, this);
+
+ return 0;
}
-/* afr_next_call_child ()
- * This is a common function used by all the read-type fops
- * This function should not be called with the inode's read_children array.
- * The fop's handler should make a copy of the inode's read_children,
- * preferred read_child into the local vars, because while this function is
- * in execution there is a chance for inode's read_ctx to change.
- */
-int32_t
-afr_next_call_child (int32_t *fresh_children, unsigned char *child_up,
- size_t child_count, int32_t *last_index,
- int32_t read_child)
+
+int
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
{
- int next_index = 0;
- int32_t next_call_child = -1;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
- GF_ASSERT (last_index);
+ priv = this->private;
- next_index = *last_index;
-retry:
- next_index++;
- if ((next_index >= child_count) ||
- (fresh_children[next_index] == -1))
- goto out;
- if ((fresh_children[next_index] == read_child) ||
- (!child_up[fresh_children[next_index]]))
- goto retry;
- *last_index = next_index;
- next_call_child = fresh_children[next_index];
-out:
- return next_call_child;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to set dict value for %s",
+ priv->pending_key[i]);
+ /* 3 = data+metadata+entry */
+ }
+ ret = dict_set_uint64 (xattr_req, AFR_DIRTY,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty "
+ "query flag");
+ }
+
+ return ret;
}
- /* This function should not be called with the inode's read_children array.
- * The fop's handler should make a copy of the inode's read_children,
- * preferred read_child into the local vars, because while this function is
- * in execution there is a chance for inode's read_ctx to change.
- */
-int32_t
-afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child,
- int32_t *fresh_children,
- int32_t *call_child, int32_t *last_index)
+int
+afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
+ dict_t *xattr_req, loc_t *loc)
{
- int ret = 0;
- afr_private_t *priv = NULL;
- int i = 0;
-
- GF_ASSERT (child_up);
- GF_ASSERT (call_child);
- GF_ASSERT (last_index);
- GF_ASSERT (fresh_children);
+ int ret = -ENOMEM;
- if (read_child < 0) {
- ret = -EIO;
+ local->xattr_req = dict_new ();
+ if (!local->xattr_req)
goto out;
- }
- priv = this->private;
- *call_child = -1;
- *last_index = -1;
+ if (xattr_req)
+ dict_copy (xattr_req, local->xattr_req);
- if (child_up[read_child]) {
- *call_child = read_child;
- } else {
- for (i = 0; i < priv->child_count; i++) {
- if (fresh_children[i] == -1)
- break;
- if (child_up[fresh_children[i]]) {
- *call_child = fresh_children[i];
- ret = 0;
- break;
- }
- }
+ ret = afr_xattr_req_prepare (this, local->xattr_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to prepare xattr_req", loc->path);
+ }
- if (*call_child == -1) {
- ret = -ENOTCONN;
- goto out;
- }
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_INODELK_COUNT);
+ }
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_ENTRYLK_COUNT);
+ }
- *last_index = i;
+ ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_PARENT_ENTRYLK);
}
+
+ ret = 0;
out:
- gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, "
- "last_index: %d", ret, *call_child, *last_index);
return ret;
}
-void
-afr_reset_xattr (dict_t **xattr, unsigned int child_count)
+
+int
+afr_hash_child (inode_t *inode, int32_t child_count, int hashmode)
{
- unsigned int i = 0;
+ uuid_t gfid_copy = {0,};
+ pid_t pid;
- if (!xattr)
- goto out;
- for (i = 0; i < child_count; i++) {
- if (xattr[i]) {
- dict_unref (xattr[i]);
- xattr[i] = NULL;
- }
+ if (!hashmode) {
+ return -1;
}
-out:
- return;
-}
-void
-afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count)
-{
- afr_reset_xattr (xattr, child_count);
- GF_FREE (xattr);
+ if (inode) {
+ uuid_copy (gfid_copy, inode->gfid);
+ }
+
+ if (hashmode > 1) {
+ /*
+ * Why getpid? Because it's one of the cheapest calls
+ * available - faster than gethostname etc. - and returns a
+ * constant-length value that's sure to be shorter than a UUID.
+ * It's still very unlikely to be the same across clients, so
+ * it still provides good mixing. We're not trying for
+ * perfection here. All we need is a low probability that
+ * multiple clients won't converge on the same subvolume.
+ */
+ pid = getpid();
+ memcpy (gfid_copy, &pid, sizeof(pid));
+ }
+
+ return SuperFastHash((char *)gfid_copy,
+ sizeof(gfid_copy)) % child_count;
}
-void
-afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- sh = &local->self_heal;
- priv = this->private;
+int
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable)
+{
+ afr_private_t *priv = NULL;
+ int read_subvol = -1;
+ int i = 0;
- GF_FREE (sh->buf);
+ priv = this->private;
- GF_FREE (sh->parentbufs);
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
+ return priv->read_child;
- if (sh->inode)
- inode_unref (sh->inode);
+ /* second preference - use hashed mode */
+ read_subvol = afr_hash_child (inode, priv->child_count,
+ priv->hash_mode);
+ if (read_subvol >= 0 && readable[read_subvol])
+ return read_subvol;
- afr_xattr_array_destroy (sh->xattr, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i])
+ return i;
+ }
- GF_FREE (sh->child_errno);
+ /* no readable subvolumes, either split brain or all subvols down */
- afr_matrix_cleanup (sh->pending_matrix, priv->child_count);
- afr_matrix_cleanup (sh->delta_matrix, priv->child_count);
+ return -1;
+}
- GF_FREE (sh->sources);
- GF_FREE (sh->success);
+int
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type)
+{
+ int ret = -1;
- GF_FREE (sh->locked_nodes);
+ if (type == AFR_METADATA_TRANSACTION)
+ ret = afr_inode_read_subvol_get (inode, this, 0, readable,
+ event_p);
+ else
+ ret = afr_inode_read_subvol_get (inode, this, readable, 0,
+ event_p);
+ return ret;
+}
- if (sh->healing_fd) {
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- }
- GF_FREE ((char *)sh->linkname);
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ unsigned char *readable = NULL;
+ unsigned char *intersection = NULL;
+ int subvol = -1;
+ int event = 0;
- GF_FREE (sh->success_children);
+ priv = this->private;
- GF_FREE (sh->fresh_children);
+ readable = alloca0 (priv->child_count);
+ data_readable = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+ intersection = alloca0 (priv->child_count);
- GF_FREE (sh->fresh_parent_dirs);
+ afr_inode_read_subvol_type_get (inode, this, readable, &event, type);
- loc_wipe (&sh->parent_loc);
- loc_wipe (&sh->lookup_loc);
+ afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable,
+ &event);
- GF_FREE (sh->checksum);
+ AFR_INTERSECT (intersection, data_readable, metadata_readable,
+ priv->child_count);
- GF_FREE (sh->write_needed);
- if (sh->healing_fd)
- fd_unref (sh->healing_fd);
+ if (AFR_COUNT (intersection, priv->child_count) > 0)
+ subvol = afr_read_subvol_select_by_policy (inode, this,
+ intersection);
+ else
+ subvol = afr_read_subvol_select_by_policy (inode, this,
+ readable);
+ if (subvol_p)
+ *subvol_p = subvol;
+ if (event_p)
+ *event_p = event;
+ return subvol;
}
void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
{
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
priv = this->private;
afr_matrix_cleanup (local->pending, priv->child_count);
- afr_matrix_cleanup (local->transaction.txn_changelog,
- priv->child_count);
GF_FREE (local->internal_lock.locked_nodes);
- GF_FREE (local->internal_lock.inode_locked_nodes);
+ for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
+ GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
+ }
GF_FREE (local->internal_lock.lower_locked_nodes);
@@ -844,6 +861,8 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
GF_FREE (local->transaction.pre_op);
GF_FREE (local->transaction.eager_lock);
+ GF_FREE (local->transaction.fop_subvols);
+ GF_FREE (local->transaction.failed_subvols);
GF_FREE (local->transaction.basename);
GF_FREE (local->transaction.new_basename);
@@ -851,11 +870,38 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
loc_wipe (&local->transaction.parent_loc);
loc_wipe (&local->transaction.new_parent_loc);
- GF_FREE (local->transaction.postop_piggybacked);
}
void
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv)
+{
+ int i;
+
+ if (!local->replies)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].xdata) {
+ dict_unref (local->replies[i].xdata);
+ local->replies[i].xdata = NULL;
+ }
+ }
+
+ memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
+}
+
+void
+afr_remove_eager_lock_stub (afr_local_t *local)
+{
+ LOCK (&local->fd->lock);
+ {
+ list_del_init (&local->transaction.eager_locked);
+ }
+ UNLOCK (&local->fd->lock);
+}
+
+void
afr_local_cleanup (afr_local_t *local, xlator_t *this)
{
afr_private_t * priv = NULL;
@@ -863,7 +909,11 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
if (!local)
return;
- afr_local_sh_cleanup (local, this);
+ syncbarrier_destroy (&local->barrier);
+
+ if (local->transaction.eager_lock_on &&
+ !list_empty (&local->transaction.eager_locked))
+ afr_remove_eager_lock_stub (local);
afr_local_transaction_cleanup (local, this);
@@ -881,40 +931,26 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
if (local->dict)
dict_unref (local->dict);
+ afr_replies_wipe (local, priv);
GF_FREE(local->replies);
GF_FREE (local->child_up);
- GF_FREE (local->child_errno);
+ GF_FREE (local->read_attempted);
- GF_FREE (local->fresh_children);
+ GF_FREE (local->readable);
- { /* lookup */
- if (local->cont.lookup.xattrs) {
- afr_reset_xattr (local->cont.lookup.xattrs,
- priv->child_count);
- GF_FREE (local->cont.lookup.xattrs);
- local->cont.lookup.xattrs = NULL;
- }
-
- if (local->cont.lookup.xattr) {
- dict_unref (local->cont.lookup.xattr);
- }
+ if (local->inode)
+ inode_unref (local->inode);
- if (local->cont.lookup.inode) {
- inode_unref (local->cont.lookup.inode);
- }
-
- GF_FREE (local->cont.lookup.postparents);
+ if (local->parent)
+ inode_unref (local->parent);
- GF_FREE (local->cont.lookup.bufs);
+ if (local->parent2)
+ inode_unref (local->parent2);
- GF_FREE (local->cont.lookup.success_children);
-
- GF_FREE (local->cont.lookup.sources);
- afr_matrix_cleanup (local->cont.lookup.pending_matrix,
- priv->child_count);
- }
+ if (local->refreshinode)
+ inode_unref (local->refreshinode);
{ /* getxattr */
GF_FREE (local->cont.getxattr.name);
@@ -948,6 +984,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
{ /* writev */
GF_FREE (local->cont.writev.vector);
+ if (local->cont.writev.iobref)
+ iobref_unref (local->cont.writev.iobref);
}
{ /* setxattr */
@@ -1009,1040 +1047,222 @@ afr_frame_return (call_frame_t *frame)
return call_count;
}
-int
-afr_set_elem_count_get (unsigned char *elems, int child_count)
-{
- int i = 0;
- int ret = 0;
-
- for (i = 0; i < child_count; i++)
- if (elems[i])
- ret++;
- return ret;
-}
-
-/**
- * up_children_count - return the number of children that are up
- */
-
-unsigned int
-afr_up_children_count (unsigned char *child_up, unsigned int child_count)
-{
- return afr_set_elem_count_get (child_up, child_count);
-}
-
-unsigned int
-afr_locked_children_count (unsigned char *children, unsigned int child_count)
-{
- return afr_set_elem_count_get (children, child_count);
-}
-
-unsigned int
-afr_pre_op_done_children_count (unsigned char *pre_op,
- unsigned int child_count)
-{
- return afr_set_elem_count_get (pre_op, child_count);
-}
gf_boolean_t
-afr_is_fresh_lookup (loc_t *loc, xlator_t *this)
-{
- uint64_t ctx = 0;
- int32_t ret = 0;
-
- GF_ASSERT (loc);
- GF_ASSERT (this);
- GF_ASSERT (loc->inode);
-
- ret = inode_ctx_get (loc->inode, this, &ctx);
- if (0 == ret)
- return _gf_false;
- return _gf_true;
-}
-
-void
-afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
-{
- GF_ASSERT (loc);
- GF_ASSERT (buf);
-
- uuid_copy (loc->gfid, buf->ia_gfid);
- if (postparent)
- uuid_copy (loc->pargfid, postparent->ia_gfid);
-}
-
-int
-afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
-{
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
- dict_t **xattr = NULL;
- int32_t *success_children = NULL;
- int32_t *sources = NULL;
- afr_private_t *priv = NULL;
- int32_t read_child = -1;
- int ret = 0;
- int i = 0;
-
- GF_ASSERT (local);
-
- buf = &local->cont.lookup.buf;
- postparent = &local->cont.lookup.postparent;
- xattr = &local->cont.lookup.xattr;
- priv = this->private;
+afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this)
+{
+ int i = 0;
+ int tmp = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].xdata)
+ continue;
+ if (dict_get_int32 (local->replies[i].xdata,
+ GLUSTERFS_PARENT_ENTRYLK,
+ &tmp) == 0)
+ if (tmp)
+ return _gf_true;
+ }
- read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode,
- local->fresh_children);
- if (read_child < 0) {
- ret = -1;
- goto out;
- }
- success_children = local->cont.lookup.success_children;
- sources = local->cont.lookup.sources;
- memset (sources, 0, sizeof (*sources) * priv->child_count);
- afr_children_intersection_get (local->fresh_children, success_children,
- sources, priv->child_count);
- if (!sources[read_child]) {
- read_child = -1;
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i]) {
- read_child = i;
- break;
- }
- }
- }
- if (read_child < 0) {
- ret = -1;
- goto out;
- }
- gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d",
- read_child);
- if (!*xattr)
- *xattr = dict_ref (local->cont.lookup.xattrs[read_child]);
- *buf = local->cont.lookup.bufs[read_child];
- *postparent = local->cont.lookup.postparents[read_child];
-
- if (IA_INVAL == local->cont.lookup.inode->ia_type) {
- /* fix for RT #602 */
- local->cont.lookup.inode->ia_type = buf->ia_type;
- }
-out:
- return ret;
+ return _gf_false;
}
-static void
-afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this,
- int child_index, dict_t *xattr)
-{
- uint32_t inodelk_count = 0;
- uint32_t entrylk_count = 0;
- int ret = -1;
- uint32_t parent_entrylk = 0;
-
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (xattr);
- GF_ASSERT (child_index >= 0);
-
- ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT,
- &inodelk_count);
- if (ret == 0)
- local->inodelk_count += inodelk_count;
-
- ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT,
- &entrylk_count);
- if (ret == 0)
- local->entrylk_count += entrylk_count;
- ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK,
- &parent_entrylk);
- if (!ret)
- local->cont.lookup.parent_entrylk += parent_entrylk;
-}
/*
- * It's important to maintain a commutative property on do_*_self_heal and
- * found*; once set, they must not be cleared by a subsequent iteration or
- * call, so that they represent a logical OR of all iterations and calls
- * regardless of child/key order. That allows the caller to call us multiple
- * times without having to use a separate variable as a "reduce" accumulator.
- */
-static void
-afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this,
- dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- int i = 0;
- int ret = -1;
- void *pending_raw = NULL;
- int32_t *pending = NULL;
-
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (xattr);
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
- if (ret != 0) {
- continue;
- }
- pending = pending_raw;
-
- if (pending[AFR_METADATA_TRANSACTION]) {
- gf_log(this->name, GF_LOG_DEBUG,
- "metadata self-heal is pending for %s.",
- local->loc.path);
- local->self_heal.do_metadata_self_heal = _gf_true;
- }
-
- if (pending[AFR_ENTRY_TRANSACTION]) {
- gf_log(this->name, GF_LOG_DEBUG,
- "entry self-heal is pending for %s.",
- local->loc.path);
- local->self_heal.do_entry_self_heal = _gf_true;
- }
-
- if (pending[AFR_DATA_TRANSACTION]) {
- gf_log(this->name, GF_LOG_DEBUG,
- "data self-heal is pending for %s.",
- local->loc.path);
- local->self_heal.do_data_self_heal = _gf_true;
- }
- }
-}
-
-void
-afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this)
-{
- int32_t *sources = NULL;
- afr_private_t *priv = NULL;
- int32_t subvol_status = 0;
- int32_t *success_children = NULL;
- dict_t **xattrs = NULL;
- struct iatt *bufs = NULL;
- int32_t **pending_matrix = NULL;
-
- priv = this->private;
-
- sources = GF_CALLOC (priv->child_count, sizeof (*sources),
- gf_afr_mt_int32_t);
- if (NULL == sources)
- goto out;
- success_children = local->cont.lookup.success_children;
- xattrs = local->cont.lookup.xattrs;
- bufs = local->cont.lookup.bufs;
- pending_matrix = local->cont.lookup.pending_matrix;
- afr_build_sources (this, xattrs, bufs, pending_matrix,
- sources, success_children, AFR_METADATA_TRANSACTION,
- &subvol_status, _gf_false);
- if (subvol_status & SPLIT_BRAIN)
- local->cont.lookup.possible_spb = _gf_true;
-out:
- GF_FREE (sources);
-}
-
-static void
-afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this,
- struct iatt *buf, struct iatt *lookup_buf)
-{
- if (PERMISSION_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- gf_log (this->name, GF_LOG_DEBUG,
- "permissions differ for %s ", local->loc.path);
- local->self_heal.do_metadata_self_heal = _gf_true;
- }
-
- if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- local->self_heal.do_metadata_self_heal = _gf_true;
- gf_log (this->name, GF_LOG_DEBUG,
- "ownership differs for %s ", local->loc.path);
- }
-
- if (SIZE_DIFFERS (buf, lookup_buf)
- && IA_ISREG (buf->ia_type)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "size differs for %s ", local->loc.path);
- local->self_heal.do_data_self_heal = _gf_true;
- }
-
- if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) {
- /* mismatching gfid */
- gf_log (this->name, GF_LOG_WARNING,
- "%s: gfid different on subvolume", local->loc.path);
- }
-}
+ * Quota size xattrs are not maintained by afr. There is a
+ * possibility that they differ even when both the directory changelog xattrs
+ * suggest everything is fine. So if there is at least one 'source' check among
+ * the sources which has the maximum quota size. Otherwise check among all the
+ * available ones for maximum quota size. This way if there is a source and
+ * stale copies it always votes for the 'source'.
+ * */
static void
-afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this)
+afr_handle_quota_size (call_frame_t *frame, xlator_t *this)
{
- gf_boolean_t split_brain = _gf_false;
- afr_self_heal_t *sh = NULL;
+ unsigned char *readable = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0;
+ uint64_t size = 0;
+ uint64_t max_size = 0;
+ int readable_cnt = 0;
- sh = &local->self_heal;
-
- split_brain = afr_is_split_brain (this, local->cont.lookup.inode);
- split_brain = split_brain || local->cont.lookup.possible_spb;
- if ((local->success_count > 0) && split_brain &&
- IA_ISREG (local->cont.lookup.inode->ia_type)) {
- sh->force_confirm_spb = _gf_true;
- gf_log (this->name, GF_LOG_DEBUG,
- "split brain detected during lookup of %s.",
- local->loc.path);
- }
-}
-
-static void
-afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this)
-{
- GF_ASSERT (local);
- GF_ASSERT (this);
-
- if ((local->success_count > 0) && (local->enoent_count > 0)) {
- local->self_heal.do_metadata_self_heal = _gf_true;
- local->self_heal.do_data_self_heal = _gf_true;
- local->self_heal.do_entry_self_heal = _gf_true;
- local->self_heal.do_gfid_self_heal = _gf_true;
- local->self_heal.do_missing_entry_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "entries are missing in lookup of %s.",
- local->loc.path);
- }
-
- return;
-}
-
-gf_boolean_t
-afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv)
-{
- GF_ASSERT (sh);
- GF_ASSERT (priv);
-
- if (sh->force_confirm_spb)
- return _gf_true;
- return (sh->do_gfid_self_heal
- || sh->do_missing_entry_self_heal
- || (afr_data_self_heal_enabled (priv->data_self_heal) &&
- sh->do_data_self_heal)
- || (priv->metadata_self_heal && sh->do_metadata_self_heal)
- || (priv->entry_self_heal && sh->do_entry_self_heal));
-}
-
-afr_transaction_type
-afr_transaction_type_get (ia_type_t ia_type)
-{
- afr_transaction_type type = AFR_METADATA_TRANSACTION;
-
- GF_ASSERT (ia_type != IA_INVAL);
-
- if (IA_ISDIR (ia_type)) {
- type = AFR_ENTRY_TRANSACTION;
- } else if (IA_ISREG (ia_type)) {
- type = AFR_DATA_TRANSACTION;
- }
- return type;
-}
-
-int
-afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
- int32_t *read_child)
-{
- ia_type_t ia_type = IA_INVAL;
- int32_t source = -1;
- int ret = -1;
- dict_t **xattrs = NULL;
- int32_t *success_children = NULL;
- afr_transaction_type type = AFR_METADATA_TRANSACTION;
- uuid_t *gfid = NULL;
-
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (local->success_count > 0);
-
- success_children = local->cont.lookup.success_children;
- /*We can take the success_children[0] only because we already
- *handle the conflicting children other wise, we could select the
- *read_child based on wrong file type
- */
- ia_type = local->cont.lookup.bufs[success_children[0]].ia_type;
- type = afr_transaction_type_get (ia_type);
- xattrs = local->cont.lookup.xattrs;
- gfid = &local->cont.lookup.buf.ia_gfid;
- source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs,
- type, *gfid);
- if (source < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "failed to select source "
- "for %s", local->loc.path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s",
- source, local->loc.path);
- *read_child = source;
- ret = 0;
-out:
- return ret;
-}
-
-static inline gf_boolean_t
-afr_is_transaction_running (afr_local_t *local)
-{
- GF_ASSERT (local->fop == GF_FOP_LOOKUP);
- return ((local->inodelk_count > 0) || (local->entrylk_count > 0));
-}
-
-void
-afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
- gf_boolean_t background, ia_type_t ia_type, char *reason,
- void (*gfid_sh_success_cbk) (call_frame_t *sh_frame,
- xlator_t *this),
- int (*unwind) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- int32_t sh_failed))
-{
- afr_local_t *local = NULL;
- char sh_type_str[256] = {0,};
- char *bg = "";
-
- GF_ASSERT (frame);
- GF_ASSERT (this);
- GF_ASSERT (inode);
- GF_ASSERT (ia_type != IA_INVAL);
-
- local = frame->local;
- local->self_heal.background = background;
- local->self_heal.type = ia_type;
- local->self_heal.unwind = unwind;
- local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk;
-
- afr_self_heal_type_str_get (&local->self_heal,
- sh_type_str,
- sizeof (sh_type_str));
-
- if (background)
- bg = "background";
- gf_log (this->name, GF_LOG_DEBUG,
- "%s %s self-heal triggered. path: %s, reason: %s", bg,
- sh_type_str, local->loc.path, reason);
-
- afr_self_heal (frame, this, inode);
-}
-
-unsigned int
-afr_gfid_missing_count (const char *xlator_name, int32_t *success_children,
- struct iatt *bufs, unsigned int child_count,
- const char *path)
-{
- unsigned int gfid_miss_count = 0;
- int i = 0;
- struct iatt *child1 = NULL;
-
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- child1 = &bufs[success_children[i]];
- if (uuid_is_null (child1->ia_gfid)) {
- gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null"
- " on subvolume %d", path, success_children[i]);
- gfid_miss_count++;
- }
- }
-
- return gfid_miss_count;
-}
-
-static int
-afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this)
-{
- int32_t *success_children = NULL;
- afr_private_t *priv = NULL;
- struct iatt *bufs = NULL;
- int miss_count = 0;
-
- priv = this->private;
- bufs = local->cont.lookup.bufs;
- success_children = local->cont.lookup.success_children;
-
- miss_count = afr_gfid_missing_count (this->name, success_children,
- bufs, priv->child_count,
- local->loc.path);
- return miss_count;
-}
-
-gf_boolean_t
-afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count, const char *path,
- const char *xlator_name)
-{
- gf_boolean_t conflicting = _gf_false;
- int i = 0;
- struct iatt *child1 = NULL;
- struct iatt *child2 = NULL;
- uuid_t *gfid = NULL;
-
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- child1 = &bufs[success_children[i]];
- if ((!gfid) && (!uuid_is_null (child1->ia_gfid)))
- gfid = &child1->ia_gfid;
-
- if (i == 0)
- continue;
-
- child2 = &bufs[success_children[i-1]];
- if (FILETYPE_DIFFERS (child1, child2)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype "
- "differs on subvolumes (%d, %d)", path,
- success_children[i-1], success_children[i]);
- conflicting = _gf_true;
- goto out;
- }
- if (!gfid || uuid_is_null (child1->ia_gfid))
- continue;
- if (uuid_compare (*gfid, child1->ia_gfid)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs"
- " on subvolume %d", path, success_children[i]);
- conflicting = _gf_true;
- goto out;
- }
- }
-out:
- return conflicting;
-}
-
-/* afr_update_gfid_from_iatts: This function should be called only if the
- * iatts are not conflicting.
- */
-void
-afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs,
- int32_t *success_children, unsigned int child_count)
-{
- uuid_t *gfid = NULL;
- int i = 0;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child == -1)
- break;
- if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) {
- gfid = &bufs[child].ia_gfid;
- } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) {
- if (uuid_compare (*gfid, bufs[child].ia_gfid)) {
- GF_ASSERT (0);
- goto out;
- }
- }
- }
- if (gfid && (!uuid_is_null (*gfid)))
- uuid_copy (uuid, *gfid);
-out:
- return;
-}
-
-static gf_boolean_t
-afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- gf_boolean_t conflict = _gf_false;
-
- priv = this->private;
- conflict = afr_conflicting_iattrs (local->cont.lookup.bufs,
- local->cont.lookup.success_children,
- priv->child_count, local->loc.path,
- this->name);
- return conflict;
-}
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+
+ readable = alloca0 (priv->child_count);
+
+ afr_inode_read_subvol_get (local->inode, this, readable, 0, 0);
+
+ readable_cnt = AFR_COUNT (readable, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size))
+ continue;
+ if (size > max_size)
+ max_size = size;
+ }
-gf_boolean_t
-afr_open_only_data_self_heal (char *data_self_heal)
-{
- return !strcmp (data_self_heal, "open");
+ if (!max_size)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size))
+ continue;
+ }
}
-gf_boolean_t
-afr_data_self_heal_enabled (char *data_self_heal)
-{
- gf_boolean_t enabled = _gf_false;
-
- if (gf_string2boolean (data_self_heal, &enabled) == -1) {
- enabled = !strcmp (data_self_heal, "open");
- GF_ASSERT (enabled);
- }
-
- return enabled;
-}
static void
-afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this)
+afr_lookup_done (call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- struct iatt *bufs = NULL;
- dict_t **xattr = NULL;
- afr_private_t *priv = NULL;
- int32_t child1 = -1;
- int32_t child2 = -1;
- afr_self_heal_t *sh = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
+ unsigned char *readable = NULL;
+ int event = 0;
+ struct afr_reply *replies = NULL;
+ uuid_t read_gfid = {0, };
+ gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t can_interpret = _gf_true;
priv = this->private;
- sh = &local->self_heal;
-
- afr_detect_self_heal_by_lookup_status (local, this);
-
- if (afr_lookup_gfid_missing_count (local, this))
- local->self_heal.do_gfid_self_heal = _gf_true;
-
- if (_gf_true == afr_lookup_conflicting_entries (local, this))
- local->self_heal.do_missing_entry_self_heal = _gf_true;
- else
- afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req,
- local->cont.lookup.bufs,
- local->cont.lookup.success_children,
- priv->child_count);
-
- bufs = local->cont.lookup.bufs;
- for (i = 1; i < local->success_count; i++) {
- child1 = local->cont.lookup.success_children[i-1];
- child2 = local->cont.lookup.success_children[i];
- afr_detect_self_heal_by_iatt (local, this,
- &bufs[child1], &bufs[child2]);
- }
-
- xattr = local->cont.lookup.xattrs;
- for (i = 0; i < local->success_count; i++) {
- child1 = local->cont.lookup.success_children[i];
- afr_lookup_set_self_heal_params_by_xattr (local, this,
- xattr[child1]);
- }
- if (afr_open_only_data_self_heal (priv->data_self_heal))
- sh->do_data_self_heal = _gf_false;
- if (sh->do_metadata_self_heal)
- afr_lookup_check_set_metadata_split_brain (local, this);
- afr_detect_self_heal_by_split_brain_status (local, this);
-}
-
-int
-afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- int32_t sh_failed)
-{
- afr_local_t *local = NULL;
- int ret = -1;
- dict_t *xattr = NULL;
- int32_t spb = 0;
-
local = frame->local;
+ replies = local->replies;
- if (op_ret == -1) {
- local->op_ret = -1;
- local->op_errno = afr_most_important_error(local->op_errno,
- op_errno, _gf_true);
-
- goto out;
- } else {
- local->op_ret = 0;
- }
+ locked_entry = afr_is_entry_possibly_under_txn (local, this);
- afr_lookup_done_success_action (frame, this, _gf_true);
- xattr = local->cont.lookup.xattr;
- if (xattr) {
- ret = dict_set_int32 (xattr, "sh-failed", sh_failed);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
- "sh-failed to %d", local->loc.path, sh_failed);
-
- if (local->self_heal.actual_sh_started == _gf_true &&
- sh_failed == 0) {
- ret = dict_set_int32 (xattr, "actual-sh-done", 1);
- if (ret)
- gf_log(this->name, GF_LOG_ERROR, "%s: Failed to"
- " set actual-sh-done to %d",
- local->loc.path,
- local->self_heal.actual_sh_started);
- }
+ readable = alloca0 (priv->child_count);
- if (local->loc.inode)
- spb = afr_is_split_brain (this, local->loc.inode);
- ret = dict_set_int32 (xattr, "split-brain", spb);
- }
-out:
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->cont.lookup.inode, &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
+ afr_inode_read_subvol_get (local->loc.parent, this, readable,
+ NULL, &event);
- return 0;
-}
-
-//TODO: At the moment only lookup needs this, so not doing any checks, in the
-// future we will have to do fop specific operations
-void
-afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_local_t *sh_local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- struct iatt *lookup_bufs = NULL;
- struct iatt *lookup_parentbufs = NULL;
-
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- local = sh->orig_frame->local;
- lookup_bufs = local->cont.lookup.bufs;
- lookup_parentbufs = local->cont.lookup.postparents;
- priv = this->private;
-
- memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf));
- memcpy (lookup_parentbufs, sh->parentbufs,
- priv->child_count * sizeof (*sh->parentbufs));
-
- afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count);
- if (local->cont.lookup.xattr) {
- dict_unref (local->cont.lookup.xattr);
- local->cont.lookup.xattr = NULL;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]);
- }
-
- afr_reset_children (local->cont.lookup.success_children,
- priv->child_count);
- afr_children_copy (local->cont.lookup.success_children,
- sh->fresh_children, priv->child_count);
-}
-
-static void
-afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this,
- gf_boolean_t *sh_launched)
-{
- unsigned int up_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- char *reason = NULL;
-
- GF_ASSERT (sh_launched);
- *sh_launched = _gf_false;
- priv = this->private;
- local = frame->local;
-
- up_count = afr_up_children_count (local->child_up, priv->child_count);
- if (up_count == 1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Only 1 child up - do not attempt to detect self heal");
- goto out;
- }
-
- afr_lookup_set_self_heal_params (local, this);
- if (afr_can_self_heal_proceed (&local->self_heal, priv)) {
- if (afr_is_transaction_running (local))
- goto out;
-
- reason = "lookup detected pending operations";
- afr_launch_self_heal (frame, this, local->cont.lookup.inode,
- _gf_true, local->cont.lookup.buf.ia_type,
- reason, afr_post_gfid_sh_success,
- afr_self_heal_lookup_unwind);
- *sh_launched = _gf_true;
- }
-out:
- return;
-}
-
-void
-afr_get_fresh_children (int32_t *success_children, int32_t *sources,
- int32_t *fresh_children, unsigned int child_count)
-{
- unsigned int i = 0;
- unsigned int j = 0;
-
- GF_ASSERT (success_children);
- GF_ASSERT (sources);
- GF_ASSERT (fresh_children);
-
- afr_reset_children (fresh_children, child_count);
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- if (afr_is_read_child (success_children, sources, child_count,
- success_children[i])) {
- fresh_children[j] = success_children[i];
- j++;
- }
- }
-}
-
-static int
-afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child)
-{
- afr_private_t *priv = NULL;
-
- GF_ASSERT (read_child >= 0);
-
- priv = this->private;
- afr_get_fresh_children (local->cont.lookup.success_children,
- local->cont.lookup.sources,
- local->fresh_children, priv->child_count);
- afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child,
- local->fresh_children);
-
- return 0;
-}
-
-int
-afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
- gf_boolean_t fail_conflict)
-{
- int32_t read_child = -1;
- int32_t ret = -1;
- afr_local_t *local = NULL;
- gf_boolean_t fresh_lookup = _gf_false;
-
- local = frame->local;
- fresh_lookup = local->cont.lookup.fresh_lookup;
-
- if (local->loc.parent == NULL)
- fail_conflict = _gf_true;
-
- if (afr_lookup_conflicting_entries (local, this)) {
- if (fail_conflict == _gf_false)
- ret = 0;
- goto out;
- }
-
- ret = afr_lookup_select_read_child (local, this, &read_child);
- if (!afr_is_transaction_running (local) || fresh_lookup) {
- if (read_child < 0)
- goto out;
-
- ret = afr_lookup_set_read_ctx (local, this, read_child);
- if (ret)
- goto out;
- }
-
- ret = afr_lookup_build_response_params (local, this);
- if (ret)
- goto out;
- afr_update_loc_gfids (&local->loc,
- &local->cont.lookup.buf,
- &local->cont.lookup.postparent);
-
- ret = 0;
-out:
- if (ret) {
- local->op_ret = -1;
- local->op_errno = EIO;
- }
- return ret;
-}
-
-int
-afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- int i = 0;
- int child = 0;
- int lsubvol = -1;
-
- priv = this->private;
- success_children = local->cont.lookup.success_children;
- bufs = local->cont.lookup.bufs;
- for (i = 0; i < priv->child_count; i++) {
- child = success_children[i];
- if (child == -1)
- break;
- if (uuid_is_null (bufs[child].ia_gfid))
- continue;
- if (lsubvol < 0) {
- lsubvol = child;
- } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) {
- lsubvol = child;
- } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) &&
- (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) {
- lsubvol = child;
- }
- }
- return lsubvol;
-}
-
-void
-afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this,
- int subvol)
-{
- afr_private_t *priv = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- int i = 0;
- int child = 0;
-
- priv = this->private;
- success_children = local->cont.lookup.success_children;
- bufs = local->cont.lookup.bufs;
- memcpy (local->fresh_children, success_children,
- sizeof (*success_children) * priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- child = local->fresh_children[i];
- if (child == -1)
- break;
- if (child == subvol)
- continue;
- if (uuid_is_null (bufs[child].ia_gfid) &&
- (bufs[child].ia_type == bufs[subvol].ia_type))
- continue;
- afr_children_rm_child (success_children, child,
- priv->child_count);
- local->success_count--;
- }
- afr_reset_children (local->fresh_children, priv->child_count);
-}
+ /* First, check if we have an ESTALE from somewhere.
+ If so, propagate that so that a revalidate can be
+ issued
+ */
+ op_errno = afr_final_errno (frame->local, this->private);
+ local->op_errno = op_errno;
+ if (op_errno == ESTALE) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ goto unwind;
+ }
-void
-afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this)
-{
- int lsubvol = 0;
+ read_subvol = -1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (locked_entry && replies[i].op_ret == -1 &&
+ replies[i].op_errno == ENOENT) {
+ /* Second, check entry is still
+ "underway" in creation */
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ read_subvol = i;
+ goto unwind;
+ }
- if (!afr_lookup_conflicting_entries (local, this))
- goto out;
+ if (replies[i].op_ret == -1)
+ continue;
- lsubvol = afr_lookup_get_latest_subvol (local, this);
- if (lsubvol < 0)
- goto out;
- afr_lookup_mark_other_entries_stale (local, this, lsubvol);
-out:
- return;
-}
+ if (read_subvol == -1 || !readable[read_subvol]) {
+ read_subvol = i;
+ uuid_copy (read_gfid, replies[i].poststat.ia_gfid);
+ local->op_ret = 0;
+ }
+ }
-gf_boolean_t
-afr_is_entry_possibly_under_creation (afr_local_t *local, xlator_t *this)
-{
- /*
- * We need to perform this test in lookup done and treat on going
- * create/DELETE as ENOENT.
- * Reason:
- Multiple clients A, B and C are attempting 'mkdir -p /mnt/a/b/c'
-
- 1 Client A is in the middle of mkdir(/a). It has acquired lock.
- It has performed mkdir(/a) on one subvol, and second one is still
- in progress
- 2 Client B performs a lookup, sees directory /a on one,
- ENOENT on the other, succeeds lookup.
- 3 Client B performs lookup on /a/b on both subvols, both return ENOENT
- (one subvol because /a/b does not exist, another because /a
- itself does not exist)
- 4 Client B proceeds to mkdir /a/b. It obtains entrylk on inode=/a with
- basename=b on one subvol, but fails on other subvol as /a is yet to
- be created by Client A.
- 5 Client A finishes mkdir of /a on other subvol
- 6 Client C also attempts to create /a/b, lookup returns ENOENT on
- both subvols.
- 7 Client C tries to obtain entrylk on on inode=/a with basename=b,
- obtains on one subvol (where B had failed), and waits for B to unlock
- on other subvol.
- 8 Client B finishes mkdir() on one subvol with GFID-1 and completes
- transaction and unlocks
- 9 Client C gets the lock on the second subvol, At this stage second
- subvol already has /a/b created from Client B, but Client C does not
- check that in the middle of mkdir transaction
- 10 Client C attempts mkdir /a/b on both subvols. It succeeds on
- ONLY ONE (where Client B could not get lock because of
- missing parent /a dir) with GFID-2, and gets EEXIST from ONE subvol.
- This way we have /a/b in GFID mismatch. One subvol got GFID-1 because
- Client B performed transaction on only one subvol (because entrylk()
- could not be obtained on second subvol because of missing parent dir --
- caused by premature/speculative succeeding of lookup() on /a when locks
- are detected). Other subvol gets GFID-2 from Client C because while
- it was waiting for entrylk() on both subvols, Client B was in the
- middle of creating mkdir() on only one subvol, and Client C does not
- "expect" this when it is between lock() and pre-op()/op() phase of the
- transaction.
- */
- if (local->cont.lookup.parent_entrylk && local->enoent_count)
- return _gf_true;
+ if (read_subvol == -1)
+ goto unwind;
+ /* We now have a read_subvol, which is readable[] (if there
+ were any). Next we look for GFID mismatches. We don't
+ consider a GFID mismatch as an error if read_subvol is
+ readable[] but the mismatching GFID subvol is not.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1) {
+ if (priv->child_up[i])
+ can_interpret = _gf_false;
+ continue;
+ }
- return _gf_false;
-}
+ if (!uuid_compare (replies[i].poststat.ia_gfid,
+ read_gfid))
+ continue;
+ can_interpret = _gf_false;
-static void
-afr_lookup_done (call_frame_t *frame, xlator_t *this)
-{
- int unwind = 1;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- gf_boolean_t sh_launched = _gf_false;
- gf_boolean_t fail_conflict = _gf_false;
- int gfid_miss_count = 0;
- int enotconn_count = 0;
- int up_children_count = 0;
+ if (locked_entry)
+ continue;
- priv = this->private;
- local = frame->local;
+ /* Now GFIDs mismatch. It's OK as long as this subvol
+ is not readable[] but read_subvol is */
+ if (readable[read_subvol] && !readable[i])
+ continue;
- if (afr_is_entry_possibly_under_creation (local, this)) {
+ /* LOG ERROR */
local->op_ret = -1;
- local->op_errno = ENOENT;
+ local->op_errno = EIO;
goto unwind;
}
- if (local->op_ret < 0)
- goto unwind;
-
- if (local->cont.lookup.parent_entrylk && local->success_count > 1)
- afr_succeed_lookup_on_latest_iatt (local, this);
-
- gfid_miss_count = afr_lookup_gfid_missing_count (local, this);
- up_children_count = afr_up_children_count (local->child_up,
- priv->child_count);
- enotconn_count = priv->child_count - up_children_count;
- if ((gfid_miss_count == local->success_count) &&
- (enotconn_count > 0)) {
- local->op_ret = -1;
- local->op_errno = EIO;
- gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, "
- "LOOKUP on a file without gfid is not allowed when "
- "some of the children are down", local->loc.path);
- goto unwind;
- }
-
- if ((gfid_miss_count == local->success_count) &&
- uuid_is_null (local->cont.lookup.gfid_req)) {
- local->op_ret = -1;
- local->op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present",
- local->loc.path);
- goto unwind;
- }
+ /* Fourth, for the finalized GFID, pick the best subvolume
+ to return stats from.
+ */
+ if (can_interpret) {
+ /* It is safe to call afr_replies_interpret() because we have
+ a response from all the UP subvolumes and all of them resolved
+ to the same GFID
+ */
+ if (afr_replies_interpret (frame, this, local->inode)) {
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ 0, 0);
+ afr_inode_read_subvol_reset (local->inode, this);
+ goto cant_interpret;
+ } else {
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ 0, 0);
+ }
+ } else {
+ cant_interpret:
+ if (read_subvol == -1)
+ dict_del (replies[0].xdata, GF_CONTENT_KEY);
+ else
+ dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
+ }
- if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req))
- fail_conflict = _gf_true;
- ret = afr_lookup_done_success_action (frame, this, fail_conflict);
- if (ret)
- goto unwind;
- uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req);
+ afr_handle_quota_size (frame, this);
- afr_lookup_perform_self_heal (frame, this, &sh_launched);
- if (sh_launched) {
- unwind = 0;
- goto unwind;
- }
+unwind:
+ if (read_subvol == -1)
+ read_subvol = 0;
- unwind:
- if (unwind) {
- AFR_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno, local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
- }
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
}
/*
@@ -2050,104 +1270,102 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this)
* others in that they must be given higher priority while
* returning to the user.
*
- * The hierarchy is ESTALE > EIO > ENOENT > others
+ * The hierarchy is ENODATA > ESTALE > ENOENT > others
*/
-int32_t
-afr_most_important_error(int32_t old_errno, int32_t new_errno,
- gf_boolean_t eio)
+
+int
+afr_higher_errno (int32_t old_errno, int32_t new_errno)
{
+ if (old_errno == ENODATA || new_errno == ENODATA)
+ return ENODATA;
if (old_errno == ESTALE || new_errno == ESTALE)
return ESTALE;
- if (eio && (old_errno == EIO || new_errno == EIO))
- return EIO;
if (old_errno == ENOENT || new_errno == ENOENT)
return ENOENT;
return new_errno;
}
-int32_t
-afr_resultant_errno_get (int32_t *children,
- int *child_errno, unsigned int child_count)
-{
- int i = 0;
- int32_t op_errno = 0;
- int child = 0;
- for (i = 0; i < child_count; i++) {
- if (children) {
- child = children[i];
- if (child == -1)
- break;
- } else {
- child = i;
- }
- op_errno = afr_most_important_error(op_errno,
- child_errno[child],
- _gf_false);
- }
- return op_errno;
+int
+afr_final_errno (afr_local_t *local, afr_private_t *priv)
+{
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == 0)
+ continue;
+ tmp_errno = local->replies[i].op_errno;
+ op_errno = afr_higher_errno (op_errno, tmp_errno);
+ }
+
+ return op_errno;
}
-static void
-afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno)
+static int
+get_pathinfo_host (char *pathinfo, char *hostname, size_t size)
{
- GF_ASSERT (local);
- if (op_errno == ENOENT)
- local->enoent_count++;
+ char *start = NULL;
+ char *end = NULL;
+ int ret = -1;
+ int i = 0;
- local->op_errno = afr_most_important_error(local->op_errno, op_errno,
- _gf_false);
+ if (!pathinfo)
+ goto out;
- if (local->op_errno == ESTALE) {
- local->op_ret = -1;
- }
+ start = strchr (pathinfo, ':');
+ if (!start)
+ goto out;
+ end = strrchr (pathinfo, ':');
+ if (start == end)
+ goto out;
+
+ memset (hostname, 0, size);
+ i = 0;
+ while (++start != end)
+ hostname[i++] = *start;
+ ret = 0;
+out:
+ return ret;
}
-static void
-afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this,
- inode_t *inode)
+int
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *local)
{
- afr_private_t *priv = NULL;
- GF_ASSERT (inode);
+ int ret = 0;
+ char pathinfohost[1024] = {0};
+ char localhost[1024] = {0};
+ xlator_t *this = THIS;
- if (!__is_root_gfid (inode->gfid))
- goto out;
- if (!afr_is_fresh_lookup (&local->loc, this))
+ *local = _gf_false;
+ ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s",
+ pathinfo);
goto out;
- priv = this->private;
- if ((priv->first_lookup)) {
- gf_log (this->name, GF_LOG_INFO, "added root inode");
- priv->root_inode = inode_ref (inode);
- priv->first_lookup = 0;
}
-out:
- return;
-}
-static void
-afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr,
- struct iatt *buf, struct iatt *postparent)
-{
- GF_ASSERT (child_index >= 0);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparents[child_index] = *postparent;
- local->cont.lookup.bufs[child_index] = *buf;
-}
+ ret = gethostname (localhost, sizeof (localhost));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, "
+ "reason: %s", strerror (errno));
+ goto out;
+ }
-static void
-afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this,
- inode_t *inode, struct iatt *buf)
-{
- local->cont.lookup.inode = inode_ref (inode);
- local->cont.lookup.buf = *buf;
- afr_set_root_inode_on_first_lookup (local, this, inode);
+ if (!strcmp (localhost, pathinfohost))
+ *local = _gf_true;
+out:
+ return ret;
}
static int32_t
-afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
+afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
int ret = 0;
char *pathinfo = NULL;
@@ -2159,6 +1377,9 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
+ priv = this->private;
+ child_index = (int32_t)(long)cookie;
+
ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
if (ret != 0) {
goto out;
@@ -2169,7 +1390,6 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- priv = this->private;
/*
* Note that one local subvolume will override another here. The only
* way to avoid that would be to retain extra information about whether
@@ -2177,13 +1397,11 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* the slowest local subvolume is far preferable to a remote one.
*/
if (is_local) {
- child_index = (int32_t)(long)cookie;
gf_log (this->name, GF_LOG_INFO,
"selecting local read_child %s",
priv->children[child_index]->name);
priv->read_child = child_index;
}
-
out:
STACK_DESTROY(frame->root);
return 0;
@@ -2202,220 +1420,357 @@ afr_attempt_local_discovery (xlator_t *this, int32_t child_index)
}
tmploc.gfid[sizeof(tmploc.gfid)-1] = 1;
- STACK_WIND_COOKIE (newframe, afr_discovery_cbk,
+ STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk,
(void *)(long)child_index,
priv->children[child_index],
priv->children[child_index]->fops->getxattr,
&tmploc, GF_XATTR_PATHINFO_KEY, NULL);
}
-static void
-afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+
+int
+afr_lookup_selfheal_wrap (void *opaque)
{
- afr_private_t *priv = this->private;
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE) {
- local->op_ret = op_ret;
- local->op_errno = 0;
- }
- afr_lookup_handle_first_success (local, this, inode, buf);
- }
- afr_lookup_update_lk_counts (local, this,
- child_index, xattr);
+ local = frame->local;
+ this = frame->this;
- afr_lookup_cache_args (local, child_index, xattr,
- buf, postparent);
+ afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name);
- if (local->do_discovery && (priv->read_child == (-1))) {
- afr_attempt_local_discovery(this,child_index);
- }
+ afr_replies_wipe (local, this->private);
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up);
+ if (inode)
+ inode_unref (inode);
+ afr_lookup_done (frame, this);
- local->cont.lookup.success_children[local->success_count] = child_index;
- local->success_count++;
+ return 0;
}
+
int
-afr_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t need_heal = _gf_false;
+ struct afr_reply *replies = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first == -1) {
+ first = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first].op_ret) {
+ need_heal = _gf_true;
+ break;
+ }
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first].poststat.ia_gfid)) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+
+ if (need_heal) {
+ heal = copy_frame (frame);
+ if (heal)
+ heal->root->pid = -1;
+ ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto lookup_done;
+ } else {
+ lookup_done:
+ afr_lookup_done (frame, this);
+ }
+
+ return ret;
+}
+
+
+int
+afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
{
afr_local_t * local = NULL;
int call_count = -1;
int child_index = -1;
- child_index = (long) cookie;
+ child_index = (long) cookie;
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- if (op_ret == -1) {
- afr_lookup_handle_error (local, op_ret, op_errno);
- goto unlock;
- }
- afr_lookup_handle_success (local, this, child_index, op_ret,
- op_errno, inode, buf, xattr,
- postparent);
+ local = frame->local;
- }
-unlock:
- UNLOCK (&frame->lock);
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ }
call_count = afr_frame_return (frame);
if (call_count == 0) {
- afr_lookup_done (frame, this);
+ afr_lookup_entry_heal (frame, this);
}
- return 0;
+ return 0;
}
-int
-afr_lookup_cont_init (afr_local_t *local, unsigned int child_count)
+
+
+static void
+afr_discover_done (call_frame_t *frame, xlator_t *this)
{
- int ret = -ENOMEM;
- struct iatt *iatts = NULL;
- int32_t *success_children = NULL;
- int32_t *sources = NULL;
- int32_t **pending_matrix = NULL;
-
- GF_ASSERT (local);
- local->cont.lookup.xattrs = GF_CALLOC (child_count,
- sizeof (*local->cont.lookup.xattr),
- gf_afr_mt_dict_t);
- if (NULL == local->cont.lookup.xattrs)
- goto out;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
- iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
- if (NULL == iatts)
- goto out;
- local->cont.lookup.postparents = iatts;
+ priv = this->private;
+ local = frame->local;
- iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
- if (NULL == iatts)
- goto out;
- local->cont.lookup.bufs = iatts;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == 0)
+ local->op_ret = 0;
+ }
- success_children = afr_children_create (child_count);
- if (NULL == success_children)
- goto out;
- local->cont.lookup.success_children = success_children;
+ op_errno = afr_final_errno (frame->local, this->private);
- local->fresh_children = afr_children_create (child_count);
- if (NULL == local->fresh_children)
- goto out;
+ if (local->op_ret < 0) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ goto unwind;
+ }
- sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t);
- if (NULL == sources)
- goto out;
- local->cont.lookup.sources = sources;
+ afr_replies_interpret (frame, this, local->inode);
- pending_matrix = afr_matrix_create (child_count, child_count);
- if (NULL == pending_matrix)
- goto out;
- local->cont.lookup.pending_matrix = pending_matrix;
+ read_subvol = afr_data_subvol_get (local->inode, this, 0, 0);
+ if (read_subvol == -1) {
+ gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s",
+ local->loc.path);
- ret = 0;
-out:
- return ret;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid ||
+ local->replies[i].op_ret == -1)
+ continue;
+ read_subvol = i;
+ break;
+ }
+ }
+
+unwind:
+ if (read_subvol == -1)
+ read_subvol = 0;
+
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
}
+
int
-afr_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- void *gfid_req = NULL;
- int ret = -1;
- int i = 0;
- int call_count = 0;
- uint64_t ctx = 0;
- int32_t op_errno = 0;
+ afr_local_t * local = NULL;
+ int call_count = -1;
+ int child_index = -1;
- priv = this->private;
+ child_index = (long) cookie;
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
+ local = frame->local;
- local->op_ret = -1;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ }
- frame->local = local;
- local->fop = GF_FOP_LOOKUP;
+ if (local->do_discovery && (op_ret == 0))
+ afr_attempt_local_discovery (this, child_index);
- loc_copy (&local->loc, loc);
- ret = loc_path (&local->loc, NULL);
- if (ret < 0) {
- op_errno = EINVAL;
- goto out;
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ afr_discover_done (frame, this);
}
- ret = inode_ctx_get (local->loc.inode, this, &ctx);
- if (ret == 0) {
- /* lookup is a revalidate */
+ return 0;
+}
- local->read_child_index = afr_inode_get_read_ctx (this,
- local->loc.inode,
- NULL);
- } else {
- LOCK (&priv->read_child_lock);
- {
- if (priv->hash_mode) {
- local->read_child_index = -1;
- }
- else {
- local->read_child_index =
- (++priv->read_child_rr) %
- (priv->child_count);
- }
- }
- UNLOCK (&priv->read_child_lock);
- local->cont.lookup.fresh_lookup = _gf_true;
- }
- local->child_up = memdup (priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
- if (NULL == local->child_up) {
- op_errno = ENOMEM;
+int
+afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err) {
+ local->op_errno = -err;
+ ret = -1;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT (local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ ret = -1;
goto out;
}
- ret = afr_lookup_cont_init (local, priv->child_count);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_discover_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
}
- local->call_count = afr_up_children_count (local->child_up,
- priv->child_count);
- call_count = local->call_count;
- if (local->call_count == 0) {
- ret = -1;
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
+
+
+int
+afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int event = 0;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
op_errno = ENOTCONN;
goto out;
}
- /* By default assume ENOTCONN. On success it will be set to 0. */
- local->op_errno = ENOTCONN;
+ if (__is_root_gfid (loc->inode->gfid)) {
+ if (!this->itable)
+ this->itable = loc->inode->table;
+ if (!priv->root_inode)
+ priv->root_inode = inode_ref (loc->inode);
+
+ if (priv->choose_local && !priv->did_discovery) {
+ /* Logic to detect which subvolumes of AFR are
+ local, in order to prefer them for reads
+ */
+ local->do_discovery = _gf_true;
+ priv->did_discovery = _gf_true;
+ }
+ }
+
+ local->op = GF_FOP_LOOKUP;
+
+ loc_copy (&local->loc, loc);
- ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc,
- &gfid_req);
+ local->inode = inode_ref (loc->inode);
+
+ if (xattr_req)
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_ref (xattr_req);
+
+ if (uuid_is_null (loc->inode->gfid)) {
+ afr_discover_do (frame, this, 0);
+ return 0;
+ }
+
+ afr_read_subvol_get (loc->inode, this, NULL, &event,
+ AFR_DATA_TRANSACTION);
+
+ if (event != local->event_generation)
+ afr_inode_refresh (frame, this, loc->inode, afr_discover_do);
+ else
+ afr_discover_do (frame, this, 0);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int
+afr_lookup_do (call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err < 0) {
+ local->op_errno = -err;
+ ret = -1;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT (local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
+ &local->loc);
if (ret) {
local->op_errno = -ret;
+ ret = -1;
goto out;
}
- afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req,
- &local->loc);
- local->fop = GF_FOP_LOOKUP;
- if (priv->choose_local && !priv->did_discovery) {
- if (gfid_req && __is_root_gfid(gfid_req)) {
- local->do_discovery = _gf_true;
- priv->did_discovery = _gf_true;
- }
- }
+
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_lookup_cbk,
@@ -2427,12 +1782,98 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
break;
}
}
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
- ret = 0;
+/*
+ * afr_lookup()
+ *
+ * The goal here is to figure out what the element getting looked up is.
+ * i.e. what the GFID, the inode type and a conservative estimate of the
+ * inode attributes are.
+ *
+ * As we lookup, operations may be underway on the entry name and the
+ * inode. In lookup() we are primarily concerned only with the entry
+ * operations. If the entry is getting unlinked or renamed, we detect
+ * what operation is underway by querying for on-going transactions and
+ * pending self-healing on the entry through xdata.
+ *
+ * If the entry is a file/dir, it may need self-heal and/or be in a
+ * split-brain condition. Lookup is not the place to worry about these
+ * conditions. Outcast marking will naturally handle them in the read
+ * paths.
+ *
+ * Here is a brief goal of what we are trying to achieve:
+ *
+ * - LOOKUP on all subvolumes concurrently, querying on-going transaction
+ * and pending self-heal info from the servers.
+ *
+ * - If all servers reply with the same inode type and GFID, the overall call
+ * MUST be a success.
+ *
+ * - If inode types or GFIDs mismatch, and there IS either an on-going
+ * transaction or pending self-heal, inspect what the nature of the
+ * transaction or pending heal is, and select the appropriate subvolume's
+ * reply as the winner.
+ *
+ * - If inode types or GFIDs mismatch, and there are no on-going transactions
+ * or pending self-heal on the entry name on any of the servers, fail the
+ * lookup with EIO. Something has gone wrong beyond reasonable action.
+ */
+
+int
+afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int event = 0;
+
+ if (!loc->parent) {
+ afr_discover (frame, this, loc, xattr_req);
+ return 0;
+ }
+
+ if (__is_root_gfid (loc->parent->gfid)) {
+ if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) {
+ op_errno = EPERM;
+ goto out;
+ }
+ }
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->op = GF_FOP_LOOKUP;
+
+ loc_copy (&local->loc, loc);
+
+ local->inode = inode_ref (loc->inode);
+
+ if (xattr_req)
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_ref (xattr_req);
+
+ afr_read_subvol_get (loc->parent, this, NULL, &event,
+ AFR_DATA_TRANSACTION);
+
+ if (event != local->event_generation)
+ afr_inode_refresh (frame, this, loc->parent, afr_lookup_do);
+ else
+ afr_lookup_do (frame, this, 0);
+
+ return 0;
out:
- if (ret)
- AFR_STACK_UNWIND (lookup, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
+ AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
return 0;
}
@@ -2440,6 +1881,46 @@ out:
/* {{{ open */
+afr_fd_ctx_t *
+__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ int ret = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ ret = __fd_ctx_get (fd, this, &ctx);
+
+ if (ret < 0) {
+ ret = __afr_fd_ctx_set (this, fd);
+ if (ret < 0)
+ goto out;
+
+ ret = __fd_ctx_get (fd, this, &ctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+out:
+ return fd_ctx;
+}
+
+
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get (fd, this);
+ }
+ UNLOCK(&fd->lock);
+
+ return fd_ctx;
+}
+
+
int
__afr_fd_ctx_set (xlator_t *this, fd_t *fd)
{
@@ -2447,6 +1928,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
int ret = -1;
uint64_t ctx = 0;
afr_fd_ctx_t * fd_ctx = NULL;
+ int i = 0;
VALIDATE_OR_GOTO (this->private, out);
VALIDATE_OR_GOTO (fd, out);
@@ -2465,21 +1947,15 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
- fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_done) {
- ret = -ENOMEM;
- goto out;
- }
-
- fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_piggyback) {
- ret = -ENOMEM;
- goto out;
- }
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!fd_ctx->pre_op_done[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
priv->child_count,
@@ -2489,6 +1965,13 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_is_anonymous (fd))
+ fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ else
+ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
+ }
+
fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
priv->child_count,
gf_afr_mt_char);
@@ -2505,20 +1988,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
- fd_ctx->up_count = priv->up_count;
- fd_ctx->down_count = priv->down_count;
-
- fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->locked_on) {
- ret = -ENOMEM;
- goto out;
- }
-
pthread_mutex_init (&fd_ctx->delay_lock, NULL);
- INIT_LIST_HEAD (&fd_ctx->entries);
- fd_ctx->call_child = -1;
INIT_LIST_HEAD (&fd_ctx->eager_locked);
@@ -2548,83 +2018,90 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)
/* {{{ flush */
int
-afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
+ afr_local_t *local = NULL;
+ int call_count = -1;
local = frame->local;
LOCK (&frame->lock);
{
if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
- }
-
- local->op_errno = op_errno;
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
}
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0)
- AFR_STACK_UNWIND(flush, frame, local->op_ret,
- local->op_errno, NULL);
+ AFR_STACK_UNWIND (flush, frame, local->op_ret,
+ local->op_errno, local->xdata_rsp);
return 0;
}
-int
-afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+static int
+afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
+ int i = 0;
afr_local_t *local = NULL;
- int ret = -1;
- int op_errno = 0;
- int call_count = -1;
- int i = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ int call_count = -1;
priv = this->private;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init(local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fd = fd_ref(fd);
+ local = frame->local;
call_count = local->call_count;
- afr_delayed_changelog_wake_up (this, fd);
-
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_flush_cbk,
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->flush,
- local->fd, NULL);
-
+ local->fd, xdata);
if (!--call_count)
break;
+
}
}
- ret = 0;
+ return 0;
+}
-out:
- if (ret < 0)
- AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+int
+afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->fd = fd_ref(fd);
+
+ stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata);
+ if (!stub)
+ goto out;
+
+ afr_delayed_changelog_wake_resume (this, fd, stub);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
return 0;
}
@@ -2637,6 +2114,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
uint64_t ctx = 0;
afr_fd_ctx_t *fd_ctx = NULL;
int ret = 0;
+ int i = 0;
ret = fd_ctx_get (fd, this, &ctx);
if (ret < 0)
@@ -2645,13 +2123,17 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
if (fd_ctx) {
- GF_FREE (fd_ctx->pre_op_done);
+ //no need to take any locks
+ if (!list_empty (&fd_ctx->eager_locked))
+ gf_log (this->name, GF_LOG_WARNING, "%s: Stale "
+ "Eager-lock stubs found",
+ uuid_utoa (fd->inode->gfid));
- GF_FREE (fd_ctx->opened_on);
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
+ GF_FREE (fd_ctx->pre_op_done[i]);
- GF_FREE (fd_ctx->locked_on);
+ GF_FREE (fd_ctx->opened_on);
- GF_FREE (fd_ctx->pre_op_piggyback);
GF_FREE (fd_ctx->lock_piggyback);
GF_FREE (fd_ctx->lock_acquired);
@@ -2669,24 +2151,8 @@ out:
int
afr_release (xlator_t *this, fd_t *fd)
{
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
afr_cleanup_fd_ctx (this, fd);
- list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds,
- list) {
-
- if (locked_fd->fd == fd) {
- list_del_init (&locked_fd->list);
- GF_FREE (locked_fd);
- }
-
- }
-
return 0;
}
@@ -2694,6 +2160,16 @@ afr_release (xlator_t *this, fd_t *fd)
/* {{{ fsync */
int
+afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
@@ -2701,36 +2177,38 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_local_t *local = NULL;
int call_count = -1;
int child_index = (long) cookie;
- int read_child = 0;
+ int read_subvol = 0;
call_stub_t *stub = NULL;
local = frame->local;
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+ read_subvol = afr_data_subvol_get (local->inode, this, 0, 0);
LOCK (&frame->lock);
{
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
if (op_ret == 0) {
- local->op_ret = 0;
+ if (local->op_ret == -1) {
+ local->op_ret = 0;
- if (local->success_count == 0) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
- }
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
- if (child_index == read_child) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
+ if (xdata)
+ local->xdata_rsp = dict_ref (xdata);
}
- local->success_count++;
- }
-
- local->op_errno = op_errno;
+ if (child_index == read_subvol) {
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ } else {
+ local->op_errno = op_errno;
+ }
}
UNLOCK (&frame->lock);
@@ -2746,10 +2224,11 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
post-op. This guarantee is expected by FUSE graph switching
for example.
*/
- stub = fop_fsync_cbk_stub (frame, default_fsync_cbk,
+ stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk,
local->op_ret, local->op_errno,
- &local->cont.fsync.prebuf,
- &local->cont.fsync.postbuf, xdata);
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
if (!stub) {
AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
return 0;
@@ -2769,37 +2248,35 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync, dict_t *xdata)
+afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
+ int32_t op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ priv = this->private;
- priv = this->private;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- local->fd = fd_ref (fd);
+ local->fd = fd_ref (fd);
if (afr_fd_has_witnessed_unstable_write (this, fd)) {
/* don't care. we only wanted to CLEAR the bit */
}
+ local->inode = inode_ref (fd->inode);
+
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_fsync_cbk,
@@ -2812,10 +2289,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
return 0;
}
@@ -2823,10 +2300,9 @@ out:
/* {{{ fsync */
-int32_t
-afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
+int
+afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
int call_count = -1;
@@ -2835,10 +2311,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
LOCK (&frame->lock);
{
- if (op_ret == 0)
+ if (op_ret == 0) {
local->op_ret = 0;
-
- local->op_errno = op_errno;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
}
UNLOCK (&frame->lock);
@@ -2846,37 +2325,33 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
if (call_count == 0)
AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret,
- local->op_errno, xdata);
+ local->op_errno, local->xdata_rsp);
return 0;
}
-int32_t
-afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync, dict_t *xdata)
+int
+afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
+ int32_t op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ priv = this->private;
- priv = this->private;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -2889,10 +2364,10 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL);
+
return 0;
}
@@ -2915,6 +2390,10 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie,
if (op_ret == 0) {
if (!local->cont.xattrop.xattr)
local->cont.xattrop.xattr = dict_ref (xattr);
+
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+
local->op_ret = 0;
}
@@ -2926,7 +2405,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie,
if (call_count == 0)
AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno,
- local->cont.xattrop.xattr, xdata);
+ local->cont.xattrop.xattr, local->xdata_rsp);
return 0;
}
@@ -2938,25 +2417,21 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -2969,10 +2444,10 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -2997,6 +2472,8 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
if (!local->cont.fxattrop.xattr)
local->cont.fxattrop.xattr = dict_ref (xattr);
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
local->op_ret = 0;
}
@@ -3008,7 +2485,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
if (call_count == 0)
AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno,
- local->cont.fxattrop.xattr, xdata);
+ local->cont.fxattrop.xattr, local->xdata_rsp);
return 0;
}
@@ -3020,25 +2497,21 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
int32_t op_errno = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3051,10 +2524,10 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -3062,8 +2535,8 @@ out:
int32_t
-afr_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
@@ -3097,25 +2570,21 @@ afr_inodelk (call_frame_t *frame, xlator_t *this,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOMEM;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3129,18 +2598,17 @@ afr_inodelk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL);
+
return 0;
}
int32_t
-afr_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
+afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
@@ -3168,31 +2636,26 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie,
int32_t
-afr_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock,
- dict_t *xdata)
+afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3206,10 +2669,10 @@ afr_finodelk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
+
return 0;
}
@@ -3242,33 +2705,28 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
-int32_t
-afr_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc,
- const char *basename, entrylk_cmd cmd, entrylk_type type,
- dict_t *xdata)
+int
+afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
int32_t op_errno = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3282,18 +2740,18 @@ afr_entrylk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL);
+
return 0;
}
-int32_t
-afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+int
+afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
@@ -3320,33 +2778,28 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
}
-int32_t
-afr_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd,
- const char *basename, entrylk_cmd cmd,
- entrylk_type type, dict_t *xdata)
+int
+afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3360,82 +2813,85 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
+
return 0;
}
-int32_t
-afr_statfs_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct statvfs *statvfs, dict_t *xdata)
+
+int
+afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
afr_local_t *local = NULL;
int call_count = 0;
+ struct statvfs *buf = NULL;
LOCK (&frame->lock);
{
local = frame->local;
- if (op_ret == 0) {
- local->op_ret = op_ret;
-
- if (local->cont.statfs.buf_set) {
- if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail)
- local->cont.statfs.buf = *statvfs;
- } else {
- local->cont.statfs.buf = *statvfs;
- local->cont.statfs.buf_set = 1;
- }
- }
-
- if (op_ret == -1)
+ if (op_ret != 0) {
local->op_errno = op_errno;
+ goto unlock;
+ }
+ local->op_ret = op_ret;
+
+ buf = &local->cont.statfs.buf;
+ if (local->cont.statfs.buf_set) {
+ if (statvfs->f_bavail < buf->f_bavail) {
+ *buf = *statvfs;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ } else {
+ *buf = *statvfs;
+ local->cont.statfs.buf_set = 1;
+ if (xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
}
+unlock:
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0)
AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->cont.statfs.buf, xdata);
+ &local->cont.statfs.buf, local->xdata_rsp);
return 0;
}
-int32_t
-afr_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xdata)
+int
+afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- int child_count = 0;
afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
int i = 0;
- int ret = -1;
int call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ int32_t op_errno = ENOMEM;
- priv = this->private;
- child_count = priv->child_count;
+ priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- for (i = 0; i < child_count; i++) {
+ for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND (frame, afr_statfs_cbk,
priv->children[i],
@@ -3446,10 +2902,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -3558,21 +3014,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN,
&local->cont.lk.ret_flock, NULL);
} else {
- /* locking has succeeded on all nodes that are up */
-
- /* temporarily
- ret = afr_mark_locked_nodes (this, local->fd,
- local->cont.lk.locked_nodes);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked nodes info in fdctx");
-
- ret = afr_save_locked_fd (this, local->fd);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked fd");
-
- */
AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
&local->cont.lk.ret_flock, NULL);
}
@@ -3588,20 +3029,12 @@ afr_lk (call_frame_t *frame, xlator_t *this,
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
int i = 0;
- int32_t op_errno = 0;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
goto out;
local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,
@@ -3623,28 +3056,16 @@ afr_lk (call_frame_t *frame, xlator_t *this,
priv->children[i]->fops->lk,
fd, cmd, flock, xdata);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
int
afr_forget (xlator_t *this, inode_t *inode)
{
- uint64_t ctx_addr = 0;
- afr_inode_ctx_t *ctx = NULL;
-
- inode_ctx_get (inode, this, &ctx_addr);
-
- if (!ctx_addr)
- goto out;
-
- ctx = (afr_inode_ctx_t *)(long)ctx_addr;
- GF_FREE (ctx->fresh_children);
- GF_FREE (ctx);
-out:
return 0;
}
@@ -3664,7 +3085,6 @@ afr_priv_dump (xlator_t *this)
snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
gf_proc_dump_add_section(key_prefix);
gf_proc_dump_write("child_count", "%u", priv->child_count);
- gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr);
for (i = 0; i < priv->child_count; i++) {
sprintf (key, "child_up[%d]", i);
gf_proc_dump_write(key, "%d", priv->child_up[i]);
@@ -3721,7 +3141,7 @@ afr_notify (xlator_t *this, int32_t event,
int idx = -1;
int ret = -1;
int call_psh = 0;
- int up_child = AFR_ALL_CHILDREN;
+ int up_child = -1;
dict_t *input = NULL;
dict_t *output = NULL;
@@ -3773,6 +3193,7 @@ afr_notify (xlator_t *this, int32_t event,
*/
if (priv->child_up[idx] != 1) {
priv->up_count++;
+ priv->event_generation++;
}
priv->child_up[idx] = 1;
@@ -3812,6 +3233,7 @@ afr_notify (xlator_t *this, int32_t event,
*/
if (priv->child_up[idx] == 1) {
priv->down_count++;
+ priv->event_generation++;
}
priv->child_up[idx] = 0;
@@ -3844,6 +3266,10 @@ afr_notify (xlator_t *this, int32_t event,
case GF_EVENT_TRANSLATOR_OP:
input = data;
output = data2;
+ if (!had_heard_from_all) {
+ ret = -1;
+ goto out;
+ }
ret = afr_xl_op (this, input, output);
goto out;
break;
@@ -3874,8 +3300,7 @@ afr_notify (xlator_t *this, int32_t event,
LOCK (&priv->lock);
{
- up_children = afr_up_children_count (priv->child_up,
- priv->child_count);
+ up_children = AFR_COUNT (priv->child_up, priv->child_count);
for (i = 0; i < priv->child_count; i++) {
if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
event = GF_EVENT_CHILD_UP;
@@ -3895,39 +3320,23 @@ afr_notify (xlator_t *this, int32_t event,
ret = 0;
if (propagate)
ret = default_notify (this, event, data);
- if (call_psh && priv->shd.iamshd)
- afr_proactive_self_heal ((void*) (long) up_child);
+ if (call_psh && priv->shd.iamshd) {
+ afr_selfheal_childup (this, up_child);
+ }
out:
return ret;
}
-int
-afr_first_up_child (unsigned char *child_up, size_t child_count)
-{
- int ret = -1;
- int i = 0;
-
- GF_ASSERT (child_up);
-
- for (i = 0; i < child_count; i++) {
- if (child_up[i]) {
- ret = i;
- break;
- }
- }
-
- return ret;
-}
int
afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
{
- int ret = -1;
-
local->op_ret = -1;
local->op_errno = EUCLEAN;
+ syncbarrier_init (&local->barrier);
+
local->child_up = GF_CALLOC (priv->child_count,
sizeof (*local->child_up),
gf_afr_mt_char);
@@ -3939,36 +3348,42 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
memcpy (local->child_up, priv->child_up,
sizeof (*local->child_up) * priv->child_count);
- local->call_count = afr_up_children_count (local->child_up,
- priv->child_count);
+ local->call_count = AFR_COUNT (local->child_up, priv->child_count);
if (local->call_count == 0) {
gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up");
if (op_errno)
*op_errno = ENOTCONN;
goto out;
}
+ local->event_generation = priv->event_generation;
- local->child_errno = GF_CALLOC (priv->child_count,
- sizeof (*local->child_errno),
- gf_afr_mt_int32_t);
- if (!local->child_errno) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
- }
+ local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char),
+ gf_afr_mt_char);
+ if (!local->read_attempted) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
- local->transaction.postop_piggybacked = GF_CALLOC (priv->child_count,
- sizeof (int),
- gf_afr_mt_int32_t);
- if (!local->transaction.postop_piggybacked) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
- }
+ local->readable = GF_CALLOC (priv->child_count, sizeof (char),
+ gf_afr_mt_char);
+ if (!local->readable) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
- ret = 0;
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ return 0;
out:
- return ret;
+ return -1;
}
int
@@ -3977,11 +3392,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
{
int ret = -ENOMEM;
- lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->inode_locked_nodes)
- goto out;
-
lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
child_count, gf_afr_mt_char);
if (NULL == lk->locked_nodes)
@@ -4040,6 +3450,21 @@ out:
}
int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
+{
+ int ret = -ENOMEM;
+
+ lk->domain = dom;
+ lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->locked_nodes)
+ goto out;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
afr_transaction_local_init (afr_local_t *local, xlator_t *this)
{
int child_up_count = 0;
@@ -4052,14 +3477,20 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (ret < 0)
goto out;
+ if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
+ (local->transaction.type == AFR_METADATA_TRANSACTION)) {
+ ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
+ this->name, priv->child_count);
+ if (ret < 0)
+ goto out;
+ }
+
ret = -ENOMEM;
- child_up_count = afr_up_children_count (local->child_up,
- priv->child_count);
+ child_up_count = AFR_COUNT (local->child_up, priv->child_count);
if (priv->optimistic_change_log && child_up_count == priv->child_count)
local->optimistic_change_log = 1;
- local->first_up_child = afr_first_up_child (local->child_up,
- priv->child_count);
+ local->pre_op_compat = priv->pre_op_compat;
local->transaction.eager_lock =
GF_CALLOC (sizeof (*local->transaction.eager_lock),
@@ -4069,26 +3500,29 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (!local->transaction.eager_lock)
goto out;
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children)
- goto out;
-
local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
priv->child_count,
gf_afr_mt_char);
if (!local->transaction.pre_op)
goto out;
+ local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.fop_subvols)
+ goto out;
+
+ local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.failed_subvols)
+ goto out;
+
local->pending = afr_matrix_create (priv->child_count,
AFR_NUM_CHANGE_LOGS);
if (!local->pending)
goto out;
- local->transaction.txn_changelog = afr_matrix_create (priv->child_count,
- AFR_NUM_CHANGE_LOGS);
- if (!local->transaction.txn_changelog)
- goto out;
-
INIT_LIST_HEAD (&local->transaction.eager_locked);
ret = 0;
@@ -4096,86 +3530,6 @@ out:
return ret;
}
-void
-afr_reset_children (int32_t *fresh_children, int32_t child_count)
-{
- unsigned int i = 0;
- for (i = 0; i < child_count; i++)
- fresh_children[i] = -1;
-}
-
-int32_t*
-afr_children_create (int32_t child_count)
-{
- int32_t *children = NULL;
- int i = 0;
-
- GF_ASSERT (child_count > 0);
-
- children = GF_CALLOC (child_count, sizeof (*children),
- gf_afr_mt_int32_t);
- if (NULL == children)
- goto out;
- for (i = 0; i < child_count; i++)
- children[i] = -1;
-out:
- return children;
-}
-
-void
-afr_children_add_child (int32_t *children, int32_t child,
- int32_t child_count)
-{
- gf_boolean_t child_found = _gf_false;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (children[i] == -1)
- break;
- if (children[i] == child) {
- child_found = _gf_true;
- break;
- }
- }
-
- if (!child_found) {
- GF_ASSERT (i < child_count);
- children[i] = child;
- }
-}
-
-void
-afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count)
-{
- int i = 0;
-
- GF_ASSERT ((child >= 0) && (child < child_count));
- for (i = 0; i < child_count; i++) {
- if (children[i] == -1)
- break;
- if (children[i] == child) {
- if (i != (child_count - 1))
- memmove (children + i, children + i + 1,
- sizeof (*children)*(child_count - i - 1));
- children[child_count - 1] = -1;
- break;
- }
- }
-}
-
-int
-afr_get_children_count (int32_t *children, unsigned int child_count)
-{
- int count = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (children[i] == -1)
- break;
- count++;
- }
- return count;
-}
void
afr_set_low_priority (call_frame_t *frame)
@@ -4183,38 +3537,6 @@ afr_set_low_priority (call_frame_t *frame)
frame->root->pid = LOW_PRIO_PROC_PID;
}
-int
-afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
- int flags)
-{
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- GF_ASSERT (fd && fd->inode);
- ret = afr_fd_ctx_set (this, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set fd ctx for fd=%p", fd);
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- fd_ctx->opened_on[child] = AFR_FD_OPENED;
- if (!IA_ISDIR (fd->inode->ia_type)) {
- fd_ctx->flags = flags;
- }
- ret = 0;
-out:
- return ret;
-}
gf_boolean_t
afr_have_quorum (char *logname, afr_private_t *priv)
@@ -4261,23 +3583,6 @@ afr_priv_destroy (afr_private_t *priv)
if (!priv)
goto out;
inode_unref (priv->root_inode);
- GF_FREE (priv->shd.pos);
- GF_FREE (priv->shd.pending);
- GF_FREE (priv->shd.inprogress);
-// for (i = 0; i < priv->child_count; i++)
-// if (priv->shd.timer && priv->shd.timer[i])
-// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]);
- GF_FREE (priv->shd.timer);
-
- if (priv->shd.healed)
- eh_destroy (priv->shd.healed);
-
- if (priv->shd.heal_failed)
- eh_destroy (priv->shd.heal_failed);
-
- if (priv->shd.split_brain)
- eh_destroy (priv->shd.split_brain);
-
GF_FREE (priv->last_event);
if (priv->pending_key) {
for (i = 0; i < priv->child_count; i++)
@@ -4287,8 +3592,7 @@ afr_priv_destroy (afr_private_t *priv)
GF_FREE (priv->children);
GF_FREE (priv->child_up);
LOCK_DESTROY (&priv->lock);
- LOCK_DESTROY (&priv->read_child_lock);
- pthread_mutex_destroy (&priv->mutex);
+
GF_FREE (priv);
out:
return;
@@ -4305,54 +3609,21 @@ xlator_subvolume_count (xlator_t *this)
return i;
}
-inline gf_boolean_t
-afr_is_errno_set (int *child_errno, int child)
-{
- return child_errno[child];
-}
-
-inline gf_boolean_t
-afr_is_errno_unset (int *child_errno, int child)
-{
- return !afr_is_errno_set (child_errno, child);
-}
void
-afr_prepare_new_entry_pending_matrix (int32_t **pending,
- gf_boolean_t (*is_pending) (int *, int),
- int *ctx, struct iatt *buf,
- unsigned int child_count)
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
{
- int midx = 0;
- int idx = 0;
- int i = 0;
-
- midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
- if (IA_ISDIR (buf->ia_type))
- idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
- else if (IA_ISREG (buf->ia_type))
- idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- else
- idx = -1;
- for (i = 0; i < child_count; i++) {
- if (is_pending (ctx, i)) {
- pending[i][midx] = hton32 (1);
- if (idx == -1)
- continue;
- pending[i][idx] = hton32 (1);
- }
- }
-}
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
-gf_boolean_t
-afr_is_fd_fixable (fd_t *fd)
-{
- if (!fd || !fd->inode)
- return _gf_false;
- else if (fd_is_anonymous (fd))
- return _gf_false;
- else if (uuid_is_null (fd->inode->gfid))
- return _gf_false;
-
- return _gf_true;
+ local = frame->local;
+
+ if (!local->fd)
+ return;
+
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+ if (!fd_ctx)
+ return;
+
+ fd_ctx->open_fd_count = local->open_fd_count;
}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 0e1718814..fa1da3958 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -37,177 +37,7 @@
#include "checksum.h"
#include "afr.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-int
-afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret,
- int32_t op_errno, int32_t sh_failed)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- afr_set_opendir_done (this, local->fd->inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd, NULL);
-
- return 0;
-}
-
-
-gf_boolean_t
-__checksums_differ (uint32_t *checksum, int child_count,
- unsigned char *child_up)
-{
- int ret = _gf_false;
- int i = 0;
- uint32_t cksum = 0;
- gf_boolean_t activate_check = _gf_false;
-
- for (i = 0; i < child_count; i++) {
- if (!child_up[i])
- continue;
- if (_gf_false == activate_check) {
- cksum = checksum[i];
- activate_check = _gf_true;
- continue;
- }
-
- if (cksum != checksum[i]) {
- ret = _gf_true;
- break;
- }
-
- cksum = checksum[i];
- }
-
- return ret;
-}
-
-
-int32_t
-afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
- char *reason = NULL;
- int child_index = 0;
- uint32_t entry_cksum = 0;
- int call_count = 0;
- off_t last_offset = 0;
- inode_t *inode = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- inode = local->fd->inode;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to do opendir on %s",
- local->loc.path, priv->children[child_index]->name);
- local->op_ret = -1;
- local->op_ret = op_errno;
- goto out;
- }
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: no entries found in %s",
- local->loc.path, priv->children[child_index]->name);
- goto out;
- }
-
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name,
- strlen (entry->d_name));
- local->cont.opendir.checksum[child_index] ^= entry_cksum;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- }
-
- /* read more entries */
-
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readdir,
- local->fd, 131072, last_offset, NULL);
-
- return 0;
-
-out:
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (__checksums_differ (local->cont.opendir.checksum,
- priv->child_count,
- local->child_up)) {
-
- sh->do_entry_self_heal = _gf_true;
- sh->forced_merge = _gf_true;
-
- reason = "checksums of directory differ";
- afr_launch_self_heal (frame, this, inode, _gf_false,
- inode->ia_type, reason, NULL,
- afr_examine_dir_sh_unwind);
- } else {
- afr_set_opendir_done (this, inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd, NULL);
- }
- }
-
- return 0;
-}
-
-
-int
-afr_examine_dir (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
-
- local->cont.opendir.checksum = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.opendir.checksum),
- gf_afr_mt_int32_t);
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->readdir,
- local->fd, 131072, 0, NULL);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
+#include "afr-transaction.h"
int32_t
@@ -215,112 +45,66 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int32_t up_children_count = 0;
- int ret = -1;
int call_count = -1;
int32_t child_index = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- priv = this->private;
local = frame->local;
+ fd_ctx = local->fd_ctx;
child_index = (long) cookie;
- up_children_count = afr_up_children_count (local->child_up,
- priv->child_count);
-
LOCK (&frame->lock);
{
- if (op_ret >= 0) {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
local->op_ret = op_ret;
- ret = afr_child_fd_ctx_set (this, fd, child_index, 0);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
}
-
- local->op_errno = op_errno;
}
-unlock:
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (local->op_ret != 0)
- goto out;
-
- if (!afr_is_opendir_done (this, local->fd->inode) &&
- up_children_count > 1) {
-
- /*
- * This is the first opendir on this inode. We need
- * to check if the directory's entries are the same
- * on all subvolumes. This is needed in addition
- * to regular entry self-heal because the readdir
- * call is sent only to the first subvolume, and
- * thus files that exist only there will never be healed
- * otherwise (assuming changelog shows no anomalies).
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "reading contents of directory %s looking for mismatch",
- local->loc.path);
-
- afr_examine_dir (frame, this);
-
- } else {
- /* do the unwind */
- goto out;
- }
- }
-
- return 0;
-
-out:
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd, NULL);
-
+ if (call_count == 0)
+ AFR_STACK_UNWIND (opendir, frame, local->op_ret,
+ local->op_errno, local->fd, NULL);
return 0;
}
-int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd)
+int
+afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
{
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
- int child_count = 0;
int i = 0;
- int ret = -1;
int call_count = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
+ afr_fd_ctx_t *fd_ctx = NULL;
priv = this->private;
- child_count = priv->child_count;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ goto out;
loc_copy (&local->loc, loc);
local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
call_count = local->call_count;
- for (i = 0; i < child_count; i++) {
+ for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_opendir_cbk,
(void*) (long) i,
@@ -333,152 +117,280 @@ afr_opendir (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL);
-
+ AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL);
return 0;
}
-/**
- * Common algorithm for directory read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: readdir
- */
+#define BACKEND_D_OFF_BITS 63
+#define PRESENT_D_OFF_BITS 63
+#define ONE 1ULL
+#define MASK (~0ULL)
+#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
+#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
-struct entry_name {
- char *name;
- struct list_head list;
-};
+#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
+#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
-static void
-afr_forget_entries (fd_t *fd)
+static uint64_t
+afr_bits_for (uint64_t num)
{
- struct entry_name *entry = NULL;
- struct entry_name *tmp = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return;
+ uint64_t bits = 0, ctrl = 1;
+
+ while (ctrl < num) {
+ ctrl *= 2;
+ bits ++;
+ }
+
+ return bits;
+}
+
+int
+afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p)
+{
+ afr_private_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t y = 0;
+ uint64_t hi_mask = 0;
+ uint64_t off_mask = 0;
+ int max_bits = 0;
+
+ if (x == ((uint64_t) -1)) {
+ y = (uint64_t) -1;
+ goto out;
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ max = conf->child_count;
+ cnt = subvol;
+
+ if (max == 1) {
+ y = x;
+ goto out;
+ }
+
+ max_bits = afr_bits_for (max);
+
+ hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
- list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) {
- GF_FREE (entry->name);
- list_del (&entry->list);
- GF_FREE (entry);
+ if (x & hi_mask) {
+ /* HUGE d_off */
+ off_mask = MASK << max_bits;
+ y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
+ } else {
+ /* small d_off */
+ y = ((x * max) + cnt);
}
+
+out:
+ if (y_p)
+ *y_p = y;
+
+ return 0;
}
-int32_t
-afr_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
+int
+afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p,
+ uint64_t *x_p)
{
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL);
+ afr_private_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t x = 0;
+ int subvol = 0;
+ int max_bits = 0;
+ uint64_t off_mask = 0;
+ uint64_t host_mask = 0;
+
+ if (!this->private)
+ return -1;
+
+ conf = this->private;
+ max = conf->child_count;
+
+ if (max == 1) {
+ x = y;
+ cnt = 0;
+ goto out;
+ }
+
+ if (y & TOP_BIT) {
+ /* HUGE d_off */
+ max_bits = afr_bits_for (max);
+ off_mask = (MASK << max_bits);
+ host_mask = ~(off_mask);
+
+ x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
+
+ cnt = y & host_mask;
+ } else {
+ /* small d_off */
+ cnt = y % max;
+ x = y / max;
+ }
+
+out:
+ subvol = cnt;
+
+ if (subvol_p)
+ *subvol_p = subvol;
+
+ if (x_p)
+ *x_p = x;
return 0;
}
-int32_t
-afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
- dict_t *xdata)
+static void
+afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
+ gf_dirent_t *entries, fd_t *fd)
{
- AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL);
+ afr_private_t *priv = NULL;
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int gen = 0;
+
+ priv = THIS->private;
+
+ data_readable = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+
+ list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) {
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ continue;
+ }
- return 0;
+ list_del_init (&entry->list);
+ afr_itransform (THIS, subvol, entry->d_off, &entry->d_off);
+ list_add_tail (&entry->list, &entries->list);
+
+ if (entry->inode) {
+ gen = 0;
+ afr_inode_read_subvol_get (entry->inode, THIS,
+ data_readable,
+ metadata_readable, &gen);
+
+ if (gen != priv->event_generation ||
+ !data_readable[subvol] ||
+ !metadata_readable[subvol]) {
+
+ inode_unref (entry->inode);
+ entry->inode = NULL;
+ }
+ }
+ }
}
+
int32_t
-afr_do_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict)
+afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = -1;
- int32_t op_errno = 0;
- uint64_t read_child = 0;
+ afr_local_t *local = NULL;
+ gf_dirent_t entries;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ INIT_LIST_HEAD (&entries.list);
- priv = this->private;
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
local = frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ if (op_ret < 0 && !local->cont.readdir.offset) {
+ /* failover only if this was first readdir, detected
+ by offset == 0 */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- read_child = afr_inode_get_read_ctx (this, fd->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.readdir.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ if (op_ret >= 0)
+ afr_readdir_transform_entries (subvol_entries, (long) cookie,
+ &entries, local->fd);
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx) {
- op_errno = EBADF;
- goto out;
- }
+ AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata);
- if ((offset == 0) || (fd_ctx->call_child == -1)) {
- fd_ctx->call_child = call_child;
- } else if ((priv->readdir_failover == _gf_false) &&
- (call_child != fd_ctx->call_child)) {
- op_errno = EBADF;
- goto out;
- }
+ return 0;
+}
+
+
+int
+afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
- local->fd = fd_ref (fd);
- local->cont.readdir.size = size;
- local->cont.readdir.dict = (dict)? dict_ref (dict) : NULL;
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readdir, frame, local->op_ret,
+ local->op_errno, 0, 0);
+ return 0;
+ }
- if (whichop == GF_FOP_READDIR)
+ if (local->op == GF_FOP_READDIR)
STACK_WIND_COOKIE (frame, afr_readdir_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdir, fd,
- size, offset, dict);
+ (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdir,
+ local->fd, local->cont.readdir.size,
+ local->cont.readdir.offset,
+ local->xdata_req);
else
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdirp, fd,
- size, offset, dict);
+ STACK_WIND_COOKIE (frame, afr_readdir_cbk,
+ (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdirp,
+ local->fd, local->cont.readdir.size,
+ local->cont.readdir.offset,
+ local->xdata_req);
+ return 0;
+}
+
+
+int
+afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, int whichop, dict_t *dict)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int subvol = -1;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = whichop;
+ local->fd = fd_ref (fd);
+ local->cont.readdir.size = size;
+ local->cont.readdir.offset = offset;
+ local->xdata_req = (dict)? dict_ref (dict) : NULL;
+
+ if (offset == 0) {
+ /* First readdir has option of failing over and selecting
+ an appropriate read subvolume */
+ afr_read_txn (frame, this, fd->inode, afr_readdir_wind,
+ AFR_DATA_TRANSACTION);
+ } else {
+ /* But continued readdirs MUST stick to the same subvolume
+ without an option to failover */
+ afr_deitransform (this, offset, &subvol,
+ (uint64_t *)&local->cont.readdir.offset);
+ afr_readdir_wind (frame, this, subvol);
+ }
return 0;
out:
@@ -491,7 +403,8 @@ int32_t
afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *xdata)
{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
+ afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
+
return 0;
}
@@ -501,6 +414,7 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict);
+
return 0;
}
@@ -508,7 +422,6 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
int32_t
afr_releasedir (xlator_t *this, fd_t *fd)
{
- afr_forget_entries (fd);
afr_cleanup_fd_ctx (this, fd);
return 0;
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 5a22696ce..465dde54f 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -34,10 +34,14 @@
#include "common-utils.h"
#include "compat-errno.h"
#include "compat.h"
+#include "byte-order.h"
#include "afr.h"
#include "afr-transaction.h"
+void
+afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this);
+
int
afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)
{
@@ -56,79 +60,214 @@ afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)
*op_errno = ENOMEM;
goto out;
}
- parent->path = gf_strdup( dirname (child_path) );
- if (!parent->path) {
+
+ parent->path = gf_strdup (dirname (child_path));
+ if (!parent->path) {
if (op_errno)
*op_errno = ENOMEM;
goto out;
}
- parent->inode = inode_ref (child->parent);
- uuid_copy (parent->gfid, child->pargfid);
+
+ parent->inode = inode_ref (child->parent);
+ uuid_copy (parent->gfid, child->pargfid);
ret = 0;
out:
- GF_FREE(child_path);
+ GF_FREE (child_path);
return ret;
}
-void
-__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, struct iatt *prenewparent,
- struct iatt *postnewparent)
+
+static void
+__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int inode_read_subvol = -1;
+ int parent_read_subvol = -1;
+ int parent2_read_subvol = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (local->inode) {
+ afr_replies_interpret (frame, this, local->inode);
+ inode_read_subvol = afr_data_subvol_get (local->inode, this,
+ NULL, NULL);
+ }
+ if (local->parent)
+ parent_read_subvol = afr_data_subvol_get (local->parent, this,
+ NULL, NULL);
+ if (local->parent2)
+ parent2_read_subvol = afr_data_subvol_get (local->parent2, this,
+ NULL, NULL);
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno (local, priv);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ if (local->inode)
+ afr_inode_read_subvol_reset (local->inode,
+ this);
+ if (local->parent)
+ afr_inode_read_subvol_reset (local->parent,
+ this);
+ if (local->parent2)
+ afr_inode_read_subvol_reset (local->parent2,
+ this);
+ continue;
+ }
+
+ if (local->op_ret == -1) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.dir_fop.buf =
+ local->replies[i].poststat;
+ local->cont.dir_fop.preparent =
+ local->replies[i].preparent;
+ local->cont.dir_fop.postparent =
+ local->replies[i].postparent;
+ local->cont.dir_fop.prenewparent =
+ local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent =
+ local->replies[i].postparent2;
+ if (local->replies[i].xdata)
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ continue;
+ }
+
+ if (i == inode_read_subvol) {
+ local->cont.dir_fop.buf =
+ local->replies[i].poststat;
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ }
+ }
+
+ if (i == parent_read_subvol) {
+ local->cont.dir_fop.preparent =
+ local->replies[i].preparent;
+ local->cont.dir_fop.postparent =
+ local->replies[i].postparent;
+ }
+
+ if (i == parent2_read_subvol) {
+ local->cont.dir_fop.prenewparent =
+ local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent =
+ local->replies[i].postparent2;
+ }
+ }
+}
+
+
+static void
+__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *poststat,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ if (op_ret >= 0) {
+ if (poststat)
+ local->replies[child_index].poststat = *poststat;
+ if (preparent)
+ local->replies[child_index].preparent = *preparent;
+ if (postparent)
+ local->replies[child_index].postparent = *postparent;
+ if (preparent2)
+ local->replies[child_index].preparent2 = *preparent2;
+ if (postparent2)
+ local->replies[child_index].postparent2 = *postparent2;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ if (op_errno != ENOTEMPTY)
+ afr_transaction_fop_failed (frame, this, child_index);
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
+
+ return;
+}
+
+
+static int
+__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ __afr_dir_write_fill (frame, this, child_index, op_ret,
+ op_errno, buf, preparent, postparent,
+ preparent2, postparent2, xdata);
+ }
+ UNLOCK (&frame->lock);
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ __afr_dir_write_finalize (frame, this);
+
+ if (afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret > -1) {
- local->op_ret = op_ret;
-
- if ((local->success_count == 0) ||
- (child_index == local->read_child_index)) {
- local->cont.dir_fop.preparent = *preparent;
- local->cont.dir_fop.postparent = *postparent;
- if (buf)
- local->cont.dir_fop.buf = *buf;
- if (prenewparent)
- local->cont.dir_fop.prenewparent = *prenewparent;
- if (postnewparent)
- local->cont.dir_fop.postnewparent = *postnewparent;
- }
-
- local->cont.dir_fop.inode = inode;
-
- local->fresh_children[local->success_count] = child_index;
- local->success_count++;
- local->child_errno[child_index] = 0;
- } else {
- local->child_errno[child_index] = op_errno;
+ afr_mark_entry_pending_changelog (frame, this);
+
+ local->transaction.resume (frame, this);
}
- local->op_errno = op_errno;
+ return 0;
}
+
int
afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
+ xlator_t *this, int op_ret, int op_errno,
dict_t *xattr, dict_t *xdata)
{
- int call_count = 0;
+ int call_count = 0;
call_count = afr_frame_return (frame);
- if (call_count == 0) {
+
+ if (call_count == 0)
AFR_STACK_DESTROY (frame);
- }
+
return 0;
}
+
void
afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this)
{
@@ -136,146 +275,109 @@ afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_local_t *new_local = NULL;
afr_private_t *priv = NULL;
- dict_t **xattr = NULL;
+ dict_t *xattr = NULL;
int32_t **changelog = NULL;
int i = 0;
- GF_UNUSED int op_errno = 0;
+ int idx = 0;
+ int op_errno = ENOMEM;
+ unsigned char *pending = NULL;
+ int call_count = 0;
local = frame->local;
priv = this->private;
new_frame = copy_frame (frame);
- if (!new_frame) {
+ if (!new_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (new_frame->local, out);
- new_local = new_frame->local;
+ new_local = AFR_FRAME_INIT (new_frame, op_errno);
+ if (!new_local)
+ goto out;
+
changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
if (!changelog)
goto out;
- xattr = GF_CALLOC (priv->child_count, sizeof (*xattr),
- gf_afr_mt_dict_t);
- if (!xattr)
- goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_errno[i])
- continue;
- xattr[i] = dict_new ();
- if (!xattr[i])
- goto out;
- }
+ xattr = dict_new ();
+ if (!xattr)
+ goto out;
+
+ idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
+
+ pending = alloca0 (priv->child_count);
- afr_prepare_new_entry_pending_matrix (changelog,
- afr_is_errno_set,
- local->child_errno,
- &local->cont.dir_fop.buf,
- priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ !local->transaction.failed_subvols[i]) {
+ call_count ++;
+ continue;
+ }
+
+ changelog[i][idx] = hton32(1);
+ pending[i] = 1;
+ }
new_local->pending = changelog;
uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
- new_local->loc.inode = inode_ref (local->cont.dir_fop.inode);
- new_local->call_count = local->success_count;
+ new_local->loc.inode = inode_ref (local->inode);
+
+
+ afr_set_pending_dict (priv, xattr, changelog);
+
+ new_local->call_count = call_count;
for (i = 0; i < priv->child_count; i++) {
- if (local->child_errno[i])
+ if (pending[i])
continue;
- afr_set_pending_dict (priv, xattr[i], changelog, i, LOCAL_LAST);
STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk,
(void *) (long) i, priv->children[i],
priv->children[i]->fops->xattrop,
&new_local->loc, GF_XATTROP_ADD_ARRAY,
- xattr[i], NULL);
+ xattr, NULL);
+ if (!--call_count)
+ break;
}
+
new_frame = NULL;
out:
if (new_frame)
AFR_STACK_DESTROY (new_frame);
- afr_xattr_array_destroy (xattr, priv->child_count);
+ if (xattr)
+ dict_unref (xattr);
return;
}
-gf_boolean_t
-afr_is_new_entry_changelog_needed (glusterfs_fop_t fop)
-{
- glusterfs_fop_t fops[] = {GF_FOP_CREATE, GF_FOP_MKNOD, GF_FOP_NULL};
- int i = 0;
-
- for (i = 0; fops[i] != GF_FOP_NULL; i++) {
- if (fop == fops[i])
- return _gf_true;
- }
- return _gf_false;
-}
void
-afr_dir_fop_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this)
+afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_count = 0;
+ int failed_count = 0;
local = frame->local;
priv = this->private;
if (local->op_ret < 0)
- goto out;
+ return;
- if (local->success_count == priv->child_count)
- goto out;
+ if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD)
+ return;
- if (!afr_is_new_entry_changelog_needed (local->op))
- goto out;
+ pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+ failed_count = AFR_COUNT (local->transaction.failed_subvols,
+ priv->child_count);
- afr_mark_new_entry_changelog (frame, this);
+ if (pre_op_count == priv->child_count && !failed_count)
+ return;
-out:
- return;
-}
-
-void
-afr_dir_fop_handle_all_fop_failures (call_frame_t *frame)
-{
- xlator_t *this = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- this = frame->this;
- local = frame->local;
- priv = this->private;
-
- if (local->op_ret >= 0)
- goto out;
+ afr_mark_new_entry_changelog (frame, this);
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
-out:
return;
}
-void
-afr_dir_fop_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- if (local->cont.dir_fop.inode == NULL)
- goto done;
- afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode,
- local->fresh_children,
- local->read_child_index,
- priv->read_child,
- local->cont.dir_fop.buf.ia_gfid);
-done:
- local->transaction.unwind (frame, this);
- afr_dir_fop_mark_entry_pending_changelog (frame, this);
- afr_dir_fop_handle_all_fop_failures (frame);
- local->transaction.resume (frame, this);
-}
/* {{{ create */
@@ -287,26 +389,16 @@ afr_create_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (create, main_frame,
- local->op_ret, local->op_errno,
- local->cont.create.fd,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- local->xdata_rsp);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+
+ AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno,
+ local->cont.create.fd, local->inode,
+ &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -318,175 +410,79 @@ afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *preparent, struct iatt *postparent,
dict_t *xdata)
{
- afr_local_t *local = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret > -1) {
- ret = afr_fd_ctx_set (this, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set ctx on fd=%p", fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
- fd_ctx->flags = local->cont.create.flags;
-
- if (local->success_count == 0) {
- if (xdata)
- local->xdata_rsp = dict_ref(xdata);
- }
- }
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
-
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_create_wind (call_frame_t *frame, xlator_t *this)
+afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->create,
- &local->loc,
- local->cont.create.flags,
- local->cont.create.mode,
- local->umask,
- local->cont.create.fd,
- local->xdata_req);
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->create,
+ &local->loc, local->cont.create.flags,
+ local->cont.create.mode, local->umask,
+ local->cont.create.fd, local->xdata_req);
return 0;
}
int
-afr_create_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *params)
+afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(create,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!local->fd_ctx)
+ goto out;
+
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
local->op = GF_FOP_CREATE;
local->cont.create.flags = flags;
local->cont.create.mode = mode;
local->cont.create.fd = fd_ref (fd);
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
- local->transaction.fop = afr_create_wind;
- local->transaction.done = afr_create_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_create_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_create_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -513,15 +509,13 @@ afr_create (call_frame_t *frame, xlator_t *this,
goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (create, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
return 0;
}
@@ -537,25 +531,14 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (mknod, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -566,131 +549,72 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-int32_t
-afr_mknod_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mknod,
- &local->loc, local->cont.mknod.mode,
- local->cont.mknod.dev,
- local->umask,
- local->xdata_req);
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mknod,
+ &local->loc, local->cont.mknod.mode,
+ local->cont.mknod.dev, local->umask,
+ local->xdata_req);
return 0;
}
-
-int
-afr_mknod_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
int
afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t dev, mode_t umask, dict_t *params)
+ dev_t dev, mode_t umask, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(mknod,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
local->op = GF_FOP_MKNOD;
local->cont.mknod.mode = mode;
local->cont.mknod.dev = dev;
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
- local->transaction.fop = afr_mknod_wind;
- local->transaction.done = afr_mknod_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_mknod_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_mknod_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -713,19 +637,17 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (mknod, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
return 0;
}
@@ -742,25 +664,14 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (mkdir, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -771,130 +682,71 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mkdir,
- &local->loc, local->cont.mkdir.mode,
- local->umask,
- local->xdata_req);
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mkdir, &local->loc,
+ local->cont.mkdir.mode, local->umask,
+ local->xdata_req);
return 0;
}
int
-afr_mkdir_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, mode_t umask, dict_t *params)
+afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(mkdir,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
local->cont.mkdir.mode = mode;
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
local->op = GF_FOP_MKDIR;
- local->transaction.fop = afr_mkdir_wind;
- local->transaction.done = afr_mkdir_done;
+ local->transaction.wind = afr_mkdir_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_mkdir_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -917,20 +769,17 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,
int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (mkdir, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
return 0;
}
@@ -947,25 +796,14 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (link, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -976,127 +814,70 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_link_wind (call_frame_t *frame, xlator_t *this)
+afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_link_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->link,
- &local->loc,
- &local->newloc, local->xdata_req);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_link_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->link,
+ &local->loc, &local->newloc, local->xdata_req);
return 0;
}
int
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(link,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, oldloc);
loc_copy (&local->newloc, newloc);
+
+ local->inode = inode_ref (oldloc->inode);
+ local->parent = inode_ref (newloc->parent);
+
if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ if (!local->xdata_req)
+ goto out;
local->op = GF_FOP_LINK;
- local->transaction.fop = afr_link_wind;
- local->transaction.done = afr_link_done;
+
+ local->transaction.wind = afr_link_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_link_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc,
@@ -1119,18 +900,17 @@ afr_link (call_frame_t *frame, xlator_t *this,
int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (link, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
return 0;
}
@@ -1147,25 +927,14 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (symlink, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -1176,132 +945,71 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->symlink,
- local->cont.symlink.linkpath,
- &local->loc,
- local->umask,
- local->xdata_req);
-
- if (!--call_count)
- break;
-
- }
- }
-
- return 0;
-}
-
-
-int
-afr_symlink_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->symlink,
+ local->cont.symlink.linkpath, &local->loc,
+ local->umask, local->xdata_req);
return 0;
}
int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, mode_t umask, dict_t *params)
+afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(symlink,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
local->cont.symlink.linkpath = gf_strdup (linkpath);
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
local->op = GF_FOP_SYMLINK;
- local->transaction.fop = afr_symlink_wind;
- local->transaction.done = afr_symlink_done;
+ local->transaction.wind = afr_symlink_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_symlink_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -1324,19 +1032,17 @@ afr_symlink (call_frame_t *frame, xlator_t *this,
int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (symlink, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL);
return 0;
}
@@ -1352,26 +1058,16 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (rename, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- &local->cont.dir_fop.prenewparent,
- &local->cont.dir_fop.postnewparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent,
+ &local->cont.dir_fop.prenewparent,
+ &local->cont.dir_fop.postnewparent, local->xdata_rsp);
return 0;
}
@@ -1383,131 +1079,72 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *prenewparent, struct iatt *postnewparent,
dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY)
- afr_transaction_fop_failed (frame, this, child_index);
- local->op_errno = op_errno;
- local->child_errno[child_index] = op_errno;
-
- if (op_ret > -1)
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, NULL, buf,
- preoldparent, postoldparent,
- prenewparent, postnewparent);
-
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
}
-int32_t
-afr_rename_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rename,
- &local->loc,
- &local->newloc, NULL);
- if (!--call_count)
- break;
- }
- }
+ local = frame->local;
+ priv = this->private;
+ STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rename,
+ &local->loc, &local->newloc, local->xdata_req);
return 0;
}
int
-afr_rename_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
+ int op_errno = ENOMEM;
int nlockee = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
QUORUM_CHECK(rename,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
+ if (!transaction_frame)
op_errno = ENOMEM;
- goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, oldloc);
loc_copy (&local->newloc, newloc);
- local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL);
+ local->inode = inode_ref (oldloc->inode);
+ local->parent = inode_ref (oldloc->parent);
+ local->parent2 = inode_ref (newloc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
local->op = GF_FOP_RENAME;
- local->transaction.fop = afr_rename_wind;
- local->transaction.done = afr_rename_done;
+ local->transaction.wind = afr_rename_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_rename_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc,
@@ -1557,20 +1194,17 @@ afr_rename (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (rename, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
return 0;
}
@@ -1586,23 +1220,13 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (unlink, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -1612,123 +1236,69 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (child_index == local->read_child_index) {
- local->read_child_returned = _gf_true;
- }
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, NULL, NULL,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-int32_t
-afr_unlink_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->unlink,
- &local->loc, local->xflag,
- local->xdata_req);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int32_t
-afr_unlink_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->unlink,
+ &local->loc, local->xflag, local->xdata_req);
return 0;
}
-int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int xflag, dict_t *xdata)
+int
+afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(unlink,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
local->xflag = xflag;
+
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
+
if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
local->op = GF_FOP_UNLINK;
- local->transaction.fop = afr_unlink_wind;
- local->transaction.done = afr_unlink_done;
+ local->transaction.wind = afr_unlink_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_unlink_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -1751,19 +1321,16 @@ afr_unlink (call_frame_t *frame, xlator_t *this,
int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (unlink, frame, -1, op_errno,
- NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -1781,23 +1348,13 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (rmdir, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -1807,130 +1364,71 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- int read_child = 0;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
- if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY))
- afr_transaction_fop_failed (frame, this, child_index);
- local->op_errno = op_errno;
- local->child_errno[child_index] = op_errno;
- if (op_ret > -1)
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, NULL, NULL,
- preparent, postparent, NULL,
- NULL);
-
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rmdir,
- &local->loc, local->cont.rmdir.flags,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_rmdir_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rmdir,
+ &local->loc, local->cont.rmdir.flags, local->xdata_req);
return 0;
}
int
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, dict_t *xdata)
+afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
+ int op_errno = ENOMEM;
int nlockee = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
QUORUM_CHECK(rmdir,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
- local->cont.rmdir.flags = flags;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
+
+ local->cont.rmdir.flags = flags;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
local->op = GF_FOP_RMDIR;
- local->transaction.fop = afr_rmdir_wind;
- local->transaction.done = afr_rmdir_done;
+ local->transaction.wind = afr_rmdir_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_rmdir_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -1965,18 +1463,16 @@ afr_rmdir (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 051b7b164..4cb219246 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -35,237 +35,153 @@
#include "compat-errno.h"
#include "compat.h"
-/**
- * Common algorithm for inode read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: access, stat, fstat, readlink, getxattr
- */
+#include "afr-transaction.h"
+
/* {{{ access */
-int32_t
-afr_access_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+int
+afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
+ afr_local_t *local = NULL;
local = frame->local;
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.access.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
-
- unwind = 0;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->access,
- &local->loc, local->cont.access.mask,
- NULL);
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
-out:
- if (unwind) {
- AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
- }
+ AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
return 0;
}
-int32_t
-afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
- dict_t *xdata)
+int
+afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (access, frame, local->op_ret,
+ local->op_errno, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->access,
+ &local->loc, local->cont.access.mask,
+ local->xdata_req);
+ return 0;
+}
+int
+afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int mask, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- read_child = afr_inode_get_read_ctx (this, loc->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.access.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- loc_copy (&local->loc, loc);
- local->cont.access.mask = mask;
+ local->op = GF_FOP_ACCESS;
+ loc_copy (&local->loc, loc);
+ local->cont.access.mask = mask;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->access,
- loc, mask, xdata);
+ afr_read_txn (frame, this, loc->inode, afr_access_wind,
+ AFR_METADATA_TRANSACTION);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL);
+
return 0;
}
-
/* }}} */
/* {{{ stat */
-int32_t
+int
afr_stat_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
struct iatt *buf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
-
- read_child = (long) cookie;
+ afr_local_t *local = NULL;
local = frame->local;
- if (op_ret == -1) {
- last_index = &local->cont.stat.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- unwind = 0;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- STACK_WIND_COOKIE (frame, afr_stat_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->stat,
- &local->loc, NULL);
- }
-
-out:
- if (unwind) {
- AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
- }
+ AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
-int32_t
-afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+int
+afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno,
+ 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->stat,
+ &local->loc, local->xdata_req);
+ return 0;
+}
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+int
+afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- read_child = afr_inode_get_read_ctx (this, loc->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.stat.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
- loc_copy (&local->loc, loc);
+ local->op = GF_FOP_STAT;
+ loc_copy (&local->loc, loc);
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->stat,
- loc, xdata);
+ afr_read_txn (frame, this, loc->inode, afr_stat_wind,
+ AFR_DATA_TRANSACTION);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
return 0;
}
@@ -275,52 +191,49 @@ out:
/* {{{ fstat */
-int32_t
+int
afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf,
dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
+ afr_local_t *local = NULL;
local = frame->local;
- read_child = (long) cookie;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- if (op_ret == -1) {
- last_index = &local->cont.fstat.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- unwind = 0;
+ AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
- STACK_WIND_COOKIE (frame, afr_fstat_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->fstat,
- local->fd, NULL);
- }
+ return 0;
+}
-out:
- if (unwind) {
- AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
- }
- return 0;
+int
+afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno,
+ 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fstat,
+ local->fd, local->xdata_req);
+ return 0;
}
@@ -328,71 +241,26 @@ int32_t
afr_fstat (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- int32_t op_errno = 0;
- int32_t read_child = 0;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- children = priv->children;
+ local->op = GF_FOP_FSTAT;
+ local->fd = fd_ref (fd);
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- VALIDATE_OR_GOTO (fd->inode, out);
+ afr_fix_open (fd, this);
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ afr_read_txn (frame, this, fd->inode, afr_fstat_wind,
+ AFR_DATA_TRANSACTION);
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
-
- read_child = afr_inode_get_read_ctx (this, fd->inode,
- local->fresh_children);
-
-
-
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.fstat.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- local->fd = fd_ref (fd);
-
- afr_open_fd_fix (fd, this);
-
- STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->fstat,
- fd, xdata);
-
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
return 0;
}
@@ -401,115 +269,77 @@ out:
/* {{{ readlink */
-int32_t
+int
afr_readlink_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
const char *buf, struct iatt *sbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
+ afr_local_t *local = NULL;
- read_child = (long) cookie;
+ local = frame->local;
- if (op_ret == -1) {
- last_index = &local->cont.readlink.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->readlink,
- &local->loc,
- local->cont.readlink.size, NULL);
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
-out:
- if (unwind) {
- AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf,
- xdata);
- }
+ AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno,
+ buf, sbuf, xdata);
+ return 0;
+}
- return 0;
+int
+afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readlink, frame, local->op_ret,
+ local->op_errno, 0, 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readlink,
+ &local->loc, local->cont.readlink.size,
+ local->xdata_req);
+ return 0;
}
-int32_t
+int
afr_readlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, size_t size, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
+ afr_local_t * local = NULL;
int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
- read_child = afr_inode_get_read_ctx (this, loc->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.readlink.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ local->op = GF_FOP_READLINK;
loc_copy (&local->loc, loc);
+ local->cont.readlink.size = size;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- local->cont.readlink.size = size;
-
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readlink,
- loc, size, xdata);
+ afr_read_txn (frame, this, loc->inode, afr_readlink_wind,
+ AFR_DATA_TRANSACTION);
- ret = 0;
-out:
- if (ret < 0)
- AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
+out:
+ AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
+
+ return 0;
}
@@ -547,7 +377,7 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value,
void
-__filter_xattrs (dict_t *dict)
+afr_filter_xattrs (dict_t *dict)
{
struct list_head keys = {0,};
struct _xattr_key *key = NULL;
@@ -568,59 +398,56 @@ __filter_xattrs (dict_t *dict)
}
-
-int32_t
+int
afr_getxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
+ afr_local_t *local = NULL;
local = frame->local;
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.getxattr.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name,
- NULL);
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
+ if (dict)
+ afr_filter_xattrs (dict);
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
- }
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
+/*
+ * Wind callback handed to afr_read_txn() by afr_getxattr().
+ *
+ * subvol == -1: nothing is wound; the fop is unwound to the caller with
+ * the op_ret/op_errno already stored in frame->local and NULL dicts
+ * (afr_getxattr_cbk stores the last failure there before it calls
+ * afr_read_txn_continue). Presumably -1 means the read transaction has
+ * no subvolume left to try -- TODO confirm against afr_read_txn.
+ *
+ * Otherwise getxattr is wound to priv->children[subvol], with subvol as
+ * the cookie so that afr_getxattr_cbk can tell which child replied.
+ *
+ * Always returns 0.
+ */
+int
+afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, NULL, NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->getxattr,
+ &local->loc, local->cont.getxattr.name,
+ local->xdata_req);
+ return 0;
+}
+
+
int32_t
afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno,
dict_t *dict, dict_t *xdata)
@@ -656,7 +483,7 @@ afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
{
callcnt = --local->call_count;
if (op_ret == -1)
- local->child_errno[cky] = op_errno;
+ local->replies[cky].op_errno = op_errno;
if (!local->dict)
local->dict = dict_new ();
@@ -707,12 +534,10 @@ unlock:
unwind:
// Updating child_errno with more recent 'events'
- local->child_errno[cky] = op_errno;
- op_errno = afr_resultant_errno_get (NULL, local->child_errno,
- priv->child_count);
+ op_errno = afr_final_errno (local, priv);
+
AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
xdata);
-
if (xattr)
dict_unref (xattr);
}
@@ -746,7 +571,7 @@ afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
{
callcnt = --local->call_count;
if (op_ret == -1)
- local->child_errno[cky] = op_errno;
+ local->replies[cky].op_errno = op_errno;
if (!local->dict)
local->dict = dict_new ();
@@ -797,9 +622,8 @@ unlock:
unwind:
// Updating child_errno with more recent 'events'
- local->child_errno[cky] = op_errno;
- op_errno = afr_resultant_errno_get (NULL, local->child_errno,
- priv->child_count);
+ op_errno = afr_final_errno (local, priv);
+
AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata);
if (xattr)
@@ -1124,6 +948,14 @@ afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
{
callcnt = --local->call_count;
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+
if (!dict || (op_ret < 0))
goto out;
@@ -1201,8 +1033,8 @@ out:
" key in dict");
unwind:
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, nxattr,
- xdata);
+ AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
+ local->op_errno, nxattr, local->xdata_rsp);
if (nxattr)
dict_unref (nxattr);
@@ -1239,6 +1071,14 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
{
callcnt = --local->call_count;
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+
if (!dict || (op_ret < 0))
goto out;
@@ -1313,8 +1153,8 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
" key in dict");
unwind:
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, nxattr,
- xdata);
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, nxattr, local->xdata_rsp);
if (nxattr)
dict_unref (nxattr);
@@ -1323,6 +1163,62 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
return ret;
}
+static int
+afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+ /*
+ * dict_foreach() callback used by afr_common_getxattr_stime_cbk.
+ * Keys that do not match GF_XATTR_STIME_PATTERN are skipped and
+ * report 0. Matching keys are passed to gf_get_max_stime() together
+ * with 'data', which the caller sets to the dict being accumulated
+ * into; presumably that helper keeps the larger stime -- TODO confirm.
+ * The 'this' dict argument is not used; THIS supplies the xlator.
+ */
+ if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_max_stime (THIS, data, key, value);
+
+ return ret;
+}
+/*
+ * Per-child getxattr callback that afr_is_special_xattr() selects for
+ * names matching GF_XATTR_STIME_PATTERN.
+ *
+ * Each reply decrements local->call_count under frame->lock:
+ *  - a failed reply (op_ret < 0) or one without a dict only records
+ *    op_errno in local->op_errno;
+ *  - the first successful reply seeds local->dict through
+ *    dict_copy_with_ref(); later ones are folded into it key by key via
+ *    afr_aggregate_stime_xattr; either way local->op_ret becomes 0.
+ *
+ * When the count reaches zero the fop is unwound with local->op_ret,
+ * local->op_errno and local->dict. The xdata passed up is the one from
+ * the reply that arrived last, not an aggregate.
+ *
+ * If frame, frame->local or this is NULL the function only logs and
+ * returns without unwinding. Always returns 0.
+ */
+int32_t
+afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup; /* jumps out with the lock still held; cleanup releases it */
+ }
+
+ if (!local->dict)
+ local->dict = dict_copy_with_ref (dict, NULL);
+ else
+ dict_foreach (dict, afr_aggregate_stime_xattr,
+ local->dict);
+ local->op_ret = 0;
+ }
+
+cleanup:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, local->dict, xdata);
+ }
+
+out:
+ return 0;
+}
+
+
static gf_boolean_t
afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
gf_boolean_t is_fgetxattr)
@@ -1330,12 +1226,13 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
gf_boolean_t is_spl = _gf_true;
GF_ASSERT (cbk);
- if (!cbk) {
+ if (!cbk || !name) {
is_spl = _gf_false;
goto out;
}
- if (!strcmp (name, GF_XATTR_PATHINFO_KEY)) {
+ if (!strcmp (name, GF_XATTR_PATHINFO_KEY) ||
+ !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) {
if (is_fgetxattr) {
*cbk = afr_fgetxattr_pathinfo_cbk;
} else {
@@ -1355,6 +1252,8 @@ afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
} else {
*cbk = afr_getxattr_lockinfo_cbk;
}
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
} else {
is_spl = _gf_false;
}
@@ -1364,26 +1263,33 @@ out:
}
static void
-afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame,
- const char *name, loc_t *loc,
- fop_getxattr_cbk_t cbk)
+afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame,
+ const char *name, loc_t *loc,
+ fop_getxattr_cbk_t cbk)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- xlator_t **children = NULL;
int i = 0;
+ int call_count = 0;
priv = this->private;
- children = priv->children;
local = frame->local;
- local->call_count = priv->child_count;
+ //local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ //If up-children count is 0, afr_local_init would have failed already
+ //and the call would have unwound so not handling it here.
for (i = 0; i < priv->child_count; i++) {
- STACK_WIND_COOKIE (frame, cbk,
- (void *) (long) i,
- children[i], children[i]->fops->getxattr,
- loc, name, NULL);
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->getxattr,
+ loc, name, NULL);
+ if (!--call_count)
+ break;
+ }
}
return;
}
@@ -1394,39 +1300,41 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
{
afr_private_t *priv = NULL;
xlator_t **children = NULL;
- int call_child = 0;
afr_local_t *local = NULL;
xlator_list_t *trav = NULL;
xlator_t **sub_volumes = NULL;
int i = 0;
int32_t op_errno = 0;
- int32_t read_child = -1;
int ret = -1;
fop_getxattr_cbk_t cbk = NULL;
+ int afr_xtime_gauge[MCNT_MAX] = {0,};
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
children = priv->children;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ loc_copy (&local->loc, loc);
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local->op = GF_FOP_GETXATTR;
+
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- loc_copy (&local->loc, loc);
if (!name)
goto no_name;
local->cont.getxattr.name = gf_strdup (name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
if (!strncmp (name, AFR_XATTR_PREFIX,
strlen (AFR_XATTR_PREFIX))) {
gf_log (this->name, GF_LOG_INFO,
@@ -1452,6 +1360,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
priv->child_count,
MARKER_UUID_TYPE,
+ marker_uuid_default_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
@@ -1469,8 +1378,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
* collect information from all childs
*/
if (afr_is_special_xattr (name, &cbk, 0)) {
- afr_getxattr_frm_all_children (this, frame, name,
- loc, cbk);
+ afr_getxattr_all_subvols (this, frame, name, loc, cbk);
return 0;
}
@@ -1498,12 +1406,20 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
}
+ /* don't err out on getting ENOTCONN (brick down)
+ * from a subset of the bricks
+ */
+ memcpy (afr_xtime_gauge, marker_xtime_default_gauge,
+ sizeof (afr_xtime_gauge));
+ afr_xtime_gauge[MCNT_NOTFOUND] = 0;
+ afr_xtime_gauge[MCNT_ENOTCONN] = 0;
if (cluster_getmarkerattr (frame, this, loc,
name, local,
afr_getxattr_unwind,
sub_volumes,
priv->child_count,
MARKER_XTIME_TYPE,
+ afr_xtime_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
"%s: failed to get marker attr (%s)",
@@ -1517,28 +1433,9 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
}
no_name:
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
- read_child = afr_inode_get_read_ctx (this, loc->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.getxattr.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->getxattr,
- loc, name, xdata);
+ afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind,
+ AFR_METADATA_TRANSACTION);
ret = 0;
out:
@@ -1555,127 +1452,108 @@ afr_fgetxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
+ afr_local_t *local = NULL;
- priv = this->private;
- children = priv->children;
+ local = frame->local;
- local = frame->local;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- read_child = (long) cookie;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- if (op_ret == -1) {
- last_index = &local->cont.getxattr.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (dict)
+ afr_filter_xattrs (dict);
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->fgetxattr,
- local->fd,
- local->cont.getxattr.name,
- NULL);
- }
+ AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
-
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict,
- xdata);
- }
-
- return 0;
+ return 0;
}
-int32_t
-afr_fgetxattr_unwind (call_frame_t *frame,
- int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
-
+int
+afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
+ local->op_errno, NULL, NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fgetxattr,
+ local->fd, local->cont.getxattr.name,
+ local->xdata_req);
+ return 0;
}
+
static void
-afr_fgetxattr_frm_all_children (xlator_t *this, call_frame_t *frame,
- const char *name, fd_t *fd,
- fop_fgetxattr_cbk_t cbk)
+afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame,
+ fop_fgetxattr_cbk_t cbk)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- xlator_t **children = NULL;
int i = 0;
+ int call_count = 0;
priv = this->private;
- children = priv->children;
local = frame->local;
- local->call_count = priv->child_count;
+ //local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ //If up-children count is 0, afr_local_init would have failed already
+ //and the call would have unwound so not handling it here.
for (i = 0; i < priv->child_count; i++) {
- STACK_WIND_COOKIE (frame, cbk,
- (void *) (long) i,
- children[i], children[i]->fops->fgetxattr,
- fd, name, NULL);
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fgetxattr,
+ local->fd, local->cont.getxattr.name,
+ NULL);
+ if (!--call_count)
+ break;
+ }
}
return;
}
-int32_t
+
+int
afr_fgetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
afr_local_t *local = NULL;
- int32_t op_ret = -1;
int32_t op_errno = 0;
- int32_t read_child = -1;
fop_fgetxattr_cbk_t cbk = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
- frame->local = local;
-
- op_ret = afr_local_init (local, priv, &op_errno);
- if (op_ret < 0) {
- op_errno = -op_ret;
- goto out;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ local->op = GF_FOP_FGETXATTR;
local->fd = fd_ref (fd);
- if (name)
+ if (name) {
local->cont.getxattr.name = gf_strdup (name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ }
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
/* pathinfo gets handled only in getxattr(), but we need to handle
* lockinfo.
@@ -1683,42 +1561,19 @@ afr_fgetxattr (call_frame_t *frame, xlator_t *this,
* collect information from all children.
*/
if (afr_is_special_xattr (name, &cbk, 1)) {
- afr_fgetxattr_frm_all_children (this, frame, name,
- fd, cbk);
+ afr_fgetxattr_all_subvols (this, frame, cbk);
return 0;
}
+ afr_fix_open (fd, this);
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
-
- read_child = afr_inode_get_read_ctx (this, fd->inode,
- local->fresh_children);
- op_ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.getxattr.last_index);
- if (op_ret < 0) {
- op_errno = -op_ret;
- op_ret = -1;
- goto out;
- }
+ afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind,
+ AFR_METADATA_TRANSACTION);
- STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->fgetxattr,
- fd, name, xdata);
-
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL,
- NULL);
- }
+ AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -1727,147 +1582,84 @@ out:
/* {{{ readv */
-/**
- * read algorithm:
- *
- * if the user has specified a read subvolume, use it
- * otherwise -
- * use the inode number to hash it to one of the subvolumes, and
- * read from there (to balance read load)
- *
- * if any of the above read's fail, try the children in sequence
- * beginning at the beginning
- */
-
-int32_t
+int
afr_readv_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
struct iovec *vector, int32_t count, struct iatt *buf,
struct iobref *iobref, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t *fresh_children = NULL;
- int32_t read_child = -1;
+ afr_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- local = frame->local;
-
- read_child = (long) cookie;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- if (op_ret == -1) {
- last_index = &local->cont.readv.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
-
- unwind = 0;
-
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->readv,
- local->fd, local->cont.readv.size,
- local->cont.readv.offset,
- local->cont.readv.flags,
- NULL);
- }
+ AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
+ vector, count, buf, iobref, xdata);
+ return 0;
+}
-out:
- if (unwind) {
- AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
- vector, count, buf, iobref, xdata);
- }
- return 0;
+int
+afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno,
+ 0, 0, 0, 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readv,
+ local->fd, local->cont.readv.size,
+ local->cont.readv.offset, local->cont.readv.flags,
+ local->xdata_req);
+ return 0;
}
-int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+int
+afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
{
- afr_private_t * priv = NULL;
afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
- priv = this->private;
- children = priv->children;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
-
- read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.readv.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- local->fd = fd_ref (fd);
-
- local->cont.readv.size = size;
- local->cont.readv.offset = offset;
- local->cont.readv.flags = flags;
+ local->op = GF_FOP_READ;
+ local->fd = fd_ref (fd);
+ local->cont.readv.size = size;
+ local->cont.readv.offset = offset;
+ local->cont.readv.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- afr_open_fd_fix (fd, this);
+ afr_fix_open (fd, this);
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readv,
- fd, size, offset, flags, xdata);
+ afr_read_txn (frame, this, fd->inode, afr_readv_wind,
+ AFR_DATA_TRANSACTION);
- ret = 0;
-out:
- if (ret < 0) {
- AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL,
- NULL, NULL);
- }
return 0;
+out:
+ AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+
+ return 0;
}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index ce4fbf226..00e0d2676 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -37,7 +37,128 @@
#include "afr.h"
#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
+
+/*
+ * Choose, from the per-child replies stored in local->replies, the
+ * result that will be reported for this inode-write fop, and store it
+ * in local (op_ret, op_errno, cont.inode_wfop.prebuf/postbuf, xdata_rsp).
+ *
+ * The result starts out as a failure (op_ret = -1, op_errno from
+ * afr_final_errno) and is replaced by a valid successful reply following
+ * the precedence described in the loop below. Children without a valid
+ * reply are ignored; every failed reply triggers
+ * afr_inode_read_subvol_reset() on the inode.
+ *
+ * read_subvol stays 0 when local->inode is NULL; otherwise it comes from
+ * afr_metadata_subvol_get() or afr_data_subvol_get() depending on the
+ * transaction type.
+ */
+static void
+__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int read_subvol = 0;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (local->inode) {
+ if (local->transaction.type == AFR_METADATA_TRANSACTION)
+ read_subvol = afr_metadata_subvol_get (local->inode, this,
+ NULL, NULL);
+ else
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ NULL, NULL);
+ }
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno (local, priv);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ afr_inode_read_subvol_reset (local->inode, this); /* NOTE(review): unlike the lookup above, not guarded by local->inode -- confirm the helper accepts NULL */
+ continue;
+ }
+
+ /* Order of checks in the compound conditional
+ below is important.
+
+ - Highest precedence: largest op_ret
+ - Next precedence: if all op_rets are equal, read subvol
+ - Least precedence: any succeeded subvol
+ */
+ if ((local->op_ret < local->replies[i].op_ret) ||
+ ((local->op_ret == local->replies[i].op_ret) &&
+ (i == read_subvol))) {
+
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.inode_wfop.prebuf =
+ local->replies[i].prestat;
+ local->cont.inode_wfop.postbuf =
+ local->replies[i].poststat;
+
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp); /* drop the ref taken for a previously selected reply */
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ }
+ }
+ }
+}
+
+/*
+ * Record one child's reply to an inode-write fop in
+ * local->replies[child_index].
+ *
+ * The slot is always marked valid and gets op_ret/op_errno. On success
+ * (op_ret >= 0) the pre/post iatt are copied when supplied and a
+ * reference is taken on xdata when supplied. On failure only
+ * afr_transaction_fop_failed() is called for that child.
+ *
+ * Takes no lock itself; the callers visible in this file
+ * (__afr_inode_write_cbk, afr_writev_wind_cbk) call it with frame->lock
+ * held.
+ */
+static void
+__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ if (op_ret >= 0) {
+ if (prebuf)
+ local->replies[child_index].prestat = *prebuf;
+ if (postbuf)
+ local->replies[child_index].poststat = *postbuf;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ } else {
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ return;
+}
+
+/*
+ * Per-child reply callback for an inode-write fop; the cookie carries
+ * the child index.
+ *
+ * The reply is stored with __afr_inode_write_fill under frame->lock.
+ * When afr_frame_return() reports 0 (presumably: no replies still
+ * outstanding -- TODO confirm), the combined result is computed with
+ * __afr_inode_write_finalize and the transaction is resumed.
+ * transaction.unwind is called here only if afr_txn_nothing_failed()
+ * holds; otherwise unwinding is left to later transaction code (compare
+ * the "Don't unwind until post-op is complete" note in
+ * afr_writev_wind_cbk).
+ *
+ * Always returns 0.
+ */
+static int
+__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ __afr_inode_write_fill (frame, this, child_index, op_ret,
+ op_errno, prebuf, postbuf, xdata);
+ }
+ UNLOCK (&frame->lock);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ __afr_inode_write_finalize (frame, this);
+
+ if (afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
+ }
+
+ return 0;
+}
/* {{{ writev */
@@ -52,8 +173,10 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame)
dst_local->op_ret = src_local->op_ret;
dst_local->op_errno = src_local->op_errno;
- dst_local->cont.writev.prebuf = src_local->cont.writev.prebuf;
- dst_local->cont.writev.postbuf = src_local->cont.writev.postbuf;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+ if (src_local->xdata_rsp)
+ dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp);
}
void
@@ -64,28 +187,11 @@ afr_writev_unwind (call_frame_t *frame, xlator_t *this)
AFR_STACK_UNWIND (writev, frame,
local->op_ret, local->op_errno,
- &local->cont.writev.prebuf,
- &local->cont.writev.postbuf,
- NULL);
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
}
-call_frame_t*
-afr_transaction_detach_fop_frame (call_frame_t *frame)
-{
- afr_local_t * local = NULL;
- call_frame_t *fop_frame = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- fop_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- return fop_frame;
-}
int
afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this)
@@ -136,50 +242,60 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *fop_frame = NULL;
int child_index = (long) cookie;
int call_count = -1;
- int read_child = 0;
+ int ret = 0;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
local = frame->local;
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
-
LOCK (&frame->lock);
{
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- local->replies[child_index].valid = 1;
- local->replies[child_index].op_ret = op_ret;
- local->replies[child_index].op_errno = op_errno;
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- /* stage the best case return value for unwind */
- if ((local->success_count == 0) || (op_ret > local->op_ret)) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- }
-
- if (op_ret != -1) {
- if ((local->success_count == 0) ||
- (child_index == read_child)) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- local->success_count++;
+ __afr_inode_write_fill (frame, this, child_index, op_ret,
+ op_errno, prebuf, postbuf, xdata);
+ if (op_ret == -1 || !xdata)
+ goto unlock;
+
+ write_is_append = 0;
+ ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+
+ ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count);
+ if (ret == -1)
+ goto unlock;
+ if ((open_fd_count > local->open_fd_count)) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
}
}
+unlock:
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0) {
-
- if (!local->stable_write)
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
afr_fd_report_unstable_write (this, local->fd);
+ __afr_inode_write_finalize (frame, this);
+
afr_writev_handle_short_writes (frame, this);
+
+ if (local->update_open_fd_count)
+ afr_handle_open_fd_count (frame, this);
+
+ if (!afr_txn_nothing_failed (frame, this)) {
+ //Don't unwind until post-op is complete
+ local->transaction.resume (frame, this);
+ } else {
/*
* Generally inode-write fops do transaction.unwind then
* transaction.resume, but writev needs to make sure that
@@ -191,81 +307,32 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* completed.
*/
- fop_frame = afr_transaction_detach_fop_frame (frame);
- afr_writev_copy_outvars (frame, fop_frame);
- local->transaction.resume (frame, this);
- afr_writev_unwind (fop_frame, this);
- }
- return 0;
-}
-
-int
-afr_writev_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = -1;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
- local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
- gf_afr_mt_reply_t);
- if (!local->replies) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- local->transaction.unwind(frame, this);
- local->transaction.resume(frame, this);
- return 0;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- local->fd,
- local->cont.writev.vector,
- local->cont.writev.count,
- local->cont.writev.offset,
- local->cont.writev.flags,
- local->cont.writev.iobref,
- NULL);
-
- if (!--call_count)
- break;
+ fop_frame = afr_transaction_detach_fop_frame (frame);
+ afr_writev_copy_outvars (frame, fop_frame);
+ local->transaction.resume (frame, this);
+ afr_writev_unwind (fop_frame, this);
}
}
-
return 0;
}
int
-afr_writev_done (call_frame_t *frame, xlator_t *this)
+afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
- iobref_unref (local->cont.writev.iobref);
- local->cont.writev.iobref = NULL;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->writev,
+ local->fd, local->cont.writev.vector,
+ local->cont.writev.count, local->cont.writev.offset,
+ local->cont.writev.flags, local->cont.writev.iobref,
+ local->xdata_req);
return 0;
}
@@ -275,29 +342,29 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
{
call_frame_t *transaction_frame = NULL;
afr_local_t *local = NULL;
- int op_ret = -1;
- int op_errno = 0;
-
- local = frame->local;
+ int ret = -1;
+ int op_errno = ENOMEM;
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
+ local = frame->local;
transaction_frame->local = local;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ frame->local = NULL;
- local->op = GF_FOP_WRITE;
+ if (!AFR_FRAME_INIT (frame, op_errno))
+ goto out;
- local->success_count = 0;
+ local->op = GF_FOP_WRITE;
- local->transaction.fop = afr_writev_wind;
- local->transaction.done = afr_writev_done;
+ local->transaction.wind = afr_writev_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_transaction_writev_unwind;
local->transaction.main_frame = frame;
+
if (local->fd->flags & O_APPEND) {
/*
* Backend vfs ignores the 'offset' for append mode fd so
@@ -314,179 +381,86 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
local->cont.writev.count);
}
- op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (op_ret < 0) {
- op_errno = -op_ret;
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
goto out;
}
- op_ret = 0;
+ return 0;
out:
- if (op_ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
-static void
-afr_trigger_open_fd_self_heal (fd_t *fd, xlator_t *this)
-{
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- char *reason = NULL;
- int32_t op_errno = 0;
- int ret = 0;
-
- if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) {
- gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid args: "
- "fd: %p, inode: %p", fd,
- fd ? fd->inode : NULL);
- goto out;
- }
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame)
- goto out;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
- ret = afr_local_init (local, this->private, &op_errno);
- if (ret < 0)
- goto out;
-
- local->loc.inode = inode_ref (fd->inode);
- ret = loc_path (&local->loc, NULL);
- if (ret < 0)
- goto out;
-
- sh = &local->self_heal;
- sh->do_metadata_self_heal = _gf_true;
- if (fd->inode->ia_type == IA_IFREG)
- sh->do_data_self_heal = _gf_true;
- else if (fd->inode->ia_type == IA_IFDIR)
- sh->do_entry_self_heal = _gf_true;
-
- reason = "subvolume came online";
- afr_launch_self_heal (frame, this, fd->inode, _gf_true,
- fd->inode->ia_type, reason, NULL, NULL);
- return;
-out:
- AFR_STACK_DESTROY (frame);
-}
-
-void
-afr_open_fd_fix (fd_t *fd, xlator_t *this)
-{
- int ret = 0;
- int i = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- gf_boolean_t need_self_heal = _gf_false;
- int *need_open = NULL;
- size_t need_open_count = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- if (!afr_is_fd_fixable (fd))
- goto out;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- goto out;
-
- LOCK (&fd->lock);
- {
- if (fd_ctx->up_count < priv->up_count) {
- need_self_heal = _gf_true;
- fd_ctx->up_count = priv->up_count;
- fd_ctx->down_count = priv->down_count;
- }
-
- need_open = alloca (priv->child_count * sizeof (*need_open));
- for (i = 0; i < priv->child_count; i++) {
- need_open[i] = 0;
- if (fd_ctx->opened_on[i] != AFR_FD_NOT_OPENED)
- continue;
-
- if (!priv->child_up[i])
- continue;
-
- fd_ctx->opened_on[i] = AFR_FD_OPENING;
-
- need_open[i] = 1;
- need_open_count++;
- }
- }
- UNLOCK (&fd->lock);
- if (ret)
- goto out;
-
- if (need_self_heal)
- afr_trigger_open_fd_self_heal (fd, this);
-
- if (!need_open_count)
- goto out;
-
- afr_fix_open (this, fd, need_open_count, need_open);
-out:
- return;
-}
int
afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = ENOMEM;
priv = this->private;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
-
QUORUM_CHECK(writev,out);
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.writev.vector = iov_dup (vector, count);
+ local->cont.writev.vector = iov_dup (vector, count);
+ if (!local->cont.writev.vector)
+ goto out;
local->cont.writev.count = count;
local->cont.writev.offset = offset;
local->cont.writev.flags = flags;
local->cont.writev.iobref = iobref_ref (iobref);
- local->fd = fd_ref (fd);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ /* Set append_write to be true speculatively. If on any
+ server it turns out not to be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
/* detect here, but set it in writev_wind_cbk *after* the unstable
write is performed
*/
local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC));
- afr_open_fd_fix (fd, this);
+ afr_fix_open (fd, this);
afr_do_writev (frame, this);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -504,22 +478,13 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (truncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.truncate.prebuf,
- &local->cont.truncate.postbuf,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
@@ -529,114 +494,32 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ afr_local_t *local = NULL;
- if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG)
- afr_transaction_fop_failed (frame, this, child_index);
+ local = frame->local;
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- if (child_index == read_child) {
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
}
-int32_t
-afr_truncate_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->truncate,
- &local->loc,
- local->cont.truncate.offset,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_truncate_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->truncate,
+ &local->loc, local->cont.truncate.offset,
+ local->xdata_req);
return 0;
}
@@ -648,56 +531,60 @@ afr_truncate (call_frame_t *frame, xlator_t *this,
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(truncate,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.truncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_truncate_wind;
- local->transaction.done = afr_truncate_done;
+ local->transaction.wind = afr_truncate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_truncate_unwind;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ local->op = GF_FOP_TRUNCATE;
local->transaction.main_frame = frame;
local->transaction.start = offset;
local->transaction.len = 0;
+ /* Set it true speculatively, will get reset in afr_truncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
+
ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -715,21 +602,13 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.ftruncate.prebuf,
- &local->cont.ftruncate.postbuf,
- NULL);
- }
+ AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
@@ -739,140 +618,75 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 0;
- int read_child = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
+ afr_local_t *local = NULL;
- local->success_count++;
+ local = frame->local;
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
}
int
-afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- local->fd,
- local->cont.ftruncate.offset,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
+ local = frame->local;
+ priv = this->private;
+ STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->ftruncate,
+ local->fd, local->cont.ftruncate.offset,
+ local->xdata_req);
return 0;
}
int
-afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ priv = this->private;
- return 0;
-}
+ QUORUM_CHECK(ftruncate,out);
+ transaction_frame = copy_frame (frame);
+ if (!frame)
+ goto out;
-int
-afr_do_ftruncate (call_frame_t *frame, xlator_t *this)
-{
- call_frame_t * transaction_frame = NULL;
- afr_local_t * local = NULL;
- int op_ret = -1;
- int op_errno = 0;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local = frame->local;
+ local->cont.ftruncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ if (!local->xdata_req)
+ goto out;
- transaction_frame->local = local;
- frame->local = NULL;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
local->op = GF_FOP_FTRUNCATE;
- local->transaction.fop = afr_ftruncate_wind;
- local->transaction.done = afr_ftruncate_done;
+ local->transaction.wind = afr_ftruncate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_ftruncate_unwind;
local->transaction.main_frame = frame;
@@ -880,69 +694,21 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this)
local->transaction.start = local->cont.ftruncate.offset;
local->transaction.len = 0;
- op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (op_ret < 0) {
- op_errno = -op_ret;
- goto out;
- }
-
- op_ret = 0;
-out:
- if (op_ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL,
- NULL, NULL);
- }
-
- return 0;
-}
-
-
-int
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_fix_open (fd, this);
- priv = this->private;
+ /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
- QUORUM_CHECK(ftruncate,out);
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->cont.ftruncate.offset = offset;
-
- local->fd = fd_ref (fd);
-
- afr_open_fd_fix (fd, this);
- afr_do_ftruncate (frame, this);
-
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -954,188 +720,92 @@ out:
int
afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (setattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.setattr.preop_buf,
- &local->cont.setattr.postop_buf,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
return 0;
}
int
afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
+ int op_ret, int op_errno,
struct iatt *preop, struct iatt *postop, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
-
- if (child_index == read_child) {
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ preop, postop, xdata);
}
-int32_t
-afr_setattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc,
- &local->cont.setattr.in_buf,
- local->cont.setattr.valid,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setattr,
+ &local->loc, &local->cont.setattr.in_buf,
+ local->cont.setattr.valid, local->xdata_req);
return 0;
}
int
-afr_setattr_done (call_frame_t *frame, xlator_t *this)
+afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
{
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(setattr,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.setattr.in_buf = *buf;
local->cont.setattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->transaction.fop = afr_setattr_wind;
- local->transaction.done = afr_setattr_done;
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_setattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_setattr_unwind;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ local->op = GF_FOP_SETATTR;
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1143,18 +813,16 @@ afr_setattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -1168,22 +836,13 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.fsetattr.preop_buf,
- &local->cont.fsetattr.postop_buf,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
@@ -1193,164 +852,72 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preop, struct iatt *postop, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
-
- if (child_index == read_child) {
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ preop, postop, xdata);
}
-int32_t
-afr_fsetattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsetattr,
- local->fd,
- &local->cont.fsetattr.in_buf,
- local->cont.fsetattr.valid,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetattr,
+ local->fd, &local->cont.fsetattr.in_buf,
+ local->cont.fsetattr.valid, local->xdata_req);
return 0;
}
int
-afr_fsetattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int
afr_fsetattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
-
QUORUM_CHECK(fsetattr,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.fsetattr.in_buf = *buf;
local->cont.fsetattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_fsetattr_wind;
- local->transaction.done = afr_fsetattr_done;
+ local->transaction.wind = afr_fsetattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_fsetattr_unwind;
local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- afr_open_fd_fix (fd, this);
+ local->op = GF_FOP_FSETATTR;
+
+ afr_fix_open (fd, this);
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1358,18 +925,16 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -1385,19 +950,12 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
@@ -1406,100 +964,32 @@ int
afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
+afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setxattr,
+ &local->loc, local->cont.setxattr.dict,
+ local->cont.setxattr.flags, local->xdata_req);
return 0;
}
int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata)
+afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -1507,59 +997,60 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,
int ret = -1;
int op_errno = EINVAL;
- VALIDATE_OR_GOTO (this, out);
-
GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
op_errno, out);
GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
op_errno, out);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
QUORUM_CHECK(setxattr,out);
+
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
local->cont.setxattr.dict = dict_ref (dict);
local->cont.setxattr.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
+ local->transaction.wind = afr_setxattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_setxattr_unwind;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
local->transaction.len = 0;
+ local->op = GF_FOP_SETXATTR;
+
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
return 0;
}
@@ -1575,19 +1066,12 @@ afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (fsetxattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
@@ -1596,98 +1080,30 @@ int
afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
int
-afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsetxattr,
- local->fd,
- local->cont.fsetxattr.dict,
- local->cont.fsetxattr.flags,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetxattr,
+ local->fd, local->cont.fsetxattr.dict,
+ local->cont.fsetxattr.flags, local->xdata_req);
return 0;
}
int
-afr_fsetxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int
afr_fsetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata)
{
@@ -1695,11 +1111,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
afr_local_t *local = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
op_errno, out);
@@ -1709,36 +1121,36 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
priv = this->private;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
-
QUORUM_CHECK(fsetxattr,out);
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
+ if (!transaction_frame)
goto out;
- }
-
- transaction_frame->local = local;
- local->op_ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.fsetxattr.dict = dict_ref (dict);
local->cont.fsetxattr.flags = flags;
- local->transaction.fop = afr_fsetxattr_wind;
- local->transaction.done = afr_fsetxattr_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_fsetxattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_fsetxattr_unwind;
local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ local->op = GF_FOP_FSETXATTR;
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1746,18 +1158,16 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
return 0;
}
@@ -1775,19 +1185,12 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (removexattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
@@ -1796,108 +1199,151 @@ int
afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
+}
- if (need_unwind)
- local->transaction.unwind (frame, this);
- call_count = afr_frame_return (frame);
+int
+afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local = frame->local;
+ priv = this->private;
+ STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->removexattr,
+ &local->loc, local->cont.removexattr.name,
+ local->xdata_req);
return 0;
}
-int32_t
-afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
+ name, op_errno, out);
+
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
+ name, op_errno, out);
- local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
+ QUORUM_CHECK(removexattr,out);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- local->call_count = call_count;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->removexattr,
- &local->loc,
- local->cont.removexattr.name,
- NULL);
-
- if (!--call_count)
- break;
- }
+ local->cont.removexattr.name = gf_strdup (name);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_removexattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_removexattr_unwind;
+
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ local->op = GF_FOP_REMOVEXATTR;
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
return 0;
}
-
+/* fremovexattr */
int
-afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = frame->local;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
- AFR_STACK_DESTROY (frame);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
int
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
+afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
+}
+
+
+int
+afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fremovexattr,
+ local->fd, local->cont.removexattr.name,
+ local->xdata_req);
+ return 0;
+}
- VALIDATE_OR_GOTO (this, out);
+
+int
+afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
name, op_errno, out);
@@ -1905,34 +1351,36 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
name, op_errno, out);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
+ priv = this->private;
- QUORUM_CHECK(removexattr,out);
+ QUORUM_CHECK(fremovexattr, out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
local->cont.removexattr.name = gf_strdup (name);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->transaction.fop = afr_removexattr_wind;
- local->transaction.done = afr_removexattr_done;
- local->transaction.unwind = afr_removexattr_unwind;
+ if (!local->xdata_req)
+ goto out;
- loc_copy (&local->loc, loc);
+ local->transaction.wind = afr_fremovexattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fremovexattr_unwind;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ local->op = GF_FOP_FREMOVEXATTR;
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1940,214 +1388,364 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL);
return 0;
}
-/* ffremovexattr */
+
int
-afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)
{
afr_local_t * local = NULL;
call_frame_t *main_frame = NULL;
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (fremovexattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
int
-afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
+
+
+int
+afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
priv = this->private;
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+ STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fallocate,
+ local->fd, local->cont.fallocate.mode,
+ local->cont.fallocate.offset,
+ local->cont.fallocate.len, local->xdata_req);
+ return 0;
+}
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
- if (need_unwind)
- local->transaction.unwind (frame, this);
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ priv = this->private;
- call_count = afr_frame_return (frame);
+ QUORUM_CHECK(fallocate,out);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_FALLOCATE;
+
+ local->transaction.wind = afr_fallocate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fallocate_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
+
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ discard */
+
+int
+afr_discard_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+
+ AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
-int32_t
-afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
+
+
+int
+afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
+ STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->discard,
+ local->fd, local->cont.discard.offset,
+ local->cont.discard.len, local->xdata_req);
+ return 0;
+}
+
+
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ priv = this->private;
+
+ QUORUM_CHECK(discard, out);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- local->call_count = call_count;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fremovexattr,
- local->fd,
- local->cont.removexattr.name,
- NULL);
-
- if (!--call_count)
- break;
- }
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_DISCARD;
+
+ local->transaction.wind = afr_discard_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_discard_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
+
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
+/* {{{ zerofill */
+
int
-afr_fremovexattr_done (call_frame_t *frame, xlator_t *this)
+afr_zerofill_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = frame->local;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
- AFR_STACK_DESTROY (frame);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (zerofill, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
int
-afr_fremovexattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
+afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
- VALIDATE_OR_GOTO (this, out);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
- name, op_errno, out);
+int
+afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
- name, op_errno, out);
+ local = frame->local;
+ priv = this->private;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this->private, out);
+ STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->zerofill,
+ local->fd, local->cont.zerofill.offset,
+ local->cont.zerofill.len, local->xdata_req);
+ return 0;
+}
+
+int
+afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
- if (afr_is_split_brain (this, fd->inode)) {
- op_errno = EIO;
- goto out;
- }
- QUORUM_CHECK(fremovexattr, out);
+ QUORUM_CHECK(zerofill, out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
- transaction_frame->local = local;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- local->op_ret = -1;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->cont.removexattr.name = gf_strdup (name);
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_fremovexattr_wind;
- local->transaction.done = afr_fremovexattr_done;
- local->transaction.unwind = afr_fremovexattr_unwind;
+ local->op = GF_FOP_ZEROFILL;
- local->fd = fd_ref (fd);
+ local->transaction.wind = afr_zerofill_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_zerofill_unwind;
local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
- op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (op_ret < 0) {
- op_errno = -op_ret;
- goto out;
+ local->transaction.start = local->cont.zerofill.offset;
+ local->transaction.len = len;
+
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
- op_ret = 0;
+ return 0;
out:
- if (op_ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index ed11079fd..7b1fc5528 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -68,4 +68,15 @@ int32_t
afr_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata);
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
+
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
+
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index e091a7939..a2a758f35 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -57,11 +57,15 @@
int
afr_entry_lockee_cmp (const void *l1, const void *l2)
{
- const afr_entry_lockee_t *r1 = l1;
- const afr_entry_lockee_t *r2 = l2;
- int ret = 0;
-
- ret = uuid_compare (r1->loc.gfid, r2->loc.gfid);
+ const afr_entry_lockee_t *r1 = l1;
+ const afr_entry_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid ((loc_t*)&r1->loc, gfid1);
+ loc_gfid ((loc_t*)&r2->loc, gfid2);
+ ret = uuid_compare (gfid1, gfid2);
/*Entrylks with NULL basename are the 'smallest'*/
if (ret == 0) {
if (!r1->basename)
@@ -75,7 +79,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2)
return -1;
else
return 1;
-
}
int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
@@ -556,37 +559,25 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
afr_private_t *priv = NULL;
- int i = 0;
+ afr_inodelk_t *inodelk = NULL;
priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
- int_lock->inodelk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- int_lock->inode_locked_nodes[i] = 0;
- }
-
- return 0;
-}
-
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
-{
- int ret = 0;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- ret = uuid_compare (l1->inode->gfid, l2->inode->gfid);
+ inodelk->lock_count = 0;
+ int_lock->lk_attempted_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
- if (ret == 0)
- ret = strcmp (b1, b2);
+ memset (inodelk->locked_nodes, 0,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ memset (int_lock->locked_nodes, 0,
+ sizeof (*int_lock->locked_nodes) * priv->child_count);
- if (ret <= 0)
- return l1;
- else
- return l2;
+ return 0;
}
int
@@ -649,7 +640,9 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int32_t child_index = (long)cookie;
+ afr_private_t *priv = NULL;
local = frame->local;
int_lock = &local->internal_lock;
@@ -658,14 +651,18 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
AFR_UNLOCK_OP, NULL, op_ret,
op_errno, child_index);
+ priv = this->private;
+
if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d "
- "unlock by %s", local->loc.path, child_index,
+ gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s "
+ "with lock owner %s", local->loc.path,
+ priv->children[child_index]->name,
lkowner_utoa (&frame->root->lk_owner));
}
- int_lock->inode_locked_nodes[child_index] &= LOCKED_NO;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ inodelk->locked_nodes[child_index] &= LOCKED_NO;
if (local->transaction.eager_lock)
local->transaction.eager_lock[child_index] = 0;
@@ -679,6 +676,7 @@ static int
afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
struct gf_flock flock = {0,};
@@ -694,12 +692,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
flock.l_type = F_UNLCK;
full_flock.l_type = F_UNLCK;
- call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes,
+ call_count = afr_locked_nodes_count (inodelk->locked_nodes,
priv->child_count);
int_lock->lk_call_count = call_count;
@@ -715,8 +715,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
fd_ctx = afr_fd_ctx_get (local->fd, this);
for (i = 0; i < priv->child_count; i++) {
- if ((int_lock->inode_locked_nodes[i] & LOCKED_YES)
- != LOCKED_YES)
+ if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
continue;
if (local->fd) {
@@ -757,7 +756,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -772,7 +771,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -858,7 +857,7 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[index],
priv->children[index]->fops->entrylk,
- this->name,
+ int_lock->domain,
&int_lock->lockee[lockee_no].loc,
int_lock->lockee[lockee_no].basename,
ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
@@ -961,6 +960,7 @@ static int
afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -971,10 +971,10 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- memcpy (int_lock->inode_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->inodelk_lock_count = int_lock->lock_count;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ inodelk->lock_count = int_lock->lock_count;
break;
case AFR_ENTRY_RENAME_TRANSACTION:
@@ -1025,6 +1025,7 @@ int
afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
struct gf_flock flock = {0,};
@@ -1039,10 +1040,15 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
priv = this->private;
child_index = cookie % priv->child_count;
lockee_no = cookie / priv->child_count;
+ is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+
+ if (!is_entrylk) {
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
+ }
if (local->fd) {
ret = fd_ctx_get (local->fd, this, &ctx);
@@ -1064,8 +1070,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
}
if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
- is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
-
if ((is_entrylk && int_lock->entrylk_lock_count == 0) ||
(!is_entrylk && int_lock->lock_count == 0)) {
gf_log (this->name, GF_LOG_INFO,
@@ -1114,7 +1118,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLKW, &flock, NULL);
} else {
@@ -1127,7 +1131,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLKW, &flock, NULL);
}
@@ -1148,7 +1152,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->fentrylk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
} else {
@@ -1161,7 +1165,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
(void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->entrylk,
- this->name,
+ int_lock->domain,
&int_lock->lockee[lockee_no].loc,
int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
@@ -1193,8 +1197,7 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_RENAME_TRANSACTION:
case AFR_ENTRY_TRANSACTION:
- up_count = afr_up_children_count (local->child_up,
- priv->child_count);
+ up_count = AFR_COUNT (local->child_up, priv->child_count);
int_lock->lk_call_count = int_lock->lk_expected_count
= (int_lock->lockee_count *
up_count);
@@ -1316,6 +1319,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
return -1;
}
@@ -1390,6 +1394,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
int call_count = 0;
int child_index = (long) cookie;
@@ -1398,6 +1403,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION,
AFR_LOCK_OP, NULL, op_ret,
@@ -1423,9 +1429,8 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->transaction.eager_lock)
local->transaction.eager_lock[child_index] = 0;
} else {
- int_lock->inode_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->inodelk_lock_count++;
+ inodelk->locked_nodes[child_index] |= LOCKED_YES;
+ inodelk->lock_count++;
if (local->transaction.eager_lock &&
local->transaction.eager_lock[child_index] &&
@@ -1448,8 +1453,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"Last inode locking reply received");
/* all locks successful. Proceed to call FOP */
- if (int_lock->inodelk_lock_count ==
- int_lock->lk_expected_count) {
+ if (inodelk->lock_count == int_lock->lk_expected_count) {
gf_log (this->name, GF_LOG_TRACE,
"All servers locked. Calling the cbk");
int_lock->lock_op_ret = 0;
@@ -1473,6 +1477,7 @@ int
afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
afr_fd_ctx_t *fd_ctx = NULL;
@@ -1488,11 +1493,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
- full_flock.l_type = int_lock->lk_flock.l_type;
+ full_flock.l_type = inodelk->flock.l_type;
initialize_inodelk_variables (frame, this);
@@ -1508,6 +1515,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
ret = -1;
goto out;
}
@@ -1566,7 +1574,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -1588,7 +1596,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -1623,518 +1631,37 @@ afr_unlock (call_frame_t *frame, xlator_t *this)
}
int
-afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
- unsigned char *locked_nodes)
-{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- priv = this->private;
-
- ret = afr_fd_ctx_set (this, fd);
- if (ret)
- goto out;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "failed to get the fd ctx");
- goto out;
- }
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- GF_ASSERT (fdctx->locked_on);
-
- memcpy (fdctx->locked_on, locked_nodes,
- priv->child_count);
-
-out:
- return ret;
-}
-
-static int
-__is_fd_saved (xlator_t *this, fd_t *fd)
-{
- afr_locked_fd_t *locked_fd = NULL;
- afr_private_t *priv = NULL;
- int found = 0;
-
- priv = this->private;
-
- list_for_each_entry (locked_fd, &priv->saved_fds, list) {
- if (locked_fd->fd == fd) {
- found = 1;
- break;
- }
- }
-
- return found;
-}
-
-static int
-__afr_save_locked_fd (xlator_t *this, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- int ret = 0;
-
- priv = this->private;
-
- locked_fd = GF_CALLOC (1, sizeof (*locked_fd),
- gf_afr_mt_locked_fd);
- if (!locked_fd) {
- ret = -1;
- goto out;
- }
-
- locked_fd->fd = fd;
- INIT_LIST_HEAD (&locked_fd->list);
-
- list_add_tail (&locked_fd->list, &priv->saved_fds);
-
-out:
- return ret;
-}
-
-int
-afr_save_locked_fd (xlator_t *this, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- int ret = 0;
-
- priv = this->private;
-
- pthread_mutex_lock (&priv->mutex);
- {
- if (__is_fd_saved (this, fd)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fd=%p already saved", fd);
- goto unlock;
- }
-
- ret = __afr_save_locked_fd (this, fd);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "fd=%p could not be saved", fd);
- goto unlock;
- }
- }
-unlock:
- pthread_mutex_unlock (&priv->mutex);
-
- return ret;
-}
-
-static int
-afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
-
- local = frame->local;
-
- locked_fd = local->locked_fd;
-
- STACK_DESTROY (frame->root);
- afr_local_cleanup (local, this);
-
- afr_save_locked_fd (this, locked_fd->fd);
-
- return 0;
-
-}
-
-static int
-afr_get_source_lock_recovery (xlator_t *this, fd_t *fd)
-{
- afr_fd_ctx_t *fdctx = NULL;
- afr_private_t *priv = NULL;
- uint64_t tmp = 0;
- int i = 0;
- int source_child = -1;
- int ret = 0;
-
- priv = this->private;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fdctx->locked_on[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Found lock recovery source=%d", i);
- source_child = i;
- break;
- }
- }
-
-out:
- return source_child;
-
-}
-
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
- dict_t *xdata);
-int32_t
-afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
- dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- local = frame->local;
- priv = this->private;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "lock recovery failed");
- goto cleanup;
- }
-
- source_child = local->source_child;
-
- memcpy (&flock, lock, sizeof (*lock));
-
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock, NULL);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-int
-afr_recover_lock (call_frame_t *frame, xlator_t *this,
- struct gf_flock *flock)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t lock_recovery_child = 0;
-
- priv = this->private;
- local = frame->local;
-
- lock_recovery_child = local->lock_recovery_child;
-
- frame->root->lk_owner = flock->l_owner;
-
- STACK_WIND_COOKIE (frame, afr_recover_lock_cbk,
- (void *) (long) lock_recovery_child,
- priv->children[lock_recovery_child],
- priv->children[lock_recovery_child]->fops->lk,
- local->fd, F_SETLK, flock, NULL);
-
- return 0;
-}
-
-static int
-is_afr_lock_eol (struct gf_flock *lock)
-{
- int ret = 0;
-
- if ((lock->l_type == GF_LK_EOL))
- ret = 1;
-
- return ret;
-}
-
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
- dict_t *xdata)
-{
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Failed to get locks on fd");
- goto cleanup;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Got a lock on fd");
-
- if (is_afr_lock_eol (lock)) {
- gf_log (this->name, GF_LOG_INFO,
- "Reached EOL on locks on fd");
- goto cleanup;
- }
-
- afr_recover_lock (frame, this, lock);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
-
- return 0;
-}
-
-static int
-afr_lock_recovery (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- int ret = 0;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- priv = this->private;
- local = frame->local;
-
- fd = local->fd;
-
- source_child = afr_get_source_lock_recovery (this, fd);
- if (source_child < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not recover locks due to lock "
- "split brain");
- ret = -1;
- goto out;
- }
-
- local->source_child = source_child;
-
- /* the flock can be zero filled as we're querying incrementally
- the locks held on the fd.
- */
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock, NULL);
-
-out:
- return ret;
-}
-
-
-static int
-afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- fdctx->opened_on[child_index] = AFR_FD_OPENED;
-
-out:
- return ret;
-}
-
-int32_t
-afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd,
- dict_t *xdata)
-{
- int32_t child_index = (long )cookie;
- int ret = 0;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Reopen during lock-recovery failed");
- goto cleanup;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Open succeeded => proceed to recover locks");
-
- ret = afr_lock_recovery (frame, this);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Lock recovery failed");
- goto cleanup;
- }
-
- ret = afr_mark_fd_opened (this, fd, child_index);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Marking fd open failed");
- goto cleanup;
- }
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-static int
-afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- uint64_t tmp = 0;
- afr_fd_ctx_t *fdctx = NULL;
- loc_t loc = {0,};
- int32_t child_index = 0;
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
-
- GF_ASSERT (local && local->fd);
-
- ret = fd_ctx_get (local->fd, this, &tmp);
- if (ret)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to get the context of fd",
- uuid_utoa (local->fd->inode->gfid));
- fdctx = (afr_fd_ctx_t *) (long) tmp;
- /* TODO: instead we should return from the function */
- GF_ASSERT (fdctx);
-
- child_index = local->lock_recovery_child;
-
- inode_path (local->fd->inode, NULL, (char **)&loc.path);
- loc.name = strrchr (loc.path, '/');
- loc.inode = inode_ref (local->fd->inode);
- loc.parent = inode_parent (local->fd->inode, 0, NULL);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk,
- (void *)(long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->open,
- &loc, fdctx->flags, local->fd, NULL);
-
- return 0;
-}
-
-static int
-is_fd_opened (fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, THIS, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- if (fdctx->opened_on[child_index] == AFR_FD_OPENED)
- ret = 1;
-
-out:
- return ret;
-}
-
-int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index)
-{
- call_frame_t *frame = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- int ret = -1;
- struct list_head locks_list = {0,};
- int32_t op_errno = 0;
-
-
- priv = this->private;
-
- if (list_empty (&priv->saved_fds))
- goto out;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- ret = -1;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0) {
- ret = -1;
- goto out;
- }
-
- frame->local = local;
-
- INIT_LIST_HEAD (&locks_list);
-
- pthread_mutex_lock (&priv->mutex);
- {
- list_splice_init (&priv->saved_fds, &locks_list);
- }
- pthread_mutex_unlock (&priv->mutex);
-
- list_for_each_entry_safe (locked_fd, tmp,
- &locks_list, list) {
-
- list_del_init (&locked_fd->list);
-
- local->fd = fd_ref (locked_fd->fd);
- local->lock_recovery_child = child_index;
- local->locked_fd = locked_fd;
-
- if (!is_fd_opened (locked_fd->fd, child_index)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting open before lock "
- "recovery");
- afr_lock_recovery_preopen (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting lock recovery "
- "without a preopen");
- afr_lock_recovery (frame, this);
- }
- }
-
-out:
- if ((ret < 0) && frame)
- AFR_STACK_DESTROY (frame);
- return ret;
-}
-
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count)
{
- afr_local_t *dst_local = NULL;
- afr_local_t *src_local = NULL;
- afr_internal_lock_t *dst_lock = NULL;
- afr_internal_lock_t *src_lock = NULL;
+ afr_local_t *dst_local = NULL;
+ afr_local_t *src_local = NULL;
+ afr_internal_lock_t *dst_lock = NULL;
+ afr_internal_lock_t *src_lock = NULL;
+ afr_inodelk_t *dst_inodelk = NULL;
+ afr_inodelk_t *src_inodelk = NULL;
+ int ret = -1;
- dst_local = dst->local;
- dst_lock = &dst_local->internal_lock;
src_local = src->local;
src_lock = &src_local->internal_lock;
- if (src_lock->inode_locked_nodes) {
- memcpy (dst_lock->inode_locked_nodes,
- src_lock->inode_locked_nodes,
- sizeof (*dst_lock->inode_locked_nodes) * child_count);
- memset (src_lock->inode_locked_nodes, 0,
- sizeof (*src_lock->inode_locked_nodes) * child_count);
+ src_inodelk = afr_get_inodelk (src_lock, dom);
+ dst_local = dst->local;
+ dst_lock = &dst_local->internal_lock;
+ dst_inodelk = afr_get_inodelk (dst_lock, dom);
+ if (!dst_inodelk || !src_inodelk)
+ goto out;
+ if (src_inodelk->locked_nodes) {
+ memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes,
+ sizeof (*dst_inodelk->locked_nodes) * child_count);
+ memset (src_inodelk->locked_nodes, 0,
+ sizeof (*src_inodelk->locked_nodes) * child_count);
}
dst_lock->transaction_lk_type = src_lock->transaction_lk_type;
dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type;
- dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count;
- src_lock->inodelk_lock_count = 0;
+ dst_inodelk->lock_count = src_inodelk->lock_count;
+ src_inodelk->lock_count = 0;
+ ret = 0;
+out:
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index e01ab366f..05df90cc0 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -42,6 +42,7 @@ enum gf_afr_mem_types_ {
gf_afr_mt_time_t,
gf_afr_mt_pos_data_t,
gf_afr_mt_reply_t,
+ gf_afr_mt_subvol_healer_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 643a5d692..f86aa7fd8 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -43,85 +43,29 @@
#include "afr-dir-read.h"
#include "afr-dir-write.h"
#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-int
-afr_stale_child_up (afr_local_t *local, xlator_t *this)
-{
- int i = 0;
- afr_private_t *priv = NULL;
- int up = -1;
-
- priv = this->private;
-
- if (!local->fresh_children)
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children)
- goto out;
-
- afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children);
- if (priv->child_count == afr_get_children_count (local->fresh_children,
- priv->child_count))
- goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
- if (afr_is_child_present (local->fresh_children,
- priv->child_count, i))
- continue;
- up = i;
- break;
- }
-out:
- return up;
-}
-
-void
-afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_fd_fixable (fd_t *fd)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- inode_t *inode = NULL;
- int st_child = -1;
- char reason[64] = {0};
-
- local = frame->local;
- sh = &local->self_heal;
- inode = local->fd->inode;
-
- if (!IA_ISREG (inode->ia_type))
- goto out;
-
- st_child = afr_stale_child_up (local, this);
- if (st_child < 0)
- goto out;
-
- sh->do_data_self_heal = _gf_true;
- sh->do_metadata_self_heal = _gf_true;
- sh->do_gfid_self_heal = _gf_true;
- sh->do_missing_entry_self_heal = _gf_true;
-
- snprintf (reason, sizeof (reason), "stale subvolume %d detected",
- st_child);
- afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type,
- reason, NULL, NULL);
-out:
- return;
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous (fd))
+ return _gf_false;
+ else if (uuid_is_null (fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
}
+
int
afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = frame->local;
- afr_private_t *priv = NULL;
- priv = this->private;
- if (afr_open_only_data_self_heal (priv->data_self_heal))
- afr_perform_data_self_heal (frame, this);
AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
local->fd, xdata);
return 0;
@@ -134,49 +78,38 @@ afr_open_cbk (call_frame_t *frame, void *cookie,
fd_t *fd, dict_t *xdata)
{
afr_local_t * local = NULL;
- int ret = 0;
int call_count = -1;
int child_index = (long) cookie;
- afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
- priv = this->private;
local = frame->local;
+ fd_ctx = local->fd_ctx;
LOCK (&frame->lock);
{
if (op_ret == -1) {
local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
local->op_ret = op_ret;
- local->success_count++;
-
- ret = afr_child_fd_ctx_set (this, fd, child_index,
- local->cont.open.flags);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
}
}
-unlock:
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if ((local->cont.open.flags & O_TRUNC)
- && (local->op_ret >= 0)) {
+ if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) {
STACK_WIND (frame, afr_open_ftruncate_cbk,
this, this->fops->ftruncate,
fd, 0, NULL);
} else {
- if (afr_open_only_data_self_heal (priv->data_self_heal))
- afr_perform_data_self_heal (frame, this);
AFR_STACK_UNWIND (open, frame, local->op_ret,
- local->op_errno, local->fd, xdata);
+ local->op_errno, local->fd,
+ local->xdata_rsp);
}
}
@@ -190,16 +123,11 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
int i = 0;
- int ret = -1;
int32_t call_count = 0;
int32_t op_errno = 0;
- int32_t wind_flags = flags & (~O_TRUNC);
- //We can't let truncation to happen outside transaction.
+ afr_fd_ctx_t *fd_ctx = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ //We can't let truncation happen outside a transaction.
priv = this->private;
@@ -207,44 +135,38 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
QUORUM_CHECK(open,out);
}
- if (afr_is_split_brain (this, loc->inode)) {
- /* self-heal failed */
- gf_log (this->name, GF_LOG_WARNING,
- "failed to open as split brain seen, returning EIO");
- op_errno = EIO;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- call_count = local->call_count;
- loc_copy (&local->loc, loc);
+ local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
+ fd_ctx->flags = flags;
- local->cont.open.flags = flags;
+ call_count = local->call_count;
- local->fd = fd_ref (fd);
+ local->cont.open.flags = flags;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
priv->children[i],
priv->children[i]->fops->open,
- loc, wind_flags, fd, xdata);
-
+ loc, (flags & ~O_TRUNC), fd, xdata);
if (!--call_count)
break;
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata);
+ AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL);
return 0;
}
@@ -273,12 +195,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv->children[child_index]->name);
}
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (!fd_ctx) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context, %p", local->fd);
- goto out;
- }
+ fd_ctx = local->fd_ctx;
LOCK (&local->fd->lock);
{
@@ -289,7 +206,7 @@ afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
}
UNLOCK (&local->fd->lock);
-out:
+
call_count = afr_frame_return (frame);
if (call_count == 0)
AFR_STACK_DESTROY (frame);
@@ -297,8 +214,42 @@ out:
return 0;
}
+
+/*
+ * Decide, per child, whether @fd still has to be opened there.
+ *
+ * Under fd->lock, every child whose opened_on[] state is AFR_FD_NOT_OPENED
+ * and whose priv->child_up[] entry is set is switched to AFR_FD_OPENING and
+ * flagged with 1 in @need_open; all other slots of @need_open are set to 0.
+ *
+ * @need_open must have room for priv->child_count entries.
+ *
+ * Returns the number of children flagged, or 0 (leaving @need_open
+ * untouched) when no fd context could be obtained.
+ */
+static int
+afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open)
+{
+        afr_private_t *conf    = this->private;
+        afr_fd_ctx_t  *ctx     = afr_fd_ctx_get (fd, this);
+        int            pending = 0;
+        int            child   = 0;
+        int            claim   = 0;
+
+        if (ctx == NULL)
+                return 0;
+
+        LOCK (&fd->lock);
+        {
+                for (child = 0; child < conf->child_count; child++) {
+                        claim = (ctx->opened_on[child] == AFR_FD_NOT_OPENED) &&
+                                conf->child_up[child];
+
+                        need_open[child] = claim ? 1 : 0;
+                        if (!claim)
+                                continue;
+
+                        /* claim the child so the state is no longer
+                           AFR_FD_NOT_OPENED on the next scan */
+                        ctx->opened_on[child] = AFR_FD_OPENING;
+                        pending++;
+                }
+        }
+        UNLOCK (&fd->lock);
+
+        return pending;
+}
+
+
void
-afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)
+afr_fix_open (fd_t *fd, xlator_t *this)
{
afr_private_t *priv = NULL;
int i = 0;
@@ -307,29 +258,31 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)
int ret = -1;
int32_t op_errno = 0;
afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *need_open = NULL;
+ int call_count = 0;
priv = this->private;
- if (!afr_is_fd_fixable (fd) || !need_open || !need_open_count)
+ if (!afr_is_fd_fixable (fd))
goto out;
fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx) {
- ret = -1;
+ if (!fd_ctx)
goto out;
- }
+
+ need_open = alloca0 (priv->child_count);
+
+ call_count = afr_fd_ctx_need_open (fd, this, need_open);
+ if (!call_count)
+ goto out;
frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- ret = -1;
+ if (!frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->loc.inode = inode_ref (fd->inode);
ret = loc_path (&local->loc, NULL);
@@ -337,10 +290,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)
goto out;
local->fd = fd_ref (fd);
- local->call_count = need_open_count;
+ local->fd_ctx = fd_ctx;
+
+ local->call_count = call_count;
- gf_log (this->name, GF_LOG_DEBUG, "need open count: %zd",
- need_open_count);
+ gf_log (this->name, GF_LOG_DEBUG, "need open count: %d",
+ call_count);
for (i = 0; i < priv->child_count; i++) {
if (!need_open[i])
@@ -371,12 +326,12 @@ afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open)
local->fd, NULL);
}
+ if (!--call_count)
+ break;
}
- op_errno = 0;
- ret = 0;
+
+ return;
out:
- if (op_errno)
- ret = -1; //For handling ALLOC_OR_GOTO
- if (ret && frame)
+ if (frame)
AFR_STACK_DESTROY (frame);
}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
new file mode 100644
index 000000000..186f68c33
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -0,0 +1,239 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+/*
+ * Retry the read on the next candidate subvolume.
+ *
+ * Scans the children in index order. Children that are not readable[]
+ * are simply marked as attempted; the first readable child that has not
+ * been attempted yet is marked as attempted and handed to local->readfn.
+ * When no such child exists, local->readfn is invoked with -1 so that it
+ * can tell the candidates are exhausted.
+ *
+ * Always returns 0.
+ */
+int
+afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local  = frame->local;
+        afr_private_t *priv   = this->private;
+        int            chosen = -1;
+        int            idx    = 0;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!local->readable[idx]) {
+                        /* not worth trying: record it as attempted
+                           and keep looking */
+                        local->read_attempted[idx] = 1;
+                        continue;
+                }
+
+                if (local->read_attempted[idx])
+                        continue;
+
+                /* first readable child we have not tried yet */
+                local->read_attempted[idx] = 1;
+                chosen = idx;
+                break;
+        }
+
+        /* chosen stays -1 when every readable child was already tried */
+        local->readfn (frame, this, chosen);
+
+        return 0;
+}
+
+
+/*
+ * Completion callback handed to afr_inode_refresh() by the read
+ * transaction.
+ *
+ * @err: non-zero when the refresh failed; its negation is stored in
+ *       local->op_errno.
+ *
+ * On any failure (refresh error, no usable read-subvolume information,
+ * or the policy selecting no subvolume) local->op_ret is set to -1,
+ * local->op_errno is filled in and local->readfn is called with -1.
+ * Otherwise the selected subvolume is tried, unless it was already
+ * attempted, in which case afr_read_txn_next_subvol() takes over.
+ *
+ * Always returns 0.
+ */
+int
+afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
+{
+        afr_local_t *local      = frame->local;
+        inode_t     *inode      = local->inode;
+        int          generation = 0;
+        int          subvol     = -1;
+        int          rc         = -1;
+
+        if (err) {
+                local->op_errno = -err;
+                local->op_ret = -1;
+                goto wind;
+        }
+
+        rc = afr_inode_read_subvol_type_get (inode, this, local->readable,
+                                             &generation,
+                                             local->transaction.type);
+        if (rc == -1 || generation == 0) {
+                /* the refresh did not yield a good read subvolume:
+                   give up */
+                local->op_ret = -1;
+                local->op_errno = EIO;
+                goto wind;
+        }
+
+        subvol = afr_read_subvol_select_by_policy (inode, this,
+                                                   local->readable);
+        if (subvol == -1) {
+                local->op_ret = -1;
+                local->op_errno = EIO;
+                goto wind;
+        }
+
+        if (local->read_attempted[subvol]) {
+                /* already tried this one; move on to the others */
+                afr_read_txn_next_subvol (frame, this);
+                return 0;
+        }
+
+        local->read_attempted[subvol] = 1;
+wind:
+        local->readfn (frame, this, subvol);
+
+        return 0;
+}
+
+
+/*
+ * Continue a read transaction.
+ *
+ * The first call on a frame (local->refreshed not yet set) marks the
+ * frame as refreshed and starts afr_inode_refresh(), which completes in
+ * afr_read_txn_refresh_done(). Every later call goes straight to
+ * afr_read_txn_next_subvol().
+ *
+ * @subvol is not used by this function.
+ *
+ * Always returns 0.
+ */
+int
+afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
+{
+        afr_local_t *local = frame->local;
+
+        if (local->refreshed) {
+                afr_read_txn_next_subvol (frame, this);
+                return 0;
+        }
+
+        local->refreshed = _gf_true;
+        afr_inode_refresh (frame, this, local->inode,
+                           afr_read_txn_refresh_done);
+
+        return 0;
+}
+
+
+/* afr_read_txn_wipe:
+
+ clean internal variables in @local in order to make
+ it possible to call afr_read_txn() multiple times from
+ the same frame
+*/
+
+/*
+ * Reset the read-transaction state kept in frame->local: clears
+ * local->readfn, drops the reference held in local->inode (if any) and
+ * zeroes read_attempted[] and readable[] for every child.
+ */
+void
+afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
+{
+        afr_local_t   *local = NULL;
+        afr_private_t *priv  = NULL;
+        int            i     = 0;
+
+        local = frame->local;
+        priv = this->private;
+
+        local->readfn = NULL;
+
+        if (local->inode) {
+                inode_unref (local->inode);
+                /* do not keep a pointer to a reference we no longer own;
+                   otherwise a second wipe, or any later cleanup that
+                   unrefs local->inode, would drop it twice */
+                local->inode = NULL;
+        }
+
+        for (i = 0; i < priv->child_count; i++) {
+                local->read_attempted[i] = 0;
+                local->readable[i] = 0;
+        }
+}
+
+
+/*
+ afr_read_txn:
+
+ This is the read transaction function. The way it works:
+
+ - Determine read-subvolume from inode ctx.
+
+ - If read-subvolume's generation was stale, refresh ctx once by
+ calling afr_inode_refresh()
+
+ Else make an attempt to read on read-subvolume.
+
+ - If attempted read on read-subvolume fails, refresh ctx once
+ by calling afr_inode_refresh()
+
+ - After ctx refresh, query read-subvolume freshly and attempt
+ read once.
+
+ - If read fails, try every other readable[] subvolume before
+ finally giving up. readable[] elements are set by afr_inode_refresh()
+ based on dirty and pending flags.
+
+ - If file is in split brain in the backend, generation will be
+ kept 0 by afr_inode_refresh() and readable[] will be set 0 for
+ all elements. Therefore reads always fail.
+*/
+
+/*
+ * Start a read transaction on @inode.
+ *
+ * @frame:  frame whose local carries the transaction state
+ * @inode:  inode to read from; a reference is stored in local->inode
+ * @readfn: invoked as readfn (frame, this, subvol) with the subvolume to
+ *          read from
+ * @type:   stored in local->transaction.type and passed to
+ *          afr_inode_read_subvol_type_get()
+ *
+ * If the cached read-subvolume information is missing, belongs to a
+ * different event generation, or selects an out-of-range or down
+ * subvolume, the inode is refreshed first and the transaction continues
+ * in afr_read_txn_refresh_done().
+ *
+ * Always returns 0.
+ */
+int
+afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
+              afr_read_txn_wind_t readfn, afr_transaction_type type)
+{
+        afr_local_t   *local            = NULL;
+        afr_private_t *priv             = NULL;
+        inode_t       *held             = NULL;
+        int            read_subvol      = -1;
+        int            event_generation = 0;
+        int            ret              = -1;
+
+        priv = this->private;
+        local = frame->local;
+
+        /* Take the new reference before wiping: when this is called again
+           from the same frame, @inode may be the very inode local already
+           holds, and the wipe could otherwise drop its last reference. */
+        held = inode_ref (inode);
+
+        afr_read_txn_wipe (frame, this);
+
+        local->readfn = readfn;
+        local->inode = held;
+
+        local->transaction.type = type;
+        ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
+                                              &event_generation, type);
+        if (ret == -1)
+                /* very first transaction on this inode */
+                goto refresh;
+
+        if (local->event_generation != event_generation)
+                /* servers have disconnected / reconnected, and possibly
+                   rebooted, very likely changing the state of freshness
+                   of copies */
+                goto refresh;
+
+        read_subvol = afr_read_subvol_select_by_policy (inode, this,
+                                                        local->readable);
+
+        /* valid indices are 0 .. child_count - 1; child_count itself must
+           be rejected too, since it is used as an array index below */
+        if (read_subvol < 0 || read_subvol >= priv->child_count) {
+                gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d "
+                        "found with event generation %d", read_subvol,
+                        event_generation);
+                goto refresh;
+        }
+
+        if (!local->child_up[read_subvol]) {
+                /* should never happen, just in case */
+                gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the "
+                        "read subvolume in this generation, but is not up",
+                        read_subvol);
+                goto refresh;
+        }
+
+        local->read_attempted[read_subvol] = 1;
+
+        local->readfn (frame, this, read_subvol);
+
+        return 0;
+
+refresh:
+        afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done);
+
+        return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
deleted file mode 100644
index 1721fd270..000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ /dev/null
@@ -1,827 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-
-#include <openssl/md5.h>
-#include "glusterfs.h"
-#include "afr.h"
-#include "xlator.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-/*
- This file contains the various self-heal algorithms
-*/
-
-static int
-sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
- gf_boolean_t is_first_call, call_frame_t *old_loop_frame);
-static int
-sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame,
- int32_t op_ret, int32_t op_errno);
-static int
-sh_destroy_frame (call_frame_t *frame, xlator_t *this)
-{
- if (!frame)
- goto out;
-
- AFR_STACK_DESTROY (frame);
-out:
- return 0;
-}
-
-static void
-sh_private_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
- GF_FREE (sh_priv);
-}
-
-static int
-sh_number_of_writes_needed (unsigned char *write_needed, int child_count)
-{
- int writes = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (write_needed[i])
- writes++;
- }
-
- return writes;
-}
-
-
-static int
-sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this,
- call_frame_t *last_loop_frame)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
- int32_t total_blocks = 0;
- int32_t diff_blocks = 0;
-
- local = sh_frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
- if (sh_priv) {
- total_blocks = sh_priv->total_blocks;
- diff_blocks = sh_priv->diff_blocks;
- }
-
- sh_private_cleanup (sh_frame, this);
- if (sh->op_failed) {
- GF_ASSERT (!last_loop_frame);
- //loop_finish should have happened and the old_loop should be NULL
- gf_log (this->name, GF_LOG_DEBUG,
- "self-heal aborting on %s",
- local->loc.path);
-
- local->self_heal.algo_abort_cbk (sh_frame, this);
- } else {
- GF_ASSERT (last_loop_frame);
- if (diff_blocks == total_blocks) {
- gf_log (this->name, GF_LOG_DEBUG, "full self-heal "
- "completed on %s",local->loc.path);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "diff self-heal on %s: completed. "
- "(%d blocks of %d were different (%.2f%%))",
- local->loc.path, diff_blocks, total_blocks,
- ((diff_blocks * 1.0)/total_blocks) * 100);
- }
-
- sh->old_loop_frame = last_loop_frame;
- local->self_heal.algo_completion_cbk (sh_frame, this);
- }
-
- return 0;
-}
-
-int
-sh_loop_finish (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- if (!loop_frame)
- goto out;
-
- loop_local = loop_frame->local;
- if (loop_local) {
- loop_sh = &loop_local->self_heal;
- }
-
- if (loop_sh && loop_sh->data_lock_held) {
- afr_sh_data_unlock (loop_frame, this,
- sh_destroy_frame);
- } else {
- sh_destroy_frame (loop_frame, this);
- }
-out:
- return 0;
-}
-
-static int
-sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_loop_finish (loop_sh->old_loop_frame, this);
- loop_sh->old_loop_frame = NULL;
-
- gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64
- " %"PRIu64, loop_sh->offset, loop_sh->block_size);
- loop_sh->data_lock_held = _gf_true;
- loop_sh->sh_data_algo_start (loop_frame, this);
- return 0;
-}
-
-static int
-sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this)
-{
- call_frame_t *sh_frame = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
- sh_frame = loop_sh->sh_frame;
-
- gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64
- " %"PRIu64, loop_sh->offset, loop_sh->block_size);
- sh_loop_finish (loop_sh->old_loop_frame, this);
- loop_sh->old_loop_frame = NULL;
- sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN);
- return 0;
-}
-
-static int
-sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this,
- call_frame_t *old_loop_frame, call_frame_t **loop_frame)
-{
- call_frame_t *new_loop_frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *new_loop_local = NULL;
- afr_self_heal_t *new_loop_sh = NULL;
- afr_private_t *priv = NULL;
-
- GF_ASSERT (sh_frame);
- GF_ASSERT (loop_frame);
-
- *loop_frame = NULL;
- local = sh_frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- new_loop_frame = copy_frame (sh_frame);
- if (!new_loop_frame)
- goto out;
- //We want the frame to have same lk_owner as sh_frame
- //so that locks translator allows conflicting locks
- new_loop_local = afr_local_copy (local, this);
- if (!new_loop_local)
- goto out;
- new_loop_frame->local = new_loop_local;
-
- new_loop_sh = &new_loop_local->self_heal;
- new_loop_sh->sources = memdup (sh->sources,
- priv->child_count * sizeof (*sh->sources));
- if (!new_loop_sh->sources)
- goto out;
- new_loop_sh->write_needed = GF_CALLOC (priv->child_count,
- sizeof (*new_loop_sh->write_needed),
- gf_afr_mt_char);
- if (!new_loop_sh->write_needed)
- goto out;
- new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH,
- gf_afr_mt_uint8_t);
- if (!new_loop_sh->checksum)
- goto out;
- new_loop_sh->inode = inode_ref (sh->inode);
- new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start;
- new_loop_sh->source = sh->source;
- new_loop_sh->active_sinks = sh->active_sinks;
- new_loop_sh->healing_fd = fd_ref (sh->healing_fd);
- new_loop_sh->file_has_holes = sh->file_has_holes;
- new_loop_sh->old_loop_frame = old_loop_frame;
- new_loop_sh->sh_frame = sh_frame;
- *loop_frame = new_loop_frame;
- return 0;
-out:
- sh_destroy_frame (new_loop_frame, this);
- return -ENOMEM;
-}
-
-static int
-sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset,
- call_frame_t *old_loop_frame)
-{
- call_frame_t *new_loop_frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *new_loop_local = NULL;
- afr_self_heal_t *new_loop_sh = NULL;
- int ret = 0;
-
- GF_ASSERT (sh_frame);
-
- local = sh_frame->local;
- sh = &local->self_heal;
-
- ret = sh_loop_frame_create (sh_frame, this, old_loop_frame,
- &new_loop_frame);
- if (ret)
- goto out;
- new_loop_local = new_loop_frame->local;
- new_loop_sh = &new_loop_local->self_heal;
- new_loop_sh->offset = offset;
- new_loop_sh->block_size = sh->block_size;
- afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size,
- _gf_true, sh_loop_lock_success, sh_loop_lock_failure);
- return 0;
-out:
- sh->op_failed = 1;
- if (old_loop_frame)
- sh_loop_finish (old_loop_frame, this);
- sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM);
- return 0;
-}
-
-static int
-sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
- gf_boolean_t is_first_call, call_frame_t *old_loop_frame)
-{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
- gf_boolean_t is_driver_done = _gf_false;
- blksize_t block_size = 0;
- int loop = 0;
- off_t offset = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = sh_frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
-
- LOCK (&sh_priv->lock);
- {
- if (!is_first_call)
- sh_priv->loops_running--;
- offset = sh_priv->offset;
- block_size = sh->block_size;
- while ((!sh->eof_reached) && (0 == sh->op_failed) &&
- (sh_priv->loops_running < priv->data_self_heal_window_size)
- && (sh_priv->offset < sh->file_size)) {
-
- loop++;
- sh_priv->offset += block_size;
- sh_priv->loops_running++;
-
- if (!is_first_call)
- break;
- }
- if (0 == sh_priv->loops_running) {
- is_driver_done = _gf_true;
- }
- }
- UNLOCK (&sh_priv->lock);
-
- if (0 == loop) {
- //loop finish does unlock, but the erasing of the pending
- //xattrs needs to happen before that so do not finish the loop
- if (is_driver_done && !sh->op_failed)
- goto driver_done;
- if (old_loop_frame) {
- sh_loop_finish (old_loop_frame, this);
- old_loop_frame = NULL;
- }
- }
-
- //If we have more loops to form we should finish previous loop after
- //the next loop lock
- while (loop--) {
- if (sh->op_failed) {
- // op failed in other loop, stop spawning more loops
- if (old_loop_frame) {
- sh_loop_finish (old_loop_frame, this);
- old_loop_frame = NULL;
- }
- sh_loop_driver (sh_frame, this, _gf_false, NULL);
- } else {
- gf_log (this->name, GF_LOG_TRACE, "spawning a loop "
- "for offset %"PRId64, offset);
-
- sh_loop_start (sh_frame, this, offset, old_loop_frame);
- old_loop_frame = NULL;
- offset += block_size;
- }
- }
-
-driver_done:
- if (is_driver_done) {
- sh_loop_driver_done (sh_frame, this, old_loop_frame);
- }
- return 0;
-}
-
-static int
-sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * loop_local = NULL;
- afr_self_heal_t * loop_sh = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- if (loop_frame) {
- loop_local = loop_frame->local;
- if (loop_local)
- loop_sh = &loop_local->self_heal;
- if (loop_sh)
- gf_log (this->name, GF_LOG_TRACE, "loop for offset "
- "%"PRId64" returned", loop_sh->offset);
- }
-
- if (op_ret == -1) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- if (loop_frame) {
- sh_loop_finish (loop_frame, this);
- loop_frame = NULL;
- }
- }
-
- sh_loop_driver (sh_frame, this, _gf_false, loop_frame);
-
- return 0;
-}
-
-static int
-sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *postbuf, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * loop_local = NULL;
- afr_self_heal_t * loop_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = 0;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_frame = loop_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- child_index = (long) cookie;
-
- gf_log (this->name, GF_LOG_TRACE,
- "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
- op_ret, sh_local->loc.path, child_index, loop_sh->offset);
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "write to %s failed on subvolume %s (%s)",
- sh_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- afr_sh_set_error (loop_sh, op_errno);
- } else if (op_ret < loop_local->cont.writev.vector->iov_len) {
- gf_log(this->name, GF_LOG_ERROR,
- "incomplete write to %s on subvolume %s "
- "(expected %lu, returned %d)", sh_local->loc.path,
- priv->children[child_index]->name,
- loop_local->cont.writev.vector->iov_len, op_ret);
- sh->op_failed = 1;
- }
-
- call_count = afr_frame_return (loop_frame);
-
- if (call_count == 0) {
- iobref_unref(loop_local->cont.writev.iobref);
-
- sh_loop_return (sh_frame, this, loop_frame,
- loop_sh->op_ret, loop_sh->op_errno);
- }
-
- return 0;
-}
-
-static void
-sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame,
- afr_private_t *priv)
-{
- afr_local_t *sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- int i = 0;
-
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- if (!strcmp (sh->algo->name, "diff"))
- return;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- /* full self-heal guarantees there exists atleast 1 file with size 0
- * That means for other files we can preserve holes that come after
- * its size before 'trim'
- */
- for (i = 0; i < priv->child_count; i++) {
- if (loop_sh->write_needed[i] &&
- ((loop_sh->offset + 1) > sh->buf[i].ia_size))
- loop_sh->write_needed[i] = 0;
- }
-}
-
-static int
-sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * loop_local = NULL;
- afr_self_heal_t * loop_sh = NULL;
- call_frame_t *sh_frame = NULL;
- int i = 0;
- int call_count = 0;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t * sh = NULL;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_frame = loop_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- gf_log (this->name, GF_LOG_TRACE,
- "read %d bytes of data from %s, offset %"PRId64"",
- op_ret, loop_local->loc.path, loop_sh->offset);
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- sh->op_failed = 1;
- gf_log (this->name, GF_LOG_ERROR, "read failed on %d "
- "for %s reason :%s", sh->source,
- sh_local->loc.path, strerror (errno));
- } else {
- sh->eof_reached = _gf_true;
- gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s",
- sh_local->loc.path);
- }
- sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno);
- goto out;
- }
-
- if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0)
- sh_prune_writes_needed (sh_frame, loop_frame, priv);
-
- call_count = sh_number_of_writes_needed (loop_sh->write_needed,
- priv->child_count);
- if (call_count == 0) {
- sh_loop_return (sh_frame, this, loop_frame, 0, 0);
- goto out;
- }
-
- loop_local->call_count = call_count;
-
- /*
- * We only really need the request size at the moment, but the buffer
- * is required if we want to issue a retry in the event of a short write.
- * Therefore, we duplicate the vector and ref the iobref here...
- */
- loop_local->cont.writev.vector = iov_dup(vector, count);
- loop_local->cont.writev.iobref = iobref_ref(iobref);
-
- for (i = 0; i < priv->child_count; i++) {
- if (!loop_sh->write_needed[i])
- continue;
- STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- loop_sh->healing_fd, vector, count,
- loop_sh->offset, 0, iobref, NULL);
-
- if (!--call_count)
- break;
- }
-
-out:
- return 0;
-}
-
-
-static int
-sh_loop_read (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk,
- (void *) (long) loop_sh->source,
- priv->children[loop_sh->source],
- priv->children[loop_sh->source]->fops->readv,
- loop_sh->healing_fd, loop_sh->block_size,
- loop_sh->offset, 0, NULL);
-
- return 0;
-}
-
-
-static int
-sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- uint32_t weak_checksum, uint8_t *strong_checksum,
- dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t *sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
- int child_index = 0;
- int call_count = 0;
- int i = 0;
- int write_needed = 0;
-
- priv = this->private;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_frame = loop_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- sh_priv = sh->private;
-
- child_index = (long) cookie;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "checksum on %s failed on subvolume %s (%s)",
- sh_local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH,
- strong_checksum, MD5_DIGEST_LENGTH);
- }
-
- call_count = afr_frame_return (loop_frame);
-
- if (call_count == 0) {
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !sh_local->child_up[i])
- continue;
-
- if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH),
- loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH),
- MD5_DIGEST_LENGTH)) {
- /*
- Checksums differ, so this block
- must be written to this sink
- */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "checksum on subvolume %s at offset %"
- PRId64" differs from that on source",
- priv->children[i]->name, loop_sh->offset);
-
- write_needed = loop_sh->write_needed[i] = 1;
- }
- }
-
- LOCK (&sh_priv->lock);
- {
- sh_priv->total_blocks++;
- if (write_needed)
- sh_priv->diff_blocks++;
- }
- UNLOCK (&sh_priv->lock);
-
- if (write_needed && !sh->op_failed) {
- sh_loop_read (loop_frame, this);
- } else {
- sh_loop_return (sh_frame, this, loop_frame,
- op_ret, op_errno);
- }
- }
-
- return 0;
-}
-
-static int
-sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- call_count = loop_sh->active_sinks + 1; /* sinks and source */
-
- loop_local->call_count = call_count;
-
- STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk,
- (void *) (long) loop_sh->source,
- priv->children[loop_sh->source],
- priv->children[loop_sh->source]->fops->rchecksum,
- loop_sh->healing_fd,
- loop_sh->offset, loop_sh->block_size, NULL);
-
- for (i = 0; i < priv->child_count; i++) {
- if (loop_sh->sources[i] || !loop_local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rchecksum,
- loop_sh->healing_fd,
- loop_sh->offset, loop_sh->block_size, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-static int
-sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- int i = 0;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- for (i = 0; i < priv->child_count; i++) {
- if (loop_sh->sources[i] || !loop_local->child_up[i])
- continue;
- loop_sh->write_needed[i] = 1;
- }
- sh_loop_read (loop_frame, this);
- return 0;
-}
-
-afr_sh_algo_private_t*
-afr_sh_priv_init ()
-{
- afr_sh_algo_private_t *sh_priv = NULL;
-
- sh_priv = GF_CALLOC (1, sizeof (*sh_priv),
- gf_afr_mt_afr_private_t);
- if (!sh_priv)
- goto out;
-
- LOCK_INIT (&sh_priv->lock);
-out:
- return sh_priv;
-}
-
-void
-afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src,
- unsigned int child_count)
-{
- afr_local_t *dst_local = NULL;
- afr_self_heal_t *dst_sh = NULL;
- afr_local_t *src_local = NULL;
- afr_self_heal_t *src_sh = NULL;
-
- dst_local = dst->local;
- dst_sh = &dst_local->self_heal;
- src_local = src->local;
- src_sh = &src_local->self_heal;
- GF_ASSERT (src_sh->data_lock_held);
- GF_ASSERT (!dst_sh->data_lock_held);
- afr_lk_transfer_datalock (dst, src, child_count);
- src_sh->data_lock_held = _gf_false;
- dst_sh->data_lock_held = _gf_true;
-}
-
-int
-afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this,
- afr_sh_algo_fn sh_data_algo_start)
-{
- call_frame_t *first_loop_frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = 0;
- afr_private_t *priv = NULL;
-
- local = sh_frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- sh->sh_data_algo_start = sh_data_algo_start;
- local->call_count = 0;
- ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame);
- if (ret)
- goto out;
- afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count);
- sh->private = afr_sh_priv_init ();
- if (!sh->private) {
- ret = -1;
- goto out;
- }
- sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame);
- ret = 0;
-out:
- if (ret) {
- sh->op_failed = 1;
- sh_loop_driver_done (sh_frame, this, NULL);
- }
- return 0;
-}
-
-int
-afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this)
-{
- afr_sh_start_loops (sh_frame, this, sh_diff_checksum);
- return 0;
-}
-
-int
-afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this)
-{
- afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks);
- return 0;
-}
-
-struct afr_sh_algorithm afr_self_heal_algorithms[] = {
- {.name = "full", .fn = afr_sh_algo_full},
- {.name = "diff", .fn = afr_sh_algo_diff},
- {0, 0},
-};
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h
deleted file mode 100644
index 6b20789b1..000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __AFR_SELF_HEAL_ALGORITHM_H__
-#define __AFR_SELF_HEAL_ALGORITHM_H__
-
-typedef int (*afr_sh_algo_fn) (call_frame_t *frame,
- xlator_t *this);
-
-struct afr_sh_algorithm {
- const char *name;
- afr_sh_algo_fn fn;
-};
-
-extern struct afr_sh_algorithm afr_self_heal_algorithms[3];
-typedef struct {
- gf_lock_t lock;
- unsigned int loops_running;
- off_t offset;
-
- int32_t total_blocks;
- int32_t diff_blocks;
-} afr_sh_algo_private_t;
-
-#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 2538f4c8b..4dac83113 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,2512 +8,1002 @@
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "byte-order.h"
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
#include "afr.h"
-#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heal.h"
-#include "pump.h"
+#include "byte-order.h"
-void
-afr_sh_reset (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- memset (sh->child_errno, 0,
- sizeof (*sh->child_errno) * priv->child_count);
- memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count);
- memset (sh->parentbufs, 0,
- sizeof (*sh->parentbufs) * priv->child_count);
- memset (sh->success, 0, sizeof (*sh->success) * priv->child_count);
- memset (sh->locked_nodes, 0,
- sizeof (*sh->locked_nodes) * priv->child_count);
- sh->active_sinks = 0;
-
- afr_reset_xattr (sh->xattr, priv->child_count);
-}
-//Intersection[child]=1 if child is part of intersection
-void
-afr_children_intersection_get (int32_t *set1, int32_t *set2,
- int *intersection, unsigned int child_count)
+/*
+ * Reply callback for the xattrop wound by afr_selfheal_post_op().
+ * The reply (op_ret, op_errno, xattr, xdata) is not examined; the callback
+ * only wakes local->barrier, on which afr_selfheal_post_op() is waiting.
+ * Always returns 0.
+ */
+int
+afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- int i = 0;
-
- memset (intersection, 0, sizeof (*intersection) * child_count);
- for (i = 0; i < child_count; i++) {
- intersection[i] = afr_is_child_present (set1, child_count, i)
- && afr_is_child_present (set2, child_count,
- i);
- }
-}
+ afr_local_t *local = NULL;
-/**
- * select_source - select a source and return it
- */
+ local = frame->local;
-int
-afr_sh_select_source (int sources[], int child_count)
-{
- int i = 0;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- return i;
+ syncbarrier_wake (&local->barrier);
- return -1;
+ return 0;
}
-void
-afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- } else if (sh->sources[i] == 1 && local->child_up[i] == 1) {
- sh->success[i] = 1;
- }
- }
- sh->active_sinks = active_sinks;
-}
+/*
+ * afr_selfheal_post_op - apply @xattr to @inode on child @subvol and wait
+ * for the reply.
+ *
+ * Builds a loc_t from @inode and its gfid, winds an xattrop with
+ * GF_XATTROP_ADD_ARRAY to priv->children[subvol] and then blocks on
+ * local->barrier until afr_selfheal_post_op_cbk() wakes it.
+ *
+ * The result of the xattrop is not inspected; the function always
+ * returns 0.
+ */
int
-afr_sh_source_count (int sources[], int child_count)
+afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr)
{
- int i = 0;
- int nsource = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ loc_t loc = {0, };
- for (i = 0; i < child_count; i++)
- if (sources[i])
- nsource++;
- return nsource;
-}
+ priv = this->private;
+ local = frame->local;
-void
-afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno)
-{
- sh->op_ret = -1;
- sh->op_errno = afr_most_important_error(sh->op_errno, op_errno,
- _gf_false);
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
-{
- afr_private_t * priv = this->private;
- char *buf = NULL;
- char *ptr = NULL;
- int i = 0;
- int j = 0;
-
- /* 10 digits per entry + 1 space + '[' and ']' */
- buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char);
-
- for (i = 0; i < priv->child_count; i++) {
- ptr = buf;
- ptr += sprintf (ptr, "[ ");
- for (j = 0; j < priv->child_count; j++) {
- ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
- }
- sprintf (ptr, "]");
- gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf);
- }
-
- GF_FREE (buf);
-}
+ STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL);
-void
-afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
- const char *loc)
-{
- afr_private_t * priv = this->private;
- char *buf = NULL;
- char *ptr = NULL;
- int i = 0;
- int j = 0;
- int child_count = priv->child_count;
- char *matrix_begin = "[ [ ";
- char *matrix_end = "] ]";
- char *seperator = "] [ ";
- int pending_entry_strlen = 12; //Including space after entry
- int matrix_begin_strlen = 0;
- int matrix_end_strlen = 0;
- int seperator_strlen = 0;
- int string_length = 0;
- char *msg = "- Pending matrix: ";
-
- /*
- * for a list of lists of [ [ a b ] [ c d ] ]
- * */
-
- matrix_begin_strlen = strlen (matrix_begin);
- matrix_end_strlen = strlen (matrix_end);
- seperator_strlen = strlen (seperator);
- string_length = matrix_begin_strlen + matrix_end_strlen
- + (child_count -1) * seperator_strlen
- + (child_count * child_count * pending_entry_strlen);
-
- buf = GF_CALLOC (1, 1 + strlen (msg) + string_length , gf_afr_mt_char);
- if (!buf) {
- buf = "";
- goto out;
- }
-
- ptr = buf;
- ptr += sprintf (ptr, "%s", msg);
- ptr += sprintf (ptr, "%s", matrix_begin);
- for (i = 0; i < priv->child_count; i++) {
- for (j = 0; j < priv->child_count; j++) {
- ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
- }
- if (i < priv->child_count -1)
- ptr += sprintf (ptr, "%s", seperator);
- }
-
- ptr += sprintf (ptr, "%s", matrix_end);
+ /* NOTE(review): the 1 is presumably the number of wake-ups to wait
+ for (one xattrop was wound) - confirm against syncbarrier_wait */
+ syncbarrier_wait (&local->barrier, 1);
+
+ /* drop the inode reference taken into loc above; without this
+ every post-op leaks one reference on the inode */
+ loc_wipe (&loc);
-out:
- gf_log (this->name, GF_LOG_ERROR, "Unable to self-heal contents of '%s'"
- " (possible split-brain). Please delete the file from all but "
- "the preferred subvolume.%s", loc, buf);
- if (buf)
- GF_FREE (buf);
- return;
+ return 0;
}
-void
-afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count)
+/*
+ * afr_selfheal_output_xattr - build the xattrop dictionary for @subvol.
+ *
+ * For transaction @type, stores output_dirty[subvol] under AFR_DIRTY and
+ * every non-zero output_matrix[subvol][j] under priv->pending_key[j].
+ * Each value is an array of AFR_NUM_CHANGE_LOGS ints in which only the
+ * slot chosen by afr_index_for_transaction_type (type) is filled in,
+ * converted with hton32(). Zero entries are not added to the dictionary.
+ *
+ * Returns the new dictionary (the caller releases it with dict_unref),
+ * or NULL if an allocation or dict_set_bin() fails.
+ */
+dict_t *
+afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type,
+ int *output_dirty, int **output_matrix, int subvol)
{
- int i = 0;
- int j = 0;
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
+ int j = 0;
+ int idx = 0;
+ int ret = 0;
+ int *raw = NULL;
- GF_ASSERT (pending_matrix);
+ priv = this->private;
+ idx = afr_index_for_transaction_type (type);
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
- }
-}
+ xattr = dict_new ();
+ if (!xattr)
+ return NULL;
-void
-afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- size_t child_count)
-{
- int i = 0;
- int j = 0;
-
- GF_ASSERT (pending_matrix);
- GF_ASSERT (ignorant_subvols);
-
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
+ if (output_dirty[subvol]) {
+ /* clear dirty */
+ raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32 (output_dirty[subvol]);
+ ret = dict_set_bin (xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ /* NOTE(review): relies on dict_set_bin() leaving the
+ buffer with the caller when it fails - confirm
+ against libglusterfs dict.c */
+ GF_FREE (raw);
+ goto err;
+ }
+ }
+
+ /* clear/set pending */
+ for (j = 0; j < priv->child_count; j++) {
+ if (!output_matrix[subvol][j])
+ continue;
+
+ raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS,
+ gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32 (output_matrix[subvol][j]);
+
+ ret = dict_set_bin (xattr, priv->pending_key[j],
+ raw, sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ /* same ownership assumption as for AFR_DIRTY above */
+ GF_FREE (raw);
+ goto err;
+ }
+ }
+
+ return xattr;
+err:
+ if (xattr)
+ dict_unref (xattr);
+ return NULL;
}
+
+/*
+ * afr_selfheal_undo_pending - compute and apply the changelog xattr
+ * adjustments that follow a heal of @inode for transaction @type.
+ *
+ * The dirty and pending values currently on disk are extracted from
+ * @replies; an adjustment is computed for every subvolume and applied
+ * through afr_selfheal_post_op() on each subvolume set in @locked_on.
+ * A subvolume that is in @sinks but not in @healed_sinks is treated as
+ * still pending. @sources is not referenced in this function.
+ *
+ * A failure to build the xattr dictionary for one subvolume is logged
+ * and that subvolume is skipped. Always returns 0.
+ */
int
-afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- dict_t *xattr[], afr_transaction_type type,
- size_t child_count)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
- int k = 0;
-
- afr_init_pending_matrix (pending_matrix, child_count);
-
- for (i = 0; i < child_count; i++) {
- pending_raw = NULL;
-
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], pending_key[j],
- &pending_raw);
-
- if (ret != 0) {
- /*
- * There is no xattr present. This means this
- * subvolume should be considered an 'ignorant'
- * subvolume.
- */
-
- if (ignorant_subvols)
- ignorant_subvols[i] = 1;
- continue;
- }
-
- memcpy (pending, pending_raw, sizeof(pending));
- k = afr_index_for_transaction_type (type);
-
- pending_matrix[i][j] = ntoh32 (pending[k]);
- }
- }
-
- return ret;
+afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, afr_transaction_type type,
+ struct afr_reply *replies, unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ unsigned char *pending = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+
+ pending = alloca0 (priv->child_count);
+
+ input_dirty = alloca0 (priv->child_count * sizeof (int));
+ input_matrix = ALLOC_MATRIX (priv->child_count, int);
+ output_dirty = alloca0 (priv->child_count * sizeof (int));
+ output_matrix = ALLOC_MATRIX (priv->child_count, int);
+
+ afr_selfheal_extract_xattr (this, replies, type, input_dirty,
+ input_matrix);
+
+ /* a sink that was not healed is still pending */
+ for (i = 0; i < priv->child_count; i++)
+ if (sinks[i] && !healed_sinks[i])
+ pending[i] = 1;
+
+ /* column j of every row: 1 if j is still pending, otherwise the
+ negation of the value read for [i][j].
+ NOTE(review): the negated value presumably cancels the on-disk
+ count because afr_selfheal_post_op() winds the xattrop with
+ GF_XATTROP_ADD_ARRAY - confirm */
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (pending[j])
+ output_matrix[i][j] = 1;
+ else
+ output_matrix[i][j] = -input_matrix[i][j];
+ }
+ }
+
+ /* dirty is negated only on subvols that are not still pending;
+ for pending ones output_dirty stays 0 (from alloca0) */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ output_dirty[i] = -input_dirty[i];
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ /* perform post-op only on subvols we had locked
+ and inspected on.
+ */
+ continue;
+
+ xattr = afr_selfheal_output_xattr (this, type, output_dirty,
+ output_matrix, i);
+ if (!xattr) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unable to allocate xdata for subvol %d", i);
+ continue;
+ }
+
+ afr_selfheal_post_op (frame, this, inode, i, xattr);
+
+ /* release the reference returned by afr_selfheal_output_xattr */
+ dict_unref (xattr);
+ }
+
+ return 0;
}
-typedef enum {
- AFR_NODE_INVALID,
- AFR_NODE_INNOCENT,
- AFR_NODE_FOOL,
- AFR_NODE_WISE,
-} afr_node_type;
-typedef struct {
- afr_node_type type;
- int wisdom;
-} afr_node_character;
+/*
+ * Duplicate the first 'count' reply slots of src[] into dst[].
+ *
+ * Plain members (valid, op_ret, op_errno and the iatt structures) are
+ * copied by value. The xdata dict is shared rather than cloned: a ref is
+ * taken on the source dict (if any) and the ref previously held by the
+ * destination slot (if any) is dropped. The checksum is copied as
+ * MD5_DIGEST_LENGTH raw bytes.
+ *
+ * When dst and src are the same array nothing is done.
+ */
+void
+afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count)
+{
+        struct afr_reply *from      = NULL;
+        struct afr_reply *to        = NULL;
+        dict_t           *new_xdata = NULL;
+        int               slot      = 0;
+
+        if (dst == src)
+                return;
+
+        for (slot = 0; slot < count; slot++) {
+                from = &src[slot];
+                to   = &dst[slot];
+
+                to->valid       = from->valid;
+                to->op_ret      = from->op_ret;
+                to->op_errno    = from->op_errno;
+                to->prestat     = from->prestat;
+                to->poststat    = from->poststat;
+                to->preparent   = from->preparent;
+                to->postparent  = from->postparent;
+                to->preparent2  = from->preparent2;
+                to->postparent2 = from->postparent2;
+
+                /* Take the new ref before releasing the old one
+                   (presumably so a dict held by both slots never loses
+                   its last reference in between - confirm). */
+                new_xdata = from->xdata ? dict_ref (from->xdata) : NULL;
+                if (to->xdata)
+                        dict_unref (to->xdata);
+                to->xdata = new_xdata;
+
+                memcpy (to->checksum, from->checksum, MD5_DIGEST_LENGTH);
+        }
+}
-static int
-afr_sh_is_innocent (int32_t *array, int child_count)
+/*
+ * Fetch the AFR_DIRTY xattr from 'xdata' and record one of its counters.
+ *
+ * The value is copied into a local array of three ints; element 'idx' is
+ * converted with ntoh32() and stored in dirty[subvol].
+ *
+ * Returns 0 on success, -1 when the key cannot be fetched from the dict
+ * or the fetched pointer is NULL (dirty[subvol] is left untouched then).
+ * 'this' is not used by the body.
+ */
+int
+afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol,
+ int idx, dict_t *xdata)
{
- int i = 0;
- int ret = 1; /* innocent until proven guilty */
+ void *pending_raw = NULL;
+ int pending[3] = {0, };
- for (i = 0; i < child_count; i++) {
- if (array[i]) {
- ret = 0;
- break;
- }
- }
+ if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw))
+ return -1;
- return ret;
-}
+ if (!pending_raw)
+ return -1;
+ /* NOTE(review): copies sizeof(pending) bytes - assumes the stored
+ value is at least that large; confirm against the writer. */
+ memcpy (pending, pending_raw, sizeof(pending));
-static int
-afr_sh_is_fool (int32_t *array, int i, int child_count)
-{
- return array[i]; /* fool if accuses itself */
+ dirty[subvol] = ntoh32 (pending[idx]);
+
+ return 0;
}
-static int
-afr_sh_is_wise (int32_t *array, int i, int child_count)
+/*
+ * Fill row 'subvol' of the pending matrix from the xattrs in 'xdata'.
+ *
+ * For every child i, the xattr named priv->pending_key[i] is fetched;
+ * element 'idx' of its value is converted with ntoh32() and stored in
+ * matrix[subvol][i]. Children whose key is missing, or whose fetched
+ * pointer is NULL, are skipped and their matrix cell is left as it was.
+ *
+ * Always returns 0.
+ */
+int
+afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol,
+ int idx, dict_t *xdata)
{
- return !array[i]; /* wise if does not accuse itself */
-}
+ int i = 0;
+ void *pending_raw = NULL;
+ int pending[3] = {0, };
+ afr_private_t *priv = NULL;
+ priv = this->private;
-static int
-afr_sh_all_nodes_innocent (afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int ret = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw))
+ continue;
+
+ if (!pending_raw)
+ continue;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_INNOCENT) {
- ret = 0;
- break;
- }
- }
+ memcpy (pending, pending_raw, sizeof(pending));
- return ret;
+ matrix[subvol][i] = ntoh32 (pending[idx]);
+ }
+
+ return 0;
}
-static int
-afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count)
+/*
+ * Populate dirty[] and matrix[][] from the xdata of each reply.
+ *
+ * 'type' is mapped to a counter index with
+ * afr_index_for_transaction_type(). For every child whose reply carries
+ * an xdata dict, afr_selfheal_fill_dirty() and afr_selfheal_fill_matrix()
+ * are called for that child; their return values are ignored. Children
+ * without xdata are skipped, so their entries keep whatever the caller
+ * initialised them to.
+ *
+ * Always returns 0.
+ */
+int
+afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix)
{
- int i = 0;
- int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ dict_t *xdata = NULL;
+ int idx = -1;
+
+ idx = afr_index_for_transaction_type (type);
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].xdata)
+ continue;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- ret = 1;
- break;
- }
- }
+ xdata = replies[i].xdata;
- return ret;
+ afr_selfheal_fill_dirty (this, dirty, i, idx, xdata);
+ afr_selfheal_fill_matrix (this, matrix, i, idx, xdata);
+ }
+
+ return 0;
}
+
/*
- * The 'wisdom' of a wise node is 0 if any other wise node accuses it.
- * It is 1 if no other wise node accuses it.
- * Only wise nodes with wisdom 1 are sources.
+ * This function determines if a self-heal is required for a given inode,
+ * and if needed, in what direction.
+ *
+ * locked_on[] is the array representing servers which have been locked and
+ * from which xattrs have been fetched for analysis.
+ *
+ * The function reports its result by filling the arrays sources[] and sinks[].
+ *
+ * sources[i] is set if i'th server is an eligible source for a selfheal.
+ *
+ * sinks[i] is set if i'th server needs to be healed.
+ *
+ * if sources[0..N] are all set, there is no need for a selfheal.
+ *
+ * if sinks[0..N] are all set, the inode is in split brain.
*
- * If no nodes with wisdom 1 exist, a split-brain has occurred.
*/
-static void
-afr_sh_compute_wisdom (int32_t *pending_matrix[],
- afr_node_character characters[], int child_count)
-{
- int i = 0;
- int j = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- characters[i].wisdom = 1;
-
- for (j = 0; j < child_count; j++) {
- if ((characters[j].type == AFR_NODE_WISE)
- && pending_matrix[j][i]) {
-
- characters[i].wisdom = 0;
- }
- }
- }
- }
+int
+afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
+                             struct afr_reply *replies,
+                             afr_transaction_type type, unsigned char *locked_on,
+                             unsigned char *sources, unsigned char *sinks)
+{
+        afr_private_t *priv    = NULL;
+        int          **matrix  = NULL;
+        int           *dirty   = NULL;
+        char          *accused = NULL;
+        int            row     = 0;
+        int            col     = 0;
+
+        priv = this->private;
+
+        /* Scratch space lives on the stack for the duration of the call:
+           dirty[k] is the dirty counter read from child k, matrix[r][c]
+           is what child r recorded against child c, and accused[c] is
+           set once anybody has recorded something against child c. */
+        dirty = alloca0 (priv->child_count * sizeof (int));
+        accused = alloca0 (priv->child_count);
+        matrix = ALLOC_MATRIX(priv->child_count, int);
+
+        /* Step 1: read the dirty counters and pending matrix out of the
+           replies */
+        afr_selfheal_extract_xattr (this, replies, type, dirty, matrix);
+
+        /* Step 2: anyone blamed by any row cannot be a source */
+        for (row = 0; row < priv->child_count; row++)
+                for (col = 0; col < priv->child_count; col++)
+                        if (matrix[row][col])
+                                accused[col] = 1;
+
+        /* Step 3: the sources are the locked children nobody blames */
+        memset (sources, 0, priv->child_count);
+        for (row = 0; row < priv->child_count; row++) {
+                if (accused[row] || !locked_on[row])
+                        continue;
+                sources[row] = 1;
+        }
+
+        /* Step 4: whatever a source blames becomes a sink */
+        memset (sinks, 0, priv->child_count);
+        for (row = 0; row < priv->child_count; row++) {
+                if (!sources[row])
+                        continue;
+                for (col = 0; col < priv->child_count; col++)
+                        if (matrix[row][col])
+                                sinks[col] = 1;
+        }
+
+        /* Step 5: the first source carrying a 'dirty' count, if there is
+           one, stays the only source; every other child turns into a
+           sink */
+        for (row = 0; row < priv->child_count; row++) {
+                if (!sources[row] || !dirty[row])
+                        continue;
+
+                for (col = 0; col < priv->child_count; col++) {
+                        if (col == row)
+                                continue;
+                        sources[col] = 0;
+                        sinks[col] = 1;
+                }
+                break;
+        }
+
+        /* Step 6: with at least one source left we are done ... */
+        if (AFR_COUNT (sources, priv->child_count) != 0)
+                return 0;
+
+        /* ... otherwise it is a split brain: every locked child is a
+           sink */
+        for (row = 0; row < priv->child_count; row++)
+                if (locked_on[row])
+                        sinks[row] = 1;
+
+        return 0;
}
-static int
-afr_sh_wise_nodes_conflict (afr_node_character *characters,
- int child_count)
+/*
+ * Reply callback for the lookups wound by
+ * afr_selfheal_unlocked_lookup_on() and
+ * afr_selfheal_unlocked_discover_on().
+ *
+ * 'cookie' carries the child index. The slot local->replies[index] is
+ * marked valid and receives op_ret/op_errno, plus - when the respective
+ * pointer is non-NULL - a copy of 'buf' (poststat), a copy of 'parbuf'
+ * (postparent) and a new reference on 'xdata'. Finally local->barrier is
+ * woken. The 'inode' argument is not used.
+ *
+ * Always returns 0.
+ */
+int
+afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
{
- int i = 0;
- int ret = 1;
+ afr_local_t *local = NULL;
+ int i = -1;
- for (i = 0; i < child_count; i++) {
- if ((characters[i].type == AFR_NODE_WISE)
- && characters[i].wisdom == 1) {
+ local = frame->local;
+ i = (long) cookie;
- /* There is atleast one bona-fide wise node */
- ret = 0;
- break;
- }
- }
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (buf)
+ local->replies[i].poststat = *buf;
+ if (parbuf)
+ local->replies[i].postparent = *parbuf;
+ if (xdata)
+ local->replies[i].xdata = dict_ref (xdata);
- return ret;
+ syncbarrier_wake (&local->barrier);
+
+ return 0;
}
-static int
-afr_sh_mark_wisest_as_sources (int sources[],
- afr_node_character *characters,
- int child_count)
+/*
+ * Look up the entry 'name' under 'parent' on the children selected by
+ * lookup_on[]. No lock is taken in this function.
+ *
+ * A request dict is prepared with afr_xattr_req_prepare(), a fresh inode
+ * is created in the parent's inode table, and a 'lookup' is wound through
+ * AFR_ONLIST with afr_selfheal_discover_cbk() as the callback. Afterwards
+ * local->replies is copied into 'replies' (priv->child_count slots).
+ *
+ * Returns the newly created inode, or NULL when the request dict cannot
+ * be allocated or prepared, or the inode cannot be created. Per-child
+ * lookup failures are not reflected in the return value; callers have to
+ * inspect 'replies'.
+ *
+ * NOTE(review): the reference obtained from inode_new() appears to be
+ * handed to the caller - confirm who is expected to unref it.
+ */
+inode_t *
+afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on)
{
- int nsources = 0;
- int i = 0;
+ loc_t loc = {0, };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
- for (i = 0; i < child_count; i++) {
- if (characters[i].wisdom == 1) {
- sources[i] = 1;
- nsources++;
- }
- }
+ local = frame->local;
+ priv = frame->this->private;
- return nsources;
-}
+ xattr_req = dict_new ();
+ if (!xattr_req)
+ return NULL;
-static void
-afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix,
- afr_node_character *characters,
- int32_t child_count)
-{
- int i = 0;
- int j = 0;
- int witness = 0;
-
- GF_ASSERT (witnesses);
- GF_ASSERT (pending_matrix);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- witness = 0;
- for (j = 0; j < child_count; j++) {
- if (i == j)
- continue;
- witness += pending_matrix[i][j];
- }
- witnesses[i] = witness;
- }
-}
+ if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ dict_destroy (xattr_req);
+ return NULL;
+ }
-static int32_t
-afr_find_biggest_witness_among_fools (int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count)
-{
- int i = 0;
- int biggest_witness = -1;
+ inode = inode_new (parent->table);
+ if (!inode) {
+ dict_destroy (xattr_req);
+ return NULL;
+ }
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
+ /* loc holds its own refs on parent and inode; loc_wipe() below
+ is what cleans loc up again. */
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
+ AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- if (biggest_witness < witnesses[i])
- biggest_witness = witnesses[i];
- }
- return biggest_witness;
-}
+ afr_replies_copy (replies, local->replies, priv->child_count);
-int
-afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count, int32_t witness)
-{
- int i = 0;
- int nsources = 0;
-
- GF_ASSERT (sources);
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- if (witness == witnesses[i]) {
- sources[i] = 1;
- nsources++;
- }
- }
- return nsources;
-}
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
-static int
-afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
- afr_node_character *characters,
- int child_count)
-{
- int32_t biggest_witness = 0;
- int nsources = 0;
- int32_t *witnesses = NULL;
-
- GF_ASSERT (child_count > 0);
-
- witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
- gf_afr_mt_int32_t);
- if (NULL == witnesses) {
- nsources = -1;
- goto out;
- }
-
- afr_compute_witness_of_fools (witnesses, pending_matrix, characters,
- child_count);
- biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
- characters,
- child_count);
- nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
- characters, child_count,
- biggest_witness);
-out:
- GF_FREE (witnesses);
- return nsources;
+ return inode;
}
-int
-afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs,
- int32_t *success_children,
- unsigned int child_count, uint32_t uid)
-{
- int i = 0;
- int nsources = 0;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- if (-1 == success_children[i])
- break;
-
- child = success_children[i];
- if (uid == bufs[child].ia_uid) {
- sources[child] = 1;
- nsources++;
- }
- }
- return nsources;
-}
+/*
+ * Look up 'inode' by 'gfid' on the children selected by discover_on[].
+ * No lock is taken in this function.
+ *
+ * A request dict is prepared with afr_xattr_req_prepare() and a 'lookup'
+ * is wound through AFR_ONLIST with afr_selfheal_discover_cbk() as the
+ * callback, using a loc that carries only the inode and the gfid.
+ * Afterwards local->replies is copied into 'replies' (priv->child_count
+ * slots).
+ *
+ * Returns 0 once the replies have been copied, or -ENOMEM when the
+ * request dict cannot be allocated or prepared. Per-child lookup
+ * failures are not reflected in the return value.
+ */
int
-afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count)
+afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on)
{
- int i = 0;
- int smallest = -1;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- if (-1 == success_children[i])
- break;
- child = success_children[i];
- if ((smallest == -1) ||
- (bufs[child].ia_uid < bufs[smallest].ia_uid)) {
- smallest = child;
- }
- }
- return smallest;
-}
+ loc_t loc = {0, };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
-static int
-afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children,
- int child_count, int32_t *sources)
-{
- int nsources = 0;
- int smallest = 0;
-
- smallest = afr_get_child_with_lowest_uid (bufs, success_children,
- child_count);
- if (smallest < 0) {
- nsources = -1;
- goto out;
- }
- nsources = afr_mark_child_as_source_by_uid (sources, bufs,
- success_children, child_count,
- bufs[smallest].ia_uid);
-out:
- return nsources;
-}
+ local = frame->local;
+ priv = frame->this->private;
-int
-afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children,
- struct iatt *bufs)
-{
- afr_private_t *priv = NULL;
- int i = 0;
- int child = -1;
- int read_child = -1;
-
- priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- child = success_children[i];
- if (child < 0)
- break;
- if (read_child < 0)
- read_child = child;
- else if (bufs[read_child].ia_size < bufs[child].ia_size)
- read_child = child;
- }
- return read_child;
-}
+ xattr_req = dict_new ();
+ if (!xattr_req)
+ return -ENOMEM;
-int
-afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children,
- int child_count, int32_t *sources)
-{
- int nsources = 0;
- int i = 0;
- int child = 0;
- gf_boolean_t sink_exists = _gf_false;
- gf_boolean_t source_exists = _gf_false;
- int source = -1;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child < 0)
- break;
- if (!bufs[child].ia_size) {
- sink_exists = _gf_true;
- continue;
- }
- if (!source_exists) {
- source_exists = _gf_true;
- source = child;
- continue;
- }
- if (bufs[source].ia_size != bufs[child].ia_size) {
- nsources = -1;
- goto out;
- }
- }
- if (!source_exists && !sink_exists) {
- nsources = -1;
- goto out;
- }
-
- if (!source_exists || !sink_exists)
- goto out;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child < 0)
- break;
- if (bufs[child].ia_size) {
- sources[child] = 1;
- nsources++;
- }
- }
-out:
- return nsources;
-}
+ if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ dict_destroy (xattr_req);
+ return -ENOMEM;
+ }
-char *
-afr_get_character_str (afr_node_type type)
-{
- char *character = NULL;
-
- switch (type) {
- case AFR_NODE_INNOCENT:
- character = "innocent";
- break;
- case AFR_NODE_FOOL:
- character = "fool";
- break;
- case AFR_NODE_WISE:
- character = "wise";
- break;
- default:
- character = "invalid";
- break;
- }
- return character;
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, gfid);
-afr_node_type
-afr_find_child_character_type (int32_t *pending_row, int32_t child,
- unsigned int child_count)
-{
- afr_node_type type = AFR_NODE_INVALID;
+ AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- GF_ASSERT ((child >= 0) && (child < child_count));
+ afr_replies_copy (replies, local->replies, priv->child_count);
- if (afr_sh_is_innocent (pending_row, child_count))
- type = AFR_NODE_INNOCENT;
- else if (afr_sh_is_fool (pending_row, child, child_count))
- type = AFR_NODE_FOOL;
- else if (afr_sh_is_wise (pending_row, child, child_count))
- type = AFR_NODE_WISE;
- return type;
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
+
+ return 0;
}
+/*
+ * Convenience wrapper around afr_selfheal_unlocked_discover_on() that
+ * passes priv->child_up as the list of children to query.
+ *
+ * Returns whatever afr_selfheal_unlocked_discover_on() returns.
+ */
int
-afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs,
- int32_t **pending_matrix, int32_t *sources,
- int32_t *success_children, afr_transaction_type type,
- int32_t *subvol_status, gf_boolean_t ignore_ignorant)
+afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
- int nsources = -1;
- unsigned char *ignorant_subvols = NULL;
- unsigned int child_count = 0;
-
- priv = this->private;
- child_count = priv->child_count;
-
- if (afr_get_children_count (success_children, priv->child_count) == 0)
- goto out;
-
- if (!ignore_ignorant) {
- ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols),
- child_count, gf_afr_mt_char);
- if (NULL == ignorant_subvols)
- goto out;
- }
-
- afr_build_pending_matrix (priv->pending_key, pending_matrix,
- ignorant_subvols, xattr, type,
- priv->child_count);
-
- if (!ignore_ignorant)
- afr_mark_ignorant_subvols_as_pending (pending_matrix,
- ignorant_subvols,
- priv->child_count);
- sh_type = afr_self_heal_type_for_transaction (type);
- if (AFR_SELF_HEAL_INVALID == sh_type)
- goto out;
-
- afr_sh_print_pending_matrix (pending_matrix, this);
-
- nsources = afr_mark_sources (this, sources, pending_matrix, bufs,
- sh_type, success_children, subvol_status);
-out:
- GF_FREE (ignorant_subvols);
- return nsources;
-}
+ afr_private_t *priv = NULL;
-void
-afr_find_character_types (afr_node_character *characters,
- int32_t **pending_matrix, int32_t *success_children,
- unsigned int child_count)
-{
- afr_node_type type = AFR_NODE_INVALID;
- int child = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child == -1)
- break;
- type = afr_find_child_character_type (pending_matrix[child],
- child, child_count);
- characters[child].type = type;
- }
-}
+ priv = frame->this->private;
-void
-afr_mark_success_children_sources (int32_t *sources, int32_t *success_children,
- unsigned int child_count)
-{
- int i = 0;
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- sources[success_children[i]] = 1;
- }
+ return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies,
+ priv->child_up);
}
-/**
- * mark_sources: Mark all 'source' nodes and return number of source
- * nodes found
- *
- * A node (a row in the pending matrix) belongs to one of
- * three categories:
- *
- * M is the pending matrix.
- *
- * 'innocent' - M[i] is all zeroes
- * 'fool' - M[i] has i'th element = 1 (self-reference)
- * 'wise' - M[i] has i'th element = 0, others are 1 or 0.
- *
- * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is
- * needed.
- *
- * A 'wise' node can be a source. If two 'wise' nodes conflict, it is
- * a split-brain. If one wise node refers to the other but the other doesn't
- * refer back, the referrer is a source.
- *
- * All fools are sinks, unless there are no 'wise' nodes. In that case,
- * one of the fools is made a source.
- */
+
+/*
+ * Reply callback for the inodelk calls wound by the self-heal locking
+ * helpers (afr_selfheal_tryinodelk() and afr_selfheal_inodelk()).
+ *
+ * 'cookie' carries the child index. The slot local->replies[index] is
+ * marked valid and receives op_ret/op_errno, then local->barrier is
+ * woken. 'xdata' is not used.
+ *
+ * Always returns 0.
+ */
int
-afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,
- struct iatt *bufs, afr_self_heal_type type,
- int32_t *success_children, int32_t *subvol_status)
+afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- /* stores the 'characters' (innocent, fool, wise) of the nodes */
- afr_node_character *characters = NULL;
- int nsources = -1;
- unsigned int child_count = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- child_count = priv->child_count;
- characters = GF_CALLOC (sizeof (afr_node_character),
- child_count, gf_afr_mt_afr_node_character);
- if (!characters)
- goto out;
-
- this = THIS;
-
- /* start clean */
- memset (sources, 0, sizeof (*sources) * child_count);
- nsources = 0;
- afr_find_character_types (characters, pending_matrix, success_children,
- child_count);
- if (afr_sh_all_nodes_innocent (characters, child_count)) {
- switch (type) {
- case AFR_SELF_HEAL_METADATA:
- nsources = afr_sh_mark_lowest_uid_as_source (bufs,
- success_children,
- child_count,
- sources);
- break;
- case AFR_SELF_HEAL_DATA:
- nsources = afr_sh_mark_zero_size_file_as_sink (bufs,
- success_children,
- child_count,
- sources);
- if ((nsources < 0) && subvol_status)
- *subvol_status |= SPLIT_BRAIN;
- break;
- default:
- break;
- }
- goto out;
- }
-
- if (afr_sh_wise_nodes_exist (characters, child_count)) {
- afr_sh_compute_wisdom (pending_matrix, characters, child_count);
-
- if (afr_sh_wise_nodes_conflict (characters, child_count)) {
- if (subvol_status)
- *subvol_status |= SPLIT_BRAIN;
- nsources = -1;
- } else {
- nsources = afr_sh_mark_wisest_as_sources (sources,
- characters,
- child_count);
- }
- } else {
- if (subvol_status)
- *subvol_status |= ALL_FOOLS;
- nsources = afr_mark_biggest_of_fools_as_source (sources,
- pending_matrix,
- characters,
- child_count);
- }
+ afr_local_t *local = NULL;
+ int i = 0;
-out:
- if (nsources == 0)
- afr_mark_success_children_sources (sources, success_children,
- child_count);
- GF_FREE (characters);
+ local = frame->local;
+ i = (long) cookie;
- gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources);
- return nsources;
-}
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], unsigned char success[],
- int child_count, afr_transaction_type type)
-{
- int tgt = 0;
- int src = 0;
- int value = 0;
-
- afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL,
- xattr, type, priv->child_count);
-
- /*
- * The algorithm here has two parts. First, for each subvol indexed
- * as tgt, we try to figure out what count everyone should have for it.
- * If the self-heal succeeded, that's easy; the value is zero.
- * Otherwise, the value is the maximum of the succeeding nodes' counts.
- * Once we know the value, we loop through (possibly for a second time)
- * setting each count to the difference so that when we're done all
- * succeeding nodes will have the same count for tgt.
- */
- for (tgt = 0; tgt < priv->child_count; ++tgt) {
- value = 0;
- if (!success[tgt]) {
- /* Find the maximum. */
- for (src = 0; src < priv->child_count; ++src) {
- if (!success[src]) {
- continue;
- }
- if (delta_matrix[src][tgt] > value) {
- value = delta_matrix[src][tgt];
- }
- }
- }
- /* Force everyone who succeeded to the chosen value. */
- for (src = 0; src < priv->child_count; ++src) {
- if (success[src]) {
- delta_matrix[src][tgt] = value
- - delta_matrix[src][tgt];
- }
- else {
- delta_matrix[src][tgt] = 0;
- }
- }
- }
+ syncbarrier_wake (&local->barrier);
+
+ return 0;
}
+/*
+ * Translate the lock replies stored in local->replies into locked_on[].
+ *
+ * locked_on[i] is set to 1 for every child whose reply is valid and has
+ * op_ret == 0, and to 0 for every other child.
+ *
+ * Returns the number of children for which locked_on[] was set to 1.
+ */
int
-afr_sh_delta_to_xattr (xlator_t *this,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
+afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_on)
{
- int i = 0;
- int j = 0;
- int k = 0;
- int ret = 0;
- int32_t *pending = NULL;
- int32_t *local_pending = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- for (i = 0; i < child_count; i++) {
- if (!xattr[i])
- continue;
-
- local_pending = NULL;
- for (j = 0; j < child_count; j++) {
- pending = GF_CALLOC (sizeof (int32_t), 3,
- gf_afr_mt_int32_t);
-
- if (!pending) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to allocate pending entry "
- "for %s[%d] on %s",
- priv->pending_key[j], type,
- priv->children[i]->name);
- continue;
- }
- /* 3 = data+metadata+entry */
-
- k = afr_index_for_transaction_type (type);
-
- pending[k] = hton32 (delta_matrix[i][j]);
-
- if (j == i) {
- local_pending = pending;
- continue;
- }
- ret = dict_set_bin (xattr[i], priv->pending_key[j],
- pending,
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- GF_FREE (pending);
- }
- }
- if (local_pending) {
- ret = dict_set_bin (xattr[i], priv->pending_key[i],
- local_pending,
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- GF_FREE (local_pending);
- }
- }
- }
- return 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ locked_on[i] = 1;
+ count++;
+ } else {
+ locked_on[i] = 0;
+ }
+ }
+
+ return count;
}
+/*
+ * Request a write inodelk on 'inode' in lock domain 'dom', covering
+ * 'size' bytes starting at offset 'off'.
+ *
+ * The request is wound with AFR_ONALL using the F_SETLK command and
+ * afr_selfheal_lock_cbk() as the callback. The outcome per child is then
+ * written to locked_on[] by afr_selfheal_locked_fill().
+ *
+ * Returns the count reported by afr_selfheal_locked_fill(), i.e. the
+ * number of children on which the lock was obtained.
+ *
+ * NOTE(review): F_SETLK is the non-waiting command in fcntl terms, which
+ * presumably is what makes this the "try" variant - confirm.
+ */
int
-afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_sh_reset (frame, this);
-
- if (local->govinda_gOvinda) {
- gf_log (this->name, GF_LOG_DEBUG,
- "split brain found, aborting selfheal of %s",
- local->loc.path);
- sh->op_failed = 1;
- }
-
- if (sh->op_failed) {
- sh->completion_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
- afr_self_heal_metadata (frame, this);
- }
-
- return 0;
-}
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-static int
-afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- local = frame->local;
- int_lock = &local->internal_lock;
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLK, &flock, NULL);
- int_lock->lock_cbk = afr_sh_missing_entries_done;
- afr_unlock (frame, this);
+ loc_wipe (&loc);
- return 0;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
+
int
-afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count)
+afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- int ret = -ENOMEM;
- sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf),
- gf_afr_mt_iatt);
- if (!sh->buf)
- goto out;
- sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs),
- gf_afr_mt_iatt);
- if (!sh->parentbufs)
- goto out;
- sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno),
- gf_afr_mt_int);
- if (!sh->child_errno)
- goto out;
- sh->success_children = afr_children_create (child_count);
- if (!sh->success_children)
- goto out;
- sh->fresh_children = afr_children_create (child_count);
- if (!sh->fresh_children)
- goto out;
- sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr),
- gf_afr_mt_dict_t);
- if (!sh->xattr)
- goto out;
- ret = 0;
-out:
- return ret;
-}
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
-void
-afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent,
- loc_t *loc)
-{
- int child_index = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- sh->buf[child_index] = *buf;
- sh->parentbufs[child_index] = *postparent;
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- sh->xattr[child_index] = dict_ref (xattr);
- } else {
- gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume"
- " %s => -1 (%s)", loc->path,
- priv->children[child_index]->name,
- strerror (op_errno));
- local->self_heal.child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
- return;
-}
+ priv = this->private;
+ local = frame->local;
-gf_boolean_t
-afr_valid_ia_type (ia_type_t ia_type)
-{
- switch (ia_type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- case IA_IFDIR:
- return _gf_true;
- default:
- return _gf_false;
- }
- return _gf_false;
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-int
-afr_impunge_frame_create (call_frame_t *frame, xlator_t *this,
- int active_source, call_frame_t **impunge_frame)
-{
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int32_t op_errno = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
- call_frame_t *new_frame = NULL;
-
- op_errno = ENOMEM;
- priv = this->private;
- new_frame = copy_frame (frame);
- if (!new_frame) {
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out);
-
- local = frame->local;
- new_frame->local = impunge_local;
- impunge_sh = &impunge_local->self_heal;
- impunge_sh->sh_frame = frame;
- impunge_sh->active_source = active_source;
- impunge_local->child_up = memdup (local->child_up,
- sizeof (*local->child_up) *
- priv->child_count);
- if (!impunge_local->child_up)
- goto out;
-
- impunge_local->pending = afr_matrix_create (priv->child_count,
- AFR_NUM_CHANGE_LOGS);
- if (!impunge_local->pending)
- goto out;
-
- ret = afr_sh_common_create (impunge_sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
- op_errno = 0;
- *impunge_frame = new_frame;
-out:
- if (op_errno && new_frame)
- AFR_STACK_DESTROY (new_frame);
- return -op_errno;
-}
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
-void
-afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this,
- struct iatt *buf,
- struct iatt *postparent,
- afr_impunge_done_cbk_t impunge_done)
-{
- call_frame_t *impunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int ret = 0;
- unsigned int enoent_count = 0;
- afr_private_t *priv = NULL;
- int i = 0;
- int32_t op_errno = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (!enoent_count) {
- gf_log (this->name, GF_LOG_INFO,
- "no missing files - %s. proceeding to metadata check",
- local->loc.path);
- goto out;
- }
- sh->impunge_done = impunge_done;
- ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame);
- if (ret)
- goto out;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- loc_copy (&impunge_local->loc, &local->loc);
- ret = afr_build_parent_loc (&impunge_sh->parent_loc,
- &impunge_local->loc, &op_errno);
- if (ret) {
- ret = -op_errno;
- goto out;
- }
- impunge_local->call_count = enoent_count;
- impunge_sh->entrybuf = sh->buf[sh->source];
- impunge_sh->parentbuf = sh->parentbufs[sh->source];
- for (i = 0; i < priv->child_count; i++) {
- if (!impunge_local->child_up[i]) {
- impunge_sh->child_errno[i] = ENOTCONN;
- continue;
- }
- if (sh->child_errno[i] != ENOENT) {
- impunge_sh->child_errno[i] = EEXIST;
- continue;
- }
- }
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] != ENOENT)
- continue;
- afr_sh_entry_impunge_create (impunge_frame, this, i);
- enoent_count--;
- }
- GF_ASSERT (!enoent_count);
- return;
-out:
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, "
- "reason: %s", local->loc.path, strerror (-ret));
- sh->op_failed = 1;
- }
- afr_sh_missing_entries_finish (frame, this);
-}
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLK, &flock, NULL);
-int
-afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- if (op_ret < 0)
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_selfheal_uninodelk (frame, this, inode, dom, off,
+ size, locked_on);
-static int
-sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int type = 0;
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- buf = &sh->buf[sh->source];
- postparent = &sh->parentbufs[sh->source];
-
- type = buf->ia_type;
- if (!afr_valid_ia_type (type)) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: unknown file type: 0%o", local->loc.path, type);
- local->govinda_gOvinda = 1;
- afr_sh_missing_entries_finish (frame, this);
- goto out;
- }
-
- afr_sh_missing_entry_call_impunge_recreate (frame, this,
- buf, postparent,
- afr_sh_create_entry_cbk);
-out:
- return 0;
-}
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLKW, &flock, NULL);
+ break;
+ }
+ }
-void
-afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t ia_type = IA_INVAL;
- int32_t nsources = 0;
- loc_t *loc = NULL;
- int32_t subvol_status = 0;
- afr_transaction_type txn_type = AFR_DATA_TRANSACTION;
- gf_boolean_t split_brain = _gf_false;
- int read_child = -1;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
- loc = &local->loc;
-
- if (op_ret < 0) {
- if (op_errno == EIO)
- local->govinda_gOvinda = 1;
- // EIO can happen if finding the fresh parent dir failed
- goto out;
- }
-
- //now No chance for the ia_type to conflict
- ia_type = sh->buf[sh->success_children[0]].ia_type;
- txn_type = afr_transaction_type_get (ia_type);
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children, txn_type,
- &subvol_status, _gf_false);
- if (nsources < 0) {
- gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s,"
- " in missing entry self-heal, continuing with the rest"
- " of the self-heals", local->loc.path);
- if (subvol_status & SPLIT_BRAIN) {
- split_brain = _gf_true;
- switch (txn_type) {
- case AFR_DATA_TRANSACTION:
- nsources = 1;
- sh->sources[sh->success_children[0]] = 1;
- break;
- case AFR_ENTRY_TRANSACTION:
- read_child = afr_get_no_xattr_dir_read_child
- (this,
- sh->success_children,
- sh->buf);
- sh->sources[read_child] = 1;
- nsources = 1;
- break;
- default:
- op_errno = EIO;
- goto out;
- }
- } else {
- op_errno = EIO;
- goto out;
- }
- }
-
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- sh->source = sh->fresh_children[0];
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- op_errno = EIO;
- goto out;
- }
-
- if (sh->gfid_sh_success_cbk)
- sh->gfid_sh_success_cbk (frame, this);
- sh->type = sh->buf[sh->source].ia_type;
- if (uuid_is_null (loc->inode->gfid))
- uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid);
- if (split_brain) {
- afr_sh_missing_entries_finish (frame, this);
- } else {
- sh_missing_entries_create (frame, this);
- }
- return;
-out:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ loc_wipe (&loc);
-static int
-afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret,
- op_errno, inode, buf, xattr,
- postparent, &sh->lookup_loc);
- call_count = afr_frame_return (frame);
-
- if (call_count)
- goto out;
- op_ret = -1;
- if (!sh->success_count) {
- op_errno = afr_resultant_errno_get (NULL, sh->child_errno,
- priv->child_count);
- gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, "
- "reason %s", sh->lookup_loc.path,
- strerror (op_errno));
- goto done;
- }
-
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) &&
- (afr_conflicting_iattrs (sh->buf, sh->success_children,
- priv->child_count,
- sh->lookup_loc.path, this->name))) {
- op_errno = EIO;
- gf_log (this->name, GF_LOG_ERROR, "Conflicting entries "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
-
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) &&
- (afr_gfid_missing_count (this->name, sh->success_children,
- sh->buf, priv->child_count,
- sh->lookup_loc.path))) {
- op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "Missing Gfids "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
- op_ret = 0;
-
-done:
- sh->lookup_done (frame, this, op_ret, op_errno);
-out:
- return 0;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
+
int
-afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
- int32_t op_ret, int32_t op_errno)
+afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on)
{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->post_remove_call);
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_ERROR,
- "purge entry %s failed, on child %d reason, %s",
- local->loc.path, child, strerror (op_errno));
- LOCK (&frame->lock);
- {
- afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
- }
- UNLOCK (&frame->lock);
- }
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- sh->post_remove_call (frame, this);
- return 0;
-}
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
-void
-afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this,
- int child_index, struct iatt *buf,
- struct iatt *parentbuf,
- afr_expunge_done_cbk_t expunge_done)
-{
- call_frame_t *expunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int32_t op_errno = 0;
- int ret = 0;
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out);
-
- local = frame->local;
- sh = &local->self_heal;
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- loc_copy (&expunge_local->loc, &local->loc);
- ret = afr_build_parent_loc (&expunge_sh->parent_loc,
- &expunge_local->loc, &op_errno);
- if (ret) {
- ret = -op_errno;
- goto out;
- }
- sh->expunge_done = expunge_done;
- afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf,
- parentbuf);
- return;
-out:
- gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s",
- local->loc.path, strerror (op_errno));
- expunge_done (frame, this, child_index, -1, op_errno);
-}
-void
-afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children,
- int32_t *fresh_children,
- unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (afr_is_child_present (success_children, child_count, i) &&
- !afr_is_child_present (fresh_children, child_count, i)) {
- sh->child_errno[i] = ENOENT;
- GF_ASSERT (sh->xattr[i]);
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
- }
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ flock.l_type = F_UNLCK;
+ flock.l_start = off;
+ flock.l_len = size;
+
+ AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk,
+ dom, &loc, F_SETLK, &flock, NULL);
+
+ loc_wipe (&loc);
+
+ return 0;
}
+
int
-afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->op_failed) {
- afr_sh_missing_entries_finish (frame, this);
- } else {
- if (afr_gfid_missing_count (this->name, sh->fresh_children,
- sh->buf, priv->child_count,
- local->loc.path)) {
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req,
- AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- } else {
- //No need to set gfid so goto missing entries lookup done
- //Behave as if you have done the lookup
- afr_sh_remove_stale_lookup_info (sh,
- sh->success_children,
- sh->fresh_children,
- priv->child_count);
- afr_children_copy (sh->success_children,
- sh->fresh_children,
- priv->child_count);
- afr_sh_missing_entries_lookup_done (frame, this, 0, 0);
- }
- }
- return 0;
-}
+ loc_t loc = {0,};
-gf_boolean_t
-afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
-{
- afr_self_heal_t *sh = NULL;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- sh = &local->self_heal;
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom,
+ &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
+ loc_wipe (&loc);
- return _gf_false;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-gf_boolean_t
-afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
+
+int
+afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {0,};
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- sh = &local->self_heal;
+ priv = this->private;
+ local = frame->local;
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_children, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- return _gf_false;
-}
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+ name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
-void
-afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this,
- gf_boolean_t purge_condition (afr_local_t *local,
- afr_private_t *priv,
- int child))
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (purge_condition (local, priv, i))
- call_count++;
- }
-
- if (call_count == 0) {
- sh->post_remove_call (frame, this);
- goto out;
- }
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!purge_condition (local, priv, i))
- continue;
- gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s "
- "on %s", local->loc.path, priv->children[i]->name);
- afr_sh_call_entry_expunge_remove (frame, this,
- (long) i, &sh->buf[i],
- &sh->parentbufs[i],
- afr_sh_remove_entry_cbk);
- }
-out:
- return;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_selfheal_unentrylk (frame, this, inode, dom, name,
+ locked_on);
+
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,
+ &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ break;
+ }
+ }
+
+ loc_wipe (&loc);
+
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-void
-afr_sh_purge_entry (call_frame_t *frame, xlator_t *this)
+
+int
+afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {0,};
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk,
+ dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
- local = frame->local;
- sh = &local->self_heal;
- sh->post_remove_call = afr_sh_missing_entries_finish;
+ loc_wipe (&loc);
- afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition);
+ return 0;
}
-void
-afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this)
+
+gf_boolean_t
+afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ int idx = -1;
+ afr_private_t *priv = NULL;
+ void *pending_raw = NULL;
+ int *pending_int = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ priv = this->private;
+ idx = afr_index_for_transaction_type (type);
- sh->post_remove_call = afr_sh_purge_stale_entries_done;
+ if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
+ if (pending_raw) {
+ pending_int = pending_raw;
- for (i = 0; i < priv->child_count; i++) {
- if (afr_is_child_present (sh->fresh_children,
- priv->child_count, i))
- continue;
+ if (ntoh32 (pending_int[idx]))
+ return _gf_true;
+ }
+ }
- if ((!local->child_up[i]) || sh->child_errno[i] != 0)
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr (xdata, priv->pending_key[i],
+ &pending_raw))
+ continue;
+ if (!pending_raw)
+ continue;
+ pending_int = pending_raw;
- GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) ||
- uuid_is_null (sh->buf[i].ia_gfid));
+ if (ntoh32 (pending_int[idx]))
+ return _gf_true;
+ }
- if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) ||
- (uuid_compare (sh->buf[i].ia_gfid,
- sh->entrybuf.ia_gfid)))
- continue;
+ return _gf_false;
+}
- afr_children_add_child (sh->fresh_children, i,
- priv->child_count);
- }
- afr_sh_purge_entry_common (frame, this,
- afr_sh_purge_stale_entry_condition);
+gf_boolean_t
+afr_is_data_set (xlator_t *this, dict_t *xdata)
+{
+ return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION);
}
-void
-afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs,
- struct iatt *save,
- unsigned int child_count)
+gf_boolean_t
+afr_is_metadata_set (xlator_t *this, dict_t *xdata)
{
- int i = 0;
- int child = 0;
- gf_boolean_t saved = _gf_false;
-
- GF_ASSERT (save);
- //if iatt buf with gfid exists sets it
- for (i = 0; i < child_count; i++) {
- child = children[i];
- if (child == -1)
- break;
- *save = bufs[child];
- saved = _gf_true;
- if (!uuid_is_null (save->ia_gfid))
- break;
- }
- GF_ASSERT (saved);
+ return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION);
}
-void
-afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh,
- unsigned int child_count)
+gf_boolean_t
+afr_is_entry_set (xlator_t *this, dict_t *xdata)
{
- afr_children_intersection_get (sh->success_children,
- sh->fresh_parent_dirs,
- sh->sources, child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, child_count);
- memset (sh->sources, 0, sizeof (*sh->sources) * child_count);
+ return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION);
}
+
void
-afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_inode_link (inode_t *inode, struct iatt *iatt)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t fresh_child_enoents = 0;
- int32_t fresh_parent_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0)
- goto fail;
- afr_get_children_of_fresh_parent_dirs (sh, priv->child_count);
- fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs,
- priv->child_count);
- //we need the enoent count of the subvols present in fresh_parent_dirs
- fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs,
- sh->child_errno,
- priv->child_count, ENOENT);
- if (fresh_child_enoents == fresh_parent_count) {
- afr_sh_set_error (sh, ENOENT);
- sh->op_failed = 1;
- afr_sh_purge_entry (frame, this);
- } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children,
- priv->child_count, local->loc.path,
- this->name)) {
- afr_sh_save_child_iatts_from_policy (sh->fresh_children,
- sh->buf, &sh->entrybuf,
- priv->child_count);
- afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf,
- sh->fresh_children,
- priv->child_count);
- afr_sh_purge_stale_entry (frame, this);
- } else {
- op_errno = EIO;
- local->govinda_gOvinda = 1;
- goto fail;
- }
-
- return;
-
-fail:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ inode_t *linked_inode = NULL;
-static void
-afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int enoent_count = 0;
- int nsources = 0;
- int source = -1;
- int32_t subvol_status = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0)
- goto out;
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (enoent_count > 0) {
- gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s,"
- " in missing entry self-heal, aborting missing-entry "
- "self-heal",
- local->loc.path);
- afr_sh_missing_entries_finish (frame, this);
- return;
- }
-
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children,
- AFR_ENTRY_TRANSACTION, &subvol_status,
- _gf_true);
- if ((subvol_status & ALL_FOOLS) ||
- (subvol_status & SPLIT_BRAIN)) {
- gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative "
- "merge", sh->parent_loc.path);
- afr_mark_success_children_sources (sh->sources,
- sh->success_children,
- priv->child_count);
- } else if (nsources < 0) {
- gf_log (this->name, GF_LOG_ERROR, "No sources for dir "
- "of %s, in missing entry self-heal, aborting "
- "self-heal", local->loc.path);
- op_errno = EIO;
- goto out;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
- if (source == -1) {
- GF_ASSERT (0);
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- op_errno = EIO;
- goto out;
- }
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_parent_dirs, priv->child_count);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_children_lookup_done, NULL, 0,
- NULL);
- return;
+ linked_inode = inode_link (inode, NULL, NULL, iatt);
-out:
- afr_sh_set_error (sh, op_errno);
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ uuid_copy (inode->gfid, iatt->ia_gfid);
+ inode->ia_type = iatt->ia_type;
-void
-afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- memset (&sh->buf[i], 0, sizeof (sh->buf[i]));
- memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i]));
- sh->child_errno[i] = 0;
- }
- memset (&sh->parentbuf, 0, sizeof (sh->parentbuf));
- sh->success_count = 0;
- afr_reset_children (sh->success_children, child_count);
- afr_reset_children (sh->fresh_children, child_count);
- afr_reset_xattr (sh->xattr, child_count);
- loc_wipe (&sh->lookup_loc);
+ if (linked_inode) {
+ inode_lookup (linked_inode);
+ inode_unref (linked_inode);
+ }
}
-/* afr self-heal state will be lost if this call is made
- * please check the afr_sh_common_reset that is called in this function
+
+/*
+ * This function inspects the looked up replies (in an unlocked manner)
+ * and decides whether a locked verification and possible healing is
+ * required or not. It updates the three booleans for each type
+ * of healing. If the boolean flag gets set to FALSE, then we are sure
+ * no healing is required. If the boolean flag gets set to TRUE then
+ * we have to proceed with locked reinspection.
*/
+
int
-afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- afr_lookup_done_cbk_t lookup_done , uuid_t gfid,
- int32_t flags, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
-
- if (xattr_req) {
- afr_xattr_req_prepare (this, xattr_req, loc->path);
- if (gfid) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s with gfid: %s",
- loc->path, uuid_utoa (gfid));
- GF_ASSERT (!uuid_is_null (gfid));
- afr_set_dict_gfid (xattr_req, gfid);
- }
- }
-
- afr_sh_common_reset (sh, priv->child_count);
- sh->lookup_done = lookup_done;
- loc_copy (&sh->lookup_loc, loc);
- sh->lookup_flags = flags;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on subvolume %s",
- loc->path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame,
- afr_sh_common_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- loc, xattr_req);
-
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
-
- return 0;
-}
+afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, uuid_t gfid,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int valid_cnt = 0;
+ struct iatt first = {0, };
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ replies = alloca0 (sizeof (*replies) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1)
+ continue;
+
+ if (afr_is_data_set (this, replies[i].xdata))
+ *data_selfheal = _gf_true;
+
+ if (afr_is_metadata_set (this, replies[i].xdata))
+ *metadata_selfheal = _gf_true;
+ if (afr_is_entry_set (this, replies[i].xdata))
+ *entry_selfheal = _gf_true;
+ valid_cnt ++;
+ if (valid_cnt == 1) {
+ first = replies[i].poststat;
+ continue;
+ }
-int
-afr_sh_post_nb_entrylk_missing_entry_sh_cbk (call_frame_t *frame,
- xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- sh->op_failed = -1;
- afr_sh_missing_entries_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &sh->parent_loc,
- afr_sh_find_fresh_parents,
- NULL, AFR_LOOKUP_FAIL_CONFLICTS,
- NULL);
- }
-
- return 0;
-}
+ if (!IA_EQUAL (first, replies[i].poststat, type)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "TYPE mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_type,
+ (int) replies[i].poststat.ia_type,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ return -EIO;
+ }
-int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *base_name, afr_lock_cbk_t lock_cbk)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ if (!IA_EQUAL (first, replies[i].poststat, uid)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "UID mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_uid,
+ (int) replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+ *metadata_selfheal = _gf_true;
+ }
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK;
+ if (!IA_EQUAL (first, replies[i].poststat, gid)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "GID mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_uid,
+ (int) replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
- afr_set_lock_number (frame, this);
+ *metadata_selfheal = _gf_true;
+ }
- int_lock->lk_basename = base_name;
- int_lock->lk_loc = loc;
- int_lock->lock_cbk = lock_cbk;
+ if (!IA_EQUAL (first, replies[i].poststat, prot)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "MODE mismatch %d vs %d on %s for gfid:%s",
+ (int) st_mode_from_ia (first.ia_prot, 0),
+ (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0),
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
- int_lock->lockee_count = 0;
- afr_init_entry_lockee (&int_lock->lockee[0], local, loc,
- base_name, priv->child_count);
- int_lock->lockee_count++;
- afr_nonblocking_entrylk (frame, this);
+ *metadata_selfheal = _gf_true;
+ }
- return 0;
-}
+ if (IA_ISREG(first.ia_type) &&
+ !IA_EQUAL (first, replies[i].poststat, size)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "SIZE mismatch %lld vs %lld on %s for gfid:%s",
+ (long long) first.ia_size,
+ (long long) replies[i].poststat.ia_size,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
-static int
-afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
- int32_t op_errno = 0;
-
- local = frame->local;
- sh = &local->self_heal;
-
- gf_log (this->name, GF_LOG_TRACE,
- "attempting to recreate missing entries for path=%s",
- local->loc.path);
-
- ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno);
- if (ret)
- goto out;
-
- afr_sh_entrylk (frame, this, &sh->parent_loc, NULL,
- lock_cbk);
- return 0;
-out:
- int_lock = &local->internal_lock;
- int_lock->lock_op_ret = -1;
- lock_cbk (frame, this);
- return 0;
-}
+ *data_selfheal = _gf_true;
+ }
+ }
-static int
-afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_missing_entry_sh_cbk);
- return 0;
+ if (valid_cnt > 0)
+ afr_inode_link (inode, &first);
+
+ if (valid_cnt < 2)
+ return -ENOTCONN;
+
+ return 0;
}
-afr_local_t*
-afr_local_copy (afr_local_t *l, xlator_t *this)
+
+inode_t *
+afr_inode_find (xlator_t *this, uuid_t gfid)
{
- afr_private_t *priv = NULL;
- afr_local_t *lc = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *shc = NULL;
- int i = 0;
-
- priv = this->private;
-
- sh = &l->self_heal;
-
- lc = mem_get0 (this->local_pool);
- if (!lc)
- goto out;
-
- shc = &lc->self_heal;
-
- shc->unwind = sh->unwind;
- shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk;
- shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal;
- shc->do_gfid_self_heal = sh->do_gfid_self_heal;
- shc->do_data_self_heal = sh->do_data_self_heal;
- shc->do_metadata_self_heal = sh->do_metadata_self_heal;
- shc->do_entry_self_heal = sh->do_entry_self_heal;
- shc->force_confirm_spb = sh->force_confirm_spb;
- shc->forced_merge = sh->forced_merge;
- shc->background = sh->background;
- shc->type = sh->type;
-
- uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req);
- if (l->loc.path)
- loc_copy (&lc->loc, &l->loc);
-
- lc->child_up = memdup (l->child_up,
- sizeof (*lc->child_up) * priv->child_count);
- if (l->xattr_req)
- lc->xattr_req = dict_ref (l->xattr_req);
-
- if (l->cont.lookup.inode)
- lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode);
- if (l->cont.lookup.xattr)
- lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr);
- if (l->internal_lock.inode_locked_nodes)
- lc->internal_lock.inode_locked_nodes =
- memdup (l->internal_lock.inode_locked_nodes,
- sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count);
- else
- lc->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- if (l->internal_lock.locked_nodes)
- lc->internal_lock.locked_nodes =
- memdup (l->internal_lock.locked_nodes,
- sizeof (*lc->internal_lock.locked_nodes) * priv->child_count);
- else
- lc->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- for (i = 0; i < l->internal_lock.lockee_count; i++) {
- loc_copy (&lc->internal_lock.lockee[i].loc,
- &l->internal_lock.lockee[i].loc);
-
- lc->internal_lock.lockee[i].locked_count =
- l->internal_lock.lockee[i].locked_count;
-
- if (l->internal_lock.lockee[i].basename)
- lc->internal_lock.lockee[i].basename =
- gf_strdup (l->internal_lock.lockee[i].basename);
-
- if (l->internal_lock.lockee[i].locked_nodes) {
- lc->internal_lock.lockee[i].locked_nodes =
- memdup (l->internal_lock.lockee[i].locked_nodes,
- sizeof (*lc->internal_lock.lockee[i].locked_nodes) *
- priv->child_count);
- } else {
- lc->internal_lock.lockee[i].locked_nodes =
- GF_CALLOC (priv->child_count,
- sizeof (*lc->internal_lock.lockee[i].locked_nodes),
- gf_afr_mt_char);
- }
-
- }
- lc->internal_lock.lockee_count = l->internal_lock.lockee_count;
-
- lc->internal_lock.inodelk_lock_count =
- l->internal_lock.inodelk_lock_count;
- lc->internal_lock.entrylk_lock_count =
- l->internal_lock.entrylk_lock_count;
+ inode_table_t *table = NULL;
+ inode_t *inode = NULL;
+ table = this->itable;
+ if (!table)
+ return NULL;
-out:
- return lc;
-}
+ inode = inode_find (table, gfid);
+ if (inode)
+ return inode;
-int
-afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_local_t * orig_frame_local = NULL;
- afr_self_heal_t * orig_frame_sh = NULL;
- char sh_type_str[256] = {0,};
-
- priv = this->private;
- local = bgsh_frame->local;
- sh = &local->self_heal;
-
- if (local->govinda_gOvinda) {
- afr_set_split_brain (this, sh->inode, SPB, SPB);
- sh->op_failed = 1;
- }
-
- afr_self_heal_type_str_get (sh, sh_type_str,
- sizeof(sh_type_str));
- if (sh->op_failed) {
- gf_loglevel_t loglevel = GF_LOG_ERROR;
- if (priv->shd.iamshd)
- loglevel = GF_LOG_DEBUG;
-
- gf_log (this->name, loglevel, "background %s self-heal "
- "failed on %s", sh_type_str, local->loc.path);
-
- } else {
- gf_log (this->name, GF_LOG_DEBUG, "background %s self-heal "
- "completed on %s", sh_type_str, local->loc.path);
-
- }
-
- FRAME_SU_UNDO (bgsh_frame, afr_local_t);
-
- if (!sh->unwound && sh->unwind) {
- orig_frame_local = sh->orig_frame->local;
- orig_frame_sh = &orig_frame_local->self_heal;
- orig_frame_sh->actual_sh_started = _gf_true;
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
- }
-
- if (sh->background) {
- LOCK (&priv->lock);
- {
- priv->background_self_heals_started--;
- }
- UNLOCK (&priv->lock);
- }
-
- AFR_STACK_DESTROY (bgsh_frame);
-
- return 0;
-}
+ inode = inode_new (table);
+ if (!inode)
+ return NULL;
-int
-afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t op_errno = 0;
- int ret = 0;
- afr_self_heal_t *orig_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t *sh_local = NULL;
- loc_t *loc = NULL;
-
- local = frame->local;
- orig_sh = &local->self_heal;
- priv = this->private;
-
- GF_ASSERT (local->loc.path);
-
- gf_log (this->name, GF_LOG_TRACE,
- "performing self heal on %s (metadata=%d data=%d entry=%d)",
- local->loc.path,
- local->self_heal.do_metadata_self_heal,
- local->self_heal.do_data_self_heal,
- local->self_heal.do_entry_self_heal);
-
- op_errno = ENOMEM;
- sh_frame = copy_frame (frame);
- if (!sh_frame)
- goto out;
- afr_set_lk_owner (sh_frame, this, sh_frame->root);
- afr_set_low_priority (sh_frame);
-
- sh_local = afr_local_copy (local, this);
- if (!sh_local)
- goto out;
- sh_frame->local = sh_local;
- sh = &sh_local->self_heal;
-
- sh->inode = inode_ref (inode);
- sh->orig_frame = frame;
-
- sh->completion_cbk = afr_self_heal_completion_cbk;
-
- sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success),
- gf_afr_mt_char);
- if (!sh->success)
- goto out;
- sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count,
- gf_afr_mt_int);
- if (!sh->sources)
- goto out;
- sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes),
- priv->child_count,
- gf_afr_mt_int);
- if (!sh->locked_nodes)
- goto out;
-
- sh->pending_matrix = afr_matrix_create (priv->child_count,
- priv->child_count);
- if (!sh->pending_matrix)
- goto out;
-
- sh->delta_matrix = afr_matrix_create (priv->child_count,
- priv->child_count);
- if (!sh->delta_matrix)
- goto out;
-
- sh->fresh_parent_dirs = afr_children_create (priv->child_count);
- if (!sh->fresh_parent_dirs)
- goto out;
- ret = afr_sh_common_create (sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
-
- if (local->self_heal.background) {
- LOCK (&priv->lock);
- {
- if (priv->background_self_heals_started
- < priv->background_self_heal_count) {
- priv->background_self_heals_started++;
-
-
- } else {
- local->self_heal.background = _gf_false;
- sh->background = _gf_false;
- }
- }
- UNLOCK (&priv->lock);
- }
-
- if (!local->loc.parent) {
- sh->do_missing_entry_self_heal = _gf_false;
- sh->do_gfid_self_heal = _gf_false;
- }
-
- FRAME_SU_DO (sh_frame, afr_local_t);
- if (sh->do_missing_entry_self_heal || sh->do_gfid_self_heal) {
- afr_self_heal_missing_entries (sh_frame, this);
- } else {
- loc = &sh_local->loc;
- if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) {
- if (!uuid_is_null (inode->gfid))
- GF_ASSERT (!uuid_compare (inode->gfid,
- sh->sh_gfid_req));
- uuid_copy (loc->gfid, sh->sh_gfid_req);
- }
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
-
- afr_sh_missing_entries_done (sh_frame, this);
- }
- op_errno = 0;
+ uuid_copy (inode->gfid, gfid);
-out:
- if (op_errno) {
- orig_sh->unwind (frame, this, -1, op_errno, 1);
- if (sh_frame)
- AFR_STACK_DESTROY (sh_frame);
- }
- return 0;
+ return inode;
}
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size)
+
+call_frame_t *
+afr_frame_create (xlator_t *this)
{
- GF_ASSERT (str && (size > strlen (" missing-entry gfid "
- "meta-data data entry")));
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+ pid_t pid = -1;
- if (self_heal_p->do_metadata_self_heal) {
- snprintf (str, size, " meta-data");
- }
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ return NULL;
- if (self_heal_p->do_data_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " data");
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local) {
+ STACK_DESTROY (frame->root);
+ return NULL;
+ }
- if (self_heal_p->do_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " entry");
- }
+ syncopctx_setfspid (&pid);
- if (self_heal_p->do_missing_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str),
- " missing-entry");
- }
+ frame->root->pid = pid;
- if (self_heal_p->do_gfid_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " gfid");
- }
-}
+ afr_set_lk_owner (frame, this, frame->root);
-afr_self_heal_type
-afr_self_heal_type_for_transaction (afr_transaction_type type)
-{
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
-
- switch (type) {
- case AFR_DATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_DATA;
- break;
- case AFR_METADATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_METADATA;
- break;
- case AFR_ENTRY_TRANSACTION:
- sh_type = AFR_SELF_HEAL_ENTRY;
- break;
- case AFR_ENTRY_RENAME_TRANSACTION:
- GF_ASSERT (0);
- break;
- }
- return sh_type;
+ return frame;
}
-int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
-{
- int ret = -1;
- uuid_t pargfid = {0};
- if (!child)
- goto out;
+/*
+ * This is the entry point for healing a given GFID
+ */
- if (!uuid_is_null (parent->inode->gfid))
- uuid_copy (pargfid, parent->inode->gfid);
- else if (!uuid_is_null (parent->gfid))
- uuid_copy (pargfid, parent->gfid);
+int
+afr_selfheal (xlator_t *this, uuid_t gfid)
+{
+ inode_t *inode = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
- if (uuid_is_null (pargfid))
- goto out;
+ inode = afr_inode_find (this, gfid);
+ if (!inode)
+ goto out;
- if (strcmp (parent->path, "/") == 0)
- ret = gf_asprintf ((char **)&child->path, "/%s", name);
- else
- ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
- name);
+ frame = afr_frame_create (this);
+ if (!frame)
+ goto out;
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed while setting child path");
- }
+ ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid,
+ &data_selfheal,
+ &metadata_selfheal,
+ &entry_selfheal);
+ if (ret)
+ goto out;
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
+ if (data_selfheal)
+ afr_selfheal_data (frame, this, inode);
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
- uuid_copy (child->pargfid, pargfid);
+ if (metadata_selfheal)
+ afr_selfheal_metadata (frame, this, inode);
- if (!child->inode) {
- ret = -1;
- goto out;
- }
+ if (entry_selfheal)
+ afr_selfheal_entry (frame, this, inode);
- ret = 0;
+ inode_forget (inode, 1);
out:
- if ((ret == -1) && child)
- loc_wipe (child);
-
- return ret;
-}
+ if (inode)
+ inode_unref (inode);
+ if (frame)
+ AFR_STACK_DESTROY (frame);
-int
-afr_sh_erase_pending (call_frame_t *frame, xlator_t *this,
- afr_transaction_type type, afr_fxattrop_cbk_t cbk,
- int (*finish)(call_frame_t *frame, xlator_t *this))
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
- int ret = -1;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix,
- sh->success, priv->child_count, type);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
- if (!erase_xattr)
- goto out;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
- erase_xattr[i] = dict_new ();
- if (!erase_xattr[i])
- goto out;
- }
- }
-
- afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,
- priv->child_count, type);
-
- gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s",
- lkowner_utoa (&frame->root->lk_owner));
- afr_sh_print_pending_matrix (sh->delta_matrix, this);
- local->call_count = call_count;
- if (call_count == 0) {
- ret = 0;
- finish (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction
- STACK_WIND_COOKIE (frame, cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i],
- NULL);
- } else {
- STACK_WIND_COOKIE (frame, cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i],
- NULL);
- }
- }
-
- ret = 0;
-out:
- if (erase_xattr) {
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- }
-
- GF_FREE (erase_xattr);
-
- if (ret < 0) {
- sh->op_failed = _gf_true;
- finish (frame, this);
- }
-
- return 0;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
deleted file mode 100644
index 035fce543..000000000
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __AFR_SELF_HEAL_COMMON_H__
-#define __AFR_SELF_HEAL_COMMON_H__
-
-#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512))
-#define AFR_SH_MIN_PARTICIPANTS 2
-
-typedef enum {
- AFR_SELF_HEAL_ENTRY,
- AFR_SELF_HEAL_METADATA,
- AFR_SELF_HEAL_DATA,
- AFR_SELF_HEAL_INVALID = -1,
-} afr_self_heal_type;
-
-typedef enum {
- AFR_LOOKUP_FAIL_CONFLICTS = 1,
- AFR_LOOKUP_FAIL_MISSING_GFIDS = 2,
-} afr_lookup_flags_t;
-
-int
-afr_sh_select_source (int sources[], int child_count);
-
-int
-afr_sh_source_count (int sources[], int child_count);
-
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
-
-void
-afr_sh_print_split_brain_log (int32_t *pending_matrix[], xlator_t *this,
- const char *loc);
-
-int
-afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- dict_t *xattr[], afr_transaction_type type,
- size_t child_count);
-
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], unsigned char success[],
- int child_count, afr_transaction_type type);
-
-int
-afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,
- struct iatt *bufs, afr_self_heal_type type,
- int32_t *success_children, int32_t *subvol_status);
-
-int
-afr_sh_delta_to_xattr (xlator_t *this,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
-
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size);
-
-afr_self_heal_type
-afr_self_heal_type_for_transaction (afr_transaction_type type);
-
-int
-afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs,
- int32_t **pending_matrix, int32_t *sources,
- int32_t *success_children, afr_transaction_type type,
- int32_t *subvol_status, gf_boolean_t ignore_ignorant);
-void
-afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count);
-
-void
-afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent,
- loc_t *loc);
-
-int
-afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid,
- int32_t flags, dict_t *xdata);
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct iatt *buf,
- struct iatt *parentbuf);
-int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *base_name, afr_lock_cbk_t lock_cbk);
-int
-afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
- int child_index);
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk);
-afr_local_t *
-afr_local_copy (afr_local_t *l, xlator_t *this);
-int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
- off_t start, off_t len, gf_boolean_t block,
- afr_lock_cbk_t success_handler,
- afr_lock_cbk_t failure_handler);
-void
-afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno);
-void
-afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this);
-typedef int
-(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata);
-int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name);
-int
-afr_impunge_frame_create (call_frame_t *frame, xlator_t *this,
- int active_source, call_frame_t **impunge_frame);
-void
-afr_sh_reset (call_frame_t *frame, xlator_t *this);
-
-void
-afr_children_intersection_get (int32_t *set1, int32_t *set2,
- int *intersection, unsigned int child_count);
-int
-afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children,
- struct iatt *bufs);
-int
-afr_sh_erase_pending (call_frame_t *frame, xlator_t *this,
- afr_transaction_type type, afr_fxattrop_cbk_t cbk,
- int (*finish)(call_frame_t *frame, xlator_t *this));
-#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 2f63ed27d..c0548d995 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,1496 +8,628 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-int
-afr_sh_data_fail (call_frame_t *frame, xlator_t *this);
-
-static inline gf_boolean_t
-afr_sh_data_proceed (unsigned int success_count)
-{
- return (success_count >= AFR_SH_MIN_PARTICIPANTS);
-}
-
-extern int
-sh_loop_finish (call_frame_t *loop_frame, xlator_t *this);
-
-int
-afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this);
+#include "byte-order.h"
-int
-afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this);
+enum {
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+};
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this);
-int
-afr_sh_data_done (call_frame_t *frame, xlator_t *this)
+#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
+static int
+__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, uint32_t weak, uint8_t *strong,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ afr_local_t *local = NULL;
+ int i = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
+ local = frame->local;
- sh->completion_cbk (frame, this);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (strong)
+ memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
- return 0;
+ syncbarrier_wake (&local->barrier);
+ return 0;
}
-int
-afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int
+attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "flush failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
+ int i = (long) cookie;
+ afr_local_t *local = NULL;
- call_count = afr_frame_return (frame);
+ local = frame->local;
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- }
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref (xdata);
- return 0;
-}
-
-int
-afr_sh_data_close (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (!sh->healing_fd) {
- //This happens when file is non-reg
- afr_sh_data_done (frame, this);
- return 0;
- }
- call_count = afr_set_elem_count_get (sh->success,
- priv->child_count);
- local->call_count = call_count;
+ syncbarrier_wake (&local->barrier);
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- return 0;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i])
- continue;
- gf_log (this->name, GF_LOG_DEBUG,
- "closing fd of %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- sh->healing_fd, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
+ return 0;
}
-int
-afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre,
- struct iatt *statpost, dict_t *xdata)
-{
-
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "setattr failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_finish (frame, this);
- }
-
- return 0;
-}
-int
-afr_sh_data_setattr (call_frame_t *frame, xlator_t *this, struct iatt* stbuf)
+static gf_boolean_t
+__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int source,
+ unsigned char *healed_sinks,
+ off_t offset, size_t size)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
- int32_t valid = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- valid = (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME);
-
- call_count = afr_set_elem_count_get (sh->success,
- priv->child_count);
- local->call_count = call_count;
-
- if (call_count == 0) {
- GF_ASSERT (0);
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i])
- continue;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_subvols = NULL;
+ int i = 0;
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, stbuf, valid, NULL);
+ priv = this->private;
+ local = frame->local;
- if (!--call_count)
- break;
- }
+ wind_subvols = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || healed_sinks[i])
+ wind_subvols[i] = 1;
+ }
- return 0;
-}
+ AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd,
+ offset, size, NULL);
-int
-afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->source == child_index);
- if (op_ret != -1) {
- sh->buf[child_index] = *buf;
- afr_sh_data_setattr (frame, this, buf);
- } else {
- gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
- "time-stamps after self-heal", local->loc.path);
- afr_sh_data_fail (frame, this);
- }
+ if (!local->replies[source].valid || local->replies[source].op_ret != 0)
+ return _gf_false;
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+ if (memcmp (local->replies[source].checksum,
+ local->replies[i].checksum,
+ MD5_DIGEST_LENGTH))
+ return _gf_false;
+ }
-/*
- * If there are any writes after the self-heal is triggered then the
- * stbuf stored in local->self_heal.buf[] will be invalid so we do one more
- * stat on the source and then set the [am]times
- */
-int
-afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->fstat,
- sh->healing_fd, NULL);
- return 0;
+ return _gf_true;
}
-//Fun fact, lock_cbk is being used for both lock & unlock
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
- GF_ASSERT (sh->data_lock_held);
-
- sh->data_lock_held = _gf_false;
- int_lock->lock_cbk = lock_cbk;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
+ struct iovec *iovec = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing data selfheal of %s", local->loc.path);
+ priv = this->private;
- if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
- else
- afr_sh_data_close (frame, this);
+ ret = syncop_readv (priv->children[source], fd, size, offset, 0,
+ &iovec, &count, &iobref);
+ if (ret <= 0)
+ return ret;
- return 0;
-}
-
-int
-afr_sh_data_fail (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing failed data selfheal of %s", local->loc.path);
+ /*
+ * TODO: Use fiemap() and discard() to heal holes
+ * in the future.
+ *
+ * For now,
+ *
+ * - if the source had any holes at all,
+ * AND
+ * - if we are writing past the original file size
+ * of the sink
+ * AND
+ * - if this is NOT the last block of the source file. If
+ * the block contains EOF, it has to be written
+ * in order to set the file size even if the
+ * last block is 0-filled.
+ * AND
+ * - if the read buffer is filled with only 0's
+ *
+ * then, skip writing to this sink. We don't depend
+ * on the write to happen to update the size as we
+ * have performed an ftruncate() upfront anyways.
+ */
+#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b)))
+ if (HAS_HOLES ((&replies[source].poststat)) &&
+ offset >= replies[i].poststat.ia_size &&
+ !is_last_block (offset, size,
+ replies[source].poststat.ia_size) &&
+ (iov_0filled (iovec, count) == 0))
+ continue;
+
+ ret = syncop_writev (priv->children[i], fd, iovec, count,
+ offset, iobref, 0);
+ if (ret != iov_length (iovec, count)) {
+ /* write() failed on this sink. Unset the corresponding
+ member in sinks[] (which is healed_sinks[] in the
+ caller) so that this server does NOT get considered
+ as successfully healed.
+ */
+ healed_sinks[i] = 0;
+ }
+ }
+ if (iobref)
+ iobref_unref (iobref);
- sh->op_failed = 1;
- if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
- else
- afr_sh_data_close (frame, this);
- return 0;
+ return ret;
}
-int
-afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr, dict_t *xdata)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change "
- "log failed on %s for subvol %s, reason: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- if (sh->old_loop_frame)
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- afr_sh_data_fail (frame, this);
- goto out;
- }
- if (!IA_ISREG (sh->type)) {
- afr_sh_data_finish (frame, this);
- goto out;
- }
- GF_ASSERT (sh->old_loop_frame);
- afr_sh_data_lock (frame, this, 0, 0, _gf_true,
- afr_post_sh_big_lock_success,
- afr_post_sh_big_lock_failure);
- }
-out:
- return 0;
-}
-int
-afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION,
- afr_sh_data_erase_pending_cbk,
- afr_sh_data_finish);
- return 0;
-}
+static int
+afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks, off_t offset,
+ size_t size, int type, struct afr_reply *replies)
+{
+ int ret = -1;
+ int sink_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *data_lock = NULL;
+
+ priv = this->private;
+ sink_count = AFR_COUNT (healed_sinks, priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
+ {
+ if (ret < sink_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
-int
-afr_sh_data_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *pre,
- struct iatt *post, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "%s: Failed to fsync on "
- "%s - %s", local->loc.path,
- priv->children[child_index]->name, strerror (op_errno));
- LOCK (&frame->lock);
- {
- sh->op_failed = 1;
- }
- UNLOCK (&frame->lock);
- if (sh->old_loop_frame)
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- }
+ if (type == AFR_SELFHEAL_DATA_DIFF &&
+ __afr_selfheal_data_checksums_match (frame, this, fd, source,
+ healed_sinks, offset, size)) {
+ ret = 0;
+ goto unlock;
+ }
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (sh->op_failed)
- afr_sh_data_fail (frame, this);
- else
- afr_sh_data_erase_pending (frame, this);
- }
- return 0;
+ ret = __afr_selfheal_data_read_write (frame, this, fd, source,
+ healed_sinks, offset, size,
+ replies);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
+ return ret;
}
-/*
- * Before erasing xattrs, make sure the data is written to disk
- */
-int
-afr_sh_data_fsync (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- call_count = sh->active_sinks;
- if (call_count == 0) {
- afr_sh_data_erase_pending (frame, this);
- return 0;
- }
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i] || sh->sources[i])
- continue;
- STACK_WIND_COOKIE (frame, afr_sh_data_fsync_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->fsync,
- sh->healing_fd, 1, NULL);
- }
-
- return 0;
-}
-
-static struct afr_sh_algorithm *
-sh_algo_from_name (xlator_t *this, char *name)
+static int
+afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks)
{
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- if (name == NULL)
- goto out;
+ local = frame->local;
+ priv = this->private;
- while (afr_self_heal_algorithms[i].name) {
- if (!strcmp (name, afr_self_heal_algorithms[i].name)) {
- return &afr_self_heal_algorithms[i];
- }
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL);
- i++;
- }
-
-out:
- return NULL;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret != 0)
+ /* fsync() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
static int
-sh_zero_byte_files_exist (afr_local_t *local, int child_count)
+afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- int i = 0;
- int ret = 0;
- afr_self_heal_t *sh = NULL;
-
- sh = &local->self_heal;
- for (i = 0; i < child_count; i++) {
- if (!local->child_up[i] || sh->child_errno[i])
- continue;
- if (sh->buf[i].ia_size == 0) {
- ret = 1;
- break;
- }
- }
+ loc_t loc = {0, };
- return ret;
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL);
-struct afr_sh_algorithm *
-afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- struct afr_sh_algorithm * algo = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- algo = sh_algo_from_name (this, priv->data_self_heal_algorithm);
-
- if (algo == NULL) {
- /* option not set, so fall back on heuristics */
-
- if (sh_zero_byte_files_exist (local, priv->child_count)
- || (sh->file_size <= (priv->data_self_heal_window_size *
- this->ctx->page_size))) {
-
- /*
- * If the file does not exist on one of the subvolumes,
- * or a zero-byte file exists (created by entry self-heal)
- * the entire content has to be copied anyway, so there
- * is no benefit from using the "diff" algorithm.
- *
- * If the file size is about the same as page size,
- * the entire file can be read and written with a few
- * (pipelined) STACK_WINDs, which will be faster
- * than "diff" which has to read checksums and then
- * read and write.
- */
-
- algo = sh_algo_from_name (this, "full");
-
- } else {
- algo = sh_algo_from_name (this, "diff");
- }
- }
+ loc_wipe (&loc);
- return algo;
+ return 0;
}
-
-int
-afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
+static int
+afr_data_self_heal_type_get (afr_private_t *priv, unsigned char *healed_sinks,
+ int source, struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- struct afr_sh_algorithm *sh_algo = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->algo_completion_cbk = afr_sh_data_fsync;
- sh->algo_abort_cbk = afr_sh_data_fail;
-
- sh_algo = afr_sh_data_pick_algo (frame, this);
-
- sh->algo = sh_algo;
- sh_algo->fn (frame, this);
-
- return 0;
-}
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int i = 0;
-int
-afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int call_count = 0;
- int child_index = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "ftruncate of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "ftruncate of %s on subvolume %s completed",
- local->loc.path,
- priv->children[child_index]->name);
+ if (priv->data_self_heal_algorithm == NULL) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] && i != source)
+ continue;
+ if (replies[i].poststat.ia_size) {
+ type = AFR_SELFHEAL_DATA_DIFF;
+ break;
+ }
}
+ } else if (strcmp (priv->data_self_heal_algorithm, "full") == 0) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ } else if (strcmp (priv->data_self_heal_algorithm, "diff") == 0) {
+ type = AFR_SELFHEAL_DATA_DIFF;
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed)
- afr_sh_data_fail (frame, this);
- else
- afr_sh_data_sync_prepare (frame, this);
- }
-
- return 0;
+ return type;
}
+static int
+afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ off_t off = 0;
+ size_t block = 128 * 1024;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int ret = -1;
+ call_frame_t *iter_frame = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+
+ priv = this->private;
+
+ sinks_str = alloca0 (priv->child_count * 8);
+ p = sinks_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ p += sprintf (p, "%d ", i);
+ }
-int
-afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- int *sources = NULL;
- int call_count = 0;
- int i = 0;
-
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sources = sh->sources;
- call_count = sh->active_sinks;
-
- local->call_count = call_count;
+ gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. "
+ "source=%d sinks=%s",
+ uuid_utoa (fd->inode->gfid), source, sinks_str);
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
+ type = afr_data_self_heal_type_get (priv, healed_sinks, source,
+ replies);
- STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- sh->healing_fd, sh->file_size,
- NULL);
+ iter_frame = afr_copy_frame (frame);
+ if (!iter_frame)
+ return -ENOMEM;
- if (!--call_count)
- break;
- }
-
- return 0;
-}
+ for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ ret = afr_selfheal_data_block (iter_frame, this, fd, source,
+ healed_sinks, off, block, type,
+ replies);
+ if (ret < 0)
+ goto out;
-int
-afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int ret = 0;
- int i = 0;
-
- priv = this->private;
- sh->source = afr_sh_select_source (sh->sources, priv->child_count);
- if (sh->source < 0) {
- ret = -1;
- goto out;
- }
+ AFR_STACK_RESET (iter_frame);
+ }
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == sh->source || sh->child_errno[i])
- continue;
+ afr_selfheal_data_restore_time (frame, this, fd->inode, source,
+ healed_sinks, replies);
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source]))
- sh->sources[i] = 0;
- }
+ ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks);
- afr_reset_children (sh->fresh_children, priv->child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
out:
- return ret;
-}
-
-void
-afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
-{
- int source = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- sh->block_size = this->ctx->page_size;
- sh->file_size = sh->buf[source].ia_size;
-
- if (FILE_HAS_HOLES (&sh->buf[source]))
- sh->file_has_holes = 1;
-
- if (sh->background && sh->unwind && !sh->unwound) {
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
- sh->unwound = _gf_true;
- }
-
- afr_sh_mark_source_sinks (frame, this);
- if (sh->active_sinks == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_data_finish (frame, this);
- return;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing file %s from subvolume %s to %d other",
- local->loc.path, priv->children[sh->source]->name,
- sh->active_sinks);
-
- sh->actual_sh_started = _gf_true;
- afr_sh_data_trim_sinks (frame, this);
+ if (iter_frame)
+ AFR_STACK_DESTROY (iter_frame);
+ return ret;
}
-int
-afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int ret = 0;
- int *old_sources = NULL;
- int tstamp_source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s",
- lkowner_utoa (&frame->root->lk_owner));
- if (sh->sync_done) {
- //store sources before sync so that mtime can be set using the
- //iatt buf from one of them.
- old_sources = alloca (priv->child_count*sizeof (*old_sources));
- memcpy (old_sources, sh->sources,
- priv->child_count * sizeof (*old_sources));
- }
-
- nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix,
- sh->sources, sh->success_children,
- AFR_DATA_TRANSACTION, NULL, _gf_true);
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Picking favorite child %s as authentic source to "
- "resolve conflicting data of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- afr_sh_print_split_brain_log (sh->pending_matrix, this,
- local->loc.path);
- afr_set_split_brain (this, sh->inode, DONT_KNOW, SPB);
-
- afr_sh_data_fail (frame, this);
- return 0;
- }
-
- afr_set_split_brain (this, sh->inode, DONT_KNOW, NO_SPB);
-
- ret = afr_sh_inode_set_read_ctx (sh, this);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
- afr_sh_data_fail (frame, this);
- return 0;
- }
-
- if (sh->sync_done) {
- /* Perform setattr from one of the old_sources if possible
- * Because only they have the correct mtime, the new sources
- * (i.e. old sinks) have mtime from last writev in sync.
- */
- tstamp_source = sh->source;
- for (i = 0; i < priv->child_count; i++) {
- if (old_sources[i] && sh->sources[i])
- tstamp_source = i;
- }
- afr_sh_data_setattr (frame, this, &sh->buf[tstamp_source]);
- } else {
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- if (sh->do_data_self_heal &&
- afr_data_self_heal_enabled (priv->data_self_heal))
- afr_sh_data_fix (frame, this);
- else
- afr_sh_data_finish (frame, this);
- }
- return 0;
-}
-
-int
-afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
- dict_t **xattr,
- afr_transaction_type txn_type,
- uuid_t gfid)
-{
- afr_private_t *priv = NULL;
- int read_child = -1;
- int32_t **pending_matrix = NULL;
- int32_t *sources = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- int32_t nsources = 0;
- int32_t prev_read_child = -1;
- int32_t config_read_child = -1;
- int32_t subvol_status = 0;
-
- priv = this->private;
- bufs = local->cont.lookup.bufs;
- success_children = local->cont.lookup.success_children;
-
- pending_matrix = local->cont.lookup.pending_matrix;
- sources = local->cont.lookup.sources;
- memset (sources, 0, sizeof (*sources) * priv->child_count);
-
- nsources = afr_build_sources (this, xattr, bufs, pending_matrix,
- sources, success_children, txn_type,
- &subvol_status, _gf_false);
- if (subvol_status & SPLIT_BRAIN) {
- gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain",
- local->loc.path);
- switch (txn_type) {
- case AFR_DATA_TRANSACTION:
- local->cont.lookup.possible_spb = _gf_true;
- nsources = 1;
- sources[success_children[0]] = 1;
- break;
- case AFR_ENTRY_TRANSACTION:
- read_child = afr_get_no_xattr_dir_read_child (this,
- success_children,
- bufs);
- sources[read_child] = 1;
- nsources = 1;
- break;
- default:
- break;
- }
- }
- if (nsources < 0)
- goto out;
-
- prev_read_child = local->read_child_index;
- config_read_child = priv->read_child;
- read_child = afr_select_read_child_from_policy (success_children,
- priv->child_count,
- prev_read_child,
- config_read_child,
- sources,
- priv->hash_mode, gfid);
-out:
- gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d",
- read_child);
- return read_child;
-}
-
-int
-afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
+static int
+__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, unsigned char *healed_sinks,
+ struct afr_reply *replies, uint64_t size)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fstat of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->buf[child_index] = *buf;
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- } else {
- gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed "
- "on %s, reason %s", local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- /* Previous versions of glusterfs might have set
- * the pending data xattrs which need to be erased
- */
- if (!afr_sh_data_proceed (sh->success_count)) {
- gf_log (this->name, GF_LOG_ERROR, "inspecting metadata "
- "succeeded on < %d children, aborting "
- "self-heal for %s", AFR_SH_MIN_PARTICIPANTS,
- local->loc.path);
- afr_sh_data_fail (frame, this);
- goto out;
- }
- afr_sh_data_fxattrop_fstat_done (frame, this);
- }
-out:
- return 0;
-}
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *larger_sinks = 0;
+ int i = 0;
+ local = frame->local;
+ priv = this->private;
-int
-afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- int child = 0;
- int32_t *fstat_children = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- fstat_children = memdup (sh->success_children,
- sizeof (*fstat_children) * priv->child_count);
- if (!fstat_children) {
- afr_sh_data_fail (frame, this);
- goto out;
- }
- call_count = sh->success_count;
- local->call_count = call_count;
-
- memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count);
- afr_reset_children (sh->success_children, priv->child_count);
- sh->success_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- child = fstat_children[i];
- if (child == -1)
- break;
- STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
- (void *) (long) child,
- priv->children[child],
- priv->children[child]->fops->fstat,
- sh->healing_fd, NULL);
- --call_count;
- }
- GF_ASSERT (!call_count);
-out:
- GF_FREE (fstat_children);
- return 0;
-}
+ larger_sinks = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && replies[i].poststat.ia_size > size)
+ larger_sinks[i] = 1;
+ }
-void
-afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fxattrop of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->xattr[child_index] = dict_ref (xattr);
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- } else {
- gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s "
- "failed on %s, reason %s", local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-}
+ AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL);
-int
-afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata)
-{
- int call_count = -1;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret,
- op_errno, xattr);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (!afr_sh_data_proceed (sh->success_count)) {
- gf_log (this->name, GF_LOG_ERROR, "%s, inspecting "
- "change log succeeded on < %d children",
- local->loc.path, AFR_SH_MIN_PARTICIPANTS);
- afr_sh_data_fail (frame, this);
- goto out;
- }
- afr_sh_data_fstat (frame, this);
- }
-out:
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret == -1)
+ /* truncate() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
-
-int
-afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t **xattr_req;
- int32_t *zero_pending = NULL;
- int call_count = 0;
- int i = 0;
- int ret = 0;
- int j;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (local->child_up,
- priv->child_count);
-
- local->call_count = call_count;
-
- xattr_req = GF_CALLOC(priv->child_count, sizeof(struct dict_t *),
- gf_afr_mt_dict_t);
- if (!xattr_req)
- goto out;
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
+ *
+ * This can only happen if data was directly modified in the backend.
+ */
+static int
+__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int healed_sinks_count = 0;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count);
+
+ if (locked_count == healed_sinks_count || !sources_count) {
+ /* split brain */
+ return -EIO;
+ }
for (i = 0; i < priv->child_count; i++) {
- xattr_req[i] = dict_new();
- if (!xattr_req[i]) {
- ret = -1;
- goto out;
+ if (!sources[i])
+ continue;
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
+ source = i;
}
}
for (i = 0; i < priv->child_count; i++) {
- for (j = 0; j < priv->child_count; j++) {
- zero_pending = GF_CALLOC (3, sizeof (*zero_pending),
- gf_afr_mt_int32_t);
- if (!zero_pending) {
- ret = -1;
- goto out;
- }
- ret = dict_set_dynptr (xattr_req[i], priv->pending_key[j],
- zero_pending,
- 3 * sizeof (*zero_pending));
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value");
- goto out;
- } else {
- zero_pending = NULL;
- }
+ if (!sources[i])
+ continue;
+ if (replies[i].poststat.ia_size < size) {
+ sources[i] = 0;
+ sinks[i] = 1;
}
}
- afr_reset_xattr (sh->xattr, priv->child_count);
- afr_reset_children (sh->success_children, priv->child_count);
- memset (sh->child_errno, 0,
- sizeof (*sh->child_errno) * priv->child_count);
- sh->success_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd, GF_XATTROP_ADD_ARRAY,
- xattr_req[i], NULL);
-
- if (!--call_count)
- break;
- }
- }
-
-out:
- if (xattr_req) {
- for (i = 0; i < priv->child_count; i++)
- if (xattr_req[i])
- dict_unref(xattr_req[i]);
- GF_FREE(xattr_req);
- }
-
- if (ret) {
- GF_FREE (zero_pending);
- afr_sh_data_fail (frame, this);
- }
-
- return 0;
+ return source;
}
-int
-afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this)
+/*
+ * __afr_selfheal_data_prepare:
+ *
+ * This function inspects the on-disk xattrs and determines which subvols
+ * are sources and sinks.
+ *
+ * The return value is the index of the subvolume to be used as the source
+ * for self-healing, or -1 if no healing is necessary/split brain.
+ */
+static int
+__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
+ priv = this->private;
- sh->data_lock_held = _gf_true;
- afr_sh_data_fxattrop (frame, this);
- return 0;
-}
-
-int
-afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
+ ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid,
+ replies);
+ if (ret)
+ return ret;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks "
- "failed for %s. by %s",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_DATA_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
- sh->data_lock_failure_handler (frame, this);
- } else {
+ source = __afr_selfheal_data_finalize_source (this, sources, sinks,
+ healed_sinks, locked_on,
+ replies);
+ if (source < 0)
+ return -EIO;
- gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks "
- "done for %s by %s. Proceding to self-heal",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
- sh->data_lock_success_handler (frame, this);
- }
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
- return 0;
+ return source;
}
-int
-afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks "
- "failed for %s. by %s",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
-
- if (!sh->data_lock_block) {
- sh->data_lock_failure_handler(frame, this);
- } else {
- int_lock->lock_cbk =
- afr_sh_data_post_blocking_inodelk_cbk;
- afr_blocking_lock (frame, this);
- }
- } else {
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks "
- "done for %s by %s. Proceeding to self-heal",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
- sh->data_lock_success_handler (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK;
-
- afr_set_lock_number (frame, this);
-
- int_lock->lk_flock.l_start = start;
- int_lock->lk_flock.l_len = len;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk;
-
- afr_nonblocking_inodelk (frame, this);
-
- return 0;
-}
-
-int
-afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->old_loop_frame);
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- sh->data_lock_held = _gf_true;
- sh->sync_done = _gf_true;
- afr_sh_data_fxattrop (frame, this);
- return 0;
-}
+static int
+__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+ gf_boolean_t compat = _gf_false;
+ unsigned char *compat_lock = NULL;
+
+ priv = this->private;
+
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+ compat_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
-int
-afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies);
+ if (ret < 0)
+ goto unlock;
+
+ source = ret;
+
+ ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
+ locked_replies,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
+ ret = 0;
+
+ /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for
+ compatibility with older self-heal clients which do not
+ hold a lock in the @priv->sh_domain domain to guard
+ against concurrent ongoing self-heals
+ */
+ afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ compat = _gf_true;
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ if (ret < 0)
+ goto out;
- local = frame->local;
- sh = &local->self_heal;
+ ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
- GF_ASSERT (sh->old_loop_frame);
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- afr_sh_set_timestamps (frame, this);
- return 0;
+ ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks,
+ healed_sinks, AFR_DATA_TRANSACTION,
+ locked_replies, data_lock);
+out:
+ if (compat)
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ return ret;
}
-int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
- off_t start, off_t len, gf_boolean_t block,
- afr_lock_cbk_t success_handler,
- afr_lock_cbk_t failure_handler)
-{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->data_lock_success_handler = success_handler;
- sh->data_lock_failure_handler = failure_handler;
- sh->data_lock_block = block;
- return afr_sh_data_lock_rec (frame, this, start, len);
-}
-
-int
-afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+static fd_t *
+afr_selfheal_data_open (xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
- gf_boolean_t block = _gf_true;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "open of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "open of %s succeeded on child %s",
- local->loc.path,
- priv->children[child_index]->name);
- }
- }
- UNLOCK (&frame->lock);
+ loc_t loc = {0,};
+ int ret = 0;
+ fd_t *fd = NULL;
- call_count = afr_frame_return (frame);
+ fd = fd_create (inode, 0);
+ if (!fd)
+ return NULL;
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_data_fail (frame, this);
- return 0;
- }
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
+ ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ }
- /*
- * The read and write self-heal trigger codepaths do not provide
- * an unwind callback. We run a trylock in these codepaths
- * because we are sensitive to locking latency.
- */
- block = sh->unwind ? _gf_true : _gf_false;
- afr_sh_data_lock (frame, this, 0, 0, block,
- afr_sh_data_big_lock_success,
- afr_sh_data_fail);
- }
+ loc_wipe (&loc);
- return 0;
+ return fd;
}
int
-afr_sh_data_open (call_frame_t *frame, xlator_t *this)
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- int i = 0;
- int call_count = 0;
- fd_t *fd = NULL;
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if(!local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc,
- O_RDWR|O_LARGEFILE, fd, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-void
-afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- int i = 0;
-
- if (op_ret < 0) {
- afr_sh_data_fail (frame, this);
- return;
- }
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+ fd_t *fd = NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count ; i++) {
- if (1 == local->child_up[i])
- sh->success[i] = 1;
- }
+ fd = afr_selfheal_data_open (this, inode);
+ if (!fd)
+ return -EIO;
- afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION,
- afr_sh_data_erase_pending_cbk,
- afr_sh_data_finish);
-}
+ locked_on = alloca0 (priv->child_count);
-int
-afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- sh->data_lock_held = _gf_true;
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_non_reg_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- return 0;
-}
+ ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
-gf_boolean_t
-afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
-{
- if (sh->force_confirm_spb)
- return _gf_true;
- if (sh->do_data_self_heal &&
- afr_data_self_heal_enabled (priv->data_self_heal))
- return _gf_true;
- return _gf_false;
-}
+ ret = __afr_selfheal_data (frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on);
-int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (afr_can_start_data_self_heal (sh, priv)) {
- if (IA_ISREG (sh->type)) {
- afr_sh_data_open (frame, this);
- } else {
- afr_sh_data_lock (frame, this, 0, 0, _gf_true,
- afr_sh_non_reg_lock_success,
- afr_sh_data_fail);
- }
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "not doing data self heal on %s",
- local->loc.path);
- afr_sh_data_done (frame, this);
- }
+ if (fd)
+ fd_unref (fd);
- return 0;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index c3c9f9fca..9e714b026 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,2327 +8,622 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
-#include "inode.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include "afr-self-heal.h"
#include "byte-order.h"
-
#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\
- do {\
- _local = _frame->local;\
- _sh = &_local->self_heal;\
- _sh_frame = _sh->sh_frame;\
- _sh_local = _sh_frame->local;\
- _sh_sh = &_sh_local->self_heal;\
- } while (0);
-
-int
-afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this,
- int child_index);
-int
-afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->completion_cbk (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_entry_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "finishing entry selfheal of %s", local->loc.path);
-
- afr_sh_entry_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr, dict_t *xdata)
-{
- long i = 0;
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *orig_local = NULL;
- call_frame_t *orig_frame = NULL;
- afr_private_t *priv = NULL;
- int32_t read_child = -1;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- i = (long)cookie;
-
-
- afr_children_add_child (sh->fresh_children, i, priv->child_count);
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to erase pending xattrs on %s (%s)",
- local->loc.path, priv->children[i]->name,
- strerror (op_errno));
- }
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->source == -1) {
- //this happens if the forced merge option is set
- read_child = sh->fresh_children[0];
- } else {
- read_child = sh->source;
- }
- afr_inode_set_read_ctx (this, sh->inode, read_child,
- sh->fresh_children);
- orig_frame = sh->orig_frame;
- orig_local = orig_frame->local;
-
- if (sh->source != -1) {
- orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink;
- }
-
- afr_sh_entry_finish (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (sh->entries_skipped) {
- sh->op_failed = _gf_true;
- goto out;
- }
- afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION,
- afr_sh_entry_erase_pending_cbk,
- afr_sh_entry_finish);
- return 0;
-out:
- afr_sh_entry_finish (frame, this);
- return 0;
-}
-
-
-
-static int
-next_active_source (call_frame_t *frame, xlator_t *this,
- int current_active_source)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int source = -1;
- int next_active_source = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- source = sh->source;
-
- if (source != -1) {
- if (current_active_source != source)
- next_active_source = source;
- goto out;
- }
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_source)) {
-
- next_active_source = i;
- break;
- }
- }
-out:
- return next_active_source;
-}
-
static int
-next_active_sink (call_frame_t *frame, xlator_t *this,
- int current_active_sink)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int next_active_sink = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_sink)) {
-
- next_active_sink = i;
- break;
- }
- }
-
- return next_active_sink;
-}
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src, int32_t op_ret,
- int32_t op_errno)
-{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop,
- dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = (long) cookie;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "setattr on parent directory of %s on subvolume %s failed: %s",
- expunge_local->loc.path,
- priv->children[active_src]->name, strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- int32_t valid = 0;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
-
- active_src = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "removed %s on %s",
- expunge_local->loc.path,
- priv->children[active_src]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "removing %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- }
-
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->setattr,
- &expunge_sh->parent_loc,
- &expunge_sh->parentbuf,
- valid, NULL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "expunging file %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->unlink,
- &expunge_local->loc, 0, NULL);
-
- return 0;
-}
+afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir,
+ const char *name, inode_t *inode, int child,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+ char g[64];
+
+ priv = this->private;
+
+ subvol = priv->children[child];
+
+ loc.parent = inode_ref (dir);
+ uuid_copy (loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
+
+ if (replies[child].valid && replies[child].op_ret == 0) {
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging dir %s/%s (%s) on %s",
+ uuid_utoa (dir->gfid), name,
+ uuid_utoa_r (replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir (subvol, &loc, 1);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging file %s/%s (%s) on %s",
+ uuid_utoa (dir->gfid), name,
+ uuid_utoa_r (replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink (subvol, &loc);
+ break;
+ }
+ }
+
+ loc_wipe (&loc);
+
+ return ret;
+}
+
+
+int
+afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst,
+ int source, inode_t *dir, const char *name,
+ inode_t *inode, struct afr_reply *replies)
+{
+ int ret = 0;
+ loc_t loc = {0,};
+ loc_t srcloc = {0,};
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ struct iatt *iatt = NULL;
+ char *linkname = NULL;
+ mode_t mode = 0;
+ struct iatt newent = {0,};
+
+ priv = this->private;
+
+ xdata = dict_new();
+ if (!xdata)
+ return -ENOMEM;
+
+ loc.parent = inode_ref (dir);
+ uuid_copy (loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
+
+ ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst,
+ replies);
+ if (ret)
+ goto out;
+
+ ret = dict_set_static_bin (xdata, "gfid-req",
+ replies[source].poststat.ia_gfid, 16);
+ if (ret)
+ goto out;
+
+ iatt = &replies[source].poststat;
+
+ srcloc.inode = inode_ref (inode);
+ uuid_copy (srcloc.gfid, iatt->ia_gfid);
+
+ mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type);
+
+ switch (iatt->ia_type) {
+ case IA_IFDIR:
+ ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0);
+ break;
+ case IA_IFLNK:
+ ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == 0) {
+ ret = syncop_link (priv->children[dst], &srcloc, &loc);
+ } else {
+ ret = syncop_readlink (priv->children[source], &srcloc,
+ &linkname, 4096);
+ if (ret <= 0)
+ goto out;
+ ret = syncop_symlink (priv->children[dst], &loc, linkname,
+ xdata, NULL);
+ }
+ break;
+ default:
+ ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
+ goto out;
+ ret = syncop_mknod (priv->children[dst], &loc, mode,
+ iatt->ia_rdev, xdata, &newent);
+ if (ret == 0 && iatt->ia_size && !newent.ia_size) {
+ /* New entry created. Mark @dst pending on all sources */
+ ret = 1;
+ }
+ break;
+ }
-
-
-int
-afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "expunging directory %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->rmdir,
- &expunge_local->loc, 1, NULL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct iatt *buf,
- struct iatt *parentbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int type = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- loc_t *loc = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
- loc = &expunge_local->loc;
-
- type = buf->ia_type;
- if (loc->parent && uuid_is_null (loc->parent->gfid))
- uuid_copy (loc->pargfid, parentbuf->ia_gfid);
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
- break;
- case IA_IFDIR:
- afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- expunge_local->loc.path,
- priv->children[active_src]->name, type);
- goto out;
- break;
- }
-
- return 0;
-out:
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, -1, EINVAL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "lookup of %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf,
- postparent);
-
- return 0;
out:
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
+ if (xdata)
+ dict_unref (xdata);
+ loc_wipe (&loc);
+ loc_wipe (&srcloc);
+ return ret;
}
-int
-afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->lookup,
- &expunge_local->loc, NULL);
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int source = 0;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int need_expunge = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = expunge_sh->active_source;
- source = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret == -1 && op_errno == ENOENT)
- need_expunge = 1;
- else if (op_ret == -1)
- goto out;
-
- if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) &&
- !uuid_is_null (buf->ia_gfid) &&
- (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) {
- char uuidbuf1[64];
- char uuidbuf2[64];
- gf_log (this->name, GF_LOG_DEBUG,
- "entry %s found on %s with mismatching gfid (%s/%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1),
- uuid_utoa_r (buf->ia_gfid, uuidbuf2));
- need_expunge = 1;
- }
-
- if (need_expunge) {
- gf_log (this->name, GF_LOG_INFO,
- "Entry %s is missing on %s and deleting from "
- "replica's other bricks",
- expunge_local->loc.path,
- priv->children[source]->name);
-
- if (postparent)
- expunge_sh->parentbuf = *postparent;
-
- afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
-
- return 0;
- }
-
-out:
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "%s exists under %s",
- expunge_local->loc.path,
- priv->children[source]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "looking up %s under %s failed (%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entry)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *expunge_frame = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- int source = 0;
- int op_errno = 0;
- char *name = NULL;
- int op_ret = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- source = sh->source;
- sh->expunge_done = afr_sh_entry_expunge_entry_done;
-
- name = entry->d_name;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- name, local->loc.path);
- op_ret = 0;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existence of %s under %s",
- name, local->loc.path);
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- op_errno = ENOMEM;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out);
-
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- expunge_sh->active_source = active_src;
- expunge_sh->entrybuf = entry->d_stat;
- loc_copy (&expunge_sh->parent_loc, &local->loc);
-
- ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc,
- name);
- if (ret != 0) {
- op_errno = EINVAL;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s", expunge_local->loc.path,
- priv->children[source]->name);
-
- STACK_WIND_COOKIE (expunge_frame,
- afr_sh_entry_expunge_entry_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->lookup,
- &expunge_local->loc, NULL);
-
- ret = 0;
-out:
- if (ret == -1)
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_expunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_expunge_entry (frame, this, entry);
- }
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdirp,
- sh->healing_fd, sh->block_size, sh->offset, NULL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int **changelog = NULL;
+ int idx = 0;
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
+ priv = this->private;
- sh->offset = 0;
+ idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s to expunge entries",
- local->loc.path);
- goto out;
- }
+ uuid_copy (inode->gfid, replies[source].poststat.ia_gfid);
- active_src = next_active_sink (frame, this, sh->active_source);
- sh->active_source = active_src;
+ changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
- if (sh->op_failed) {
- goto out;
- }
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- goto out;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!newentry[i])
+ continue;
+ changelog[i][idx] = hton32(1);
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "expunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
+ afr_set_pending_dict (priv, xattr, changelog);
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-out:
- afr_sh_entry_impunge_all (frame, this);
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ afr_selfheal_post_op (frame, this, inode, i, xattr);
+ }
+ dict_unref (xattr);
+ return ret;
}
-int
-afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- if (op_ret < 0)
- sh->entries_skipped = _gf_true;
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_sh_entry_impunge_subvol (frame, this);
-
- return 0;
-}
-
-void
-afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
-
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
-
- AFR_STACK_DESTROY (impunge_frame);
- sh->impunge_done (frame, this, op_ret, op_errno);
+static int
+__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ unsigned char *newentry = NULL;
+
+ priv = this->private;
+ newentry = alloca0 (priv->child_count);
+
+ if (!replies[source].valid)
+ return -EIO;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if (replies[source].op_ret == -1 &&
+ replies[source].op_errno == ENOENT) {
+ ret = afr_selfheal_entry_delete (frame, this, fd->inode,
+ name, inode, i, replies);
+ } else {
+ if (!uuid_compare (replies[i].poststat.ia_gfid,
+ replies[source].poststat.ia_gfid))
+ continue;
+
+ ret = afr_selfheal_recreate_entry (frame, this, i, source,
+ fd->inode, name, inode,
+ replies);
+ if (ret > 0) {
+ newentry[i] = 1;
+ ret = 0;
+ }
+ }
+ if (ret < 0)
+ break;
+ }
+
+ if (AFR_COUNT (newentry, priv->child_count))
+ afr_selfheal_newentry_mark (frame, this, inode, source, replies,
+ sources, newentry);
+ return ret;
}
-int
-afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop,
- dict_t *xdata)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- child_index = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setattr done for %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "setattr (%s) on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- }
-
- call_count = afr_frame_return (impunge_frame);
- if (call_count == 0) {
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- 0, op_errno);
- }
-
- return 0;
-}
-int
-afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop,
- dict_t *xdata)
+static int
+__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, unsigned char *sources,
+ unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies)
{
- int call_count = 0;
- afr_local_t *setattr_local = NULL;
-
- setattr_local = setattr_frame->local;
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_INFO,
- "setattr on parent directory (%s) failed: %s",
- setattr_local->loc.path, strerror (op_errno));
- }
-
- call_count = afr_frame_return (setattr_frame);
- if (call_count == 0)
- AFR_STACK_DESTROY (setattr_frame);
- return 0;
-}
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int source = -1;
-int
-afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *setattr_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *setattr_frame = NULL;
- int32_t valid = 0;
- int32_t op_errno = 0;
- int child_index = 0;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "setting ownership of %s on %s to %d/%d",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- impunge_sh->entrybuf.ia_uid,
- impunge_sh->entrybuf.ia_gid);
-
- setattr_frame = copy_frame (impunge_frame);
- if (!setattr_frame) {
- op_errno = ENOMEM;
- goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out);
- setattr_local = setattr_frame->local;
- call_count = afr_errno_count (NULL, impunge_sh->child_errno,
- priv->child_count, 0);
- loc_copy (&setattr_local->loc, &impunge_sh->parent_loc);
- impunge_local->call_count = call_count;
- setattr_local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (impunge_sh->child_errno[i])
- continue;
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- STACK_WIND_COOKIE (setattr_frame,
- afr_sh_entry_impunge_parent_setattr_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->setattr,
- &setattr_local->loc,
- &impunge_sh->parentbuf, valid, NULL);
-
- valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- STACK_WIND_COOKIE (impunge_frame,
- afr_sh_entry_impunge_setattr_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->setattr,
- &impunge_local->loc,
- &impunge_sh->entrybuf, valid, NULL);
- call_count--;
- }
- GF_ASSERT (!call_count);
- return 0;
-out:
- if (setattr_frame)
- AFR_STACK_DESTROY (setattr_frame);
- afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno);
- return 0;
-}
+ priv = this->private;
-int
-afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to perform xattrop on %s (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_impunge_setattr (impunge_frame, this);
- return 0;
-out:
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, op_errno);
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ source = i;
+ break;
+ }
+ }
-int
-afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame,
- xlator_t *this)
-{
- int active_src = 0;
- dict_t *xattr = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int32_t op_errno = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- afr_prepare_new_entry_pending_matrix (impunge_local->pending,
- afr_is_errno_unset,
- impunge_sh->child_errno,
- &impunge_sh->entrybuf,
- priv->child_count);
- xattr = dict_new ();
- if (!xattr) {
- op_errno = ENOMEM;
- goto out;
- }
-
- afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src,
- LOCAL_LAST);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->xattrop,
- &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr, NULL);
-
- if (xattr)
- dict_unref (xattr);
- return 0;
-out:
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, op_errno);
- return 0;
-}
+ if (source == -1) {
+ /* entry got deleted in the mean time? */
+ return 0;
+ }
-int
-afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- impunge_sh->child_errno[child_index] = op_errno;
- gf_log (this->name, GF_LOG_ERROR,
- "creation of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- } else {
- impunge_sh->child_errno[child_index] = 0;
- }
-
- call_count = afr_frame_return (impunge_frame);
- if (call_count == 0) {
- if (!afr_errno_count (NULL, impunge_sh->child_errno,
- priv->child_count, 0)) {
- // new_file creation failed every where
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, op_errno);
- goto out;
- }
- afr_sh_entry_impunge_perform_xattrop (impunge_frame, this);
- }
-out:
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
-int
-afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int call_count = 0;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
-
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- if (IA_IFLNK == impunge_sh->entrybuf.ia_type) {
- //For symlinks impunge is attempted un-conditionally
- //So the file can already exist.
- if ((op_ret < 0) && (op_errno == EEXIST))
- op_ret = 0;
- }
-
- call_count = afr_frame_return (impunge_frame);
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
-
- return 0;
-}
+ if (replies[i].op_errno != ENOENT)
+ continue;
-int
-afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- loc_t *loc = NULL;
- struct iatt *buf = NULL;
- loc_t oldloc = {0};
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- loc = &impunge_local->loc;
- buf = &impunge_sh->entrybuf;
-
- oldloc.inode = inode_ref (loc->inode);
- uuid_copy (oldloc.gfid, buf->ia_gfid);
- gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s",
- loc->path, priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->link,
- &oldloc, loc, NULL);
- loc_wipe (&oldloc);
-
- return 0;
-}
+ ret = afr_selfheal_recreate_entry (frame, this, i, source,
+ fd->inode, name, inode,
+ replies);
+ }
-int
-afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- if (op_ret < 0) {
- afr_sh_entry_impunge_create_file (impunge_frame, this,
- (long)cookie);
- } else {
- afr_sh_entry_impunge_hardlink (impunge_frame, this,
- (long)cookie);
- }
- return 0;
+ return ret;
}
-int
-afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame,
- xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- call_frame_t *frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- afr_self_heal_t *sh = NULL;
- loc_t *loc = NULL;
- dict_t *xattr_req = NULL;
- loc_t oldloc = {0};
- int ret = -1;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- loc = &impunge_local->loc;
-
- xattr_req = dict_new ();
- if (!xattr_req)
- goto out;
- oldloc.inode = inode_ref (loc->inode);
- uuid_copy (oldloc.gfid, stbuf->ia_gfid);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->lookup,
- &oldloc, xattr_req);
- ret = 0;
-out:
- if (xattr_req)
- dict_unref (xattr_req);
- loc_wipe (&oldloc);
- if (ret)
- sh->impunge_done (frame, this, -1, ENOMEM);
- return 0;
-}
-int
-afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
+static int /* Heal the single directory entry `name` found under the
+ * directory open on `fd`, picking the strategy from `source`: a negative
+ * `source` is handed to __afr_selfheal_merge_dirent (which takes no source
+ * index), any other value to __afr_selfheal_heal_dirent.  The return value
+ * of the chosen helper is passed back unchanged.
+ * NOTE(review): the in-file caller afr_selfheal_entry_dirent invokes this
+ * between afr_selfheal_entrylk and afr_selfheal_unentrylk; presumably the
+ * leading underscores mark that requirement -- confirm. */
+__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing file %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- dict = dict_new ();
- if (!dict)
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
-
- GF_ASSERT (!uuid_is_null (stbuf->ia_gfid));
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
- impunge_local->loc.path);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mknod,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- makedev (ia_major (stbuf->ia_rdev),
- ia_minor (stbuf->ia_rdev)), 0, dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
-}
-
+ int ret = -1;
-
-int
-afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
-
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return 0;
- }
-
- GF_ASSERT (!uuid_is_null (stbuf->ia_gfid));
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
- impunge_local->loc.path);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing directory %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mkdir,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- 0, dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
+ if (source < 0) /* no source was chosen; the comment in __afr_selfheal_entry_prepare calls this "typically split-brain" and asks for a conservative merge */
+ ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode,
+ sources, healed_sinks,
+ locked_on, replies);
+ else
+ ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode,
+ source, sources, healed_sinks,
+ locked_on, replies);
+ return ret;
}
-int
-afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, const char *linkname)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
- struct iatt *buf = NULL;
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- buf = &impunge_local->cont.dir_fop.buf;
-
- dict = dict_new ();
- if (!dict) {
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, ENOMEM);
- goto out;
- }
-
- GF_ASSERT (!uuid_is_null (buf->ia_gfid));
- ret = afr_set_dict_gfid (dict, buf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO,
- "%s: dict set gfid failed",
- impunge_local->loc.path);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing symlink %s -> %s on %s",
- impunge_local->loc.path, linkname,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->symlink,
- linkname, &impunge_local->loc, 0, dict);
-
- if (dict)
- dict_unref (dict);
-out:
- return 0;
+static int /* Heal one directory entry `name` under fd->inode with the
+ * entry lock held.  Steps, in order: take the lock on (fd->inode, name) in
+ * the domain this->name, look the name up with
+ * afr_selfheal_unlocked_lookup_on, run __afr_selfheal_entry_dirent, then
+ * release the lock and drop the looked-up inode reference.
+ * Returns -ENOTCONN when afr_selfheal_entrylk returns less than 2,
+ * -ENOMEM when the lookup yields no inode, otherwise the result of
+ * __afr_selfheal_entry_dirent. */
+afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks, char *name)
+{
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ unsigned char *locked_on = NULL;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
+
+ priv = this->private;
+
+ locked_on = alloca0 (priv->child_count); /* one flag per child, filled in by afr_selfheal_entrylk */
+
+ replies = alloca0 (priv->child_count * sizeof(*replies)); /* one reply slot per child for the lookup */
+
+ ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name,
+ name, locked_on);
+ {
+ if (ret < 2) { /* NOTE(review): presumably ret is the number of children locked -- confirm */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name,
+ replies, locked_on);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode,
+ source, sources, healed_sinks,
+ locked_on, replies);
+ }
+unlock: /* reached on success and on both failure paths above */
+ afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name,
+ locked_on);
+ if (inode)
+ inode_unref (inode);
+ return ret;
}
-int
-afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+static int /* Walk the directory open on `fd` as seen by the child
+ * priv->children[child] and heal every name it lists.  Entries are read
+ * with syncop_readdir in requests of 131072 starting at `offset`; "." and
+ * ".." are skipped, as is GF_REPLICATE_TRASH_DIR when the directory has the
+ * root gfid.  Every remaining name is passed to afr_selfheal_entry_dirent
+ * on a copy of `frame`.
+ * Returns -ENOMEM when the frame copy fails; otherwise the first non-zero
+ * value seen (from afr_selfheal_entry_dirent or a negative
+ * syncop_readdir return), or 0 once syncop_readdir returns 0. */
+afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int child, int source, unsigned char *sources,
+ unsigned char *healed_sinks)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "unlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_impunge_symlink (impunge_frame, this, child_index,
- impunge_sh->linkname);
-
- return 0;
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
-
- return 0;
-}
+ int ret = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ off_t offset = 0;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ subvol = priv->children[child];
-int
-afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
+ INIT_LIST_HEAD (&entries.list);
- priv = this->private;
- impunge_local = impunge_frame->local;
+ iter_frame = afr_copy_frame (frame); /* one extra frame reused for every entry; reset after each use below */
+ if (!iter_frame)
+ return -ENOMEM;
- gf_log (this->name, GF_LOG_DEBUG,
- "unlinking symlink %s with wrong target on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) { /* a zero return ends the walk */
+ if (ret > 0) /* positive return is normalised to 0 so that only failures stop the loop; presumably it is an entry count -- confirm */
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off; /* the next readdir resumes after this entry */
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->unlink,
- &impunge_local->loc, 0, NULL);
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
- return 0;
-}
+ if (__is_root_gfid (fd->inode->gfid) && /* only in the root directory is the trash-dir name skipped */
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))
+ continue;
+ ret = afr_selfheal_entry_dirent (iter_frame, this, fd,
+ source, sources,
+ healed_sinks,
+ entry->d_name);
+ AFR_STACK_RESET (iter_frame);
-int
-afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_INFO,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- /* symlink doesn't exist on the sink */
-
- if ((op_ret == -1) && (op_errno == ENOENT)) {
- afr_sh_entry_impunge_symlink (impunge_frame, this,
- child_index, impunge_sh->linkname);
- return 0;
- }
-
-
- /* symlink exists on the sink, so check if targets match */
-
- if (strcmp (linkname, impunge_sh->linkname) == 0) {
- /* targets match, nothing to do */
-
- goto out;
- } else {
- /*
- * Hah! Sneaky wolf in sheep's clothing!
- */
- afr_sh_entry_impunge_symlink_unlink (impunge_frame, this,
- child_index);
- return 0;
- }
+ if (ret) /* stop at the first entry that reports non-zero */
+ break;
+ }
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
-
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
+ gf_dirent_free (&entries);
+ if (ret) /* carries the inner break outward, and also leaves on a negative readdir return */
+ break;
+ }
- return 0;
+ AFR_STACK_DESTROY (iter_frame);
+ return ret;
}
-
-int
-afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
+static int /* Run the entry self-heal walk for the directory open on
+ * `fd`: log the directory gfid, then call afr_selfheal_entry_do_subvol for
+ * the child whose index equals `source` and for every child flagged in
+ * healed_sinks[], in index order.  Stops at, and returns, the first
+ * non-zero result; returns 0 when every walk returns 0.
+ * `locked_replies` is not referenced in this body. */
+afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
- gf_log (this->name, GF_LOG_DEBUG,
- "checking symlink target of %s on %s",
- impunge_local->loc.path, priv->children[child_index]->name);
+ priv = this->private;
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readlink,
- &impunge_local->loc, 4096, NULL);
+ gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s",
+ uuid_utoa (fd->inode->gfid));
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != source && !healed_sinks[i]) /* skip children that are neither the source nor flagged in healed_sinks */
+ continue;
+ ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source,
+ sources, healed_sinks);
+ if (ret)
+ break;
+ }
+ return ret;
}
-int
-afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf, dict_t *xdata)
+static int /* Pick the source child for an entry self-heal from the
+ * per-child flag arrays.  Returns -1 when the number of set flags in
+ * locked_on[] equals the number set in sinks[], or when no flag is set in
+ * sources[]; otherwise returns the lowest index i with sources[i] set.
+ * `replies` is not referenced in this body. */
+__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- impunge_sh->linkname = gf_strdup (linkname);
- afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index);
-
- return 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int sinks_count = 0;
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ priv = this->private;
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ sinks_count = AFR_COUNT (sinks, priv->child_count);
- return 0;
-}
+ if (locked_count == sinks_count || !sources_count) { /* as many sinks as locked children, or no source at all */
+ return -1;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) { /* first flagged child wins */
+ source = i;
+ break;
+ }
+ }
-int
-afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
- impunge_local->cont.dir_fop.buf = *stbuf;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->readlink,
- &impunge_local->loc, 4096, NULL);
-
- return 0;
+ return source;
}
-int
-afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- call_frame_t *frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t type = IA_INVAL;
- int active_src = 0;
- struct iatt *buf = NULL;
-
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
- afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf,
- &impunge_sh->parentbuf);
-
- buf = &impunge_sh->entrybuf;
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- afr_sh_entry_impunge_check_hardlink (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFDIR:
- afr_sh_entry_impunge_mkdir (impunge_frame, this,
- child_index, buf);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- impunge_local->loc.path,
- priv->children[active_src]->name, type);
- sh->impunge_done (frame, this, -1, EINVAL);
- break;
- }
-
- return 0;
-}
-int
-afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- call_frame_t *frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t type = IA_INVAL;
- int active_src = 0;
- struct iatt *buf = NULL;
-
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
- buf = &impunge_sh->entrybuf;
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- afr_sh_entry_impunge_mknod (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFLNK:
- afr_sh_entry_impunge_readlink (impunge_frame, this,
- child_index, buf);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- impunge_local->loc.path,
- priv->children[active_src]->name, type);
- sh->impunge_done (frame, this, -1, EINVAL);
- break;
- }
-
- return 0;
-}
-
-gf_boolean_t
-afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child,
- unsigned int child_count)
+static int /* Work out the heal direction for the directory open on
+ * `fd` and fill the caller-supplied arrays.  Steps, in order:
+ * afr_selfheal_unlocked_discover fills replies[];
+ * afr_selfheal_find_direction (AFR_ENTRY_TRANSACTION) fills sources[] and
+ * sinks[]; __afr_selfheal_entry_finalize_source picks the source index,
+ * which is written to *source_p; healed_sinks[] is then initialised.
+ * A non-zero result from either of the first two steps is returned
+ * immediately, leaving *source_p and healed_sinks[] unwritten.
+ * Otherwise returns 0, including when the chosen source is negative. */
+__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p)
{
- gf_boolean_t recreate = _gf_false;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
- GF_ASSERT (impunge_sh->child_errno);
+ priv = this->private;
- if (child == impunge_sh->active_source)
- goto out;
+ ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid,
+ replies);
+ if (ret)
+ return ret;
- if (IA_IFLNK == impunge_sh->entrybuf.ia_type) {
- recreate = _gf_true;
- goto out;
- }
-
- if (impunge_sh->child_errno[child] == ENOENT)
- recreate = _gf_true;
-out:
- return recreate;
-}
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_ENTRY_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
-unsigned int
-afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources,
- unsigned int child_count)
-{
- int count = 0;
- int i = 0;
+ source = __afr_selfheal_entry_finalize_source (this, sources, sinks,
+ locked_on, replies);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source; /* stored even when negative */
- for (i = 0; i < child_count; i++) {
- if (afr_sh_need_recreate (impunge_sh, i, child_count))
- count++;
- }
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
- return count;
-}
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
-int
-afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame,
- xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- unsigned int recreate_count = 0;
- int i = 0;
- int active_src = 0;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
- impunge_sh->entrybuf = impunge_sh->buf[active_src];
- impunge_sh->parentbuf = impunge_sh->parentbufs[active_src];
- recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources,
- priv->child_count);
- if (!recreate_count) {
- afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0);
- goto out;
- }
- impunge_local->call_count = recreate_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!impunge_local->child_up[i]) {
- impunge_sh->child_errno[i] = ENOTCONN;
- continue;
- }
- if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) {
- impunge_sh->child_errno[i] = EEXIST;
- continue;
- }
- }
- for (i = 0; i < priv->child_count; i++) {
- if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count))
- continue;
- (void)afr_sh_entry_impunge_create (impunge_frame, this, i);
- recreate_count--;
- }
- GF_ASSERT (!recreate_count);
-out:
- return 0;
+ return ret; /* ret is 0 here: both non-zero results above returned early */
}
-void
-afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- unsigned int gfid_miss_count = 0;
- unsigned int children_up_count = 0;
- uuid_t gfid = {0};
- int active_src = 0;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
-
- if (op_ret < 0)
- goto done;
- if (impunge_sh->child_errno[active_src]) {
- op_ret = -1;
- op_errno = impunge_sh->child_errno[active_src];
- goto done;
- }
-
- gfid_miss_count = afr_gfid_missing_count (this->name,
- impunge_sh->success_children,
- impunge_sh->buf, priv->child_count,
- impunge_local->loc.path);
- children_up_count = afr_up_children_count (impunge_local->child_up,
- priv->child_count);
- if ((gfid_miss_count == children_up_count) &&
- (children_up_count < priv->child_count)) {
- op_ret = -1;
- op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "Not all children are up, "
- "gfid should not be assigned in this state for %s",
- impunge_local->loc.path);
- goto done;
- }
-
- if (gfid_miss_count) {
- afr_update_gfid_from_iatts (gfid, impunge_sh->buf,
- impunge_sh->success_children,
- priv->child_count);
- if (uuid_is_null (gfid)) {
- sh->entries_skipped = _gf_true;
- gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry "
- "self-heal because of gfid absence",
- impunge_local->loc.path);
- goto done;
- }
- afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc,
- afr_sh_entry_common_lookup_done, gfid,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- } else {
- afr_sh_entry_call_impunge_recreate (impunge_frame, this);
- }
- return;
-done:
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
- return;
-}
-int
-afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entry)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int ret = -1;
- call_frame_t *impunge_frame = NULL;
- afr_local_t *impunge_local = NULL;
- int active_src = 0;
- int op_errno = 0;
- int op_ret = -1;
-
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- sh->impunge_done = afr_sh_entry_impunge_entry_done;
-
- if ((strcmp (entry->d_name, ".") == 0)
- || (strcmp (entry->d_name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- entry->d_name, local->loc.path);
- op_ret = 0;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existence of %s under %s",
- entry->d_name, local->loc.path);
-
- ret = afr_impunge_frame_create (frame, this, active_src,
- &impunge_frame);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
-
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc,
- entry->d_name);
- loc_copy (&impunge_sh->parent_loc, &local->loc);
- if (ret != 0) {
- op_errno = ENOMEM;
- goto out;
- }
-
- afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc,
- afr_sh_entry_common_lookup_done, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS, NULL);
-
- op_ret = 0;
+static int /* Entry self-heal of the directory open on `fd`, in three
+ * phases: (1) with afr_selfheal_entrylk held on fd->inode with a NULL name
+ * in the domain this->name, run __afr_selfheal_entry_prepare to fill
+ * sources/sinks/healed_sinks and choose `source`, then release the lock;
+ * (2) afr_selfheal_entry_do; (3) afr_selfheal_undo_pending for
+ * AFR_ENTRY_TRANSACTION.
+ * Returns -ENOTCONN when afr_selfheal_entrylk returns less than 2, a
+ * negative value from phase 1, a non-zero value from phase 2, or else the
+ * result of afr_selfheal_undo_pending.
+ * The `locked_on` parameter is not referenced in this body. */
+__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+
+ priv = this->private;
+
+ sources = alloca0 (priv->child_count); /* per-child flag arrays, one byte per child */
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ {
+ if (ret < 2) { /* NOTE(review): presumably ret is the number of children locked -- confirm */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies, &source);
+ }
+unlock: /* the lock is dropped here, before the walk below */
+ afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_selfheal_entry_do (frame, this, fd, source, sources,
+ healed_sinks, locked_replies);
+ if (ret)
+ goto out;
+
+ ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks, /* NOTE(review): data_lock is passed although unentrylk already ran above -- confirm this is intended */
+ healed_sinks, AFR_ENTRY_TRANSACTION,
+ locked_replies, data_lock);
out:
- if (ret) {
- if (impunge_frame)
- AFR_STACK_DESTROY (impunge_frame);
- sh->impunge_done (frame, this, op_ret, op_errno);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_impunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_impunge_entry (frame, this, entry);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int32_t active_src = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- active_src = sh->active_source;
- gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd",
- local->loc.path, sh->offset);
-
- STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdirp,
- sh->healing_fd, sh->block_size, sh->offset, NULL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh->offset = 0;
-
- active_src = next_active_source (frame, this, sh->active_source);
- sh->active_source = active_src;
-
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
-
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- afr_sh_entry_erase_pending (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "impunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
-
- afr_sh_entry_impunge_subvol (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "opendir of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
-
- sh->active_source = -1;
- afr_sh_entry_expunge_all (frame, this);
- }
-
- return 0;
+ return ret;
}
-int
-afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
+static fd_t * /* Open `inode` as a directory through `this` and return
+ * the resulting fd.  An fd is created with fd_create (inode, 0), a
+ * temporary loc is filled with a new inode reference and the inode gfid,
+ * and syncop_opendir is called on it.  When syncop_opendir returns zero the
+ * fd is passed to fd_bind and returned; when it returns non-zero the fd is
+ * unreferenced and NULL is returned.  NULL is also returned when fd_create
+ * fails.  The temporary loc is wiped before returning in both opendir
+ * outcomes. */
+afr_selfheal_data_opendir (xlator_t *this, inode_t *inode)
{
- int i = 0;
- int call_count = 0;
-
- int source = -1;
- int *sources = NULL;
-
- fd_t *fd = NULL;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ loc_t loc = {0,};
+ int ret = 0;
+ fd_t *fd = NULL;
- source = local->self_heal.source;
- sources = local->self_heal.sources;
+ fd = fd_create (inode, 0);
+ if (!fd)
+ return NULL;
- sh->block_size = priv->sh_readdir_size;
- sh->offset = 0;
+ loc.inode = inode_ref (inode); /* reference owned by loc; released by loc_wipe below */
+ uuid_copy (loc.gfid, inode->gfid);
- call_count = sh->active_sinks;
- if (source != -1)
- call_count++;
+ ret = syncop_opendir (this, &loc, fd);
+ if (ret) { /* any non-zero return is treated as failure */
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ }
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- if (source != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (source)",
- local->loc.path, priv->children[source]->name);
-
- /* open source */
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->opendir,
- &local->loc, fd, NULL);
- call_count--;
- }
-
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (sink)",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->opendir,
- &local->loc, fd, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
+ loc_wipe (&loc);
+ return fd;
}
int
-afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
+afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- afr_sh_mark_source_sinks (frame, this);
- if (source != -1)
- sh->success[source] = 1;
-
- if (sh->active_sinks == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "no active sinks for self-heal on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- if (source == -1 && sh->active_sinks < 2) {
- gf_log (this->name, GF_LOG_TRACE,
- "cannot sync with 0 sources and 1 sink on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
-
- if (source != -1)
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing directory %s from subvolume %s to "
- "%d other",
- local->loc.path, priv->children[source]->name,
- sh->active_sinks);
- else
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s found. "
- "merging all entries as a conservative decision",
- local->loc.path);
-
- sh->actual_sh_started = _gf_true;
- afr_sh_entry_open (frame, this);
-
- return 0;
-}
+ /*
+ * Entry self-heal driver for @inode: opens the directory, tries to take
+ * the entry locks in priv->sh_domain, and runs __afr_selfheal_entry()
+ * only when afr_selfheal_tryentrylk() returns at least 2; otherwise the
+ * result is -ENOTCONN. Returns -EIO when the directory cannot be opened.
+ */
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ fd_t *fd = NULL;
+ int ret = 0;
+ priv = this->private;
-void
-afr_sh_entry_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int nsources = 0;
- int32_t subvol_status = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_entry_finish (frame, this);
- goto out;
- }
-
- if (sh->forced_merge) {
- sh->source = -1;
- goto heal;
- }
-
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children,
- AFR_ENTRY_TRANSACTION, &subvol_status,
- _gf_true);
- if ((subvol_status & ALL_FOOLS) ||
- (subvol_status & SPLIT_BRAIN)) {
- gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative "
- "merge", local->loc.path);
- source = -1;
- memset (sh->sources, 0,
- sizeof (*sh->sources) * priv->child_count);
- } else if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_entry_finish (frame, this);
- return;
- } else {
- source = afr_sh_select_source (sh->sources, priv->child_count);
- }
-
- sh->source = source;
-
- afr_reset_children (sh->fresh_children, priv->child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- if (sh->source >= 0)
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
-
-heal:
- afr_sh_entry_sync_prepare (frame, this);
-out:
- return;
-}
+ fd = afr_selfheal_data_opendir (this, inode);
+ if (!fd)
+ return -EIO;
-int
-afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks "
- "failed for %s.", local->loc.path);
- sh->op_failed = 1;
- afr_sh_entry_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done "
- "for %s. Proceeding to FOP", local->loc.path);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_entry_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- }
-
- return 0;
-}
-
-int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ locked_on = alloca0 (priv->child_count);
+ ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- priv = this->private;
- local = frame->local;
+ ret = __afr_selfheal_entry (frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on);
- if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) {
- afr_sh_entrylk (frame, this, &local->loc, NULL,
- afr_sh_post_nonblocking_entry_cbk);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to completion on %s",
- local->loc.path);
- afr_sh_entry_done (frame, this);
- }
+ if (fd)
+ fd_unref (fd);
- return 0;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index cc85d9b9f..83628297f 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,622 +8,273 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-int
-afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_sh_reset (frame, this);
- if (IA_ISDIR (sh->type)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to entry check on %s",
- local->loc.path);
- afr_self_heal_entry (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to data check on %s",
- local->loc.path);
- afr_self_heal_data (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_metadata_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_inode_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_metadata_fail (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->op_failed = 1;
- afr_sh_metadata_finish (frame, this);
- return 0;
-}
-
-int
-afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- int call_count = 0;
- long i = 0;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- i = (long)cookie;
-
- if ((!IA_ISREG (sh->buf[sh->source].ia_type)) &&
- (!IA_ISDIR (sh->buf[sh->source].ia_type))) {
- afr_children_add_child (sh->fresh_children, i,
- priv->child_count);
- }
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if ((!IA_ISREG (sh->buf[sh->source].ia_type)) &&
- (!IA_ISDIR (sh->buf[sh->source].ia_type))) {
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
- }
- afr_sh_metadata_finish (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION,
- afr_sh_metadata_erase_pending_cbk,
- afr_sh_metadata_finish);
- return 0;
-}
-
-
-int
-afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "setting attributes failed for %s on %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->success[child_index] = 0;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (local->xattr_req) {
- dict_unref (local->xattr_req);
- local->xattr_req = NULL;
- }
- afr_sh_metadata_erase_pending (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata);
-
- return 0;
-}
-
-int
-afr_sh_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
-{
- int i = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- local = frame->local;
-
- if (op_ret < 0) {
- afr_sh_metadata_sync_cbk (frame, cookie,
- this, -1, op_errno, xdata);
- goto out;
- }
-
- i = (long) cookie;
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc, local->xattr_req, 0, NULL);
-
- out:
- return 0;
-}
-
-inline void
-afr_prune_special_keys (dict_t *xattr_dict)
-{
- dict_del (xattr_dict, GF_SELINUX_XATTR_KEY);
-}
-
-inline void
-afr_prune_pending_keys (dict_t *xattr_dict, afr_private_t *priv)
-{
- int i = 0;
-
- for (; i < priv->child_count; i++) {
- dict_del (xattr_dict, priv->pending_key[i]);
- }
-}
-
-int
-afr_sh_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr,
- dict_t *xdata)
-{
- int i = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- local = frame->local;
-
- if (op_ret < 0) {
- afr_sh_metadata_sync_cbk (frame, cookie,
- this, -1, op_errno, xdata);
- goto out;
- }
-
- afr_prune_pending_keys (xattr, priv);
-
- afr_prune_special_keys (xattr);
-
- i = (long) cookie;
-
- /* send removexattr in bulk via xdata */
- STACK_WIND_COOKIE (frame, afr_sh_removexattr_cbk,
- cookie,
- priv->children[i],
- priv->children[i]->fops->removexattr,
- &local->loc, "", xattr);
+#include "byte-order.h"
- out:
- return 0;
-}
+#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE)
int
-afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
+afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int active_sinks = 0;
- int call_count = 0;
- int i = 0;
-
- struct iatt stbuf = {0,};
- int32_t valid = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- active_sinks = sh->active_sinks;
-
- /*
- * 2 calls per sink - setattr, setxattr
- */
- if (xattr) {
- call_count = active_sinks * 2;
- local->xattr_req = dict_ref (xattr);
- } else
- call_count = active_sinks;
-
- local->call_count = call_count;
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
-
- stbuf.ia_uid = sh->buf[source].ia_uid;
- stbuf.ia_gid = sh->buf[source].ia_gid;
-
- stbuf.ia_type = sh->buf[source].ia_type;
- stbuf.ia_prot = sh->buf[source].ia_prot;
-
- valid = GF_SET_ATTR_MODE |
- GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- for (i = 0; i < priv->child_count; i++) {
- if (call_count == 0) {
- break;
- }
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing metadata of %s from %s to %s",
- local->loc.path, priv->children[source]->name,
- priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid, NULL);
-
- call_count--;
-
- if (!xattr)
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_getxattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->getxattr,
- &local->loc, NULL, NULL);
- call_count--;
- }
-
- return 0;
+        /*
+         * Copy uid/gid/mode (AFR_HEAL_ATTR) and the extended attributes of
+         * @inode from subvolume @source onto every subvolume flagged in
+         * @healed_sinks. A sink whose setattr or setxattr fails is cleared
+         * from @healed_sinks so the caller does not treat it as healed.
+         *
+         * Returns -EIO when the xattrs cannot be read from the source,
+         * 0 otherwise (per-sink failures are reported via @healed_sinks).
+         */
+        int            ret       = -1;
+        loc_t          loc       = {0,};
+        dict_t        *xattr     = NULL;
+        dict_t        *old_xattr = NULL;
+        afr_private_t *priv      = NULL;
+        int            i         = 0;
+
+        priv = this->private;
+
+        loc.inode = inode_ref (inode);
+        uuid_copy (loc.gfid, inode->gfid);
+
+        gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s",
+                uuid_utoa (inode->gfid));
+
+        ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL);
+        if (ret < 0) {
+                loc_wipe (&loc);
+                return -EIO;
+        }
+
+        afr_filter_xattrs (xattr);
+        dict_del (xattr, GF_SELINUX_XATTR_KEY);
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (!healed_sinks[i])
+                        continue;
+
+                ret = syncop_setattr (priv->children[i], &loc,
+                                      &locked_replies[source].poststat,
+                                      AFR_HEAL_ATTR, NULL, NULL);
+                if (ret)
+                        healed_sinks[i] = 0;
+
+                /* Best effort: remove the xattrs currently present on the
+                   sink before installing the source's set. Failures here
+                   are not recorded, matching the original behaviour. */
+                old_xattr = NULL;
+                ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0);
+                if (old_xattr) {
+                        dict_del (old_xattr, GF_SELINUX_XATTR_KEY);
+                        afr_filter_xattrs (old_xattr);
+                        ret = syncop_removexattr (priv->children[i], &loc, "",
+                                                  old_xattr);
+                        /* The dict returned by syncop_getxattr() is ours to
+                           release, exactly like the source xattr below;
+                           without this every healed sink leaked one dict. */
+                        dict_unref (old_xattr);
+                        old_xattr = NULL;
+                }
+
+                ret = syncop_setxattr (priv->children[i], &loc, xattr, 0);
+                if (ret)
+                        healed_sinks[i] = 0;
+        }
+
+        loc_wipe (&loc);
+        if (xattr)
+                dict_unref (xattr);
+
+        return 0;
}
-int
-afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr,
- dict_t *xdata)
+/*
+ * Look for mismatching uid/gid or mode even if xattrs don't say so, and
+ * pick one arbitrarily as winner.
+ */
+
+static int
+__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
- local->loc.path, priv->children[source]->name,
- strerror (op_errno));
-
- afr_sh_metadata_sync (frame, this, NULL);
- } else {
- afr_prune_pending_keys (xattr, priv);
- afr_sh_metadata_sync (frame, this, xattr);
- }
-
- return 0;
+        /*
+         * Settle on a single source subvolume for metadata heal.
+         *
+         * When every locked subvolume is a sink, or there is no source at
+         * all, this is a metadata split brain: fail with -EIO unless
+         * priv->metadata_splitbrain_forced_heal is set, in which case the
+         * first locked sink is promoted to source.
+         *
+         * The first source becomes the reference; any other source whose
+         * type, uid, gid or prot differs from it is demoted to a sink.
+         * Returns the reference index, or -1 if no source exists.
+         */
+        afr_private_t *priv      = this->private;
+        struct iatt    ref_stat  = {0, };
+        int            chosen    = -1;
+        int            idx       = 0;
+        int            n_locked  = 0;
+        int            n_sources = 0;
+        int            n_sinks   = 0;
+
+        n_locked  = AFR_COUNT (locked_on, priv->child_count);
+        n_sources = AFR_COUNT (sources, priv->child_count);
+        n_sinks   = AFR_COUNT (sinks, priv->child_count);
+
+        if (n_locked == n_sinks || !n_sources) {
+                if (!priv->metadata_splitbrain_forced_heal)
+                        return -EIO;
+
+                /* Metadata split brain, select one subvol arbitrarily */
+                for (idx = 0; idx < priv->child_count; idx++) {
+                        if (!(locked_on[idx] && sinks[idx]))
+                                continue;
+
+                        sources[idx] = 1;
+                        sinks[idx] = 0;
+                        break;
+                }
+        }
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (!sources[idx])
+                        continue;
+
+                if (chosen == -1) {
+                        /* first source seen: it is the reference */
+                        chosen = idx;
+                        ref_stat = replies[idx].poststat;
+                        continue;
+                }
+
+                if (IA_EQUAL (ref_stat, replies[idx].poststat, type) &&
+                    IA_EQUAL (ref_stat, replies[idx].poststat, uid) &&
+                    IA_EQUAL (ref_stat, replies[idx].poststat, gid) &&
+                    IA_EQUAL (ref_stat, replies[idx].poststat, prot))
+                        continue;
+
+                /* disagrees with the reference: heal it instead */
+                sources[idx] = 0;
+                sinks[idx] = 1;
+        }
+
+        return chosen;
}
-int
-afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- afr_sh_mark_source_sinks (frame, this);
- if (sh->active_sinks == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "syncing metadata of %s from subvolume %s to %d active sinks",
- local->loc.path, priv->children[source]->name,
- sh->active_sinks);
-
- sh->actual_sh_started = _gf_true;
- STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
- priv->children[source],
- priv->children[source]->fops->getxattr,
- &local->loc, NULL, NULL);
-
- return 0;
+        /*
+         * Gather fresh replies for @inode, work out sources and sinks for
+         * a metadata transaction, and pick the source to heal from.
+         * Returns the source index (>= 0) or a negative error.
+         */
+        afr_private_t *priv   = this->private;
+        int            status = -1;
+        int            chosen = -1;
+        int            idx    = 0;
+
+        status = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
+                                                 replies);
+        if (status)
+                return status;
+
+        status = afr_selfheal_find_direction (frame, this, replies,
+                                              AFR_METADATA_TRANSACTION,
+                                              locked_on, sources, sinks);
+        if (status)
+                return status;
+
+        chosen = __afr_selfheal_metadata_finalize_source (this, sources, sinks,
+                                                          locked_on, replies);
+        if (chosen < 0)
+                return -EIO;
+
+        /* Initialize the healed_sinks[] array optimistically to the
+           intersection of to-be-healed (i.e. sinks[]) and the list of
+           servers which are up (i.e. locked_on[]).
+
+           As we encounter failures in the healing process, we will
+           unmark the respective servers in the healed_sinks[] array.
+        */
+        for (idx = 0; idx < priv->child_count; idx++) {
+                healed_sinks[idx] = sinks[idx] && locked_on[idx];
+        }
+
+        return chosen;
}
-void
-afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static int
+__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children,
- AFR_METADATA_TRANSACTION, NULL, _gf_false);
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_WARNING,
- "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- afr_sh_print_split_brain_log (sh->pending_matrix, this,
- local->loc.path);
- afr_set_split_brain (this, sh->inode, SPB, DONT_KNOW);
- afr_sh_metadata_fail (frame, this);
- goto out;
- }
-
- afr_set_split_brain (this, sh->inode, NO_SPB, DONT_KNOW);
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- if (source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
-
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
-
- sh->source = source;
-
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
- continue;
-
- if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
-
- if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
- }
-
- if ((!IA_ISREG (sh->buf[source].ia_type)) &&
- (!IA_ISDIR (sh->buf[source].ia_type))) {
- afr_reset_children (sh->fresh_children, priv->child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
- }
-
- if (sh->do_metadata_self_heal && priv->metadata_self_heal)
- afr_sh_metadata_sync_prepare (frame, this);
- else
- afr_sh_metadata_finish (frame, this);
+ /*
+ * Metadata self-heal proper: take the inodelk in the this->name domain,
+ * pick a source and the sinks under that lock, release it, then copy the
+ * metadata and clear the pending markers. Returns 0 on success or a
+ * negative error (-ENOTCONN when the inodelk call returns less than 2).
+ */
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+
+ priv = this->private;
+
+ /* per-subvolume scratch arrays, one slot per child */
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, data_lock);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies);
+ if (ret < 0)
+ goto unlock;
+
+ /* a non-negative return is the index of the chosen source */
+ source = ret;
+ ret = 0;
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, this->name,
+ LLONG_MAX -1, 0, data_lock);
+ if (ret < 0)
+ goto out;
+
+ /* NOTE(review): the heal and the pending-marker reset below run after
+ the inodelk above has been released -- confirm that no concurrent
+ metadata transaction can slip in between. */
+ ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
+
+ ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks,
+ healed_sinks, AFR_METADATA_TRANSACTION,
+ locked_replies, data_lock);
out:
- return;
-}
-
-int
-afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame,
- xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata "
- "inodelks failed for %s.", local->loc.path);
- gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal "
- "failed for %s.", local->loc.path);
- afr_sh_metadata_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata "
- "inodelks done for %s. Proceeding to FOP",
- local->loc.path);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_metadata_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- }
-
- return 0;
-}
-
-int
-afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK;
-
- afr_set_lock_number (frame, this);
-
- int_lock->lk_flock.l_start = LLONG_MAX - 1;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk;
-
- afr_nonblocking_inodelk (frame, this);
-
- return 0;
+ return ret;
}
-gf_boolean_t
-afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
-{
- if (sh->force_confirm_spb)
- return _gf_true;
- if (sh->do_metadata_self_heal && priv->metadata_self_heal)
- return _gf_true;
- return _gf_false;
-}
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = this->private;
- afr_self_heal_t *sh = &local->self_heal;
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (afr_can_start_metadata_self_heal (sh, priv)) {
- afr_sh_metadata_lock (frame, this);
- } else {
- afr_sh_metadata_done (frame, this);
- }
-
- return 0;
+        /*
+         * Public entry point for metadata self-heal of @inode. The heal
+         * runs only while holding the self-heal domain lock
+         * (priv->sh_domain); that lock is always released before returning.
+         */
+        afr_private_t *priv      = this->private;
+        unsigned char *locked_on = NULL;
+        int            ret       = 0;
+
+        locked_on = alloca0 (priv->child_count);
+
+        ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain,
+                                       0, 0, locked_on);
+        if (ret < 2) {
+                /* Either less than two subvols available, or another
+                   selfheal (from another server) is in progress. Skip
+                   for now; in either case there isn't anything to do.
+                */
+                ret = -ENOTCONN;
+        } else {
+                ret = __afr_selfheal_metadata (frame, this, inode, locked_on);
+        }
+
+        afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0,
+                                locked_on);
+
+        return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
new file mode 100644
index 000000000..ce80b8da3
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -0,0 +1,457 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "afr.h"
+#include "afr-self-heal.h"
+
+
+/*
+ * Re-issue a named lookup carrying a "gfid-req" key (the gfid taken from
+ * replies[gfid_idx]) on every subvolume whose earlier reply failed with
+ * ENODATA.
+ *
+ * Side effect: @parent's gfid is overwritten with @pargfid.
+ *
+ * Returns 0 when every lookup that was sent succeeded (or none was
+ * needed), -ENOMEM when the request dict cannot be set up, otherwise the
+ * result of the first lookup that failed.
+ */
+int
+__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent,
+                            uuid_t pargfid, const char *bname, inode_t *inode,
+                            struct afr_reply *replies, int gfid_idx)
+{
+        int            i      = 0;
+        afr_private_t *priv   = NULL;
+        dict_t        *xdata  = NULL;
+        int            ret    = 0;
+        int            op_ret = 0;
+        loc_t          loc    = {0, };
+
+        priv = this->private;
+
+        uuid_copy (parent->gfid, pargfid);
+
+        xdata = dict_new ();
+        if (!xdata) {
+                return -ENOMEM;
+        }
+
+        ret = dict_set_static_bin (xdata, "gfid-req",
+                                   replies[gfid_idx].poststat.ia_gfid, 16);
+        if (ret) {
+                /* release through the refcount, like the normal exit path */
+                dict_unref (xdata);
+                return -ENOMEM;
+        }
+
+        loc.parent = inode_ref (parent);
+        loc.inode = inode_ref (inode);
+        uuid_copy (loc.pargfid, pargfid);
+        loc.name = bname;
+
+        for (i = 0; i < priv->child_count; i++) {
+                if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA)
+                        continue;
+
+                ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0);
+                /* keep the first failure; a later success must not hide it */
+                if (ret && !op_ret)
+                        op_ret = ret;
+        }
+
+        loc_wipe (&loc);
+        dict_unref (xdata);
+
+        return op_ret;
+}
+
+
+/*
+ * Recreate the entry @bname under @parent on every subvolume that gave a
+ * valid reply whose gfid differs from the one in replies[gfid_idx].
+ *
+ * Side effect: @parent's gfid is overwritten with @pargfid.
+ *
+ * Returns the bitwise OR of all afr_selfheal_recreate_entry() results,
+ * i.e. 0 only if every attempted recreate returned 0.
+ */
+int
+__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent,
+                             uuid_t pargfid, const char *bname, inode_t *inode,
+                             struct afr_reply *replies, int gfid_idx)
+{
+        afr_private_t *priv   = this->private;
+        int            status = 0;
+        int            child  = 0;
+
+        uuid_copy (parent->gfid, pargfid);
+
+        for (child = 0; child < priv->child_count; child++) {
+                /* only valid replies whose gfid disagrees need a recreate */
+                if (replies[child].valid &&
+                    uuid_compare (replies[child].poststat.ia_gfid,
+                                  replies[gfid_idx].poststat.ia_gfid) != 0) {
+                        status |= afr_selfheal_recreate_entry (frame, this,
+                                                               child, gfid_idx,
+                                                               parent, bname,
+                                                               inode, replies);
+                }
+        }
+
+        return status;
+}
+
+
+/*
+ * Remove the entry @bname under @parent from every subvolume that gave a
+ * valid, successful reply: rmdir for directories, unlink for anything
+ * else. Each removal is logged at WARNING level.
+ *
+ * Returns the bitwise OR of the rmdir/unlink results.
+ */
+int
+__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent,
+                             uuid_t pargfid, const char *bname, inode_t *inode,
+                             struct afr_reply *replies)
+{
+        afr_private_t *priv   = this->private;
+        loc_t          loc    = {0, };
+        char           gfid_str[64];
+        int            status = 0;
+        int            child  = 0;
+
+        loc.parent = inode_ref (parent);
+        uuid_copy (loc.pargfid, pargfid);
+        loc.name = bname;
+        loc.inode = inode_ref (inode);
+
+        for (child = 0; child < priv->child_count; child++) {
+                /* skip subvolumes that did not answer or lack the entry */
+                if (!replies[child].valid || replies[child].op_ret)
+                        continue;
+
+                if (replies[child].poststat.ia_type == IA_IFDIR) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "expunging dir %s/%s (%s) on %s",
+                                uuid_utoa (pargfid), bname,
+                                uuid_utoa_r (replies[child].poststat.ia_gfid,
+                                             gfid_str),
+                                priv->children[child]->name);
+                        status |= syncop_rmdir (priv->children[child], &loc, 1);
+                } else {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "expunging file %s/%s (%s) on %s",
+                                uuid_utoa (pargfid), bname,
+                                uuid_utoa_r (replies[child].poststat.ia_gfid,
+                                             gfid_str),
+                                priv->children[child]->name);
+                        status |= syncop_unlink (priv->children[child], &loc);
+                }
+        }
+
+        loc_wipe (&loc);
+
+        return status;
+}
+
+
+/*
+ * Decide what to do about the name @bname under @parent from the per-child
+ * lookup @replies, then do it:
+ * - return 0 when all valid replies agree on op_ret and gfid;
+ * - expunge the name when no eligible child has it;
+ * - otherwise pick the gfid to keep, assign it where it is missing and
+ * recreate the entry where the gfid differs (impunge).
+ * Returns -1 on a gfid mismatch between eligible children or when no gfid
+ * can be chosen. A child is "eligible" when sources[i] is set, or for every
+ * child when @source is -1.
+ */
+int
+__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, int source,
+ unsigned char *locked_on, struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uuid_t gfid = {0, };
+ int gfid_idx = -1;
+ gf_boolean_t source_is_empty = _gf_true;
+ gf_boolean_t need_heal = _gf_false;
+ int first_idx = -1;
+ char g1[64],g2[64];
+
+ priv = this->private;
+
+ /* Pass 1: compare every valid reply against the first valid one; any
+ difference in op_ret or gfid means the name needs healing. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ need_heal = _gf_true;
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ need_heal = _gf_true;
+ }
+
+ if (!need_heal)
+ return 0;
+
+ /* Pass 2: does any eligible child actually have the entry? */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (!replies[i].op_ret && (source == -1 || sources[i])) {
+ source_is_empty = _gf_false;
+ break;
+ }
+ }
+
+ if (source_is_empty) {
+ return __afr_selfheal_name_expunge (frame, this, parent, pargfid,
+ bname, inode, replies);
+ }
+
+ /* Pass 3: choose the gfid to keep. The first non-null gfid seeds the
+ choice; a later eligible child replaces it, unless the current holder
+ is eligible too and the two gfids differ, which is an error. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (uuid_is_null (replies[i].poststat.ia_gfid))
+ continue;
+
+ if (uuid_is_null (gfid)) {
+ uuid_copy (gfid, replies[i].poststat.ia_gfid);
+ gfid_idx = i;
+ continue;
+ }
+
+ if (sources[i] || source == -1) {
+ if (gfid_idx != -1 &&
+ (sources[gfid_idx] || source == -1) &&
+ uuid_compare (gfid, replies[i].poststat.ia_gfid)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "GFID mismatch for <gfid:%s>/%s "
+ "%s on %s and %s on %s",
+ uuid_utoa (pargfid), bname,
+ uuid_utoa_r (replies[i].poststat.ia_gfid, g1),
+ priv->children[i]->name,
+ uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2),
+ priv->children[gfid_idx]->name);
+ return -1;
+ }
+
+ uuid_copy (gfid, replies[i].poststat.ia_gfid);
+ gfid_idx = i;
+ continue;
+ }
+ }
+
+ /* no reply carried a gfid at all */
+ if (gfid_idx == -1)
+ return -1;
+
+ /* the result of the gfid assignment is not checked here */
+ __afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode,
+ replies, gfid_idx);
+
+ return __afr_selfheal_name_impunge (frame, this, parent, pargfid,
+ bname, inode, replies, gfid_idx);
+}
+
+
+/*
+ * Return the index of the first subvolume marked in @sources, or -1 when
+ * there is no usable source: either no source is marked at all, or the
+ * number of sinks equals the number of locked subvolumes.
+ */
+int
+__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources,
+                                     unsigned char *sinks, unsigned char *locked_on,
+                                     struct afr_reply *replies)
+{
+        afr_private_t *priv      = this->private;
+        int            n_locked  = 0;
+        int            n_sources = 0;
+        int            n_sinks   = 0;
+        int            idx       = 0;
+
+        n_locked  = AFR_COUNT (locked_on, priv->child_count);
+        n_sources = AFR_COUNT (sources, priv->child_count);
+        n_sinks   = AFR_COUNT (sinks, priv->child_count);
+
+        if (n_locked == n_sinks || !n_sources)
+                return -1;
+
+        for (idx = 0; idx < priv->child_count; idx++) {
+                if (sources[idx])
+                        return idx;
+        }
+
+        return -1;
+}
+
+
+/*
+ * Gather what a name heal needs: discover the parent on the subvolumes,
+ * compute sources[]/sinks[] for the entry transaction, pick a source and
+ * seed healed_sinks[].
+ *
+ * Returns the non-zero result of the discover or find-direction step if
+ * either fails, otherwise 0. *source_p receives the chosen source index,
+ * which may be negative.
+ */
+int
+__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent,
+                             uuid_t pargfid, unsigned char *locked_on,
+                             unsigned char *sources, unsigned char *sinks,
+                             unsigned char *healed_sinks, struct afr_reply *replies,
+                             int *source_p)
+{
+        afr_private_t *priv  = this->private;
+        int            rc    = -1;
+        int            child = 0;
+
+        rc = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies);
+        if (rc != 0)
+                return rc;
+
+        rc = afr_selfheal_find_direction (frame, this, replies,
+                                          AFR_ENTRY_TRANSACTION,
+                                          locked_on, sources, sinks);
+        if (rc != 0)
+                return rc;
+
+        /* A negative source (typically split-brain) is passed through on
+           purpose: the caller then performs a conservative merge of
+           entries rather than erroring out. */
+        *source_p = __afr_selfheal_name_finalize_source (this, sources, sinks,
+                                                         locked_on, replies);
+
+        /* Optimistically mark as healed every sink that is also locked
+           (i.e. up). Servers that fail during the heal are unmarked
+           later in healed_sinks[]. */
+        for (child = 0; child < priv->child_count; child++)
+                healed_sinks[child] = (sinks[child] && locked_on[child]);
+
+        return rc;
+}
+
+
+/*
+ * Heal the directory entry @bname under @parent while holding entry
+ * locks taken in this xlator's domain.
+ *
+ * Returns -ENOTCONN when fewer than two subvolumes could be locked,
+ * -ENOMEM when the locked lookup yields no inode, the prepare step's
+ * error if it fails, and otherwise the result of
+ * __afr_selfheal_name_do(). The locks are released on every path.
+ */
+int
+afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
+                      uuid_t pargfid, const char *bname)
+{
+        afr_private_t    *priv         = this->private;
+        unsigned char    *locked_on    = alloca0 (priv->child_count);
+        unsigned char    *sources      = alloca0 (priv->child_count);
+        unsigned char    *sinks        = alloca0 (priv->child_count);
+        unsigned char    *healed_sinks = alloca0 (priv->child_count);
+        struct afr_reply *replies      = alloca0 (priv->child_count * sizeof(*replies));
+        inode_t          *inode        = NULL;
+        int               source       = -1;
+        int               ret          = -1;
+
+        ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname,
+                                    locked_on);
+        if (ret < 2) {
+                /* Need locks on at least two subvolumes to heal. */
+                ret = -ENOTCONN;
+                goto unlock;
+        }
+
+        ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid,
+                                           locked_on, sources, sinks,
+                                           healed_sinks, replies,
+                                           &source);
+        if (ret)
+                goto unlock;
+
+        inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
+                                                 replies, locked_on);
+        if (!inode) {
+                ret = -ENOMEM;
+                goto unlock;
+        }
+
+        ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname,
+                                      inode, sources, sinks, healed_sinks,
+                                      source, locked_on, replies);
+unlock:
+        afr_selfheal_unentrylk (frame, this, parent, this->name, bname,
+                                locked_on);
+        if (inode)
+                inode_unref (inode);
+
+        return ret;
+}
+
+
+/*
+ * Look up @bname on every child that is up, without taking locks, and
+ * decide whether the name needs healing.
+ *
+ * *need_heal is set to _gf_true when any valid reply differs from the
+ * first valid reply in op_ret or in gfid; it is never cleared here.
+ * Returns -ENOMEM if the lookup yields no inode, otherwise 0.
+ */
+int
+afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
+                                    inode_t *parent, uuid_t pargfid,
+                                    const char *bname, gf_boolean_t *need_heal)
+{
+        afr_private_t    *priv    = this->private;
+        struct afr_reply *replies = NULL;
+        struct afr_reply *ref     = NULL; /* first valid reply */
+        struct afr_reply *cur     = NULL;
+        inode_t          *inode   = NULL;
+        int               child   = 0;
+
+        replies = alloca0 (sizeof (*replies) * priv->child_count);
+
+        inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
+                                                 replies, priv->child_up);
+        if (!inode)
+                return -ENOMEM;
+
+        for (child = 0; child < priv->child_count; child++) {
+                cur = &replies[child];
+
+                if (!cur->valid)
+                        continue;
+
+                if (!ref) {
+                        ref = cur;
+                        continue;
+                }
+
+                if (cur->op_ret != ref->op_ret ||
+                    uuid_compare (cur->poststat.ia_gfid,
+                                  ref->poststat.ia_gfid))
+                        *need_heal = _gf_true;
+        }
+
+        inode_unref (inode);
+        return 0;
+}
+
+/*
+ * Entry point for healing the name @bname inside the directory whose
+ * gfid is @pargfid.
+ *
+ * Runs an unlocked inspection first and performs the locked heal only
+ * when the inspection reports a mismatch.
+ *
+ * Returns -1 if the parent inode or a frame cannot be obtained, the
+ * inspection's error if it fails, 0 if no heal was needed, and otherwise
+ * the result of afr_selfheal_name_do().
+ */
+int
+afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname)
+{
+        inode_t      *parent    = NULL;
+        call_frame_t *frame     = NULL;
+        int           ret       = -1;
+        gf_boolean_t  need_heal = _gf_false;
+
+        parent = afr_inode_find (this, pargfid);
+        if (!parent)
+                goto out;
+
+        frame = afr_frame_create (this);
+        if (!frame)
+                goto out;
+
+        ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid,
+                                                  bname, &need_heal);
+        if (ret)
+                goto out;
+
+        /* Propagate the heal result; dropping it reported a failed heal
+           as success. */
+        if (need_heal)
+                ret = afr_selfheal_name_do (frame, this, parent, pargfid,
+                                            bname);
+out:
+        if (parent)
+                inode_unref (parent);
+        if (frame)
+                AFR_STACK_DESTROY (frame);
+
+        return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 7c9bc8111..a1b972ac3 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,36 +8,160 @@
cases as published by the Free Software Foundation.
*/
-#ifndef __AFR_SELF_HEAL_H__
-#define __AFR_SELF_HEAL_H__
-#include <sys/stat.h>
+#ifndef _AFR_SELFHEAL_H
+#define _AFR_SELFHEAL_H
-#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type)
-#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type))
-#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid))
-#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size)
-#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size)
+/* Perform fop on all UP subvolumes (per priv->child_up[]) and wait for
+   all callbacks to return. The child index is passed to rfn as the
+   cookie; afr_replies_wipe() is called on frame->local before winding.
+   The frame argument is parenthesized where it is dereferenced so that
+   any pointer expression may be passed. */
+
+#define AFR_ONALL(frame, rfn, fop, args ...) do {                       \
+        afr_local_t *__local = (frame)->local;                          \
+        afr_private_t *__priv = (frame)->this->private;                 \
+        int __i = 0, __count = 0;                                       \
+                                                                        \
+        afr_replies_wipe (__local, __priv);                             \
+                                                                        \
+        for (__i = 0; __i < __priv->child_count; __i++) {               \
+                if (!__priv->child_up[__i]) continue;                   \
+                STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i,      \
+                                   __priv->children[__i],               \
+                                   __priv->children[__i]->fops->fop, args); \
+                __count++;                                              \
+        }                                                               \
+        syncbarrier_wait (&__local->barrier, __count);                  \
+        } while (0)
+
+
+/* Perform fop on all subvolumes represented by list[] array and wait
+   for all callbacks to return. The child index is passed to rfn as the
+   cookie; afr_replies_wipe() is called on frame->local before winding.
+   list and frame are parenthesized where they are indexed/dereferenced
+   so that arbitrary pointer expressions may be passed. */
+
+#define AFR_ONLIST(list, frame, rfn, fop, args ...) do {                \
+        afr_local_t *__local = (frame)->local;                          \
+        afr_private_t *__priv = (frame)->this->private;                 \
+        int __i = 0, __count = 0;                                       \
+                                                                        \
+        afr_replies_wipe (__local, __priv);                             \
+                                                                        \
+        for (__i = 0; __i < __priv->child_count; __i++) {               \
+                if (!(list)[__i]) continue;                             \
+                STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i,      \
+                                   __priv->children[__i],               \
+                                   __priv->children[__i]->fops->fop, args); \
+                __count++;                                              \
+        }                                                               \
+        syncbarrier_wait (&__local->barrier, __count);                  \
+        } while (0)
+
+
+/* Perform fop on each UP subvolume (per priv->child_up[]) one at a time:
+   after every wind, wait on the barrier for that single callback before
+   moving to the next child. The child index is passed to rfn as the
+   cookie; afr_replies_wipe() is called on frame->local first. The frame
+   argument is parenthesized where it is dereferenced. */
+#define AFR_SEQ(frame, rfn, fop, args ...) do {                         \
+        afr_local_t *__local = (frame)->local;                          \
+        afr_private_t *__priv = (frame)->this->private;                 \
+        int __i = 0;                                                    \
+                                                                        \
+        afr_replies_wipe (__local, __priv);                             \
+                                                                        \
+        for (__i = 0; __i < __priv->child_count; __i++) {               \
+                if (!__priv->child_up[__i]) continue;                   \
+                STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i,      \
+                                   __priv->children[__i],               \
+                                   __priv->children[__i]->fops->fop, args); \
+                syncbarrier_wait (&__local->barrier, 1);                \
+        }                                                               \
+        } while (0)
+
+
+/* Evaluate to an n x n matrix of `type`, built with alloca0(): one
+   array of n row pointers plus n rows of n elements each. n is
+   parenthesized so that an expression such as `a + b` multiplies and
+   compares correctly; note that n is still evaluated more than once. */
+#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \
+        int __i; \
+        __ptr = alloca0 ((n) * sizeof(type *)); \
+        for (__i = 0; __i < (n); __i++) __ptr[__i] = alloca0 ((n) * sizeof(type)); \
+        __ptr;})
+
+
+/* True when the ia_<field> members of structures f and s are bytewise
+   equal. f and s are parenthesized so that expressions such as *ptr can
+   be passed. */
+#define IA_EQUAL(f,s,field) (memcmp (&((f).ia_##field), &((s).ia_##field), sizeof ((s).ia_##field)) == 0)
+
+
+int
+afr_selfheal (xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name);
+
+int
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+int
+afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+
+int
+afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
+
+int
+afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on);
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr);
+afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
+
+int
+afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
+
+int
+afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies);
+
+inode_t *
+afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on);
int
-afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks);
int
-afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
- dict_t **xattr,
- afr_transaction_type txn_type,
- uuid_t gfid);
-#endif /* __AFR_SELF_HEAL_H__ */
+afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix);
+
+int
+afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, afr_transaction_type type,
+ struct afr_reply *replies, unsigned char *locked_on);
+
+int
+afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst,
+ int source, inode_t *dir, const char *name,
+ inode_t *inode, struct afr_reply *replies);
+
+int
+afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr);
+
+call_frame_t *
+afr_frame_create (xlator_t *this);
+
+inode_t *
+afr_inode_find (xlator_t *this, uuid_t gfid);
+
+#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index 37bc224f5..4bfe909bc 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,1266 +8,1249 @@
cases as published by the Free Software Foundation.
*/
+
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
+
#include "afr.h"
-#include "syncop.h"
+#include "afr-self-heal.h"
#include "afr-self-heald.h"
-#include "afr-self-heal-common.h"
#include "protocol-common.h"
-#include "event-history.h"
-typedef enum {
- STOP_CRAWL_ON_SINGLE_SUBVOL = 1
-} afr_crawl_flags_t;
+#define SHD_INODE_LRU_LIMIT 2048
+#define AFR_EH_HEALED_LIMIT 1024
+#define AFR_EH_HEAL_FAIL_LIMIT 1024
+#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
+#define AFR_STATISTICS_HISTORY_SIZE 50
-typedef enum {
- HEAL = 1,
- INFO
-} shd_crawl_op;
-typedef struct shd_dump {
- dict_t *dict;
- xlator_t *this;
- int child;
-} shd_dump_t;
+/*
+ * ASSERT_LOCAL: record in healer->local whether healer->subvol is local,
+ * as reported by afr_shd_is_subvol_local(). When it is not local the
+ * macro executes `break` if safe_break() returns true and `continue`
+ * otherwise, so it must be expanded directly inside the loop those
+ * statements are meant to affect. It is a bare if/else rather than a
+ * do { } while (0) wrapper - presumably so that break/continue reach
+ * the caller's loop.
+ */
+#define ASSERT_LOCAL(this, healer) \
+ if (!afr_shd_is_subvol_local(this, healer->subvol)) { \
+ healer->local = _gf_false; \
+ if (safe_break (healer)) { \
+ break; \
+ } else { \
+ continue; \
+ } \
+ } else { \
+ healer->local = _gf_true; \
+ }
-typedef struct shd_event_ {
- int child;
- char *path;
-} shd_event_t;
-typedef struct shd_pos_ {
- int child;
- xlator_t *this;
- afr_child_pos_t pos;
-} shd_pos_t;
+/* Address of the n-th entry of shd.index_healers[] / shd.full_healers[]
+   in this xlator's private data. The `this` argument and the whole
+   expansion are parenthesized so the macros compose safely inside larger
+   expressions. */
+#define NTH_INDEX_HEALER(this, n) (&(((afr_private_t *)(this)->private)->shd.index_healers[(n)]))
+#define NTH_FULL_HEALER(this, n) (&(((afr_private_t *)(this)->private)->shd.full_healers[(n)]))
-typedef int
-(*afr_crawl_done_cbk_t) (int ret, call_frame_t *sync_frame, void *crawl_data);
+int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p);
-void
-afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl,
- process_entry_cbk_t process_entry, void *op_data,
- gf_boolean_t exclusive, int crawl_flags,
- afr_crawl_done_cbk_t crawl_done);
+/*
+ * Return the name of child number @subvol, or NULL when @subvol is not a
+ * valid index into priv->children[] (valid indices are
+ * 0 .. child_count - 1).
+ */
+char *
+afr_subvol_name (xlator_t *this, int subvol)
+{
+        afr_private_t *priv = NULL;
-static int
-_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data);
+        priv = this->private;
+        /* >= rather than >: subvol == child_count is one past the end. */
+        if (subvol < 0 || subvol >= priv->child_count)
+                return NULL;
-/* For calling straight through (e.g. already in a synctask). */
-int
-afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos);
+        return priv->children[subvol]->name;
+}
-/* For deferring through a new synctask. */
-int
-afr_syncop_find_child_position (void *data);
-static int
-_loc_assign_gfid_path (loc_t *loc)
+/*
+ * Destructor callback for crawl-event history entries. It does nothing
+ * with @data.
+ * NOTE(review): presumably the history buffer releases the entry itself
+ * - confirm against the event-history implementation.
+ */
+void
+afr_destroy_crawl_event_data (void *data)
{
- int ret = -1;
- char gfid_path[64] = {0};
-
- if (loc->inode && !uuid_is_null (loc->inode->gfid)) {
- ret = inode_path (loc->inode, NULL, (char**)&loc->path);
- } else if (!uuid_is_null (loc->gfid)) {
- snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>",
- uuid_utoa (loc->gfid));
- loc->path = gf_strdup (gfid_path);
- if (loc->path)
- ret = 0;
- }
- return ret;
+ return;
}
+
+/*
+ * Destructor callback for shd_event_t history entries: frees the path
+ * string owned by the event. A NULL @data is ignored.
+ * NOTE(review): the shd_event_t itself is not freed here (the removed
+ * shd_cleanup_event() did free it) - presumably the history buffer now
+ * owns and frees the entry; confirm.
+ */
void
-shd_cleanup_event (void *event)
+afr_destroy_shd_event_data (void *data)
{
- shd_event_t *shd_event = event;
+ shd_event_t *shd_event = data;
+
+ if (!shd_event)
+ return;
+ GF_FREE (shd_event->path);
- if (!shd_event)
- goto out;
- GF_FREE (shd_event->path);
- GF_FREE (shd_event);
-out:
return;
}
-int
-afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count)
-{
- int i = 0;
- int ret = -1;
- for (i = 0; i < child_count; i++) {
- if (shd->pos[i] == AFR_POS_LOCAL) {
- ret = i;
- break;
- }
- }
- return ret;
-}
-static int
-_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent)
+/*
+ * Report whether child number @subvol is local, by fetching the
+ * GF_XATTR_PATHINFO_KEY xattr of the root inode from that child and
+ * passing the value to afr_local_pathinfo().
+ *
+ * Returns _gf_false when the getxattr fails, returns no dict, or the
+ * dict lacks the key. The xattr dict is released on every path; the
+ * pathinfo string points into that dict, so it is only used before the
+ * unref.
+ */
+gf_boolean_t
+afr_shd_is_subvol_local (xlator_t *this, int subvol)
{
- int ret = 0;
+        char          *pathinfo = NULL;
+        afr_private_t *priv     = NULL;
+        dict_t        *xattr    = NULL;
+        int            ret      = 0;
+        gf_boolean_t   is_local = _gf_false;
+        loc_t          loc      = {0, };
- uuid_copy (loc->pargfid, parent->inode->gfid);
- loc->path = "";
- loc->name = name;
- loc->parent = inode_ref (parent->inode);
- if (!loc->parent) {
- loc->path = NULL;
- loc_wipe (loc);
- ret = -1;
- }
- return ret;
-}
+        priv = this->private;
-int
-_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path,
- struct timeval *tv, gf_boolean_t dyn)
-{
- //subkey not used for now
- int ret = -1;
- uint64_t count = 0;
- char key[256] = {0};
- int xl_id = 0;
+        /* The root inode is borrowed without taking a reference, so loc
+           must not be passed to loc_wipe(). */
+        loc.inode = this->itable->root;
+        uuid_copy (loc.gfid, loc.inode->gfid);
- ret = dict_get_int32 (output, this->name, &xl_id);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
- goto out;
- }
+        ret = syncop_getxattr (priv->children[subvol], &loc, &xattr,
+                               GF_XATTR_PATHINFO_KEY);
+        if (ret)
+                goto out;
+        if (!xattr)
+                goto out;
- snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
- ret = dict_get_uint64 (output, key, &count);
+        ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo);
+        if (ret)
+                goto out;
- snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count);
- if (dyn)
- ret = dict_set_dynstr (output, key, path);
- else
- ret = dict_set_str (output, key, path);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output",
- path);
- goto out;
- }
+        afr_local_pathinfo (pathinfo, &is_local);
- if (!tv)
- goto inc_count;
- snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id,
- child, count);
- ret = dict_set_uint32 (output, key, tv->tv_sec);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time",
- path);
- goto out;
- }
+        gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal",
+                priv->children[subvol]->name, is_local? "" : "not ");
-inc_count:
- snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
- ret = dict_set_uint64 (output, key, count + 1);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Could not increment count");
- goto out;
- }
- ret = 0;
-out:
- return ret;
+out:
+        /* Drop the reply dict; it was previously leaked on every path. */
+        if (xattr)
+                dict_unref (xattr);
+        return is_local;
}
-int
-_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child,
- char **fpath, gf_boolean_t *missing)
-{
- dict_t *xattr = NULL;
- char *path = NULL;
- int ret = -1;
-
- ret = syncop_getxattr (readdir_xl, child, &xattr, GFID_TO_PATH_KEY);
- if (ret < 0) {
- if ((errno == ENOENT) && missing)
- *missing = _gf_true;
- goto out;
- }
- ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to get path for "
- "gfid %s", uuid_utoa (child->gfid));
- goto out;
- }
- path = gf_strdup (path);
- if (!path) {
- ret = -1;
- goto out;
- }
- ret = 0;
-out:
- if (!ret)
- *fpath = path;
- if (xattr)
- dict_unref (xattr);
- return ret;
-}
+/*
+ * Wait until the healer is asked to rerun or a 60 second timeout
+ * elapses. healer->mutex is handed to pthread_cond_timedwait(), so the
+ * caller must hold it (afr_shd_healer_wait() is the locking wrapper).
+ *
+ * Returns the value healer->rerun had when the wait ended and resets
+ * healer->rerun to 0. While priv->shd.enabled is false the function
+ * does not return: it re-arms the timeout and waits again.
+ */
int
-_add_event_to_dict (circular_buffer_t *cb, void *data)
+__afr_shd_healer_wait (struct subvol_healer *healer)
{
- int ret = 0;
- shd_dump_t *dump_data = NULL;
- shd_event_t *shd_event = NULL;
+ afr_private_t *priv = NULL;
+ struct timespec wait_till = {0, };
+ int ret = 0;
- dump_data = data;
- shd_event = cb->data;
- if (shd_event->child != dump_data->child)
- goto out;
- ret = _add_path_to_dict (dump_data->this, dump_data->dict,
- dump_data->child, shd_event->path, &cb->tv,
- _gf_false);
-out:
- return ret;
-}
+ priv = healer->this->private;
-int
-_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child)
-{
- shd_dump_t dump_data = {0};
+disabled_loop:
+ /* Absolute deadline: now + 60 seconds. */
+ wait_till.tv_sec = time (NULL) + 60;
- dump_data.this = this;
- dump_data.dict = dict;
- dump_data.child = child;
- eh_dump (eh, &dump_data, _add_event_to_dict);
- return 0;
-}
+ /* Only ETIMEDOUT ends the wait early; any other return value loops
+ back and re-checks healer->rerun. */
+ while (!healer->rerun) {
+ ret = pthread_cond_timedwait (&healer->cond,
+ &healer->mutex,
+ &wait_till);
+ if (ret == ETIMEDOUT)
+ break;
+ }
-void
-_remove_stale_index (xlator_t *this, xlator_t *readdir_xl,
- loc_t *parent, char *fname)
-{
- int ret = 0;
- loc_t index_loc = {0};
+ /* Consume the rerun request. */
+ ret = healer->rerun;
+ healer->rerun = 0;
- ret = _build_index_loc (this, &index_loc, fname, parent);
- if (ret)
- goto out;
- gf_log (this->name, GF_LOG_DEBUG, "Removing stale index "
- "for %s on %s", index_loc.name, readdir_xl->name);
- ret = syncop_unlink (readdir_xl, &index_loc);
- if(ret && (errno != ENOENT)) {
- gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index "
- "on %s - %s",index_loc.name, readdir_xl->name,
- strerror (errno));
- }
- index_loc.path = NULL;
- loc_wipe (&index_loc);
-out:
- return;
+ if (!priv->shd.enabled)
+ goto disabled_loop;
+
+ return ret;
}
+
+/*
+ * Locking wrapper: take healer->mutex, run __afr_shd_healer_wait() and
+ * return its result after releasing the mutex.
+ */
int
-_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data,
- gf_dirent_t *entry,
- loc_t *childloc, loc_t *parentloc, struct iatt *iattr)
+afr_shd_healer_wait (struct subvol_healer *healer)
{
- dict_t *output = NULL;
- xlator_t *readdir_xl = NULL;
- int ret = -1;
- char *path = NULL;
- gf_boolean_t missing = _gf_false;
- char gfid_str[64] = {0};
+ int ret = 0;
- if (uuid_is_null (childloc->gfid))
- goto out;
-
- output = crawl_data->op_data;
- readdir_xl = crawl_data->readdir_xl;
-
- ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path,
- &missing);
- if (ret == 0) {
- ret = _add_path_to_dict (this, output, crawl_data->child, path,
- NULL, _gf_true);
- } else if (missing) {
- _remove_stale_index (this, readdir_xl, parentloc,
- uuid_utoa_r (childloc->gfid, gfid_str));
- }
+ pthread_mutex_lock (&healer->mutex);
+ {
+ ret = __afr_shd_healer_wait (healer);
+ }
+ pthread_mutex_unlock (&healer->mutex);
-out:
- if (ret && path)
- GF_FREE (path);
- return ret;
+ return ret;
}
-void
-_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
- int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp,
- afr_crawl_data_t *crawl_data)
-{
- int ret = 0;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- eh_t *eh = NULL;
- char *path = NULL;
- char gfid_str[64] = {0};
- shd_event_t *event = NULL;
- int32_t sh_failed = 0;
- gf_boolean_t split_brain = 0;
- int32_t actual_sh_done = 0;
- priv = this->private;
- shd = &priv->shd;
- if (crawl_data->crawl == INDEX) {
- if ((op_ret < 0) && (op_errno == ENOENT)) {
- _remove_stale_index (this, crawl_data->readdir_xl,
- parent, uuid_utoa_r (child->gfid,
- gfid_str));
- goto out;
- }
- ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl,
- child, &path, NULL);
- if (ret)
- goto out;
- } else {
- path = gf_strdup (child->path);
- if (!path) {
- ret = -1;
- goto out;
- }
- }
-
- if (xattr_rsp) {
- ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed);
- ret = dict_get_int32 (xattr_rsp, "actual-sh-done", &actual_sh_done);
- }
-
- split_brain = afr_is_split_brain (this, child->inode);
- if ((op_ret < 0 && op_errno == EIO) || split_brain) {
- eh = shd->split_brain;
- } else if ((op_ret < 0) || sh_failed) {
- eh = shd->heal_failed;
- } else if (actual_sh_done == 1) {
- eh = shd->healed;
- }
+/*
+ * Decide, under healer->mutex, whether the healer may stop.
+ *
+ * If healer->rerun is set the healer must keep going: _gf_false is
+ * returned and nothing is changed. Otherwise healer->running is cleared
+ * and _gf_true is returned.
+ */
+gf_boolean_t
+safe_break (struct subvol_healer *healer)
+{
+ gf_boolean_t ret = _gf_false;
- ret = -1;
+ pthread_mutex_lock (&healer->mutex);
+ {
+ if (healer->rerun)
+ goto unlock;
- if (eh != NULL) {
- event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t);
- if (!event)
- goto out;
- event->child = crawl_data->child;
- event->path = path;
+ healer->running = _gf_false;
+ ret = _gf_true;
+ }
+unlock:
+ pthread_mutex_unlock (&healer->mutex);
- ret = eh_save_history (eh, event);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save "
- "to event history, (%d, %s)", path, op_ret,
- strerror (op_errno));
+ return ret;
+}
- goto out;
- }
- } else {
- gf_log (this->name, GF_LOG_DEBUG, "%s:Self heal already done ",
- path);
- }
- ret = 0;
+/*
+ * Return the inode for @gfid: the one already present in this xlator's
+ * inode table if inode_find() succeeds, otherwise a new inode that is
+ * looked up on @subvol and linked into the table.
+ *
+ * Returns NULL when a new inode cannot be allocated, the lookup fails,
+ * or inode_link() returns NULL.
+ */
+inode_t *
+afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid)
+{
+ inode_t *inode = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+
+ inode = inode_find (this->itable, gfid);
+ if (inode)
+ goto out;
+
+ loc.inode = inode_new (this->itable);
+ if (!loc.inode)
+ goto out;
+ uuid_copy (loc.gfid, gfid);
+
+ ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ /* Link by gfid only: no parent and no name are supplied. */
+ inode = inode_link (loc.inode, NULL, NULL, &iatt);
+ if (inode)
+ inode_lookup (inode);
out:
- if (ret && path)
- GF_FREE (path);
- return;
+ /* loc is wiped on every path, including the early-found case where
+ it is still zeroed. */
+ loc_wipe (&loc);
+ return inode;
}
-int
-_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr)
-{
- inode_t *link_inode = NULL;
- int ret = -1;
- link_inode = inode_link (loc->inode, NULL, NULL, iattr);
- if (link_inode == NULL) {
- gf_log (this->name, GF_LOG_ERROR, "inode link failed "
- "on the inode (%s)", uuid_utoa (iattr->ia_gfid));
- goto out;
- }
- inode_unref (loc->inode);
- loc->inode = link_inode;
- ret = 0;
+/*
+ * Return an anonymous fd on the index directory of child number @child,
+ * or NULL on failure.
+ *
+ * The index directory's gfid is read from the GF_XATTROP_INDEX_GFID
+ * xattr of the root inode on that child. errno is set to -ret only when
+ * the getxattr step fails or returns no dict; the other failure paths
+ * leave errno untouched.
+ */
+fd_t *
+afr_shd_index_opendir (xlator_t *this, int child)
+{
+        fd_t          *fd         = NULL;
+        afr_private_t *priv       = NULL;
+        xlator_t      *subvol     = NULL;
+        loc_t          rootloc    = {0, };
+        inode_t       *inode      = NULL;
+        int            ret        = 0;
+        dict_t        *xattr      = NULL;
+        void          *index_gfid = NULL;
+
+        priv = this->private;
+        subvol = priv->children[child];
+
+        rootloc.inode = inode_ref (this->itable->root);
+        uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+        ret = syncop_getxattr (subvol, &rootloc, &xattr,
+                               GF_XATTROP_INDEX_GFID);
+        if (ret || !xattr) {
+                errno = -ret;
+                goto out;
+        }
+
+        ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid);
+        if (ret)
+                goto out;
+
+        gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s",
+                subvol->name, uuid_utoa (index_gfid));
+
+        inode = afr_shd_inode_find (this, subvol, index_gfid);
+        if (!inode)
+                goto out;
+        fd = fd_anonymous (inode);
+        /* Release the reference returned by afr_shd_inode_find(); it was
+           previously leaked on every call. This relies on the fd holding
+           its own reference on the inode - confirm against fd_anonymous()
+           in libglusterfs. */
+        inode_unref (inode);
out:
- return ret;
+        loc_wipe (&rootloc);
+        if (xattr)
+                dict_unref (xattr);
+        return fd;
}
+
+/*
+ * Unlink the entry @name from the directory @inode on @subvol and return
+ * the result of syncop_unlink(). A reference on @inode is taken for the
+ * loc and dropped again by loc_wipe().
+ */
int
-_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry,
- loc_t *child, loc_t *parent, struct iatt *iattr)
+afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
{
- struct iatt parentbuf = {0};
- int ret = 0;
- dict_t *xattr_rsp = NULL;
+ loc_t loc = {0, };
+ int ret = 0;
- gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path);
+ loc.parent = inode_ref (inode);
+ loc.name = name;
- ret = syncop_lookup (this, child, NULL,
- iattr, &xattr_rsp, &parentbuf);
- _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp,
- crawl_data);
- if (xattr_rsp)
- dict_unref (xattr_rsp);
- if (ret == 0)
- ret = _link_inode_update_loc (this, child, iattr);
+ ret = syncop_unlink (subvol, &loc);
- return ret;
+ loc_wipe (&loc);
+ return ret;
}
-static int
-afr_crawl_done (int ret, call_frame_t *sync_frame, void *data)
-{
- GF_FREE (data);
- STACK_DESTROY (sync_frame->root);
- return 0;
-}
-void
-_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl)
+/*
+ * Heal the name @bname under the directory with gfid @parent by calling
+ * afr_selfheal_name() and returning its result.
+ * NOTE(review): @healer and @child are not used, and the xlator passed
+ * on is THIS rather than healer->this - confirm that is intended.
+ */
+int
+afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent,
+ const char *bname)
{
- afr_start_crawl (this, child, crawl, _self_heal_entry,
- NULL, _gf_true, STOP_CRAWL_ON_SINGLE_SUBVOL,
- afr_crawl_done);
-}
+ int ret = -1;
-gf_boolean_t
-_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason)
-{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- gf_boolean_t proceed = _gf_false;
- char *msg = NULL;
-
- priv = this->private;
- shd = &priv->shd;
- if (!shd->enabled) {
- msg = "Self-heal daemon is not enabled";
- gf_log (this->name, GF_LOG_DEBUG, "%s", msg);
- goto out;
- }
- if (!priv->child_up[child]) {
- gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , "
- "subvol went down", priv->children[child]->name);
- msg = "Brick is Not connected";
- goto out;
- }
+ ret = afr_selfheal_name (THIS, parent, bname);
- if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) {
- if (afr_up_children_count (priv->child_up,
- priv->child_count) < 2) {
- gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as "
- "< 2 children are up");
- msg = "< 2 bricks in replica are running";
- goto out;
- }
- }
- proceed = _gf_true;
-out:
- if (reason)
- *reason = msg;
- return proceed;
+ return ret;
}
+/*
+ * Heal the file with gfid @gfid and record the outcome.
+ *
+ * The result of afr_selfheal() selects a history buffer and bumps the
+ * matching counter in healer->crawl_event: -EIO counts as split-brain,
+ * any other negative value as a failed heal, 0 as healed. A positive
+ * result selects no buffer. The file's path on child @child is then
+ * resolved and, when a buffer was selected, saved in it as an
+ * shd_event_t.
+ *
+ * Returns the result of afr_selfheal() unchanged; failures while
+ * recording history are not reported.
+ */
int
-_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl,
- shd_crawl_op op, dict_t *output)
+afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
{
- afr_private_t *priv = NULL;
- char *status = NULL;
- char *subkey = NULL;
- char key[256] = {0};
- shd_pos_t pos_data = {0};
- int op_ret = -1;
- int xl_id = -1;
- int i = 0;
- int ret = 0;
- int crawl_flags = 0;
-
- priv = this->private;
- if (op == HEAL)
- crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL;
-
- if (output) {
- ret = dict_get_int32 (output, this->name, &xl_id);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Invalid input, "
- "translator-id is not available");
- goto out;
- }
- }
- pos_data.this = this;
- subkey = "status";
- for (i = 0; i < priv->child_count; i++) {
- if (_crawl_proceed (this, i, crawl_flags, &status)) {
- pos_data.child = i;
- /*
- * We're already in a synctask in this case, so we
- * don't need to defer through a second (and in fact
- * that can cause deadlock). Just call straight
- * through instead.
- */
- ret = afr_find_child_position(pos_data.this,
- pos_data.child,
- &pos_data.pos);
- if (ret) {
- status = "Not able to find brick location";
- } else if (pos_data.pos == AFR_POS_REMOTE) {
- status = "brick is remote";
- } else {
- op_ret = 0;
- if (op == HEAL) {
- status = "Started self-heal";
- _do_self_heal_on_subvol (this, i,
- crawl);
- } else if (output) {
- status = "";
- afr_start_crawl (this, i, INDEX,
- _add_summary_to_dict,
- output, _gf_false, 0,
- NULL);
- }
- }
- if (output) {
- snprintf (key, sizeof (key), "%d-%d-%s", xl_id,
- i, subkey);
- ret = dict_set_str (output, key, status);
- }
- if (!op_ret && (crawl == FULL))
- break;
- }
- if (output) {
- snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i,
- subkey);
- ret = dict_set_str (output, key, status);
- }
- }
-out:
- return op_ret;
+        int               ret         = 0;
+        eh_t             *eh          = NULL;
+        afr_private_t    *priv        = NULL;
+        afr_self_heald_t *shd         = NULL;
+        shd_event_t      *shd_event   = NULL;
+        char             *path        = NULL;
+        xlator_t         *subvol      = NULL;
+        xlator_t         *this        = NULL;
+        crawl_event_t    *crawl_event = NULL;
+
+        this = healer->this;
+        priv = this->private;
+        shd = &priv->shd;
+        crawl_event = &healer->crawl_event;
+
+        subvol = priv->children[child];
+
+        ret = afr_selfheal (this, gfid);
+
+        if (ret == -EIO) {
+                eh = shd->split_brain;
+                crawl_event->split_brain_count++;
+        } else if (ret < 0) {
+                eh = shd->heal_failed;
+                crawl_event->heal_failed_count++;
+        } else if (ret == 0) {
+                eh = shd->healed;
+                crawl_event->healed_count++;
+        }
+
+        afr_shd_gfid_to_path (this, subvol, gfid, &path);
+        if (!path)
+                return ret;
+
+        if (!eh) {
+                /* No history buffer selected (ret > 0): nothing takes
+                   ownership of path, so free it instead of leaking it. */
+                GF_FREE (path);
+                return ret;
+        }
+
+        shd_event = GF_CALLOC (1, sizeof(*shd_event),
+                               gf_afr_mt_shd_event_t);
+        if (!shd_event) {
+                GF_FREE (path);
+                return ret;
+        }
+
+        shd_event->child = child;
+        shd_event->path = path;
+
+        /* On success the history buffer keeps shd_event and its path;
+           on failure both are still ours to free. */
+        if (eh_save_history (eh, shd_event) < 0) {
+                GF_FREE (shd_event);
+                GF_FREE (path);
+        }
+        return ret;
}
-int
-_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl,
- dict_t *output)
-{
- return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output);
-}
-int
-_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output)
+/*
+ * Reset the healer's crawl statistics at the start of a sweep: zero the
+ * three outcome counters, stamp start_time with the current time and
+ * clear end_time.
+ */
+void
+afr_shd_sweep_prepare (struct subvol_healer *healer)
{
- return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output);
+        crawl_event_t *stats = &healer->crawl_event;
+
+        stats->healed_count      = 0;
+        stats->split_brain_count = 0;
+        stats->heal_failed_count = 0;
+
+        time (&stats->start_time);
+        stats->end_time = 0;
}
-int
-_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict)
+
+/*
+ * Close out a sweep: stamp end_time on the healer's crawl event, save a
+ * copy of the event in the statistics history of the healer's subvolume
+ * and clear start_time on the live event. If the copy cannot be made
+ * nothing is saved; if saving fails the copy is freed.
+ */
+void
+afr_shd_sweep_done (struct subvol_healer *healer)
{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- int i = 0;
+ crawl_event_t *event = NULL;
+ crawl_event_t *history = NULL;
+ afr_self_heald_t *shd = NULL;
- priv = this->private;
- shd = &priv->shd;
+ event = &healer->crawl_event;
+ shd = &(((afr_private_t *)healer->this->private)->shd);
- for (i = 0; i < priv->child_count; i++) {
- if (shd->pos[i] != AFR_POS_LOCAL)
- continue;
- _add_eh_to_dict (this, eh, dict, i);
- }
- return 0;
+ time (&event->end_time);
+ /* Copy before clearing start_time so the saved record keeps it. */
+ history = memdup (event, sizeof (*event));
+ event->start_time = 0;
+
+ if (!history)
+ return;
+
+ if (eh_save_history (shd->statistics[healer->subvol], history) < 0)
+ GF_FREE (history);
}
+
+/*
+ * Walk the index directory of the healer's subvolume and heal every
+ * entry whose name parses as a gfid.
+ *
+ * Entries for which the heal returns -ENOENT or -ESTALE are purged from
+ * the index. The walk stops with -EBUSY as soon as priv->shd.enabled is
+ * found false.
+ *
+ * Returns -errno when the index directory cannot be opened, a non-zero
+ * ret if the walk ended with one, and otherwise the number of entries
+ * for which the heal returned 0.
+ */
int
-afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
+afr_shd_index_sweep (struct subvol_healer *healer)
{
- gf_xl_afr_op_t op = GF_AFR_OP_INVALID;
- int ret = 0;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- int xl_id = 0;
+ xlator_t *this = NULL;
+ int child = -1;
+ fd_t *fd = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ off_t offset = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ uuid_t gfid;
+ int ret = 0;
+ int count = 0;
+
+ this = healer->this;
+ child = healer->subvol;
+ priv = this->private;
+ subvol = priv->children[child];
+
+ fd = afr_shd_index_opendir (this, child);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "unable to opendir index-dir on %s", subvol->name);
+ return -errno;
+ }
+
+ INIT_LIST_HEAD (&entries.list);
+
+ /* A zero return from syncop_readdir ends the walk. */
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ /* Remember where to resume the next readdir. */
+ offset = entry->d_off;
+
+ if (!priv->shd.enabled) {
+ ret = -EBUSY;
+ break;
+ }
+
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG, "got entry: %s",
+ entry->d_name);
+
+ /* NOTE(review): ret keeps the non-zero uuid_parse
+ result here, so if this is the last entry of the
+ batch the check after the loop aborts the sweep -
+ confirm that is intended. */
+ ret = uuid_parse (entry->d_name, gfid);
+ if (ret)
+ continue;
+
+ ret = afr_shd_selfheal (healer, child, gfid);
+ if (ret == 0)
+ count++;
+
+ /* The file is gone: drop its stale index entry. */
+ if (ret == -ENOENT || ret == -ESTALE) {
+ afr_shd_index_purge (subvol, fd->inode,
+ entry->d_name);
+ ret = 0;
+ }
+ }
+
+ gf_dirent_free (&entries);
+ /* ret here is the outcome of the last entry processed in this
+ batch (or a readdir error / -EBUSY). */
+ if (ret)
+ break;
+ }
+
+ if (fd)
+ fd_unref (fd);
+ if (!ret)
+ ret = count;
+ return ret;
+}
- priv = this->private;
- shd = &priv->shd;
- ret = dict_get_int32 (input, "xl-op", (int32_t*)&op);
- if (ret)
- goto out;
- ret = dict_get_int32 (input, this->name, &xl_id);
- if (ret)
- goto out;
- ret = dict_set_int32 (output, this->name, xl_id);
- if (ret)
- goto out;
- switch (op) {
- case GF_AFR_OP_HEAL_INDEX:
- ret = _do_self_heal_on_local_subvols (this, INDEX, output);
- break;
- case GF_AFR_OP_HEAL_FULL:
- ret = _do_self_heal_on_local_subvols (this, FULL, output);
- break;
- case GF_AFR_OP_INDEX_SUMMARY:
- (void)_get_index_summary_on_local_subvols (this, output);
- ret = 0;
- break;
- case GF_AFR_OP_HEALED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->healed, output);
- break;
- case GF_AFR_OP_HEAL_FAILED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed,
- output);
- break;
- case GF_AFR_OP_SPLIT_BRAIN_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->split_brain,
- output);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
- break;
- }
-out:
- dict_del (output, this->name);
- return ret;
+/*
+ * Recursively crawl the directory `inode` on healer->subvol. For every
+ * entry other than "." and ".." a name heal and then a gfid heal are
+ * attempted (their results are ignored), and sub-directories are descended
+ * into.
+ *
+ * Returns 0 when the directory was walked to the end, -EBUSY when the
+ * self-heal daemon got disabled during the walk, or another non-zero value
+ * when readdirp, inode linking or a nested sweep failed.
+ */
+int
+afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode)
+{
+        fd_t *fd = NULL;
+        xlator_t *this = NULL;
+        xlator_t *subvol = NULL;
+        afr_private_t *priv = NULL;
+        off_t offset = 0;
+        gf_dirent_t entries;
+        gf_dirent_t *entry = NULL;
+        int ret = 0;
+
+        this = healer->this;
+        priv = this->private;
+        subvol = priv->children[healer->subvol];
+
+        fd = fd_anonymous (inode);
+        if (!fd)
+                return -errno;
+
+        INIT_LIST_HEAD (&entries.list);
+
+        while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) {
+                if (ret < 0)
+                        break;
+
+                ret = gf_link_inodes_from_dirent (this, fd->inode, &entries);
+                if (ret) {
+                        /* this batch was already read: release it before bailing out */
+                        gf_dirent_free (&entries);
+                        break;
+                }
+
+                list_for_each_entry (entry, &entries.list, list) {
+                        offset = entry->d_off;
+
+                        if (!priv->shd.enabled) {
+                                ret = -EBUSY;
+                                break;
+                        }
+
+                        if (!strcmp (entry->d_name, ".") ||
+                            !strcmp (entry->d_name, ".."))
+                                continue;
+
+                        afr_shd_selfheal_name (healer, healer->subvol,
+                                               inode->gfid, entry->d_name);
+
+                        afr_shd_selfheal (healer, healer->subvol,
+                                          entry->d_stat.ia_gfid);
+
+                        if (entry->d_stat.ia_type == IA_IFDIR) {
+                                ret = afr_shd_full_sweep (healer, entry->inode);
+                                if (ret)
+                                        break;
+                        }
+                }
+
+                gf_dirent_free (&entries);
+                if (ret)
+                        break;
+        }
+
+        if (fd)
+                fd_unref (fd);
+        return ret;
}
-void
-afr_poll_self_heal (void *data)
-{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- struct timeval timeout = {0};
- xlator_t *this = NULL;
- long child = (long)data;
- gf_timer_t *old_timer = NULL;
- gf_timer_t *new_timer = NULL;
- shd_pos_t pos_data = {0};
- int ret = 0;
- this = THIS;
- priv = this->private;
- shd = &priv->shd;
-
- if (shd->pos[child] == AFR_POS_UNKNOWN) {
- pos_data.this = this;
- pos_data.child = child;
- ret = synctask_new (this->ctx->env,
- afr_syncop_find_child_position,
- NULL, NULL, &pos_data);
- if (!ret)
- shd->pos[child] = pos_data.pos;
- }
- if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL))
- _do_self_heal_on_subvol (this, child, INDEX);
- timeout.tv_sec = shd->timeout;
- timeout.tv_usec = 0;
- //notify and previous timer should be synchronized.
- LOCK (&priv->lock);
- {
- old_timer = shd->timer[child];
- if (shd->pos[child] == AFR_POS_REMOTE)
- goto unlock;
- shd->timer[child] = gf_timer_call_after (this->ctx, timeout,
- afr_poll_self_heal,
- data);
- new_timer = shd->timer[child];
- }
-unlock:
- UNLOCK (&priv->lock);
-
- if (old_timer)
- gf_timer_call_cancel (this->ctx, old_timer);
- if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) {
- gf_log (this->name, GF_LOG_WARNING,
- "Could not create self-heal polling timer for %s",
- priv->children[child]->name);
- }
- return;
+/*
+ * Thread body of an index healer; `data` is the struct subvol_healer it
+ * serves. The thread loops forever: it blocks in afr_shd_healer_wait(),
+ * then runs index sweeps back to back (pausing one second after each) for
+ * as long as a sweep reports a positive result.
+ * NOTE(review): ASSERT_LOCAL is defined elsewhere and may alter control
+ * flow when the subvolume is not local - confirm before restructuring.
+ */
+void *
+afr_shd_index_healer (void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int ret = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+
+ for (;;) {
+ afr_shd_healer_wait (healer);
+
+ ASSERT_LOCAL(this, healer);
+
+ do {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "starting index sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+
+ afr_shd_sweep_prepare (healer);
+
+ ret = afr_shd_index_sweep (healer);
+
+ afr_shd_sweep_done (healer);
+ /*
+ As long as at least one gfid was
+ healed, keep retrying. We may have
+ just healed a directory and thereby
+ created entries for other gfids which
+ could not be healed thus far.
+ */
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "finished index sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+ /*
+ Give a pause before retrying to avoid a busy loop
+ in case the only entry in index is because of
+ an ongoing I/O.
+ */
+ sleep (1);
+ } while (ret > 0);
+ }
+
+ return NULL;
}
-static int
-afr_handle_child_up (int ret, call_frame_t *sync_frame, void *data)
+
+/*
+ * Thread body of a full healer; `data` is the struct subvol_healer it
+ * serves. Each round waits via __afr_shd_healer_wait() under the healer
+ * mutex. When that returns 0 the healer is marked not running and the
+ * thread exits; otherwise one full sweep starting at the root inode is
+ * run, bracketed by afr_shd_sweep_prepare()/afr_shd_sweep_done().
+ * NOTE(review): ASSERT_LOCAL is defined elsewhere and may alter control
+ * flow when the subvolume is not local - confirm before restructuring.
+ */
+void *
+afr_shd_full_healer (void *data)
{
- afr_self_heald_t *shd = NULL;
- shd_pos_t *pos_data = data;
- afr_private_t *priv = NULL;
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int run = 0;
- if (ret)
- goto out;
+ healer = data;
+ THIS = this = healer->this;
- priv = pos_data->this->private;
- shd = &priv->shd;
- shd->pos[pos_data->child] = pos_data->pos;
- if (pos_data->pos != AFR_POS_REMOTE)
- afr_poll_self_heal ((void*)(long)pos_data->child);
- _do_self_heal_on_local_subvols (THIS, INDEX, NULL);
-out:
- GF_FREE (data);
- return 0;
-}
+ for (;;) {
+ pthread_mutex_lock (&healer->mutex);
+ {
+ run = __afr_shd_healer_wait (healer);
+ if (!run)
+ healer->running = _gf_false;
+ }
+ pthread_mutex_unlock (&healer->mutex);
-void
-afr_proactive_self_heal (void *data)
-{
- xlator_t *this = NULL;
- long child = (long)data;
- shd_pos_t *pos_data = NULL;
- int ret = 0;
+ if (!run)
+ break;
- this = THIS;
+ ASSERT_LOCAL(this, healer);
- //Position of brick could have changed and it could be local now.
- //Compute the position again
- pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t);
- if (!pos_data)
- goto out;
- pos_data->this = this;
- pos_data->child = child;
- ret = synctask_new (this->ctx->env, afr_syncop_find_child_position,
- afr_handle_child_up, NULL, pos_data);
- if (ret)
- goto out;
-out:
- return;
-}
+ gf_log (this->name, GF_LOG_INFO,
+ "starting full sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
-static int
-get_pathinfo_host (char *pathinfo, char *hostname, size_t size)
-{
- char *start = NULL;
- char *end = NULL;
- int ret = -1;
- int i = 0;
+ afr_shd_sweep_prepare (healer);
- if (!pathinfo)
- goto out;
+ afr_shd_full_sweep (healer, this->itable->root);
- start = strchr (pathinfo, ':');
- if (!start)
- goto out;
- end = strrchr (pathinfo, ':');
- if (start == end)
- goto out;
+ afr_shd_sweep_done (healer);
- memset (hostname, 0, size);
- i = 0;
- while (++start != end)
- hostname[i++] = *start;
- ret = 0;
-out:
- return ret;
+ gf_log (this->name, GF_LOG_INFO,
+ "finished full sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+ }
+
+ return NULL;
}
+
+/*
+ * Prepare `healer` for use by xlator `this`: initialise its mutex and
+ * condition variable, remember `this` and clear the running, rerun and
+ * local flags.
+ *
+ * Returns 0 on success or the non-zero pthread error code. On failure no
+ * initialised synchronisation object is left behind.
+ */
int
-afr_local_pathinfo (char *pathinfo, gf_boolean_t *local)
+afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer)
{
- int ret = 0;
- char pathinfohost[1024] = {0};
- char localhost[1024] = {0};
- xlator_t *this = THIS;
+        int ret = 0;
- *local = _gf_false;
- ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost));
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s",
- pathinfo);
- goto out;
- }
+        ret = pthread_mutex_init (&healer->mutex, NULL);
+        if (ret)
+                goto out;
- ret = gethostname (localhost, sizeof (localhost));
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, "
- "reason: %s", strerror (errno));
- goto out;
- }
+        ret = pthread_cond_init (&healer->cond, NULL);
+        if (ret) {
+                /* undo the mutex so a failed init leaves nothing to destroy */
+                pthread_mutex_destroy (&healer->mutex);
+                goto out;
+        }
- if (!strcmp (localhost, pathinfohost))
- *local = _gf_true;
+        healer->this = this;
+        healer->running = _gf_false;
+        healer->rerun = _gf_false;
+        healer->local = _gf_false;
out:
- return ret;
+        return ret;
}
+
+/*
+ * Make sure `healer` has a thread working for it. Under the healer mutex:
+ * a healer already marked running is signalled through its condition
+ * variable; otherwise a thread running `threadfn` with `healer` as its
+ * argument is created and the healer is marked running. In both of those
+ * cases rerun is set as well.
+ *
+ * Returns 0, or the gf_thread_create() error, in which case neither flag
+ * is changed.
+ */
int
-afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
- loc_t *dirloc)
+afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer,
+ void *(threadfn)(void *))
{
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- void *index_gfid = NULL;
- loc_t rootloc = {0};
- struct iatt iattr = {0};
- struct iatt parent = {0};
- int ret = 0;
- xlator_t *readdir_xl = crawl_data->readdir_xl;
-
- priv = this->private;
- if (crawl_data->crawl == FULL) {
- afr_build_root_loc (this, dirloc);
- } else {
- afr_build_root_loc (this, &rootloc);
- ret = syncop_getxattr (readdir_xl, &rootloc, &xattr,
- GF_XATTROP_INDEX_GFID);
- if (ret < 0)
- goto out;
- ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "failed to get index "
- "dir gfid on %s", readdir_xl->name);
- goto out;
- }
- if (!index_gfid) {
- gf_log (this->name, GF_LOG_ERROR, "index gfid empty "
- "on %s", readdir_xl->name);
- ret = -1;
- goto out;
- }
- uuid_copy (dirloc->gfid, index_gfid);
- dirloc->path = "";
- dirloc->inode = inode_new (priv->root_inode->table);
- ret = syncop_lookup (readdir_xl, dirloc, NULL,
- &iattr, NULL, &parent);
- if (ret < 0) {
- if (errno != ENOENT) {
- gf_log (this->name, GF_LOG_ERROR, "lookup "
- "failed on index dir on %s - (%s)",
- readdir_xl->name, strerror (errno));
- }
- goto out;
- }
- ret = _link_inode_update_loc (this, dirloc, &iattr);
- if (ret)
- goto out;
- }
- ret = 0;
-out:
- if (xattr)
- dict_unref (xattr);
- loc_wipe (&rootloc);
- return ret;
+ int ret = 0;
+
+ pthread_mutex_lock (&healer->mutex);
+ {
+ if (healer->running) {
+ pthread_cond_signal (&healer->cond);
+ } else {
+ ret = gf_thread_create (&healer->thread, NULL,
+ threadfn, healer);
+ if (ret)
+ goto unlock;
+ healer->running = 1;
+ }
+
+ healer->rerun = 1;
+ }
+unlock:
+ pthread_mutex_unlock (&healer->mutex);
+
+ return ret;
}
+
+/*
+ * Start or wake the full healer of child `subvol`; returns whatever
+ * afr_shd_healer_spawn() returns.
+ */
int
-afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd,
- loc_t *dirloc)
+afr_shd_full_healer_spawn (xlator_t *this, int subvol)
{
- fd_t *fd = NULL;
- int ret = 0;
-
- if (crawl_data->crawl == FULL) {
- fd = fd_create (dirloc->inode, crawl_data->pid);
- if (!fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to create fd for %s", dirloc->path);
- ret = -1;
- goto out;
- }
-
- ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "opendir failed on %s", dirloc->path);
- goto out;
- }
- } else {
- fd = fd_anonymous (dirloc->inode);
- }
- ret = 0;
-out:
- if (!ret)
- *dirfd = fd;
- return ret;
+ return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol),
+ afr_shd_full_healer);
}
-xlator_t*
-afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data)
-{
- afr_private_t *priv = this->private;
- if (crawl_data->crawl == FULL) {
- return this;
- } else {
- return priv->children[crawl_data->child];
- }
- return NULL;
+/*
+ * Start or wake the index healer of child `subvol`; the result of
+ * afr_shd_healer_spawn() is passed straight back to the caller.
+ */
+int
+afr_shd_index_healer_spawn (xlator_t *this, int subvol)
+{
+        struct subvol_healer *index_healer = NULL;
+
+        index_healer = NTH_INDEX_HEALER (this, subvol);
+
+        return afr_shd_healer_spawn (this, index_healer,
+                                     afr_shd_index_healer);
}
+
+/*
+ * Append one crawl event to `output` as a set of "statistics_*" keys.
+ *
+ * Keys are suffixed with "<xl_id>-<child>-<n>", where xl_id is read from
+ * `output` under this->name and n is the current value of
+ * "statistics-<xl_id>-<child>-count" (0 when absent); that counter is
+ * incremented once all fields are stored. An event whose start_time is 0
+ * is skipped and 0 is returned. A missing end_time is reported as "in
+ * progress" together with a placeholder end-time string.
+ *
+ * Returns 0 on success, non-zero when the xlator id is missing, memory
+ * runs out or a dict operation fails.
+ */
int
-afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent,
- gf_dirent_t *entry, afr_crawl_data_t *crawl_data)
+afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output,
+                              crawl_event_t *crawl_event)
{
- int ret = -1;
- afr_private_t *priv = NULL;
+        int ret = 0;
+        uint64_t count = 0;
+        char key[256] = {0};
+        int xl_id = 0;
+        uint64_t healed_count = 0;
+        uint64_t split_brain_count = 0;
+        uint64_t heal_failed_count = 0;
+        char *start_time_str = NULL;
+        char *end_time_str = NULL;
+        char *crawl_type = NULL;
+        int progress = -1;
+        int child = -1;
- priv = this->private;
- if (crawl_data->crawl == FULL) {
- ret = afr_build_child_loc (this, child, parent, entry->d_name);
- } else {
- child->inode = inode_new (priv->root_inode->table);
- if (!child->inode)
- goto out;
- uuid_parse (entry->d_name, child->gfid);
- ret = _loc_assign_gfid_path (child);
- }
-out:
- return ret;
-}
+        child = crawl_event->child;
+        healed_count = crawl_event->healed_count;
+        split_brain_count = crawl_event->split_brain_count;
+        heal_failed_count = crawl_event->heal_failed_count;
+        crawl_type = crawl_event->crawl_type;
-static int
-_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries,
- off_t *offset, afr_crawl_data_t *crawl_data)
-{
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- int ret = 0;
- loc_t entry_loc = {0};
- fd_t *fd = NULL;
- struct iatt iattr = {0};
-
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- if (!_crawl_proceed (this, crawl_data->child,
- crawl_data->crawl_flags, NULL)) {
- ret = -1;
- goto out;
- }
- *offset = entry->d_off;
- if (IS_ENTRY_CWD (entry->d_name) ||
- IS_ENTRY_PARENT (entry->d_name))
- continue;
- if ((crawl_data->crawl == FULL) &&
- uuid_is_null (entry->d_stat.ia_gfid)) {
- gf_log (this->name, GF_LOG_WARNING, "%s/%s: No "
- "gfid present skipping",
- parentloc->path, entry->d_name);
- continue;
- }
-
- loc_wipe (&entry_loc);
- ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc,
- entry, crawl_data);
- if (ret)
- goto out;
+        /* start_time == 0 means the event holds no valid statistics */
+        if (!crawl_event->start_time)
+                goto out;
- ret = crawl_data->process_entry (this, crawl_data, entry,
- &entry_loc, parentloc, &iattr);
+        start_time_str = gf_strdup (ctime (&crawl_event->start_time));
+        if (!start_time_str) {
+                ret = -ENOMEM;
+                goto out;
+        }
- if (ret)
- continue;
+        if (crawl_event->end_time)
+                end_time_str = gf_strdup (ctime (&crawl_event->end_time));
- if (crawl_data->crawl == INDEX)
- continue;
+        ret = dict_get_int32 (output, this->name, &xl_id);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+                goto out;
+        }
+
+        /* a missing counter is not an error: count simply stays 0 */
+        snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+        ret = dict_get_uint64 (output, key, &count);
- if (!IA_ISDIR (iattr.ia_type))
- continue;
- fd = NULL;
- ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc);
- if (ret)
- continue;
- ret = _crawl_directory (fd, &entry_loc, crawl_data);
- if (fd)
- fd_unref (fd);
+
+        snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64,
+                  xl_id, child, count);
+        ret = dict_set_uint64(output, key, healed_count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not add statistics_healed_count to output");
+                goto out;
+        }
+
+        snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64,
+                  xl_id, child, count);
+        ret = dict_set_uint64 (output, key, split_brain_count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not add statistics_split_brain_count to output");
+                goto out;
}
- ret = 0;
-out:
- loc_wipe (&entry_loc);
- return ret;
-}
-static int
-_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data)
-{
- xlator_t *this = NULL;
- off_t offset = 0;
- gf_dirent_t entries;
- int ret = 0;
- gf_boolean_t free_entries = _gf_false;
- xlator_t *readdir_xl = crawl_data->readdir_xl;
+        snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64,
+                  xl_id, child, count);
+        ret = dict_set_str (output, key, crawl_type);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not add statistics_crawl_type to output");
+                goto out;
+        }
- INIT_LIST_HEAD (&entries.list);
- this = THIS;
+        snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64,
+                  xl_id, child, count);
+        ret = dict_set_uint64 (output, key, heal_failed_count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not add statistics_healed_failed_count to output");
+                goto out;
+        }
- GF_ASSERT (loc->inode);
+        snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64,
+                  xl_id, child, count);
+        ret = dict_set_dynstr (output, key, start_time_str);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not add statistics_crawl_start_time to output");
+                goto out;
+        } else {
+                /* stored in the dict: must not be freed at out */
+                start_time_str = NULL;
+        }
- if (crawl_data->crawl == FULL)
- gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path);
+        if (!end_time_str)
+                progress = 1;
else
- gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s",
- uuid_utoa (loc->gfid));
-
- while (1) {
- if (crawl_data->crawl == FULL)
- ret = syncop_readdirp (readdir_xl, fd, 131072, offset,
- NULL, &entries);
- else
- ret = syncop_readdir (readdir_xl, fd, 131072, offset,
- &entries);
- if (ret <= 0)
- break;
- ret = 0;
- free_entries = _gf_true;
-
- if (!_crawl_proceed (this, crawl_data->child,
- crawl_data->crawl_flags, NULL)) {
- ret = -1;
- goto out;
- }
- if (list_empty (&entries.list))
- goto out;
+                progress = 0;
+
+        snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64,
+                  xl_id, child, count);
+        if (!end_time_str)
+                end_time_str = gf_strdup ("Could not determine the end time");
+        ret = dict_set_dynstr (output, key, end_time_str);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not add statistics_crawl_end_time to output");
+                goto out;
+        } else {
+                /* stored in the dict: must not be freed at out */
+                end_time_str = NULL;
+        }
- ret = _process_entries (this, loc, &entries, &offset,
- crawl_data);
- gf_dirent_free (&entries);
- free_entries = _gf_false;
+        snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64,
+                  xl_id, child, count);
+
+        ret = dict_set_int32 (output, key, progress);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not add statistics_inprogress to output");
+                goto out;
}
- ret = 0;
+
+        snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+        ret = dict_set_uint64 (output, key, count + 1);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not increment the counter.");
+                goto out;
+        }
out:
- if (free_entries)
- gf_dirent_free (&entries);
+        GF_FREE (start_time_str);
+        GF_FREE (end_time_str);
return ret;
}
-static char*
-position_str_get (afr_child_pos_t pos)
-{
- switch (pos) {
- case AFR_POS_UNKNOWN:
- return "unknown";
- case AFR_POS_LOCAL:
- return "local";
- case AFR_POS_REMOTE:
- return "remote";
- }
- return NULL;
-}
+/*
+ * Append `path` for child `child` to `output` under the key
+ * "<xl_id>-<child>-<n>", where xl_id is read from `output` under this->name
+ * and n is the current value of "<xl_id>-<child>-count" (0 when the counter
+ * is absent). When `tv` is given, tv->tv_sec is stored under the same key
+ * with a "-time" suffix. The counter is incremented last.
+ *
+ * `path` is handed to dict_set_dynstr().
+ * NOTE(review): who owns `path` when this function fails is not visible
+ * from here - confirm before freeing it in callers.
+ *
+ * Returns 0 on success, non-zero on the first failing dict operation.
+ */
int
-afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos)
+afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path,
+ struct timeval *tv)
{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- dict_t *xattr_rsp = NULL;
- loc_t loc = {0};
- int ret = 0;
- char *node_uuid = NULL;
+ int ret = -1;
+ uint64_t count = 0;
+ char key[256] = {0};
+ int xl_id = 0;
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+ goto out;
+ }
- priv = this->private;
- shd = &priv->shd;
+ snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
+ ret = dict_get_uint64 (output, key, &count);
- afr_build_root_loc (this, &loc);
+ snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count);
+ ret = dict_set_dynstr (output, key, path);
- ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp,
- GF_XATTR_NODE_UUID_KEY);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - "
- "(%s)", priv->children[child]->name, strerror (errno));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output",
+ path);
goto out;
}
- ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid);
+ if (tv) {
+ snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id,
+ child, count);
+ ret = dict_set_uint32 (output, key, tv->tv_sec);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time",
+ path);
+ goto out;
+ }
+ }
+
+ snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
+
+ ret = dict_set_uint64 (output, key, count + 1);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on "
- "child %s", priv->children[child]->name);
+ gf_log (this->name, GF_LOG_ERROR, "Could not increment count");
goto out;
}
- if (!strcmp (node_uuid, shd->node_uuid))
- *pos = AFR_POS_LOCAL;
- else
- *pos = AFR_POS_REMOTE;
-
- gf_log (this->name, GF_LOG_DEBUG, "child %s is %s",
- priv->children[child]->name, position_str_get (*pos));
+ ret = 0;
out:
- if (ret)
- *pos = AFR_POS_UNKNOWN;
- loc_wipe (&loc);
return ret;
}
+
+/*
+ * Ask `subvol` for the path of `gfid` through the GFID_TO_PATH_KEY xattr.
+ *
+ * On success *path_p receives a newly allocated copy of the path, which
+ * the caller owns, and 0 is returned. Otherwise *path_p is left untouched
+ * and the result is the syncop_getxattr() error, -EINVAL when the reply
+ * carries no path, or -ENOMEM when an allocation fails.
+ */
int
-afr_syncop_find_child_position (void *data)
+afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p)
{
- shd_pos_t *pos_data = data;
- int ret = 0;
-
- ret = afr_find_child_position (pos_data->this, pos_data->child,
- &pos_data->pos);
- return ret;
+        loc_t loc = {0,};
+        char *path = NULL;
+        char *copy = NULL;
+        dict_t *xattr = NULL;
+        int ret = 0;
+
+        uuid_copy (loc.gfid, gfid);
+        loc.inode = inode_new (this->itable);
+        if (!loc.inode)
+                return -ENOMEM;
+
+        ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY);
+        loc_wipe (&loc);
+        if (ret)
+                goto out;
+
+        ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path);
+        if (ret || !path) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        /* path points into xattr: copy it before the dict is released */
+        copy = gf_strdup (path);
+        if (!copy) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        *path_p = copy;
+        ret = 0;
+out:
+        /* the reply dict was never released before, leaking on every call */
+        if (xattr)
+                dict_unref (xattr);
+        return ret;
}
-static int
-afr_dir_crawl (void *data)
+
+/*
+ * Walk the index directory of child `child` and add the path of every
+ * entry whose name parses as a gfid to `output` (see
+ * afr_shd_dict_add_path()).
+ *
+ * Entries whose gfid resolves to -ENOENT or -ESTALE are purged from the
+ * index. Entries whose path cannot be resolved for any other reason are
+ * logged and not reported.
+ *
+ * Returns 0 when the walk completed, otherwise a non-zero error.
+ */
+int
+afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output)
{
- xlator_t *this = NULL;
- int ret = -1;
- xlator_t *readdir_xl = NULL;
- fd_t *fd = NULL;
- loc_t dirloc = {0};
- afr_crawl_data_t *crawl_data = data;
+        fd_t *fd = NULL;
+        xlator_t *subvol = NULL;
+        afr_private_t *priv = NULL;
+        off_t offset = 0;
+        gf_dirent_t entries;
+        gf_dirent_t *entry = NULL;
+        uuid_t gfid;
+        int ret = 0;
+        int count = 0;
+        char *path = NULL;
+
+        priv = this->private;
+        subvol = priv->children[child];
+
+        fd = afr_shd_index_opendir (this, child);
+        if (!fd) {
+                /* read errno before logging gets a chance to overwrite it */
+                ret = -errno;
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unable to opendir index-dir on %s", subvol->name);
+                return ret;
+        }
+
+        INIT_LIST_HEAD (&entries.list);
+
+        while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+                if (ret > 0)
+                        ret = 0;
+                list_for_each_entry (entry, &entries.list, list) {
+                        offset = entry->d_off;
+
+                        if (!strcmp (entry->d_name, ".") ||
+                            !strcmp (entry->d_name, ".."))
+                                continue;
+
+                        gf_log (this->name, GF_LOG_DEBUG, "got entry: %s",
+                                entry->d_name);
+
+                        /*
+                         * Names that are not gfids are skipped without
+                         * touching ret, so that they cannot end the walk.
+                         */
+                        if (uuid_parse (entry->d_name, gfid))
+                                continue;
+
+                        path = NULL;
+                        ret = afr_shd_gfid_to_path (this, subvol, gfid, &path);
+
+                        if (ret == -ENOENT || ret == -ESTALE) {
+                                afr_shd_index_purge (subvol, fd->inode,
+                                                     entry->d_name);
+                                ret = 0;
+                                continue;
+                        }
+
+                        /* never hand a NULL path to the dict helpers */
+                        if (ret || !path) {
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "could not get path of %s on %s",
+                                        entry->d_name, subvol->name);
+                                if (!ret)
+                                        ret = -EINVAL;
+                                continue;
+                        }
+
+                        ret = afr_shd_dict_add_path (this, output, child, path,
+                                                     NULL);
+                }
+
+                gf_dirent_free (&entries);
+                if (ret)
+                        break;
+        }
+
+        if (fd)
+                fd_unref (fd);
+        if (!ret)
+                ret = count;
+        return ret;
+}
- this = THIS;
- if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags,
- NULL))
- goto out;
+/*
+ * Callback that reports one recorded shd_event_t (carried in cb->data)
+ * into the dict passed as `data`. Events of children whose index healer is
+ * not marked local are ignored. The path is duplicated and the copy is
+ * passed to afr_shd_dict_add_path() together with the buffer timestamp;
+ * the result of that call is not propagated.
+ *
+ * Returns 0, or -ENOMEM when the path cannot be duplicated.
+ */
+int
+afr_add_shd_event (circular_buffer_t *cb, void *data)
+{
+        xlator_t      *this     = THIS;
+        afr_private_t *priv     = this->private;
+        shd_event_t   *event    = cb->data;
+        dict_t        *out_dict = data;
+        char          *copy     = NULL;
+
+        if (priv->shd.index_healers[event->child].local) {
+                copy = gf_strdup (event->path);
+                if (!copy)
+                        return -ENOMEM;
+
+                afr_shd_dict_add_path (this, out_dict, event->child, copy,
+                                       &cb->tv);
+        }
+
+        return 0;
+}
- readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data);
- if (!readdir_xl)
- goto out;
- crawl_data->readdir_xl = readdir_xl;
+/*
+ * Callback that reports one recorded crawl_event_t (carried in cb->data)
+ * into the dict passed as `data`. Events of children whose index healer is
+ * not marked local are ignored. Always returns 0; the result of
+ * afr_shd_dict_add_crawl_event() is not propagated.
+ */
+int
+afr_add_crawl_event (circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ crawl_event_t *crawl_event = NULL;
- ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc);
- if (ret)
- goto out;
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = cb->data;
- ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc);
- if (ret)
- goto out;
+ if (!shd->index_healers[crawl_event->child].local)
+ return 0;
- ret = _crawl_directory (fd, &dirloc, crawl_data);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s",
- readdir_xl->name);
- else
- gf_log (this->name, GF_LOG_DEBUG, "Crawl completed "
- "on %s", readdir_xl->name);
- if (crawl_data->crawl == INDEX)
- dirloc.path = NULL;
-out:
- if (fd)
- fd_unref (fd);
- if (crawl_data->crawl == INDEX)
- dirloc.path = NULL;
- loc_wipe (&dirloc);
- return ret;
+ afr_shd_dict_add_crawl_event (this, output, crawl_event);
+
+ return 0;
}
-static int
-afr_dir_exclusive_crawl (void *data)
-{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- gf_boolean_t crawl = _gf_false;
- int ret = 0;
- int child = -1;
- xlator_t *this = NULL;
- afr_crawl_data_t *crawl_data = data;
-
- this = THIS;
- priv = this->private;
- shd = &priv->shd;
- child = crawl_data->child;
-
- LOCK (&priv->lock);
- {
- if (shd->inprogress[child]) {
- if (shd->pending[child] != FULL)
- shd->pending[child] = crawl_data->crawl;
- } else {
- shd->inprogress[child] = _gf_true;
- crawl = _gf_true;
- }
- }
- UNLOCK (&priv->lock);
- if (!crawl) {
- gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress "
- "for %s", priv->children[child]->name);
+/*
+ * Set up the self-heal daemon state of `this`: create the inode table,
+ * allocate and initialise one index healer and one full healer per child,
+ * create the healed / heal-failed / split-brain event histories and one
+ * statistics history per child, and label every healer's crawl_event with
+ * its child index and crawl type ("INDEX" or "FULL").
+ *
+ * Returns 0 on success and non-zero on the first failure.
+ * NOTE(review): nothing allocated so far is released on the failure path
+ * here - confirm that the caller tears the state down.
+ */
+int
+afr_selfheal_daemon_init (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable)
+ goto out;
+
+ shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->index_healers)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->index_healers[i].subvol = i;
+ ret = afr_shd_healer_init (this, &shd->index_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->full_healers)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ shd->full_healers[i].subvol = i;
+ ret = afr_shd_healer_init (this, &shd->full_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->healed)
+ goto out;
+
+ shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->heal_failed)
+ goto out;
+
+ shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->split_brain)
+ goto out;
+
+ shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!shd->statistics)
goto out;
+
+ for (i = 0; i < priv->child_count ; i++) {
+ shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE,
+ _gf_false,
+ afr_destroy_crawl_event_data);
+ if (!shd->statistics[i])
+ goto out;
+ shd->full_healers[i].crawl_event.child = i;
+ shd->full_healers[i].crawl_event.crawl_type = "FULL";
+ shd->index_healers[i].crawl_event.child = i;
+ shd->index_healers[i].crawl_event.crawl_type = "INDEX";
}
- do {
- afr_dir_crawl (data);
- LOCK (&priv->lock);
- {
- if (shd->pending[child] != NONE) {
- crawl_data->crawl = shd->pending[child];
- shd->pending[child] = NONE;
- } else {
- shd->inprogress[child] = _gf_false;
- crawl = _gf_false;
- }
- }
- UNLOCK (&priv->lock);
- } while (crawl);
+ ret = 0;
out:
- return ret;
+ return ret;
}
-void
-afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl,
- process_entry_cbk_t process_entry, void *op_data,
- gf_boolean_t exclusive, int crawl_flags,
- afr_crawl_done_cbk_t crawl_done)
-{
- afr_private_t *priv = NULL;
- call_frame_t *frame = NULL;
- afr_crawl_data_t *crawl_data = NULL;
- int ret = 0;
- int (*crawler) (void*) = NULL;
- priv = this->private;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame)
- goto out;
+/*
+ * Start or wake the index healer of child `subvol`. Always returns 0; the
+ * result of afr_shd_index_healer_spawn() is ignored.
+ */
+int
+afr_selfheal_childup (xlator_t *this, int subvol)
+{
+ afr_shd_index_healer_spawn (this, subvol);
- afr_set_lk_owner (frame, this, frame->root);
- afr_set_low_priority (frame);
- crawl_data = GF_CALLOC (1, sizeof (*crawl_data),
- gf_afr_mt_crawl_data_t);
- if (!crawl_data)
- goto out;
- crawl_data->process_entry = process_entry;
- crawl_data->child = idx;
- crawl_data->pid = frame->root->pid;
- crawl_data->crawl = crawl;
- crawl_data->op_data = op_data;
- crawl_data->crawl_flags = crawl_flags;
- gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s",
- crawl_data->crawl, priv->children[idx]->name);
-
- if (exclusive)
- crawler = afr_dir_exclusive_crawl;
- else
- crawler = afr_dir_crawl;
- ret = synctask_new (this->ctx->env, crawler,
- crawl_done, frame, crawl_data);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Could not create the "
- "task for %d ret %d", idx, ret);
-out:
- return;
+ return 0;
}
-void
-afr_build_root_loc (xlator_t *this, loc_t *loc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- loc->path = gf_strdup ("/");
- loc->name = "";
- loc->inode = inode_ref (priv->root_inode);
- uuid_copy (loc->gfid, loc->inode->gfid);
+/*
+ * Read the GF_XATTROP_INDEX_COUNT xattr of the root inode from child `i`.
+ *
+ * Returns the reported count, or -1 when the getxattr fails or the reply
+ * does not carry the key.
+ */
+int64_t
+afr_shd_get_index_count (xlator_t *this, int i)
+{
+        afr_private_t *priv = NULL;
+        xlator_t *subvol = NULL;
+        uint64_t count = 0;
+        int64_t result = -1;
+        loc_t rootloc = {0, };
+        dict_t *xattr = NULL;
+        int ret = -1;
+
+        priv = this->private;
+        subvol = priv->children[i];
+
+        rootloc.inode = inode_ref (this->itable->root);
+        uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+        ret = syncop_getxattr (subvol, &rootloc, &xattr,
+                               GF_XATTROP_INDEX_COUNT);
+        loc_wipe (&rootloc);
+
+        if (ret < 0)
+                goto out;
+
+        ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count);
+        if (ret)
+                goto out;
+
+        result = count;
+out:
+        /* the reply dict was never released before, leaking on every call */
+        if (xattr)
+                dict_unref (xattr);
+        return result;
}
+
+/*
+ * Execute the self-heal daemon operation named by "xl-op" in `input` and
+ * write its results into `output`.
+ *
+ * The xlator id found in `input` under this->name is copied into `output`
+ * for the helpers to use and removed again before returning.
+ *
+ * Returns op_ret: -1 when a heal or heal-count operation found no brick it
+ * could act on, 0 otherwise. Every early exit through `out` (a child whose
+ * child_up is still -1, or a failing dict call before the switch) also
+ * returns 0, because op_ret is only changed inside the switch.
+ */
int
-afr_set_root_gfid (dict_t *dict)
+afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
{
- uuid_t gfid;
- int ret = 0;
+ gf_xl_afr_op_t op = GF_AFR_OP_INVALID;
+ int ret = 0;
+ int xl_id = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ struct subvol_healer *healer = NULL;
+ int i = 0;
+ char key[64];
+ int op_ret = 0;
+ int64_t cnt = 0;
- memset (gfid, 0, 16);
- gfid[15] = 1;
+ priv = this->private;
+ shd = &priv->shd;
- ret = afr_set_dict_gfid (dict, gfid);
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == -1)
+ goto out;
- return ret;
-}
+ ret = dict_get_int32 (input, "xl-op", (int32_t*)&op);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (input, this->name, &xl_id);
+ if (ret)
+ goto out;
+ ret = dict_set_int32 (output, this->name, xl_id);
+ if (ret)
+ goto out;
+ switch (op) {
+ case GF_AFR_OP_HEAL_INDEX:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else if (AFR_COUNT (priv->child_up,
+ priv->child_count) < 2) {
+ ret = dict_set_str (output, key,
+ "< 2 bricks in replica are up");
+ } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
+ ret = dict_set_str (output, key,
+ "Brick is remote");
+ } else {
+ ret = dict_set_str (output, key,
+ "Started self-heal");
+ afr_shd_index_healer_spawn (this, i);
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_AFR_OP_HEAL_FULL:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->full_healers[i];
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else if (AFR_COUNT (priv->child_up,
+ priv->child_count) < 2) {
+ ret = dict_set_str (output, key,
+ "< 2 bricks in replica are up");
+ } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
+ ret = dict_set_str (output, key,
+ "Brick is remote");
+ } else {
+ ret = dict_set_str (output, key,
+ "Started self-heal");
+ afr_shd_full_healer_spawn (this, i);
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_AFR_OP_INDEX_SUMMARY:
+ for (i = 0; i < priv->child_count; i++)
+ if (shd->index_healers[i].local)
+ afr_shd_gather_index_entries (this, i, output);
+ break;
+ case GF_AFR_OP_HEALED_FILES:
+ eh_dump (shd->healed, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_HEAL_FAILED_FILES:
+ eh_dump (shd->heal_failed, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_SPLIT_BRAIN_FILES:
+ eh_dump (shd->split_brain, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_STATISTICS:
+ for (i = 0; i < priv->child_count; i++) {
+ eh_dump (shd->statistics[i], output,
+ afr_add_crawl_event);
+ afr_shd_dict_add_crawl_event (this, output,
+ &shd->index_healers[i].crawl_event);
+ afr_shd_dict_add_crawl_event (this, output,
+ &shd->full_healers[i].crawl_event);
+ }
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i]) {
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else {
+ snprintf (key, 64, "%d-%d-hardlinks", xl_id, i);
+ cnt = afr_shd_get_index_count (this, i);
+ if (cnt >= 0) {
+ ret = dict_set_uint64 (output, key, cnt);
+ }
+ op_ret = 0;
+ }
+ }
+
+// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED,
+// STATISTICS_TO_BE_HEALED,
+// output);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
+ break;
+ }
+out:
+ dict_del (output, this->name);
+ return op_ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 32a8aaca5..10e229ee7 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,45 +8,65 @@
cases as published by the Free Software Foundation.
*/
-#ifndef __AFR_SELF_HEALD_H__
-#define __AFR_SELF_HEALD_H__
-#include "xlator.h"
-#define IS_ROOT_PATH(path) (!strcmp (path, "/"))
-#define IS_ENTRY_CWD(entry) (!strcmp (entry, "."))
-#define IS_ENTRY_PARENT(entry) (!strcmp (entry, ".."))
-#define AFR_ALL_CHILDREN -1
+#ifndef _AFR_SELF_HEALD_H
+#define _AFR_SELF_HEALD_H
-typedef struct afr_crawl_data_ {
- int child;
- pid_t pid;
- afr_crawl_type_t crawl;
- xlator_t *readdir_xl;
- void *op_data;
- int crawl_flags;
- int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data,
- gf_dirent_t *entry, loc_t *child, loc_t *parent,
- struct iatt *iattr);
-} afr_crawl_data_t;
+#include <pthread.h>
-typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data,
- gf_dirent_t *entry, loc_t *child, loc_t *parent,
- struct iatt *iattr);
-void afr_build_root_loc (xlator_t *this, loc_t *loc);
+typedef struct {
+ int child;
+ char *path;
+} shd_event_t;
-int afr_set_root_gfid (dict_t *dict);
+typedef struct {
+ int child;
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+
+ /* If start_time is 0, it means crawler is not in progress
+ and stats are not valid */
+ time_t start_time;
+ /* If start_time is NOT 0 and end_time is 0, it means
+ crawler is in progress */
+ time_t end_time;
+ char *crawl_type;
+} crawl_event_t;
+
+struct subvol_healer {
+ xlator_t *this;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
+ crawl_event_t crawl_event;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ pthread_t thread;
+};
+
+typedef struct {
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
+ struct subvol_healer *index_healers;
+ struct subvol_healer *full_healers;
+
+ eh_t *healed;
+ eh_t *heal_failed;
+ eh_t *split_brain;
+ eh_t **statistics;
+} afr_self_heald_t;
-void
-afr_proactive_self_heal (void *data);
int
-afr_xl_op (xlator_t *this, dict_t *input, dict_t *output);
+afr_selfheal_childup (xlator_t *this, int subvol);
+
+int
+afr_selfheal_daemon_init (xlator_t *this);
-/*
- * In addition to its self-heal use, this is used to find a local default
- * read_child.
- */
int
-afr_local_pathinfo (char *pathinfo, gf_boolean_t *local);
-#endif /* __AFR_SELF_HEALD_H__ */
+afr_xl_op (xlator_t *this, dict_t *input, dict_t *output);
+
+#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 217ff8548..205ff759e 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -18,189 +18,130 @@
#include <signal.h>
+gf_boolean_t
+afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume);
-#define LOCKED_NO 0x0 /* no lock held */
-#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path
- of RENAME */
-#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
-afr_fd_ctx_t *
-__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+int
+__afr_txn_write_fop (call_frame_t *frame, xlator_t *this)
{
- uint64_t ctx = 0;
- int ret = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int i = 0;
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+ local = frame->local;
priv = this->private;
- ret = __fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0 && fd_is_anonymous (fd)) {
- ret = __afr_fd_ctx_set (this, fd);
- if (ret < 0)
- goto out;
-
- ret = __fd_ctx_get (fd, this, &ctx);
- if (ret < 0)
- goto out;
+ call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- for (i = 0; i < priv->child_count; i++)
- fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-out:
- return fd_ctx;
-}
-
+ local->call_count = call_count;
-afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this)
-{
- afr_fd_ctx_t *fd_ctx = NULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ local->transaction.wind (frame, this, i);
- LOCK(&fd->lock);
- {
- fd_ctx = __afr_fd_ctx_get (fd, this);
+ if (!--call_count)
+ break;
+ }
}
- UNLOCK(&fd->lock);
- return fd_ctx;
+ return 0;
}
-static void
-afr_save_lk_owner (call_frame_t *frame)
+int
+__afr_txn_write_done (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
local = frame->local;
- local->saved_lk_owner = frame->root->lk_owner;
-}
-
+ local->transaction.unwind (frame, this);
-static void
-afr_restore_lk_owner (call_frame_t *frame)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
+ AFR_STACK_DESTROY (frame);
- frame->root->lk_owner = local->saved_lk_owner;
+ return 0;
}
-static void
-__mark_all_pending (int32_t *pending[], int child_count,
- afr_transaction_type type)
+call_frame_t*
+afr_transaction_detach_fop_frame (call_frame_t *frame)
{
- int i = 0;
- int j = 0;
+ afr_local_t * local = NULL;
+ call_frame_t *fop_frame = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (1);
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ fop_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
}
+ UNLOCK (&frame->lock);
+
+ return fop_frame;
}
static void
-__mark_child_dead (int32_t *pending[], int child_count, int child,
- afr_transaction_type type)
+afr_save_lk_owner (call_frame_t *frame)
{
- int j = 0;
+ afr_local_t * local = NULL;
- j = afr_index_for_transaction_type (type);
+ local = frame->local;
- pending[child][j] = 0;
+ local->saved_lk_owner = frame->root->lk_owner;
}
static void
-__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+afr_restore_lk_owner (call_frame_t *frame)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t * local = NULL;
local = frame->local;
- if (!local->fd)
- return;
-
- fd_ctx = afr_fd_ctx_get (local->fd, this);
-
- if (!fd_ctx)
- goto out;
-
- LOCK (&local->fd->lock);
- {
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]++;
- }
- UNLOCK (&local->fd->lock);
-out:
- return;
-}
-
-static void
-__mark_non_participant_children (int32_t *pending[], int child_count,
- unsigned char *participants,
- afr_transaction_type type)
-{
- int i = 0;
- int j = 0;
-
- j = afr_index_for_transaction_type (type);
- for (i = 0; i < child_count; i++) {
- if (!participants[i])
- pending[i][j] = 0;
- }
+ frame->root->lk_owner = local->saved_lk_owner;
}
-
void
-__mark_all_success (int32_t *pending[], int child_count,
- afr_transaction_type type)
+__mark_all_success (call_frame_t *frame, xlator_t *this)
{
- int i;
- int j;
-
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (-1);
- }
-}
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i;
-void
-_set_all_child_errno (int *child_errno, unsigned int child_count)
-{
- int i = 0;
+ local = frame->local;
+ priv = this->private;
- for (i = 0; i < child_count; i++)
- if (child_errno[i] == 0)
- child_errno[i] = ENOTCONN;
+ for (i = 0; i < priv->child_count; i++) {
+ local->transaction.failed_subvols[i] = 0;
+ }
}
-void
+
+int
afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
fd_t *fd = NULL;
local = frame->local;
- priv = this->private;
fd = local->fd;
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
-
- _set_all_child_errno (local->child_errno, priv->child_count);
-
/* Perform fops with the lk-owner from top xlator.
* Eg: lk-owner of posix-lk and flush should be same,
 * flush can't clear the posix-lks without that lk-owner.
@@ -209,6 +150,10 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
frame->root->lk_owner =
local->transaction.main_frame->root->lk_owner;
+ if (local->pre_op_compat)
+ /* old mode, pre-op was done as afr_changelog_do()
+ just now, before OP */
+ afr_changelog_pre_op_update (frame, this);
/* The wake up needs to happen independent of
what type of fop arrives here. If it was
@@ -221,6 +166,8 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
if (fd)
afr_delayed_changelog_wake_up (this, fd);
local->transaction.fop (frame, this);
+
+ return 0;
}
@@ -286,39 +233,28 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this)
return op_ret;
}
+
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- int child, afr_xattrop_type_t op)
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending)
{
int i = 0;
int ret = 0;
+ int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, };
- if (op == LOCAL_FIRST) {
- ret = dict_set_static_bin (xattr, priv->pending_key[child],
- pending[child],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
- if (ret)
- goto out;
- }
for (i = 0; i < priv->child_count; i++) {
- if (i == child)
- continue;
+ if (!memcmp (pending_zero, pending[i], sizeof (pending_zero)))
+ /* don't set xattrs for non-pending servers */
+ continue;
+
ret = dict_set_static_bin (xattr, priv->pending_key[i],
- pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof (int));
/* 3 = data+metadata+entry */
- if (ret < 0)
- goto out;
- }
- if (op == LOCAL_LAST) {
- ret = dict_set_static_bin (xattr, priv->pending_key[child],
- pending[child],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
if (ret)
- goto out;
+ break;
}
-out:
+
return ret;
}
@@ -347,110 +283,58 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
/* {{{ pending */
-int32_t
-afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr,
- dict_t *xdata)
+
+int
+afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int call_count = -1;
- priv = this->private;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
int_lock = &local->internal_lock;
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
-
- if (call_count == 0) {
- if (local->transaction.resume_stub) {
- AFR_CALL_RESUME (local->transaction.resume_stub);
- local->transaction.resume_stub = NULL;
- }
+ if (local->transaction.resume_stub) {
+ call_resume (local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
- }
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ local->transaction.done (frame, this);
+ } else {
+ int_lock->lock_cbk = local->transaction.done;
+ afr_unlock (frame, this);
+ }
- return 0;
+ return 0;
}
-void
-afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this,
- inode_t *inode, afr_transaction_type type)
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
{
- int i = -1;
- int count = 0;
- int read_child = -1;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int **pending = NULL;
- int idx = 0;
- int32_t *stale_children = NULL;
- int32_t *fresh_children = NULL;
- gf_boolean_t rm_stale_children = _gf_false;
-
- idx = afr_index_for_transaction_type (type);
-
- priv = this->private;
- local = frame->local;
- pending = local->pending;
+ afr_inodelk_t *inodelk = NULL;
+ int i = 0;
- if (local->op_ret < 0)
- goto out;
- fresh_children = local->fresh_children;
- read_child = afr_inode_get_read_ctx (this, inode, fresh_children);
- if (read_child < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain "
- "for %s", uuid_utoa (inode->gfid));
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!afr_is_child_present (fresh_children,
- priv->child_count, i))
- continue;
- if (pending[i][idx])
- continue;
- /* child is down or op failed on it */
- if (!stale_children)
- stale_children = afr_children_create (priv->child_count);
- if (!stale_children)
- goto out;
-
- rm_stale_children = _gf_true;
- stale_children[count++] = i;
- gf_log (this->name, GF_LOG_DEBUG, "Removing stale child "
- "%d for %s", i, uuid_utoa (inode->gfid));
+ for (i = 0; int_lock->inodelk[i].domain; i++) {
+ inodelk = &int_lock->inodelk[i];
+ if (strcmp (dom, inodelk->domain) == 0)
+ return inodelk;
}
-
- if (!rm_stale_children)
- goto out;
-
- afr_inode_rm_stale_children (this, inode, stale_children);
-out:
- GF_FREE (stale_children);
- return;
+ return NULL;
}
unsigned char*
afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
{
unsigned char *locked_nodes = NULL;
+ afr_inodelk_t *inodelk = NULL;
switch (type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- locked_nodes = int_lock->inode_locked_nodes;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ locked_nodes = inodelk->locked_nodes;
break;
case AFR_ENTRY_TRANSACTION:
@@ -463,367 +347,468 @@ afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
return locked_nodes;
}
+
int
-afr_changelog_pre_op_call_count (afr_transaction_type type,
- afr_internal_lock_t *int_lock,
- unsigned int child_count)
+afr_changelog_call_count (afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned int child_count)
{
- int call_count = 0;
- unsigned char *locked_nodes = NULL;
+ int call_count = 0;
- locked_nodes = afr_locked_nodes_get (type, int_lock);
- GF_ASSERT (locked_nodes);
+ call_count = AFR_COUNT(pre_op_subvols, child_count);
- call_count = afr_locked_children_count (locked_nodes, child_count);
if (type == AFR_ENTRY_RENAME_TRANSACTION)
call_count *= 2;
return call_count;
}
-int
-afr_changelog_post_op_call_count (afr_transaction_type type,
- unsigned char *pre_op,
- unsigned int child_count)
-{
- int call_count = 0;
- call_count = afr_pre_op_done_children_count (pre_op, child_count);
- if (type == AFR_ENTRY_RENAME_TRANSACTION)
- call_count *= 2;
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- return call_count;
-}
+ local = frame->local;
+ priv = this->private;
-void
-afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv)
-{
- int i = 0;
- int index = 0;
- int32_t postop = 0;
- int32_t preop = 1;
- int32_t **txn_changelog = NULL;
-
- txn_changelog = local->transaction.txn_changelog;
- index = afr_index_for_transaction_type (local->transaction.type);
for (i = 0; i < priv->child_count; i++) {
- postop = ntoh32 (local->pending[i][index]);
- txn_changelog[i][index] = hton32 (postop + preop);
+ if (local->transaction.failed_subvols[i])
+ return _gf_false;
}
+
+ return _gf_true;
}
-afr_xattrop_type_t
-afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child,
- afr_transaction_type type)
+
+void
+afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
{
- int index = 0;
- afr_xattrop_type_t op = LOCAL_LAST;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int i_errno = 0;
+ gf_boolean_t matching_errors = _gf_true;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
- index = afr_index_for_transaction_type (type);
- if (optimized && !pending[child][index])
- op = LOCAL_FIRST;
- return op;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret != -1) {
+ /* Operation succeeded on at least one subvol,
+ so it is not a failed-everywhere situation.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+ i_errno = local->replies[i].op_errno;
+
+ if (i_errno == ENOTCONN) {
+ /* ENOTCONN is not a symmetric error. We do not
+ know if the operation was performed on the
+ backend or not.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+
+ if (!op_errno) {
+ op_errno = i_errno;
+ } else if (op_errno != i_errno) {
+ /* Mismatching op_errno's */
+ matching_errors = _gf_false;
+ break;
+ }
+ }
+
+ if (matching_errors)
+ __mark_all_success (frame, this);
}
-void
-afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
- int optimized, int child)
+
+int
+afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
{
- int32_t **txn_changelog = NULL;
- int32_t **changelog = NULL;
- afr_private_t *priv = NULL;
- int ret = 0;
- afr_xattrop_type_t op = LOCAL_LAST;
+ afr_private_t * priv = this->private;
+ int i = 0;
+ int ret = 0;
+ int idx = 0;
+ afr_local_t * local = NULL;
+ dict_t *xattr = NULL;
+ int nothing_failed = 1;
+ gf_boolean_t need_undirty = _gf_false;
- priv = this->private;
- txn_changelog = local->transaction.txn_changelog;
- op = afr_get_postop_xattrop_type (local->pending, optimized, child,
- local->transaction.type);
- if (optimized)
- changelog = txn_changelog;
- else
- changelog = local->pending;
- ret = afr_set_pending_dict (priv, xattr, changelog, child, op);
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ local = frame->local;
+ idx = afr_index_for_transaction_type (local->transaction.type);
+
+ nothing_failed = afr_txn_nothing_failed (frame, this);
+
+ if (afr_changelog_pre_op_uninherit (frame, this))
+ need_undirty = _gf_false;
+ else
+ need_undirty = _gf_true;
+
+ if (nothing_failed && !need_undirty) {
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
+
+ xattr = dict_new ();
+ if (!xattr) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
+
+ if (need_undirty) {
+ local->dirty[idx] = hton32(-1);
+
+ ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
+
+ }
+
+ if (!nothing_failed) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict (priv, xattr, local->pending);
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
+
+ }
+
+ afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done);
+out:
+ if (xattr)
+ dict_unref (xattr);
+
+ return 0;
}
gf_boolean_t
-afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int index = -1;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
int i = 0;
+ gf_boolean_t ret = _gf_false;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int type = 0;
local = frame->local;
priv = this->private;
+ fd = local->fd;
- index = afr_index_for_transaction_type (local->transaction.type);
+ type = afr_index_for_transaction_type (local->transaction.type);
+ if (type != AFR_DATA_TRANSACTION)
+ return !local->transaction.dirtied;
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending[i][index] == 0)
- return _gf_false;
- }
+ if (!fd)
+ return !local->transaction.dirtied;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
+
+ if (local->transaction.no_uninherit)
+ return _gf_false;
+
+ /* This function must be idempotent. So check if we
+ were called before and return the same answer again.
+
+ It is important to keep this function idempotent for
+ the call in afr_changelog_post_op_safe() to not have
+ side effects on the call from afr_changelog_post_op_now()
+ */
+ if (local->transaction.uninherit_done)
+ return local->transaction.uninherit_value;
+
+ LOCK(&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] !=
+ fd_ctx->pre_op_done[type][i]) {
+ ret = !local->transaction.dirtied;
+ goto unlock;
+ }
+ }
+
+ if (fd_ctx->inherited[type]) {
+ ret = _gf_true;
+ fd_ctx->inherited[type]--;
+ } else if (fd_ctx->on_disk[type]) {
+ ret = _gf_false;
+ fd_ctx->on_disk[type]--;
+ } else {
+ /* ASSERT */
+ ret = _gf_false;
+ }
+
+ if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ fd_ctx->pre_op_done[type][i] = 0;
+ }
+ }
+unlock:
+ UNLOCK(&fd->lock);
- return _gf_true;
+ local->transaction.uninherit_done = _gf_true;
+ local->transaction.uninherit_value = ret;
+
+ return ret;
}
-int
-afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
{
- afr_private_t * priv = this->private;
- afr_internal_lock_t *int_lock = NULL;
- int i = 0;
- int call_count = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int type = 0;
- afr_local_t * local = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- dict_t **xattr = NULL;
- int piggyback = 0;
- int nothing_failed = 1;
+ local = frame->local;
+ priv = this->private;
+ fd = local->fd;
- local = frame->local;
- int_lock = &local->internal_lock;
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return _gf_false;
- __mark_non_participant_children (local->pending, priv->child_count,
- local->transaction.pre_op,
- local->transaction.type);
+ type = afr_index_for_transaction_type (local->transaction.type);
- if (local->fd)
- afr_transaction_rm_stale_children (frame, this,
- local->fd->inode,
- local->transaction.type);
+ if (!fd)
+ return _gf_false;
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
- for (i = 0; i < priv->child_count; i++) {
- xattr[i] = dict_new ();
- }
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
- call_count = afr_changelog_post_op_call_count (local->transaction.type,
- local->transaction.pre_op,
- priv->child_count);
- local->call_count = call_count;
+ LOCK(&fd->lock);
+ {
+ if (!fd_ctx->on_disk[type]) {
+ /* nothing to inherit yet */
+ ret = _gf_false;
+ goto unlock;
+ }
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] !=
+ fd_ctx->pre_op_done[type][i]) {
+ /* either inherit exactly, or don't */
+ ret = _gf_false;
+ goto unlock;
+ }
+ }
- if (call_count == 0) {
- /* no child is up */
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- goto out;
- }
+ fd_ctx->inherited[type]++;
- nothing_failed = afr_txn_nothing_failed (frame, this);
+ ret = _gf_true;
- afr_compute_txn_changelog (local , priv);
+ local->transaction.inherited = _gf_true;
+ }
+unlock:
+ UNLOCK(&fd->lock);
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
+ return ret;
+}
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- afr_set_postop_dict (local, this, xattr[i],
- local->optimistic_change_log, i);
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- afr_set_postop_dict (local, this, xattr[i],
- 0, i);
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- break;
- }
- /* local->transaction.postop_piggybacked[] was
- precomputed in is_piggyback_postop() when called from
- afr_changelog_post_op_safe()
- */
+gf_boolean_t
+afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
- piggyback = 0;
- if (local->transaction.postop_piggybacked[i])
- piggyback = 1;
+ local = frame->local;
+ priv = this->private;
+ fd = local->fd;
- afr_set_postop_dict (local, this, xattr[i],
- piggyback, i);
+ if (!fd)
+ return _gf_false;
- if (nothing_failed && piggyback) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i], NULL);
- } else {
- STACK_WIND_COOKIE (frame,
- afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (nothing_failed && local->optimistic_change_log) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
+ if (local->transaction.inherited)
+ /* was already inherited in afr_changelog_pre_op */
+ return _gf_false;
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- if (nothing_failed && local->optimistic_change_log) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- } else {
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- call_count--;
- }
+ if (!local->transaction.dirtied)
+ return _gf_false;
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ if (!afr_txn_nothing_failed (frame, this))
+ return _gf_false;
- afr_set_postop_dict (local, this, xattr[i],
- local->optimistic_change_log, i);
+ type = afr_index_for_transaction_type (local->transaction.type);
- /* fall through */
+ ret = _gf_false;
- case AFR_ENTRY_TRANSACTION:
- {
- if (nothing_failed && local->optimistic_change_log) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+ LOCK(&fd->lock);
+ {
+ if (!fd_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ fd_ctx->pre_op_done[type][i] =
+ local->transaction.pre_op[i];
+ } else {
+ for (i = 0; i < priv->child_count; i++)
+ if (fd_ctx->pre_op_done[type][i] !=
+ local->transaction.pre_op[i]) {
+ local->transaction.no_uninherit = 1;
+ goto unlock;
+ }
+ }
+ fd_ctx->on_disk[type]++;
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
- }
+ ret = _gf_true;
+ }
+unlock:
+ UNLOCK(&fd->lock);
- if (!--call_count)
- break;
- }
+ return ret;
+}
-out:
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+
+int
+afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+
+ if (op_ret == -1)
+ afr_transaction_fop_failed (frame, this, (long) cookie);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ local->transaction.changelog_resume (frame, this);
return 0;
}
-int32_t
-afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr,
- dict_t *xdata)
+int
+afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = this->private;
- int call_count = -1;
- int child_index = (long) cookie;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- switch (op_ret) {
- case 0:
- __mark_pre_op_done_on_fd (frame, this, child_index);
- //fallthrough we need to mark the pre_op
- case 1:
- local->transaction.pre_op[child_index] = 1;
- /* special op_ret for piggyback */
- break;
- case -1:
- if (op_errno == ENOTSUP) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop not supported by %s",
- priv->children[child_index]->name);
- local->op_ret = -1;
-
- } else if (!child_went_down (op_ret, op_errno)) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop failed on child %s: %s",
- priv->children[child_index]->name,
- strerror (op_errno));
+ call_count = afr_changelog_call_count (local->transaction.type,
+ local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ changelog_resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ local->transaction.changelog_resume = changelog_resume;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ if (!local->fd) {
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ } else {
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
}
- local->op_errno = op_errno;
- break;
- }
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ call_count--;
- if (call_count == 0) {
- if ((local->op_ret == -1) &&
- (local->op_errno == ENOTSUP)) {
- local->transaction.resume (frame, this);
- } else {
- afr_transaction_perform_fop (frame, this);
- }
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd)
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ else
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ break;
+ }
+
+ if (!--call_count)
+ break;
}
- return 0;
+ return 0;
}
+
int
afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
{
@@ -831,206 +816,122 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
int i = 0;
int ret = 0;
int call_count = 0;
- dict_t **xattr = NULL;
- afr_fd_ctx_t *fdctx = NULL;
+ int op_errno = 0;
afr_local_t *local = NULL;
- int piggyback = 0;
afr_internal_lock_t *int_lock = NULL;
unsigned char *locked_nodes = NULL;
+ unsigned char *pending_subvols = NULL;
+ int idx = -1;
+ gf_boolean_t pre_nop = _gf_true;
+ dict_t *xdata_req = NULL;
local = frame->local;
int_lock = &local->internal_lock;
+ idx = afr_index_for_transaction_type (local->transaction.type);
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
-
- for (i = 0; i < priv->child_count; i++) {
- xattr[i] = dict_new ();
- }
+ locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
- call_count = afr_changelog_pre_op_call_count (local->transaction.type,
- int_lock,
- priv->child_count);
- if (call_count == 0) {
- local->internal_lock.lock_cbk =
- local->transaction.done;
- afr_unlock (frame, this);
- goto out;
- }
+ pending_subvols = alloca0 (priv->child_count);
- local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_nodes[i]) {
+ local->transaction.pre_op[i] = 1;
+ call_count++;
+ } else {
+ pending_subvols[i] = 1;
+ }
+ }
- __mark_all_pending (local->pending, priv->child_count,
- local->transaction.type);
+ /* TBD: quorum check w/ call_count */
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+ if (call_count == 0) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
- locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
- for (i = 0; i < priv->child_count; i++) {
- if (!locked_nodes[i])
- continue;
- ret = afr_set_pending_dict (priv, xattr[i], local->pending,
- i, LOCAL_FIRST);
+ xdata_req = dict_new();
+ if (!xdata_req) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ pre_nop = _gf_true;
+ if (afr_changelog_pre_op_inherit (frame, this))
+ goto next;
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- break;
- }
-
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_done[i]) {
- fdctx->pre_op_piggyback[i]++;
- piggyback = 1;
- fdctx->hit++;
- } else {
- fdctx->miss++;
- }
- }
- UNLOCK (&local->fd->lock);
+ if (call_count < priv->child_count) {
+ /* For subvols we are not performing operation on,
+ mark them as pending up-front along with the FOP
+ so that we can safely defer unmarking dirty until
+ later.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict (priv, xdata_req,
+ local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ pre_nop = _gf_false;
+ }
- afr_set_delayed_post_op (frame, this);
+ if (call_count > 1 &&
+ (local->transaction.type == AFR_DATA_TRANSACTION ||
+ !local->optimistic_change_log)) {
- if (piggyback)
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+ /* If we are performing change on only one subvol, no
+ need to mark dirty, because we are setting the pending
+ counts already anyways
+ */
+ local->dirty[idx] = hton32(1);
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
+ ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- } else {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
+ pre_nop = _gf_false;
+ local->transaction.dirtied = 1;
+ }
- call_count--;
- }
+ if (pre_nop)
+ goto next;
+ if (!local->pre_op_compat) {
+ dict_copy (xdata_req, local->xdata_req);
+ goto next;
+ }
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop);
- ret = afr_set_pending_dict (priv, xattr[i], local->pending,
- i, LOCAL_FIRST);
+ if (xdata_req)
+ dict_unref (xdata_req);
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ return 0;
+next:
+ afr_transaction_perform_fop (frame, this);
- /* fall through */
+ if (xdata_req)
+ dict_unref (xdata_req);
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+ return 0;
+err:
+ local->internal_lock.lock_cbk = local->transaction.done;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
- }
+ afr_unlock (frame, this);
- if (!--call_count)
- break;
- }
-out:
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ if (xdata_req)
+ dict_unref (xdata_req);
- return 0;
+ return 0;
}
@@ -1179,12 +1080,14 @@ int
afr_set_transaction_flock (afr_local_t *local)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- int_lock->lk_flock.l_len = local->transaction.len;
- int_lock->lk_flock.l_start = local->transaction.start;
- int_lock->lk_flock.l_type = F_WRLCK;
+ inodelk->flock.l_len = local->transaction.len;
+ inodelk->flock.l_start = local->transaction.start;
+ inodelk->flock.l_type = F_WRLCK;
return 0;
}
@@ -1199,6 +1102,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
+ int_lock->domain = this->name;
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
@@ -1259,121 +1163,105 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
void
afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- /* call this function from any of the related optimizations
- which benefit from delaying post op are enabled, namely:
+ /* call this function from any of the related optimizations
+ which benefit from delaying post op are enabled, namely:
- - changelog piggybacking
- - eager locking
- */
+ - changelog piggybacking
+ - eager locking
+ */
- priv = this->private;
- if (!priv)
- return;
+ priv = this->private;
+ if (!priv)
+ return;
- if (!priv->post_op_delay_secs)
- return;
+ if (!priv->post_op_delay_secs)
+ return;
local = frame->local;
if (!local->transaction.eager_lock_on)
return;
- if (!local)
- return;
+ if (!local)
+ return;
+
+ if (!local->fd)
+ return;
+
+ if (local->op == GF_FOP_WRITE)
+ local->delayed_post_op = _gf_true;
+}
+
+gf_boolean_t
+afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ if (!fd) {
+ /* If false is returned, it may keep on taking eager-lock
+ * which may lead to starvation, so return true to avoid that.
+ */
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd");
+ return _gf_true;
+ }
+ /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any data operations until mount1 releases eager-lock.
+ * To avoid such scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1
+ */
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_true;
- if (!local->fd)
- return;
+ if (fd_ctx->open_fd_count > 1)
+ return _gf_true;
- if (local->op == GF_FOP_WRITE)
- local->delayed_post_op = _gf_true;
+ return _gf_false;
}
gf_boolean_t
is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- gf_boolean_t res = _gf_false;
+ afr_local_t *local = NULL;
+ gf_boolean_t res = _gf_false;
- local = frame->local;
- if (!local)
- goto out;
+ local = frame->local;
+ if (!local)
+ goto out;
- if (!local->delayed_post_op)
- goto out;
+ if (!local->delayed_post_op)
+ goto out;
- res = _gf_true;
+ //Mark pending changelog ASAP
+ if (!afr_txn_nothing_failed (frame, this))
+ goto out;
+
+ if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
+ goto out;
+
+ res = _gf_true;
out:
- return res;
+ return res;
}
void
afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub);
+ call_stub_t *stub);
void
afr_delayed_changelog_wake_up_cbk (void *data)
{
- fd_t *fd = NULL;
+ fd_t *fd = NULL;
- fd = data;
+ fd = data;
- afr_delayed_changelog_wake_up (THIS, fd);
-}
-
-
-/*
- Check if the frame is destined to get optimized away
- with changelog piggybacking
-*/
-static gf_boolean_t
-is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
-{
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *local = NULL;
- gf_boolean_t piggyback = _gf_true;
- afr_private_t *priv = NULL;
- int i = 0;
-
- priv = frame->this->private;
- local = frame->local;
- fdctx = afr_fd_ctx_get (fd, frame->this);
-
- LOCK(&fd->lock);
- {
- piggyback = _gf_true;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- local->transaction.postop_piggybacked[i] = 1;
- } else {
- /* For at least _one_ subvolume we cannot
- piggyback on the changelog, and have to
- perform a hard POST-OP and therefore fsync
- if necesssary
- */
- piggyback = _gf_false;
- GF_ASSERT (fdctx->pre_op_done[i]);
- fdctx->pre_op_done[i]--;
- }
- }
- }
- UNLOCK(&fd->lock);
-
- if (!afr_txn_nothing_failed (frame, frame->this)) {
- /* something failed in this transaction,
- we will be performing a hard post-op
- */
- return _gf_false;
- }
-
- return piggyback;
+ afr_delayed_changelog_wake_up (THIS, fd);
}
@@ -1381,70 +1269,72 @@ is_piggyback_post_op (call_frame_t *frame, fd_t *fd)
int
afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
{
- afr_fd_ctx_t *fdctx = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
- fdctx = afr_fd_ctx_get (fd, this);
+ fdctx = afr_fd_ctx_get (fd, this);
- LOCK(&fd->lock);
- {
- fdctx->witnessed_unstable_write = _gf_true;
- }
- UNLOCK(&fd->lock);
+ LOCK(&fd->lock);
+ {
+ fdctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&fd->lock);
- return 0;
+ return 0;
}
/* TEST and CLEAR operation */
gf_boolean_t
afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
{
- afr_fd_ctx_t *fdctx = NULL;
- gf_boolean_t witness = _gf_false;
+ afr_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t witness = _gf_false;
- fdctx = afr_fd_ctx_get (fd, this);
+ fdctx = afr_fd_ctx_get (fd, this);
+ if (!fdctx)
+ return _gf_true;
- LOCK(&fd->lock);
- {
- if (fdctx->witnessed_unstable_write) {
- witness = _gf_true;
- fdctx->witnessed_unstable_write = _gf_false;
- }
- }
- UNLOCK (&fd->lock);
+ LOCK(&fd->lock);
+ {
+ if (fdctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ fdctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK (&fd->lock);
- return witness;
+ return witness;
}
int
afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *pre,
- struct iatt *post, dict_t *xdata)
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
int child_index = (long) cookie;
- int call_count = -1;
- afr_local_t *local = NULL;
+ int call_count = -1;
+ afr_local_t *local = NULL;
- priv = this->private;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- if (afr_fop_failed (op_ret, op_errno)) {
- /* Failure of fsync() is as good as failure of previous
- write(). So treat it like one.
+ if (op_ret != 0) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
*/
- gf_log (this->name, GF_LOG_WARNING,
- "fsync(%s) failed on subvolume %s. Transaction was %s",
- uuid_utoa (local->fd->inode->gfid),
- priv->children[child_index]->name,
- gf_fop_list[local->op]);
+ gf_log (this->name, GF_LOG_WARNING,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa (local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ gf_fop_list[local->op]);
- afr_transaction_fop_failed (frame, this, child_index);
+ afr_transaction_fop_failed (frame, this, child_index);
}
- call_count = afr_frame_return (frame);
+ call_count = afr_frame_return (frame);
- if (call_count == 0)
+ if (call_count == 0)
afr_changelog_post_op_now (frame, this);
return 0;
@@ -1454,38 +1344,46 @@ afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
+ call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
- if (!call_count) {
- /* will go straight to unlock */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- local->call_count = call_count;
+ local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
+ xdata = dict_new();
+ if (xdata)
+ ret = dict_set_int32 (xdata, "batch-fsync", 1);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
- STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+ STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
(void *) (long) i, priv->children[i],
priv->children[i]->fops->fsync, local->fd,
- 1, NULL);
- if (!--call_count)
- break;
- }
+ 1, xdata);
+ if (!--call_count)
+ break;
+ }
- return 0;
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
}
@@ -1493,74 +1391,83 @@ int
afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
- if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
-
- if (is_piggyback_post_op (frame, local->fd)) {
- /* just detected that this post-op is about to
- be optimized away as a new write() has
- already piggybacked on this frame's changelog.
- */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
-
- /* Calling afr_changelog_post_op_now() now will result in
- issuing ->[f]xattrop().
-
- Performing a hard POST-OP (->[f]xattrop() FOP) is a more
- responsible operation that what it might appear on the surface.
-
- The changelog of a file (in the xattr of the file on the server)
- stores information (pending count) about the state of the file
- on the OTHER server. This changelog is blindly trusted, and must
- therefore be updated in such a way it remains trustworthy. This
- implies that decrementing the pending count (essentially "clearing
- the dirty flag") must be done STRICTLY after we are sure that the
- operation on the other server has reached stable storage.
-
- While the backend filesystem on that server will eventually flush
- it to stable storage, we (being in userspace) have no mechanism
- to get notified when the write became "stable".
-
- This means we need take matter into our own hands and issue an
- fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
- and get an acknowledgement for it. And we need to wait for the
- fsync() acknowledgement before initiating the hard POST-OP.
-
- However if the FD itself was opened in O_SYNC or O_DSYNC then
- we are already guaranteed that the writes were made stable as
- part of the FOP itself. The same holds true for NFS stable
- writes which happen on an anonymous FD with O_DSYNC or O_SYNC
- flag set in the writev() @flags param. For all other write types,
- mark a flag in the fdctx whenever an unstable write is witnessed.
- */
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (afr_changelog_pre_op_uninherit (frame, this) &&
+ afr_txn_nothing_failed (frame, this)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- /* Time to fsync() */
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- afr_changelog_fsync (frame, this);
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync (frame, this);
+ } else {
+ afr_changelog_post_op_now (frame, this);
+ }
- return 0;
+ return 0;
}
void
afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub)
+ call_stub_t *stub)
{
afr_fd_ctx_t *fd_ctx = NULL;
call_frame_t *prev_frame = NULL;
- struct timeval delta = {0, };
+ struct timespec delta = {0, };
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -1568,10 +1475,10 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
fd_ctx = afr_fd_ctx_get (fd, this);
if (!fd_ctx)
- return;
+ goto out;
delta.tv_sec = priv->post_op_delay_secs;
- delta.tv_usec = 0;
+ delta.tv_nsec = 0;
pthread_mutex_lock (&fd_ctx->delay_lock);
{
@@ -1590,10 +1497,11 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
unlock:
pthread_mutex_unlock (&fd_ctx->delay_lock);
+out:
if (prev_frame) {
local = prev_frame->local;
local->transaction.resume_stub = stub;
- afr_changelog_post_op_safe (prev_frame, this);
+ afr_changelog_post_op_now (prev_frame, this);
} else if (stub) {
call_resume (stub);
}
@@ -1603,14 +1511,14 @@ unlock:
void
afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (is_afr_delayed_changelog_post_op_needed (frame, this))
- afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
- else
- afr_changelog_post_op_safe (frame, this);
+ if (is_afr_delayed_changelog_post_op_needed (frame, this))
+ afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
+ else
+ afr_changelog_post_op_safe (frame, this);
}
@@ -1621,54 +1529,48 @@ afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
The @stub gets saved in @local and gets resumed in
afr_local_cleanup()
-*/
+ */
void
afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
{
- afr_delayed_changelog_post_op (this, NULL, fd, stub);
+ afr_delayed_changelog_post_op (this, NULL, fd, stub);
}
void
afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
{
- afr_delayed_changelog_post_op (this, NULL, fd, NULL);
+ afr_delayed_changelog_post_op (this, NULL, fd, NULL);
}
int
afr_transaction_resume (call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (local->transaction.eager_lock_on) {
- /* We don't need to retain "local" in the
- fd list anymore, writes to all subvols
- are finished by now */
- LOCK (&local->fd->lock);
- {
- list_del_init (&local->transaction.eager_locked);
- }
- UNLOCK (&local->fd->lock);
- }
- afr_restore_lk_owner (frame);
+ if (local->transaction.eager_lock_on) {
+ /* We don't need to retain "local" in the
+ fd list anymore, writes to all subvols
+ are finished by now */
+ afr_remove_eager_lock_stub (local);
+ }
+
+ afr_restore_lk_owner (frame);
+
+ afr_handle_symmetric_errors (frame, this);
+
+ if (!local->pre_op_compat)
+ /* new mode, pre-op was done along
+ with OP */
+ afr_changelog_pre_op_update (frame, this);
if (__fop_changelog_needed (frame, this)) {
afr_changelog_post_op (frame, this);
} else {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
+ afr_changelog_post_op_done (frame, this);
}
return 0;
@@ -1680,64 +1582,63 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
+ int child_index)
{
afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
local = frame->local;
- priv = this->private;
- __mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
+ local->transaction.failed_subvols[child_index] = 1;
}
-static gf_boolean_t
+ static gf_boolean_t
afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
{
- uint64_t start1 = local1->transaction.start;
- uint64_t start2 = local2->transaction.start;
- uint64_t end1 = 0;
- uint64_t end2 = 0;
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
- if (local1->transaction.len)
- end1 = start1 + local1->transaction.len - 1;
- else
- end1 = ULLONG_MAX;
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
- if (local2->transaction.len)
- end2 = start2 + local2->transaction.len - 1;
- else
- end2 = ULLONG_MAX;
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
- return ((end1 >= start2) && (end2 >= start1));
+ return ((end1 >= start2) && (end2 >= start1));
}
-
void
afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *each = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *each = NULL;
- priv = this->private;
+ priv = this->private;
- if (!local->fd)
- return;
+ if (!local->fd)
+ return;
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- return;
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return;
- if (!priv->eager_lock)
- return;
+ if (!priv->eager_lock)
+ return;
- fdctx = afr_fd_ctx_get (local->fd, this);
- if (!fdctx)
- return;
+ fdctx = afr_fd_ctx_get (local->fd, this);
+ if (!fdctx)
+ return;
+ if (afr_are_multiple_fds_opened (local->fd, this))
+ return;
/*
* Once full file lock is acquired in eager-lock phase, overlapping
* writes do not compete for inode-locks, instead are transferred to the
@@ -1755,22 +1656,22 @@ afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
* This check makes sure the locks are not transferred for
* overlapping writes.
*/
- LOCK (&local->fd->lock);
- {
- list_for_each_entry (each, &fdctx->eager_locked,
- transaction.eager_locked) {
- if (afr_locals_overlap (each, local)) {
- local->transaction.eager_lock_on = _gf_false;
- goto unlock;
- }
- }
+ LOCK (&local->fd->lock);
+ {
+ list_for_each_entry (each, &fdctx->eager_locked,
+ transaction.eager_locked) {
+ if (afr_locals_overlap (each, local)) {
+ local->transaction.eager_lock_on = _gf_false;
+ goto unlock;
+ }
+ }
- local->transaction.eager_lock_on = _gf_true;
- list_add_tail (&local->transaction.eager_locked,
- &fdctx->eager_locked);
- }
+ local->transaction.eager_lock_on = _gf_true;
+ list_add_tail (&local->transaction.eager_locked,
+ &fdctx->eager_locked);
+ }
unlock:
- UNLOCK (&local->fd->lock);
+ UNLOCK (&local->fd->lock);
}
@@ -1789,11 +1690,10 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local->transaction.type = type;
ret = afr_transaction_local_init (local, this);
- if (ret < 0) {
+ if (ret < 0)
goto out;
- }
- afr_transaction_eager_lock_init (local, this);
+ afr_transaction_eager_lock_init (local, this);
if (local->fd && local->transaction.eager_lock_on)
afr_set_lk_owner (frame, this, local->fd);
@@ -1802,6 +1702,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
if (!local->transaction.eager_lock_on && local->loc.inode) {
fd = fd_lookup (local->loc.inode, frame->root->pid);
+ if (fd == NULL)
+ fd = fd_lookup_anonymous (local->loc.inode);
+
if (fd) {
afr_delayed_changelog_wake_up (this, fd);
fd_unref (fd);
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 55e8bbcca..77cc8eed0 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -11,10 +11,7 @@
#ifndef __TRANSACTION_H__
#define __TRANSACTION_H__
-typedef enum {
- LOCAL_FIRST = 1,
- LOCAL_LAST = 2
-} afr_xattrop_type_t;
+#include "afr.h"
void
afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
@@ -23,14 +20,15 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
+
int32_t
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
-afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this);
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- int child, afr_xattrop_type_t op);
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
+
void
afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
@@ -38,6 +36,18 @@ void
afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
void
-__mark_all_success (int32_t *pending[], int child_count,
- afr_transaction_type type);
+__mark_all_success (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this);
+
+int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type);
+
+int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
+
+int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this);
+int __afr_txn_write_done (call_frame_t *frame, xlator_t *this);
+call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
+
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index bee10fd01..ead08425f 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -21,11 +21,6 @@
#endif
#include "afr-common.c"
-#define SHD_INODE_LRU_LIMIT 2048
-#define AFR_EH_HEALED_LIMIT 1024
-#define AFR_EH_HEAL_FAIL_LIMIT 1024
-#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
-
struct volume_options options[];
int32_t
@@ -114,6 +109,14 @@ reconfigure (xlator_t *this, dict_t *options)
priv = this->private;
+ GF_OPTION_RECONF ("afr-dirty-xattr",
+ priv->afr_dirty, options, str,
+ out);
+
+ GF_OPTION_RECONF ("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, options, bool,
+ out);
+
GF_OPTION_RECONF ("background-self-heal-count",
priv->background_self_heal_count, options, uint32,
out);
@@ -127,9 +130,6 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options,
bool, out);
- GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool,
- out);
-
GF_OPTION_RECONF ("data-self-heal-window-size",
priv->data_self_heal_window_size, options,
uint32, out);
@@ -146,8 +146,6 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("data-self-heal-algorithm",
priv->data_self_heal_algorithm, options, str, out);
- GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, bool, out);
-
GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
@@ -175,22 +173,29 @@ reconfigure (xlator_t *this, dict_t *options)
priv->read_child = index;
}
+ GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out);
+
GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out);
GF_OPTION_RECONF ("quorum-type", qtype, options, str, out);
GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,
uint32, out);
fix_quorum_options(this,priv,qtype);
- GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options,
- int32, out);
GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options,
uint32, out);
GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size,
- options, size, out);
+ options, size_uint64, out);
/* Reset this so we re-discover in case the topology changed. */
- GF_OPTION_RECONF ("readdir-failover", priv->readdir_failover, options,
+ GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options,
bool, out);
+
+ GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options,
+ bool, out);
+
+ GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options,
+ bool, out);
+
priv->did_discovery = _gf_false;
ret = 0;
@@ -242,10 +247,6 @@ init (xlator_t *this)
priv = this->private;
LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
- //lock recovery is not done in afr
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
child_count = xlator_subvolume_count (this);
@@ -253,6 +254,11 @@ init (xlator_t *this)
priv->read_child = -1;
+ GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);
+
+ GF_OPTION_INIT ("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, bool, out);
+
GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out);
if (read_subvol) {
priv->read_child = xlator_subvolume_index (this, read_subvol);
@@ -306,10 +312,6 @@ init (xlator_t *this)
GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
- GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out);
-
- GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
-
GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);
GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool,
@@ -324,17 +326,22 @@ init (xlator_t *this)
GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out);
- GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out);
+ GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out);
GF_OPTION_INIT ("quorum-type", qtype, str, out);
GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out);
- GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size,
+ GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
out);
fix_quorum_options(this,priv,qtype);
GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
- GF_OPTION_INIT ("readdir-failover", priv->readdir_failover, bool, out);
+ GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool,
+ out);
+
+ GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out);
+
+ GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
priv->wait_count = 1;
@@ -376,8 +383,6 @@ init (xlator_t *this)
AFR_XATTR_PREFIX,
trav->xlator->name);
if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
ret = -ENOMEM;
goto out;
}
@@ -386,6 +391,13 @@ init (xlator_t *this)
i++;
}
+ ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT,
+ this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
gf_afr_mt_int32_t);
if (!priv->last_event) {
@@ -393,6 +405,12 @@ init (xlator_t *this)
goto out;
}
+ ret = afr_selfheal_daemon_init (this);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/* keep more local here as we may need them for self-heal etc */
this->local_pool = mem_pool_new (afr_local_t, 512);
if (!this->local_pool) {
@@ -402,53 +420,8 @@ init (xlator_t *this)
goto out;
}
- priv->first_lookup = 1;
priv->root_inode = NULL;
- if (!priv->shd.iamshd) {
- ret = 0;
- goto out;
- }
-
- ret = -ENOMEM;
- priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count,
- gf_afr_mt_brick_pos_t);
- if (!priv->shd.pos)
- goto out;
-
- priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count,
- gf_afr_mt_int32_t);
- if (!priv->shd.pending)
- goto out;
-
- priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress),
- child_count, gf_afr_mt_shd_bool_t);
- if (!priv->shd.inprogress)
- goto out;
- priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count,
- gf_afr_mt_shd_timer_t);
- if (!priv->shd.timer)
- goto out;
-
- priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false);
- if (!priv->shd.healed)
- goto out;
-
- priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false);
- if (!priv->shd.heal_failed)
- goto out;
-
- priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false);
- if (!priv->shd.split_brain)
- goto out;
-
- this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this);
- if (!this->itable)
- goto out;
- priv->root_inode = inode_ref (this->itable->root);
- GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out);
- GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out);
-
ret = 0;
out:
return ret;
@@ -463,7 +436,7 @@ fini (xlator_t *this)
priv = this->private;
this->private = NULL;
afr_priv_destroy (priv);
- if (this->itable);//I dont see any destroy func
+ //if (this->itable);//I dont see any destroy func
return 0;
}
@@ -483,6 +456,9 @@ struct xlator_fops fops = {
.finodelk = afr_finodelk,
.entrylk = afr_entrylk,
.fentrylk = afr_fentrylk,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
/* inode read */
.access = afr_access,
@@ -555,11 +531,11 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = 0,
.max = 2,
- .default_value = "0",
+ .default_value = "1",
.description = "inode-read fops happen only on one of the bricks in "
"replicate. AFR will prefer the one computed using "
"the method specified using this option"
- "0 = first responder, "
+ "0 = first up server, "
"1 = hash by GFID of file (all clients use "
"same subvolume), "
"2 = hash by GFID of file and client PID",
@@ -567,8 +543,8 @@ struct volume_options options[] = {
{ .key = {"choose-local" },
.type = GF_OPTION_TYPE_BOOL,
.default_value = "true",
- .description = "Choose a local subvolume(i.e. Brick) to read from if "
- "read-subvolume is not explicitly set.",
+ .description = "Choose a local subvolume (i.e. Brick) to read from"
+ " if read-subvolume is not explicitly set.",
},
{ .key = {"favorite-child"},
.type = GF_OPTION_TYPE_XLATOR,
@@ -658,10 +634,6 @@ struct volume_options options[] = {
"pre fop changelog operations in afr transaction "
"if this option is enabled."
},
- { .key = {"strict-readdir"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
{ .key = {"inodelk-trace"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
@@ -672,13 +644,19 @@ struct volume_options options[] = {
.default_value = "off",
.description = "Enabling this option logs entry lock/unlocks"
},
+ { .key = {"pre-op-compat"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Use separate pre-op xattrop() FOP rather than "
+ "overloading xdata of the OP"
+ },
{ .key = {"eager-lock"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
.description = "Lock phase of a transaction has two sub-phases. "
"First is an attempt to acquire locks in parallel by "
"broadcasting non-blocking lock requests. If lock "
- "aquistion fails on any server, then the held locks "
+ "acquisition fails on any server, then the held locks "
"are unlocked and revert to a blocking locked mode "
"sequentially on one server after another. If this "
"option is enabled the initial broadcasting lock "
@@ -694,14 +672,14 @@ struct volume_options options[] = {
"arrives before the unlock phase of the \"optimized\" "
"transaction, that in turn \"takes over\" the lock as "
"well. The actual unlock now happens at the end of "
- "the last \"optimzed\" transaction."
+ "the last \"optimized\" transaction."
},
{ .key = {"self-heal-daemon"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
+ .default_value = "on",
.description = "This option applies to only self-heal-daemon. "
- "Index directory crawl and automatic healing of files"
+ "Index directory crawl and automatic healing of files "
"will not be performed if this option is turned off."
},
{ .key = {"iam-self-heal-daemon"},
@@ -736,14 +714,6 @@ struct volume_options options[] = {
"self-heal-daemon so that it can crawl only on "
"local index directories.",
},
- { .key = {"heal-timeout"},
- .type = GF_OPTION_TYPE_INT,
- .min = 60,
- .max = INT_MAX,
- .default_value = "600",
- .description = "time interval for checking the need to self-heal "
- "in self-heal-daemon"
- },
{ .key = {"post-op-delay-secs"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
@@ -760,10 +730,20 @@ struct volume_options options[] = {
.max = 131072,
.default_value = "1KB",
},
- { .key = {"readdir-failover"},
+ { .key = {"ensure-durability"},
.type = GF_OPTION_TYPE_BOOL,
- .description = "readdir(p) will not failover if this option is off",
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
.default_value = "on",
},
+ { .key = {"afr-dirty-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = AFR_DIRTY_DEFAULT,
+ },
+ { .key = {"metadata-splitbrain-forced-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 387ed12ec..36042f7b2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -20,104 +20,42 @@
#include "call-stub.h"
#include "compat-errno.h"
#include "afr-mem-types.h"
-#include "afr-self-heal-algorithm.h"
#include "libxlator.h"
#include "timer.h"
+#include "syncop.h"
+
+#include "afr-self-heald.h"
#define AFR_XATTR_PREFIX "trusted.afr"
#define AFR_PATHINFO_HEADER "REPLICATE:"
#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
#define AFR_LOCKEE_COUNT_MAX 3
-
-struct _pump_private;
-
-typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int child, int32_t op_error,
- int32_t op_errno);
-
-typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int32_t op_error, int32_t op_errno);
-typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this);
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
-typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno);
-typedef enum {
- AFR_POS_UNKNOWN,
- AFR_POS_LOCAL,
- AFR_POS_REMOTE
-} afr_child_pos_t;
+typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol);
-typedef enum {
- SPLIT_BRAIN = 1,
- ALL_FOOLS = 2
-} afr_subvol_status_t;
+typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err);
-typedef enum {
- AFR_INODE_SET_READ_CTX = 1,
- AFR_INODE_RM_STALE_CHILDREN,
- AFR_INODE_SET_OPENDIR_DONE,
- AFR_INODE_GET_READ_CTX,
- AFR_INODE_GET_OPENDIR_DONE,
-} afr_inode_op_t;
-
-typedef struct afr_inode_params_ {
- afr_inode_op_t op;
- union {
- gf_boolean_t value;
- struct {
- int32_t read_child;
- int32_t *children;
- } read_ctx;
- } u;
-} afr_inode_params_t;
-
-typedef enum afr_spb_state {
- DONT_KNOW,
- SPB,
- NO_SPB
-} afr_spb_state_t;
-
-typedef struct afr_inode_ctx_ {
- uint64_t masks;
- int32_t *fresh_children;//increasing order of latency
- afr_spb_state_t mdata_spb;
- afr_spb_state_t data_spb;
-} afr_inode_ctx_t;
+typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
-typedef enum {
- NONE,
- INDEX,
- FULL,
-} afr_crawl_type_t;
-
-typedef struct afr_self_heald_ {
- gf_boolean_t enabled;
- gf_boolean_t iamshd;
- afr_crawl_type_t *pending;
- gf_boolean_t *inprogress;
- afr_child_pos_t *pos;
- gf_timer_t **timer;
- eh_t *healed;
- eh_t *heal_failed;
- eh_t *split_brain;
- char *node_uuid;
- int timeout;
-} afr_self_heald_t;
+#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;})
+#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;})
+#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
- unsigned int read_child_rr; /* round-robin index of the read_child */
- gf_lock_t read_child_lock; /* lock to protect above */
-
xlator_t **children;
- int first_lookup;
inode_t *root_inode;
unsigned char *child_up;
@@ -138,6 +76,7 @@ typedef struct _afr_private {
gf_boolean_t metadata_change_log; /* on/off */
gf_boolean_t entry_change_log; /* on/off */
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
unsigned int hash_mode; /* for when read_child is not set */
int favorite_child; /* subvolume to be preferred in resolving
@@ -146,143 +85,45 @@ typedef struct _afr_private {
gf_boolean_t inodelk_trace;
gf_boolean_t entrylk_trace;
- gf_boolean_t strict_readdir;
-
unsigned int wait_count; /* # of servers to wait for success */
uint64_t up_count; /* number of CHILD_UPs we have seen */
uint64_t down_count; /* number of CHILD_DOWNs we have seen */
- struct _pump_private *pump_private; /* Set if we are loaded as pump */
- int use_afr_in_pump;
-
- pthread_mutex_t mutex;
- struct list_head saved_fds; /* list of fds on which locks have succeeded */
gf_boolean_t optimistic_change_log;
gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
uint32_t post_op_delay_secs;
unsigned int quorum_count;
char vol_uuid[UUID_SIZE + 1];
int32_t *last_event;
- afr_self_heald_t shd;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
+
gf_boolean_t choose_local;
gf_boolean_t did_discovery;
- gf_boolean_t readdir_failover;
uint64_t sh_readdir_size;
-} afr_private_t;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
+ char *afr_dirty;
-typedef struct {
- /* External interface: These are variables (some optional) that
- are set by whoever has triggered self-heal */
-
- gf_boolean_t do_data_self_heal;
- gf_boolean_t do_metadata_self_heal;
- gf_boolean_t do_entry_self_heal;
- gf_boolean_t do_gfid_self_heal;
- gf_boolean_t do_missing_entry_self_heal;
- gf_boolean_t force_confirm_spb; /* Check for split-brains even when
- self-heal is turned off */
-
- gf_boolean_t forced_merge; /* Is this a self-heal triggered to
- forcibly merge the directories? */
-
- gf_boolean_t background; /* do self-heal in background
- if possible */
- ia_type_t type; /* st_mode of the entry we're doing
- self-heal on */
- inode_t *inode; /* inode on which the self-heal is
- performed on */
- uuid_t sh_gfid_req; /* gfid self-heal needs to be done
- with this gfid if it is not null */
-
- /* Function to call to unwind. If self-heal is being done in the
- background, this function will be called as soon as possible. */
-
- int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret,
- int32_t op_errno, int32_t sh_failed);
-
- /* End of external interface members */
-
-
- /* array of stat's, one for each child */
- struct iatt *buf;
- struct iatt *parentbufs;
- struct iatt parentbuf;
- struct iatt entrybuf;
-
- afr_expunge_done_cbk_t expunge_done;
- afr_impunge_done_cbk_t impunge_done;
-
- /* array of xattr's, one for each child */
- dict_t **xattr;
-
- /* array containing if the lookups succeeded in the order of response
- */
- int32_t *success_children;
- int success_count;
- /* array containing the fresh children found in the self-heal process */
- int32_t *fresh_children;
- /* array containing the fresh children found in the parent lookup */
- int32_t *fresh_parent_dirs;
- /* array of errno's, one for each child */
- int *child_errno;
- /*loc used for lookup*/
- loc_t lookup_loc;
- int32_t lookup_flags;
- afr_lookup_done_cbk_t lookup_done;
-
- int32_t **pending_matrix;
- int32_t **delta_matrix;
+ afr_self_heald_t shd;
- int32_t op_ret;
- int32_t op_errno;
+ /* pump dependencies */
+ void *pump_private;
+ gf_boolean_t use_afr_in_pump;
+} afr_private_t;
- int *sources;
- int source;
- int active_source;
- int active_sinks;
- unsigned char *success;
- unsigned char *locked_nodes;
- int lock_count;
-
- const char *linkname;
- gf_boolean_t entries_skipped;
-
- int op_failed;
- gf_boolean_t actual_sh_started;
- gf_boolean_t sync_done;
- gf_boolean_t data_lock_held;
- gf_boolean_t eof_reached;
- fd_t *healing_fd;
- int file_has_holes;
- blksize_t block_size;
- off_t file_size;
- off_t offset;
- unsigned char *write_needed;
- uint8_t *checksum;
- afr_post_remove_call_t post_remove_call;
-
- loc_t parent_loc;
-
- call_frame_t *orig_frame;
- call_frame_t *old_loop_frame;
- gf_boolean_t unwound;
-
- afr_sh_algo_private_t *private;
-
- struct afr_sh_algorithm *algo;
- afr_lock_cbk_t data_lock_success_handler;
- afr_lock_cbk_t data_lock_failure_handler;
- gf_boolean_t data_lock_block;
- int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this);
- int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
- void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
-
- call_frame_t *sh_frame;
-} afr_self_heal_t;
typedef enum {
AFR_DATA_TRANSACTION, /* truncate, write, ... */
@@ -356,12 +197,19 @@ int
afr_entry_lockee_cmp (const void *l1, const void *l2);
typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
+
+typedef struct {
loc_t *lk_loc;
- struct gf_flock lk_flock;
int lockee_count;
afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -370,13 +218,11 @@ typedef struct {
unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
selfheal_lk_type_t selfheal_lk_type;
transaction_lk_type_t transaction_lk_type;
int32_t lock_count;
- int32_t inodelk_lock_count;
int32_t entrylk_lock_count;
uint64_t lock_number;
@@ -387,32 +233,75 @@ typedef struct {
int32_t lock_op_ret;
int32_t lock_op_errno;
afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
} afr_internal_lock_t;
-typedef struct _afr_locked_fd {
- fd_t *fd;
- struct list_head list;
-} afr_locked_fd_t;
-
struct afr_reply {
int valid;
int32_t op_ret;
int32_t op_errno;
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ uint8_t checksum[MD5_DIGEST_LENGTH];
};
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
+
+typedef struct {
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+
+ unsigned int *lock_piggyback;
+ unsigned int *lock_acquired;
+
+ int flags;
+
+ /* used for delayed-post-op optimization */
+ pthread_mutex_t delay_lock;
+ gf_timer_t *delay_timer;
+ call_frame_t *delay_frame;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+
+
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
+} afr_fd_ctx_t;
+
+
typedef struct _afr_local {
- int uid;
- int gid;
+ glusterfs_fop_t op;
unsigned int call_count;
- unsigned int success_count;
- unsigned int enoent_count;
-
- unsigned int govinda_gOvinda;
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- unsigned int read_child_index;
- unsigned char read_child_returned;
- unsigned int first_up_child;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
gf_lkowner_t saved_lk_owner;
@@ -421,30 +310,92 @@ typedef struct _afr_local {
int32_t **pending;
+ int dirty[AFR_NUM_CHANGE_LOGS];
+
loc_t loc;
loc_t newloc;
fd_t *fd;
+ afr_fd_ctx_t *fd_ctx;
- glusterfs_fop_t fop;
-
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
unsigned char *child_up;
- int32_t *fresh_children; //in the order of response
- int32_t *child_errno;
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- dict_t *xattr_req;
+ /* @readfn:
- int32_t inodelk_count;
- int32_t entrylk_count;
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- afr_internal_lock_t internal_lock;
+ afr_read_txn_wind_t readfn;
- afr_locked_fd_t *locked_fd;
- int32_t source_child;
- int32_t lock_recovery_child;
+ /* @refreshed:
+
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
+
+ /* @inode:
+
+ the inode on which the read txn is performed. ref'ed and copied
+ from either fd->inode or loc.inode
+ */
+
+ inode_t *inode;
+
+ /* @parent[2]:
+
+ parent inode[s] on which directory transactions are performed.
+ */
+
+ inode_t *parent;
+ inode_t *parent2;
+
+ /* @readable:
+
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
+
+ afr_inode_refresh_cbk_t refreshfn;
+
+ /* @refreshinode:
+
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
+
+ /*
+ @pre_op_compat:
+
+ compatibility mode of pre-op. send separate pre-op and
+ op operations as part of the transaction, rather than combining them
+ */
+
+ gf_boolean_t pre_op_compat;
+
+ dict_t *xattr_req;
+
+ afr_internal_lock_t internal_lock;
dict_t *dict;
+
int optimistic_change_log;
gf_boolean_t delayed_post_op;
@@ -454,12 +405,16 @@ typedef struct _afr_local {
*/
gf_boolean_t stable_write;
+ /* This write appended to the file. Not necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
/*
This struct contains the arguments for the "continuation"
(scheme-like) of fops
*/
- int op;
struct {
struct {
unsigned char buf_set;
@@ -467,24 +422,6 @@ typedef struct _afr_local {
} statfs;
struct {
- uint32_t parent_entrylk;
- uuid_t gfid_req;
- inode_t *inode;
- struct iatt buf;
- struct iatt postparent;
- dict_t **xattrs;
- dict_t *xattr;
- struct iatt *postparents;
- struct iatt *bufs;
- int32_t read_child;
- int32_t *sources;
- int32_t *success_children;
- int32_t **pending_matrix;
- gf_boolean_t fresh_lookup;
- gf_boolean_t possible_spb;
- } lookup;
-
- struct {
int32_t flags;
} open;
@@ -552,7 +489,9 @@ typedef struct _afr_local {
struct {
struct iatt prebuf;
struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+ struct {
int32_t op_ret;
struct iovec *vector;
@@ -563,34 +502,21 @@ typedef struct _afr_local {
} writev;
struct {
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} truncate;
struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} ftruncate;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} setattr;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} fsetattr;
struct {
@@ -652,6 +578,26 @@ typedef struct _afr_local {
dict_t *params;
char *linkpath;
} symlink;
+
+ struct {
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+
} cont;
struct {
@@ -668,22 +614,67 @@ typedef struct _afr_local {
afr_transaction_type type;
- /* pre-compute the post piggyback status before
- entering POST-OP phase
- */
- int *postop_piggybacked;
-
/* stub to resume on destruction
of the transaction frame */
call_stub_t *resume_stub;
struct list_head eager_locked;
- int32_t **txn_changelog;//changelog after pre+post ops
unsigned char *pre_op;
+ /* @fop_subvols: subvolumes on which FOP will be attempted */
+ unsigned char *fop_subvols;
+
+ /* @failed_subvols: subvolumes on which FOP failed. Always
+ a subset of @fop_subvols */
+ unsigned char *failed_subvols;
+
+ /* @dirtied: flag which indicates whether we set dirty flag
+ in the OP. Typically true when we are performing operation
+ on more than one subvol and optimistic changelog is disabled
+
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
+
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
+
+ /*
+ @no_uninherit: flag which indicates that pre_op_uninherit()
+ must _not_ be attempted (and must always be reported as a failure). This
+ flag is set when a hard pre-op is performed but is not accounted
+ for in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
+
+ /* @uninherit_done:
+ @uninherit_value:
+
+ The above pair of variables makes pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit()
+ sets @uninherit_done to TRUE and stores its return value in
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
+
+ /* @changelog_resume: function to be called after changelogging
+ (either pre-op or post-op) is done
+ */
+
+ afr_changelog_resume_t changelog_resume;
+
call_frame_t *main_frame;
+ int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
+
int (*fop) (call_frame_t *frame, xlator_t *this);
int (*done) (call_frame_t *frame, xlator_t *this);
@@ -695,7 +686,7 @@ typedef struct _afr_local {
/* post-op hook */
} transaction;
- afr_self_heal_t self_heal;
+ syncbarrier_t barrier;
struct marker_str marker;
@@ -709,75 +700,58 @@ typedef struct _afr_local {
struct afr_reply *replies;
} afr_local_t;
-typedef enum {
- AFR_FD_NOT_OPENED,
- AFR_FD_OPENED,
- AFR_FD_OPENING
-} afr_fd_open_status_t;
-
-typedef struct {
- unsigned int *pre_op_done;
- afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
- unsigned int *pre_op_piggyback;
-
- unsigned int *lock_piggyback;
- unsigned int *lock_acquired;
-
- int flags;
- uint64_t up_count; /* number of CHILD_UPs this fd has seen */
- uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
-
- int32_t last_tried;
-
- int hit, miss;
- gf_boolean_t failed_over;
- struct list_head entries; /* needed for readdir failover */
-
- unsigned char *locked_on; /* which subvolumes locks have been successful */
-
- /* used for delayed-post-op optimization */
- pthread_mutex_t delay_lock;
- gf_timer_t *delay_timer;
- call_frame_t *delay_frame;
- int call_child;
-
- /* set if any write on this fd was a non stable write
- (i.e, without O_SYNC or O_DSYNC)
- */
- gf_boolean_t witnessed_unstable_write;
-
- /* list of frames currently in progress */
- struct list_head eager_locked;
-} afr_fd_ctx_t;
-
-
-/* try alloc and if it fails, goto label */
-#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \
- var = mem_get0 (THIS->local_pool); \
- if (!var) { \
- gf_log (this->name, GF_LOG_ERROR, \
- "out of memory :("); \
- op_errno = ENOMEM; \
- goto label; \
- } \
- } while (0);
-
/* did a call fail due to a child failing? */
#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
((op_errno == ENOTCONN) || \
(op_errno == EBADFD)))
-#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+int
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
-/* have we tried all children? */
-#define all_tried(i, count) ((i) == (count) - 1)
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid);
+int
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this);
int
-pump_command_reply (call_frame_t *frame, xlator_t *this);
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable);
+
+int
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type);
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type);
+
+#define afr_data_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION)
+
+#define afr_metadata_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION)
+
+int
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t cbk);
int32_t
afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
@@ -793,9 +767,6 @@ int
afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
int
-afr_save_locked_fd (xlator_t *this, fd_t *fd);
-
-int
afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
unsigned char *locked_nodes);
@@ -805,10 +776,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);
int
afr_set_lock_number (call_frame_t *frame, xlator_t *this);
-
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2);
-
int32_t
afr_unlock (call_frame_t *frame, xlator_t *this);
@@ -824,46 +791,30 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this);
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count);
-int pump_start (call_frame_t *frame, xlator_t *this);
-
int
__afr_fd_ctx_set (xlator_t *this, fd_t *fd);
int
afr_fd_ctx_set (xlator_t *this, fd_t *fd);
-int32_t
-afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children);
-
-void
-afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,
- int32_t *fresh_children);
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
int
afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno);
-unsigned int
-afr_up_children_count (unsigned char *child_up, unsigned int child_count);
-
-unsigned int
-afr_locked_children_count (unsigned char *children, unsigned int child_count);
-
-unsigned int
-afr_pre_op_done_children_count (unsigned char *pre_op,
- unsigned int child_count);
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
-gf_boolean_t
-afr_is_fresh_lookup (loc_t *loc, xlator_t *this);
+int
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
void
-afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent);
-
-int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv);
void
afr_local_cleanup (afr_local_t *local, xlator_t *this);
@@ -871,32 +822,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this);
int
afr_frame_return (call_frame_t *frame);
-gf_boolean_t
-afr_is_split_brain (xlator_t *this, inode_t *inode);
-
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, afr_spb_state_t mdata_spb,
- afr_spb_state_t data_spb);
-
int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
fd_t *fd, dict_t *xdata);
void
-afr_set_opendir_done (xlator_t *this, inode_t *inode);
-
-gf_boolean_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode);
-
-void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
int
afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
-int
-afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);
-
#define AFR_STACK_UNWIND(fop, frame, params ...) \
do { \
afr_local_t *__local = NULL; \
@@ -927,23 +862,16 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);
} \
} while (0);
-#define AFR_CALL_RESUME(stub) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- \
- __local = stub->frame->local; \
- __this = stub->frame->this; \
- stub->frame->local = NULL; \
- \
- call_resume (stub); \
- if (__local) { \
- afr_local_cleanup (__local, __this); \
- mem_put (__local); \
- } \
- } while (0)
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({frame->local = mem_get0 (THIS->local_pool); \
+ if (afr_local_init (frame->local, THIS->private, &op_errno)) { \
+ afr_local_cleanup (frame->local, THIS); \
+ mem_put (frame->local); \
+ frame->local = NULL; }; \
+ frame->local;})
+
+#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0)
-#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
/* allocate and return a string that is the basename of argument */
static inline char *
AFR_BASENAME (const char *str)
@@ -956,6 +884,9 @@ AFR_BASENAME (const char *str)
return __basename_str;
}
+call_frame_t *
+afr_copy_frame (call_frame_t *base);
+
int
afr_transaction_local_init (afr_local_t *local, xlator_t *this);
@@ -963,9 +894,6 @@ int32_t
afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
-int32_t *
-afr_children_create (int32_t child_count);
-
int
afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
@@ -974,101 +902,20 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
transaction_lk_type_t lk_type);
int
-afr_first_up_child (unsigned char *child_up, size_t child_count);
+afr_higher_errno (int32_t old_errno, int32_t new_errno);
int
-afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count,
- int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources,
- unsigned int hmode, uuid_t gfid);
+afr_final_errno (afr_local_t *local, afr_private_t *priv);
-void
-afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
- int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child, uuid_t gfid);
-
-int32_t
-afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child,
- int32_t *fresh_children,
- int32_t *call_child, int32_t *last_index);
-
-int32_t
-afr_next_call_child (int32_t *fresh_children, unsigned char *child_up,
- size_t child_count, int32_t *last_index,
- int32_t read_child);
-void
-afr_get_fresh_children (int32_t *success_children, int32_t *sources,
- int32_t *children, unsigned int child_count);
-void
-afr_children_add_child (int32_t *children, int32_t child,
- int32_t child_count);
-void
-afr_children_rm_child (int32_t *children, int32_t child,
- int32_t child_count);
-void
-afr_reset_children (int32_t *children, int32_t child_count);
-int32_t
-afr_most_important_error(int32_t old_errno, int32_t new_errno,
- gf_boolean_t eio);
-int
-afr_errno_count (int32_t *children, int *child_errno,
- unsigned int child_count, int32_t op_errno);
int
-afr_get_children_count (int32_t *children, unsigned int child_count);
-gf_boolean_t
-afr_is_child_present (int32_t *success_children, int32_t child_count,
- int32_t child);
-void
-afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs,
- int32_t *success_children,
- unsigned int child_count);
-void
-afr_reset_xattr (dict_t **xattr, unsigned int child_count);
-gf_boolean_t
-afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count, const char *path,
- const char *xlator_name);
-unsigned int
-afr_gfid_missing_count (const char *xlator_name, int32_t *children,
- struct iatt *bufs, unsigned int child_count,
- const char *path);
-void
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path);
-void
-afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count);
-afr_transaction_type
-afr_transaction_type_get (ia_type_t ia_type);
-int32_t
-afr_resultant_errno_get (int32_t *children,
- int *child_errno, unsigned int child_count);
-void
-afr_inode_rm_stale_children (xlator_t *this, inode_t *inode,
- int32_t *stale_children);
-void
-afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
- gf_boolean_t background, ia_type_t ia_type, char *reason,
- void (*gfid_sh_success_cbk) (call_frame_t *sh_frame,
- xlator_t *this),
- int (*unwind) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- int32_t sh_failed));
-void
-afr_fix_open (xlator_t *this, fd_t *fd, size_t need_open_count, int *need_open);
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
void
-afr_open_fd_fix (fd_t *fd, xlator_t *this);
-int
-afr_set_elem_count_get (unsigned char *elems, int child_count);
+afr_fix_open (fd_t *fd, xlator_t *this);
afr_fd_ctx_t *
afr_fd_ctx_get (fd_t *fd, xlator_t *this);
-gf_boolean_t
-afr_open_only_data_self_heal (char *data_self_heal);
-
-gf_boolean_t
-afr_data_self_heal_enabled (char *data_self_heal);
-
void
afr_set_low_priority (call_frame_t *frame);
int
@@ -1084,22 +931,9 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m);
int32_t**
afr_matrix_create (unsigned int m, unsigned int n);
-gf_boolean_t
-afr_is_errno_set (int *child_errno, int child);
-
-gf_boolean_t
-afr_is_errno_unset (int *child_errno, int child);
-
-gf_boolean_t
-afr_is_fd_fixable (fd_t *fd);
-
void
-afr_prepare_new_entry_pending_matrix (int32_t **pending,
- gf_boolean_t (*is_pending) (int *, int),
- int *ctx, struct iatt *buf,
- unsigned int child_count);
-void
-afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
+afr_filter_xattrs (dict_t *xattr);
+
/*
* Special value indicating we should use the "auto" quorum method instead of
* a fixed value (including zero to turn off quorum enforcement).
@@ -1119,7 +953,6 @@ afr_xattr_array_destroy (dict_t **xattr, unsigned int child_count);
} \
} while (0);
-
int
afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
@@ -1129,4 +962,15 @@ afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
void
afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+int
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local);
+
+void
+afr_remove_eager_lock_stub (afr_local_t *local);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
index db83410c7..eed509956 100644
--- a/xlators/cluster/afr/src/pump.c
+++ b/xlators/cluster/afr/src/pump.c
@@ -21,6 +21,120 @@
#include "afr-common.c"
#include "defaults.c"
#include "glusterfs.h"
+#include "pump.h"
+
+
+static int
+afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
+{
+ int ret = 0;
+ uuid_t *pgfid = NULL;
+
+ GF_ASSERT (gfid);
+
+ pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
+ if (!pgfid) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (*pgfid, gfid);
+
+ ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t));
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed");
+
+out:
+ if (ret && pgfid)
+ GF_FREE (pgfid);
+ return ret;
+}
+
+static int
+afr_set_root_gfid (dict_t *dict)
+{
+ uuid_t gfid;
+ int ret = 0;
+
+ memset (gfid, 0, 16);
+ gfid[15] = 1;
+
+ ret = afr_set_dict_gfid (dict, gfid);
+
+ return ret;
+}
+
+static int
+afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+ int ret = -1;
+ uuid_t pargfid = {0};
+
+ if (!child)
+ goto out;
+
+ if (!uuid_is_null (parent->inode->gfid))
+ uuid_copy (pargfid, parent->inode->gfid);
+ else if (!uuid_is_null (parent->gfid))
+ uuid_copy (pargfid, parent->gfid);
+
+ if (uuid_is_null (pargfid))
+ goto out;
+
+ if (strcmp (parent->path, "/") == 0)
+ ret = gf_asprintf ((char **)&child->path, "/%s", name);
+ else
+ ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
+ name);
+
+ if (-1 == ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "asprintf failed while setting child path");
+ }
+
+ child->name = strrchr (child->path, '/');
+ if (child->name)
+ child->name++;
+
+ child->parent = inode_ref (parent->inode);
+ child->inode = inode_new (parent->inode->table);
+ uuid_copy (child->pargfid, pargfid);
+
+ if (!child->inode) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if ((ret == -1) && child)
+ loc_wipe (child);
+
+ return ret;
+}
+
+static void
+afr_build_root_loc (xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ loc->path = gf_strdup ("/");
+ loc->name = "";
+ loc->inode = inode_ref (priv->root_inode);
+ uuid_copy (loc->gfid, loc->inode->gfid);
+}
+
+static void
+afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
+{
+ GF_ASSERT (loc);
+ GF_ASSERT (buf);
+
+ uuid_copy (loc->gfid, buf->ia_gfid);
+ if (postparent)
+ uuid_copy (loc->pargfid, postparent->ia_gfid);
+}
static uint64_t pump_pid = 0;
static inline void
@@ -387,54 +501,68 @@ gf_pump_traverse_directory (loc_t *loc)
if (ret)
goto out;
- if (!IS_ENTRY_CWD (entry->d_name) &&
- !IS_ENTRY_PARENT (entry->d_name)) {
-
- is_directory_empty = _gf_false;
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup %s => %"PRId64,
- entry_loc.path,
- iatt.ia_ino);
-
- ret = syncop_lookup (this, &entry_loc, NULL,
- &iatt, &xattr_rsp, &parent);
-
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: lookup failed",
- entry_loc.path);
- continue;
- }
- pump_fill_loc_info (&entry_loc, &iatt,
- &parent);
-
- pump_update_resume_state (this, entry_loc.path);
-
- pump_save_path (this, entry_loc.path);
- pump_save_file_stats (this, entry_loc.path);
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump beginning to exit out");
- goto out;
- }
-
- if (IA_ISDIR (iatt.ia_type)) {
- if (is_pump_traversal_allowed (this, entry_loc.path)) {
- gf_log (this->name, GF_LOG_TRACE,
- "entering dir=%s",
- entry->d_name);
- gf_pump_traverse_directory (&entry_loc);
- }
- }
+ if ((strcmp (entry->d_name, ".") == 0) ||
+ (strcmp (entry->d_name, "..") == 0))
+ continue;
+
+ is_directory_empty = _gf_false;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "lookup %s => %"PRId64,
+ entry_loc.path,
+ iatt.ia_ino);
+
+ ret = syncop_lookup (this, &entry_loc, NULL, &iatt,
+ &xattr_rsp, &parent);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: lookup failed", entry_loc.path);
+ continue;
+ }
+
+ ret = afr_selfheal_name (this, loc->gfid, entry->d_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: name self-heal failed (%s/%s)",
+ entry_loc.path, uuid_utoa (loc->gfid),
+ entry->d_name);
+ continue;
+ }
+
+ ret = afr_selfheal (this, iatt.ia_gfid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: self-heal failed (%s)",
+ entry_loc.path, uuid_utoa (iatt.ia_gfid));
+ continue;
+ }
+
+ pump_fill_loc_info (&entry_loc, &iatt, &parent);
+
+ pump_update_resume_state (this, entry_loc.path);
+
+ pump_save_path (this, entry_loc.path);
+ pump_save_file_stats (this, entry_loc.path);
+
+ ret = pump_check_and_update_status (this);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Pump beginning to exit out");
+ goto out;
+ }
+
+ if (IA_ISDIR (iatt.ia_type)) {
+ if (is_pump_traversal_allowed (this, entry_loc.path)) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "entering dir=%s", entry->d_name);
+ gf_pump_traverse_directory (&entry_loc);
+ }
}
}
gf_dirent_free (&entries);
free_entries = _gf_false;
- gf_log (this->name, GF_LOG_TRACE,
- "offset incremented to %d",
+ gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d",
(int32_t ) offset);
}
@@ -443,7 +571,7 @@ gf_pump_traverse_directory (loc_t *loc)
if (ret < 0)
gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed");
- if (is_directory_empty && IS_ROOT_PATH (loc->path)) {
+ if (is_directory_empty && (strcmp (loc->path, "/") == 0)) {
pump_change_state (this, PUMP_STATE_RUNNING);
gf_log (this->name, GF_LOG_INFO, "Empty source brick. "
"Nothing to be done.");
@@ -496,17 +624,18 @@ pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this,
afr_build_root_loc (this, &loc);
ret = syncop_removexattr (priv->children[source], &loc,
- PUMP_PATH);
+ PUMP_PATH, 0);
ret = syncop_removexattr (priv->children[sink], &loc,
- PUMP_SINK_COMPLETE);
+ PUMP_SINK_COMPLETE, 0);
for (i = 0; i < priv->child_count; i++) {
ret = syncop_removexattr (priv->children[i], &loc,
- PUMP_SOURCE_COMPLETE);
- if (ret)
+ PUMP_SOURCE_COMPLETE, 0);
+ if (ret) {
gf_log (this->name, GF_LOG_DEBUG, "removexattr "
- "failed with %s", strerror (errno));
+ "failed with %s", strerror (-ret));
+ }
}
loc_wipe (&loc);
@@ -598,6 +727,7 @@ pump_lookup_sink (loc_t *loc)
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"Lookup on sink child failed");
+ ret = -1;
goto out;
}
@@ -1275,128 +1405,16 @@ out:
}
-struct _xattr_key {
- char *key;
- struct list_head list;
-};
-
-static int
-__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
- void *data)
-{
- struct list_head * list = data;
- struct _xattr_key * xkey = NULL;
-
- if (!strncmp (key, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
-
- xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
- if (!xkey)
- return -1;
-
- xkey->key = key;
- INIT_LIST_HEAD (&xkey->list);
-
- list_add_tail (&xkey->list, list);
- }
- return 0;
-}
-
-static void
-__filter_xattrs (dict_t *dict)
-{
- struct list_head keys;
-
- struct _xattr_key *key;
- struct _xattr_key *tmp;
-
- INIT_LIST_HEAD (&keys);
-
- dict_foreach (dict, __gather_xattr_keys,
- (void *) &keys);
-
- list_for_each_entry_safe (key, tmp, &keys, list) {
- dict_del (dict, key->key);
-
- list_del_init (&key->list);
-
- GF_FREE (key);
- }
-}
-
-int32_t
-pump_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.getxattr.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
-
- unwind = 0;
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name, NULL);
- }
-
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
-
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL);
- }
-
- return 0;
-}
-
-int32_t
-pump_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
+int
+pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- int32_t ret = -1;
- int32_t op_errno = 0;
- uint64_t read_child = 0;
-
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int ret = 0;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ priv = this->private;
- children = priv->children;
if (!priv->use_afr_in_pump) {
STACK_WIND (frame, default_getxattr_cbk,
FIRST_CHILD (this),
@@ -1405,14 +1423,6 @@ pump_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
if (name) {
if (!strncmp (name, AFR_XATTR_PREFIX,
strlen (AFR_XATTR_PREFIX))) {
@@ -1430,32 +1440,7 @@ pump_getxattr (call_frame_t *frame, xlator_t *this,
}
}
- local->fresh_children = GF_CALLOC (priv->child_count,
- sizeof (*local->fresh_children),
- gf_afr_mt_int32_t);
- if (!local->fresh_children) {
- ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.getxattr.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
- loc_copy (&local->loc, loc);
- if (name)
- local->cont.getxattr.name = gf_strdup (name);
-
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->getxattr,
- loc, name, xdata);
+ afr_getxattr (frame, this, loc, name, xdata);
ret = 0;
out:
@@ -1464,134 +1449,6 @@ out:
return 0;
}
-static int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno, NULL);
- }
- return 0;
-}
-
-static int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
-
-static int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags, NULL);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-static int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int32_t
-pump_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
int
pump_command_reply (call_frame_t *frame, xlator_t *this)
{
@@ -1615,51 +1472,56 @@ pump_command_reply (call_frame_t *frame, xlator_t *this)
}
int
-pump_parse_command (call_frame_t *frame, xlator_t *this,
- afr_local_t *local, dict_t *dict)
+pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict,
+ int *op_errno_p)
{
-
+ afr_local_t *local = NULL;
int ret = -1;
+ int op_errno = 0;
if (pump_command_start (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_start (frame, this);
} else if (pump_command_pause (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_pause (frame, this);
} else if (pump_command_abort (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_abort (frame, this);
} else if (pump_command_commit (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_commit (frame, this);
}
+out:
+ if (op_errno_p)
+ *op_errno_p = op_errno;
return ret;
}
int
-pump_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata)
+pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ afr_private_t *priv = NULL;
int ret = -1;
int op_errno = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict,
- op_errno, out);
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out);
priv = this->private;
if (!priv->use_afr_in_pump) {
@@ -1670,56 +1532,15 @@ pump_setxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0) {
- afr_local_cleanup (local, this);
- goto out;
- }
-
- ret = pump_parse_command (frame, this,
- local, dict);
- if (ret >= 0) {
- ret = 0;
+ ret = pump_parse_command (frame, this, dict, &op_errno);
+ if (ret >= 0)
goto out;
- }
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- ret = -1;
- afr_local_cleanup (local, this);
- goto out;
- }
-
- transaction_frame->local = local;
-
- local->op_ret = -1;
-
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
-
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
- local->transaction.unwind = afr_setxattr_unwind;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
-
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ afr_setxattr (frame, this, loc, dict, flags, xdata);
ret = 0;
out:
if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
}
@@ -2387,7 +2208,7 @@ int32_t
init (xlator_t *this)
{
afr_private_t * priv = NULL;
- pump_private_t *pump_priv = NULL;
+ pump_private_t *pump_priv = NULL;
int child_count = 0;
xlator_list_t * trav = NULL;
int i = 0;
@@ -2408,17 +2229,11 @@ init (xlator_t *this)
"Volume is dangling.");
}
- this->private = GF_CALLOC (1, sizeof (afr_private_t),
- gf_afr_mt_afr_private_t);
- if (!this->private)
+ priv = GF_CALLOC (1, sizeof (afr_private_t), gf_afr_mt_afr_private_t);
+ if (!priv)
goto out;
- priv = this->private;
LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
- //lock recovery is not done in afr
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
child_count = xlator_subvolume_count (this);
if (child_count != 2) {
@@ -2452,9 +2267,6 @@ init (xlator_t *this)
and the sink.
*/
- priv->strict_readdir = _gf_false;
-
- priv->wait_count = 1;
priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
gf_afr_mt_char);
if (!priv->child_up) {
@@ -2488,7 +2300,8 @@ init (xlator_t *this)
while (i < child_count) {
priv->children[i] = trav->xlator;
- ret = gf_asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
+ AFR_XATTR_PREFIX,
trav->xlator->name);
if (-1 == ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -2501,7 +2314,12 @@ init (xlator_t *this)
i++;
}
- priv->first_lookup = 1;
+ ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
priv->root_inode = NULL;
priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
@@ -2526,13 +2344,12 @@ init (xlator_t *this)
pump_priv->resume_path = GF_CALLOC (1, PATH_MAX,
gf_afr_mt_char);
if (!pump_priv->resume_path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
+ gf_log (this->name, GF_LOG_ERROR, "Out of memory");
ret = -1;
goto out;
}
- pump_priv->env = syncenv_new (0);
+ pump_priv->env = this->ctx->env;
if (!pump_priv->env) {
gf_log (this->name, GF_LOG_ERROR,
"Could not create new sync-environment");
@@ -2550,11 +2367,32 @@ init (xlator_t *this)
}
priv->pump_private = pump_priv;
+ pump_priv = NULL;
+
+ this->private = priv;
+ priv = NULL;
pump_change_state (this, PUMP_STATE_ABORT);
ret = 0;
out:
+
+ if (pump_priv) {
+ GF_FREE (pump_priv->resume_path);
+ LOCK_DESTROY (&pump_priv->resume_path_lock);
+ LOCK_DESTROY (&pump_priv->pump_state_lock);
+ GF_FREE (pump_priv);
+ }
+
+ if (priv) {
+ GF_FREE (priv->child_up);
+ GF_FREE (priv->children);
+ GF_FREE (priv->pending_key);
+ GF_FREE (priv->last_event);
+ LOCK_DESTROY (&priv->lock);
+ GF_FREE (priv);
+ }
+
return ret;
}
@@ -2573,9 +2411,6 @@ fini (xlator_t *this)
if (!pump_priv)
goto afr_priv;
- if (pump_priv->env)
- syncenv_destroy (pump_priv->env);
-
GF_FREE (pump_priv->resume_path);
LOCK_DESTROY (&pump_priv->resume_path_lock);
LOCK_DESTROY (&pump_priv->pump_state_lock);
diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h
index bc4c31a78..9d0b6db6a 100644
--- a/xlators/cluster/afr/src/pump.h
+++ b/xlators/cluster/afr/src/pump.h
@@ -75,4 +75,7 @@ pump_command_status (xlator_t *this, dict_t *dict);
int
pump_execute_status (call_frame_t *frame, xlator_t *this);
+int
+pump_command_reply (call_frame_t *frame, xlator_t *this);
+
#endif /* __PUMP_H__ */
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 174bea841..3fc29bf81 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,4 +1,3 @@
-
xlator_LTLIBRARIES = dht.la nufa.la switch.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 1d394910a..3868fc38f 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -22,6 +22,7 @@
#include "dht-common.h"
#include "defaults.h"
#include "byte-order.h"
+#include "glusterfs-acl.h"
#include <sys/time.h>
#include <libgen.h>
@@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data)
}
*size = hton64 (ntoh64 (*size) + ntoh64 (*ptr));
+
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime (THIS, dst, key, value);
+ if (ret < 0)
+ return ret;
} else {
/* compare user xattrs only */
if (!strncmp (key, "user.", strlen ("user."))) {
@@ -148,9 +154,11 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int op_errno = 0;
int ret = -1;
dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
local = discover_frame->local;
layout = local->layout;
+ conf = this->private;
LOCK(&discover_frame->lock);
{
@@ -193,11 +201,14 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
"(overlaps/holes present: %s, "
"ENOENT errors: %d)", local->loc.path,
(ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0);
- op_errno = EINVAL;
- goto out;
+ if ((ret > 0) && (ret == conf->subvolume_cnt)) {
+ op_errno = ESTALE;
+ goto out;
+ }
}
- dht_layout_set (this, local->inode, layout);
+ if (local->inode)
+ dht_layout_set (this, local->inode, layout);
}
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
@@ -430,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
op_ret, op_errno, xattr);
if (op_ret == -1) {
- local->op_errno = ENOENT;
+ local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"lookup of %s on %s returned error (%s)",
local->loc.path, prev->this->name,
@@ -739,7 +750,7 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
cached_subvol = local->cached_subvol;
conf = this->private;
- ret = dht_layout_preset (this, local->cached_subvol, inode);
+ ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode);
if (ret < 0) {
gf_log (this->name, GF_LOG_DEBUG,
"failed to set layout for subvolume %s",
@@ -1404,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
conf = this->private;
if (!conf)
@@ -1625,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
- if (op_ret == -1) {
+ if ((op_ret == -1) && !((op_errno == ENOENT) ||
+ (op_errno == ENOTCONN))) {
local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
@@ -1638,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
unlock:
UNLOCK (&frame->lock);
- if (op_ret == -1)
+ if (local->op_ret == -1)
goto err;
cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
@@ -1777,6 +1788,7 @@ dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this,
}
(void) strcat (local->xattr_val, value);
+ (void) strcat (local->xattr_val, " ");
local->op_ret = 0;
}
@@ -1801,6 +1813,8 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
if (!*dict)
goto out;
+ local->xattr_val[strlen (local->xattr_val) - 1] = '\0';
+
/* we would need max this many bytes to create xattr string
* extra 40 bytes is just an estimated amount of additional
* space required as we include translator name and some
@@ -1829,10 +1843,13 @@ dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
} else {
gf_log (this->name, GF_LOG_WARNING,
"Unknown local->xsel (%s)", local->xsel);
+ GF_FREE (xattr_buf);
goto out;
}
ret = dict_set_dynstr (*dict, local->xsel, xattr_buf);
+ if (ret)
+ GF_FREE (xattr_buf);
GF_FREE (local->xattr_val);
out:
@@ -1996,18 +2013,18 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (dict_get (xattr, conf->xattr_name)) {
dict_del (xattr, conf->xattr_name);
}
+
+ if (frame->root->pid >= 0 ) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
+
local->op_ret = 0;
if (!local->xattr) {
local->xattr = dict_copy_with_ref (xattr, NULL);
} else {
- /* first aggregate everything into xattr and then copy into
- * local->xattr. This is required as we want to have
- * 'local->xattr' as the proper final dictionary passed above
- * distribute xlator.
- */
- dht_aggregate_xattr (xattr, local->xattr);
- local->xattr = dict_copy (xattr, local->xattr);
+ dht_aggregate_xattr (local->xattr, xattr);
}
out:
if (is_last_call (this_call_cnt)) {
@@ -2027,6 +2044,67 @@ dht_getxattr_unwind (call_frame_t *frame,
int
+dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+
+
+ local = frame->local;
+
+ if (op_ret != -1) {
+ if (local->xattr)
+ dict_unref (local->xattr);
+ local->xattr = dict_ref (xattr);
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+ local->xattr_req = dict_ref (xdata);
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno,
+ local->xattr, local->xattr_req);
+ }
+
+ return 0;
+}
+
+
+int
+dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *key, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
+
+
+ local = frame->local;
+ layout = local->layout;
+
+ cnt = local->call_cnt = layout->cnt;
+
+ local->op_ret = -1;
+ local->op_errno = ENODATA;
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_getxattr_get_real_filename_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key, xdata);
+ }
+
+ return 0;
+}
+
+
+int
dht_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *key, dict_t *xdata)
#define DHT_IS_DIR(layout) (layout->cnt > 1)
@@ -2047,7 +2125,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -2075,6 +2152,14 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
}
+ if (key &&
+ (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)
+ && DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename (frame, this, loc, key, xdata);
+ return 0;
+ }
+
/* for file use cached subvolume (obviously!): see if {}
* below
* for directory:
@@ -2084,8 +2169,9 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
* NOTE: Don't trust inode here, as that may not be valid
* (until inode_link() happens)
*/
- if (key && (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
- && DHT_IS_DIR(layout)) {
+ if (key && DHT_IS_DIR(layout) &&
+ (XATTR_IS_PATHINFO (key)
+ || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) {
(void) strncpy (local->xsel, key, 256);
cnt = local->call_cnt = layout->cnt;
for (i = 0; i < cnt; i++) {
@@ -2099,7 +2185,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
/* node-uuid or pathinfo for files */
if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0)
- || (strcmp (key, GF_XATTR_PATHINFO_KEY) == 0))) {
+ || XATTR_IS_PATHINFO (key))) {
cached_subvol = local->cached_subvol;
(void) strncpy (local->xsel, key, 256);
@@ -2156,7 +2242,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, key,
local, dht_getxattr_unwind,
sub_volumes, cnt,
- MARKER_UUID_TYPE, conf->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -2164,6 +2251,18 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
+ if (key && !strcmp (GF_XATTR_QUOTA_LIMIT_LIST, key)) {
+ /* quota hardlimit and aggregated size of a directory is stored
+ * in inode contexts of each brick. Hence its good enough that
+ * we send getxattr for this key to any brick.
+ */
+ local->call_cnt = 1;
+ subvol = dht_first_up_subvol (this);
+ STACK_WIND (frame, dht_getxattr_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
+ return 0;
+ }
+
if (key && *conf->vol_uuid) {
if ((match_uuid_local (key, conf->vol_uuid) == 0) &&
(GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
@@ -2180,6 +2279,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
local, dht_getxattr_unwind,
sub_volumes, cnt,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -2252,6 +2352,7 @@ dht_fgetxattr (call_frame_t *frame, xlator_t *this,
}
if ((fd->inode->ia_type == IA_IFDIR)
+ && key
&& (strncmp (key, GF_XATTR_LOCKINFO_KEY,
strlen (GF_XATTR_LOCKINFO_KEY) != 0))) {
cnt = local->call_cnt = layout->cnt;
@@ -2395,7 +2496,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
conf = this->private;
@@ -2609,7 +2709,6 @@ dht_removexattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR);
if (!local) {
@@ -2778,11 +2877,16 @@ int
dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- int bsize = 0;
- int frsize = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int bsize = 0;
+ int frsize = 0;
+ int8_t quota_deem_statfs = 0;
+ GF_UNUSED int ret = 0;
+ unsigned long new_usage = 0;
+ unsigned long cur_usage = 0;
+ ret = dict_get_int8 (xdata, "quota-deem-statfs", &quota_deem_statfs);
local = frame->local;
@@ -2792,8 +2896,22 @@ dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
goto unlock;
}
+ if (!statvfs) {
+ op_errno = EINVAL;
+ local->op_ret = -1;
+ goto unlock;
+ }
local->op_ret = 0;
+ if (quota_deem_statfs) {
+ new_usage = statvfs->f_blocks - statvfs->f_bfree;
+ cur_usage = local->statvfs.f_blocks - local->statvfs.f_bfree;
+ /* We take the maximux of the usage from the subvols */
+ if (new_usage >= cur_usage)
+ local->statvfs = *statvfs;
+ goto unlock;
+ }
+
if (local->statvfs.f_bsize != 0) {
bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
@@ -2814,6 +2932,7 @@ dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->statvfs.f_flag = statvfs->f_flag;
local->statvfs.f_namemax = statvfs->f_namemax;
+
}
unlock:
UNLOCK (&frame->lock);
@@ -2841,7 +2960,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -2962,7 +3080,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
list_for_each_entry (orig_entry, (&orig_entries->list), list) {
next_offset = orig_entry->d_off;
if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
- (prev->this != dht_first_up_subvol (this))) {
+ (prev->this != local->first_up_subvol)) {
continue;
}
if (check_is_linkfile (NULL, (&orig_entry->d_stat),
@@ -3002,7 +3120,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
/* making sure we set the inode ctx right with layout,
currently possible only for non-directories, so for
directories don't set entry inodes */
- if (!IA_ISDIR(entry->d_stat.ia_type)) {
+ if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) {
ret = dht_layout_preset (this, prev->this,
orig_entry->inode);
if (ret)
@@ -3044,13 +3162,16 @@ done:
}
if (conf->readdir_optimize == _gf_true) {
- if (next_subvol != dht_first_up_subvol (this)) {
+ if (next_subvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name, GF_LOG_ERROR,
"dict set failed");
- }
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
+ }
}
STACK_WIND (frame, dht_readdirp_cbk,
@@ -3196,6 +3317,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->fd = fd_ref (fd);
local->size = size;
local->xattr_req = (dict)? dict_ref (dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol (this);
dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
@@ -3214,13 +3336,16 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
"failed to set '%s' key",
conf->link_xattr_name);
if (conf->readdir_optimize == _gf_true) {
- if (xvol != dht_first_up_subvol (this)) {
+ if (xvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name,
GF_LOG_ERROR,
"Dict set failed");
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
}
}
}
@@ -3478,7 +3603,9 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
subvol, subvol->fops->mknod, loc, mode,
rdev, umask, params);
} else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+
+ avail_subvol = dht_free_disk_available_subvol (this, subvol,
+ local);
if (avail_subvol != subvol) {
/* Choose the minimum filled volume, and create the
files there */
@@ -3590,12 +3717,12 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
}
hashed_subvol = dht_subvol_get_hashed (this, loc);
+ /* Dont fail unlink if hashed_subvol is NULL which can be the result
+ * of layout anomaly */
if (!hashed_subvol) {
gf_log (this->name, GF_LOG_DEBUG,
"no subvolume in layout for path=%s",
loc->path);
- op_errno = EINVAL;
- goto err;
}
cached_subvol = local->cached_subvol;
@@ -3607,7 +3734,7 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
}
local->flags = xflag;
- if (hashed_subvol != cached_subvol) {
+ if (hashed_subvol && hashed_subvol != cached_subvol) {
STACK_WIND (frame, dht_unlink_linkfile_cbk,
hashed_subvol, hashed_subvol->fops->unlink, loc,
xflag, xdata);
@@ -3899,7 +4026,7 @@ dht_create (call_frame_t *frame, xlator_t *this,
}
/* Choose the minimum filled volume, and create the
files there */
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+ avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->flags = flags;
@@ -4487,17 +4614,85 @@ err:
int
+dht_rmdir_cached_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *parent)
+{
+ dht_local_t *local = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *main_frame = NULL;
+ dht_local_t *main_local = NULL;
+ int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
+ dict_t *xattrs = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ src = local->hashed_subvol;
+
+ main_frame = local->main_frame;
+ main_local = main_frame->local;
+
+ if (op_ret == 0) {
+ main_local->op_ret = -1;
+ main_local->op_errno = ENOTEMPTY;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s found on cached subvol %s",
+ local->loc.path, src->name);
+ goto err;
+ } else if (op_errno != ENOENT) {
+ main_local->op_ret = -1;
+ main_local->op_errno = op_errno;
+ goto err;
+ }
+
+ xattrs = dict_new ();
+ if (!xattrs) {
+ gf_log (this->name, GF_LOG_ERROR, "dict_new failed");
+ goto err;
+ }
+
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key"
+ " in dict");
+ if (xattrs)
+ dict_unref (xattrs);
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_rmdir_lookup_cbk,
+ src, src->fops->lookup, &local->loc, xattrs);
+ if (xattrs)
+ dict_unref (xattrs);
+
+ return 0;
+err:
+
+ this_call_cnt = dht_frame_return (main_frame);
+ if (is_last_call (this_call_cnt))
+ dht_rmdir_do (main_frame, this);
+
+ DHT_STACK_DESTROY (frame);
+ return 0;
+}
+
+
+int
dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
gf_dirent_t *entries, xlator_t *src)
{
- int ret = 0;
- int build_ret = 0;
- gf_dirent_t *trav = NULL;
+ int ret = 0;
+ int build_ret = 0;
+ gf_dirent_t *trav = NULL;
call_frame_t *lookup_frame = NULL;
dht_local_t *lookup_local = NULL;
- dht_local_t *local = NULL;
- dict_t *xattrs = NULL;
- dht_conf_t *conf = this->private;
+ dht_local_t *local = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = this->private;
+ xlator_t *subvol = NULL;
local = frame->local;
@@ -4557,6 +4752,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
lookup_frame->local = lookup_local;
lookup_local->main_frame = frame;
+ lookup_local->hashed_subvol = src;
build_ret = dht_build_child_loc (this, &lookup_local->loc,
&local->loc, trav->d_name);
@@ -4575,9 +4771,20 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
}
UNLOCK (&frame->lock);
- STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk,
- src, src->fops->lookup,
- &lookup_local->loc, xattrs);
+ subvol = dht_linkfile_subvol (this, NULL, &trav->d_stat,
+ trav->dict);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_INFO,
+ "linkfile not having link subvolume. path=%s",
+ lookup_local->loc.path);
+ STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk,
+ src, src->fops->lookup,
+ &lookup_local->loc, xattrs);
+ } else {
+ STACK_WIND (lookup_frame, dht_rmdir_cached_lookup_cbk,
+ subvol, subvol->fops->lookup,
+ &lookup_local->loc, xattrs);
+ }
ret++;
}
@@ -4643,16 +4850,18 @@ int
dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
int this_call_cnt = -1;
- call_frame_t *prev = NULL;
- dict_t *dict = NULL;
- int ret = 0;
- dht_conf_t *conf = this->private;
+ call_frame_t *prev = NULL;
+ dict_t *dict = NULL;
+ int ret = 0;
+ dht_conf_t *conf = this->private;
+ int i = 0;
local = frame->local;
prev = cookie;
+ this_call_cnt = dht_frame_return (frame);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_DEBUG,
"opendir on %s for %s failed (%s)",
@@ -4665,6 +4874,12 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto err;
}
+ if (!is_last_call (this_call_cnt))
+ return 0;
+
+ if (local->op_ret == -1)
+ goto err;
+
dict = dict_new ();
if (!dict) {
local->op_ret = -1;
@@ -4678,9 +4893,13 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%s: failed to set '%s' key",
local->loc.path, conf->link_xattr_name);
- STACK_WIND (frame, dht_rmdir_readdirp_cbk,
- prev->this, prev->this->fops->readdirp,
- local->fd, 4096, 0, dict);
+ local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_rmdir_readdirp_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->readdirp,
+ local->fd, 4096, 0, dict);
+ }
if (dict)
dict_unref (dict);
@@ -4688,8 +4907,6 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
err:
- this_call_cnt = dht_frame_return (frame);
-
if (is_last_call (this_call_cnt)) {
dht_rmdir_do (frame, this);
}
@@ -4775,7 +4992,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK);
if (!local) {
@@ -4955,7 +5171,8 @@ dht_notify (xlator_t *this, int event, void *data, ...)
gf_log (this->name, GF_LOG_WARNING,
"Received CHILD_DOWN. Exiting");
if (conf->defrag) {
- gf_defrag_stop (conf->defrag, NULL);
+ gf_defrag_stop (conf->defrag,
+ GF_DEFRAG_STATUS_FAILED, NULL);
} else {
kill (getpid(), SIGTERM);
}
@@ -5031,7 +5248,8 @@ dht_notify (xlator_t *this, int event, void *data, ...)
if (cmd == GF_DEFRAG_CMD_STATUS)
gf_defrag_status_get (defrag, output);
else if (cmd == GF_DEFRAG_CMD_STOP)
- gf_defrag_stop (defrag, output);
+ gf_defrag_stop (defrag,
+ GF_DEFRAG_STATUS_STOPPED, output);
}
unlock:
UNLOCK (&defrag->lock);
@@ -5083,8 +5301,8 @@ unlock:
* not need to handle CHILD_DOWN event here.
*/
if (conf->defrag) {
- ret = pthread_create (&conf->defrag->th, NULL,
- gf_defrag_start, this);
+ ret = gf_thread_create (&conf->defrag->th, NULL,
+ gf_defrag_start, this);
if (ret) {
conf->defrag = NULL;
GF_FREE (conf->defrag);
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index f079e688b..2ece28a61 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -14,6 +14,7 @@
#endif
#include <regex.h>
+#include <signal.h>
#include "dht-mem-types.h"
#include "libxlator.h"
@@ -46,7 +47,7 @@ struct dht_layout {
int gen;
int type;
int ref; /* use with dht_conf_t->layout_lock */
- int search_unhashed;
+ gf_boolean_t search_unhashed;
struct {
int err; /* 0 = normal
-1 = dir exists and no xattr
@@ -183,6 +184,7 @@ struct dht_local {
xlator_t *link_subvol;
struct dht_rebalance_ rebalance;
+ xlator_t *first_up_subvol;
};
typedef struct dht_local dht_local_t;
@@ -211,6 +213,10 @@ enum gf_defrag_status_t {
GF_DEFRAG_STATUS_STOPPED,
GF_DEFRAG_STATUS_COMPLETE,
GF_DEFRAG_STATUS_FAILED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
};
typedef enum gf_defrag_status_t gf_defrag_status_t;
@@ -227,6 +233,7 @@ struct gf_defrag_info_ {
uint64_t total_data;
uint64_t num_files_lookedup;
uint64_t total_failures;
+ uint64_t skipped;
gf_lock_t lock;
int cmd;
pthread_t th;
@@ -256,7 +263,7 @@ struct dht_conf {
int gen;
dht_du_t *du_stats;
double min_free_disk;
- double min_free_inodes;
+ double min_free_inodes;
char disk_unit;
int32_t refresh_interval;
gf_boolean_t unhashed_sticky_bit;
@@ -325,12 +332,7 @@ typedef enum {
#define DHT_MIGRATION_IN_PROGRESS 1
#define DHT_MIGRATION_COMPLETED 2
-#define DHT_LINKFILE_MODE (S_ISVTX)
-
-#define check_is_linkfile(i,s,x,n) ( \
- ((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \
- == DHT_LINKFILE_MODE) && \
- dict_get (x, n))
+#define check_is_linkfile(i,s,x,n) (IS_DHT_LINKFILE_MODE (s) && dict_get (x, n))
#define IS_DHT_MIGRATION_PHASE2(buf) ( \
IA_ISREG ((buf)->ia_type) && \
@@ -349,6 +351,8 @@ typedef enum {
} \
} while (0)
+#define dht_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE)
+
#define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type))
#define layout_is_sane(layout) ((layout) && (layout->cnt > 0))
@@ -437,6 +441,7 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt
xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev);
int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p);
@@ -461,7 +466,8 @@ dht_layout_sort_volname (dht_layout_t *layout);
int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *layout);
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
@@ -683,6 +689,12 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
+int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata);
+int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, off_t len, dict_t *xdata);
int32_t dht_init (xlator_t *this);
void dht_fini (xlator_t *this);
@@ -720,7 +732,8 @@ int
gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict);
int
-gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output);
+gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status,
+ dict_t *output);
void*
gf_defrag_start (void *this);
@@ -740,6 +753,7 @@ dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
int
dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
int32_t update_ctx);
+void dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat);
int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx);
int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx);
@@ -752,9 +766,11 @@ dht_dir_has_layout (dict_t *xattr, char *name);
gf_boolean_t
dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
xlator_t *
-dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol);
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol);
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
int
dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this);
@@ -765,4 +781,7 @@ dht_priv_dump (xlator_t *this);
int32_t
dht_inodectx_dump (xlator_t *this, inode_t *inode);
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol);
+
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 0c87f4a64..fe3955ecb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -251,25 +251,45 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
/*Get the best subvolume to create the file in*/
xlator_t *
-dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
+dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *local)
{
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
conf = this->private;
+ if (!local)
+ goto out;
+ loc = &local->loc;
+ if (!local->layout) {
+ layout = dht_layout_get (this, loc->parent);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "layout missing path=%s parent=%s",
+ loc->path, uuid_utoa (loc->parent->gfid));
+ goto out;
+ }
+ } else {
+ layout = dht_layout_ref (this, local->layout);
+ }
- LOCK (&conf->subvolume_lock);
+ LOCK (&conf->subvolume_lock);
{
- avail_subvol = dht_subvol_with_free_space_inodes(this, subvol);
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ layout);
if(!avail_subvol)
{
avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
- subvol);
+ subvol,
+ layout);
}
}
UNLOCK (&conf->subvolume_lock);
-
+out:
if (!avail_subvol) {
gf_log (this->name,
GF_LOG_DEBUG,
@@ -278,17 +298,42 @@ dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
avail_subvol = subvol;
}
-
+ if (layout)
+ dht_layout_unref (this, layout);
return avail_subvol;
}
+static inline
+int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
+{
+ int ret = -1;
+ int i = 0;
+
+ if (!this || !layout)
+ goto out;
+
+ /* check if subvol has layout errors, before selecting it */
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp (layout->list[i].xlator->name, this->name) &&
+ (layout->list[i].err != 0)) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
/*Get subvolume which has both space and inodes more than the min criteria*/
xlator_t *
-dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
{
int i = 0;
double max = 0;
double max_inodes = 0;
+ int ignore_subvol = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
@@ -296,6 +341,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
conf = this->private;
for(i=0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
if ((conf->disk_unit == 'p') &&
(conf->du_stats[i].avail_percent > conf->min_free_disk) &&
(conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
@@ -325,10 +376,12 @@ dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol)
/* Get subvol which has atleast one inode and maximum space */
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol)
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
{
int i = 0;
double max = 0;
+ int ignore_subvol = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
@@ -336,6 +389,12 @@ dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol)
conf = this->private;
for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
if (conf->disk_unit == 'p') {
if ((conf->du_stats[i].avail_percent > max)
&& (conf->du_stats[i].avail_inodes > 0 )) {
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 519dbfbb2..656cf23a0 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -94,7 +94,7 @@ dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p)
if (!munged && priv->rsync_regex_valid) {
len = strlen(name) + 1;
rsync_friendly_name = alloca(len);
- gf_log (this->name, GF_LOG_DEBUG, "trying regex for %s", name);
+ gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name);
munged = dht_munge_name (name, rsync_friendly_name, len,
&priv->rsync_regex);
if (munged) {
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 52a24acc2..f1dc5072f 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -18,6 +18,28 @@
#include "xlator.h"
#include "dht-common.h"
+static inline int
+dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol)
+{
+ uint64_t tmp_subvol = 0;
+
+ tmp_subvol = (long)subvol;
+ return inode_ctx_set1 (inode, this, &tmp_subvol);
+}
+
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol)
+{
+ int ret = -1;
+ uint64_t tmp_subvol = 0;
+
+ ret = inode_ctx_get1 (inode, this, &tmp_subvol);
+ if (tmp_subvol && subvol)
+ *subvol = (xlator_t *)tmp_subvol;
+
+ return ret;
+}
+
int
dht_frame_return (call_frame_t *frame)
@@ -340,20 +362,6 @@ out:
return local;
}
-
-char *
-basestr (const char *str)
-{
- char *basestr = NULL;
-
- basestr = strrchr (str, '/');
- if (basestr)
- basestr ++;
-
- return basestr;
-}
-
-
xlator_t *
dht_first_up_subvol (xlator_t *this)
{
@@ -505,7 +513,36 @@ out:
return next;
}
+/* This func wraps around, if prev is actually the last subvol.
+ */
+xlator_t *
+dht_subvol_next_available (xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ /* if prev is last in conf->subvolumes, then wrap
+ * around.
+ */
+ if ((i + 1) < conf->subvolume_cnt) {
+ next = conf->subvolumes[i + 1];
+ } else {
+ next = conf->subvolumes[0];
+ }
+ break;
+ }
+ }
+
+out:
+ return next;
+}
int
dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
{
@@ -698,6 +735,10 @@ dht_migration_complete_check_task (void *data)
loc_t tmp_loc = {0,};
char *path = NULL;
dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ uint64_t tmp_subvol = 0;
+ int open_failed = 0;
this = THIS;
frame = data;
@@ -706,32 +747,48 @@ dht_migration_complete_check_task (void *data)
src_node = local->cached_subvol;
- if (!local->loc.inode && !local->fd)
+ if (!local->loc.inode && !local->fd) {
+ local->op_errno = EINVAL;
goto out;
+ }
- /* getxattr on cached_subvol for 'linkto' value */
- if (!local->loc.inode)
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check wont be done*/
+
+ if (!local->loc.inode) {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
conf->link_xattr_name);
- else
+ } else {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
conf->link_xattr_name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ }
if (!ret)
dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
if (ret) {
- if ((errno != ENOENT) || (!local->loc.inode)) {
+ if (!dht_inode_missing(-ret) || (!local->loc.inode)) {
+ local->op_errno = -ret;
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to get the 'linkto' xattr %s",
- local->loc.path, strerror (errno));
+ local->loc.path, strerror (-ret));
+ ret = -1;
goto out;
}
+
/* Need to do lookup on hashed subvol, then get the file */
ret = syncop_lookup (this, &local->loc, NULL, &stbuf, NULL,
NULL);
- if (ret)
+ if (ret) {
+ local->op_errno = -ret;
+ ret = -1;
goto out;
+ }
+
dst_node = dht_subvol_get_cached (this, local->loc.inode);
}
@@ -740,17 +797,21 @@ dht_migration_complete_check_task (void *data)
"%s: failed to get the destination node",
local->loc.path);
ret = -1;
+ local->op_errno = EINVAL;
goto out;
}
/* lookup on dst */
if (local->loc.inode) {
- ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL, NULL);
+ ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL,
+ NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to lookup the file on %s",
local->loc.path, dst_node->name);
+ local->op_errno = -ret;
+ ret = -1;
goto out;
}
@@ -759,6 +820,7 @@ dht_migration_complete_check_task (void *data)
"%s: gfid different on the target file on %s",
local->loc.path, dst_node->name);
ret = -1;
+ local->op_errno = EIO;
goto out;
}
}
@@ -766,15 +828,13 @@ dht_migration_complete_check_task (void *data)
/* update inode ctx (the layout) */
dht_layout_unref (this, local->layout);
- if (!local->loc.inode)
- ret = dht_layout_preset (this, dst_node, local->fd->inode);
- else
- ret = dht_layout_preset (this, dst_node, local->loc.inode);
+ ret = dht_layout_preset (this, dst_node, inode);
if (ret != 0) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: could not set preset layout for subvol %s",
local->loc.path, dst_node->name);
ret = -1;
+ local->op_errno = EINVAL;
goto out;
}
@@ -784,56 +844,69 @@ dht_migration_complete_check_task (void *data)
"%s: no pre-set layout for subvolume %s",
local->loc.path, dst_node ? dst_node->name : "<nil>");
ret = -1;
+ local->op_errno = EINVAL;
goto out;
}
- if (!local->loc.inode)
- ret = dht_layout_set (this, local->fd->inode, layout);
- else
- ret = dht_layout_set (this, local->loc.inode, layout);
+ ret = dht_layout_set (this, inode, layout);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set the new layout",
local->loc.path);
+ local->op_errno = EINVAL;
goto out;
}
local->cached_subvol = dst_node;
ret = 0;
- /* once we detect the migration complete, the fd-ctx is no more
- required.. delete the ctx */
- ret = fd_ctx_del (local->fd, this, NULL);
- if (!ret)
+ /* once we detect the migration complete, the inode-ctx2 is no more
+ required.. delete the ctx and also, it means, open() already
+ done on all the fd of inode */
+ ret = inode_ctx_reset1 (inode, this, &tmp_subvol);
+ if (tmp_subvol)
+ goto out;
+
+ if (list_empty (&inode->fd_list))
goto out;
/* perform open as root:root. There is window between linkfile
* creation(root:root) and setattr with the correct uid/gid
*/
SYNCTASK_SETID(0, 0);
- /* if 'local->fd' (ie, fd based operation), send a 'open()' on
- destination if not already done */
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
- ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ /* perform 'open()' on all the fd's present on the inode */
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
+ ret = syncop_open (dst_node, &tmp_loc,
+ (iter_fd->flags &
+ ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ local->op_errno = -ret;
+ ret = -1;
+ }
}
+ GF_FREE (path);
+
SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+
+ if (open_failed) {
+ ret = -1;
goto out;
}
-
ret = 0;
out:
@@ -878,6 +951,9 @@ dht_rebalance_inprogress_task (void *data)
struct iatt stbuf = {0,};
loc_t tmp_loc = {0,};
dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ int open_failed = 0;
this = THIS;
frame = data;
@@ -889,19 +965,26 @@ dht_rebalance_inprogress_task (void *data)
if (!local->loc.inode && !local->fd)
goto out;
- /* getxattr on cached_subvol for 'linkto' value */
- if (local->loc.inode)
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check won't be done */
+ if (local->loc.inode) {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
conf->link_xattr_name);
- else
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ } else {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
conf->link_xattr_name);
+ }
- if (ret) {
+ if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to get the 'linkto' xattr %s",
- local->loc.path, strerror (errno));
- goto out;
+ local->loc.path, strerror (-ret));
+ ret = -1;
+ goto out;
}
dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
@@ -923,6 +1006,7 @@ dht_rebalance_inprogress_task (void *data)
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to lookup the file on %s",
local->loc.path, dst_node->name);
+ ret = -1;
goto out;
}
@@ -936,35 +1020,52 @@ dht_rebalance_inprogress_task (void *data)
}
ret = 0;
+
+ if (list_empty (&inode->fd_list))
+ goto done;
+
/* perform open as root:root. There is window between linkfile
* creation(root:root) and setattr with the correct uid/gid
*/
SYNCTASK_SETID (0, 0);
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
+
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ (iter_fd->flags &
+ ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to send open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ ret = -1;
+ open_failed = 1;
+ }
}
+ GF_FREE (path);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+
+ if (open_failed) {
+ ret = -1;
goto out;
}
- SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node);
+done:
+ ret = dht_inode_ctx_set1 (this, inode, dst_node);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set fd-ctx target file at %s",
+ "%s: failed to set inode-ctx target file at %s",
local->loc.path, dst_node->name);
goto out;
}
@@ -1008,6 +1109,34 @@ dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
return ret;
}
+
+void
+dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ dht_stat_time_t *time = 0;
+ int ret = -1;
+
+ ret = dht_inode_ctx_get (inode, this, &ctx);
+
+ if (ret)
+ return;
+
+ time = &ctx->time;
+
+ time->mtime = stat->ia_mtime;
+ time->mtime_nsec = stat->ia_mtime_nsec;
+
+ time->ctime = stat->ia_ctime;
+ time->ctime_nsec = stat->ia_ctime_nsec;
+
+ time->atime = stat->ia_atime;
+ time->atime_nsec = stat->ia_atime_nsec;
+
+ return;
+}
+
+
int
dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
int32_t post)
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index f17cb73b9..e8a9a7196 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -35,7 +35,7 @@ dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
local->op_errno = op_errno;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -130,10 +130,11 @@ int
dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata)
{
- uint64_t tmp_subvol = 0;
+ xlator_t *subvol = 0;
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -143,7 +144,7 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
@@ -154,21 +155,23 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->call_cnt != 1)
goto out;
+ local->op_errno = op_errno;
/* Check if the rebalance phase2 is true */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
/* Phase 2 of migration */
local->rebalance.target_op_fn = dht_attr2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_attr2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -381,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = 0;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
if (!local) {
@@ -393,22 +398,24 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->call_cnt != 1)
goto out;
- if ((op_ret == -1) && (op_errno != ENOENT))
+ if ((op_ret == -1) && !dht_inode_missing(op_errno))
goto out;
+ local->op_errno = op_errno;
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
/* File would be migrated to other node */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_readv2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_readv2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -499,24 +506,34 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int ret = -1;
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
local = frame->local;
+ prev = cookie;
+ if (!prev || !prev->this)
+ goto out;
if (local->call_cnt != 1)
goto out;
if ((op_ret == -1) && (op_errno == ENOTCONN) &&
IA_ISDIR(local->loc.inode->ia_type)) {
- subvol = dht_first_up_subvol (this);
+ subvol = dht_subvol_next_available (this, prev->this);
if (!subvol)
goto out;
+ /* check if we are done with visiting every node */
+ if (subvol == local->cached_subvol) {
+ goto out;
+ }
+
STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
&local->loc, local->rebalance.flags, NULL);
return 0;
}
- if ((op_ret == -1) && (op_errno == ENOENT)) {
+ if ((op_ret == -1) && dht_inode_missing(op_errno)) {
/* File would be migrated to other node */
+ local->op_errno = op_errno;
local->rebalance.target_op_fn = dht_access2;
ret = dht_rebalance_complete_check (frame->this, frame);
if (!ret)
@@ -604,8 +621,9 @@ int
dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = -1;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
@@ -615,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
/* If context is set, then send flush() it to the destination */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_flush2 (this, frame, 0);
return 0;
}
@@ -632,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -701,12 +715,14 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
prev = cookie;
local->op_errno = op_errno;
- if (op_ret == -1) {
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -721,8 +737,9 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
goto out;
}
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ local->op_errno = op_errno;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_fsync2;
/* Check if the rebalance phase1 is true */
@@ -737,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
}
+ if (!ret)
+ return 0;
} else {
dht_fsync2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
out:
DHT_STRIP_PHASE1_FLAGS (postbuf);
@@ -757,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
-
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index d4a3ecc39..576f007e5 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -19,6 +19,9 @@
int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret);
+int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret);
int
dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -27,8 +30,9 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
- if (op_ret == -1) {
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
goto out;
}
@@ -50,6 +54,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_writev2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -62,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
dht_writev2 (this, frame, 0);
return 0;
}
@@ -87,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -169,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -178,7 +181,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
local->op_errno = op_errno;
local->op_ret = -1;
gf_log (this->name, GF_LOG_DEBUG,
@@ -198,6 +201,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_truncate2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -209,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_truncate2 (this, frame, 0);
return 0;
}
@@ -234,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = local->fd ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -346,6 +348,407 @@ err:
return 0;
}
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_fallocate2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_fallocate2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate,
+ local->fd, local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, NULL);
+
+ return 0;
+}
+
+int
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mode;
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_fallocate_cbk,
+ subvol, subvol->fops->fallocate,
+ fd, mode, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_discard2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_discard2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (discard, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_zerofill2;
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ ret = fd_ctx_get (local->fd, this, NULL);
+ if (!ret) {
+ dht_zerofill2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ uint64_t tmp_subvol = 0;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (local->fd)
+ ret = fd_ctx_get (local->fd, this, &tmp_subvol);
+ if (!ret)
+ subvol = (xlator_t *)(long)tmp_subvol;
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+
/* handle cases of migration here for 'setattr()' calls */
int
dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -360,7 +763,7 @@ dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
local->op_errno = op_errno;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -397,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -461,9 +862,13 @@ unlock:
UNLOCK (&frame->lock);
this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
+ if (is_last_call (this_call_cnt)) {
+ if (local->op_ret == 0)
+ dht_inode_ctx_time_set (local->loc.inode, this,
+ &local->stbuf);
DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno,
&local->prebuf, &local->stbuf, xdata);
+ }
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 057229869..e1a37b77c 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -25,14 +25,12 @@
#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size))
-
dht_layout_t *
dht_layout_new (xlator_t *this, int cnt)
{
dht_layout_t *layout = NULL;
dht_conf_t *conf = NULL;
-
conf = this->private;
layout = GF_CALLOC (1, layout_size (cnt),
@@ -50,6 +48,7 @@ dht_layout_new (xlator_t *this, int cnt)
}
layout->ref = 1;
+
out:
return layout;
}
@@ -443,8 +442,13 @@ dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator)
int i = 0;
for (i = 0; i < layout->cnt; i++) {
- if (!strcmp (layout->list[i].xlator->name, xlator->name))
- return _gf_true;
+ /* Check if xlator is already part of layout, and layout is
+ * non-zero. */
+ if (!strcmp (layout->list[i].xlator->name, xlator->name)) {
+ if (layout->list[i].start != layout->list[i].stop)
+ return _gf_true;
+ break;
+ }
}
return _gf_false;
}
@@ -454,12 +458,16 @@ dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
{
int64_t diff = 0;
- if (layout->list[i].err || layout->list[j].err)
- diff = layout->list[i].err - layout->list[j].err;
- else
- diff = (int64_t) layout->list[i].start
- - (int64_t) layout->list[j].start;
+ /* swap zero'ed out layouts to front, if needed */
+ if (!layout->list[j].start && !layout->list[j].stop) {
+ diff = (int64_t) layout->list[i].stop
+ - (int64_t) layout->list[j].stop;
+ goto out;
+ }
+ diff = (int64_t) layout->list[i].start
+ - (int64_t) layout->list[j].start;
+out:
return diff;
}
@@ -524,8 +532,20 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
char is_virgin = 1;
uint32_t no_space = 0;
- /* TODO: explain what is happening */
-
+ /* This funtion scans through the layout spread of a directory to
+ check if there are any anomalies. Prior to calling this function
+ the layout entries should be sorted in the ascending order.
+
+ If the layout entry has err != 0
+ then increment the corresponding anomaly.
+ else
+ if (start of the current layout entry > stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates a hole in the layout
+ if (start of the current layout entry < stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates an overlap in the layout
+ */
last_stop = layout->list[0].start - 1;
prev_stop = last_stop;
@@ -533,14 +553,15 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
switch (layout->list[i].err) {
case -1:
case ENOENT:
+ case ESTALE:
missing++;
- break;
+ continue;
case ENOTCONN:
down++;
- break;
+ continue;
case ENOSPC:
no_space++;
- break;
+ continue;
case 0:
/* if err == 0 and start == stop, then it is a non misc++;
* participating subvolume(spread-cnt). Then, do not
@@ -552,6 +573,7 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
break;
default:
misc++;
+ continue;
}
is_virgin = 0;
@@ -713,7 +735,7 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
&disk_layout_raw);
if (dict_ret < 0) {
- if (err == 0) {
+ if (err == 0 && layout->list[pos].stop) {
gf_log (this->name, GF_LOG_INFO,
"%s - disk layout missing", loc->path);
ret = -1;
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index b4a074b65..dbc9d0b3c 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -19,6 +19,35 @@
#include "compat.h"
#include "dht-common.h"
+int
+dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ char is_linkfile = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret)
+ goto out;
+
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (!is_linkfile)
+ gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s",
+ prev->this->name, local->loc.path);
+out:
+ local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
+ inode, stbuf, postparent, postparent,
+ xattr);
+ return 0;
+}
#define is_equal(a, b) (a == b)
int
@@ -28,15 +57,47 @@ dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postparent, dict_t *xdata)
{
dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
local = frame->local;
if (!op_ret)
local->linked = _gf_true;
+ FRAME_SU_UNDO (frame, dht_local_t);
+
+ if (op_ret && (op_errno == EEXIST)) {
+ conf = this->private;
+ prev = cookie;
+ subvol = prev->this;
+ if (!subvol)
+ goto out;
+ xattrs = dict_new ();
+ if (!xattrs)
+ goto out;
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set linkto key");
+ goto out;
+ }
+
+ STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol,
+ subvol->fops->lookup, &local->loc, xattrs);
+ if (xattrs)
+ dict_unref (xattrs);
+ return 0;
+ }
+out:
local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
inode, stbuf, preparent, postparent,
xdata);
+ if (xattrs)
+ dict_unref (xattrs);
return 0;
}
@@ -88,6 +149,9 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
}
local->link_subvol = fromvol;
+ /* Always create as root:root. dht_linkfile_attr_heal fixes the
+ * ownership */
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_linkfile_create_cbk,
fromvol, fromvol->fops->mknod, loc,
S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict);
@@ -233,11 +297,11 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
GF_VALIDATE_OR_GOTO ("dht", local, out);
GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out);
- if ((local->stbuf.ia_type == IA_INVAL) ||
- (is_equal (frame->root->uid, local->stbuf.ia_uid) &&
- is_equal (frame->root->gid, local->stbuf.ia_gid)))
+ if (local->stbuf.ia_type == IA_INVAL)
return 0;
+ uuid_copy (local->loc.gfid, local->stbuf.ia_gfid);
+
copy = copy_frame (frame);
if (!copy)
@@ -253,6 +317,8 @@ dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
copy->local = copy_local;
+ FRAME_SU_DO (copy, dht_local_t);
+
STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol,
subvol->fops->setattr, &copy_local->loc,
&stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL);
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 370118f98..4f78f5203 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -16,6 +16,7 @@
#include "dht-common.h"
#include "xlator.h"
+#include <signal.h>
#include <fnmatch.h>
#define GF_DISK_SECTOR_SIZE 512
@@ -57,7 +58,8 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count,
if (ret < 0) {
gf_log (THIS->name, GF_LOG_WARNING,
"failed to write (%s)",
- strerror (errno));
+ strerror (-ret));
+ ret = -1;
goto out;
}
@@ -75,7 +77,8 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count,
/* 'path' will be logged in calling function */
gf_log (THIS->name, GF_LOG_WARNING,
"failed to write (%s)",
- strerror (errno));
+ strerror (-ret));
+ ret = -1;
goto out;
}
}
@@ -91,6 +94,41 @@ out:
}
+/*
+ return values:
+ -1 : failure
+ -2 : success
+
+Hard link migration is carried out in three stages.
+
+(Say there are n hardlinks)
+Stage 1: Setting the new hashed subvol information on the 1st hardlink
+ encountered (linkto setxattr)
+
+Stage 2: Creating hardlinks on new hashed subvol for the 2nd to (n-1)th
+ hardlink
+
+Stage 3: Physical migration of the data file for nth hardlink
+
+Why "-2" is treated as success rather than "0":
+
+ dht_migrate_file expects return value "0" from __is_file_migratable if
+the file has to be migrated.
+
+ __is_file_migratable returns zero only when it is called with the
+flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS".
+
+ gf_defrag_handle_hardlink calls dht_migrate_file for physical migration
+of the data file with the flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS"
+
+Hence, if gf_defrag_handle_hardlink returned "0" for success, it would force
+"dht_migrate_file" to migrate each of the hardlinks, which is not intended.
+
+For each of the three stages mentioned above, "-2" is returned and is then
+converted to "0" in dht_migrate_file.
+
+*/
+
int32_t
gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
struct iatt *stbuf)
@@ -157,9 +195,11 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Linkto setxattr "
"failed %s -> %s (%s)", cached_subvol->name,
- loc->name, strerror (errno));
+ loc->name, strerror (-ret));
+ ret = -1;
goto out;
}
+ ret = -2;
goto out;
} else {
linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs);
@@ -172,7 +212,8 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
ret = syncop_link (hashed_subvol, loc, loc);
if (ret) {
- op_errno = errno;
+ op_errno = -ret;
+ ret = -1;
gf_log (this->name, GF_LOG_ERROR, "link of %s -> %s"
" failed on subvol %s (%s)", loc->name,
uuid_utoa(loc->gfid),
@@ -184,7 +225,8 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
ret = syncop_lookup (hashed_subvol, loc, NULL, &iatt, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed lookup %s on %s (%s)"
- , loc->name, hashed_subvol->name, strerror (errno));
+ , loc->name, hashed_subvol->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -194,12 +236,19 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
if (ret)
goto out;
}
- ret = 0;
+ ret = -2;
out:
return ret;
}
-
+/*
+ return values
+ 0 : File will be migrated
+ -2 : File will not be migrated
+ (This is the return value from gf_defrag_handle_hardlink. See
+ gf_defrag_handle_hardlink for a description of "returning -2")
+ -1 : failure
+*/
static inline int
__is_file_migratable (xlator_t *this, loc_t *loc,
struct iatt *stbuf, dict_t *xattrs, int flags)
@@ -222,7 +271,12 @@ __is_file_migratable (xlator_t *this, loc_t *loc,
if (flags == GF_DHT_MIGRATE_HARDLINK) {
ret = gf_defrag_handle_hardlink (this, loc,
xattrs, stbuf);
- if (ret) {
+
+ /*
+ Returning zero will force the file to be remigrated.
+ See gf_defrag_handle_hardlink for more information.
+ */
+ if (ret && ret != -2) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to migrate file with link",
loc->path);
@@ -230,8 +284,8 @@ __is_file_migratable (xlator_t *this, loc_t *loc,
} else {
gf_log (this->name, GF_LOG_WARNING,
"%s: file has hardlinks", loc->path);
+ ret = -ENOTSUP;
}
- ret = ENOTSUP;
goto out;
}
@@ -243,7 +297,7 @@ out:
static inline int
__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- dict_t *dict, fd_t **dst_fd)
+ dict_t *dict, fd_t **dst_fd, dict_t *xattr)
{
xlator_t *this = NULL;
int ret = -1;
@@ -288,30 +342,38 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
}
- if ((ret == -1) && (errno != ENOENT)) {
+ if ((ret < 0) && (-ret != ENOENT)) {
/* File exists in destination, but not accessible */
gf_log (THIS->name, GF_LOG_WARNING,
"%s: failed to lookup file (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
/* Create the destination with LINKFILE mode, and linkto xattr,
if the linkfile already exists, it will just open the file */
ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd,
- dict);
+ dict, &new_stbuf);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"failed to create %s on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
goto out;
}
+ ret = syncop_fsetxattr (to, fd, xattr, 0);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set xattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+
ret = syncop_ftruncate (to, fd, stbuf->ia_size);
if (ret < 0)
gf_log (this->name, GF_LOG_ERROR,
"ftruncate failed for %s on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
ret = syncop_fsetattr (to, fd, stbuf,
(GF_SET_ATTR_UID | GF_SET_ATTR_GID),
@@ -319,7 +381,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
if (ret < 0)
gf_log (this->name, GF_LOG_ERROR,
"chown failed for %s on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
if (dst_fd)
*dst_fd = fd;
@@ -340,13 +402,17 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
int ret = -1;
xlator_t *this = NULL;
+ uint64_t src_statfs_blocks = 1;
+ uint64_t dst_statfs_blocks = 1;
+
this = THIS;
ret = syncop_statfs (from, loc, &src_statfs);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to get statfs of %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -354,7 +420,8 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to get statfs of %s on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -363,22 +430,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
if (flag != GF_DHT_MIGRATE_DATA)
goto check_avail_space;
- if (((dst_statfs.f_bavail *
- dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) <
- (((src_statfs.f_bavail * src_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) {
- gf_log (this->name, GF_LOG_WARNING,
- "data movement attempted from node (%s) with"
- " higher disk space to a node (%s) with "
- "lesser disk space (%s)", from->name,
- to->name, loc->path);
-
- /* this is not a 'failure', but we don't want to
- consider this as 'success' too :-/ */
- ret = 1;
- goto out;
+ /* Check:
+ During rebalance `migrate-data` - Destination subvol experiences
+ a `reduction` in 'blocks' of free space, at the same time source
+ subvol gains certain 'blocks' of free space. A valid check is
+ necessary here to avoid an erroneous move to a destination where
+ the space could be scantily available.
+ */
+ if (stbuf) {
+ dst_statfs_blocks = ((dst_statfs.f_bavail *
+ dst_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ src_statfs_blocks = ((src_statfs.f_bavail *
+ src_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ if ((dst_statfs_blocks - stbuf->ia_blocks) <
+ (src_statfs_blocks + stbuf->ia_blocks)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "data movement attempted from node (%s) with"
+ " higher disk space to a node (%s) with "
+ "lesser disk space (%s)", from->name,
+ to->name, loc->path);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ ret = 1;
+ goto out;
+ }
}
-
check_avail_space:
if (((dst_statfs.f_bavail * dst_statfs.f_bsize) /
GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) {
@@ -441,6 +520,8 @@ __dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst
if (ret >= 0)
ret = 0;
+ else
+ ret = -1;
return ret;
}
@@ -469,10 +550,11 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
}
ret = syncop_open (from, loc, O_RDWR, fd);
- if (ret == -1) {
+ if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"failed to open file %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -495,7 +577,8 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to set xattr on %s in %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -509,7 +592,8 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to set mode on %s in %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -549,9 +633,10 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
/* check in the destination if the file is link file */
ret = syncop_lookup (to, loc, dict, &stbuf, &rsp_dict, NULL);
- if ((ret == -1) && (errno != ENOENT)) {
+ if ((ret < 0) && (-ret != ENOENT)) {
gf_log (this->name, GF_LOG_WARNING, "%s: lookup failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -574,7 +659,8 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to delete the linkfile (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
}
@@ -594,15 +680,17 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
if (ret < 0) {
gf_log (this->name, GF_LOG_WARNING,
"%s: readlink on symlink failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
- ret = syncop_symlink (to, loc, link, dict);
+ ret = syncop_symlink (to, loc, link, dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: creating symlink failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -612,18 +700,31 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot,
buf->ia_type),
makedev (ia_major (buf->ia_rdev),
- ia_minor (buf->ia_rdev)), dict);
+ ia_minor (buf->ia_rdev)), dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
done:
+ ret = syncop_setattr (to, loc, buf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID |
+ GF_SET_ATTR_MODE), NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform setattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
+ }
+
ret = syncop_unlink (from, loc);
- if (ret)
+ if (ret) {
gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
+ }
out:
if (dict)
@@ -677,7 +778,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
ret = syncop_lookup (from, loc, dict, &stbuf, &xattr_rsp, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -689,9 +791,11 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
/* Check if file can be migrated */
ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag);
- if (ret)
+ if (ret) {
+ if (ret == -2)
+ ret = 0;
goto out;
-
+ }
/* Take care of the special files */
if (!IA_ISREG (stbuf.ia_type)) {
/* Special files */
@@ -699,9 +803,18 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* TODO: move all xattr related operations to fd based operations */
+ ret = syncop_listxattr (from, loc, &xattr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to get xattr from %s (%s)",
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
+ }
+
/* create the destination, with required modes/xattr */
ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf,
- dict, &dst_fd);
+ dict, &dst_fd, xattr);
if (ret)
goto out;
@@ -718,10 +831,12 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+
ret = syncop_fstat (from, src_fd, &stbuf);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -740,33 +855,22 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to reset target size back to 0 (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
}
ret = -1;
goto out;
}
- /* TODO: move all xattr related operations to fd based operations */
- ret = syncop_listxattr (from, loc, &xattr);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to get xattr from %s (%s)",
- loc->path, from->name, strerror (errno));
-
- ret = syncop_setxattr (to, loc, xattr, 0);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set xattr on %s (%s)",
- loc->path, to->name, strerror (errno));
-
/* TODO: Sync the locks */
ret = syncop_fsync (to, dst_fd, 0);
- if (ret)
+ if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to fsync on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
+ }
/* Phase 2 - Data-Migration Complete, Housekeeping updates pending */
@@ -776,7 +880,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
/* Failed to get the stat info */
gf_log (this->name, GF_LOG_ERROR,
"failed to fstat file %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -798,7 +903,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to perform setattr on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -809,7 +915,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to perform setattr on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
}
/* Make the source as a linkfile first before deleting it */
@@ -819,16 +926,37 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING, \
"%s: failed to perform setattr on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
+ /* Free up the data blocks on the source node, as the whole
+ file is migrated */
+ ret = syncop_ftruncate (from, src_fd, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)",
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
+ }
+
+ /* remove the 'linkto' xattr from the destination */
+ ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform removexattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
+ }
+
/* Do a stat and check the gfid before unlink */
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to do a stat on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -838,33 +966,18 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to perform unlink on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
}
- /* Free up the data blocks on the source node, as the whole
- file is migrated */
- ret = syncop_ftruncate (from, src_fd, 0);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform truncate on %s (%s)",
- loc->path, from->name, strerror (errno));
- }
-
- /* remove the 'linkto' xattr from the destination */
- ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform removexattr on %s (%s)",
- loc->path, to->name, strerror (errno));
- }
-
ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: failed to lookup the file on subvolumes (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
}
gf_log (this->name, GF_LOG_INFO,
@@ -1027,10 +1140,10 @@ gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag)
{
/* if errno is not ENOSPC or ENOTCONN, we can still continue
with rebalance process */
- if ((errno != ENOSPC) || (errno != ENOTCONN))
+ if ((op_errno != ENOSPC) || (op_errno != ENOTCONN))
return 1;
- if (errno == ENOTCONN) {
+ if (op_errno == ENOTCONN) {
/* Most probably mount point went missing (mostly due
to a brick down), say rebalance failure to user,
let him restart it if everything is fine */
@@ -1038,7 +1151,7 @@ gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag)
return -1;
}
- if (errno == ENOSPC) {
+ if (op_errno == ENOSPC) {
/* rebalance process itself failed, may be
remote brick went down, or write failed due to
disk full etc etc.. */
@@ -1096,11 +1209,12 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
int32_t op_errno = 0;
char *uuid_str = NULL;
uuid_t node_uuid = {0,};
- int readdir_operrno = 0;
struct timeval dir_start = {0,};
struct timeval end = {0,};
double elapsed = {0,};
struct timeval start = {0,};
+ int32_t err = 0;
+ int loglevel = GF_LOG_TRACE;
gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",
loc->path);
@@ -1116,6 +1230,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s",
loc->path);
+ ret = -1;
goto out;
}
@@ -1128,14 +1243,11 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s."
" Aborting migrate-data",
- strerror(readdir_operrno));
+ strerror(-ret));
+ ret = -1;
goto out;
}
- /* Need to keep track of ENOENT errno, that means, there is no
- need to send more readdirp() */
- readdir_operrno = errno;
-
if (list_empty (&entries.list))
break;
@@ -1200,6 +1312,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "%s"
" lookup failed", entry_loc.path);
+ ret = -1;
continue;
}
@@ -1208,6 +1321,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if(ret < 0) {
gf_log (this->name, GF_LOG_ERROR, "Failed to "
"get node-uuid for %s", entry_loc.path);
+ ret = -1;
continue;
}
@@ -1217,6 +1331,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
gf_log (this->name, GF_LOG_ERROR, "Failed to "
"get node-uuid from dict for %s",
entry_loc.path);
+ ret = -1;
continue;
}
@@ -1241,30 +1356,50 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
/* if distribute is present, it will honor this key.
- * -1 is returned if distribute is not present or file
- * doesn't have a link-file. If file has link-file, the
- * path of link-file will be the value, and also that
- * guarantees that file has to be mostly migrated */
+ * -1, ENODATA is returned if distribute is not present
+ * or file doesn't have a link-file. If file has
+ * link-file, the path of link-file will be the value,
+ * and also that guarantees that file has to be mostly
+ * migrated */
ret = syncop_getxattr (this, &entry_loc, &dict,
GF_XATTR_LINKINFO_KEY);
if (ret < 0) {
- gf_log (this->name, GF_LOG_TRACE, "failed to "
- "get link-to key for %s",
- entry_loc.path);
+ if (-ret != ENODATA) {
+ loglevel = GF_LOG_ERROR;
+ defrag->total_failures += 1;
+ } else {
+ loglevel = GF_LOG_TRACE;
+ }
+ gf_log (this->name, loglevel, "%s: failed to "
+ "get "GF_XATTR_LINKINFO_KEY" key - %s",
+ entry_loc.path, strerror (-ret));
+ ret = -1;
continue;
}
ret = syncop_setxattr (this, &entry_loc, migrate_data,
0);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "migrate-data"
- " failed for %s", entry_loc.path);
- defrag->total_failures +=1;
+ err = op_errno;
+ /* errno is overloaded. See
+ * rebalance_task_completion () */
+ if (err != ENOSPC) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "migrate-data skipped for %s"
+ " due to space constraints",
+ entry_loc.path);
+ defrag->skipped +=1;
+ } else{
+ gf_log (this->name, GF_LOG_ERROR,
+ "migrate-data failed for %s",
+ entry_loc.path);
+ defrag->total_failures +=1;
+ }
}
- if (ret == -1) {
- op_errno = errno;
+ if (ret < 0) {
+ op_errno = -ret;
ret = gf_defrag_handle_migrate_error (op_errno,
defrag);
@@ -1299,9 +1434,6 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
gf_dirent_free (&entries);
free_entries = _gf_false;
INIT_LIST_HEAD (&entries.list);
-
- if (readdir_operrno == ENOENT)
- break;
}
gettimeofday (&end, NULL);
@@ -1325,7 +1457,6 @@ out:
}
-
int
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout, dict_t *migrate_data)
@@ -1340,12 +1471,12 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *dict = NULL;
off_t offset = 0;
struct iatt iatt = {0,};
- int readdirp_errno = 0;
ret = syncop_lookup (this, loc, NULL, &iatt, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s",
loc->path);
+ ret = -1;
goto out;
}
@@ -1379,14 +1510,11 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s"
- ". Aborting fix-layout",strerror(errno));
+ ". Aborting fix-layout",strerror(-ret));
+ ret = -1;
goto out;
}
- /* Need to keep track of ENOENT errno, that means, there is no
- need to send more readdirp() */
- readdirp_errno = errno;
-
if (list_empty (&entries.list))
break;
@@ -1440,6 +1568,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "%s"
" lookup failed", entry_loc.path);
+ ret = -1;
continue;
}
@@ -1451,6 +1580,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
defrag->defrag_status =
GF_DEFRAG_STATUS_FAILED;
defrag->total_failures ++;
+ ret = -1;
goto out;
}
ret = gf_defrag_fix_layout (this, defrag, &entry_loc,
@@ -1467,8 +1597,6 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
gf_dirent_free (&entries);
free_entries = _gf_false;
INIT_LIST_HEAD (&entries.list);
- if (readdirp_errno == ENOENT)
- break;
}
ret = 0;
@@ -1533,6 +1661,7 @@ gf_defrag_start_crawl (void *data)
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "look up on / failed");
+ ret = -1;
goto out;
}
@@ -1553,6 +1682,7 @@ gf_defrag_start_crawl (void *data)
gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed",
loc.path);
defrag->total_failures++;
+ ret = -1;
goto out;
}
@@ -1659,6 +1789,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
uint64_t size = 0;
uint64_t lookup = 0;
uint64_t failures = 0;
+ uint64_t skipped = 0;
char *status = "";
double elapsed = 0;
struct timeval end = {0,};
@@ -1675,6 +1806,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
size = defrag->total_data;
lookup = defrag->num_files_lookedup;
failures = defrag->total_failures;
+ skipped = defrag->skipped;
gettimeofday (&end, NULL);
@@ -1698,6 +1830,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
gf_log (THIS->name, GF_LOG_WARNING,
"failed to set lookedup file count");
+
ret = dict_set_int32 (dict, "status", defrag->defrag_status);
if (ret)
gf_log (THIS->name, GF_LOG_WARNING,
@@ -1710,6 +1843,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
}
ret = dict_set_uint64 (dict, "failures", failures);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set failure count");
+
+ ret = dict_set_uint64 (dict, "skipped", skipped);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set skipped file count");
log:
switch (defrag->defrag_status) {
case GF_DEFRAG_STATUS_NOT_STARTED:
@@ -1727,13 +1868,15 @@ log:
case GF_DEFRAG_STATUS_FAILED:
status = "failed";
break;
+ default:
+ break;
}
gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f "
"secs", status, elapsed);
gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %"
- PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size,
- lookup, failures);
+ PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: "
+ "%"PRIu64, files, size, lookup, failures, skipped);
out:
@@ -1741,7 +1884,8 @@ out:
}
int
-gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output)
+gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status,
+ dict_t *output)
{
/* TODO: set a variable 'stop_defrag' here, it should be checked
in defrag loop */
@@ -1753,7 +1897,7 @@ gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output)
}
gf_log ("", GF_LOG_INFO, "Received stop command on rebalance");
- defrag->defrag_status = GF_DEFRAG_STATUS_STOPPED;
+ defrag->defrag_status = status;
if (output)
gf_defrag_status_get (defrag, output);
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index fd0208f71..925538cc8 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -306,7 +306,54 @@ err:
NULL, NULL);
return 0;
}
+#define DHT_MARK_FOP_INTERNAL(xattr) do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new (); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
+ if (tmp) { \
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set" \
+ " internal dict key for %s", local->loc.path); \
+ } \
+ }while (0)
+
+#define DHT_MARKER_DONT_ACCOUNT(xattr) do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new (); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str (xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, \
+ "yes"); \
+ if (tmp) { \
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set" \
+ " marker dont account key for %s", local->loc.path); \
+ } \
+ }while (0)
+int
+dht_rename_done (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal (frame, this);
+ }
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
+ &local->stbuf, &local->preoldparent,
+ &local->postoldparent, &local->preparent,
+ &local->postparent, NULL);
+
+ return 0;
+}
int
dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -340,11 +387,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
WIPE (&local->postparent);
if (is_last_call (this_call_cnt)) {
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
}
out:
@@ -362,7 +405,7 @@ dht_rename_cleanup (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
-
+ dict_t *xattr = NULL;
local = frame->local;
this = frame->this;
@@ -386,24 +429,53 @@ dht_rename_cleanup (call_frame_t *frame)
if (!call_cnt)
goto nolinks;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (dst_hashed != src_hashed && dst_hashed != src_cached) {
+ dict_t *xattr_new = NULL;
+
gf_log (this->name, GF_LOG_TRACE,
"unlinking linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
+
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
if (src_cached != dst_hashed) {
+ dict_t *xattr_new = NULL;
+
gf_log (this->name, GF_LOG_TRACE,
"unlinking link %s => %s (%s)", local->loc.path,
local->loc2.path, src_cached->name);
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
+ if (uuid_compare (local->loc.pargfid,
+ local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
+ if (xattr)
+ dict_unref (xattr);
+
return 0;
nolinks:
@@ -467,6 +539,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *rename_subvol = NULL;
call_frame_t *link_frame = NULL;
dht_local_t *link_local = NULL;
+ dict_t *xattr = NULL;
local = frame->local;
prev = cookie;
@@ -476,6 +549,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ if (local->linked == _gf_true)
+ FRAME_SU_UNDO (frame, dht_local_t);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: rename on %s failed (%s)", local->loc.path,
@@ -510,17 +585,21 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
err:
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent, prev->this);
-
- if (local->linked == _gf_true) {
- local->linked = _gf_false;
- dht_linkfile_attr_heal (frame, this);
+ /* Merge attrs only from src_cached. In the case of src_cached !=
+ * dst_hashed, this ignores linkfile attrs. */
+ if (prev->this == src_cached) {
+ dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+ dht_iatt_merge (this, &local->preoldparent, preoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postoldparent, postoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->preparent, prenewparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postparent, postnewparent,
+ prev->this);
}
+
/* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
* is called. since rename has already happened on rename_subvol,
* unlink should not be sent for oldpath (either linkfile or cached-file)
@@ -544,24 +623,47 @@ err:
if (local->call_cnt == 0)
goto unwind;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (src_cached != dst_hashed && src_cached != dst_cached) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"deleting old src datafile %s @ %s",
local->loc.path, src_cached->name);
+ if (uuid_compare (local->loc.pargfid,
+ local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
if (src_hashed != rename_subvol && src_hashed != src_cached) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"deleting old src linkfile %s @ %s",
local->loc.path, src_hashed->name);
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
STACK_WIND (frame, dht_rename_unlink_cbk,
src_hashed, src_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
if (dst_cached
@@ -573,8 +675,10 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_cached, dst_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
unwind:
@@ -582,16 +686,16 @@ unwind:
WIPE (&local->postoldparent);
WIPE (&local->preparent);
WIPE (&local->postparent);
+ if (xattr)
+ dict_unref (xattr);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
return 0;
cleanup:
+ if (xattr)
+ dict_unref (xattr);
dht_rename_cleanup (frame);
return 0;
@@ -601,12 +705,13 @@ cleanup:
int
dht_do_rename (call_frame_t *frame)
{
- dht_local_t *local = NULL;
- xlator_t *dst_hashed = NULL;
- xlator_t *src_cached = NULL;
- xlator_t *dst_cached = NULL;
- xlator_t *this = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *this = NULL;
xlator_t *rename_subvol = NULL;
+ dict_t *dict = NULL;
local = frame->local;
@@ -621,13 +726,19 @@ dht_do_rename (call_frame_t *frame)
else
rename_subvol = dst_hashed;
+ if ((src_cached != dst_hashed) && (rename_subvol == dst_hashed)) {
+ DHT_MARKER_DONT_ACCOUNT(dict);
+ }
+
gf_log (this->name, GF_LOG_TRACE,
"renaming %s => %s (%s)",
local->loc.path, local->loc2.path, rename_subvol->name);
+ if (local->linked == _gf_true)
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_rename_cbk,
rename_subvol, rename_subvol->fops->rename,
- &local->loc, &local->loc2, NULL);
+ &local->loc, &local->loc2, dict);
return 0;
}
@@ -655,7 +766,8 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = -1;
if (op_errno != ENOENT)
local->op_errno = op_errno;
- } else {
+ } else if (local->src_cached == prev->this) {
+ /* merge attrs only from the link reply of src_cached */
dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
}
@@ -722,6 +834,7 @@ dht_rename_create_links (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
+ dict_t *xattr = NULL;
local = frame->local;
@@ -732,18 +845,27 @@ dht_rename_create_links (call_frame_t *frame)
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ DHT_MARK_FOP_INTERNAL (xattr);
if (src_cached == dst_cached) {
+ dict_t *xattr_new = NULL;
+
if (dst_hashed == dst_cached)
goto nolinks;
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"unlinking dst linkfile %s @ %s",
local->loc2.path, dst_hashed->name);
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
STACK_WIND (frame, dht_rename_unlink_links_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr_new);
+
+ dict_unref (xattr_new);
return 0;
}
@@ -765,12 +887,23 @@ dht_rename_create_links (call_frame_t *frame)
}
if (src_cached != dst_hashed) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"link %s => %s (%s)", local->loc.path,
local->loc2.path, src_cached->name);
+ if (uuid_compare (local->loc.pargfid,
+ local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
STACK_WIND (frame, dht_rename_links_cbk,
src_cached, src_cached->fops->link,
- &local->loc, &local->loc2, NULL);
+ &local->loc, &local->loc2, xattr_new);
+
+ dict_unref (xattr_new);
}
nolinks:
@@ -778,6 +911,8 @@ nolinks:
/* skip to next step */
dht_do_rename (frame);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 037b26fed..0e6527544 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -17,6 +17,7 @@
#include "glusterfs.h"
#include "xlator.h"
#include "dht-common.h"
+#include "glusterfs-acl.h"
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
layout->list[i].start = srt; \
@@ -126,6 +127,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
int32_t *disk_layout = NULL;
dht_local_t *local = NULL;
dht_conf_t *conf = NULL;
+ data_t *data = NULL;
local = frame->local;
if (req_subvol)
@@ -170,7 +172,16 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
layout->type, subvol->name, loc->path);
dict_ref (xattr);
-
+ if (local->xattr) {
+ data = dict_get (local->xattr, QUOTA_LIMIT_KEY);
+ if (data) {
+ ret = dict_add (xattr, QUOTA_LIMIT_KEY, data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "set quota limit key on %s",loc->path);
+ }
+ }
+ }
if (!uuid_is_null (local->gfid))
uuid_copy (loc->gfid, local->gfid);
@@ -264,7 +275,14 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
}
missing_xattr++;
}
-
+ /* Also account for subvolumes with no layout. Used for zeroing out
+ * the layouts and for setting quota keys if present */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ missing_xattr++;
+ }
+ }
gf_log (this->name, GF_LOG_TRACE,
"%d subvolumes missing xattr for %s",
missing_xattr, loc->path);
@@ -275,7 +293,6 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
}
local->call_cnt = missing_xattr;
-
for (i = 0; i < layout->cnt; i++) {
if (layout->list[i].err != -1 || !layout->list[i].stop)
continue;
@@ -288,13 +305,15 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
dummy = dht_layout_new (this, 1);
if (!dummy)
goto out;
- for (i = 0; i < conf->subvolume_cnt; i++) {
+ for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) {
if (_gf_false ==
dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
conf->subvolumes[i]);
+ missing_xattr--;
}
}
+
dht_layout_unref (this, dummy);
out:
return 0;
@@ -564,9 +583,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (i = 0; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+
+ It is important to note that it is safe
+ to race with mkdir() as self-heal and
+ mkdir are idempotent operations. Both will
+ strive to set the directory and layouts to
+ the same final state.
+ */
count++;
+ if (!err)
+ layout->list[i].err = -1;
}
}
@@ -647,6 +690,13 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
max_overlap = 0;
max_overlap_idx = i;
for (j = (i + 1); j < new->cnt; ++j) {
+ if (new->list[j].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
/* Calculate the overlap now. */
curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j);
/* Calculate the overlap after the proposed swap. */
@@ -769,7 +819,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
DHT_RESET_LAYOUT_RANGE (layout);
for (i = start_subvol; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -782,7 +832,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
for (i = 0; i < start_subvol; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -980,10 +1030,11 @@ dht_dir_attr_heal (void *data)
ret = syncop_setattr (subvol, &local->loc, &local->stbuf,
(GF_SET_ATTR_UID | GF_SET_ATTR_GID),
NULL, NULL);
- if (ret)
+ if (ret) {
gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on"
" %s on %s subvol (%s)", local->loc.path,
- subvol->name, strerror (errno));
+ subvol->name, strerror (-ret));
+ }
}
out:
return 0;
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
index 6b91c56a1..f2e7467ab 100644
--- a/xlators/cluster/dht/src/dht-shared.c
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -103,20 +103,20 @@ dht_priv_dump (xlator_t *this)
this->name);
gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
for (i = 0; i < conf->subvolume_cnt; i++) {
- sprintf (key, "subvolumes[%d]", i);
+ snprintf (key, sizeof (key), "subvolumes[%d]", i);
gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
conf->subvolumes[i]->name);
if (conf->file_layouts && conf->file_layouts[i]){
- sprintf (key, "file_layouts[%d]", i);
+ snprintf (key, sizeof (key), "file_layouts[%d]", i);
dht_layout_dump(conf->file_layouts[i], key);
}
if (conf->dir_layouts && conf->dir_layouts[i]) {
- sprintf (key, "dir_layouts[%d]", i);
+ snprintf (key, sizeof (key), "dir_layouts[%d]", i);
dht_layout_dump(conf->dir_layouts[i], key);
}
if (conf->subvolume_status) {
- sprintf (key, "subvolume_status[%d]", i);
+ snprintf (key, sizeof (key), "subvolume_status[%d]", i);
gf_proc_dump_write(key, "%d",
(int)conf->subvolume_status[i]);
}
@@ -130,14 +130,35 @@ dht_priv_dump (xlator_t *this)
gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
- if (conf ->du_stats) {
- gf_proc_dump_write("du_stats.avail_percent", "%lf",
- conf->du_stats->avail_percent);
- gf_proc_dump_write("du_stats.avail_space", "%lu",
- conf->du_stats->avail_space);
- gf_proc_dump_write("du_stats.avail_inodes", "%lf",
- conf->du_stats->avail_inodes);
- gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log);
+
+ if (conf->du_stats) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i])
+ continue;
+
+ snprintf (key, sizeof (key), "subvolumes[%d]", i);
+ gf_proc_dump_write (key, "%s",
+ conf->subvolumes[i]->name);
+
+ snprintf (key, sizeof (key),
+ "du_stats[%d].avail_percent", i);
+ gf_proc_dump_write (key, "%lf",
+ conf->du_stats[i].avail_percent);
+
+ snprintf (key, sizeof (key), "du_stats[%d].avail_space",
+ i);
+ gf_proc_dump_write (key, "%lu",
+ conf->du_stats[i].avail_space);
+
+ snprintf (key, sizeof (key),
+ "du_stats[%d].avail_inodes", i);
+ gf_proc_dump_write (key, "%lf",
+ conf->du_stats[i].avail_inodes);
+
+ snprintf (key, sizeof (key), "du_stats[%d].log", i);
+ gf_proc_dump_write (key, "%lu",
+ conf->du_stats[i].log);
+ }
}
if (conf->last_stat_fetch.tv_sec)
@@ -262,6 +283,28 @@ out:
return ret;
}
+
+/*
+ * Clear every non-NULL slot of conf->decommissioned_bricks (indices
+ * 0 .. subvolume_cnt - 1) and decrement conf->decommission_subvols_cnt once
+ * per slot cleared.
+ *
+ * Returns 0 on success, -1 when conf is NULL.  "this" is not used.
+ */
+int
+dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf)
+{
+        int idx = 0;
+
+        if (conf == NULL)
+                return -1;
+
+        while (idx < conf->subvolume_cnt) {
+                if (conf->decommissioned_bricks[idx] != NULL) {
+                        conf->decommissioned_bricks[idx] = NULL;
+                        conf->decommission_subvols_cnt--;
+                }
+                idx++;
+        }
+
+        return 0;
+}
void
dht_init_regex (xlator_t *this, dict_t *odict, char *name,
regex_t *re, gf_boolean_t *re_valid)
@@ -323,7 +366,6 @@ dht_reconfigure (xlator_t *this, dict_t *options)
" lookup-unhashed should be boolean,"
" not (%s), defaulting to (%d)",
temp_str, conf->search_unhashed);
- //return -1;
ret = -1;
goto out;
}
@@ -358,6 +400,10 @@ dht_reconfigure (xlator_t *this, dict_t *options)
ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
if (ret == -1)
goto out;
+ } else {
+ ret = dht_decommissioned_remove (this, conf);
+ if (ret == -1)
+ goto out;
}
dht_init_regex (this, options, "rsync-hash-regex",
@@ -402,11 +448,11 @@ gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *dat
if (!pattern)
goto out;
if (!num) {
- if (gf_string2bytesize(pattern, &pattern_list->size)
+ if (gf_string2bytesize_uint64(pattern, &pattern_list->size)
== 0) {
pattern = "*";
}
- } else if (gf_string2bytesize (num, &pattern_list->size) != 0) {
+ } else if (gf_string2bytesize_uint64 (num, &pattern_list->size) != 0) {
gf_log (this->name, GF_LOG_ERROR,
"invalid number format \"%s\"", num);
goto out;
@@ -581,7 +627,8 @@ dht_init (xlator_t *this)
}
GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
- gf_asprintf (&conf->link_xattr_name, "%s.linkto", conf->xattr_name);
+ gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR,
+ conf->xattr_name);
gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
if (!conf->link_xattr_name || !conf->wild_xattr_name) {
goto err;
@@ -663,7 +710,8 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = 1,
.validate = GF_OPT_VALIDATE_MIN,
- .description = "Specifies the directory layout spread."
+ .description = "Specifies the directory layout spread. Takes number "
+ "of subvolumes as default value."
},
{ .key = {"decommissioned-bricks"},
.type = GF_OPTION_TYPE_ANY,
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index 814f0a8eb..fc0ca2f77 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -70,6 +70,9 @@ struct xlator_fops fops = {
.fxattrop = dht_fxattrop,
.setattr = dht_setattr,
.fsetattr = dht_fsetattr,
+ .fallocate = dht_fallocate,
+ .discard = dht_discard,
+ .zerofill = dht_zerofill,
};
struct xlator_dumpops dumpops = {
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 5e5c68058..e934acdf0 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -323,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (subvol != avail_subvol) {
@@ -427,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (avail_subvol != subvol) {
@@ -482,97 +484,141 @@ same_first_part (char *str1, char term1, char *str2, char term2)
}
}
+typedef struct nufa_args { /* unpacked by nufa_find_local_brick() from its void *data argument */
+ xlator_t *this; /* xlator whose private conf receives the match (conf->private) */
+ char *volname; /* compared with xl->name and, when addr_match is set, with the brick host */
+ gf_boolean_t addr_match; /* when false only the name comparison is performed */
+} nufa_args_t;
+
+/*
+ * Callback handed to xlator_foreach_depth_first(); data is a nufa_args_t.
+ *
+ * Stores xl in conf->private (conf being args->this->private) when either
+ *   - xl->name equals args->volname, or
+ *   - args->addr_match is set and xl's "remote-host" option is the same
+ *     address as args->volname or is a local address.
+ * Once conf->private is set, later invocations return without doing
+ * anything, so the first match is the one kept.
+ */
+static void
+nufa_find_local_brick (xlator_t *xl, void *data)
+{
+        nufa_args_t     *args = data;
+        xlator_t        *this = args->this;
+        dht_conf_t      *conf = this->private;
+        char            *host = NULL;
+
+        /* An earlier invocation already recorded a match. */
+        if (conf->private)
+                return;
+
+        if (!strcmp (xl->name, args->volname)) {
+                conf->private = xl;
+                gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s",
+                        args->volname);
+                return;
+        }
+
+        if (!args->addr_match)
+                return;
+
+        if (dict_get_str (xl->options, "remote-host", &host) != 0)
+                return;
+
+        if (gf_is_same_address (args->volname, host) ||
+            gf_is_local_addr (host)) {
+                conf->private = xl;
+                gf_log (this->name, GF_LOG_INFO, "Using the first local "
+                        "subvol %s", xl->name);
+        }
+}
+
+/*
+ * Replace the lookup, create and mknod entries of this->fops with the
+ * dht_lookup, dht_create and dht_mknod implementations.
+ */
+static void
+nufa_to_dht (xlator_t *this)
+{
+        struct xlator_fops *table = NULL;
+
+        GF_ASSERT (this);
+        GF_ASSERT (this->fops);
+
+        table = this->fops;
+        table->lookup = dht_lookup;
+        table->create = dht_create;
+        table->mknod  = dht_mknod;
+}
+
+/*
+ * Run fn over the graph below "this" (xlator_foreach_depth_first), then
+ * climb from the xlator fn left in conf->private, following the first
+ * entry of each parents list, until a parent whose type is "cluster/nufa"
+ * is reached.  The xlator directly below that parent is stored in
+ * conf->private.
+ *
+ * Returns 0 when such a parent is found; -1 when fn recorded nothing or
+ * the climb runs out of parents (conf->private is then left as fn set it).
+ */
+int
+nufa_find_local_subvol (xlator_t *this,
+                        void (*fn) (xlator_t *each, void* data), void *data)
+{
+        dht_conf_t      *conf  = this->private;
+        xlator_list_t   *link  = NULL;
+        xlator_t        *below = NULL;
+        xlator_t        *above = NULL;
+
+        xlator_foreach_depth_first (this, fn, data);
+        if (!conf->private) {
+                gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local "
+                        "brick");
+                return -1;
+        }
+
+        below = conf->private;
+        for (link = below->parents; link; link = above->parents) {
+                above = link->xlator;
+                if (!strcmp (above->type, "cluster/nufa")) {
+                        gf_log (this->name, GF_LOG_INFO, "Found local subvol, "
+                                "%s", below->name);
+                        conf->private = below;
+                        return 0;
+                }
+                below = above;
+        }
+
+        return -1;
+}
+
int
nufa_init (xlator_t *this)
{
- dht_conf_t *conf = NULL;
- xlator_list_t *trav = NULL;
data_t *data = NULL;
char *local_volname = NULL;
int ret = -1;
char my_hostname[256];
- xlator_t *local_subvol = NULL;
- char *brick_host = NULL;
- xlator_t *kid = NULL;
+ gf_boolean_t addr_match = _gf_false;
+ nufa_args_t args = {0, };
ret = dht_init(this);
if (ret) {
return ret;
}
- conf = this->private;
- local_volname = "localhost";
- ret = gethostname (my_hostname, 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "could not find hostname (%s)",
- strerror (errno));
- }
+ if ((data = dict_get (this->options, "local-volume-name"))) {
+ local_volname = data->data;
- if (ret == 0)
- local_volname = my_hostname;
+ } else {
+ addr_match = _gf_true;
+ local_volname = "localhost";
+ ret = gethostname (my_hostname, 256);
+ if (ret == 0)
+ local_volname = my_hostname;
- data = dict_get (this->options, "local-volume-name");
- if (data) {
- local_volname = data->data;
- }
+ else
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hostname (%s)",
+ strerror (errno));
- for (trav = this->children; trav; trav = trav->next) {
- if (strcmp (trav->xlator->name, local_volname) == 0)
- break;
- if (local_subvol) {
- continue;
- }
- kid = trav->xlator;
- for (;;) {
- if (dict_get_str(trav->xlator->options,"remote-host",
- &brick_host) == 0) {
- /* Found it. */
- break;
- }
- if (!kid->children) {
- /* Nowhere further to look. */
- gf_log (this->name, GF_LOG_ERROR,
- "could not get remote-host");
- goto err;
- }
- if (kid->children->next) {
- /* Multiple choices, can't/shouldn't decide. */
- gf_log (this->name, GF_LOG_ERROR,
- "NUFA found fan-out (type %s) volume",
- kid->type);
- goto err;
- }
- /* One-to-one xlators are OK, try the next one. */
- kid = kid->children->xlator;
- }
- if (same_first_part(my_hostname,'.',brick_host,'.')) {
- local_subvol = trav->xlator;
- }
}
- if (trav) {
- gf_log (this->name, GF_LOG_INFO,
- "Using specified subvol %s", local_volname);
- conf->private = trav->xlator;
- }
- else if (local_subvol) {
+ args.this = this;
+ args.volname = local_volname;
+ args.addr_match = addr_match;
+ ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args);
+ if (ret) {
gf_log (this->name, GF_LOG_INFO,
- "Using first local subvol %s", local_subvol->name);
- conf->private = local_subvol;
+ "Unable to find local subvolume, switching "
+ "to dht mode");
+ nufa_to_dht (this);
}
- else {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not find specified or local subvol");
- goto err;
-
- }
-
return 0;
-
-err:
- dht_fini(this);
- return -1;
}
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index 861012247..2717ce975 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -437,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (subvol != avail_subvol) {
@@ -536,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (avail_subvol != subvol) {
@@ -668,6 +670,7 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf,
GF_FREE (dup_str);
continue;
}
+ GF_FREE (dup_str);
memcpy (switch_opt->path_pattern, pattern, strlen (pattern));
if (childs) {
dup_childs = gf_strdup (childs);
@@ -724,7 +727,6 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf,
"option in unify volume. Exiting");
goto err;
}
- GF_FREE (dup_str);
/* Link it to the main structure */
if (switch_buf) {
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
new file mode 100644
index 000000000..aa19ddc57
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
@@ -0,0 +1,63 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "byte-order.h"
+
+/* Unit-test stub: ignores every argument (hash_p is not written), returns 0. */
+int
+dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p)
+{
+        (void) this;
+        (void) type;
+        (void) name;
+        (void) hash_p;
+
+        return 0;
+}
+
+/* Unit-test stub: ignores every argument (layout is not written), returns 0. */
+int
+dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout)
+{
+        (void) inode;
+        (void) this;
+        (void) layout;
+
+        return 0;
+}
+
+/* Unit-test stub: ignores every argument and returns 0. */
+int
+dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
+                          dht_layout_t *layout_int)
+{
+        (void) inode;
+        (void) this;
+        (void) layout_int;
+
+        return 0;
+}
+
+/* Unit-test stub: ignores every argument (ptr is not written), returns 0. */
+int
+dict_get_ptr (dict_t *this, char *key, void **ptr)
+{
+        (void) this;
+        (void) key;
+        (void) ptr;
+
+        return 0;
+}
+
+/* Unit-test stub: ignores every argument (ptr and len are not written), returns 0. */
+int
+dict_get_ptr_and_len (dict_t *this, char *key, void **ptr, int *len)
+{
+        (void) this;
+        (void) key;
+        (void) ptr;
+        (void) len;
+
+        return 0;
+}
+
+/* Unit-test stub: discards the message and returns 0. */
+int
+_gf_log (const char *domain, const char *file,
+         const char *function, int32_t line, gf_loglevel_t level,
+         const char *fmt, ...)
+{
+        (void) domain;
+        (void) file;
+        (void) function;
+        (void) line;
+        (void) level;
+        (void) fmt;
+
+        return 0;
+}
+
+/* Unit-test stub: discards the message and returns 0. */
+int
+_gf_log_callingfn (const char *domain, const char *file,
+                   const char *function, int32_t line, gf_loglevel_t level,
+                   const char *fmt, ...)
+{
+        (void) domain;
+        (void) file;
+        (void) function;
+        (void) line;
+        (void) level;
+        (void) fmt;
+
+        return 0;
+}
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
new file mode 100644
index 000000000..b5233d235
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
@@ -0,0 +1,124 @@
+/*
+ Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+#include "logging.h"
+#include "xlator.h"
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <inttypes.h>
+#include <cmockery/pbc.h>
+#include <cmockery/cmockery.h>
+
+/*
+ * Helper functions
+ */
+
+/*
+ * Allocate a zero-filled xlator_t for a test, with num_types zero-filled
+ * mem_acct.rec entries (the lock of each one initialised) and a zero-filled
+ * glusterfs_ctx_t in xl->ctx.  num_types must be greater than zero.
+ *
+ * All three allocations come from test_calloc().
+ */
+static xlator_t *
+helper_xlator_init(uint32_t num_types)
+{
+        int i = 0, ret = 0;
+        xlator_t *xl = NULL;
+
+        REQUIRE(num_types > 0);
+
+        xl = test_calloc(1, sizeof(*xl));
+        assert_non_null(xl);
+
+        xl->mem_acct.num_types = num_types;
+        xl->mem_acct.rec = test_calloc(num_types, sizeof(struct mem_acct_rec));
+        assert_non_null(xl->mem_acct.rec);
+
+        xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t));
+        assert_non_null(xl->ctx);
+
+        while (i < num_types) {
+                ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock));
+                assert_false(ret);
+                i++;
+        }
+
+        ENSURE(num_types == xl->mem_acct.num_types);
+        ENSURE(NULL != xl);
+
+        return xl;
+}
+
+/*
+ * Tear down an xlator built by helper_xlator_init(): destroy the lock of
+ * every mem_acct.rec entry, then release rec, ctx and the xlator itself.
+ * Always returns 0.
+ */
+static int
+helper_xlator_destroy(xlator_t *xl)
+{
+        int i, ret;
+
+        for (i = 0; i < xl->mem_acct.num_types; i++) {
+                ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock));
+                assert_int_equal(ret, 0);
+        }
+
+        /*
+         * These three were obtained with test_calloc(); release them with
+         * the matching test_free() instead of relying on free() being
+         * redirected by the build.
+         */
+        test_free(xl->mem_acct.rec);
+        test_free(xl->ctx);
+        test_free(xl);
+        return 0;
+}
+
+/*
+ * Unit tests
+ */
+/*
+ * Exercise dht_layout_new().
+ *
+ * The two calls wrapped in expect_assert_failure() are expected to trip an
+ * assertion.  The two successful calls are then checked field by field:
+ * first with xl->private unset (gen and spread_cnt expected to be 0), then
+ * with a dht_conf_t installed (gen and spread_cnt expected to be copied
+ * from it).
+ */
+static void
+test_dht_layout_new(void **state)
+{
+        xlator_t *xl;
+        dht_layout_t *layout;
+        dht_conf_t *conf;
+        int cnt;
+
+        expect_assert_failure(dht_layout_new(NULL, 0));
+        expect_assert_failure(dht_layout_new((xlator_t *)0x12345, -1));
+        xl = helper_xlator_init(10);
+
+        // xl->private is NULL
+        assert_null(xl->private);
+        cnt = 100;
+        layout = dht_layout_new(xl, cnt);
+        assert_non_null(layout);
+        assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+        assert_int_equal(layout->cnt, cnt);
+        assert_int_equal(layout->ref, 1);
+        assert_int_equal(layout->gen, 0);
+        assert_int_equal(layout->spread_cnt, 0);
+        free(layout);
+
+        // xl->private is not NULL
+        cnt = 110;
+        conf = (dht_conf_t *)test_calloc(1, sizeof(dht_conf_t));
+        assert_non_null(conf);
+        conf->dir_spread_cnt = 12345;
+        conf->gen = -123;
+        xl->private = conf;
+
+        layout = dht_layout_new(xl, cnt);
+        assert_non_null(layout);
+        assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+        assert_int_equal(layout->cnt, cnt);
+        assert_int_equal(layout->ref, 1);
+        assert_int_equal(layout->gen, conf->gen);
+        assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt);
+        free(layout);
+
+        /* conf came from test_calloc(), so release it with test_free(). */
+        test_free(conf);
+        helper_xlator_destroy(xl);
+}
+
+/* Test-program entry point: hands the table of unit tests to run_tests(). */
+int
+main (void)
+{
+        const UnitTest tests[] = {
+                unit_test(test_dht_layout_new),
+        };
+
+        return run_tests(tests, "xlator_dht_layout");
+}
diff --git a/xlators/cluster/nsr-client/Makefile.am b/xlators/cluster/nsr-client/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-client/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-client/src/Makefile.am b/xlators/cluster/nsr-client/src/Makefile.am
new file mode 100644
index 000000000..4541ea01a
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/Makefile.am
@@ -0,0 +1,33 @@
+noinst_PYTHON = gen-fops.py
+
+xlator_LTLIBRARIES = nsrc.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+nsrc_la_LDFLAGS = -module -avoid-version
+nsrc_la_SOURCES = nsrc.c
+
+nsrc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = fop-template.c \
+	$(top_srcdir)/xlators/lib/src/libxlator.h \
+	$(top_srcdir)/glusterfsd/src/glusterfsd.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+	-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+CLEANFILES = nsrc-cg.c
+
+# Code generator module imported by gen-fops.py.  The variable name must
+# match the $(CODEGEN) prerequisite below, otherwise the prerequisite expands
+# to nothing and nsrc-cg.c is not regenerated when the generator changes.
+CODEGEN = ../../nsr-server/src/codegen.py
+
+# nsrc-cg.c is generated and then #included by nsrc.c.
+nsrc-cg.c: gen-fops.py $(CODEGEN) $(XLATOR_HEADER) fop-template.c
+	$(PYTHON) ./gen-fops.py $(XLATOR_HEADER) fop-template.c > $@
+
+nsrc.lo: nsrc-cg.c
+
+# The library built here is nsrc.la, so the installed object is nsrc.so.
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/nsrc.so
diff --git a/xlators/cluster/nsr-client/src/fop-template.c b/xlators/cluster/nsr-client/src/fop-template.c
new file mode 100644
index 000000000..699b07d40
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/fop-template.c
@@ -0,0 +1,113 @@
+// template-name fop
+/*
+ * Fop entry point.  Takes a local from this->local_pool, builds a call stub
+ * bound to the matching _continue function, records the child returned by
+ * ACTIVE_CHILD() as the one being tried, and winds the call to it with that
+ * child as the cookie.  If the local or the stub cannot be obtained the
+ * call is unwound with -1/ENOMEM.
+ */
+$TYPE$
+nsrc_$NAME$ (call_frame_t *frame, xlator_t *this,
+             $ARGS_LONG$)
+{
+        xlator_t        *target_xl = ACTIVE_CHILD(this);
+        nsrc_local_t    *local     = mem_get(this->local_pool);
+
+        if (local == NULL) {
+                goto err;
+        }
+
+        local->stub = fop_$NAME$_stub (frame, nsrc_$NAME$_continue,
+                                       $ARGS_SHORT$);
+        if (local->stub == NULL) {
+                goto err;
+        }
+        local->scars = 0;
+        local->curr_xl = target_xl;
+        frame->local = local;
+
+        STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, target_xl,
+                           target_xl, target_xl->fops->$NAME$,
+                           $ARGS_SHORT$);
+        return 0;
+
+err:
+        if (local != NULL) {
+                mem_put(local);
+        }
+        STACK_UNWIND_STRICT ($NAME$, frame, -1, ENOMEM,
+                             $DEFAULTS$);
+        return 0;
+}
+
+// template-name cbk
+/*
+ * Reply handler; cookie is the child the call was wound to.
+ *
+ * Any op_ret other than -1 records that child in priv->active and is
+ * unwound as is.  A failure whose errno is EREMOTE or ENOTCONN schedules
+ * nsrc_retry_cb on the next child through a timer armed with a one-second
+ * timespec, at most SCAR_LIMIT times.  Every other failure, an exhausted
+ * retry budget, or a timer that could not be created unwinds with the
+ * values received.
+ */
+$TYPE$
+nsrc_$NAME$_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 $ARGS_LONG$)
+{
+        nsrc_private_t  *priv    = this->private;
+        nsrc_local_t    *local   = frame->local;
+        xlator_t        *last_xl = cookie;
+        xlator_t        *next_xl;
+        struct timespec  spec;
+
+        if (op_ret != -1) {
+                if (local->scars) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                HILITE("retried %p OK"), frame->local);
+                }
+                priv->active = last_xl;
+                goto unwind;
+        }
+        if (!((op_errno == EREMOTE) || (op_errno == ENOTCONN))) {
+                goto unwind;
+        }
+
+        /* TBD: get leader ID from xdata? */
+        next_xl = next_xlator(this,last_xl);
+        /*
+         * We can't just give up after we've tried all bricks, because it's
+         * quite likely that a new leader election just hasn't finished yet.
+         * We also shouldn't retry endlessly, and especially not at a high
+         * rate, but that's good enough while we work on other things.
+         *
+         * TBD: implement slow/finite retry via a worker thread
+         */
+        if ((next_xl == NULL) || (local->scars >= SCAR_LIMIT)) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        HILITE("ran out of retries for %p"), frame->local);
+                goto unwind;
+        }
+
+        local->scars += 1;
+        local->curr_xl = next_xl;
+        spec.tv_nsec = 0;
+        spec.tv_sec = 1;
+        /*
+         * WARNING
+         *
+         * Just calling gf_timer_call_after like this leaves open the
+         * possibility that writes will get reordered, if a first write is
+         * rescheduled and then a second comes along to find an updated
+         * priv->active before the first actually executes.  We might need to
+         * implement a stricter (and more complicated) queuing mechanism to
+         * ensure absolute consistency in this case.
+         */
+        if (gf_timer_call_after(this->ctx,spec,nsrc_retry_cb,local)) {
+                return 0;
+        }
+
+unwind:
+        call_stub_destroy(local->stub);
+        STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno,
+                             $ARGS_SHORT$);
+        return 0;
+}
+
+// template-name cont-func
+/*
+ * Continuation bound to the call stub: winds the call again, this time to
+ * local->curr_xl, which is also passed as the cookie.
+ */
+$TYPE$
+nsrc_$NAME$_continue (call_frame_t *frame, xlator_t *this,
+                      $ARGS_LONG$)
+{
+        nsrc_local_t    *local  = frame->local;
+        xlator_t        *target = local->curr_xl;
+
+        STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, target,
+                           target, local->curr_xl->fops->$NAME$,
+                           $ARGS_SHORT$);
+        return 0;
+}
diff --git a/xlators/cluster/nsr-client/src/gen-fops.py b/xlators/cluster/nsr-client/src/gen-fops.py
new file mode 100755
index 000000000..b07b3c5b1
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/gen-fops.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+
+# This script generates the boilerplate versions of most fops in the client,
+# mostly so that we can use STACK_WIND instead of STACK_WIND_TAIL (see
+# fop-template.c for the details). The problem we're solving is that we sit
+# under DHT, which makes assumptions about getting callbacks only from its
+# direct children. If we didn't define our own versions of these fops, the
+# default versions would use STACK_WIND_TAIL and the callbacks would come from
+# DHT's grandchildren. The code-generation approach allows us to handle this
+# with a minimum of code, and also keep up with any changes to the fop table.
+
+import sys
+sys.path.append("../../nsr-server/src") # Blech.
+import codegen
+
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_t\)"
+full_re = type_re + " *" + name_re
+fop_cg = codegen.CodeGenerator()
+fop_cg.skip = 2
+fop_cg.parse_decls(sys.argv[1],full_re)
+fop_cg.load_templates(sys.argv[2])
+
+# Use the multi-template feature to generate multiple callbacks from the same
+# parsed declarations.
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)"
+full_re = type_re + " *" + name_re
+cbk_cg = codegen.CodeGenerator()
+cbk_cg.skip = 5
+cbk_cg.parse_decls(sys.argv[1],full_re)
+cbk_cg.load_templates(sys.argv[2])
+
+# This is a nasty little trick to handle the case where a generated fop needs
+# a set of default arguments for the corresponding callback.
+#
+# Yes, it's ironic that I'm copying and pasting the generator code.
+fop_cg.make_defaults = cbk_cg.make_defaults
+
+# Sorry, getspec, you're not a real fop until someone writes a stub function
+# for you.
+del fop_cg.decls["getspec"]
+del cbk_cg.decls["getspec"]
+
+# cbk is used by both fop and continue, so emit first
+for f_name in cbk_cg.decls.keys():
+ cbk_cg.emit(f_name,"cbk")
+ print("")
+
+# continue is used by fop, so emit next
+for f_name in fop_cg.decls.keys():
+ fop_cg.emit(f_name,"cont-func")
+ print("")
+
+for f_name in fop_cg.decls.keys():
+ fop_cg.emit(f_name,"fop")
+ print("")
diff --git a/xlators/cluster/nsr-client/src/nsrc.c b/xlators/cluster/nsr-client/src/nsrc.c
new file mode 100644
index 000000000..4551a1432
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/nsrc.c
@@ -0,0 +1,243 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "timer.h"
+#include "xlator.h"
+
+#define SCAR_LIMIT 20
+#define HILITE(x) (""x"")
+
+/*
+ * The fops are actually generated by gen-fops.py; the rest was mostly copied
+ * from defaults.c (commit cd253754 on 27 August 2013).
+ */
+
+/*
+ * Memory-accounting type ids private to this translator, numbered right after
+ * the common libglusterfs ids.  gf_mt_nsrc_end closes the list and is what
+ * mem_acct_init() below uses to size the accounting table.
+ * NOTE(review): the enum tag says "dht" although both members are nsrc ids;
+ * looks like a copy/paste leftover -- confirm before renaming.
+ */
+enum gf_dht_mem_types_ {
+ gf_mt_nsrc_private_t = gf_common_mt_end + 1,
+ gf_mt_nsrc_end
+};
+
+/*
+ * Per-translator private state (stored in xlator_t::private by nsrc_init).
+ */
+typedef struct {
+ /* Child returned by ACTIVE_CHILD(); nsrc_init sets it to FIRST_CHILD. */
+ xlator_t *active;
+} nsrc_private_t;
+
+/*
+ * Per-call state; nsrc_init creates a mem pool of these in this->local_pool.
+ * Only `stub` is touched in the visible part of this file (nsrc_retry_cb
+ * resumes it).  curr_xl and scars are presumably maintained by the generated
+ * code in nsrc-cg.c -- TODO confirm their exact meaning there.
+ */
+typedef struct {
+ call_stub_t *stub;
+ xlator_t *curr_xl;
+ uint16_t scars;
+} nsrc_local_t;
+
+/*
+ * Extended-attribute name.  Not referenced in the visible part of this file;
+ * presumably consumed by the generated code in nsrc-cg.c -- TODO confirm.
+ */
+char *NSRC_XATTR = "user.nsr.active";
+
+/*
+ * Return the child that operations should currently be sent to: the "active"
+ * child recorded in the private data, or the first child when the translator
+ * has no private data yet.
+ */
+static inline
+xlator_t *
+ACTIVE_CHILD (xlator_t *parent)
+{
+        nsrc_private_t *priv = parent->private;
+
+        if (!priv) {
+                return FIRST_CHILD(parent);
+        }
+
+        return priv->active;
+}
+
+/*
+ * Return the child that follows `prev` in this->children, wrapping around to
+ * the first child after the last one.  Returns NULL when `prev` is not one of
+ * this translator's children.
+ */
+xlator_t *
+next_xlator (xlator_t *this, xlator_t *prev)
+{
+        xlator_list_t *node = this->children;
+
+        /* Locate the list node that holds prev. */
+        while (node && node->xlator != prev) {
+                node = node->next;
+        }
+
+        if (!node) {
+                return NULL;
+        }
+
+        if (node->next) {
+                return node->next->xlator;
+        }
+
+        /* prev was the last child: wrap to the head of the list. */
+        return this->children->xlator;
+}
+
+/*
+ * Callback taking an nsrc_local_t as its opaque argument: logs the retry and
+ * resumes the wind of the call stub saved in it.
+ */
+void
+nsrc_retry_cb (void *cb_arg)
+{
+        nsrc_local_t *retry_local = (nsrc_local_t *)cb_arg;
+        call_stub_t  *saved_stub  = retry_local->stub;
+
+        gf_log (__func__, GF_LOG_INFO, HILITE("retrying %p"), retry_local);
+        call_resume_wind(saved_stub);
+}
+
+#include "nsrc-cg.c"
+
+/*
+ * Placeholder forget handler: only logs (with the calling function) that no
+ * real implementation exists, then reports success.
+ */
+int32_t
+nsrc_forget (xlator_t *this, inode_t *inode)
+{
+        gf_log_callingfn (this->name, GF_LOG_WARNING,
+                          "xlator does not implement forget_cbk");
+
+        return 0;
+}
+
+
+/*
+ * Placeholder releasedir handler: only logs (with the calling function) that
+ * no real implementation exists, then reports success.
+ */
+int32_t
+nsrc_releasedir (xlator_t *this, fd_t *fd)
+{
+        gf_log_callingfn (this->name, GF_LOG_WARNING,
+                          "xlator does not implement releasedir_cbk");
+
+        return 0;
+}
+
+/*
+ * Placeholder release handler: only logs (with the calling function) that no
+ * real implementation exists, then reports success.
+ */
+int32_t
+nsrc_release (xlator_t *this, fd_t *fd)
+{
+        gf_log_callingfn (this->name, GF_LOG_WARNING,
+                          "xlator does not implement release_cbk");
+
+        return 0;
+}
+
+/*
+ * File-operation dispatch table for this translator.  Every entry points at
+ * an nsrc_* function; none of them is written out in this file -- they come
+ * from the included nsrc-cg.c, which the comment near the top of this file
+ * says is generated by gen-fops.py.
+ */
+struct xlator_fops fops = {
+ .lookup = nsrc_lookup,
+ .stat = nsrc_stat,
+ .fstat = nsrc_fstat,
+ .truncate = nsrc_truncate,
+ .ftruncate = nsrc_ftruncate,
+ .access = nsrc_access,
+ .readlink = nsrc_readlink,
+ .mknod = nsrc_mknod,
+ .mkdir = nsrc_mkdir,
+ .unlink = nsrc_unlink,
+ .rmdir = nsrc_rmdir,
+ .symlink = nsrc_symlink,
+ .rename = nsrc_rename,
+ .link = nsrc_link,
+ .create = nsrc_create,
+ .open = nsrc_open,
+ .readv = nsrc_readv,
+ .writev = nsrc_writev,
+ .flush = nsrc_flush,
+ .fsync = nsrc_fsync,
+ .opendir = nsrc_opendir,
+ .readdir = nsrc_readdir,
+ .readdirp = nsrc_readdirp,
+ .fsyncdir = nsrc_fsyncdir,
+ .statfs = nsrc_statfs,
+ .setxattr = nsrc_setxattr,
+ .getxattr = nsrc_getxattr,
+ .fsetxattr = nsrc_fsetxattr,
+ .fgetxattr = nsrc_fgetxattr,
+ .removexattr = nsrc_removexattr,
+ .fremovexattr = nsrc_fremovexattr,
+ .lk = nsrc_lk,
+ .inodelk = nsrc_inodelk,
+ .finodelk = nsrc_finodelk,
+ .entrylk = nsrc_entrylk,
+ .fentrylk = nsrc_fentrylk,
+ .rchecksum = nsrc_rchecksum,
+ .xattrop = nsrc_xattrop,
+ .fxattrop = nsrc_fxattrop,
+ .setattr = nsrc_setattr,
+ .fsetattr = nsrc_fsetattr,
+ .fallocate = nsrc_fallocate,
+ .discard = nsrc_discard,
+};
+
+/*
+ * Callback table, intentionally left empty: nsrc_forget, nsrc_release and
+ * nsrc_releasedir are defined above but are not registered here.
+ */
+struct xlator_cbks cbks = {
+};
+
+
+/*
+ * Set up memory accounting for this translator, sized to cover every type id
+ * up to and including gf_mt_nsrc_end.
+ *
+ * Returns 0 on success, -1 if `this` is NULL, or the non-zero result of
+ * xlator_mem_acct_init() on failure.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("nsrc", this, out);
+
+        ret = xlator_mem_acct_init (this, gf_mt_nsrc_end + 1);
+        if (ret != 0) {
+                /*
+                 * The message used to be built from two adjacent literals
+                 * with no separating space ("...initfailed").
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed");
+        }
+out:
+        return ret;
+}
+
+
+/*
+ * Translator init: create the pool used for nsrc_local_t objects and allocate
+ * the private data, with the first child recorded as the active one.
+ *
+ * Returns 0 on success and -1 on failure.  On failure nothing stays
+ * allocated: previously the local pool was left behind when the private-data
+ * allocation failed.
+ */
+int32_t
+nsrc_init (xlator_t *this)
+{
+        nsrc_private_t *priv = NULL;
+
+        this->local_pool = mem_pool_new (nsrc_local_t, 128);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create nsrc_local_t pool");
+                return -1;
+        }
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_mt_nsrc_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to allocate nsrc private data");
+                goto err;
+        }
+
+        priv->active = FIRST_CHILD(this);
+        this->private = priv;
+        return 0;
+
+err:
+        /* Undo the only thing acquired so far. */
+        mem_pool_destroy (this->local_pool);
+        this->local_pool = NULL;
+        return -1;
+}
+
+/*
+ * Translator teardown: release the private data allocated by nsrc_init.
+ */
+void
+nsrc_fini (xlator_t *this)
+{
+        void *priv = this->private;
+
+        GF_FREE(priv);
+}
+
+/*
+ * Event handler.  GF_EVENT_CHILD_DOWN is swallowed (0 is returned without
+ * propagating it); every other event goes to default_notify() and its result
+ * is returned.
+ */
+int32_t
+nsrc_notify (xlator_t *this, int32_t event, void *data, ...)
+{
+        if (event == GF_EVENT_CHILD_DOWN) {
+                /*
+                 * TBD: handle this properly
+                 *
+                 * What we really should do is propagate this only if it
+                 * caused us to lose quorum, and likewise for
+                 * GF_EVENT_CHILD_UP only if it caused us to gain quorum.
+                 * However, that requires tracking child states and for now
+                 * it's easier to swallow these unconditionally.  The
+                 * consequence of failing to do this is that DHT sees the
+                 * first GF_EVENT_CHILD_DOWN and gets confused, so it doesn't
+                 * call us and doesn't get up-to-date directory listings etc.
+                 */
+                return 0;
+        }
+
+        return default_notify (this, event, data);
+}
+
+/* Lifecycle entry points of this translator: init, fini and notify. */
+class_methods_t class_methods = {
+ .init = nsrc_init,
+ .fini = nsrc_fini,
+ .notify = nsrc_notify,
+};
+
+/*
+ * Option table: holds only a NULL-key entry (presumably the end-of-list
+ * sentinel), i.e. this translator defines no options of its own.
+ */
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-recon/Makefile.am b/xlators/cluster/nsr-recon/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-recon/Makefile.am
@@ -0,0 +1,3 @@
+# Everything for this translator lives in src/; just recurse into it.
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-recon/src/Makefile.am b/xlators/cluster/nsr-recon/src/Makefile.am
new file mode 100644
index 000000000..e639e4437
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/Makefile.am
@@ -0,0 +1,23 @@
+# Loadable module for the NSR reconciliation translator, installed with the
+# other cluster translators.
+xlator_LTLIBRARIES = nsr_recon.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+nsr_recon_la_LDFLAGS = -module -avoid-version
+nsr_recon_la_SOURCES = recon_driver.c recon_xlator.c
+
+# The driver talks to bricks and peers through gfapi, hence libgfapi.
+nsr_recon_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	$(top_builddir)/api/src/libgfapi.la
+
+noinst_HEADERS = recon_driver.h recon_xlator.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+	-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+	-I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+CLEANFILES =
+
+# Remove the module built by this directory (nsr_recon), not nsr.so, which is
+# not produced here.
+uninstall-local:
+	rm -f $(DESTDIR)$(xlatordir)/nsr_recon.so
diff --git a/xlators/cluster/nsr-recon/src/recon_driver.c b/xlators/cluster/nsr-recon/src/recon_driver.c
new file mode 100644
index 000000000..8c7622a02
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_driver.c
@@ -0,0 +1,3130 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <fnmatch.h>
+
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+
+
+#include "recon_driver.h"
+#include "recon_xlator.h"
+#include "api/src/glfs-internal.h"
+#include "api/src/glfs-handles.h"
+
+/* TBD: move declarations here and nsr.c into a common place */
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+
+/*
+ * Execution architecture for the NSR reconciliation driver. The driver runs
+ * as a separate process on each node where the brick is. The main function of
+ * the driver is nsr_reconciliation_driver() (the last function below). The
+ * driver just sits in a tight loop waiting for state changes. When a brick
+ * becomes a replica leader, it fences IO, contacts this process and waits for
+ * reconciliation to finish.
+ *
+ * The replica leader talks to other bricks in replica group which are alive
+ * and gets the last term info using which it decides which has the latest
+ * data. That brick is referred to as the "reconciliator"; leader sends a
+ * message to reconciliator to freeze its data (by reading any incomplete data
+ * from other nodes from that term if required)
+ *
+ * Once that is done leader sends a message to all nodes except the
+ * reconciliator to sync themselves with the reconciliator. This process is
+ * referred to as "resolution".
+ *
+ * Hence the reconciliation processes need to talk to each other to get a given
+ * term info. This is implemented using the recon translator IOs which
+ * implements a bare bone RPC by exposing a file interface to which
+ * reads/writes are done to pass control messages. This is referred to as the
+ * "control plane". This implementation allows the control plane to be
+ * implemented as a bunch of threads for each of the nodes.
+ *
+ * The reconciliation process also needs to talk to the brick process on that
+ * node to actually write the data as part of reconciliation/resolution. This
+ * is referred to as the "data plane". Again there are a bunch of threads that
+ * do this work.
+ *
+ * The way the worker threads are organised is that main driver context has a
+ * pointer to contexts for each of these thread contexts. The thread context at
+ * index 0 always refers to talking with local recon process/brick. So the
+ * control worker at index 0 will get the local changelog info and data worker
+ * at index 0 will talk to local brick.
+ *
+ * All the ops from the control/data planes are implemented using the glfs
+ * APIs.
+ */
+
+#if defined(NSR_DEBUG)
+
+/*
+ * This lets us change on the fly even if NSR_DEBUG is defined.
+ * Starts at GF_LOG_TRACE; presumably consulted by the NSR logging macros to
+ * filter messages -- TODO confirm, it is not read in the visible code.
+ */
+int nsr_debug_level = GF_LOG_TRACE;
+
+/*
+ * Open (creating it if needed) the log file NSR_LOG_DIR/<member>/<module> for
+ * appending and return it as an unbuffered stdio stream.  Any '/' in <member>
+ * is turned into '-' so that it forms a single directory name.  Directory
+ * creation is best effort: mkdir() results are ignored.
+ *
+ * Returns the stream, or NULL if a path could not be built or the file could
+ * not be opened.
+ */
+FILE *
+recon_create_log (char *member, char *module)
+{
+        char *dir_path  = NULL;
+        char *file_path = NULL;
+        char *cursor    = NULL;
+        FILE *log_fp    = NULL;
+        int   log_fd    = -1;
+
+        (void)mkdir(NSR_LOG_DIR,0777);
+        (void)asprintf(&dir_path,NSR_LOG_DIR"/%s",member);
+        if (!dir_path) {
+                return NULL;
+        }
+
+        /* Flatten the member part (everything after "NSR_LOG_DIR/"). */
+        cursor = dir_path + strlen(NSR_LOG_DIR) + 1;
+        while (*cursor) {
+                if (*cursor == '/') {
+                        *cursor = '-';
+                }
+                ++cursor;
+        }
+
+        (void)mkdir(dir_path,0777);
+        (void)asprintf(&file_path,"%s/%s",dir_path,module);
+        if (!file_path) {
+                goto free_dir;
+        }
+
+        log_fd = open(file_path,O_WRONLY|O_CREAT|O_APPEND|O_SYNC,0666);
+        if (log_fd >= 0) {
+                log_fp = fdopen(log_fd,"a");
+                if (!log_fp) {
+                        close(log_fd);
+                }
+        }
+
+        if (log_fp && setvbuf (log_fp, NULL, _IONBF, 0)) {
+                /*
+                 * Might as well take advantage of it
+                 * to log the error.
+                 */
+                fprintf (log_fp, "setvbuf failed for log\n");
+                fprintf (log_fp, "log output may be async\n");
+                fflush(log_fp);
+        }
+
+        free(file_path);
+free_dir:
+        free(dir_path);
+        return log_fp;
+}
+
+/*
+ * Write one printf-style message, prefixed with "[func:line]", to the driver
+ * log.
+ *
+ * func, line - call site, printed in the prefix
+ * member     - used to locate the log file when fp is NULL
+ * fp         - stream to write to; when NULL, the "nsr-driver-log" file for
+ *              `member` is opened via recon_create_log() for this one message
+ * fmt, ...   - printf-style format and arguments
+ *
+ * A stream opened here is also closed here.  `fp` is passed by value, so the
+ * caller can never see it; leaving it open leaked a FILE and a descriptor on
+ * every such call.
+ */
+void
+_nsr_driver_log (const char *func, int line, char *member, FILE *fp,
+                 char *fmt, ...)
+{
+        va_list  ap;
+        char    *buf = NULL;
+        int      retval;
+        int      opened_here = 0;
+
+        if (!fp) {
+                fp = recon_create_log(member,"nsr-driver-log");
+                if (!fp) {
+                        return;
+                }
+                opened_here = 1;
+        }
+
+        va_start(ap,fmt);
+        retval = vasprintf(&buf,fmt,ap);
+        va_end(ap);
+
+        /* On failure vasprintf returns -1 and buf must not be trusted. */
+        if (retval >= 0 && buf) {
+                fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf);
+                free(buf);
+        }
+
+        if (opened_here) {
+                fclose(fp);
+        }
+}
+
+/*
+ * Write one printf-style message, prefixed with "[func:line]", to a worker
+ * log.
+ *
+ * func, line  - call site, printed in the prefix
+ * member      - used to locate the log file when fp is NULL
+ * type, index - when fp is NULL the log file is named "<type>-<index>"
+ * fp          - stream to write to; when NULL the file above is opened via
+ *               recon_create_log() for this one message
+ * fmt, ...    - printf-style format and arguments
+ *
+ * Both the temporary file name and a stream opened here are released before
+ * returning; previously each such call leaked the name buffer, a FILE and a
+ * descriptor.
+ */
+void
+_nsr_worker_log (const char *func, int line, char *member,
+                 char *type, uint32_t index, FILE *fp,
+                 char *fmt, ...)
+{
+        va_list  ap;
+        char    *buf = NULL;
+        int      retval;
+        int      opened_here = 0;
+
+        if (!fp) {
+                char *name = NULL;
+
+                if (asprintf(&name,"%s-%u",type,index) < 1) {
+                        return;
+                }
+                fp = recon_create_log (member, name);
+                free(name);
+                if (!fp) {
+                        return;
+                }
+                opened_here = 1;
+        }
+
+        va_start(ap,fmt);
+        retval = vasprintf(&buf,fmt,ap);
+        va_end(ap);
+
+        /* On failure vasprintf returns -1 and buf must not be trusted. */
+        if (retval >= 0 && buf) {
+                fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf);
+                free(buf);
+        }
+
+        if (opened_here) {
+                fclose(fp);
+        }
+}
+
+#endif
+
+/*
+ * Recon Driver Calloc
+ *
+ * We need this because all of this messing about with gfapi from within a
+ * translator keeps scrambling THIS (only one reason it's a terrible idea) and
+ * we need THIS to have a value that represents our initialization with our
+ * memory types.
+ *
+ * Note that the macro requires "this" to be defined in the current scope.
+ *
+ * The expansion first assigns THIS = this and then evaluates to the result of
+ * GF_CALLOC(x,y,z).  The ({ ... }) form is a GNU C statement expression, so
+ * this is not portable to compilers without that extension.
+ */
+
+#define RD_CALLOC(x,y,z) ({THIS = this; GF_CALLOC(x,y,z); })
+
+/*
+ * Fetch the list of extended-attribute names of a file and work out how much
+ * room its keys and values need, so the caller knows how much key-value
+ * storage to allocate.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can get this from the required brick
+ *
+ * Output Arguments:
+ * b - set to a calloc()'ed buffer holding the NUL-separated key names.  It is
+ *     not freed here, not even on failure; once set it is either NULL or a
+ *     buffer that can be passed to free().
+ * key_size - the size of all keys (including their NUL terminators)
+ * val_size - the size of all values
+ * num - number of key/values
+ *
+ * Returns 0 on success and -1 on failure.  A file without any extended
+ * attribute is a success with all three counters at 0.
+ */
+static int32_t
+get_xattr_total_size( struct glfs_fd *fd,
+                      char **b,
+                      uint32_t *key_size,
+                      uint32_t *val_size,
+                      uint32_t* num,
+                      dict_t *dict)
+{
+        int32_t  remaining = -1;
+        char    *key = NULL;
+
+        *key_size = 0;
+        *val_size = 0;
+        *num = 0;
+
+        // First get the size of the keys
+        remaining = glfs_flistxattr_with_xdata(fd, NULL, 0, dict);
+        if (remaining == -1) {
+                return -1;
+        }
+        *key_size = remaining;
+
+        // TBD - use the regular calloc
+        (*b) = key = calloc(remaining + 1, 1);
+        if (key == NULL) {
+                return -1;
+        }
+
+        // get the keys themselves
+        if (glfs_flistxattr_with_xdata(fd, key, remaining + 1, dict) == -1) {
+                return -1;
+        }
+
+        // Walk the NUL-separated names.  Testing "remaining > 0" up front
+        // (instead of a do/while on "!= 0") keeps an empty list from being
+        // treated as one empty key and the walk from running off the buffer.
+        while (remaining > 0) {
+                int32_t  r;
+                uint32_t len = 0;
+
+                // for each key get the size of the value
+                r = glfs_fgetxattr_with_xdata(fd, key, NULL, 0, dict);
+                if (r == -1) {
+                        return -1;
+                }
+                (*val_size) += r;
+
+                len = strlen(key) + 1;
+                key += len;
+                remaining -= (int32_t)len;
+                (*num)++;
+        }
+
+        return 0;
+}
+
+/*
+ * Fetch the values of a set of extended attributes and pack them, together
+ * with their keys, into one buffer laid out as
+ *     key '\0' value '\0' key '\0' value '\0' ...
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * keys - NUL-separated key names, one after the other
+ * size - passed unchanged as the buffer size of every value fetch
+ * num - number of keys
+ * dict - passed so that NSR translator can get this from the required brick
+ *
+ * Output Arguments:
+ * buf - where each key and its value are written one after the other
+ *       (NUL separated)
+ *
+ * Returns 0 on success, -1 as soon as one value cannot be fetched.
+ *
+ * NOTE(review): the write position advances by strlen() of the fetched value,
+ * so this assumes every value is text that ends up NUL-terminated inside buf
+ * (e.g. buf zero-filled and large enough) -- confirm with the caller.  `size`
+ * is also not reduced as buf fills up.
+ */
+static int32_t
+get_xattr(struct glfs_fd *fd,
+ char *keys,
+ char *buf,
+ uint32_t size,
+ uint32_t num,
+ dict_t *dict)
+{
+ while(num--) {
+ int32_t r;
+ uint32_t len = 0;
+
+ // copy the key, including its terminating NUL
+ strcpy(buf, keys);
+ len = strlen(keys);
+ len++;
+ buf += len;
+
+ // fetch the value straight into buf, right behind the key
+ r = glfs_fgetxattr_with_xdata(fd, keys, buf, size, dict);
+
+ // TBD - handle error
+ if (r == -1)
+ return -1;
+
+ // move on to the next key name
+ keys += len;
+
+ // skip past the value so buf points at the slot for the next key
+ buf += strlen(buf) + 1;
+ }
+ return 0;
+}
+
+/*
+ * Remove a set of extended attributes from a file.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict_t - xdata passed so that NSR translator can do this from the required
+ *          brick
+ * keys - NUL-separated key names, one after the other
+ * num - number of keys
+ *
+ * Returns 0 once all keys are removed, -1 at the first removal that fails
+ * (keys after it are left untouched).
+ */
+static int32_t delete_xattr(struct glfs_fd *fd,
+                            dict_t *dict_t,
+                            char *keys,
+                            uint32_t num)
+{
+        uint32_t  done;
+        char     *key = keys;
+
+        for (done = 0; done < num; done++) {
+                // TBD - handle failure cases when calling glfs_fremovexattr_with_xdata()
+                if (glfs_fremovexattr_with_xdata(fd, key, dict_t) == -1) {
+                        return -1;
+                }
+                // step over this name and its NUL to reach the next one
+                key += strlen(key) + 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Given a bunch of key value pairs, set them as xattrs on a file.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can do this from the required brick
+ * buf - buffer containing the key-value pairs, laid out as
+ *       key '\0' value '\0' key '\0' value '\0' ...
+ * num - Number of such key value pairs.
+ *
+ * Each value is stored with a length of strlen(value).  Returns 0 on success,
+ * -1 at the first attribute that cannot be set.
+ */
+static int32_t
+fill_xattr(struct glfs_fd *fd,
+           dict_t *dict,
+           char *buf,
+           uint32_t num)
+{
+        char     *key = buf;
+        uint32_t  pair;
+
+        for (pair = 0; pair < num; pair++) {
+                char *value = key + strlen(key) + 1;
+
+                // TBD - handle failure cases when calling glfs_fsetxattr_with_xdata()
+                if (glfs_fsetxattr_with_xdata(fd, key, value,
+                                              strlen(value), 0, dict) == -1) {
+                        return -1;
+                }
+
+                // the next key starts right after this value's NUL
+                key = value + strlen(value) + 1;
+        }
+
+        return 0;
+}
+
+/*
+ * This function builds the names of the volfiles that are later used for
+ * glfs_init.
+ * The control file is used by the control thread(function) to talk to the
+ * peer reconciliation process.
+ * The data file is used by the data thread(function) to talk to the bricks.
+ * The control file is of name such as con:gfs1:-mnt-a1 where "gfs1" is name
+ * of host and the brick path is "/mnt/a1".
+ * The data file is of name such as data:gfs1:-mnt-a1.
+ *
+ * Input Arguments:
+ * vol - name of the volume. This is used to build the full path of the
+ *       control and data file such as
+ *       /var/lib/glusterd/vols/test/bricks/gfs2:-mnt-test1-nsr-recon.vol.
+ *       In above example the volume name is test and brick on gfs2 is on
+ *       path /mnt/test1
+ *
+ * worker - The worker for a given node. This worker has 2 threads - one on
+ *          the data plane and one on the control plane. The worker->name is
+ *          already filled with hostname:brickname in the function
+ *          nsr_reconciliation_driver(). Use that to build the volume file.
+ *          So if worker->name has gfs1:/mnt/a1, control file is
+ *          con:gfs1:-mnt-a1 and data file is data:gfs1:-mnt-a1.
+ *          All these files are under the bricks directory. TBD - move this
+ *          to a NSR recon directory later.
+ *
+ * Results are written to worker->control_worker->vol_file and
+ * worker->data_worker->vol_file.
+ * NOTE(review): the size of vol_file is not visible here, so those writes
+ * are still unbounded -- confirm the field is large enough for the path.
+ */
+static void
+nsr_recon_get_file(char *vol, nsr_replica_worker_t *worker)
+{
+        char *ptr;
+        char  tr[256];
+
+        // Bounded copy: an over-long worker name is truncated instead of
+        // overflowing tr.
+        snprintf(tr, sizeof(tr), "%s", worker->name);
+
+        // Replace every "/" with "-"
+        for (ptr = tr; *ptr; ptr++) {
+                if (*ptr == '/') {
+                        *ptr = '-';
+                }
+        }
+
+        // Build the base directory such as "/var/lib/glusterd/vols/test/bricks/"
+        sprintf(worker->control_worker->vol_file,
+                "/%s/%s/%s/%s/",
+                GLUSTERD_DEFAULT_WORKDIR,
+                GLUSTERD_VOLUME_DIR_PREFIX,
+                vol,
+                GLUSTERD_BRICK_INFO_DIR);
+        strcat(worker->control_worker->vol_file, "con:");
+        strcat(worker->control_worker->vol_file, tr);
+
+        sprintf(worker->data_worker->vol_file,
+                "/%s/%s/%s/%s/",
+                GLUSTERD_DEFAULT_WORKDIR,
+                GLUSTERD_VOLUME_DIR_PREFIX,
+                vol,
+                GLUSTERD_BRICK_INFO_DIR);
+        strcat(worker->data_worker->vol_file, "data:");
+        strcat(worker->data_worker->vol_file, tr);
+}
+
+/*
+ * This function does all the glfs initialisation
+ * so that reconciliation process can talk to other recon processes/bricks
+ * for the control/data messages.
+ * This will be done everytime a worker needs to be kicked off to talk
+ * across any plane.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ *
+ * On success returns 0 with ctx->fs set (and ctx->aux_fd too for a control
+ * worker).  Returns -1 on any failure.
+ *
+ * The temporary copy of the member name is now NULL-checked and released, and
+ * the log path is built with a bounded write.
+ * NOTE(review): the glfs object created here is still not released on the
+ * error paths, as before -- whether glfs_fini() may be called on a partially
+ * set up object needs confirming first.
+ */
+static int32_t
+nsr_recon_start_work(nsr_per_node_worker_t *ctx,
+                     gf_boolean_t control)
+{
+        glfs_t              *fs = NULL;
+        xlator_t            *this = ctx->driver_ctx->this;
+        int32_t              ret = 0;
+        glfs_fd_t           *aux_fd = NULL; // fd of auxilary log
+        char                 lf[256];
+        nsr_recon_private_t *priv = NULL;
+        char                *morph_name = NULL, *ptr = NULL;
+        int                  lf_len = 0;
+
+        priv = this->private;
+
+        // Make sure THIS refers to our translator before doing anything else.
+        glusterfs_this_set(this);
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "starting work with volfile %s\n",
+                       ctx->vol_file);
+
+        fs = glfs_new(ctx->id);
+        if (!fs) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "cannot create gfls context for thread %s\n",ctx->id);
+                return -1;
+        }
+
+        // For some vague reason, glfs init APIs seem to be clobbering "this". hence resetting it.
+        glusterfs_this_set(this);
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "init done. setting volfile %s\n",
+                       ctx->vol_file);
+
+        ret = glfs_set_volfile(fs, ctx->vol_file);
+        if (ret != 0) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "cannot set volfile %s for thread %s\n",ctx->vol_file, ctx->id);
+                return -1;
+        }
+
+        // Our own member name with "/" turned into "-" names the log directory.
+        morph_name = RD_CALLOC (1,
+                                strlen (priv->replica_group_members[0]) + 1,
+                                gf_mt_recon_member_name_t);
+        if (morph_name == NULL) {
+                return -1;
+        }
+        strcpy (morph_name, priv->replica_group_members[0]);
+        for (ptr = morph_name; *ptr; ptr++) {
+                if (*ptr == '/') {
+                        *ptr = '-';
+                }
+        }
+
+        // TBD - convert this to right /usr/local/var/log based log files.
+        lf_len = snprintf(lf, sizeof(lf), NSR_LOG_DIR"/%s/%s-%"PRIu32,
+                          morph_name,
+                          (control == _gf_true)?"glfs-con":"glfs-data",
+                          ctx->index);
+        GF_FREE(morph_name);
+        if (lf_len < 0 || (size_t)lf_len >= sizeof(lf)) {
+                // path did not fit; refuse to log to a truncated name
+                return -1;
+        }
+
+        ret = glfs_set_logging (fs, lf, 7);
+        if (ret) {
+                glusterfs_this_set(this);
+                gf_log (this->name, GF_LOG_ERROR, "glfs logging set failed (%s)",
+                        strerror (errno));
+                return -1;
+        }
+
+        ret = glfs_init (fs);
+        if (ret != 0) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do init for thread %s with volfile %s\n",ctx->id, ctx->vol_file);
+                return -1;
+        }
+        glusterfs_this_set(this);
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "setting volfile %s done\n",
+                       ctx->vol_file);
+
+        // If it is control thread, open the "/" as the aux_fd.
+        // All IOs happening via the fd will do the RPCs across the reconciliation
+        // processes. For some vague reason, the root seems to be open'able like a file.
+        // TBD - try to clean this up. (implement a virtual file???)
+        if (control == _gf_true) {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "doing open for / \n");
+                aux_fd = glfs_open (fs, "/", O_RDWR);
+                // TBD - proper error handling. Stall reconciliation if such a thing happens?
+                if (aux_fd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "cannot open aux log file for thread %s\n",ctx->id);
+                        return -1;
+                } else {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "---opened aux log file for thread %s\n",ctx->id);
+                }
+                ctx->aux_fd = aux_fd;
+        }
+        glusterfs_this_set(this);
+        ctx->fs = fs;
+        return 0;
+}
+
+/*
+ *
+ * This function does the cleanup after reconciliation is done
+ * or before we start a new reconciliation.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ *
+ * Returns 0 on success, -1 if glfs_fini() fails (ctx->fs is then left set).
+ *
+ * The control worker's aux fd is closed before the glfs object it was opened
+ * on is finalised; it used to be closed afterwards, i.e. on an fs that had
+ * already been torn down.
+ */
+static int32_t
+nsr_recon_end_work(nsr_per_node_worker_t *ctx,
+                   gf_boolean_t control)
+{
+        int32_t   ret = 0;
+        xlator_t *this = ctx->driver_ctx->this;
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "doing fini for recon worker\n");
+
+        if (control == _gf_true && ctx->aux_fd != NULL) {
+                glfs_close (ctx->aux_fd);
+                glusterfs_this_set(this);
+                ctx->aux_fd = NULL;
+        }
+
+        ret = glfs_fini(ctx->fs);
+        if (ret != 0) {
+                glusterfs_this_set(this);
+                nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do fini for thread %s with volfile %s\n",ctx->id, ctx->vol_file);
+                return -1;
+        }
+        glusterfs_this_set(this);
+        ctx->fs = NULL;
+
+        return 0;
+}
+
+/*
+ * Called in case all worker functions run as separate threads: prepares the
+ * worker's work list and the mutex / condition variable that guard it.
+ * `control` is accepted but not used.
+ */
+static void
+init_worker(nsr_per_node_worker_t *ctx, gf_boolean_t control)
+{
+        INIT_LIST_HEAD(&ctx->head.list);
+
+        pthread_mutex_init(&ctx->mutex, NULL);
+        pthread_cond_init(&ctx->cv, NULL);
+}
+
+
+/*
+ * Control worker function for getting changelog info on this node.
+ * Calls the changelog-parsing functions directly.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * work - the request to serve; work->req_id selects the action and, for the
+ *        two term-info requests, work->index carries the term
+ *
+ * Results are stored in the recon_info of this worker's slot in the driver
+ * context; failures are reported by setting ctx->result to -1.
+ *
+ * The temporary record buffer of the reconciliation-window request is now
+ * released on its error paths as well, and the term-info scratch structure is
+ * zeroed first so that a failing changelog helper cannot make us copy
+ * uninitialised stack contents.
+ */
+static void
+control_worker_func_0(nsr_per_node_worker_t *ctx,
+                      nsr_recon_work_t *work)
+{
+        unsigned int            index = ctx->index;
+        nsr_replica_worker_t   *rw = &(ctx->driver_ctx->workers[index]);
+        nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+        xlator_t               *this = dr->this;
+        nsr_recon_private_t    *priv = this->private;
+
+        ctx->is_control = _gf_true;
+
+        switch (work->req_id) {
+        case NSR_WORK_ID_INI:
+        case NSR_WORK_ID_FINI:
+                /* nothing to do for the local node */
+                break;
+
+        case NSR_WORK_ID_GET_LAST_TERM_INFO:
+        {
+                nsr_recon_last_term_info_t lt;
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                // term is stuffed inside work->index. overloading.
+                int32_t term = work->index;
+
+                memset(&lt, 0, sizeof(lt));
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get last term info for node %d with current term %d\n",index, term);
+
+                // TBD - handle errors
+                // This is called by the leader after it gets the current term.
+                // Makes searching easier.
+                nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, term, &lt);
+                recon_info->last_term = lt.last_term;
+                recon_info->commited_ops = lt.commited_ops;
+                recon_info->last_index = lt.last_index;
+                recon_info->first_index = lt.first_index;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get last term info with current term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+                break;
+        }
+
+        case NSR_WORK_ID_GET_GIVEN_TERM_INFO:
+        {
+                nsr_recon_last_term_info_t lt;
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                // term is stuffed inside work->index. overloading.
+                int32_t term = work->index;
+
+                memset(&lt, 0, sizeof(lt));
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get term info for node %d for term %d\n",index, term);
+
+                // TBD - handle errors
+                nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, term, &lt);
+
+                recon_info->last_term = lt.last_term;
+                recon_info->commited_ops = lt.commited_ops;
+                recon_info->last_index = lt.last_index;
+                recon_info->first_index = lt.first_index;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get term info for term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+                break;
+        }
+
+        case NSR_WORK_ID_RECONCILIATOR_DO_WORK:
+        {
+                // For local resolution, the main driver thread does it.
+                // SO there is no way we can have this message for this node.
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "this message should not be sent \n");
+                break;
+        }
+
+        case NSR_WORK_ID_RESOLUTION_DO_WORK:
+        {
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "this message should not be sent \n");
+                ctx->result = -1;
+                break;
+        }
+
+        case NSR_WORK_ID_GET_RECONCILATION_WINDOW:
+        {
+                nsr_reconciliator_info_t *recon_info = rw->recon_info;
+                // first_index and last_index at 0 indicates empty log.
+                // For non empty log, the first_index always starts at 1.
+                uint32_t num = (dr->workers[index].recon_info->last_index -
+                                dr->workers[index].recon_info->first_index + 1);
+                nsr_recon_record_details_t *rd;
+                uint32_t i=0;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get reconciliation window records for node %d for term %d with first %d last %d\n",
+                               index, recon_info->last_term, recon_info->first_index, recon_info->last_index);
+
+                // scratch buffer the changelog library fills in
+                rd = RD_CALLOC(num,
+                               sizeof(nsr_recon_record_details_t),
+                               gf_mt_recon_record_details_t);
+                if (rd == NULL) {
+                        ctx->result = -1;
+                        break;
+                }
+
+                recon_info->records = RD_CALLOC(num,
+                                                sizeof(nsr_reconciliation_record_t),
+                                                gf_mt_recon_record_t);
+                if (recon_info->records == NULL) {
+                        GF_FREE(rd);
+                        ctx->result = -1;
+                        break;
+                }
+
+                if (nsr_recon_libchangelog_get_records(this, priv->changelog_base_path,
+                                                       recon_info->last_term,
+                                                       recon_info->first_index,
+                                                       recon_info->last_index,
+                                                       rd) == _gf_false) {
+                        GF_FREE(rd);
+                        ctx->result = -1;
+                        break;
+                }
+
+                // The above function writes into rd from 0 to (num -1)
+                // We need to take care of this whenever we deal with records
+                for (i=0; i < num; i++) {
+                        ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl
+                        memcpy(&(recon_info->records[i].rec),
+                               &(rd[i]),
+                               sizeof(nsr_recon_record_details_t));
+                }
+
+                GF_FREE(rd);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got reconciliation window records for node %d for term %d \n",
+                               index, recon_info->last_term);
+                break;
+        }
+
+        default:
+                nsr_worker_log (this->name, GF_LOG_ERROR,
+                                "bad req id %u", work->req_id);
+        }
+
+        return;
+}
+
+/*
+ * Control worker thread for the local node.  Initialises the worker's
+ * list/mutex/condvar and then loops forever: wait until the work list is
+ * non-empty, serve its first item through control_worker_func_0(), decrement
+ * the driver's `outstanding` counter, then unlink and free the item.
+ * The trailing return is never reached.
+ *
+ * NOTE(review): the mutex is held only while waiting; picking the item,
+ * unlinking it and freeing it all happen after the unlock.  If producers add
+ * to this list under the same mutex that looks racy -- confirm against the
+ * code that queues work.
+ * NOTE(review): `this` is used in the log calls but is not declared in this
+ * function, so this only builds if nsr_worker_log() does not evaluate its
+ * first argument -- confirm against the macro definition.
+ */
+static void*
+control_worker_main_0(nsr_per_node_worker_t *ctx)
+{
+
+ ctx->is_control = _gf_true;
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "starting control worker func 0\n");
+
+ init_worker(ctx, 1);
+
+ while(1)
+ {
+ nsr_recon_work_t *work = NULL;
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "waiting for work\n");
+
+ // sleep until at least one work item has been queued
+ pthread_mutex_lock(&ctx->mutex);
+ while (list_empty(&(ctx->head.list))) {
+ pthread_cond_wait(&ctx->cv, &ctx->mutex);
+ }
+ pthread_mutex_unlock(&ctx->mutex);
+
+
+ // only the first entry is served per pass: the body ends in "break"
+ list_for_each_entry(work, &(ctx->head.list), list) {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got work with id %d\n", work->req_id);
+ work->in_use = _gf_false;
+
+ // Call the main function.
+ control_worker_func_0(ctx, work);
+
+ atomic_dec(&(dr->outstanding));
+ break;
+ }
+
+ // `work` still points at the entry served above
+ nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+ list_del_init (&work->list);
+ GF_FREE(work);
+ nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+ }
+
+ return NULL;
+}
+
+/*
+ * Tell the node served by this control worker that it is the reconciliator:
+ * seek its aux fd to nsr_recon_xlator_sector_1 and write a role message that
+ * carries the term info of the in-use nodes whose last term matches the
+ * reconciliator's.
+ *
+ * ctx  - the per worker context; ctx->index is asserted to be the driver's
+ *        reconciliator index
+ * work - not used
+ *
+ * On failure ctx->result is set to -1; for a failed write ctx->op_errno also
+ * records errno.
+ *
+ * The message is zeroed before it is filled in, so slots that are skipped go
+ * out as zeroes rather than as whatever was on the stack.
+ * NOTE(review): the slot counter advances for skipped nodes too, so rr.num
+ * always equals replica_group_size -- confirm that the receiver expects this
+ * rather than a packed list.
+ */
+static void
+control_worker_do_reconciliation (nsr_per_node_worker_t *ctx,
+                                  nsr_recon_work_t *work)
+{
+        unsigned int            index = ctx->index;
+        nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+        nsr_recon_role_t        rr;
+        uint32_t                i=0;
+        uint32_t                num=0;
+        uint32_t                idx = dr->reconciliator_index;
+        uint32_t                term = dr->workers[idx].recon_info->last_term;
+
+        GF_ASSERT(idx == index);
+
+        memset(&rr, 0, sizeof(rr));
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "trying to make this index %d as reconciliator for term %d\n", index, term);
+
+        // TBD - error handling for all the glfs APIs
+        if (glfs_lseek(ctx->aux_fd,
+                       nsr_recon_xlator_sector_1,
+                       SEEK_SET) == -1) {
+                ctx->result = -1;
+                return;
+        }
+
+        // We have all the info for all other nodes.
+        // Fill all that info when sending data to that process.
+        for (i=0; i < dr->replica_group_size; i++) {
+                if ( dr->workers[i].in_use &&
+                     (dr->workers[i].recon_info->last_term == term)) {
+                        rr.info[num].last_term =
+                                dr->workers[i].recon_info->last_term;
+                        rr.info[num].commited_ops =
+                                dr->workers[i].recon_info->commited_ops;
+                        rr.info[num].last_index =
+                                dr->workers[i].recon_info->last_index;
+                        rr.info[num].first_index =
+                                dr->workers[i].recon_info->first_index;
+                        strcpy(rr.info[num].name,
+                               dr->workers[i].name);
+                }
+                num++;
+        }
+        rr.num = num;
+        rr.role = reconciliator;
+        ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl
+        if (glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0) == -1) {
+                ctx->result = -1;
+                // Put the errno only for this case since we are bothered about
+                // retrying only for this case. For rest of the cases we will
+                // just return EIO in errno.
+                ctx->op_errno = errno;
+                return;
+        }
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "sent reconciliator info for term %d with node count as %d\n", term, num);
+}
+
+/*
+ * Tell the node served by this control worker to resolve itself against the
+ * reconciliator: seek its aux fd to nsr_recon_xlator_sector_1 and write a
+ * role message with two entries.
+ *
+ * ctx  - the per worker context; ctx->index selects the node being resolved
+ * work - not used
+ *
+ * On failure ctx->result is set to -1; for a failed write ctx->op_errno also
+ * records errno.
+ *
+ * The message is zeroed before it is filled in, so the entries beyond the two
+ * that are used go out as zeroes rather than as whatever was on the stack.
+ */
+static void
+control_worker_do_resolution (nsr_per_node_worker_t *ctx,
+                              nsr_recon_work_t *work)
+{
+        unsigned int            index = ctx->index;
+        nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+        nsr_recon_role_t        rr;
+        unsigned int            i=0, j=0;
+        unsigned int            rec = dr->reconciliator_index;
+
+        memset(&rr, 0, sizeof(rr));
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "trying to make this index %d as resolutor with reconciliator as %d\n",index, rec);
+
+        // TBD - error handling for all the glfs APIs
+        if (glfs_lseek(ctx->aux_fd,
+                       nsr_recon_xlator_sector_1,
+                       SEEK_SET) == -1) {
+                ctx->result = -1;
+                return;
+        }
+
+        rr.num = 2;
+
+        // Fill in info[0] as info for the node for which we are seeking
+        // resolution. Fill in info[1] as info of the reconciliator node. The
+        // function nsr_recon_driver_get_role() that will be called when this
+        // message reaches the node will look at index 1 for term information
+        // related to the reconciliator.
+        for (i=0; i < 2; i++) {
+                j = (i == 0) ? index : rec;
+
+                rr.info[i].last_term =
+                        dr->workers[j].recon_info->last_term;
+                rr.info[i].commited_ops =
+                        dr->workers[j].recon_info->commited_ops;
+                rr.info[i].last_index =
+                        dr->workers[j].recon_info->last_index;
+                rr.info[i].first_index =
+                        dr->workers[j].recon_info->first_index;
+                // The name is used as the key to convert indices since the
+                // reconciliator index could be different across the nodes.
+                strcpy(rr.info[i].name,
+                       dr->workers[j].name);
+                if (i == 0) {
+                        nsr_worker_log(this->name, GF_LOG_INFO,
+                                       "this node info term=%d, ops=%d, first=%d, last=%d\n",
+                                       rr.info[i].last_term, rr.info[i].commited_ops,
+                                       rr.info[i].first_index,rr.info[i].last_index);
+                } else {
+                        nsr_worker_log(this->name, GF_LOG_INFO,
+                                       "reconciliator node info term=%d, ops=%d, first=%d, last=%d\n",
+                                       rr.info[i].last_term, rr.info[i].commited_ops,
+                                       rr.info[i].first_index,rr.info[i].last_index);
+                }
+        }
+        rr.role = resolutor;
+        ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl
+        if (glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0) == -1) {
+                ctx->result = -1;
+                // Put the errno only for this case since we are bothered about
+                // retrying only for this case. For rest of the cases we will
+                // just return EIO in errno.
+                ctx->op_errno = errno;
+                return;
+        }
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "sent message to this node %d resolutor with reconciliator as %d\n", index, rec);
+}
+
+/*
+ * Control worker request: fetch the reconciliation window records of the
+ * node served by this worker.
+ *
+ * The term and the first/last index of interest are written (network byte
+ * order) at nsr_recon_xlator_sector_2 of the aux fd. One
+ * nsr_recon_record_details_t per index is then read back, converted to host
+ * byte order and copied into the "rec" member of recon_info->records[].
+ *
+ * Input arguments:
+ * ctx  - The per worker based context
+ * work - the work item being served (not consulted here)
+ *
+ * ctx->result is set to -1 if a glfs call or an allocation fails.
+ */
+static void
+control_worker_get_window (nsr_per_node_worker_t *ctx, nsr_recon_work_t *work)
+{
+        nsr_recon_driver_ctx_t     *dr         = ctx->driver_ctx;
+        xlator_t                   *this       = dr->this;
+        unsigned int                index      = ctx->index;
+        nsr_reconciliator_info_t   *recon_info = dr->workers[index].recon_info;
+        nsr_recon_record_details_t *details    = NULL;
+        nsr_recon_log_info_t        request;
+        gf_boolean_t                done       = _gf_false;
+        uint32_t                    count      = 0;
+        uint32_t                    n          = 0;
+
+        // number of records in the window [first_index, last_index]
+        count = (recon_info->last_index - recon_info->first_index + 1);
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "trying to get reconciliation window records for node %d for term %d with first %d last %d\n",
+                       index, recon_info->last_term, recon_info->first_index,
+                       recon_info->last_index);
+
+        if (glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_2, SEEK_SET) == -1) {
+                ctx->result = -1;
+                return;
+        }
+
+        // tell the node which term and which indices we are interested in
+        request.term = recon_info->last_term;
+        request.first_index = recon_info->first_index;
+        request.last_index = recon_info->last_index;
+        ENDIAN_CONVERSION_LI(request, _gf_false); //htonl
+        if (glfs_write(ctx->aux_fd, &request, sizeof(request), 0) == -1) {
+                ctx->result = -1;
+                return;
+        }
+
+        // scratch buffer for the raw reply; released before returning
+        details = RD_CALLOC(count,
+                            sizeof(nsr_recon_record_details_t),
+                            gf_mt_recon_private_t);
+        if (details == NULL) {
+                ctx->result = -1;
+                return;
+        }
+
+        // from here on every exit goes through "out" to release the buffer
+        recon_info->records = RD_CALLOC(count,
+                                        sizeof(nsr_reconciliation_record_t),
+                                        gf_mt_recon_private_t);
+        if (recon_info->records == NULL)
+                goto out;
+
+        if (glfs_read(ctx->aux_fd, details,
+                      count * sizeof(nsr_recon_record_details_t), 0) == -1)
+                goto out;
+
+        while (n < count) {
+                ENDIAN_CONVERSION_RD(details[n], _gf_true); //ntohl
+                memcpy (&(recon_info->records[n].rec), &(details[n]),
+                        sizeof(nsr_recon_record_details_t));
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "get_reconcilaition_window:Got %d at index %d\n",
+                               recon_info->records[n].rec.type,
+                               n + recon_info->first_index);
+                n++;
+        }
+
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "got reconciliation window records for node %d for term %d \n",
+                       index, recon_info->last_term);
+        done = _gf_true;
+
+out:
+        if (!done)
+                ctx->result = -1;
+        GF_FREE(details);
+}
+
+/*
+ * Fetch the summary of one term from the node served by this worker.
+ * Seeks the aux fd to the given sector, writes the term number and reads
+ * back the reply, which is converted to host byte order and stored in
+ * recon_info.
+ *
+ * Input arguments:
+ * ctx        - The per worker based context
+ * sector     - offset in the aux fd that selects the query
+ * term       - term number, already converted with htonl()
+ * recon_info - filled in from the reply; left untouched on failure
+ *
+ * Returns 0 on success, -1 if any of the glfs calls fails.
+ */
+static int
+control_worker_fetch_term_info(nsr_per_node_worker_t *ctx,
+                               off_t sector,
+                               int32_t term,
+                               nsr_reconciliator_info_t *recon_info)
+{
+        nsr_recon_last_term_info_t lt;
+
+        if (glfs_lseek(ctx->aux_fd, sector, SEEK_SET) == -1)
+                return -1;
+        // first write the term number, then read the answer
+        if (glfs_write(ctx->aux_fd, &term, sizeof(term), 0) == -1)
+                return -1;
+        if (glfs_read(ctx->aux_fd, &lt, sizeof(lt), 0) == -1)
+                return -1;
+
+        ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl
+        recon_info->last_term = lt.last_term;
+        recon_info->commited_ops = lt.commited_ops;
+        recon_info->last_index = lt.last_index;
+        recon_info->first_index = lt.first_index;
+
+        return 0;
+}
+
+/*
+ * Control worker funct for getting changelog info on some other node.
+ * calls glfs functions to seek/read/write on aux_fd.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * work - the work item to serve; work->req_id selects the operation and,
+ * for the two "term info" requests, work->index carries the term number
+ *
+ * ctx->result is set to -1 when the requested operation fails. An unknown
+ * req_id is only logged.
+ */
+static void
+control_worker_func(nsr_per_node_worker_t *ctx,
+                    nsr_recon_work_t *work)
+{
+        unsigned int index = ctx->index;
+        nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]);
+        nsr_reconciliator_info_t *recon_info = rw->recon_info;
+        int32_t term = htonl(work->index); // overloading it
+
+        ctx->is_control = _gf_true;
+
+        switch (work->req_id){
+
+        case NSR_WORK_ID_INI:
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "calling nsr_recon_start_work\n");
+
+                if (nsr_recon_start_work(ctx, _gf_true) != 0) {
+                        ctx->result = -1;
+                        return;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished nsr_recon_start_work\n");
+                break;
+
+        case NSR_WORK_ID_FINI:
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "calling nsr_recon_end_work\n");
+
+                if (nsr_recon_end_work(ctx, _gf_true) != 0) {
+                        ctx->result = -1;
+                        return;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "finished nsr_recon_end_work\n");
+                break;
+
+        case NSR_WORK_ID_GET_LAST_TERM_INFO:
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get last term info for node %d with current term %d\n",index, work->index);
+
+                // the current term number is sent through sector 4
+                if (control_worker_fetch_term_info(ctx,
+                                                   nsr_recon_xlator_sector_4,
+                                                   term, recon_info) != 0) {
+                        ctx->result = -1;
+                        return;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get last term info with current term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+
+                break;
+
+        case NSR_WORK_ID_GET_GIVEN_TERM_INFO:
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "trying to get term info for node %d for term %d\n",index, work->index);
+
+                // the term being asked about is sent through sector 3
+                if (control_worker_fetch_term_info(ctx,
+                                                   nsr_recon_xlator_sector_3,
+                                                   term, recon_info) != 0) {
+                        ctx->result = -1;
+                        return;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "out of get term info for term %d. got ops %d with first %d and last %d \n",
+                               recon_info->last_term, recon_info->commited_ops,
+                               recon_info->first_index, recon_info->last_index);
+
+                break;
+
+        case NSR_WORK_ID_RECONCILIATOR_DO_WORK:
+                control_worker_do_reconciliation(ctx,work);
+                break;
+
+        case NSR_WORK_ID_RESOLUTION_DO_WORK:
+                control_worker_do_resolution(ctx,work);
+                break;
+
+        case NSR_WORK_ID_GET_RECONCILATION_WINDOW:
+                control_worker_get_window(ctx,work);
+                break;
+
+        default:
+                nsr_worker_log (this->name, GF_LOG_ERROR,
+                                "bad work type %d", work->req_id);
+        }
+
+        return;
+}
+
+/*
+ * Control worker thread.
+ *
+ * The worker with index 0 stands for the local node and is handed over to
+ * control_worker_main_0(). Every other worker initialises itself and then
+ * loops forever: wait until the work list is non-empty, run the first item
+ * through control_worker_func(), decrement the driver's outstanding counter,
+ * then unlink and free the item.
+ *
+ * The list is looked at and modified only while ctx->mutex is held: once
+ * the outstanding counter has been decremented, new work may be queued
+ * concurrently with the removal of the finished item.
+ */
+static void*
+control_worker_main(nsr_per_node_worker_t *ctx)
+{
+        unsigned int index = ctx->index;
+
+        ctx->is_control = _gf_true;
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "starting control worker func\n");
+
+        // if this is for local processing, call the changelog parsing calls directly
+        if (index == 0) {
+                control_worker_main_0(ctx);
+                return NULL;
+        }
+
+        init_worker(ctx, 1);
+
+        while(1)
+        {
+                nsr_recon_work_t *work = NULL;
+                nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "waiting for work\n");
+
+                pthread_mutex_lock(&ctx->mutex);
+                while (list_empty(&(ctx->head.list))) {
+                        pthread_cond_wait(&ctx->cv, &ctx->mutex);
+                }
+                // Pick the first queued item while the list cannot change.
+                // It stays on the list until it has been processed.
+                list_for_each_entry(work, &(ctx->head.list), list) {
+                        break;
+                }
+                pthread_mutex_unlock(&ctx->mutex);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got work with id %d\n", work->req_id);
+                work->in_use = _gf_false;
+                control_worker_func(ctx,work);
+                atomic_dec(&(dr->outstanding));
+
+                nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+                pthread_mutex_lock(&ctx->mutex);
+                list_del_init (&work->list);
+                pthread_mutex_unlock(&ctx->mutex);
+                GF_FREE(work);
+                nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+        }
+
+        return NULL;
+}
+
+/*
+ * This function gets called if this process is chosen as the reconciliator
+ * for this replica group. It would have already got the records for the last term
+ * for the indices that are required (from the first HOLE to last index) from
+ * all other nodes that also witnessed that term. Compare all the records and
+ * compute the work required.
+ *
+ * For every index the local record type is compared with the record type
+ * seen by every other in-use worker. The node holding the greatest type
+ * becomes the source; if that type differs from the local one, the matching
+ * work type is recorded and the local record is overwritten with the
+ * source's record.
+ *
+ * Input arguments
+ * ctx - driver context. All recon work is stored in workers[0].recon_info
+ */
+static void
+compute_reconciliation_work(nsr_recon_driver_ctx_t *ctx)
+{
+        uint32_t i=0, j=0;
+        nsr_reconciliator_info_t *my_recon = ctx->workers[0].recon_info;
+        uint32_t num = (my_recon->last_index - my_recon->first_index + 1);
+
+        for (i=0; i < num; i++) {
+                nsr_log_type_t orig, new;
+                unsigned int src = 0;
+                nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE;
+
+                orig = new = my_recon->records[i].rec.type;
+                // index 0 means this node. Look at all other nodes.
+                for (j=1; j < ctx->replica_group_size; j++) {
+                        if (ctx->workers[j].in_use) {
+                                // Compare log record types with each other.
+                                // The peers' "work" member is never filled
+                                // in; only "rec" is fetched from them.
+                                nsr_log_type_t pr = ctx->workers[j].recon_info->records[i].rec.type;
+                                if ((new != pr) && (pr > new)) {
+                                        src = j;
+                                        // keep the greatest type seen so far
+                                        new = pr;
+                                }
+                        }
+                }
+                // TBD - compare data if new and orig are all FILLs. (can detect changelog corruption)
+                // NOTE(review): the PSEUDO_HOLE/PSEUDO_HOLE branch below can
+                // never be taken while it sits under (orig != new), so
+                // NSR_RECON_WORK_COMPARE_PSEUDO_HOLE is never produced.
+                // Left as is; confirm the intended behaviour.
+                if (orig != new) {
+                        if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_PSEUDO_HOLE))
+                                tw = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE;
+                        else if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_FILL))
+                                tw = NSR_RECON_WORK_HOLE_TO_FILL;
+                        else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_PSEUDO_HOLE))
+                                tw = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE;
+                        else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_FILL))
+                                tw = NSR_RECON_WORK_HOLE_TO_FILL;
+                }
+                if (tw != NSR_RECON_WORK_NONE) {
+                        my_recon->records[i].work.type = tw;
+                        my_recon->records[i].work.source = src;
+                        // Overwrite the record
+                        memcpy(&(my_recon->records[i].rec),
+                               &(ctx->workers[src].recon_info->records[i].rec),
+                               sizeof(nsr_recon_record_details_t));
+                }
+        }
+        return;
+}
+
+static int32_t
+nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx,
+ uint32_t i,
+ gf_boolean_t in_use);
+
+/*
+ * Write the role and associated information to the node.
+ * This gets called from recon xlator indicating node is either
+ * leader, reconciliator or should do resolution.
+ *
+ * A copy of the role is queued on ctx->role_head under ctx->mutex and
+ * ctx->cv is signalled.
+ *
+ * Input arguments
+ * ctx - driver context.
+ * rr - role information; copied, so the caller keeps ownership.
+ * term - term stored along with the queued role.
+ *
+ * Returns _gf_true once the role is queued, _gf_false if the work item
+ * could not be allocated (nothing is queued in that case).
+ */
+gf_boolean_t
+nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx,
+                          nsr_recon_role_t *rr,
+                          uint32_t term)
+{
+        nsr_role_work_t *rw;
+        xlator_t *this = ctx->this;
+
+        nsr_driver_log(this->name, GF_LOG_INFO, "set role called \n");
+        rw = RD_CALLOC(1, sizeof (nsr_role_work_t), gf_mt_recon_role_work_t);
+        if (rw == NULL) {
+                nsr_driver_log(this->name, GF_LOG_ERROR,
+                               "set role failed to allocate work item\n");
+                return _gf_false;
+        }
+        memcpy(&rw->role, rr, sizeof(nsr_recon_role_t));
+        rw->term = term;
+        INIT_LIST_HEAD(&(rw->list));
+        pthread_mutex_lock(&(ctx->mutex));
+        list_add_tail(&rw->list, &ctx->role_head.list);
+        pthread_cond_signal(&(ctx->cv));
+        pthread_mutex_unlock(&(ctx->mutex));
+        nsr_driver_log(this->name, GF_LOG_INFO, "set role returns \n");
+        return _gf_true;
+}
+
+/*
+ * Apply a queued role to the driver context.
+ *
+ * First we undo the last role to make sure we clean up: every worker is
+ * passed to nsr_recon_in_use() with _gf_false. The workers needed by the new
+ * role are then given a freshly allocated recon_info and passed to
+ * nsr_recon_in_use() with _gf_true.
+ *
+ * Input arguments
+ * status - out parameter. Set to 0 on success and to -1 when an allocation
+ * or a call to nsr_recon_in_use() fails.
+ * ctx - driver context.
+ * rw - queued role work item. rw->role is the role information and
+ * rw->term is stored in ctx->term on success.
+ * If leader or joiner, rw->role.info[] lists the nodes that are part of
+ * the current replica group. Worker 0 (this node) and every other worker
+ * whose name is found in that list get an empty recon_info. The
+ * reconciliator index is set to -1 for a leader (still to be chosen)
+ * and to 0 for a joiner. ctx->current_term is taken from the role.
+ * If reconciliator, rw->role.info[] carries the term information of the
+ * nodes sent along with the role. Every worker whose name matches an
+ * entry gets a recon_info filled from that entry. The reconciliator
+ * index is 0.
+ * If resolution to be done, then rw->role.info[0] will have this node's
+ * info and rw->role.info[1] will have the info regarding the
+ * reconciliator, whose worker index becomes the reconciliator index.
+ *
+ * Returns rw->role.role on success and 0 on failure; check *status to tell
+ * the two apart.
+ *
+ * NOTE(review): a name that cannot be matched is only logged; the function
+ * carries on and can still report success.
+ * NOTE(review): the loop counters are uint8_t, which assumes that
+ * replica_group_size and the role's num fit in 8 bits - confirm.
+ */
+nsr_recon_driver_state_t
+nsr_recon_driver_get_role(int32_t *status,
+ nsr_recon_driver_ctx_t *ctx,
+ nsr_role_work_t *rw)
+{
+ uint8_t i=0, j=0;
+ nsr_recon_role_t *rr = &(rw->role);
+ nsr_reconciliator_info_t *tmp;
+ xlator_t *this = ctx->this;
+
+ // First make all the threads uninitialise
+ for (i = 0; i < ctx->replica_group_size; i++) {
+ if (nsr_recon_in_use(ctx, i, _gf_false) == -1) {
+ *status = -1;
+ return 0;
+ }
+ }
+
+ switch (rr->role) {
+ case leader:
+ case joiner:
+
+ // First set info this node
+ tmp = RD_CALLOC (1, sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ ctx->workers[0].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, 0, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ ctx->current_term = rr->current_term;
+
+ // Find rest of the nodes
+ for (i=1; i < ctx->replica_group_size; i++) {
+ for (j=0 ; /* nothing */; j++) {
+ if (j >= rr->num) {
+ nsr_driver_log (this->name, GF_LOG_ERROR,
+ "failed to find %s",
+ ctx->workers[i].name);
+ break;
+ }
+ if (strcmp(ctx->workers[i].name,
+ rr->info[j].name)) {
+ continue;
+ }
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "nsr_recon_driver_get_role: this as %s. found other server %s\n",
+ (rr->role == leader) ? "leader"
+ : "joiner",
+ ctx->workers[i].name);
+
+ // Allocate this here. This will get later
+ // filled when the leader tries to get last term
+ // information from all the nodes
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ ctx->workers[i].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, i, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ break;
+ }
+ }
+ // If leader, reconciliator has to be chosen.
+ // If joiner, we are the reconciliator.
+ if (rr->role == leader)
+ ctx->reconciliator_index = -1;
+ else
+ ctx->reconciliator_index = 0;
+ break;
+
+ case reconciliator:
+ ctx->reconciliator_index = 0;
+ // Copy information about all the other members which had the
+ // same term
+ for (i=0; i < rr->num; i++) {
+ for (j=0; /* nothing */; j++) {
+ if (j >= ctx->replica_group_size) {
+ nsr_driver_log (this->name, GF_LOG_ERROR,
+ "failed to find %s",
+ rr->info[i].name);
+ break;
+ }
+ if (strcmp(rr->info[i].name,
+ ctx->workers[j].name)) {
+ continue;
+ }
+ nsr_driver_log(this->name, GF_LOG_INFO,
+ "nsr_recon_driver_get_role: this as reconciliator. found other server %s\n",
+ ctx->workers[j].name);
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ tmp->last_term = rr->info[i].last_term;
+ tmp->commited_ops = rr->info[i].commited_ops;
+ tmp->last_index = rr->info[i].last_index;
+ tmp->first_index = rr->info[i].first_index;
+ ctx->workers[j].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, j, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ break;
+ }
+ }
+ break;
+
+ case resolutor:
+ for (j=0; /* nothing */; j++) {
+ // info[1] has the information regarding the
+ // reconciliator
+ if (j >= ctx->replica_group_size) {
+ nsr_driver_log (this->name, GF_LOG_ERROR,
+ "failed to find %s",
+ rr->info[1].name);
+ break;
+ }
+ if (strcmp(rr->info[1].name,
+ ctx->workers[j].name)) {
+ continue;
+ }
+ nsr_driver_log(this->name, GF_LOG_INFO,
+ "nsr_recon_driver_get_role: this as resolutor. found other server %s as reconciliator\n",
+ ctx->workers[j].name);
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ tmp->last_term = rr->info[1].last_term;
+ tmp->commited_ops = rr->info[1].commited_ops;
+ tmp->last_index = rr->info[1].last_index;
+ tmp->first_index = rr->info[1].first_index;
+ ctx->reconciliator_index = j;
+ ctx->workers[j].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, j, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ GF_ASSERT(ctx->reconciliator_index != 0);
+ break;
+ }
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ // info[0] has all info for this node
+ tmp->last_term = rr->info[0].last_term;
+ tmp->commited_ops = rr->info[0].commited_ops;
+ tmp->last_index = rr->info[0].last_index;
+ tmp->first_index = rr->info[0].first_index;
+ ctx->workers[0].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, 0, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ }
+
+ ctx->term = rw->term;
+
+ *status = 0;
+ return rr->role;
+}
+
+
+/*
+ * This function gets called if this process is chosen to sync itself with
+ * the reconciliator.
+ *
+ * For every index in the local window the local record type is compared
+ * with the reconciliator's one to decide the work type. The reconciliator's
+ * record is then copied over the local one whether or not work is needed.
+ *
+ * Input arguments
+ * ctx - driver context.
+ * my_info - local changelog info that has all the local records for indices that require work
+ * his_info - reconciliator's info that has all the golden copies
+ * invalidate - if set to true, then do not consult local records: they are
+ * dropped, replaced by a zeroed array, and every index is
+ * treated as a HOLE.
+ *
+ * If the replacement array cannot be allocated the failure is logged and the
+ * function returns with my_info->records set to NULL.
+ */
+
+static void
+compute_resolution_work(nsr_recon_driver_ctx_t *ctx,
+                        nsr_reconciliator_info_t *my_info,
+                        nsr_reconciliator_info_t *his_info,
+                        gf_boolean_t invalidate)
+{
+        uint32_t i=0;
+        uint32_t num = (my_info->last_index - my_info->first_index + 1);
+        xlator_t *this = ctx->this;
+
+        if (invalidate) {
+                if (my_info->records) {
+                        GF_FREE(my_info->records);
+                }
+                my_info->records = RD_CALLOC(num,
+                                             sizeof(nsr_reconciliation_record_t),
+                                             gf_mt_recon_record_t);
+                if (my_info->records == NULL) {
+                        // nothing to write the work into; do not walk a
+                        // NULL array below
+                        nsr_driver_log(this->name, GF_LOG_ERROR,
+                                       "failed to allocate %u resolution records\n",
+                                       num);
+                        return;
+                }
+        }
+
+        for (i=0; i < num; i++) {
+                nsr_log_type_t orig, new;
+                nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE;
+                orig = my_info->records[i].rec.type;
+                if (invalidate)
+                        orig = NSR_LOG_HOLE;
+                new = his_info->records[i].rec.type;
+                // TBD - we can never have PSUEDO_HOLE in reconciliator's info
+                // We should have taken care of that during reconciliation.
+                // Put an assert to validate that.
+                if (new != orig) {
+                        if ((orig != NSR_LOG_FILL) && (new == NSR_LOG_FILL))
+                                tw = NSR_RECON_WORK_HOLE_TO_FILL;
+                        else if ((orig != NSR_LOG_HOLE) && (new == NSR_LOG_HOLE))
+                                tw = NSR_RECON_WORK_UNDO_FILL;
+                }
+                // copy the records anyway
+                my_info->records[i].work.type = tw;
+                my_info->records[i].work.source = ctx->reconciliator_index;
+                memcpy(&(my_info->records[i].rec),
+                       &(his_info->records[i].rec),
+                       sizeof(nsr_recon_record_details_t));
+        }
+        return;
+}
+
+
+/*
+ * Create a glfs object (handle) from the textual form of a gfid.
+ *
+ * Input arguments:
+ * ctx - per node worker context; ctx->fs is the glfs instance used
+ * gfid_str - gfid in the textual form understood by uuid_parse()
+ *
+ * Returns the object, to be released by the caller with glfs_h_close(), or
+ * NULL if gfid_str cannot be parsed or the handle cannot be created.
+ */
+static struct glfs_object *
+create_obj(nsr_per_node_worker_t *ctx, char *gfid_str)
+{
+        struct glfs_object *obj = NULL;
+        uuid_t gfid;
+
+        // Without this check an unparsable string would leave gfid
+        // uninitialised and a handle would be requested for garbage.
+        if (uuid_parse(gfid_str, gfid) != 0) {
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "parsing of gfid %s failed\n", gfid_str);
+                return NULL;
+        }
+
+        obj = glfs_h_create_from_handle(ctx->fs, gfid, GFAPI_HANDLE_LENGTH, NULL);
+        if (obj == NULL) {
+                GF_ASSERT(obj != NULL);
+                nsr_worker_log(this->name, GF_LOG_ERROR,
+                               "creating of handle failed\n");
+                return NULL;
+        }
+        return obj;
+}
+
+/*
+ * Function to apply the actual record onto the local brick.
+ * prior to this we should have read all the data from the
+ * brick that has the data.
+ *
+ * Input parameters:
+ * ctx - per node worker context that has the fs for communicating to brick
+ * ri - Reconciliation record that needs fixup
+ * dict - So that NSR server translator on brick applies fixup only on this brick
+ * and the changelog translator consumes term and index.
+ *
+ * Returns _gf_true if the record was applied, if it was skipped because no
+ * data was read for it (write and xattr records with ri->work.data == NULL),
+ * or if its opcode is not one of those handled here. Returns _gf_false as
+ * soon as one step fails. The fd and the handles opened on the way are
+ * closed before returning.
+ */
+
+static gf_boolean_t
+apply_record(nsr_per_node_worker_t *ctx,
+             nsr_reconciliation_record_t *ri,
+             dict_t * dict)
+{
+        struct glfs_fd *fd = NULL;
+        struct glfs_object *obj = NULL;
+        struct glfs_object *to_obj = NULL;
+        gf_boolean_t retval = _gf_false;
+
+        if (ri->rec.op == GF_FOP_WRITE) {
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "DOing write for file %s @offset %d for len %d\n",
+                               ri->rec.gfid, ri->rec.offset, ri->rec.len);
+
+                // The file has got deleted on the source. Hence just ignore
+                // this.
+                // TBD - get a way to just stuff the log entry without writing
+                // the data so that changelogs remain identical.
+                if (ri->work.data == NULL) {
+                        return _gf_true;
+                }
+
+                if ((obj = create_obj(ctx,ri->rec.gfid)) == NULL)
+                        goto err;
+
+                fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+                if (fd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "open for file %s failed\n",
+                                       ri->rec.gfid);
+                        goto err;
+                }
+                if (glfs_lseek_with_xdata(fd, ri->rec.offset, SEEK_SET, dict) != ri->rec.offset) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "lseek for file %s failed at offset %d\n",
+                                       ri->rec.gfid, ri->rec.offset);
+                        goto err;
+                }
+                if (glfs_write_with_xdata(fd, ri->work.data, ri->rec.len, 0, dict) != ri->rec.len) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "write for file %s failed for bytes %d\n",
+                                       ri->rec.gfid, ri->rec.len);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished DOing write for gfid %s @offset %d for len %d\n",
+                               ri->rec.gfid, ri->rec.offset, ri->rec.len);
+
+        } else if (ri->rec.op == GF_FOP_FTRUNCATE) {
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "DOing truncate for file %s @offset %d \n",
+                               ri->rec.gfid, ri->rec.offset);
+
+                if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+                        goto err;
+                }
+
+                fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+                if (fd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "open for file %s failed\n",
+                                       ri->rec.gfid);
+                        goto err;
+                }
+                if (glfs_ftruncate_with_xdata(fd, ri->rec.offset, dict) == -1) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "trunctae for file %s failed @offset %d\n",
+                                       ri->rec.gfid,ri->rec.offset );
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished DOing truncate for gfid %s @offset %d \n",
+                               ri->rec.gfid, ri->rec.offset);
+
+        } else if ((ri->rec.op == GF_FOP_FREMOVEXATTR) ||
+                   (ri->rec.op == GF_FOP_REMOVEXATTR) ||
+                   (ri->rec.op == GF_FOP_SETXATTR) ||
+                   (ri->rec.op == GF_FOP_FSETXATTR)) {
+
+                uint32_t k_s = 0, v_s = 0;
+                char *t_b= NULL;
+                uint32_t num = 0;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing set extended attr for file %s \n",
+                               ri->rec.gfid);
+
+                // The file has got deleted on the source. Hence just ignore
+                // this. TBD - get a way to just stuff the log entry without
+                // writing the data so that changelogs remain identical.
+                if (ri->work.data == NULL) {
+                        return _gf_true;
+                }
+
+                if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+                        goto err;
+                }
+
+                if (obj->inode->ia_type == IA_IFDIR)
+                        fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict);
+                else
+                        fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+                if (fd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "open for file %s failed\n",
+                                       ri->rec.gfid);
+                        goto err;
+                }
+
+                // The existing xattrs are listed and removed first, then the
+                // ones read from the source are set.
+                // NOTE(review): t_b is released only when listing fails;
+                // confirm whether delete_xattr() takes ownership of it,
+                // otherwise it leaks on every other path.
+                if(get_xattr_total_size(fd, &t_b, &k_s, &v_s, &num, dict) == -1) {
+                        if (t_b) free(t_b);
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "list of xattr of %s failed\n", ri->rec.gfid);
+                        goto err;
+                }
+
+                if (delete_xattr(fd, dict, t_b, num) == -1) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "deleting xattrs failed\n");
+                        goto err;
+                }
+
+                // Set one special dict flag to indicate the opcode so that
+                // the opcode gets set to this
+                if (dict_set_int32(dict,"recon-xattr-opcode",ri->rec.op)) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "setting opcode to %d failed\n",ri->rec.op);
+                        goto err;
+                }
+
+                if (fill_xattr(fd, dict, ri->work.data, ri->work.num) == -1) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "filling xattrs failed\n");
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finsihed Doing set extended attr for %s \n",
+                               ri->rec.gfid);
+
+        } else if (ri->rec.op == GF_FOP_CREATE) {
+
+                uuid_t gfid;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing create for file %s \n",
+                               ri->rec.gfid);
+
+                // TBD - add mode and flags later
+                uuid_parse(ri->rec.gfid, gfid);
+                if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+                        goto err;
+                }
+
+                nsr_worker_log (this->name, GF_LOG_INFO,
+                                "creating with mode 0%o", ri->rec.mode);
+                if (glfs_h_creat_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, ri->rec.mode, NULL, gfid, dict) == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "Failure for Doing create for file %s\n",
+                                       ri->rec.entry);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished Doing create for file %s \n",
+                               ri->rec.entry);
+
+        } else if (ri->rec.op == GF_FOP_MKNOD) {
+
+                uuid_t gfid;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing mknod for file %s \n",
+                               ri->rec.entry);
+
+                // TBD - add mode and flags later
+                uuid_parse(ri->rec.gfid, gfid);
+                if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+                        goto err;
+                }
+
+                if (glfs_h_mknod_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, 0777, NULL, gfid, dict) == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "Failure for Doing mknod for file %s\n",
+                                       ri->rec.entry);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished Doing mknod for file %s \n",
+                               ri->rec.entry);
+
+        } else if (ri->rec.op == GF_FOP_MKDIR) {
+
+                uuid_t gfid;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing mkdir for dir %s \n",
+                               ri->rec.gfid);
+
+                // TBD - add mode and flags later
+                uuid_parse(ri->rec.gfid, gfid);
+                if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+                        goto err;
+                }
+
+                if (glfs_h_mkdir_with_xdata(ctx->fs, obj, ri->rec.entry, 0777, NULL, gfid, dict) != 0) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "Failure for Doing mkdir for file %s\n",
+                                       ri->rec.entry);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished Doing mkdir for file %s \n",
+                               ri->rec.entry);
+
+        } else if ((ri->rec.op == GF_FOP_RMDIR) || (ri->rec.op == GF_FOP_UNLINK)) {
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing rmdir/ublink for dir %s \n",
+                               ri->rec.entry);
+
+                if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+                        goto err;
+                }
+                if (glfs_h_unlink_with_xdata(ctx->fs, obj, ri->rec.entry, dict) != 0) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "Failure for Doing rmdir/unlink for file %s\n",
+                                       ri->rec.entry);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished Doing rmdir/unlink for file %s \n",
+                               ri->rec.entry);
+
+        } else if (ri->rec.op == GF_FOP_SYMLINK) {
+
+                uuid_t gfid;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing symlink for file %s to file %s \n",
+                               ri->rec.entry, ri->rec.link_path);
+
+                if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+                        goto err;
+                }
+                uuid_parse(ri->rec.gfid, gfid);
+
+                if (glfs_h_symlink_with_xdata(ctx->fs, obj, ri->rec.entry, ri->rec.link_path, NULL, gfid, dict) == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "Failed to Doing symlink for file %s to file %s \n",
+                                       ri->rec.entry, ri->rec.link_path);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished Doing symlink for file %s to file %s \n",
+                               ri->rec.entry, ri->rec.link_path);
+
+        } else if (ri->rec.op == GF_FOP_LINK) {
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing hard link for file %s to file %s \n",
+                               ri->rec.entry, ri->rec.gfid);
+
+                if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+                        goto err;
+                }
+                if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+                        goto err;
+                }
+
+                if (glfs_h_link_with_xdata(ctx->fs, to_obj, obj, ri->rec.entry, dict) == -1) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "Failed to Doing hard link for file %s to file %s \n",
+                                       ri->rec.entry, ri->rec.gfid);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finsihed doing hard link for file %s to file %s \n",
+                               ri->rec.entry, ri->rec.gfid);
+
+        } else if (ri->rec.op == GF_FOP_RENAME) {
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing rename for file %s to file %s \n",
+                               ri->rec.entry, ri->rec.newloc);
+
+                if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+                        goto err;
+                }
+                if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+                        goto err;
+                }
+
+                if (glfs_h_rename_with_xdata(ctx->fs, obj, ri->rec.entry, to_obj, ri->rec.newloc, dict) == -1) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "Failed to Doing rename for file %s to file %s \n",
+                                       ri->rec.entry, ri->rec.newloc);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finsihed doing renam for file %s to file %s \n",
+                               ri->rec.entry, ri->rec.newloc);
+
+
+        } else if ((ri->rec.op == GF_FOP_SETATTR) || (ri->rec.op == GF_FOP_FSETATTR)) {
+
+                struct iatt iatt = {0, };
+                int valid = 0;
+                int ret = -1;
+
+                // TBD - do the actual settings once we do that
+                // right now we just set the mode so that changelog gets filled
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Doing attr for file %s \n",
+                               ri->rec.gfid);
+
+                if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+                        goto err;
+                }
+
+                if (obj->inode->ia_type == IA_IFDIR)
+                        fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict);
+                else
+                        fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+                if (fd == NULL) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "open for file %s failed\n",
+                                       ri->rec.gfid);
+                        goto err;
+                }
+
+                // Permission bits are octal: 0777 is rwxrwxrwx, the same
+                // mode the mkdir and mknod branches use. A decimal 777
+                // would be 01411.
+                iatt.ia_prot = ia_prot_from_st_mode(0777);
+                valid = GF_SET_ATTR_MODE;
+
+
+                // Set one special dict flag to indicate the opcode so that
+                // the opcode gets set to this
+                if (dict_set_int32(dict,"recon-attr-opcode",ri->rec.op)) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "setting opcode to %d failed\n",ri->rec.op);
+                        goto err;
+                }
+
+                ret = glfs_fsetattr_with_xdata(fd, &iatt, valid, dict);
+                if (ret == -1) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "failed Doing attr for file %s \n",
+                                       ri->rec.gfid);
+                        goto err;
+                }
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "Finished Doing attr for file %s \n",
+                               ri->rec.gfid);
+
+        }
+
+        retval = _gf_true;
+
+err:
+        if (fd) {
+                /*
+                 * It's not clear that we should be passing the same dict to
+                 * glfs_close that was passed to us for glfs_open, but that's
+                 * the prior behavior so let's preserve it for now.
+                 */
+                if (glfs_close_with_xdata(fd, dict) == -1) {
+                        nsr_worker_log(this->name, GF_LOG_ERROR,
+                                       "close failed\n");
+                }
+        }
+        if (obj) {
+                /*
+                 * AFAICT fd operations do not borrow this reference, so we
+                 * still need to drop it ourselves.
+                 */
+                glfs_h_close(obj);
+        }
+        if (to_obj) {
+                /*
+                 * AFAICT fd operations do not borrow this reference, so we
+                 * still need to drop it ourselves.
+                 */
+                glfs_h_close(to_obj);
+        }
+        return retval;
+}
+
+// Tells whether the opcode of this record is one that requires reading
+// from the source.
+static gf_boolean_t
+recon_check_changelog(nsr_recon_record_details_t *rd)
+{
+        switch (rd->op) {
+        case GF_FOP_WRITE:
+        case GF_FOP_FSETATTR:
+        case GF_FOP_SETATTR:
+        case GF_FOP_FREMOVEXATTR:
+        case GF_FOP_SETXATTR:
+        case GF_FOP_FSETXATTR:
+        case GF_FOP_SYMLINK:
+                return _gf_true;
+        default:
+                return _gf_false;
+        }
+}
+
+// TBD - not implemented yet: the record is not looked at and the answer
+// is always _gf_false.
+static gf_boolean_t
+recon_compute_undo(nsr_recon_record_details_t *rd)
+{
+        gf_boolean_t undo = _gf_false;
+
+        (void) rd;
+        return undo;
+}
+
+
+/*
+ * Function that talks to the brick for data tranfer.
+ *
+ * Input arguments:
+ * ctx - worker context
+ * work - pointer to work object
+ */
+static void
+data_worker_func(nsr_per_node_worker_t *ctx,
+ nsr_recon_work_t *work)
+{
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+ xlator_t *this = dr->this;
+ nsr_reconciliation_record_t *ri = NULL;
+ nsr_recon_record_details_t *rd = NULL;
+ int wip = 0;
+ dict_t * dict = NULL;
+ struct glfs_fd *fd = NULL;
+ struct glfs_object *obj = NULL;
+ uuid_t gfid;
+ uint32_t k_s = 0, v_s = 0;
+ char *t_b= NULL;
+ uint32_t num=0;
+
+ switch (work->req_id){
+ case NSR_WORK_ID_INI:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "started data ini \n");
+
+ if (nsr_recon_start_work(ctx, _gf_false) != 0) {
+ ctx->result = -1;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished data ini \n");
+ break;
+ case NSR_WORK_ID_FINI:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "started data fini \n");
+
+ if (nsr_recon_end_work(ctx, _gf_false) != 0) {
+ ctx->result = -1;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished data fini \n");
+ break;
+ case NSR_WORK_ID_SINGLE_RECONCILIATION_READ:
+ // first_index always starts with 1 but records starts at 0.
+ wip = work->index - (dr->workers[0].recon_info->first_index);
+ ri = &(dr->workers[0].recon_info->records[wip]);
+ rd = &(ri->rec);
+
+ dict = dict_new ();
+ if (!dict) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "failed allocating for dictionary\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+
+ switch (rd->op) {
+ case GF_FOP_WRITE:
+
+ // record already copied.
+ // copy data to this node's info.
+
+ uuid_parse(ri->rec.gfid, gfid);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "started recon read for file %s at offset %d at len %d\n",
+ ri->rec.gfid, rd->offset, rd->len);
+
+ obj = glfs_h_create_from_handle (ctx->fs, gfid,
+ GFAPI_HANDLE_LENGTH,
+ NULL);
+ if (obj == NULL) {
+ GF_ASSERT(obj != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "creating of handle failed\n");
+ break;
+ }
+
+ // The file has probably got deleted.
+ fd = glfs_h_open_with_xdata (ctx->fs, obj, O_RDONLY,
+ dict);
+ if (fd == NULL) {
+ GF_ASSERT(fd != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "opening of file failed\n");
+ break;
+ }
+
+ if (glfs_lseek_with_xdata (fd, rd->offset, SEEK_SET,
+ dict) != rd->offset) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "lseek of file failed to offset %d\n",
+ rd->offset);
+ break;
+ }
+
+ ri->work.data = RD_CALLOC (rd->len , sizeof(char),
+ gf_mt_recon_work_data_t);
+ if (glfs_read_with_xdata (fd, ri->work.data, rd->len,
+ 0, dict) != rd->len) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "read of file failed to offset %d for bytes %d\n",
+ rd->offset, rd->len);
+ break;
+ }
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ case GF_FOP_SYMLINK:
+ case GF_FOP_RMDIR:
+ case GF_FOP_UNLINK:
+ case GF_FOP_MKNOD:
+ case GF_FOP_CREATE:
+ case GF_FOP_LINK:
+ case GF_FOP_MKDIR:
+ case GF_FOP_RENAME:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unimplemented fop %u\n", rd->op);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ case GF_FOP_REMOVEXATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+
+ uuid_parse(ri->rec.gfid, gfid);
+
+
+ // This is for all the set attribute/extended
+ // attributes commands. Get all the attributes from
+ // the source and fill it in the buffer as a NULL
+ // seperated key and value which are in turn seperated
+ // by NULL.
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "doing getattr for gfid %s \n",
+ ri->rec.gfid);
+
+ obj = glfs_h_create_from_handle (ctx->fs, gfid,
+ GFAPI_HANDLE_LENGTH,
+ NULL);
+ if (obj == NULL) {
+ GF_ASSERT(fd != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "creating of handle failed\n");
+ break;
+ }
+
+ if (obj->inode->ia_type == IA_IFDIR)
+ fd = glfs_h_opendir_with_xdata (ctx->fs, obj,
+ dict);
+ else
+ fd = glfs_h_open_with_xdata (ctx->fs, obj,
+ O_RDONLY, dict);
+
+ if (fd == NULL) {
+ GF_ASSERT(fd != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "opening of file failed\n");
+ break;
+ }
+
+ if (get_xattr_total_size (fd, &t_b, &k_s, &v_s, &num,
+ dict) == -1) {
+ if (t_b) free(t_b);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "list of xattr of gfid %s failed\n",
+ rd->gfid);
+ break;
+ }
+ ri->work.data = RD_CALLOC ((k_s + v_s) , sizeof(char),
+ gf_mt_recon_work_data_t);
+ if (get_xattr(fd, t_b, ri->work.data, v_s, num, dict) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "get xattr of gfid %s failed\n", rd->gfid);
+ break;
+ }
+ ri->work.num = num;
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished getattr for gfid %s \n",
+ ri->rec.gfid);
+ free(t_b);
+ break;
+
+ case GF_FOP_FSETATTR:
+ case GF_FOP_SETATTR:
+ //TBD - to get the actual attrbutes and fill
+ // mode, uid, gid, size, atime, mtime
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unimplemented fop %u\n", rd->op);
+ break;
+ default:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unrecognized fop %u\n", rd->op);
+
+ }
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished recon read for gfid %s at offset %d for %d bytes \n",
+ rd->gfid, rd->offset, rd->len);
+ break;
+
+ case NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT:
+ // first_index always starts with 1 but records starts at 0.
+ wip = work->index - (dr->workers[0].recon_info->first_index);
+ ri = &(dr->workers[0].recon_info->records[wip]);
+ rd = &(ri->rec);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got recon commit for index %d that has gfid %s \n",
+ wip, rd->gfid);
+ dict = dict_new ();
+ if (!dict) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "failed allocating for dictionary\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (apply_record(ctx, ri, dict) == _gf_false) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "apply_record fails\n");
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished recon commit for gfid %s \n",
+ rd->gfid);
+ break;
+
+ case NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH:
+ dict = dict_new ();
+ if (!dict) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "failed allocating for dictionary\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+
+ // Increment work index with the start index
+ wip = work->index - (dr->workers[0].recon_info->first_index);
+ ri = &(dr->workers[0].recon_info->records[wip]);
+ rd = &(ri->rec);
+
+ glfs_fsync_with_xdata(fd, dict);
+ break;
+
+ default:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unrecognized request id %u\n", work->req_id);
+ }
+
+ if (fd) {
+ glfs_close_with_xdata(fd, dict);
+ }
+ if (obj) {
+ glfs_h_close(obj);
+ }
+ if (dict) {
+ dict_unref(dict);
+ }
+}
+
+/*
+ * Thread entry point for a per-node data worker.
+ *
+ * Calls init_worker() once, then loops forever: block until a work item is
+ * queued on ctx->head.list, detach it, run it through data_worker_func(),
+ * free it, and only then decrement the driver's outstanding counter (the
+ * counter send_and_wait() spins on).
+ *
+ * The item is unlinked while ctx->mutex is still held because
+ * recon_queue_to_worker() appends to the same list under that mutex;
+ * touching the list after unlocking would race with the next enqueue.
+ *
+ * The trailing return is never reached; it only satisfies the signature.
+ */
+static void *
+data_worker_main(nsr_per_node_worker_t *ctx)
+{
+        nsr_worker_log(this->name, GF_LOG_INFO,
+                       "starting data worker func\n");
+        init_worker(ctx, 0);
+
+        while(1) {
+                nsr_recon_work_t *work = NULL;
+                nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "waiting for work\n");
+
+                pthread_mutex_lock(&(ctx->mutex));
+                while (list_empty(&(ctx->head.list))) {
+                        pthread_cond_wait(&(ctx->cv), &(ctx->mutex));
+                }
+                // Take the oldest item off the queue before dropping the lock.
+                work = list_entry(ctx->head.list.next, nsr_recon_work_t, list);
+                list_del_init (&work->list);
+                pthread_mutex_unlock(&(ctx->mutex));
+
+                nsr_worker_log(this->name, GF_LOG_INFO,
+                               "got work with id %d\n",work->req_id);
+                work->in_use = _gf_false;
+                data_worker_func(ctx, work);
+
+                nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+                GF_FREE(work);
+                nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+
+                // Release the waiter last, once this item is fully retired.
+                atomic_dec(&(dr->outstanding));
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Allocate a zeroed work item and fill it in for the given request.
+ *
+ * ctx    - driver context (only ctx->this is read here)
+ * work   - out: receives the newly allocated item
+ * req_id - request identifier stored in the item
+ * i      - value stored in the item's index field
+ *
+ * The item is marked in_use and its list node is initialised as unlinked.
+ */
+static void
+recon_make_work(nsr_recon_driver_ctx_t *ctx,
+                nsr_recon_work_t **work,
+                nsr_recon_work_req_id_t req_id,
+                int32_t i)
+{
+        xlator_t *this = ctx->this;
+        nsr_recon_work_t *item = NULL;
+
+        // TBD - change this to get from a static pool
+        // This cannot fail
+        item = RD_CALLOC (1, sizeof (*item), gf_mt_recon_work_t);
+        *work = item;
+
+        item->in_use = _gf_true;
+        item->index = i;
+        item->req_id = req_id;
+        INIT_LIST_HEAD(&(item->list));
+}
+
+/*
+ * Hand a work item to one worker thread.
+ *
+ * ctx  - driver context owning the workers array
+ * work - item to enqueue; it is appended to the tail of the worker's list
+ * id   - index into ctx->workers
+ * type - NSR_RECON_QUEUE_TO_CONTROL selects the control worker, any other
+ *        value selects the data worker
+ *
+ * The append and the condition-variable signal both happen with the target
+ * worker's mutex held.
+ */
+static void
+recon_queue_to_worker(nsr_recon_driver_ctx_t *ctx,
+                      nsr_recon_work_t *work,
+                      unsigned int id,
+                      nsr_recon_queue_type_t type)
+{
+        nsr_per_node_worker_t *target;
+
+        if (type != NSR_RECON_QUEUE_TO_CONTROL) {
+                target = ctx->workers[id].data_worker;
+                nsr_driver_log(this->name, GF_LOG_INFO,
+                               "queueing work to data index %d\n",id);
+        } else {
+                target = ctx->workers[id].control_worker;
+                nsr_driver_log(this->name, GF_LOG_INFO,
+                               "queueing work to control index %d\n",id);
+        }
+
+        pthread_mutex_lock(&target->mutex);
+        list_add_tail(&work->list, &target->head.list);
+        pthread_cond_signal(&target->cv);
+        pthread_mutex_unlock(&target->mutex);
+}
+
+typedef void * (*F_t) (void *);
+
+/*
+ * Set up `num` consecutive per-node workers starting at `w` and, when the
+ * driver mode is NSR_USE_THREADS, start one thread per worker running `f`.
+ *
+ * priv            - recon private data (used for logging)
+ * ctx             - driver context stored in each worker
+ * w               - first element of the worker array
+ * control_or_data - _gf_true for control workers, _gf_false for data workers;
+ *                   only affects the wording of the thread-creation error
+ * f               - thread start routine, passed the worker as its argument
+ * num             - number of workers to initialise
+ *
+ * Each worker gets an id string "recon_<i>", its driver_ctx and its index.
+ * All fields are filled in before the thread is created so the new thread
+ * never observes a half-initialised worker.
+ *
+ * Returns _gf_true on success, _gf_false on allocation or thread-creation
+ * failure (workers set up before the failure are left as they are).
+ */
+static gf_boolean_t
+create_worker_threads(nsr_recon_private_t *priv,
+                      nsr_recon_driver_ctx_t *ctx,
+                      nsr_per_node_worker_t *w,
+                      gf_boolean_t control_or_data,
+                      F_t f,
+                      uint32_t num)
+{
+        // Room for "recon_", the largest uint32_t in decimal, and the NUL.
+        const size_t id_size = sizeof("recon_4294967295");
+        uint32_t i;
+        nsr_per_node_worker_t *worker = w;
+        xlator_t *this = ctx->this;
+
+        for (i=0; i < num; i++, worker++) {
+                worker->id = RD_CALLOC(1, id_size, gf_mt_recon_id_t);
+                if (!worker->id) {
+                        nsr_driver_log (priv->this->name, GF_LOG_ERROR, "memory allocation error \n");
+                        return _gf_false;
+                }
+                snprintf(worker->id, id_size, "recon_%u", i);
+                worker->driver_ctx = ctx ;
+                // Must be set before the thread starts and can read it.
+                worker->index = i;
+
+                if (ctx->mode == NSR_USE_THREADS) {
+                        if (pthread_create(&worker->thread_id, NULL, f, worker)) {
+                                nsr_driver_log (ctx->this->name, GF_LOG_ERROR,
+                                                "%s work thread creation error \n",
+                                                control_or_data ? "control" : "data");
+                                return _gf_false;
+                        }
+                }
+        }
+        return _gf_true;
+}
+
+/*
+ * Dispatch one request to a set of workers and return once all are done.
+ *
+ * Worker i takes part only if bit i of bm is set and ctx->workers[i].in_use
+ * is true.  In NSR_SEQ mode the worker function is called inline and the
+ * work item is freed here; otherwise the item is queued to the worker and
+ * this function yields in a loop until ctx->outstanding reaches zero.
+ *
+ * Arguments:
+ * result   - out: -1 if any participating worker's result is -1, else 0
+ * op_errno - out: 0 on success; on failure EAGAIN if a participating worker
+ *            reported EAGAIN, otherwise EIO
+ * bm       - bitmap containing indices of nodes we want to send work
+ * num      - number of worker slots examined (indices 0 .. num-1)
+ * ctx      - driver context from where we derive per worker context
+ * id       - request ID
+ * q        - control or data
+ * misc     - used to overload such as index.
+ */
+static void
+send_and_wait(int32_t *result,
+              int32_t *op_errno,
+              int32_t bm,
+              uint32_t num,
+              nsr_recon_driver_ctx_t *ctx,
+              nsr_recon_work_req_id_t id,
+              nsr_recon_queue_type_t q,
+              int32_t misc)
+{
+        uint32_t idx = 0;
+        nsr_recon_work_t *item;
+
+#define CONTROL_WORKER(i) ctx->workers[i].control_worker
+#define DATA_WORKER(i) ctx->workers[i].data_worker
+#define WORKER(i) ((q == NSR_RECON_QUEUE_TO_CONTROL) ? (CONTROL_WORKER(i)) : (DATA_WORKER(i)))
+
+        *op_errno = 0;
+        *result = 0;
+
+        // Clear the per-worker status of everyone taking part.
+        for (idx = 0; idx < num; idx++) {
+                if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                        continue;
+                WORKER(idx)->result = 0;
+                WORKER(idx)->op_errno = 0;
+        }
+
+        if (ctx->mode == NSR_SEQ) {
+                // No threads: run each worker function in place.
+                for (idx = 0; idx < num; idx++) {
+                        if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                                continue;
+                        recon_make_work(ctx,&item, id, misc);
+                        if (q != NSR_RECON_QUEUE_TO_CONTROL)
+                                data_worker_func(ctx->workers[idx].data_worker, item);
+                        else if (idx == 0)
+                                control_worker_func_0(ctx->workers[0].control_worker, item);
+                        else
+                                control_worker_func(ctx->workers[idx].control_worker, item);
+                        GF_FREE(item);
+                }
+        } else {
+                for (idx = 0; idx < num; idx++) {
+                        if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                                continue;
+                        recon_make_work(ctx,&item, id, misc);
+                        atomic_inc(&(ctx->outstanding));
+                        recon_queue_to_worker(ctx, item, idx, q);
+                }
+
+                nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: waiting\n");
+                while (ctx->outstanding) {
+                        pthread_yield();
+                }
+        }
+
+        // Any single failure fails the whole request.
+        for (idx = 0; idx < num; idx++) {
+                if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                        continue;
+                if (WORKER(idx)->result == -1)
+                        *result = -1;
+        }
+
+        if (*result == -1) {
+                for (idx = 0; idx < num; idx++) {
+                        if (!(bm & (1 << idx)) || !ctx->workers[idx].in_use)
+                                continue;
+                        if (WORKER(idx)->op_errno != EAGAIN) {
+                                *op_errno = EIO;
+                                continue;
+                        }
+                        // EAGAIN wins over EIO; stop at the first one seen.
+                        *op_errno = EAGAIN;
+                        break;
+                }
+        }
+
+        nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: all workers have returned with result: %d errno:%d\n", *result, *op_errno);
+        return;
+}
+
+/*
+ * Record whether worker i is in use and, if that state changed, send INI
+ * (activation) or FINI (deactivation) to its control worker and then to its
+ * data worker.
+ *
+ * ctx    - driver context
+ * i      - worker index
+ * in_use - new state for ctx->workers[i].in_use
+ *
+ * Returns 0 if nothing had to be sent or both messages succeeded, -1 if
+ * either message failed.  On failure, and after a successful FINI, the
+ * worker's recon_info is freed and the pointer reset to NULL so that no
+ * later path can use or free it again.
+ */
+static int32_t
+nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx,
+                 uint32_t i,
+                 gf_boolean_t in_use)
+{
+        uint32_t bm = 1 << i;
+        gf_boolean_t send = _gf_false;
+        int32_t status =0, op_errno = 0;
+        nsr_recon_work_req_id_t req =
+                (in_use == _gf_true) ? NSR_WORK_ID_INI : NSR_WORK_ID_FINI;
+
+        send = (ctx->workers[i].in_use != in_use);
+        ctx->workers[i].in_use = in_use;
+
+        if (!send) {
+                return 0;
+        }
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "sending %s to index %d\n",in_use?"INI":"FINI",i);
+
+        send_and_wait(&status, &op_errno, bm, ctx->replica_group_size, ctx,
+                      req, NSR_RECON_QUEUE_TO_CONTROL, -1);
+        if (status == -1)
+                goto err;
+
+        send_and_wait(&status, &op_errno, bm, ctx->replica_group_size, ctx,
+                      req, NSR_RECON_QUEUE_TO_DATA, -1);
+        if (status == -1)
+                goto err;
+
+        /*
+         * We really need better error recovery.  To activate a worker, we
+         * allocate memory and send two messages.  If any of those actions
+         * fail, we should undo the others.  It would probably be good to
+         * collapse the two messages into one, because it's pretty trivial to
+         * allocate a temporary structure and either link it in or free it
+         * depending on the result here.
+         */
+
+        if (in_use == _gf_false) {
+                GF_FREE(ctx->workers[i].recon_info);
+                // Same as the error path: never leave a dangling pointer.
+                ctx->workers[i].recon_info = NULL;
+        }
+
+        return 0;
+
+err:
+        GF_FREE(ctx->workers[i].recon_info);
+        ctx->workers[i].recon_info = NULL;
+        return -1;
+}
+
+/*
+ * Run reconciliation with this node (worker index 0) as the reconciliator.
+ *
+ * Every in-use peer whose last_term equals this node's is given this node's
+ * first/last index window and added to the bitmap.  If no such peer exists
+ * or the window is empty there is nothing to do.  Otherwise the
+ * reconciliation window is fetched from all nodes in the bitmap,
+ * compute_reconciliation_work() classifies each record, and each record
+ * that needs it is fixed by issuing READ / COMMIT / FLUSH work to the data
+ * workers.
+ *
+ * Returns _gf_true on success or when nothing had to be done, _gf_false as
+ * soon as any send_and_wait() reports status -1.
+ */
+gf_boolean_t
+nsr_recon_driver_reconciliator (nsr_recon_private_t *priv)
+{
+        uint32_t replica_group_size = priv->replica_group_size;
+        uint32_t i;
+        nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context;
+        int32_t bm;
+        int32_t status = 0;
+        int32_t op_errno = 0;
+        gf_boolean_t do_recon = _gf_false;
+        uint32_t start_index = ctx->workers[0].recon_info->first_index;
+        uint32_t end_index = ctx->workers[0].recon_info->last_index;
+        // A window of 0..0 means "no entries"; anything else is inclusive.
+        uint32_t num = ((start_index == 0) && (end_index == 0)) ? 0 : (end_index - start_index + 1);
+
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "starting reconciliation work as reconciliator \n");
+
+        // nothing to be done? signal back to the recon translator that this
+        // phase done.
+        bm = 1;
+        for (i=1; i < replica_group_size; i++) {
+                if (ctx->workers[i].in_use &&
+                    (ctx->workers[0].recon_info->last_term == ctx->workers[i].recon_info->last_term)) {
+                        ctx->workers[i].recon_info->last_index = end_index;
+                        ctx->workers[i].recon_info->first_index = start_index;
+                        bm |= (1 << i);
+                        do_recon = _gf_true;
+                }
+        }
+
+        if (!do_recon || !num) {
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "nothing needs to be done as reconciliator \n");
+                return _gf_true;
+        }
+
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "getting reconciliation window for term %d from %d to %d \n",
+                        ctx->workers[0].recon_info->last_term,
+                        start_index, end_index);
+        // We have set the bm in the above for loop where we go thru all nodes
+        // including this node that have seen the last term.
+        send_and_wait(&status, &op_errno, bm,
+                      replica_group_size,
+                      ctx,
+                      NSR_WORK_ID_GET_RECONCILATION_WINDOW,
+                      NSR_RECON_QUEUE_TO_CONTROL, -1);
+        if (status == -1)
+                return _gf_false;
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "finished getting reconciliation window for term %d from %d to %d \n",
+                        ctx->workers[0].recon_info->last_term,
+                        start_index, end_index);
+
+
+        // from the changelogs, calculate the entries that need action and the
+        // source for each of these entries
+        compute_reconciliation_work(ctx);
+
+        // for each of the entries that need fixup, issue IO
+        for (i=start_index; i < (start_index + num); i++) {
+                nsr_reconciliator_info_t *my_recon_info =
+                        ctx->workers[0].recon_info;
+                nsr_reconciliation_record_t *record =
+                        &(my_recon_info->records[i - start_index]);
+
+                record->work.term = ctx->workers[0].recon_info->last_term;
+                record->work.index = i;
+
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "fixing index %d\n",i);
+                if ((record->work.type == NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE) ||
+                    (record->work.type == NSR_RECON_WORK_HOLE_TO_FILL)) {
+                        // 1st case (RECON_WORK_HOLE_TO_PSEUDO_HOLE): If there
+                        // are only pseudo_holes in others, it is best effort.
+                        // Just pick from the first node that has it and
+                        // proceed.
+                        // 2nd case (RECON_WORK_HOLE_TO_FILL): this node has
+                        // either a HOLE or PSUEDO_HOLE and some one else has a
+                        // FILL(source). analyse the changelog to check if data
+                        // needs to be read or if the log has all the data
+                        // required
+
+                        if (recon_check_changelog(&record->rec)) {
+                                bm = (1 << record->work.source);
+                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                "reading data from source %d\n",record->work.source);
+                                send_and_wait(&status, &op_errno, bm,
+                                              replica_group_size,
+                                              ctx,
+                                              NSR_WORK_ID_SINGLE_RECONCILIATION_READ,
+                                              NSR_RECON_QUEUE_TO_DATA,
+                                              i);
+                                if (status == -1)
+                                        return _gf_false;
+                                nsr_driver_log (this->name, GF_LOG_INFO,
+                                                "got data from source %d\n",record->work.source);
+                        }
+
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "fixing local data as part of reconciliation\n");
+
+                        // Apply the record on this node only.
+                        bm = 1;
+                        send_and_wait(&status, &op_errno, bm,
+                                      replica_group_size,
+                                      ctx,
+                                      NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT,
+                                      NSR_RECON_QUEUE_TO_DATA,
+                                      i);
+                        if (status == -1)
+                                return _gf_false;
+
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "finished fixing local data as part of reconciliation\n");
+
+                } else if (record->work.type == NSR_RECON_WORK_COMPARE_PSEUDO_HOLE) {
+                        // this node has a pseudo_hole and some others have just
+                        // that too. Just convert this to FILL. let others
+                        // blindly pick it from here.
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "fixing this record as a fill\n");
+                        bm = 1;
+                        send_and_wait(&status, &op_errno, bm,
+                                      replica_group_size,
+                                      ctx,
+                                      NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH,
+                                      NSR_RECON_QUEUE_TO_DATA,
+                                      i);
+                        if (status == -1)
+                                return _gf_false;
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "finished fixing this record as a fill\n");
+                }
+        }
+
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "finished reconciliation work as reconciliator \n");
+
+        // tbd - mark this term golden in the reconciliator
+        return _gf_true;
+}
+
+/*
+ * Bring this node (worker index 0) in line with the reconciliator, one term
+ * at a time.
+ *
+ * The do/while body runs once per term.  On the first pass (fl is true) the
+ * reconciliator's last_term is set to this node's last term; on later passes
+ * both nodes' last_term fields are incremented.  The loop ends after the pass
+ * in which my_last_term, read before its post-increment, equals to_do_term
+ * (the reconciliator's last_term as captured on entry).
+ *
+ * Returns _gf_false as soon as any send_and_wait() reports status -1, and
+ * _gf_true once the loop has completed.
+ */
+gf_boolean_t
+nsr_recon_driver_resolutor (nsr_recon_private_t *priv)
+{
+ uint32_t replica_group_size = priv->replica_group_size;
+ uint32_t i;
+ nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context;
+ int32_t bm;
+ int32_t status = 0;
+ int32_t op_errno = 0;
+ // This node's last term is filled when it gets a message from the
+ // leader to act as a reconciliator.
+ uint32_t recon_index = ctx->reconciliator_index;
+ nsr_reconciliator_info_t *my_info =
+ ctx->workers[0].recon_info;
+ nsr_reconciliator_info_t *his_info =
+ ctx->workers[recon_index].recon_info;
+ uint32_t my_last_term = my_info->last_term;
+ uint32_t to_do_term = his_info->last_term;
+ uint32_t my_start_index = 1, my_end_index = 1;
+ uint32_t his_start_index = 1, his_end_index = 1;
+ uint32_t num = 0;
+ // fl: true only while handling the first term of this call.
+ gf_boolean_t fl = _gf_true;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "starting resolutor work with reconciliator as %d from term %d to term %d \n",
+ recon_index, my_last_term, to_do_term);
+
+ do {
+
+ // Select the term handled in this pass (see function header).
+ if (!fl) {
+ (his_info->last_term)++;
+ (my_info->last_term)++;
+ } else {
+ his_info->last_term = my_last_term;
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO, "resolving term %d \n", my_info->last_term);
+
+ // Get reconciliator's term information for that term
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "getting info from reconciliator for term %d \n", my_info->last_term);
+ bm = (1 << recon_index);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_GET_GIVEN_TERM_INFO,
+ NSR_RECON_QUEUE_TO_CONTROL, his_info->last_term);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished getting info from reconciliator for term %d \n", my_info->last_term);
+
+
+ // empty term
+ if (!his_info->commited_ops) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "reconciliator for term %d is empty. moving to next term. \n", my_info->last_term);
+ // TBD - mark the term golden
+ fl = _gf_false;
+ // In a do/while, continue jumps to the loop condition, so
+ // my_last_term is still post-incremented for this pass.
+ continue;
+ }
+
+ // calculate the resolution window boundary. for the last term
+ // this node saw, we compare the resolution window of this and
+ // reconciliator. for the rest of the nodes, we just accept the
+ // reconciliator info.
+ if (fl) {
+ my_start_index = my_info->first_index;
+ my_end_index = my_info->last_index;
+ his_start_index = his_info->first_index;
+ his_end_index = his_info->last_index;
+ my_info->first_index = (my_start_index < his_start_index) ? my_start_index : his_start_index;
+ my_info->last_index = (my_end_index > his_end_index) ? my_end_index : his_end_index;
+ } else {
+ my_info->first_index = his_info->first_index;
+ my_info->last_index = his_info->last_index;
+ my_info->commited_ops = his_info->commited_ops;
+ }
+ // A first_index of 0 is bumped to 1 before the count is computed.
+ if (my_info->first_index == 0)
+ my_info->first_index = 1;
+ // Unsigned arithmetic: first_index 1 with last_index 0 gives num == 0.
+ num = (my_info->last_index - my_info->first_index) + 1;
+
+
+ // Get the logs from the reconciliator (and this node for this
+ // term)
+ if (fl)
+ bm = ((1 << recon_index) | 1);
+ else
+ bm = (1 << recon_index);
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "getting reconciliation window for term %d from %d to %d \n",
+ my_info->last_term,
+ my_info->first_index, my_info->last_index);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_GET_RECONCILATION_WINDOW,
+ NSR_RECON_QUEUE_TO_CONTROL, -1);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished getting reconciliation window for term %d from %d to %d \n",
+ my_info->last_term,
+ my_info->first_index, my_info->last_index);
+
+ // from the changelogs, calculate the entries that need action
+ compute_resolution_work(ctx, my_info, his_info, !fl);
+
+
+ // for each of the entries that need fixup, issue IO
+ for (i=my_info->first_index; i < (my_info->first_index + num); i++) {
+ nsr_reconciliation_record_t *record =
+ &(my_info->records[i - my_info->first_index]);
+
+ record->work.term = my_info->last_term;
+ record->work.index = i;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "fixing index %d\n",i);
+ if ((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) ||
+ (record->work.type == NSR_RECON_WORK_UNDO_FILL)) {
+ // READ from the reconciliator only when the matching helper
+ // for this work type returns non-zero.
+ if (((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) &&
+ recon_check_changelog(&record->rec)) ||
+ ((record->work.type == NSR_RECON_WORK_UNDO_FILL) &&
+ recon_compute_undo(&record->rec))) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "reading data from source %d\n",recon_index);
+ bm = (1 << recon_index);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_READ,
+ NSR_RECON_QUEUE_TO_DATA,
+ i);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished reading data from source %d\n",recon_index);
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "fixing local data as part of resolutor\n");
+
+ // COMMIT goes to worker index 0 only (this node).
+ bm = 1;
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT,
+ NSR_RECON_QUEUE_TO_DATA,
+ i);
+ if (status == -1)
+ return _gf_false;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished fixing local data as part of resolutor\n");
+ }
+ }
+ fl = _gf_false;
+
+ // tbd - mark this term golden in the reconciliator
+ } while (my_last_term++ != to_do_term);
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished resolutor work \n");
+ return _gf_true;
+}
+
+/*
+ * Drive one round of reconciliation in the leader role.
+ *
+ * Steps, each returning _gf_false on failure:
+ *   1. Ask every in-use member for its last-term info.
+ *   2. Choose the reconciliator: the first in-use member with the highest
+ *      last_term and, among those, the highest commited_ops.
+ *   3. Have the reconciliator reconcile (run locally when it is index 0).
+ *   4. If the reconciliator is remote, resolve the local node against it.
+ *   5. Ask every member other than this node and the reconciliator to
+ *      resolve.
+ *
+ * Returns _gf_true when all steps succeeded.
+ */
+gf_boolean_t
+nsr_recon_driver_leader (nsr_recon_private_t *priv)
+{
+        uint32_t replica_group_size = priv->replica_group_size;
+        uint32_t i;
+        nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context;
+        int32_t bm;
+        int32_t status = 0;
+        int32_t op_errno = 0;
+        int32_t chosen = -1;
+        // NOTE(review): these start at -1 and are compared against
+        // recon_info fields; if those fields are unsigned the "greater
+        // than" tests below can never succeed - confirm the field types.
+        int32_t last_term = -1, last_ops = -1;
+
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "getting last term info from all members of this group\n");
+        // Get last term info from all members for this group
+        send_and_wait(&status, &op_errno, -1,
+                      replica_group_size,
+                      ctx,
+                      NSR_WORK_ID_GET_LAST_TERM_INFO,
+                      NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term);
+        if (status == -1)
+                return _gf_false;
+
+
+        // compare all the info received and choose the reconciliator First
+        // choose all with latest term
+        for (i=0; i < replica_group_size; i++) {
+                if (ctx->workers[i].in_use) {
+                        if (ctx->workers[i].recon_info->last_term > last_term) {
+                                last_term = ctx->workers[i].recon_info->last_term;
+                        }
+                }
+        }
+        // First choose all with latest term and highest ops
+        for (i=0; i < replica_group_size; i++) {
+                if ((ctx->workers[i].in_use) && (last_term == ctx->workers[i].recon_info->last_term)) {
+                        if (ctx->workers[i].recon_info->commited_ops > last_ops) {
+                                last_ops = ctx->workers[i].recon_info->commited_ops;
+                        }
+                }
+        }
+        // choose the first among the lot
+        for (i=0; i < replica_group_size; i++) {
+                if ((ctx->workers[i].in_use) &&
+                    (last_term == ctx->workers[i].recon_info->last_term) &&
+                    (last_ops == ctx->workers[i].recon_info->commited_ops)) {
+                        chosen = i;
+                        break;
+                }
+        }
+
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "reconciliator chosen is %d\n", chosen);
+        ctx->reconciliator_index = chosen;
+        GF_ASSERT(chosen != -1);
+        if (chosen == -1) {
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "no reconciliatior chosen\n");
+                return _gf_false;
+        }
+
+        // send the message to reconciliator to do reconciliation with list of
+        // nodes that are part of this quorum
+        if (chosen != 0) {
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "sending reconciliation work to %d\n", chosen);
+                bm = 1 << ctx->reconciliator_index;
+                send_and_wait(&status, &op_errno, bm,
+                              replica_group_size,
+                              ctx,
+                              NSR_WORK_ID_RECONCILIATOR_DO_WORK,
+                              NSR_RECON_QUEUE_TO_CONTROL, -1);
+                if (status == -1)
+                        return _gf_false;
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "finished reconciliation work to %d\n", chosen);
+        } else {
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "local node is reconciliator. before set jmp\n");
+                // Do not ask others to resolve against a failed reconciliation.
+                if (!nsr_recon_driver_reconciliator(priv))
+                        return _gf_false;
+        }
+
+        // send message to all other nodes to sync up with the reconciliator
+        // including itself if required
+        // requires optimisation - TBD
+        if (chosen != 0) {
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "local node resolution needs to be done. before set jmp\n");
+                if (!nsr_recon_driver_resolutor(priv))
+                        return _gf_false;
+        }
+
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "sending resolution work to all nodes except this node and reconciliator\n");
+        // Bitwise OR: clear both the reconciliator's bit and bit 0 (this
+        // node).  A logical OR here would always yield ~1 and leave the
+        // reconciliator selected.
+        bm = ~((1 << ctx->reconciliator_index) | 1);
+        send_and_wait(&status, &op_errno, bm,
+                      replica_group_size,
+                      ctx,
+                      NSR_WORK_ID_RESOLUTION_DO_WORK,
+                      NSR_RECON_QUEUE_TO_CONTROL, -1);
+        if (status == -1)
+                return _gf_false;
+
+        nsr_driver_log (this->name, GF_LOG_INFO,
+                        "finished reconciliation work as leader \n");
+        return _gf_true;
+}
+
+/*
+ * Main reconciliation driver thread.
+ *
+ * arg is the nsr_recon_private_t for this translator instance.
+ *
+ * Setup: allocates the driver context (stored in
+ * priv->driver_thread_context), opens the driver log, initialises the
+ * mutex/condvar and role list, allocates the per-replica worker tables plus
+ * one control and one data worker per replica with their logs, starts the
+ * worker threads, and calls nsr_recon_get_file() for every replica.  Any
+ * setup failure makes the thread return NULL without undoing earlier steps.
+ *
+ * Main loop: waits for a role to be queued on ctx->role_head, runs the
+ * matching role handler, reports the outcome via nsr_recon_return_back(),
+ * and then unlinks the role entry.  A failed role is reported with status -1
+ * and op_errno EIO (or whatever send_and_wait() set for the joiner role).
+ */
+void *
+nsr_reconciliation_driver(void *arg)
+{
+        nsr_recon_private_t *priv = (nsr_recon_private_t *) arg;
+        uint32_t replica_group_size = priv->replica_group_size;
+        uint32_t i;
+        nsr_per_node_worker_t *control_s, *data_s;
+        nsr_recon_driver_ctx_t **driver_ctx, *ctx;
+        int32_t bm;
+        xlator_t *this = priv->this;
+        char *con_name, *data_name;
+        int32_t status = 0;
+        int32_t op_errno = 0;
+
+        driver_ctx = &priv->driver_thread_context;
+        (*driver_ctx) = GF_CALLOC (1,
+                                   sizeof (nsr_recon_driver_ctx_t),
+                                   gf_mt_recon_driver_ctx_t);
+        // Test the allocation itself; driver_ctx (the address of the field)
+        // can never be NULL.
+        if (!(*driver_ctx)) {
+                gf_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+        ctx = *driver_ctx;
+        ctx->this = priv->this;
+        ctx->replica_group_size = replica_group_size;
+
+        ctx->fp = recon_create_log (priv->replica_group_members[0], "nsr-driver-log");
+        if (!ctx->fp)
+                return NULL;
+
+        if ((pthread_mutex_init(&(ctx->mutex), NULL)) ||
+            (pthread_cond_init(&(ctx->cv), NULL))){
+                nsr_driver_log (this->name, GF_LOG_ERROR, "mutex init error \n");
+                return NULL;
+        }
+        INIT_LIST_HEAD(&(ctx->role_head.list));
+
+        ctx->workers = RD_CALLOC (replica_group_size,
+                                  sizeof(nsr_replica_worker_t),
+                                  gf_mt_recon_worker_t);
+        if (!ctx->workers) {
+                nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+        for (i=0; i < replica_group_size; i++) {
+                // NOTE(review): unbounded copy; assumes each member name fits
+                // in workers[i].name - confirm against the struct definition.
+                strcpy(ctx->workers[i].name, priv->replica_group_members[i]);
+        }
+
+        control_s = RD_CALLOC (replica_group_size,
+                               sizeof(nsr_per_node_worker_t),
+                               gf_mt_recon_per_node_worker_t);
+        if (!control_s) {
+                nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+
+        data_s = RD_CALLOC (replica_group_size,
+                            sizeof(nsr_per_node_worker_t),
+                            gf_mt_recon_per_node_worker_t);
+        if (!data_s) {
+                nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+                return NULL;
+        }
+        for (i=0; i < replica_group_size; i++) {
+                // NOTE(review): con_name/data_name come from asprintf and are
+                // never freed here; whether recon_create_log keeps the
+                // pointer is not visible from this function.
+                ctx->workers[i].control_worker = &control_s[i];
+                if (asprintf(&con_name,"recon-con-%u",i) < 1) {
+                        return NULL;
+                }
+                ctx->workers[i].control_worker->fp = recon_create_log
+                        (priv->replica_group_members[0], con_name);
+                if (!ctx->workers[i].control_worker->fp)
+                        return NULL;
+                ctx->workers[i].data_worker = &data_s[i];
+                if (asprintf (&data_name,"recon-data-%u",i) <1) {
+                        return NULL;
+                }
+                ctx->workers[i].data_worker->fp = recon_create_log
+                        (priv->replica_group_members[0], data_name);
+                if (!ctx->workers[i].data_worker->fp)
+                        return NULL;
+        }
+
+        nsr_driver_log (this->name, GF_LOG_INFO, "creating threads \n");
+        // Create the worker threads
+        // For every brick including itself there will be 2 worker threads:
+        // one for data and one for control
+        if (!create_worker_threads(priv, ctx, control_s, _gf_true,
+                                   (F_t) control_worker_main, replica_group_size) ||
+            !create_worker_threads(priv, ctx, data_s, _gf_false,
+                                   (F_t) data_worker_main, replica_group_size)) {
+                return NULL;
+        }
+
+        for (i=0; i < replica_group_size; i++) {
+                nsr_recon_get_file(priv->volname, &(ctx->workers[i]));
+        }
+
+        while (1) {
+
+                nsr_role_work_t *rr = NULL;
+                nsr_recon_driver_state_t state;
+
+                nsr_driver_log (this->name, GF_LOG_INFO, "waiting for role to be queued \n");
+                pthread_mutex_lock(&(ctx->mutex));
+                while (list_empty(&(ctx->role_head.list))) {
+                        pthread_cond_wait(&(ctx->cv), &(ctx->mutex));
+                }
+                // Look at the oldest queued role while the lock is held; it
+                // stays on the list until it has been fully handled.
+                rr = list_entry(ctx->role_head.list.next, nsr_role_work_t, list);
+                pthread_mutex_unlock(&(ctx->mutex));
+
+                // Do not carry an errno over from a previously handled role.
+                op_errno = 0;
+                state = nsr_recon_driver_get_role(&status, ctx, rr);
+
+                if (status == -1) {
+                        op_errno = EIO;
+                        goto out;
+                }
+
+                switch (state) {
+
+                case leader:
+                        if (!nsr_recon_driver_leader(priv)) {
+                                status = -1;
+                                op_errno = EIO;
+                                goto out;
+                        }
+                        break;
+                case reconciliator:
+                        if (!nsr_recon_driver_reconciliator(priv)) {
+                                status = -1;
+                                op_errno = EIO;
+                                goto out;
+                        }
+                        break;
+                case resolutor:
+                        if (!nsr_recon_driver_resolutor(priv)) {
+                                status = -1;
+                                op_errno = EIO;
+                                goto out;
+                        }
+                        break;
+
+                case joiner:
+
+                        nsr_driver_log (this->name, GF_LOG_INFO, "getting last term info from all members of this group\n");
+                        // Get last term info from all members for this group
+                        // which will be the leader(this node) and the node that wants to join.
+                        send_and_wait(&status, &op_errno, -1,
+                                      replica_group_size,
+                                      ctx,
+                                      NSR_WORK_ID_GET_LAST_TERM_INFO,
+                                      NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term);
+                        if (status == -1)
+                                goto out;
+
+
+                        // send message to other node that just joined to sync up with this node which is also the leader
+                        nsr_driver_log (this->name, GF_LOG_INFO, "sending resolution work to all nodes except this\n");
+                        bm = ~(1);
+                        send_and_wait(&status, &op_errno, bm,
+                                      replica_group_size,
+                                      ctx,
+                                      NSR_WORK_ID_RESOLUTION_DO_WORK,
+                                      NSR_RECON_QUEUE_TO_CONTROL, -1);
+                        if (status == -1)
+                                goto out;
+
+                        nsr_driver_log (this->name, GF_LOG_INFO,
+                                        "finished recon work as joiner \n");
+                        break;
+
+                default:
+                        nsr_driver_log (this->name, GF_LOG_ERROR,
+                                        "bad state %d", state);
+                }
+
+
+                // free the associated recon_info contexts created as part of this role
+
+out:
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "sending end of reconciliation message \n");
+                nsr_recon_return_back(priv, ctx->term, status, op_errno);
+                nsr_driver_log (this->name, GF_LOG_INFO,
+                                "finished sending end of reconciliation message \n");
+
+                // Unlink the handled entry itself, under the same mutex used
+                // for waiting on the list.
+                // NOTE(review): rr is not freed here; ownership of role
+                // entries is not visible from this function - confirm.
+                pthread_mutex_lock(&(ctx->mutex));
+                list_del_init (&rr->list);
+                pthread_mutex_unlock(&(ctx->mutex));
+        }
+
+        return NULL;
+}
diff --git a/xlators/cluster/nsr-recon/src/recon_driver.h b/xlators/cluster/nsr-recon/src/recon_driver.h
new file mode 100644
index 000000000..3efb26269
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_driver.h
@@ -0,0 +1,325 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RECON_DRIVER_H__
+#define __RECON_DRIVER_H__
+
+
+#include "api/src/glfs.h"
+
+/* Size of the per-member name[] buffer in nsr_recon_role_t. */
+#define MAX_HOSTNAME_LEN 32
+/* Number of per-member info[] slots carried in nsr_recon_role_t. */
+#define MAXIMUM_REPLICA_STRENGTH 8
+/* Within this header only referenced by a commented-out records[] bound. */
+#define MAX_RECONCILIATION_WINDOW_SIZE 10000
+
+/*
+ * Path components; presumably joined to locate glusterd's per-volume brick
+ * information -- TODO confirm against the code that uses them.
+ */
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+/*
+ * Even with the names fixed, the non-NSR_DEBUG definitions of nsr_*_log don't
+ * work because many callers don't have "this" defined.
+ *
+ * TBD: use gf_log, fix "this" problem, eliminate extra fields and newlines.
+ */
+#define NSR_DEBUG
+
+/*
+ * Identifiers of the work requests handed to the reconciliation workers.
+ * The enumerators are consecutive, starting at 0.
+ */
+typedef enum nsr_recon_work_req_id_t {
+        NSR_WORK_ID_GET_NONE = 0,
+        NSR_WORK_ID_GET_LAST_TERM_INFO,             /*  1 */
+        NSR_WORK_ID_GET_GIVEN_TERM_INFO,            /*  2 */
+        NSR_WORK_ID_RECONCILIATOR_DO_WORK,          /*  3 */
+        NSR_WORK_ID_RESOLUTION_DO_WORK,             /*  4 */
+        NSR_WORK_ID_GET_RECONCILATION_WINDOW,       /*  5 */
+        NSR_WORK_ID_SINGLE_RECONCILIATION_READ,     /*  6 */
+        NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT,   /*  7 */
+        NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH,    /*  8 */
+        NSR_WORK_ID_GET_RESOLUTION_WINDOW,          /*  9 */
+        NSR_WORK_ID_END_RECONCILIATION,             /* 10 */
+        NSR_WORK_ID_INI,                            /* 11 */
+        NSR_WORK_ID_FINI                            /* 12 */
+} nsr_recon_work_req_id_t;
+
+/* Which of a replica's two worker queues a request is sent to. */
+typedef enum nsr_recon_queue_type_t {
+        NSR_RECON_QUEUE_TO_CONTROL = 0,
+        NSR_RECON_QUEUE_TO_DATA,                    /* 1 */
+} nsr_recon_queue_type_t;
+
+/*
+ * Kind of a changelog record as reported to the reconciliator.
+ * The values are bit patterns (0, 1, 3); they are written in hex because
+ * 0b... binary literals are a compiler extension before C23.
+ */
+typedef enum nsr_log_type_t {
+        NSR_LOG_HOLE        = 0x0,
+        NSR_LOG_PSEUDO_HOLE = 0x1,
+        NSR_LOG_FILL        = 0x3
+} nsr_log_type_t;
+
+/* Execution mode of the reconciliation driver (see nsr_recon_driver_ctx_t.mode). */
+typedef enum nsr_mode_t {
+        NSR_SEQ = 0,
+        NSR_USE_THREADS,                            /* 1 */
+        NSR_ASYNC                                   /* 2 */
+} nsr_mode_t;
+
+/*
+ * Action computed for a single log index during reconciliation.
+ * The enumerators are consecutive, starting at 0.
+ */
+typedef enum nsr_recon_work_type_t {
+        NSR_RECON_WORK_NONE = 0,
+        NSR_RECON_WORK_HOLE_TO_NOOP,                /* 1 */
+        NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE,         /* 2 */
+        NSR_RECON_WORK_COMPARE_PSEUDO_HOLE,         /* 3 */
+        NSR_RECON_WORK_HOLE_TO_FILL,                /* 4 */
+        NSR_RECON_WORK_UNDO_FILL,                   /* 5 */
+} nsr_recon_work_type_t;
+
+/*
+ * Role a node plays in a reconciliation round; carried in
+ * nsr_recon_role_t.role. Consecutive values starting at 0.
+ */
+typedef enum nsr_recon_driver_state_t {
+        none = 0,
+        leader,                                     /* 1 */
+        reconciliator,                              /* 2 */
+        resolutor,                                  /* 3 */
+        joiner,                                     /* 4 */
+} nsr_recon_driver_state_t;
+
+// Role structure: the message a client writes to tell this node which role
+// to play. It is byte-packed (pack 1) and its integer fields are byte-swapped
+// with ENDIAN_CONVERSION_RR, so field order and sizes must not change.
+#pragma pack(push, 1)
+typedef struct _nsr_recon_role_s {
+ uint32_t role; // leader, reconciliator, resolutor or joiner (nsr_recon_driver_state_t)
+ uint32_t num; // number of valid info[] entries; must not exceed MAXIMUM_REPLICA_STRENGTH
+ uint32_t current_term; // current term used in case of leader
+ // In case this is reconciliator, num is set to nodes that were part
+ // of previous term.
+ // In case this is resolutor, num is set to 2.
+ // info[0] - information for this node.
+ // info[1] - information of the reconciliator.
+ // In case this is leader, num is set to this term's membership list
+ // set info.name to all members including the leader
+ struct {
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+ char name[MAX_HOSTNAME_LEN];
+ } info[MAXIMUM_REPLICA_STRENGTH];
+} nsr_recon_role_t;
+#pragma pack(pop)
+
+/*
+ * Byte-swap, in place, the integer fields of an nsr_recon_role_t.
+ *
+ *   rr       - an lvalue of type nsr_recon_role_t (any expression; it is
+ *              parenthesized here, so callers need not add their own parens)
+ *   is_true  - _gf_true: network to host (ntohl); _gf_false: host to network
+ *
+ * num is converted first when decoding and last when encoding, so that the
+ * loop over info[] always runs on the host-order count. The macro walks
+ * info[0..num) without checking num; callers that decode untrusted data must
+ * make sure the count does not exceed MAXIMUM_REPLICA_STRENGTH beforehand.
+ *
+ * NOTE(review): rr.role is not converted here -- confirm whether that is
+ * intentional before relying on it across hosts of different endianness.
+ */
+#define ENDIAN_CONVERSION_RR(rr, is_true) \
+{ \
+        uint32_t i=0; \
+        uint32_t (*f)(uint32_t) = (((is_true) == _gf_true) ? ntohl : htonl); \
+        if ((is_true) == _gf_true) (rr).num = f((rr).num); \
+        (rr).current_term = f((rr).current_term); \
+        for (i=0; i < (rr).num; i++) { \
+                (rr).info[i].last_term = f((rr).info[i].last_term); \
+                (rr).info[i].commited_ops = f((rr).info[i].commited_ops); \
+                (rr).info[i].last_index = f((rr).info[i].last_index); \
+                (rr).info[i].first_index = f((rr).info[i].first_index); \
+        } \
+        if ((is_true) == _gf_false) (rr).num = f((rr).num); \
+}
+
+// Last term info structure: summary of one term's changelog on a node.
+// Byte-packed; returned to readers after ENDIAN_CONVERSION_LT (host to
+// network), so field order and sizes must not change.
+#pragma pack(push, 1)
+typedef struct _nsr_recon_last_term_info_s {
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+} nsr_recon_last_term_info_t;
+#pragma pack(pop)
+
+/*
+ * Byte-swap, in place, every field of an nsr_recon_last_term_info_t.
+ *
+ *   lt       - an lvalue of type nsr_recon_last_term_info_t (parenthesized
+ *              here, so any expression may be passed)
+ *   is_true  - _gf_true: network to host (ntohl); _gf_false: host to network
+ */
+#define ENDIAN_CONVERSION_LT(lt, is_true) \
+{ \
+        uint32_t (*f)(uint32_t) = (((is_true) == _gf_true) ? ntohl : htonl); \
+        (lt).last_term = f((lt).last_term); \
+        (lt).commited_ops = f((lt).commited_ops); \
+        (lt).last_index = f((lt).last_index); \
+        (lt).first_index = f((lt).first_index); \
+}
+
+// Log information: the term and the inclusive index range of records a
+// reader is about to ask for. Byte-packed; decoded with ENDIAN_CONVERSION_LI.
+#pragma pack(push, 1)
+typedef struct _nsr_recon_log_info_s {
+ uint32_t term;
+ uint32_t first_index;
+ uint32_t last_index;
+} nsr_recon_log_info_t;
+#pragma pack(pop)
+
+/*
+ * Byte-swap, in place, every field of an nsr_recon_log_info_t.
+ *
+ *   li       - an lvalue of type nsr_recon_log_info_t (parenthesized here,
+ *              so any expression may be passed)
+ *   is_true  - _gf_true: network to host (ntohl); _gf_false: host to network
+ */
+#define ENDIAN_CONVERSION_LI(li, is_true) \
+{ \
+        uint32_t (*f)(uint32_t) = (((is_true) == _gf_true) ? ntohl : htonl); \
+        (li).term = f((li).term); \
+        (li).first_index = f((li).first_index); \
+        (li).last_index = f((li).last_index); \
+}
+
+// One parsed changelog record as handed to the reconciliator. Byte-packed;
+// type, op, offset and len are byte-swapped by ENDIAN_CONVERSION_RD.
+// NOTE(review): mode is a mode_t, whose width is platform-defined, and it is
+// not touched by ENDIAN_CONVERSION_RD -- confirm this is safe between hosts.
+#pragma pack(push, 1)
+typedef struct nsr_recon_record_details_s {
+ uint32_t type; // nsr_log_type_t value
+ uint32_t op; // fop opcode parsed from the changelog entry
+ char gfid[36+1]; // 36 characters plus terminating NUL
+ char pargfid[36+1]; // 36 characters plus terminating NUL
+ char link_path[256]; // should it be PATH_MAX?
+ uint32_t offset;
+ uint32_t len;
+ char entry[128];
+ char newloc[128]; // for rename. can you overload link_path for this? TBD
+ mode_t mode;
+} nsr_recon_record_details_t;
+#pragma pack(pop)
+
+/*
+ * Byte-swap, in place, the 32-bit integer fields (type, op, offset, len) of
+ * an nsr_recon_record_details_t. The character arrays and mode are left
+ * untouched.
+ *
+ *   rd       - an lvalue of type nsr_recon_record_details_t (parenthesized
+ *              here, so e.g. *rec may be passed directly)
+ *   is_true  - _gf_true: network to host (ntohl); _gf_false: host to network
+ */
+#define ENDIAN_CONVERSION_RD(rd, is_true) \
+{ \
+        uint32_t (*f)(uint32_t) = (((is_true) == _gf_true) ? ntohl : htonl); \
+        (rd).type = f((rd).type); \
+        (rd).op = f((rd).op); \
+        (rd).offset = f((rd).offset); \
+        (rd).len = f((rd).len); \
+}
+
+/*
+ * A role assignment together with its term, linkable into a list through
+ * the embedded list_head (nsr_recon_driver_ctx_t holds one as role_head).
+ */
+typedef struct _nsr_role_work_s {
+ nsr_recon_role_t role;
+ uint32_t term;
+ struct list_head list;
+} nsr_role_work_t;
+
+/*
+ * One work item for a per-node worker, linkable through the embedded
+ * list_head (nsr_per_node_worker_t holds one as head).
+ */
+typedef struct _nsr_recon_work_s {
+ gf_boolean_t in_use;
+ uint32_t index;
+ uint32_t req_id; // presumably an nsr_recon_work_req_id_t value -- TODO confirm
+ struct list_head list;
+} nsr_recon_work_t;
+
+/*
+ * Work computed for a single log index of a term.
+ */
+typedef struct _nsr_reconciliation_work_s {
+ uint32_t term;
+ uint32_t index;
+ uint32_t type; // presumably an nsr_recon_work_type_t value -- TODO confirm
+ uint32_t source;
+ void *data;
+
+ uint32_t num; // used for xattr
+
+} nsr_reconciliation_work_t;
+
+/* Pairs a changelog record with the work computed for it. */
+typedef struct _nsr_reconciliation_record_s {
+ nsr_reconciliation_work_t work; // will store the computed work
+ nsr_recon_record_details_t rec; // the record the work was computed for
+} nsr_reconciliation_record_t;
+
+/*
+ * Per-replica reconciliation state: the term summary (same four fields as
+ * nsr_recon_last_term_info_t) plus a pointer to the records. The records
+ * array is a pointer rather than the fixed-size array that is commented out.
+ */
+typedef struct _nsr_reconciliator_info {
+ uint32_t reconcilator_index;
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+ //nsr_reconciliation_record_t records[MAX_RECONCILIATION_WINDOW_SIZE];
+ nsr_reconciliation_record_t *records;
+} nsr_reconciliator_info_t;
+
+/*
+ * State of one worker thread. nsr_worker_log expects a pointer to this
+ * structure to be in scope under the name "ctx": it reads is_control, index,
+ * fp and driver_ctx from it.
+ */
+typedef struct _nsr_per_node_worker_s {
+ char *id; // identifier
+ char vol_file[256]; //volfile that will be used by this thread
+ glfs_t *fs;
+ glfs_fd_t *aux_fd;
+ uint32_t index; // index into array of workers
+ pthread_t thread_id; // thread id
+ void * context; // thread context
+ struct _nsr_recon_driver_ctxt *driver_ctx; // owning driver context
+ char local; // local data worker
+ //struct list_head list; //list of work items
+ nsr_recon_work_t head;
+ pthread_mutex_t mutex; //mutex to guard the state
+ pthread_cond_t cv; //condition variable for signaling the worker thread
+ gf_boolean_t is_control; // selects the "recon-con" vs "recon-data" log tag
+#if defined(NSR_DEBUG)
+ FILE *fp; // log stream handed to _nsr_worker_log
+#endif
+ int32_t result; // result of latest work
+ int32_t op_errno; // errno
+} nsr_per_node_worker_t;
+
+/* Per-replica bundle: a control worker, a data worker and recon state. */
+typedef struct _nsr_replica_worker_s {
+ char name[256];
+ nsr_per_node_worker_t *control_worker;
+ nsr_per_node_worker_t *data_worker;
+ gf_boolean_t in_use;
+ nsr_reconciliator_info_t *recon_info; // Bunch of infos kept for this reconciliation
+} nsr_replica_worker_t;
+
+/*
+ * Context of the reconciliation driver. nsr_driver_log expects a pointer to
+ * this structure to be in scope under the name "ctx": it reads this and fp.
+ */
+typedef struct _nsr_recon_driver_ctxt {
+ xlator_t *this;
+ uint32_t replica_group_size; // number of static members of replica group
+ nsr_replica_worker_t *workers; // worker info
+ int32_t reconciliator;
+ pthread_mutex_t mutex;
+ pthread_cond_t cv;
+ nsr_role_work_t role_head;
+ volatile int32_t outstanding;
+ uint32_t reconciliator_index;
+ uint32_t term;
+ uint32_t current_term;
+ nsr_mode_t mode; // default set to seq
+#if defined(NSR_DEBUG)
+ FILE *fp; // log stream handed to _nsr_driver_log
+#endif
+} nsr_recon_driver_ctx_t;
+
+/* Driver thread entry point (pthread-style signature). */
+void *
+nsr_reconciliation_driver(void *);
+
+/*
+ * Hand a role for the given term to the driver. The writev handler treats a
+ * _gf_false return as failure and unwinds the request with an error.
+ */
+gf_boolean_t
+nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx, nsr_recon_role_t *rr, uint32_t term);
+
+/* Thin wrappers over the GCC __sync builtins; inc/dec discard the old value. */
+#define atomic_inc(ptr)  ((void) __sync_fetch_and_add(ptr, 1))
+#define atomic_dec(ptr)  ((void) __sync_fetch_and_sub(ptr, 1))
+#define atomic_fetch_and __sync_fetch_and_and
+#define atomic_fetch_or  __sync_fetch_and_or
+
+/*
+ * Logging front ends. With NSR_DEBUG defined, messages whose level is at or
+ * below nsr_debug_level are written through _nsr_driver_log/_nsr_worker_log;
+ * otherwise both macros forward to gf_log. In the NSR_DEBUG variants the
+ * "dom" argument is not used.
+ */
+#if defined(NSR_DEBUG)
+
+#define NSR_LOG_DIR "/var/log/nsr-logs"
+
+extern int nsr_debug_level;
+extern FILE *recon_create_log (char *member, char *module);
+
+extern void
+_nsr_driver_log (const char *func, int line, char *member, FILE *fp,
+ char *fmt, ...);
+
+/*
+ * Requires a variable named "ctx" (nsr_recon_driver_ctx_t *) in the calling
+ * scope. Declares its own local "priv" inside the do-block, which shadows
+ * any "priv" of the caller for the duration of the macro.
+ */
+#define nsr_driver_log(dom, levl, fmt...) do { \
+ FMT_WARN (fmt); \
+ if (levl <= nsr_debug_level) { \
+ nsr_recon_private_t *priv = ctx->this->private; \
+ _nsr_driver_log (__FUNCTION__, __LINE__, \
+ priv->replica_group_members[0], \
+ ctx->fp, \
+ ##fmt); \
+ } \
+} while (0)
+
+extern void
+_nsr_worker_log (const char *func, int line, char *member,
+ char *type, uint32_t index, FILE *fp,
+ char *fmt, ...);
+
+/*
+ * Requires a variable named "ctx" (nsr_per_node_worker_t *) in the calling
+ * scope; tags the message "recon-con" or "recon-data" from ctx->is_control.
+ */
+#define nsr_worker_log(dom, levl, fmt...) do { \
+ FMT_WARN (fmt); \
+ if (levl <= nsr_debug_level) { \
+ nsr_recon_private_t *priv; \
+ priv = ctx->driver_ctx->this->private; \
+ _nsr_worker_log (__FUNCTION__, __LINE__, \
+ priv->replica_group_members[0], \
+ ctx->is_control ? "recon-con" : \
+ "recon-data", \
+ ctx->index, ctx->fp, \
+ ##fmt); \
+ } \
+} while (0)
+
+#else
+#define nsr_driver_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#define nsr_worker_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#endif
+
+#endif /* #ifndef __RECON_DRIVER_H__ */
diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.c b/xlators/cluster/nsr-recon/src/recon_xlator.c
new file mode 100644
index 000000000..272c35dc2
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_xlator.c
@@ -0,0 +1,1010 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+
+#include "recon_driver.h"
+#include "recon_xlator.h"
+
+/*
+ * Per-fd context stored with fd_ctx_set at open time. The writev handler
+ * records the requested term (and, for the record window, first_index and
+ * last_index) here; the following readv uses those values to build its reply.
+ */
+typedef struct _nsr_recon_fd_s {
+ int32_t term;
+ nsr_recon_driver_state_t state;
+ uint32_t first_index;
+ uint32_t last_index;
+ call_frame_t *frame;
+} nsr_recon_fd_t;
+
+#if defined(NSR_DEBUG)
+
+/*
+ * Format a message printf-style and append it to a log stream as
+ * "[func:line] message".
+ *
+ *   func, line - call site, printed as the prefix
+ *   member     - passed to recon_create_log when fp is NULL
+ *   fp         - stream to write to; when NULL a "recon-main-log" stream is
+ *                obtained from recon_create_log for this one call
+ *   fmt, ...   - printf-style format and arguments
+ *
+ * The message is silently dropped if no stream can be obtained or if the
+ * formatting fails.
+ *
+ * NOTE(review): a stream obtained here is neither stored nor closed; whether
+ * that leaks depends on what recon_create_log returns -- confirm.
+ */
+void
+_recon_main_log (const char *func, int line, char *member, FILE *fp,
+                 char *fmt, ...)
+{
+        va_list  ap;
+        char    *buf    = NULL;
+        int      retval = -1;
+
+        if (!fp) {
+                fp = recon_create_log(member,"recon-main-log");
+                if (!fp) {
+                        return;
+                }
+        }
+
+        va_start(ap,fmt);
+        retval = vasprintf(&buf,fmt,ap);
+        va_end(ap);
+
+        /*
+         * On failure vasprintf returns -1 and leaves buf undefined, so the
+         * return value (not the pointer) decides whether buf may be used.
+         */
+        if (retval < 0) {
+                return;
+        }
+
+        fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf);
+        free(buf);
+}
+
+#endif
+
+// Fetch the recon context previously attached to fd.
+// Returns 0 and stores the context in *rfd on success; otherwise returns
+// fd_ctx_get's non-zero result and leaves *rfd untouched.
+static int32_t this_fd_ctx_get(fd_t *fd, xlator_t *this, nsr_recon_fd_t **rfd)
+{
+        uint64_t raw = 0;
+        int32_t  rc  = fd_ctx_get(fd, this, &raw);
+
+        if (rc != 0) {
+                return rc;
+        }
+
+        *rfd = (nsr_recon_fd_t *)raw;
+        return 0;
+}
+
+// Remember the frame of the request that started a role, so that it can be
+// answered once the role's work completes. Only a single slot exists; the
+// term is logged but not otherwise used yet.
+static void put_frame(nsr_recon_private_t *priv,
+                      call_frame_t *frame,
+                      uint32_t term)
+{
+        xlator_t *this = priv->this;
+
+        recon_main_log (this->name, GF_LOG_INFO, "adding frame for term %d \n", term);
+        priv->frame = frame;
+}
+
+// Take the remembered frame out of its slot. The frame is handed back through
+// *frame when the caller supplies a destination; the slot is cleared either
+// way. The term is not used yet.
+static void get_frame(nsr_recon_private_t *priv,
+                      call_frame_t **frame,
+                      uint32_t term)
+{
+        if (frame) {
+                *frame = priv->frame;
+        }
+        priv->frame = NULL;
+}
+
+// Report whether a frame is currently remembered, i.e. a role is in progress.
+static gf_boolean_t is_frame(nsr_recon_private_t *priv)
+{
+        if (priv->frame == NULL) {
+                return _gf_false;
+        }
+        return _gf_true;
+}
+
+#define ENTRY_SIZE 128
+
+/*
+ * Count the used entries in a file of fixed-size (ENTRY_SIZE byte) entries.
+ *
+ * An entry counts as used when it starts with "_P". The count is found by
+ * binary search over the entry slots, which is only meaningful if all used
+ * entries come before all unused ones. Slot 0 is never inspected; it is
+ * taken to be used.
+ *
+ * Returns the index of the first slot found unused (the file's slot count if
+ * none is), or -1 if the file cannot be opened, stat'ed, seeked or read.
+ * Progress is printed on stdout.
+ */
+long
+get_entry_count (char *path)
+{
+        unsigned long  entries = -1;
+        struct stat    st;
+        char           entry[ENTRY_SIZE];
+        long           lo;     /* last slot not known to be empty */
+        long           hi;     /* first slot known to be empty */
+        long           mid;
+        int            fd;
+
+        fd = open(path,O_RDONLY);
+        if (fd < 0) {
+                goto done;
+        }
+
+        if (fstat(fd,&st) < 0) {
+                goto close_fd;
+        }
+
+        lo = 0;
+        hi = st.st_size / ENTRY_SIZE;
+        printf("max = %ld\n",hi);
+
+        for (;;) {
+                if ((lo+1) >= hi) {
+                        break;
+                }
+                mid = (lo + hi) / 2;
+                printf("trying entry %ld\n",mid);
+                if (lseek(fd,mid*ENTRY_SIZE,SEEK_SET) < 0) {
+                        goto close_fd;
+                }
+                if (read(fd,entry,sizeof(entry)) != sizeof(entry)) {
+                        goto close_fd;
+                }
+                if ((entry[0] != '_') || (entry[1] != 'P')) {
+                        hi = mid;
+                }
+                else {
+                        lo = mid;
+                }
+        }
+
+        entries = hi;
+
+close_fd:
+        close(fd);
+done:
+        return entries;
+}
+
+// Fill *lt with the summary of the changelog file "<bp>/TERM.<term>".
+//
+// *lt is zeroed first and last_term is set to term. When the file holds more
+// than one entry, first_index is 1 (entry 0 is a header) and last_index and
+// commited_ops are both set to the entry count; otherwise they stay 0 (this
+// includes the case where the file cannot be read).
+void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt)
+{
+        char path[PATH_MAX];
+        long entries;
+
+        bzero(lt, sizeof(nsr_recon_last_term_info_t));
+        lt->last_term = term;
+        /* bounded: bp comes from configuration and may be arbitrarily long */
+        snprintf(path, sizeof(path), "%s/%s%d",bp,"TERM.",term);
+        entries = get_entry_count(path);
+        if (entries > 1) {
+                /* The first entry is actually a header. */
+                lt->first_index = 1;
+                /*
+                 * This seems wrong, because it means that last_index*128 will
+                 * be exactly at EOF and commited_ops will be one greater than
+                 * it should be. Maybe some other code makes the exact
+                 * opposite mistake to compensate.
+                 */
+                lt->last_index = lt->commited_ops = (int)entries;
+        }
+        recon_main_log (this->name, GF_LOG_INFO, "for term=%d got first_index=%d last_index=%d commited_ops=%d\n",
+                        term, lt->first_index, lt->last_index, lt->commited_ops);
+        return;
+}
+
+// Find the newest term, at or below the given one, that has a changelog file
+// "<bp>/TERM.<n>", and fill *lt with that term's summary.
+//
+// *lt is zeroed first and stays zeroed when no such file exists or when term
+// is not positive. Terms are probed downwards from term to 1.
+void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt)
+{
+        int32_t t;
+        struct stat buf;
+        char path[PATH_MAX];
+
+        bzero(lt, sizeof(nsr_recon_last_term_info_t));
+        /*
+         * A signed counter: with an unsigned one a negative term would wrap
+         * to a huge value and probe billions of non-existent files.
+         */
+        for (t = term; t > 0; t--) {
+                // journal file is of type TERM-1.jnl
+                snprintf(path, sizeof(path), "%s/%s%d",bp,"TERM.",t);
+                if (!stat(path, &buf)) {
+                        nsr_recon_libchangelog_get_this_term_info(this, bp, t, lt);
+                        recon_main_log (this->name, GF_LOG_INFO, "got last term given current term %d as %d\n", term, t);
+                        return;
+                }
+        }
+        recon_main_log (this->name, GF_LOG_INFO, "got no last term given current term %d \n", term);
+
+        return;
+}
+
+// Answer the writev request that started a role: take the remembered frame
+// and unwind it with the given status and errno. Logs an error when no frame
+// is remembered.
+void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term, int32_t status, int32_t op_errno)
+{
+        xlator_t     *this    = priv->this;
+        call_frame_t *pending = NULL;
+
+        get_frame(priv, &pending, term);
+        if (!pending) {
+                recon_main_log (this->name, GF_LOG_ERROR, "EIII---nsr_recon_writev cnnot return old frame \n");
+                return;
+        }
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev returns old frame \n");
+        // answer the original write for which this ack was sent
+        STACK_UNWIND_STRICT (writev, pending, status, op_errno, NULL, NULL, NULL);
+}
+
+/*
+ * Field layout of a changelog record, named after the fields it carries.
+ * Consecutive values starting at 1.
+ */
+typedef enum records_type_t {
+        fop_gfid_pgfid_oldloc_newloc = 1,
+        fop_gfid_pgfid_entry,                       /* 2 */
+        fop_gfid,                                   /* 3 */
+        fop_gfid_offset,                            /* 4 */
+        fop_gfid_offset_len,                        /* 5 */
+} records_type_t;
+
+// Build "<base_dir>/.glusterfs/<g0g1>/<g2g3>/<gfid>" into path, where g0..g3
+// are the first four characters of gfid.
+// The output is not bounded: the caller must supply a large enough buffer.
+static void
+get_gfid_path(nsr_recon_private_t *priv, char *gfid, char *path)
+{
+        sprintf(path, "%s/.glusterfs/%.2s/%.2s/%s",
+                priv->base_dir, gfid, gfid + 2, gfid);
+}
+
+
+// Resolve the symlink stored at the backend gfid path and copy its target
+// into path as a NUL-terminated string.
+//
+// At most 255 bytes of the target are stored, so path must have room for
+// 256 bytes. Returns _gf_true on success; logs and returns _gf_false when
+// readlink fails.
+static gf_boolean_t
+get_link_using_gfid(nsr_recon_private_t *priv, char *gfid, char *path)
+{
+        char lp[PATH_MAX];
+        xlator_t *this = priv->this;
+        ssize_t len;
+
+        get_gfid_path(priv,gfid, lp);
+        len = readlink(lp, path, 255);
+        if (len == -1) {
+                /* log first so the message is not lost when the assert aborts */
+                recon_main_log(this->name, GF_LOG_ERROR,
+                               "cannot get readlink for %s\n",lp);
+                GF_ASSERT(0);
+                return _gf_false;
+        }
+        /* readlink does not terminate the string it stores */
+        path[len] = '\0';
+        return _gf_true;
+}
+
+// Get the list of changelog records given a term , first and last index.
+//
+// TBD: rewrite this hideous ball of mud in at least the following ways:
+//
+// (1) Break out the code for handling a single record into a separate
+// function, to make error handling easier and reduce "indentation
+// creep" so the code's readable.
+//
+// (2) Change all of the fop_xxx_yyy nonsense to OR together values
+// like FOP_HAS_FIELD_XXX and FOP_HAS_FIELD_YYY, to reduce code
+// duplication and facilitate the addition of new fields.
+//
+// (3) Stop making so many assumptions about the underlying formats.
+// The code as it is won't even work for the existing binary format,
+// let alone as changelog evolves over time.
+//
+// Really, 90% of this code should just GO AWAY in favor of using
+// libgfchangelog, enhanced as necessary to support our needs.
+
+/*
+ * Use this macro to skip over a field we're not using yet.
+ * NB: the body is a null statement on purpose
+ * TBD: all instances of this should be removed eventually!
+ */
+#define SKIP_FIELD do /* nothing */ ; while (*(start++) != '\0')
+
+#define SKIP_OVER
+
+/*
+ * Copy the NUL-terminated field at *cursor into dst and advance *cursor past
+ * the field's terminator. At most cap-1 characters are stored and dst is
+ * always terminated; any excess characters are consumed but dropped.
+ */
+static void
+recon_copy_field (char **cursor, char *dst, size_t cap)
+{
+        size_t n = 0;
+
+        while (**cursor != '\0') {
+                if (n + 1 < cap) {
+                        dst[n++] = **cursor;
+                }
+                (*cursor)++;
+        }
+        dst[n] = '\0';
+        (*cursor)++;
+}
+
+/*
+ * Read changelog entries first..last (inclusive) of "<bp>/TERM.<term>" and
+ * parse each into one nsr_recon_record_details_t in buf.
+ *
+ *   buf - array of (last - first + 1) records supplied by the caller; it is
+ *         expected to be zeroed, because entries that do not start with
+ *         "_PRE_" are left untouched.
+ *
+ * Entries are 128 bytes each. Recognized entries are marked NSR_LOG_FILL and
+ * their integer fields are converted to network byte order; entries with an
+ * unknown opcode are marked NSR_LOG_HOLE.
+ *
+ * Returns _gf_true once the range has been processed, _gf_false if the
+ * working buffer cannot be allocated or the file cannot be opened or read.
+ */
+gf_boolean_t nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf)
+{
+        // do a mmap; seek into the first and read all records till last.
+        // TBD - right now all records are pseudo holes but mark them as fills.
+        // TBD - pseudo hole to be implemented when actual fsync gets done on data.
+        char *rb = NULL, *orig = NULL;
+        char *start = NULL;
+        char path[PATH_MAX];
+        int fd;
+        uint32_t index = 0;
+        /* computed in size_t so a large range cannot wrap in 32 bits */
+        size_t nbytes = ((size_t)(last - first) + 1) * 128;
+        nsr_recon_record_details_t * rec = (nsr_recon_record_details_t *)buf;
+
+        recon_main_log (this->name, GF_LOG_INFO,
+                        "libchangelog_get_records called for term %d index from %d to %d \n",
+                        term, first, last );
+
+        /*
+         * One spare byte, zeroed by the allocator, terminates the buffer so
+         * the NUL scans below cannot run off its end on a malformed entry.
+         */
+        orig = rb = GF_CALLOC(1, nbytes + 1, gf_mt_recon_changelog_buf_t);
+        if (!orig) {
+                return _gf_false;
+        }
+
+        snprintf(path, sizeof(path), "%s/%s%d",bp,"TERM.",term);
+        fd = open(path, O_RDONLY);
+        if (fd == -1) {
+                GF_FREE(orig);
+                return _gf_false;
+        }
+
+        if (first == 0)
+                lseek(fd, 128, SEEK_SET);
+        else
+                lseek(fd, (off_t)first * 128, SEEK_SET);
+        if (read(fd, rb, nbytes) == -1) {
+                close(fd);
+                GF_FREE(orig);
+                return _gf_false;
+        }
+        start = rb;
+        index = first;
+        do {
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "libchangelog_get_records start inspecting records at index %d \n",
+                                index );
+                if (!strncmp(start, "_PRE_", 5)) {
+                        uint32_t i;
+                        uint32_t opcode = 0;
+                        records_type_t type;
+
+                        start += 5;
+                        // increment by the NULLs after the PRE
+                        start += 4;
+                        SKIP_FIELD; // real index
+                        // now we have the opcode
+                        while (*start != '\0') {
+                                opcode *= 10;
+                                opcode += (*(start++) - '0');
+                        }
+                        ++start;
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "libchangelog_get_records: got opcode %d @index %d\n", opcode, index);
+                        if (opcode == GF_FOP_RENAME) {
+                                type = fop_gfid_pgfid_oldloc_newloc;
+                        } else if ((opcode == GF_FOP_UNLINK) ||
+                                   (opcode == GF_FOP_RMDIR) ||
+                                   (opcode == GF_FOP_LINK) ||
+                                   (opcode == GF_FOP_MKDIR) ||
+                                   (opcode == GF_FOP_SYMLINK) ||
+                                   (opcode == GF_FOP_MKNOD) ||
+                                   (opcode == GF_FOP_CREATE)) {
+                                type = fop_gfid_pgfid_entry;
+                        } else if ((opcode == GF_FOP_FSETATTR) ||
+                                   (opcode == GF_FOP_SETATTR) ||
+                                   (opcode == GF_FOP_FREMOVEXATTR) ||
+                                   (opcode == GF_FOP_REMOVEXATTR) ||
+                                   (opcode == GF_FOP_SETXATTR) ||
+                                   (opcode == GF_FOP_FSETXATTR)) {
+                                type = fop_gfid;
+                        } else if ((opcode == GF_FOP_TRUNCATE) ||
+                                   (opcode == GF_FOP_FTRUNCATE)) {
+                                type = fop_gfid_offset;
+                        } else if (opcode == GF_FOP_WRITE) {
+                                type = fop_gfid_offset_len;
+                        } else {
+                                recon_main_log (this->name,
+                                                GF_LOG_ERROR,
+                                                "libchangelog_get_records:got no proper opcode %d @index %d\n",
+                                                opcode, index);
+                                //GF_ASSERT(0);
+                                // make this as a hole.
+                                // TBD - check this logic later. maybe we should raise alarm here because
+                                // this means that changelog is corrupted. We are not handling changelog
+                                // corruptions as of now.
+                                rec->type = NSR_LOG_HOLE;
+                                goto finish;
+                        }
+                        // TBD - handle psuedo holes once that logic is in.
+                        rec->type = NSR_LOG_FILL;
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "libchangelog_get_records:got type %d at index %d \n",
+                                        rec->type, index);
+                        rec->op = opcode;
+
+                        // Now get the gfid and parse it
+                        // before that increment the pointer
+                        for (i=0; i < 36; i++) {
+                                rec->gfid[i] = (*start);
+                                start++;
+                        }
+                        rec->gfid[i] = '\0';
+
+                        GF_ASSERT(*start == 0);
+                        start ++;
+
+                        if (opcode == GF_FOP_SYMLINK) {
+                                i = 0;
+                                do {
+                                        if (i >= 256) {
+                                                goto finish;
+                                        }
+                                        rec->link_path[i++] = *start;
+                                } while (*(start++) != '\0');
+                        }
+
+                        // If type is fop_gfid_offset+_len, get offset
+                        if ((type == fop_gfid_offset) || (type == fop_gfid_offset_len)) {
+                                char offset_str[128];
+
+                                recon_copy_field(&start, offset_str, sizeof(offset_str));
+                                rec->offset = strtoul(offset_str, NULL, 10);
+                                recon_main_log (this->name,
+                                                GF_LOG_ERROR,
+                                                "libchangelog_get_records:got offset %d @index %d \n", rec->offset, index);
+
+                        }
+                        if (type == fop_gfid_offset_len) {
+                                char len_str[128];
+
+                                recon_copy_field(&start, len_str, sizeof(len_str));
+                                rec->len = strtoul(len_str, NULL, 10);
+                                recon_main_log (this->name,
+                                                GF_LOG_ERROR,
+                                                "libchangelog_get_records:got length %d @index %d \n", rec->len, index);
+                        }
+                        if (type == fop_gfid_pgfid_entry) {
+                                switch (opcode) {
+                                case GF_FOP_CREATE:
+                                case GF_FOP_MKDIR:
+                                case GF_FOP_MKNOD:
+                                        SKIP_FIELD; // mode
+                                        break;
+                                /* TBD: handle GF_FOP_SYMLINK target */
+                                default:
+                                        ;
+                                }
+                                SKIP_FIELD; // uid
+                                SKIP_FIELD; // gid
+                                if (opcode == GF_FOP_MKNOD) {
+                                        SKIP_FIELD; // dev
+                                }
+                                // first get the gfid and then the path
+                                for (i=0; i < 36; i++) {
+                                        rec->pargfid[i] = (*start);
+                                        start++;
+                                }
+                                rec->pargfid[i] = '\0';
+                                GF_ASSERT(*start == '/');
+                                start ++;
+
+                                recon_copy_field(&start, rec->entry, sizeof(rec->entry));
+                                /*
+                                 * Having to add this as a special case
+                                 * is awful. See the function header
+                                 * comment for the real solution.
+                                 */
+                                if (opcode == GF_FOP_CREATE) {
+                                        rec->mode = 0;
+                                        while (*start != '\0') {
+                                                rec->mode *= 10;
+                                                rec->mode += *start
+                                                             - '0';
+                                                ++start;
+                                        }
+                                        ++start;
+                                }
+                                recon_main_log (this->name,
+                                                GF_LOG_ERROR,
+                                                "libchangelog_get_records:got entry %s @index %d \n", rec->entry, index);
+
+                        }
+                        if (type == fop_gfid_pgfid_oldloc_newloc) {
+
+                                // first get the source and then the destination
+                                // source stuff gets stored in pargfid/entry
+                                for (i=0; i < 36; i++) {
+                                        rec->pargfid[i] = (*start);
+                                        start++;
+                                }
+                                rec->pargfid[i] = '\0';
+                                GF_ASSERT(*start == '/');
+                                start ++;
+
+                                recon_copy_field(&start, rec->entry, sizeof(rec->entry));
+
+                                // dst stuff gets stored in gfid/newloc
+                                for (i=0; i < 36; i++) {
+                                        rec->gfid[i] = (*start);
+                                        start++;
+                                }
+                                rec->gfid[i] = '\0';
+                                GF_ASSERT(*start == '/');
+                                start ++;
+
+                                recon_copy_field(&start, rec->newloc, sizeof(rec->newloc));
+
+                        }
+                        ENDIAN_CONVERSION_RD((*rec), _gf_false); //htonl
+                }
+finish:
+                if (index == last)
+                        break;
+                index++;
+                rb += 128;
+                start = rb;
+                rec++;
+        } while(1);
+
+        GF_FREE(orig);
+        close(fd);
+
+        recon_main_log (this->name, GF_LOG_INFO,
+                        "libchangelog_get_records finsihed inspecting records for term %d \n",
+                        term);
+        return _gf_true;
+}
+
+/*
+ * open fop: allocate a zeroed nsr_recon_fd_t and attach it to fd as this
+ * translator's fd context.
+ *
+ * Always unwinds the frame and returns 0. The unwind carries op_ret 0 on
+ * success; -1 with ENOMEM when the context cannot be allocated, or -1 with
+ * EINVAL when it cannot be attached (the context is freed in that case).
+ */
+int32_t
+nsr_recon_open (call_frame_t *frame, xlator_t *this,
+                loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
+{
+        int32_t op_ret = 0;
+        int32_t op_errno = 0;
+        nsr_recon_fd_t *rfd = NULL;
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open called for path %s \n",loc->path );
+        rfd = GF_CALLOC (1, sizeof (*rfd), gf_mt_recon_fd_t);
+        if (!rfd) {
+                /* do not go on to register a NULL context */
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        op_ret = fd_ctx_set (fd, this, (uint64_t)(long)rfd);
+        if (op_ret) {
+                /* the context was not attached, so nobody else will free it */
+                GF_FREE (rfd);
+                op_ret = -1;
+                op_errno = EINVAL;
+        }
+out:
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open returns with %d for path %s \n",op_ret,loc->path );
+        STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL);
+        return 0;
+}
+
+/*
+ * writev fop: the write offset selects a command and the first iovec carries
+ * its argument (all integers in network byte order).
+ *
+ *   nsr_recon_xlator_sector_1 - an nsr_recon_role_t: start playing a role.
+ *       On success the frame is NOT unwound here; it is remembered and
+ *       answered later through nsr_recon_return_back. Fails with EINVAL for
+ *       an unknown role or too many members, EAGAIN while another role is
+ *       in progress, EIO when the driver refuses the role.
+ *   nsr_recon_xlator_sector_2 - an nsr_recon_log_info_t: remember the term
+ *       and index window for the following read.
+ *   nsr_recon_xlator_sector_3 / _4 - an int32_t term: remember the term for
+ *       the following read.
+ *   any other offset - fails with EINVAL.
+ *
+ * Every path answers the frame exactly once (immediately or, for an accepted
+ * role, later). Returns 0.
+ */
+int32_t
+nsr_recon_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  struct iovec *vector, int32_t count, off_t offset,
+                  uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+        nsr_recon_fd_t *rfd = NULL;
+        nsr_recon_private_t *priv = NULL;
+        int32_t op_ret = 0;
+        int32_t op_errno = 0;
+        int32_t ret = 0;
+
+        ret = this_fd_ctx_get (fd, this, &rfd);
+        if (ret < 0) {
+                /* the caller waits for an unwind; returning alone would hang it */
+                STACK_UNWIND_STRICT (writev, frame, -1, EBADF,
+                                     NULL, NULL, NULL);
+                return 0;
+        }
+        priv = (nsr_recon_private_t *)this->private;
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called for offset %d \n",(unsigned int)offset );
+        GF_ASSERT(count == 1);
+        switch (offset) {
+        // client(brick, leader) writes the role of the node
+        case nsr_recon_xlator_sector_1 :
+        {
+                nsr_recon_role_t rr;
+                memcpy((void *)&rr, (void *)vector[0].iov_base, sizeof(rr));
+
+                /*
+                 * The conversion macro walks rr.info[0..num), so the count
+                 * must be checked before it runs, not after.
+                 */
+                if (ntohl(rr.num) > MAXIMUM_REPLICA_STRENGTH) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "nsr_recon_writev set_role - too many members \n");
+                        STACK_UNWIND_STRICT (writev, frame, -1, EINVAL,
+                                             NULL, NULL, NULL);
+                        break;
+                }
+                ENDIAN_CONVERSION_RR(rr, _gf_true); //ntohl
+
+                recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called to set role %d\n", rr.role);
+                if ((rr.role != leader) &&
+                    (rr.role != reconciliator) &&
+                    (rr.role != resolutor) &&
+                    (rr.role != joiner)) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "EIII---nsr_recon_writev cannot set state \n");
+                        STACK_UNWIND_STRICT (writev, frame, -1, EINVAL,
+                                             NULL, NULL, NULL);
+                        /* the frame is answered; it must not be used again */
+                        break;
+                }
+
+                // Check if already a role play is going on. If yes return with EAGAIN.
+                // Ideally we should check if we have got a higher term number while
+                // servicing a lower term number; if so abort the older one.
+                // However the abort infrastructure needs to be sketched properly; TBD.
+                if (is_frame(priv) == _gf_true) {
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "nsr_recon_writev set_role - already role play \n");
+                        STACK_UNWIND_STRICT (writev, frame, -1, EAGAIN,
+                                             NULL, NULL, NULL);
+                } else {
+
+                        // Store the stack frame so that when the actual job gets finished
+                        // we send the response back to the brick.
+                        put_frame(priv, frame, rr.current_term);
+                        if (nsr_recon_driver_set_role(priv->driver_thread_context,
+                                                      &rr,
+                                                      rr.current_term) == _gf_false) {
+                                get_frame(priv, NULL, rr.current_term);
+                                recon_main_log (this->name, GF_LOG_ERROR,
+                                                "nsr_recon_writev set_role - cannot seem to set role \n");
+                                STACK_UNWIND_STRICT (writev, frame, -1, EIO,
+                                                     NULL, NULL, NULL);
+                        } else {
+                                recon_main_log (this->name, GF_LOG_INFO,
+                                                "nsr_recon_writev set_role - set role succesfully \n");
+                        }
+                }
+                break;
+        }
+        // client(reconciliator) writes how much it needs for the read
+        case nsr_recon_xlator_sector_2 :
+        {
+                nsr_recon_log_info_t li;
+                memcpy((void *)&li, (void *)vector[0].iov_base, sizeof(li));
+                ENDIAN_CONVERSION_LI(li, _gf_true); //ntohl
+
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_writev - setting term info for reconcilation info. term=%d, first_index=%d,start_index=%d \n",
+                                li.term, li.first_index, li.last_index);
+                rfd->term = li.term;
+                rfd->last_index = li.last_index;
+                rfd->first_index = li.first_index;
+                STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+                                     NULL, NULL, NULL);
+                break;
+        }
+        // client(reconciliator) writes term for which it needs info
+        case nsr_recon_xlator_sector_3 :
+        {
+                int32_t term;
+
+                memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term));
+                term = ntohl(term); //ntohl
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_writev - setting term info for term info. term=%d\n",
+                                term);
+                rfd->term = term;
+                STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+                                     NULL, NULL, NULL);
+                break;
+        }
+        // client(reconciliator) writes current term so that it gets last term info later
+        case nsr_recon_xlator_sector_4 :
+        {
+                int32_t term;
+
+                memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term));
+                term = ntohl(term); //ntohl
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_writev - setting term info for last term info given current term=%d\n",
+                                term);
+                rfd->term = term;
+                STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+                                     NULL, NULL, NULL);
+                break;
+        }
+        default:
+        {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "nsr_recon_writev called with wrong offset\n");
+                STACK_UNWIND_STRICT (writev, frame, -1, EINVAL,
+                                     NULL, NULL, NULL);
+                break;
+        }
+        }
+
+        return 0;
+}
+
+/*
+ * readv fop: the read offset selects which answer is produced, using the
+ * values a preceding writev stored in the fd context. Integers are returned
+ * in network byte order.
+ *
+ *   nsr_recon_xlator_sector_3 - nsr_recon_last_term_info_t for the stored
+ *       term; size must equal the size of that structure.
+ *   nsr_recon_xlator_sector_2 - the records of the stored index window; size
+ *       must equal (last_index - first_index + 1) records.
+ *   nsr_recon_xlator_sector_4 - nsr_recon_last_term_info_t of the newest term
+ *       at or below the stored one; size as for sector 3.
+ *
+ * Always unwinds the frame and returns 0. On success the unwind carries size
+ * as op_ret; on failure -1 with ENOMEM (no buffer), EINVAL (unknown offset
+ * or unexpected size) or the negated fd-context lookup result.
+ */
+int
+nsr_recon_readv (call_frame_t *frame, xlator_t *this,
+                 fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+        nsr_recon_fd_t *rfd = NULL;
+        int32_t op_errno = 0;
+        // copied stuff from quick-read.c and posix.c
+        struct iobuf *iobuf = NULL;
+        struct iobref *iobref = NULL;
+        struct iovec iov = {0, };
+        int32_t ret = -1;
+        nsr_recon_private_t *priv = NULL;
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        iobref_add (iobref, iobuf);
+
+        ret = this_fd_ctx_get (fd, this, &rfd);
+        if (ret < 0) {
+                op_errno = -ret;
+                goto out;
+        }
+        /* the lookup left 0 in ret; later failures must still report -1 */
+        ret = -1;
+        priv = (nsr_recon_private_t *)this->private;
+
+        recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_readv called for offset %d \n",(unsigned int)offset );
+        switch (offset) {
+        // client(leader) reads from here to get info for this term on this node
+        // invole libchagelog to get the information
+        case nsr_recon_xlator_sector_3 :
+        {
+                nsr_recon_last_term_info_t lt;
+
+                /* size drives the copy below, so it must match exactly */
+                if (size != sizeof(lt)) {
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, rfd->term, &lt);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - getting term info for term=%d, ops=%d, first=%d, last=%d\n",
+                                rfd->term, lt.commited_ops, lt.first_index, lt.last_index);
+                ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl
+                memcpy(iobuf->ptr, &lt, size);
+                goto out;
+        }
+        // client(reconciliator) reads individual record information
+        case nsr_recon_xlator_sector_2 :
+        {
+                uint32_t num = (rfd->last_index - rfd->first_index + 1);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - expected size %lu got size %lu\n",
+                                (num * sizeof(nsr_recon_record_details_t)), size);
+
+                /* the parser writes num records into the buffer */
+                if (size != (num * sizeof(nsr_recon_record_details_t))) {
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                bzero(iobuf->ptr, size);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - getting records for term=%d from %d to %d\n",
+                                rfd->term, rfd->first_index, rfd->last_index);
+                if (nsr_recon_libchangelog_get_records(this, priv->changelog_base_path,
+                                                       rfd->term, rfd->first_index, rfd->last_index, iobuf->ptr) == _gf_false) {
+                        /*
+                         * Best effort: the zeroed buffer (all holes) is still
+                         * returned, but the failure is no longer silent.
+                         */
+                        recon_main_log (this->name, GF_LOG_ERROR,
+                                        "nsr_recon_readv - could not get records for term=%d\n",
+                                        rfd->term);
+                }
+                goto out;
+        }
+        // read last term info
+        case nsr_recon_xlator_sector_4 :
+        {
+                nsr_recon_last_term_info_t lt;
+
+                if (size != sizeof(lt)) {
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, rfd->term, &lt);
+                recon_main_log (this->name, GF_LOG_INFO,
+                                "nsr_recon_readv - getting last term info given current term=%d. last term = %d ops=%d, first=%d, last=%d\n",
+                                rfd->term, lt.last_term, lt.commited_ops, lt.first_index, lt.last_index);
+                ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl
+                memcpy(iobuf->ptr, &lt, size);
+                goto out;
+        }
+        default:
+        {
+                recon_main_log (this->name, GF_LOG_ERROR,
+                                "nsr_recon_readv called with wrong offset\n");
+                op_errno = EINVAL;
+                break;
+        }
+        }
+
+out:
+        if (op_errno == 0) {
+                iov.iov_base = iobuf->ptr;
+                ret = iov.iov_len = size;
+        }
+
+        STACK_UNWIND_STRICT (readv, frame, ret, op_errno, &iov, 1, NULL, iobref , NULL);
+
+        if (iobref)
+                iobref_unref (iobref);
+        if (iobuf)
+                iobuf_unref (iobuf);
+        return 0;
+}
+
+ /*
+  * Lookup handler.  Nothing is wound to a child: the call is unwound
+  * immediately with success, handing back the root inode of this
+  * translator's inode table and an otherwise zeroed set of attributes.
+  */
+ int
+ nsr_recon_lookup (call_frame_t *frame, xlator_t *this,
+                   loc_t *loc, dict_t *xdata)
+ {
+         /* dirty hack: the root is reported as a regular file, which seems
+          * to work. */
+         struct iatt root_attr = { .ia_type = IA_IFREG, };
+
+         recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_lookup called \n");
+
+         STACK_UNWIND_STRICT (lookup, frame, 0, 0, this->itable->root,
+                              &root_attr, NULL, NULL);
+         return 0;
+ }
+
+
+ /*
+ * Flush handler: does no work and unwinds the call straight away with
+ * op_ret 0, op_errno 0 and no xdata.  Always returns 0.
+ */
+ int32_t
+ nsr_recon_flush (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, dict_t *xdata)
+ {
+ STACK_UNWIND_STRICT (flush, frame, 0, 0, NULL);
+ return 0;
+ }
+
+
+ /*
+  * Set up memory accounting for this translator, covering every type id up
+  * to and including gf_mt_recon_end.
+  *
+  * Returns 0 on success, -1 when 'this' is NULL, or the non-zero result of
+  * xlator_mem_acct_init() when that call fails.
+  */
+ int32_t
+ mem_acct_init (xlator_t *this)
+ {
+         int ret = -1;
+
+         GF_VALIDATE_OR_GOTO ("recon", this, out);
+
+         ret = xlator_mem_acct_init (this, gf_mt_recon_end + 1);
+         if (ret != 0) {
+                 /* The message used to be built from two adjacent literals
+                  * with no separating space ("initfailed"). */
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "Memory accounting init failed");
+         }
+ out:
+         return ret;
+ }
+
+
+ /*
+  * Translator initialisation.
+  *
+  * Reads the required options (replica-group-size, vol-name, changelog-dir,
+  * base-dir, replica-group-members, local-member), builds the member table
+  * with the local member in slot 0 followed by the comma separated remote
+  * members, opens the main log and starts the reconciliation driver thread.
+  *
+  * Returns 0 on success and -1 on any failure.  On failure everything
+  * allocated here is released and this->private is left NULL.
+  */
+ int32_t
+ init (xlator_t *this)
+ {
+         nsr_recon_private_t *priv = NULL;
+         char *local = NULL;
+         char *members = NULL;
+         char *member = NULL;
+         char *saveptr = NULL;
+         unsigned int i = 0;
+
+         priv = GF_CALLOC (1, sizeof (*priv), gf_mt_recon_private_t);
+         if (!priv) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "priv allocation error\n");
+                 return -1;
+         }
+         GF_OPTION_INIT ("replica-group-size", priv->replica_group_size, uint32, err);
+         GF_OPTION_INIT ("vol-name", priv->volname, str, err);
+         if (!priv->volname) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "missing volname option (required)");
+                 goto err;
+         }
+         GF_OPTION_INIT ("changelog-dir", priv->changelog_base_path, str, err);
+         if (!priv->changelog_base_path) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "missing changelog directory option (required)");
+                 goto err;
+         }
+         GF_OPTION_INIT ("base-dir", priv->base_dir, str, err);
+         if (!priv->base_dir) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "missing brick base directory option (required)");
+                 goto err;
+         }
+         GF_OPTION_INIT ("replica-group-members", members, str, err);
+         if (!members) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "missing membership option (required)");
+                 goto err;
+         }
+         GF_OPTION_INIT ("local-member", local, str, err);
+         if (!local) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "missing local member option (required)");
+                 goto err;
+         }
+
+         /* The table must be checked before any slot in it is written. */
+         priv->replica_group_members = GF_CALLOC (priv->replica_group_size,
+                                                  sizeof(char *),
+                                                  gf_mt_recon_members_list_t);
+         if (!priv->replica_group_members) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "str allocation error\n");
+                 goto err;
+         }
+
+         /* +1 leaves room for the terminating NUL that strcpy writes. */
+         priv->replica_group_members[0] = GF_CALLOC (1,
+                                                     strlen(local) + 1,
+                                                     gf_mt_recon_member_name_t);
+         if (!priv->replica_group_members[0]) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "str allocation error\n");
+                 goto err;
+         }
+         strcpy(priv->replica_group_members[0], local);
+
+         for (i = 1; i < priv->replica_group_size; i++) {
+                 member = strtok_r ((i == 1) ? members : NULL, ",", &saveptr);
+                 if (!member) {
+                         /* Fewer names were listed than replica-group-size
+                          * promises; do not hand NULL to strlen. */
+                         gf_log (this->name, GF_LOG_ERROR,
+                                 "missing membership option (required)");
+                         goto err;
+                 }
+                 priv->replica_group_members[i] = GF_CALLOC (1,
+                                 strlen(member) + 1, gf_mt_recon_member_name_t);
+                 if (!priv->replica_group_members[i]) {
+                         gf_log (this->name, GF_LOG_ERROR,
+                                 "str allocation error\n");
+                         goto err;
+                 }
+                 strcpy(priv->replica_group_members[i], member);
+         }
+
+         priv->this = this;
+         this->private = (void *)priv;
+
+         /* Initialise the list before the driver thread can see priv. */
+         INIT_LIST_HEAD(&(priv->list));
+
+         priv->fp = recon_create_log (priv->replica_group_members[0], "recon-main-log");
+         if (!priv->fp)
+                 goto err;
+
+         recon_main_log (this->name, GF_LOG_INFO, "creating reconciliation driver \n");
+
+         if (pthread_create(&priv->thread_id, NULL, nsr_reconciliation_driver, priv)) {
+                 recon_main_log (this->name, GF_LOG_ERROR,
+                                 "pthread creation error \n");
+                 /* NOTE(review): priv->fp is not closed here; the matching
+                  * close routine is not visible from this function. */
+                 goto err;
+         }
+
+         return 0;
+
+ err:
+         if (priv->replica_group_members) {
+                 for (i = 0; i < priv->replica_group_size; i++) {
+                         if (priv->replica_group_members[i])
+                                 GF_FREE (priv->replica_group_members[i]);
+                 }
+                 GF_FREE (priv->replica_group_members);
+         }
+         GF_FREE (priv);
+         this->private = NULL;
+         return -1;
+ }
+
+
+ /*
+  * Translator teardown: cancel the reconciliation driver thread and wait
+  * for it to exit.  Does nothing when no private state was ever attached
+  * (for example because init() failed before setting this->private).
+  */
+ void
+ fini (xlator_t *this)
+ {
+         nsr_recon_private_t *priv = this->private;
+         void *ret = NULL;
+
+         if (!priv)
+                 return;
+
+         pthread_cancel(priv->thread_id);
+         pthread_join(priv->thread_id, &ret);
+ }
+
+
+ /*
+ * File operations implemented by this translator: open, readv, writev,
+ * lookup and flush.  No other entry is set in this table.
+ */
+ struct xlator_fops fops = {
+ .open = nsr_recon_open,
+ .readv = nsr_recon_readv,
+ .writev = nsr_recon_writev,
+ .lookup = nsr_recon_lookup,
+ .flush = nsr_recon_flush
+ };
+
+ /* Callback table; no callbacks are registered. */
+ struct xlator_cbks cbks = {
+ };
+
+ /*
+ * Options understood by this translator.  init() reads every one of them
+ * and fails when any of the string options is missing.  The table ends
+ * with an entry whose key is NULL.
+ */
+ struct volume_options options[] = {
+ { .key = {"replica-group-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 2,
+ .max = INT_MAX,
+ .default_value = "2",
+ .description = "Number of bricks in replica group. can be derived but putting it here for testing."
+ },
+ {
+ .key = {"vol-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "volume name"
+ },
+ {
+ .key = {"local-member"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "member(brick) for which this translator is responsible."
+ },
+ {
+ .key = {"replica-group-members"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Comma seperated member names other than local."
+ },
+ {
+ .key = {"changelog-dir"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Base directory where per term changelogs are maintained."
+ },
+ {
+ .key = {"base-dir"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Base directory for this brick. This should go away once we fix gfid based lookups"
+ },
+ /* end-of-table marker */
+ { .key = {NULL} },
+ };
diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.h b/xlators/cluster/nsr-recon/src/recon_xlator.h
new file mode 100644
index 000000000..d9692a632
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_xlator.h
@@ -0,0 +1,92 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RECON_XLATOR_H__
+#define __RECON_XLATOR_H__
+
+#include <semaphore.h>
+#include <pthread.h>
+
+ /*
+ * Memory-accounting type ids for the recon translator, numbered from just
+ * past gf_common_mt_end.  gf_mt_recon_end is the last id; mem_acct_init()
+ * sizes the accounting table as gf_mt_recon_end + 1.
+ *
+ * NOTE(review): the enum tag says "dht" although every member is a recon
+ * type - presumably copied from the DHT translator; confirm the tag cannot
+ * clash if both headers are ever included together.
+ */
+ enum gf_dht_mem_types_ {
+ gf_mt_recon_changelog_buf_t = gf_common_mt_end + 1,
+ gf_mt_recon_driver_ctx_t,
+ gf_mt_recon_fd_t,
+ gf_mt_recon_id_t,
+ gf_mt_recon_member_name_t,
+ gf_mt_recon_members_list_t,
+ gf_mt_recon_per_node_worker_t,
+ gf_mt_recon_private_t,
+ gf_mt_recon_reconciliator_info_t,
+ gf_mt_recon_record_t,
+ gf_mt_recon_record_details_t,
+ gf_mt_recon_role_work_t,
+ gf_mt_recon_work_t,
+ gf_mt_recon_work_data_t,
+ gf_mt_recon_worker_t,
+ gf_mt_recon_end,
+ };
+
+ /*
+ * Sector values, each a multiple of 512, that select which operation a
+ * read or write on the recon translator performs.  nsr_recon_readv()
+ * switches on these values and rejects anything else as a wrong offset.
+ *
+ * NOTE(review): the per-sector remarks below describe one direction only;
+ * nsr_recon_readv() also serves reads of individual record information
+ * from sector_2 - confirm the remarks against both readv and writev.
+ */
+ enum nsr_recon_xlator_sector_t {
+ nsr_recon_xlator_sector_0 = 0, // to report back the status of given transaction ids
+ nsr_recon_xlator_sector_1 = 512, // to write here information about leadership changes from the brick
+ nsr_recon_xlator_sector_2 = (512 * 2), // to write here individual roles and wait for that role to be done
+ nsr_recon_xlator_sector_3 = (512 *3), // read from here to get term info for given term
+ nsr_recon_xlator_sector_4 = (512 * 4), // read from here to get last term info
+ };
+
+
+ /*
+ * Per-translator private state, allocated and filled in by init() and
+ * reachable through xlator_t::private.
+ *
+ * NOTE(review): 'fp' exists only when NSR_DEBUG is defined, yet init()
+ * assigns priv->fp unconditionally - confirm how non-debug builds compile.
+ */
+ typedef struct _nsr_recon_private_s {
+ xlator_t *this; //back pointer
+ unsigned int replica_group_size; // number of static members of replica group
+ char **replica_group_members; // replica group members (including itself in first slot)
+ pthread_t thread_id; // driver thread id
+ nsr_recon_driver_ctx_t *driver_thread_context; //driver thread context
+ unsigned int outstanding; // for communicating with driver thread
+ call_frame_t *frame; // old frame that is pending (just one as of now)
+ struct list_head list; // list head, initialised by init()
+ char *volname; // value of the "vol-name" option
+ uint32_t txn_id;
+ char *changelog_base_path; // value of the "changelog-dir" option
+ char *base_dir; // value of the "base-dir" option
+ #if defined(NSR_DEBUG)
+ FILE *fp; // main log stream returned by recon_create_log()
+ #endif
+ } nsr_recon_private_t;
+
+ /* Compare-and-swap that returns the previous value (compiler builtin). */
+ #define atomic_cmpxchg __sync_val_compare_and_swap
+
+ /*
+ * recon_main_log(dom, levl, fmt...)
+ *
+ * With NSR_DEBUG defined: when levl <= nsr_debug_level the message goes
+ * through _recon_main_log() to priv->fp, tagged with the calling function,
+ * line and the local member name (replica_group_members[0]).  This variant
+ * ignores 'dom' and requires a variable named 'this' (an xlator whose
+ * private data is an nsr_recon_private_t) to be in scope at the call site.
+ *
+ * Without NSR_DEBUG: a plain alias for gf_log(dom, levl, fmt).
+ */
+ #if defined(NSR_DEBUG)
+
+ extern void
+ _recon_main_log (const char *func, int line, char *member, FILE *fp,
+ char *fmt, ...);
+
+ #define recon_main_log(dom, levl, fmt...) do { \
+ FMT_WARN (fmt); \
+ if (levl <= nsr_debug_level) { \
+ nsr_recon_private_t *priv = this->private; \
+ _recon_main_log (__FUNCTION__, __LINE__, \
+ priv->replica_group_members[0], \
+ priv->fp, \
+ ##fmt); \
+ } \
+ } while (0)
+
+ #else
+ #define recon_main_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+ #endif
+
+ /*
+ * Helpers used by the recon translator.  In nsr_recon_readv() the two
+ * *_term_info calls are given the changelog base path and a term and fill
+ * in '*lt'; get_records is given a first/last index range and an output
+ * buffer.  NOTE(review): the exact contracts live with the definitions,
+ * which are not visible from this header - confirm there.
+ */
+ void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt);
+ void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt);
+ void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term, int32_t status, int32_t op_errno);
+ gf_boolean_t nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf);
+
+
+#endif /* #ifndef __RECON_XLATOR_H__ */
diff --git a/xlators/cluster/nsr-server/Makefile.am b/xlators/cluster/nsr-server/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-server/Makefile.am
@@ -0,0 +1,3 @@
+ # All sources for this translator live in src/; nothing is built here.
+ SUBDIRS = src
+
+ # No extra files to remove on "make clean" at this level.
+ CLEANFILES =
diff --git a/xlators/cluster/nsr-server/src/Makefile.am b/xlators/cluster/nsr-server/src/Makefile.am
new file mode 100644
index 000000000..0092aad4f
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/Makefile.am
@@ -0,0 +1,43 @@
+ # Code generator scripts; used at build time only, never installed.
+ noinst_PYTHON = codegen.py gen-fops.py
+
+ xlator_LTLIBRARIES = nsr.la
+ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+ nsr_la_LDFLAGS = -module -avoid-version -lcurl
+
+ # Either the etcd simulator or the real etcd client (plus bundled yajl).
+ if ENABLE_ETCD_SIM
+ nsr_la_SOURCES = nsr.c leader.c recon_notify.c etcd-sim.c
+ else
+ nsr_la_SOURCES = nsr.c leader.c recon_notify.c etcd-api.c \
+ 	yajl.c yajl_alloc.c yajl_buf.c yajl_encode.c yajl_gen.c \
+ 	yajl_lex.c yajl_parser.c yajl_tree.c yajl_version.c
+ endif
+
+
+ nsr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ 	$(top_builddir)/api/src/libgfapi.la
+
+ noinst_HEADERS = nsr-internal.h etcd-api.h all-templates.c \
+ 	yajl_alloc.h yajl_buf.h yajl_bytestack.h yajl_encode.h \
+ 	yajl_lex.h yajl_parser.h yajl/yajl_common.h yajl/yajl_gen.h \
+ 	yajl/yajl_parse.h yajl/yajl_tree.h yajl/yajl_version.h \
+ 	$(top_srcdir)/xlators/lib/src/libxlator.h \
+ 	$(top_srcdir)/glusterfsd/src/glusterfsd.h
+
+ AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ 	-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ 	-I$(top_srcdir)/rpc/rpc-lib/src -DSBIN_DIR=\"$(sbindir)\"
+
+ AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+ XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+ CLEANFILES = nsr-cg.c
+
+ # nsr-cg.c is generated from the fop declarations in xlator.h and the
+ # templates in all-templates.c.  The script and the templates are named
+ # relative to $(srcdir) so that out-of-tree (VPATH) builds find them; for an
+ # in-tree build $(srcdir) is "." and the command is unchanged.
+ nsr-cg.c: gen-fops.py codegen.py $(XLATOR_HEADER) all-templates.c
+ 	$(PYTHON) $(srcdir)/gen-fops.py $(XLATOR_HEADER) $(srcdir)/all-templates.c > $@
+
+ # The generated file must exist before nsr.c is compiled.
+ nsr.lo: nsr-cg.c
+
+ uninstall-local:
+ 	rm -f $(DESTDIR)$(xlatordir)/nsr.so
diff --git a/xlators/cluster/nsr-server/src/all-templates.c b/xlators/cluster/nsr-server/src/all-templates.c
new file mode 100644
index 000000000..fa29de7b2
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/all-templates.c
@@ -0,0 +1,345 @@
+/*
+ * You can put anything here - it doesn't even have to be a comment - and it
+ * will be ignored until we reach the first template-name comment.
+ */
+
+
+// template-name read-fop
+ /*
+  * Read-type fop $NAME$: served locally when this node is the leader or
+  * when the request carries both reconciliation xattrs; otherwise refused
+  * with EREMOTE.
+  */
+ $TYPE$
+ nsr_$NAME$ (call_frame_t *frame, xlator_t *this,
+             $ARGS_LONG$)
+ {
+         nsr_private_t *priv = this->private;
+         int32_t term, idx;
+         gf_boolean_t recon_request;
+
+         // allow reads during reconciliation
+         // TBD: allow "dirty" reads on non-leaders
+         recon_request = _gf_false;
+         if (xdata) {
+                 if ((dict_get_int32(xdata, RECON_TERM_XATTR, &term) == 0) &&
+                     (dict_get_int32(xdata, RECON_INDEX_XATTR, &idx) == 0)) {
+                         recon_request = _gf_true;
+                 }
+         }
+
+         if (priv->leader || (recon_request != _gf_false)) {
+                 STACK_WIND (frame, default_$NAME$_cbk,
+                             FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+                             $ARGS_SHORT$);
+                 return 0;
+         }
+
+         /* Neither leader nor reconciliation traffic: refuse. */
+         STACK_UNWIND_STRICT ($NAME$, frame, -1, EREMOTE,
+                              $DEFAULTS$);
+         return 0;
+ }
+
+// template-name read-dispatch
+/* No "dispatch" function needed for $NAME$ */
+
+// template-name read-fan-in
+/* No "fan-in" function needed for $NAME$ */
+
+// template-name read-continue
+/* No "continue" function needed for $NAME$ */
+
+// template-name read-complete
+/* No "complete" function needed for $NAME$ */
+
+// template-name write-fop
+ /*
+  * Write-type fop $NAME$.
+  *
+  *  - Fails with EROFS when too few peers are up (see the quorum note below).
+  *  - Requests that already carry the leader's term xattr, or both recon
+  *    xattrs, are applied to the local subvolume only.
+  *  - Otherwise this node must be the leader: the request is stamped with
+  *    the current term and a fresh index, and handed to the dispatch step
+  *    (after queuing, when NSR_CG_QUEUE is defined and the inode is busy).
+  *
+  * Every failure unwinds the fop with -1 and the chosen errno.
+  */
+ $TYPE$
+ nsr_$NAME$ (call_frame_t *frame, xlator_t *this,
+             $ARGS_LONG$)
+ {
+         nsr_local_t *local = NULL;
+         nsr_private_t *priv = this->private;
+         int op_errno = ENOMEM;
+         int from_leader;
+         int from_recon;
+         uint32_t ti = 0;
+         double must_be_up;
+         double are_up;
+
+         /*
+          * Our first goal here is to avoid "split brain surprise" for users who
+          * specify exactly 50% with two- or three-way replication.  That means
+          * either a more-than check against half the total replicas or an
+          * at-least check against half of our peers (one less).  Of the two,
+          * only an at-least check supports the intuitive use of 100% to mean
+          * all replicas must be present, because "more than 100%" will never
+          * succeed regardless of which count we use.  This leaves us with a
+          * slightly non-traditional definition of quorum ("at least X% of peers
+          * not including ourselves") but one that's useful enough to be worth
+          * it.
+          *
+          * Note that n_children and up_children *do* include the local
+          * subvolume, so we need to subtract one in each case.
+          */
+         must_be_up = ((double)(priv->n_children - 1)) * priv->quorum_pct;
+         are_up = ((double)(priv->up_children - 1)) * 100.0;
+         if (are_up < must_be_up) {
+                 /* Emulate the AFR client-side-quorum behavior. */
+                 op_errno = EROFS;
+                 goto err;
+         }
+
+         local = mem_get0(this->local_pool);
+         if (!local) {
+                 goto err;
+         }
+ #if defined(NSR_CG_NEED_FD)
+         local->fd = fd_ref(fd);
+ #else
+         local->fd = NULL;
+ #endif
+         INIT_LIST_HEAD(&local->qlinks);
+         frame->local = local;
+
+         if (xdata) {
+                 from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+                 from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+                           && !!dict_get(xdata,RECON_INDEX_XATTR);
+         }
+         else {
+                 from_leader = from_recon = _gf_false;
+         }
+
+         // follower/recon path
+         // just send it to local node
+         if (from_leader || from_recon) {
+                 atomic_inc(&priv->ops_in_flight);
+                 STACK_WIND (frame, nsr_$NAME$_complete,
+                             FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+                             $ARGS_SHORT$);
+                 return 0;
+         }
+
+
+         if (!priv->leader/* || priv->fence_io*/) {
+                 op_errno = EREMOTE;
+                 goto err;
+         }
+
+
+         if (!xdata) {
+                 /* NOTE(review): a dict created here is never unref'd in this
+                  * function - confirm who owns it afterwards. */
+                 xdata = dict_new();
+                 if (!xdata) {
+                         gf_log (this->name, GF_LOG_ERROR,
+                                 "failed to allocate xdata");
+                         goto err;
+                 }
+         }
+
+         if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "failed to set nsr-term");
+                 goto err;
+         }
+
+         LOCK(&priv->index_lock);
+         ti = ++(priv->index);
+         UNLOCK(&priv->index_lock);
+         if (dict_set_int32(xdata,NSR_INDEX_XATTR,ti) != 0) {
+                 gf_log (this->name, GF_LOG_ERROR,
+                         "failed to set index");
+                 goto err;
+         }
+
+         local->stub = fop_$NAME$_stub (frame,nsr_$NAME$_continue,
+                                        $ARGS_SHORT$);
+         if (!local->stub) {
+                 goto err;
+         }
+
+
+ #if defined(NSR_CG_QUEUE)
+         nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd->inode);
+         if (!ictx) {
+                 op_errno = EIO;
+                 goto err;
+         }
+         LOCK(&ictx->lock);
+                 if (ictx->active) {
+                         gf_log (this->name, GF_LOG_DEBUG,
+                                 "queuing request due to conflict");
+                         /*
+                          * TBD: enqueue only for real conflict
+                          *
+                          * Currently we just act like all writes are in
+                          * conflict with one another.  What we should really do
+                          * is check the active/pending queues and defer only if
+                          * there's a conflict there.
+                          *
+                          * It's important to check the pending queue because we
+                          * might have an active request X which conflicts with
+                          * a pending request Y, and this request Z might
+                          * conflict with Y but not X.  If we checked only the
+                          * active queue then Z could jump ahead of Y, which
+                          * would be incorrect.
+                          */
+                         local->qstub = fop_$NAME$_stub (frame,
+                                                         nsr_$NAME$_dispatch,
+                                                         $ARGS_SHORT$);
+                         if (!local->qstub) {
+                                 UNLOCK(&ictx->lock);
+                                 goto err;
+                         }
+                         list_add_tail(&local->qlinks,&ictx->pqueue);
+                         ++(ictx->pending);
+                         UNLOCK(&ictx->lock);
+                         return 0;
+                 }
+                 else {
+                         list_add_tail(&local->qlinks,&ictx->aqueue);
+                         ++(ictx->active);
+                 }
+         UNLOCK(&ictx->lock);
+ #endif
+
+         return nsr_$NAME$_dispatch (frame, this, $ARGS_SHORT$);
+
+ err:
+         if (local) {
+                 /*
+                  * Detach local from the frame before releasing it: the
+                  * frame releases whatever frame->local still points at when
+                  * it is destroyed, which would free this object twice.
+                  */
+                 frame->local = NULL;
+                 if (local->stub) {
+                         call_stub_destroy(local->stub);
+                 }
+                 if (local->qstub) {
+                         call_stub_destroy(local->qstub);
+                 }
+                 if (local->fd) {
+                         fd_unref(local->fd);
+                 }
+                 mem_put(local);
+         }
+         STACK_UNWIND_STRICT ($NAME$, frame, -1, op_errno,
+                              $DEFAULTS$);
+         return 0;
+ }
+
+// template-name write-dispatch
+ /*
+  * Leader-side dispatch for $NAME$: send the request to every child after
+  * the first (the followers).  nsr_$NAME$_fan_in resumes the saved stub,
+  * which applies the operation locally, once all of them have answered.
+  */
+ $TYPE$
+ nsr_$NAME$_dispatch (call_frame_t *frame, xlator_t *this,
+                      $ARGS_LONG$)
+ {
+         nsr_local_t *local = frame->local;
+         nsr_private_t *priv = this->private;
+         xlator_list_t *trav;
+
+         atomic_inc(&priv->ops_in_flight);
+
+         /*
+          * TBD: unblock pending request(s) if we fail after this point but
+          * before we get to nsr_$NAME$_complete (where that code currently
+          * resides).
+          */
+
+         local->call_count = priv->n_children - 1;
+         if (local->call_count == 0) {
+                 /*
+                  * No followers: nothing will ever reach the fan-in
+                  * callback, so continue with the local operation right
+                  * away instead of leaving the request hanging.
+                  */
+                 call_resume(local->stub);
+                 return 0;
+         }
+
+         for (trav = this->children->next; trav; trav = trav->next) {
+                 STACK_WIND (frame, nsr_$NAME$_fan_in,
+                             trav->xlator, trav->xlator->fops->$NAME$,
+                             $ARGS_SHORT$);
+         }
+
+         // TBD: variable Issue count
+         return 0;
+ }
+
+// template-name write-fan-in
+ /*
+  * Reply collector for $NAME$: counts down the outstanding replies under
+  * the frame lock; the reply that brings the count to zero resumes the
+  * stub saved in the frame's local data.
+  */
+ $TYPE$
+ nsr_$NAME$_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+                    int32_t op_ret, int32_t op_errno,
+                    $ARGS_LONG$)
+ {
+         nsr_local_t *local = frame->local;
+         uint8_t remaining;
+
+         gf_log (this->name, GF_LOG_TRACE,
+                 "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+         LOCK(&frame->lock);
+         {
+                 local->call_count -= 1;
+                 remaining = local->call_count;
+         }
+         UNLOCK(&frame->lock);
+
+         // TBD: variable Completion count
+         if (remaining != 0) {
+                 return 0;
+         }
+
+         call_resume(local->stub);
+         return 0;
+ }
+
+// template-name write-continue
+ /*
+ * Continuation for $NAME$ (the function stored in the stub created by
+ * nsr_$NAME$): winds the operation to the first child, with
+ * nsr_$NAME$_complete as its callback.
+ */
+ $TYPE$
+ nsr_$NAME$_continue (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+ {
+ STACK_WIND (frame, nsr_$NAME$_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+ }
+
+// template-name write-complete
+ /*
+  * Final callback for $NAME$, run when the local subvolume has answered.
+  *
+  * With NSR_CG_QUEUE it takes this request off its inode queue and promotes
+  * the next pending request (if any) to active; with NSR_CG_FSYNC it marks
+  * the fd dirty; with NSR_CG_NEED_FD it drops the fd reference taken in
+  * nsr_$NAME$.  The result is then unwound to the caller and the in-flight
+  * counter is decremented.
+  */
+ $TYPE$
+ nsr_$NAME$_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      $ARGS_LONG$)
+ {
+         nsr_private_t *priv = this->private;
+         /* 'local' is needed by every optional section below, not only the
+          * fd one, so declare it whenever any of them is enabled. */
+ #if defined(NSR_CG_NEED_FD) || defined(NSR_CG_QUEUE) || defined(NSR_CG_FSYNC)
+         nsr_local_t *local = frame->local;
+ #endif
+
+ #if defined(NSR_CG_QUEUE)
+         nsr_inode_ctx_t *ictx;
+         nsr_local_t *next = NULL;
+
+         if (local->qlinks.next != &local->qlinks) {
+                 ictx = nsr_get_inode_ctx(this,local->fd->inode);
+                 if (ictx) {
+                         LOCK(&ictx->lock);
+                         /* The queues are only modified under ictx->lock in
+                          * nsr_$NAME$, so unlink under the same lock. */
+                         list_del(&local->qlinks);
+                         if (ictx->pending) {
+                                 /*
+                                  * TBD: dequeue *all* non-conflicting reqs
+                                  *
+                                  * With the stub implementation there can only
+                                  * be one request active at a time (zero here)
+                                  * so it's not an issue.  In a real
+                                  * implementation there might still be other
+                                  * active requests to check against, and
+                                  * multiple pending requests that could
+                                  * continue.
+                                  */
+                                 gf_log (this->name, GF_LOG_DEBUG,
+                                         "unblocking next request");
+                                 --(ictx->pending);
+                                 next = list_entry (ictx->pqueue.next,
+                                                    nsr_local_t, qlinks);
+                                 list_del(&next->qlinks);
+                                 list_add_tail(&next->qlinks,&ictx->aqueue);
+                         }
+                         else {
+                                 --(ictx->active);
+                         }
+                         UNLOCK(&ictx->lock);
+                         /* Resume outside the lock so the resumed dispatch
+                          * never runs with ictx->lock held. */
+                         if (next) {
+                                 call_resume(next->qstub);
+                         }
+                 }
+                 else {
+                         list_del(&local->qlinks);
+                 }
+         }
+ #endif
+
+ #if defined(NSR_CG_FSYNC)
+         nsr_mark_fd_dirty(this,local);
+ #endif
+
+ #if defined(NSR_CG_NEED_FD)
+         fd_unref(local->fd);
+ #endif
+
+         STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno,
+                              $ARGS_SHORT$);
+         atomic_dec(&priv->ops_in_flight);
+         return 0;
+
+ }
diff --git a/xlators/cluster/nsr-server/src/codegen.py b/xlators/cluster/nsr-server/src/codegen.py
new file mode 100755
index 000000000..709f5662f
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/codegen.py
@@ -0,0 +1,174 @@
+#!/usr/bin/python
+
+# This module lets us auto-generate boilerplate versions of fops and cbks,
+# both for the client side and (eventually) on the server side as well. This
+# allows us to implement common logic (e.g. leader fan-out and sequencing)
+# once, without all the problems that come with copying and pasting the same
+# code into dozens of functions (or failing to).
+#
+# I've tried to make this code pretty generic, since it's already likely to
+# be used multiple ways within NSR. Really, we should use something like this
+# to generate defaults.[ch] as well, to avoid the same sorts of mismatches
+# that we've already seen and to which this approach makes NSR immune. That
+# would require using something other than defaults.h as the input, but that
+# format could be even simpler so that's a good thing too.
+
+
+import re
+import sys
+
+ # decl_re picks the trailing identifier (the argument name) out of a single
+ # "type name" argument declaration.
+ decl_re = re.compile("([a-z0-9_]+)$")
+ # tmpl_re recognises the marker comment that introduces a named template.
+ tmpl_re = re.compile("// template-name (.*)")
+
+class CodeGenerator:
+
+ def __init__ (self):
+ self.decls = {}
+ self.skip = 0
+ self.templates = {}
+ self.make_defaults = self._make_defaults
+
+ # Redefine this to preprocess the name in a declaration, e.g.
+ # fop_lookup_t => nsrc_lookup
+ def munge_name (self, orig):
+ return orig
+
+ # By default, this will convert the argument string into a sequence of
+ # (type, name) tuples minus the first self.skip (default zero) arguments.
+ # You can redefine it to skip the conversion, do a different conversion,
+ # or rearrange the arguments however you like.
+ def munge_args (self, orig):
+ args = []
+ for decl in orig.strip("(); ").split(","):
+ m = decl_re.search(decl)
+ if m:
+ args.append((m.group(1),decl[:m.start(1)].strip()))
+ else:
+ raise RuntimeError("can't split %s into type+name"%decl)
+ return args[self.skip:]
+
+ def add_decl (self, fname, ftype, fargs):
+ self.decls[self.munge_name(fname)] = (ftype, self.munge_args(fargs))
+
+ def parse_decls (self, path, pattern):
+ regex = re.compile(pattern)
+ f = open(path,"r")
+ have_decl = False
+ while True:
+ line = f.readline()
+ if not line:
+ break
+ m = regex.search(line)
+ if m:
+ if have_decl:
+ self.add_decl(f_name,f_type,f_args)
+ f_name = m.group(2)
+ f_type = m.group(1)
+ f_args = line[m.end(0):-1].strip()
+ if f_args.rfind(")") >= 0:
+ self.add_decl(f_name,f_type,f_args)
+ else:
+ have_decl = True
+ elif have_decl:
+ if line.strip() == "":
+ self.add_decl(f_name,f_type,f_args)
+ have_decl = False
+ else:
+ f_args += " "
+ f_args += line[:-1].strip()
+ if have_decl:
+ self.add_decl(f_name,f_type,f_args)
+
+ # Legacy function (yeah, already) to load a single template. If you're
+ # using multiple templates, you're better off loading them all from one
+ # file using load_templates (note plural) instead.
+ def load_template (self, name, path):
+ self.templates[name] = open(path,"r").readlines()
+
+ # Load multiple templates. Each is introduced by a special comment of
+ # the form
+ #
+ # // template-name xyz
+ #
+ # One side effect is that the block before the first such comment will be
+ # ignored. This seems like it might be useful some day so I'll leave it
+ # in, but if people trip over it maybe it will change.
+ #
+ # It is recommended to define templates in expected execution order, to
+ # make the result more readable than the inverted order (e.g. callback
+ # then fop) common in the rest of our code.
+ def load_templates (self, path):
+ t_name = None
+ for line in open(path,"r").readlines():
+ if not line:
+ break
+ m = tmpl_re.match(line)
+ if m:
+ if t_name:
+ self.templates[t_name] = t_contents
+ t_name = m.group(1).strip()
+ t_contents = []
+ elif t_name:
+ t_contents.append(line)
+ if t_name:
+ self.templates[t_name] = t_contents
+
+ # Emit the template, with the following expansions:
+ #
+ # $NAME$ => function name (as passed in)
+ # $TYPE$ => function return value
+ # $ARGS_SHORT$ => argument list, including types
+ # $ARGS_LONG$ => argument list, *not* including types
+ # $DEFAULTS$ => default callback args (see below)
+ #
+ # The $DEFAULTS$ substitution is for the case where a fop (which has one
+ # set of arguments) needs to signal an error via STACK_UNWIND (which
+ # requires a different set of arguments). In this case we look up the
+ # argument list for the opposite direction, using self.make_defaults which
+ # the user must explicitly set to the method for the opposite direction.
+ # If an argument is a pointer, we replace it with NULL; otherwise we
+ # replace it with zero. It's a hack, but it's the only thing we do that
+ # doesn't require specific knowledge of our environment and the specific
+ # call we're handling. If this doesn't suffice, we'll have to add
+ # something like $ARG0$ which can be passed in for specific cases.
+ def emit (self, f_name, tmpl):
+ args = self.decls[f_name][1]
+ zipper = lambda x: x[0]
+ a_short = ", ".join(map(zipper,args))
+ zipper = lambda x: x[1] + " " + x[0]
+ a_long = ", ".join(map(zipper,args))
+ for line in self.templates[tmpl]:
+ line = line.replace("$NAME$",f_name)
+ line = line.replace("$TYPE$",self.decls[f_name][0])
+ line = line.replace("$ARGS_SHORT$",a_short)
+ line = line.replace("$ARGS_LONG$",a_long)
+ line = line.replace("$DEFAULTS$",self.make_defaults(f_name))
+ print(line.rstrip())
+
+ def _make_defaults (self, f_name):
+ result = []
+ for arg in self.decls[f_name][1]:
+ if arg[1][-1] == "*":
+ result.append("NULL")
+ else:
+ result.append("0")
+ return ", ".join(result)
+
+if __name__ == "__main__":
+ type_re = "([a-z_0-9]+)"
+ name_re = "\(\*fop_([a-z0-9]+)_t\)"
+ full_re = type_re + " *" + name_re
+ cg = CodeGenerator()
+ cg.skip = 2
+ cg.parse_decls(sys.argv[1],full_re)
+ """
+ for k, v in cg.decls.iteritems():
+ print("=== %s" % k)
+ print(" return type %s" % v[0])
+ for arg in v[1]:
+ print(" arg %s (type %s)" % arg)
+ """
+ cg.load_template("fop",sys.argv[2])
+ cg.emit("lookup","fop")
+ cg.emit("rename","fop")
+ cg.emit("setxattr","fop")
diff --git a/xlators/cluster/nsr-server/src/etcd-api.c b/xlators/cluster/nsr-server/src/etcd-api.c
new file mode 100644
index 000000000..a07019244
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-api.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2013, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* For asprintf */
+#if !defined(_GNU_SOURCE)
+#define _GNU_SOURCE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <curl/curl.h>
+#include <yajl/yajl_tree.h>
+#include "etcd-api.h"
+
+
+ #define DEFAULT_ETCD_PORT 4001
+ /* Delimiter set: newline, CR, tab, space, comma and semicolon. */
+ #define SL_DELIM "\n\r\t ,;"
+
+ /* Concrete type behind the opaque etcd_session handle. */
+ typedef struct {
+ etcd_server *servers; /* caller's list; walked until an entry with a NULL host */
+ } _etcd_session;
+
+ /* State shared between etcd_watch and its response parser. */
+ typedef struct {
+ char *key; /* strdup'd by parse_watch_response when the reply has a key */
+ char *value; /* strdup'd by parse_watch_response when the reply has a value */
+ int *index_in; /* pointer so NULL can be special */
+ int index_out; /* NULL would be meaningless */
+ } etcd_watch_t;
+
+ /* Signature of a libcurl write callback as used by etcd_get_one. */
+ typedef size_t curl_callback_t (void *, size_t, size_t, void *);
+
+ /* Set to 1 by etcd_open once curl_global_init has been called. */
+ int g_inited = 0;
+ /* Key paths looked up in parsed JSON replies; each is NULL-terminated. */
+ const char *value_path[] = { "node", "value", NULL };
+ const char *nodes_path[] = { "node", "nodes", NULL };
+ const char *entry_path[] = { "key", NULL };
+
+/*
+ * We only call this in case where it should be safe, but gcc doesn't know
+ * that so we use this to shut it up.
+ */
+char *
+MY_YAJL_GET_STRING (yajl_val x)
+{
+ char *y = YAJL_GET_STRING(x);
+
+ return y ? y : "bogus";
+}
+
+ /*
+ * print_curl_error(intro, res): in DEBUG builds prints "<intro>: <libcurl
+ * error text>" to stdout; in other builds it expands to nothing.
+ */
+ #if defined(DEBUG)
+ void
+ print_curl_error (char *intro, CURLcode res)
+ {
+ printf("%s: %s\n",intro,curl_easy_strerror(res));
+ }
+ #else
+ #define print_curl_error(intro,res)
+ #endif
+
+
+ /*
+  * Create a session for the given server list.  The list is referenced,
+  * not copied.  Returns the new session, or NULL when memory runs out.
+  */
+ etcd_session
+ etcd_open (etcd_server *server_list)
+ {
+         _etcd_session *sess;
+
+         /* libcurl's global setup is done on the first call only. */
+         if (g_inited == 0) {
+                 curl_global_init(CURL_GLOBAL_ALL);
+                 g_inited = 1;
+         }
+
+         /*
+          * Some day we'll set up more persistent connections, and keep track
+          * (via redirects) of which server is leader so that we can always
+          * try it first.  For now we just push that to the individual request
+          * functions, which do the most brain-dead thing that can work.
+          */
+         sess = malloc(sizeof(*sess));
+         if (sess != NULL) {
+                 sess->servers = server_list;
+         }
+         return sess;
+ }
+
+
+ /*
+ * Release a session obtained from etcd_open.  Only the session object
+ * itself is freed; the server list it points at is left alone.
+ */
+ void
+ etcd_close (etcd_session session)
+ {
+ free(session);
+ }
+
+/*
+ * Normal yajl_tree_get is returning NULL for these paths even when I can
+ * verify (in gdb) that they exist. I suppose I could debug this for them, but
+ * this is way easier.
+ *
+ * TBD: see if common distros are packaging a JSON library that isn't total
+ * crap.
+ */
+yajl_val
+my_yajl_tree_get (yajl_val root, char const **path, yajl_type type)
+{
+ yajl_val obj = root;
+ int i;
+
+ for (;;) {
+ if (!*path) {
+ if (obj && (obj->type != type)) {
+ return NULL;
+ }
+ return obj;
+ }
+ if (obj->type != yajl_t_object) {
+ return NULL;
+ }
+ for (i = 0; /* nothing */; ++i) {
+ if (i >= obj->u.object.len) {
+ return NULL;
+ }
+ if (!strcmp(obj->u.object.keys[i],*path)) {
+ obj = obj->u.object.values[i];
+ ++path;
+ break;
+ }
+ }
+ }
+}
+
+
+/*
+ * Looking directly at node->u.array seems terribly un-modular, but the YAJL
+ * tree interface doesn't seem to have any exposed API for iterating over the
+ * elements of an array. I tried using yajl_tree_get with an index in the
+ * path, either as a type-casted integer or as a string, but that didn't work.
+ */
+char *
+parse_array_response (yajl_val parent)
+{
+ size_t i;
+ yajl_val item;
+ yajl_val value;
+ char *retval = NULL;
+ char *saved;
+ yajl_val node;
+
+ node = my_yajl_tree_get(parent,nodes_path,yajl_t_array);
+ if (!node) {
+ return NULL;
+ }
+
+ for (i = 0; i < node->u.array.len; ++i) {
+ item = node->u.array.values[i];
+ if (!item) {
+ break;
+ }
+ value = my_yajl_tree_get(item,entry_path,yajl_t_string);
+ if (!value) {
+ break;
+ }
+ if (retval) {
+ saved = retval;
+ retval = NULL;
+ (void)asprintf (&retval, "%s\n%s",
+ saved, MY_YAJL_GET_STRING(value));
+ free(saved);
+ }
+ else {
+ retval = strdup(MY_YAJL_GET_STRING(value));
+ }
+ if (!retval) {
+ break;
+ }
+ }
+
+ return retval;
+}
+
+ /*
+  * libcurl write callback for GET requests.  'stream' is a char ** that
+  * receives a malloc'd copy of the node's value or, failing that, the
+  * newline-separated key list from parse_array_response.
+  *
+  * Returns the number of bytes consumed (size*nmemb), or 0 when the
+  * temporary buffer cannot be allocated, which makes libcurl fail the
+  * transfer.
+  */
+ size_t
+ parse_get_response (void *ptr, size_t size, size_t nmemb, void *stream)
+ {
+         size_t len = size * nmemb;
+         char *text;
+         yajl_val node;
+         yajl_val value;
+
+         /*
+          * The data libcurl hands us is not NUL-terminated, but
+          * yajl_tree_parse expects a C string, so parse a terminated copy.
+          */
+         text = malloc(len + 1);
+         if (!text) {
+                 return 0;
+         }
+         memcpy(text,ptr,len);
+         text[len] = '\0';
+
+         node = yajl_tree_parse(text,NULL,0);
+         if (node) {
+                 value = my_yajl_tree_get(node,value_path,yajl_t_string);
+                 if (value) {
+                         /*
+                          * YAJL probably copied it once, now we're going to
+                          * copy it again.  If anybody really cares for such
+                          * small and infrequently used values, we'd have to
+                          * do something much more complicated (like using the
+                          * stream interface) to avoid the copy.  Right now
+                          * it's just not worth it.
+                          */
+                         *((char **)stream) = strdup(MY_YAJL_GET_STRING(value));
+                 }
+                 else {
+                         /* Might as well try this. */
+                         *((char **)stream) = parse_array_response(node);
+                 }
+                 yajl_tree_free(node);
+         }
+
+         free(text);
+         return len;
+ }
+
+
+ /*
+  * Issue one request for http://<host>:<port>/v2/<prefix><key> against a
+  * single server.  When 'post' is non-NULL it is sent as POST data.  The
+  * reply body is handed to 'cb' with 'stream' as its user pointer.
+  *
+  * Returns ETCD_OK when the transfer succeeded, ETCD_WTF otherwise.
+  */
+ etcd_result
+ etcd_get_one (_etcd_session *session, char *key, etcd_server *srv, char *prefix,
+               char *post, curl_callback_t cb, char **stream)
+ {
+         char *url;
+         CURL *curl;
+         CURLcode curl_res;
+         etcd_result res = ETCD_WTF;
+
+         if (asprintf(&url,"http://%s:%u/v2/%s%s",
+                      srv->host,srv->port,prefix,key) < 0) {
+                 goto done;
+         }
+         printf("url = %s\n",url);
+
+         curl = curl_easy_init();
+         if (!curl) {
+                 goto free_url;
+         }
+
+         /* TBD: add error checking for these */
+         curl_easy_setopt(curl,CURLOPT_URL,url);
+         curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);
+         curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,cb);
+         curl_easy_setopt(curl,CURLOPT_WRITEDATA,stream);
+         if (post) {
+                 curl_easy_setopt(curl,CURLOPT_POST,1L);
+                 curl_easy_setopt(curl,CURLOPT_POSTFIELDS,post);
+         }
+ #if defined(DEBUG)
+         curl_easy_setopt(curl,CURLOPT_VERBOSE,1L);
+ #endif
+
+         curl_res = curl_easy_perform(curl);
+         if (curl_res == CURLE_OK) {
+                 res = ETCD_OK;
+         }
+         else {
+                 print_curl_error("perform",curl_res);
+         }
+
+         /* Release in reverse order of acquisition. */
+         curl_easy_cleanup(curl);
+ free_url:
+         free(url);
+ done:
+         return res;
+ }
+
+
+/*
+ * Fetch key from the "keys/" namespace, trying each server of the session in
+ * order.  Returns the first value obtained (newly allocated), or NULL if no
+ * server produced one.
+ */
+char *
+etcd_get (etcd_session session_as_void, char *key)
+{
+        _etcd_session   *sess = session_as_void;
+        etcd_server     *candidate = sess->servers;
+        char            *fetched = NULL;
+
+        while (candidate->host) {
+                etcd_result rc = etcd_get_one(sess,key,candidate,"keys/",NULL,
+                                              parse_get_response,&fetched);
+                if ((rc == ETCD_OK) && fetched) {
+                        return fetched;
+                }
+                ++candidate;
+        }
+
+        return NULL;
+}
+
+
+/*
+ * curl write callback for watch requests.
+ *
+ * Parses the response body as JSON and fills in the etcd_watch_t passed as
+ * stream: index_out from node.modifiedIndex, key from node.key and value
+ * from node.value.  Fields missing from the response are left untouched.
+ * The whole buffer is always reported as consumed.
+ */
+size_t
+parse_watch_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        size_t                  total = size * nmemb;
+        char                    *text;
+        yajl_val                node;
+        yajl_val                value;
+        etcd_watch_t            *watch = stream;
+        static const char       *i_path[] = { "node", "modifiedIndex", NULL };
+        static const char       *k_path[] = { "node", "key", NULL };
+        static const char       *v_path[] = { "node", "value", NULL };
+
+        /*
+         * curl does not NUL-terminate callback data, but yajl_tree_parse
+         * expects a C string, so parse a terminated copy.
+         */
+        text = strndup(ptr,total);
+        if (!text) {
+                return total;
+        }
+
+        node = yajl_tree_parse(text,NULL,0);
+        if (node) {
+                value = my_yajl_tree_get(node,i_path,yajl_t_number);
+                if (value) {
+                        watch->index_out = strtoul(YAJL_GET_NUMBER(value),
+                                                   NULL,10);
+                }
+                value = my_yajl_tree_get(node,k_path,yajl_t_string);
+                if (value) {
+                        watch->key = strdup(MY_YAJL_GET_STRING(value));
+                }
+                value = my_yajl_tree_get(node,v_path,yajl_t_string);
+                if (value) {
+                        watch->value = strdup(MY_YAJL_GET_STRING(value));
+                }
+                /* Everything we keep was copied, so the tree can go. */
+                yajl_tree_free(node);
+        }
+
+        free(text);
+        return total;
+}
+
+
+/*
+ * Issue a blocking, recursive watch on pfx, trying each server in turn until
+ * one request succeeds.  On success the key, value and index reported by the
+ * server are handed back through whichever of keyp/valuep/index_out are
+ * non-NULL; strings handed back must be freed by the caller.  Returns
+ * ETCD_WTF if the request path cannot be built or no server could be used.
+ */
+etcd_result
+etcd_watch (etcd_session session_as_void, char *pfx,
+            char **keyp, char **valuep, int *index_in, int *index_out)
+{
+        _etcd_session   *session = session_as_void;
+        etcd_server     *srv;
+        /* Stays ETCD_WTF if the server list turns out to be empty. */
+        etcd_result     res = ETCD_WTF;
+        etcd_watch_t    watch;
+        char            *path;
+
+        if (index_in) {
+                if (asprintf(&path,"%s?wait=true&recursive=true&waitIndex=%d",
+                             pfx,*index_in) < 0) {
+                        return ETCD_WTF;
+                }
+        }
+        else {
+                if (asprintf(&path,"%s?wait=true&recursive=true",pfx) < 0) {
+                        return ETCD_WTF;
+                }
+        }
+
+        memset(&watch,0,sizeof(watch));
+        watch.index_in = index_in;
+
+        for (srv = session->servers; srv->host; ++srv) {
+                res = etcd_get_one(session,path,srv,"keys/",NULL,
+                                   parse_watch_response,(char **)&watch);
+                if (res == ETCD_OK) {
+                        /* Hand over ownership, or drop what nobody wants. */
+                        if (keyp) {
+                                *keyp = watch.key;
+                        }
+                        else {
+                                free(watch.key);
+                        }
+                        if (valuep) {
+                                *valuep = watch.value;
+                        }
+                        else {
+                                free(watch.value);
+                        }
+                        if (index_out) {
+                                *index_out = watch.index_out;
+                        }
+                        break;
+                }
+                /* Discard anything a failed attempt managed to parse. */
+                free(watch.key);
+                free(watch.value);
+                watch.key = NULL;
+                watch.value = NULL;
+        }
+
+        free(path);
+        return res;
+}
+
+
+/*
+ * curl write callback for set/delete requests.  Stores ETCD_OK through the
+ * "etcd_result *" passed as stream if the JSON body contains
+ * node.modifiedIndex, and ETCD_PROTOCOL_ERROR otherwise.
+ */
+size_t
+parse_set_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        size_t          total = size * nmemb;
+        char            *text;
+        yajl_val        node;
+        yajl_val        value;
+        etcd_result     res = ETCD_PROTOCOL_ERROR;
+        /*
+         * Success responses contain prevValue and index.  Failure responses
+         * contain errorCode and cause.  Among all these, index seems to be
+         * the one we're most likely to need later, so look for that.
+         */
+        static const char *path[] = { "node", "modifiedIndex", NULL };
+
+        /*
+         * curl does not NUL-terminate callback data, but yajl_tree_parse
+         * expects a C string, so parse a terminated copy.
+         */
+        text = strndup(ptr,total);
+        if (text) {
+                node = yajl_tree_parse(text,NULL,0);
+                if (node) {
+                        value = my_yajl_tree_get(node,path,yajl_t_number);
+                        if (value) {
+                                res = ETCD_OK;
+                        }
+                        yajl_tree_free(node);
+                }
+                free(text);
+        }
+
+        *((etcd_result *)stream) = res;
+        return total;
+}
+
+
+/*
+ * curl write callback for an initial lock request.  Stores a newly allocated
+ * copy of the response body through the "char **" passed as stream.
+ */
+size_t
+parse_lock_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        size_t  total = size * nmemb;
+
+        /*
+         * The buffer curl hands us is not NUL-terminated, so copy exactly
+         * the bytes we were given instead of using strdup.
+         */
+        *((char **)stream) = strndup(ptr,total);
+        return total;
+}
+
+
+/*
+ * There are two use cases, based on is_lock.
+ *
+ * If is_lock is null, we use the "keys" namespace. A null value means an
+ * HTTP DELETE; precond and ttl are both ignored. Otherwise we're setting a
+ * value, with *optional* precond and ttl.
+ *
+ * If is_lock is set, we use the "locks" namespace. A null value means an
+ * HTTP DELETE as before, and we still ignore ttl as before, but now precond
+ * must be set to represent the lock index. Otherwise ttl must be present,
+ * and we decide what to do based on precond. If it's null, this is an
+ * initial lock so we use an HTTP POST. Otherwise it's a renewal so we use
+ * an HTTP PUT instead.
+ */
+etcd_result
+etcd_set_one (_etcd_session *session, char *key, char *value,
+ char *precond, unsigned int ttl, etcd_server *srv,
+ char **is_lock)
+{
+ char *url;
+ char *contents = NULL;
+ CURL *curl;
+ etcd_result res = ETCD_WTF;
+ CURLcode curl_res;
+ /*
+ * err_label always holds the address of the cleanup label matching
+ * what has been acquired so far (GCC labels-as-values extension).
+ */
+ void *err_label = &&done;
+ char *namespace;
+ char *http_cmd;
+ /* Only assigned, and only read, when is_lock is non-NULL. */
+ char *orig_index;
+
+ /* Pick the URL namespace and HTTP verb, validating lock arguments. */
+ if (is_lock) {
+ namespace = "mod/v2/lock";
+ if (value) {
+ if (!ttl) {
+ /* Lock/renew must specify ttl. */
+ return ETCD_WTF;
+ }
+ http_cmd = precond ? "PUT" : "POST";
+ }
+ else {
+ if (!precond) {
+ /* Unlock must specify index. */
+ return ETCD_WTF;
+ }
+ http_cmd = "DELETE";
+ }
+ /* Remembered so a changed *is_lock can be detected afterwards. */
+ orig_index = *is_lock;
+ }
+ else {
+ namespace = "v2/keys";
+ http_cmd = value ? "PUT" : "DELETE";
+ }
+
+ if (asprintf(&url,"http://%s:%u/%s/%s",
+ srv->host,srv->port,namespace,key) < 0) {
+ goto *err_label;
+ }
+ err_label = &&free_url;
+
+ /* Build the form body: index/ttl for locks, value/prevValue/ttl for keys. */
+ if (is_lock) {
+ if (precond) {
+ if (asprintf(&contents,"index=%s",precond) < 0) {
+ goto *err_label;
+ }
+ err_label = &&free_contents;
+ }
+ if (ttl) {
+ if (contents) {
+ char *c2;
+ if (asprintf(&c2,"ttl=%u;%s",ttl,contents) < 0) {
+ goto *err_label;
+ }
+ free(contents);
+ contents = c2;
+ }
+ else {
+ if (asprintf(&contents,"ttl=%u",ttl) < 0) {
+ goto *err_label;
+ }
+ }
+ err_label = &&free_contents;
+ }
+ }
+ else {
+ if (value) {
+ if (asprintf(&contents,"value=%s",value) < 0) {
+ goto *err_label;
+ }
+ err_label = &&free_contents;
+ }
+ /*
+ * NOTE(review): if value is NULL (delete) but precond or ttl is
+ * set, contents is still NULL here and gets passed to "%s"
+ * below, rather than precond/ttl being ignored. The callers
+ * visible in this file never do that for a delete; confirm
+ * before relying on it.
+ */
+ if (precond) {
+ char *c2;
+ if (asprintf(&c2,"%s;prevValue=%s",contents,
+ precond) < 0) {
+ goto *err_label;
+ }
+ free(contents);
+ contents = c2;
+ err_label = &&free_contents;
+ }
+ if (ttl) {
+ char *c2;
+ if (asprintf(&c2,"%s;ttl=%u",contents,ttl) < 0) {
+ goto *err_label;
+ }
+ free(contents);
+ contents = c2;
+ err_label = &&free_contents;
+ }
+ }
+
+ curl = curl_easy_init();
+ if (!curl) {
+ goto *err_label;
+ }
+ err_label = &&cleanup_curl;
+
+ /* TBD: add error checking for these */
+ curl_easy_setopt(curl,CURLOPT_CUSTOMREQUEST,http_cmd);
+ curl_easy_setopt(curl,CURLOPT_URL,url);
+ curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);
+ curl_easy_setopt(curl,CURLOPT_POSTREDIR,CURL_REDIR_POST_ALL);
+
+ /*
+ * Choose where the response body goes: an initial lock captures the
+ * raw body into *is_lock, everything else lets parse_set_response
+ * write the outcome straight into res.
+ */
+ if (is_lock && value && !precond) {
+ /* Only do this for an initial lock, not a renewal. */
+ curl_easy_setopt (curl, CURLOPT_WRITEFUNCTION,
+ parse_lock_response);
+ curl_easy_setopt(curl,CURLOPT_WRITEDATA,is_lock);
+ }
+ else {
+ curl_easy_setopt (curl, CURLOPT_WRITEFUNCTION,
+ parse_set_response);
+ curl_easy_setopt(curl,CURLOPT_WRITEDATA,&res);
+ }
+
+ /*
+ * CURLOPT_HTTPPOST would be easier, but it looks like etcd will barf on
+ * that. Sigh.
+ */
+ if (contents) {
+ curl_easy_setopt(curl,CURLOPT_POST,1L);
+ curl_easy_setopt(curl,CURLOPT_POSTFIELDS,contents);
+ }
+#if defined(DEBUG)
+ curl_easy_setopt(curl,CURLOPT_VERBOSE,1L);
+#endif
+
+ curl_res = curl_easy_perform(curl);
+ if (curl_res != CURLE_OK) {
+ print_curl_error("perform",curl_res);
+ goto *err_label;
+ }
+
+ if (is_lock && value) {
+ if (!precond) {
+ /*
+ * If this is an initial lock, parse_lock_response would
+ * have been unable to set "res" for us. Instead, we
+ * set it here if the index string got updated.
+ */
+ if (*is_lock != orig_index) {
+ res = ETCD_OK;
+ }
+ }
+ else {
+ /*
+ * If this is a lock renewal, then a successful call
+ * will pass through neither parse_lock_response nor
+ * parse_set_response. The curl response code alone
+ * is sufficient.
+ */
+ res = ETCD_OK;
+ }
+ }
+
+ /*
+ * If the request succeeded, or at least got to the server and failed
+ * there, parse_set_response should have set res appropriately.
+ */
+
+ /* Cleanup labels fall through, releasing in reverse acquisition order. */
+cleanup_curl:
+ curl_easy_cleanup(curl);
+free_contents:
+ free(contents); /* might already be NULL for delete, but that's OK */
+free_url:
+ free(url);
+done:
+ return res;
+}
+
+
+/*
+ * Write key=value in the keys namespace, trying each server of the session
+ * in order.  The first definitive answer (success or protocol error) is
+ * returned; ETCD_WTF means no server gave one.
+ */
+etcd_result
+etcd_set (etcd_session session_as_void, char *key, char *value,
+          char *precond, unsigned int ttl)
+{
+        _etcd_session   *sess = session_as_void;
+        etcd_server     *target = sess->servers;
+        etcd_result     outcome;
+
+        while (target->host) {
+                outcome = etcd_set_one(sess,key,value,precond,ttl,target,NULL);
+                switch (outcome) {
+                case ETCD_OK:
+                case ETCD_PROTOCOL_ERROR:
+                        /*
+                         * Protocol errors are likely to be things like
+                         * precondition failures, which won't be helped by
+                         * retrying on another server.
+                         */
+                        return outcome;
+                default:
+                        break;
+                }
+                ++target;
+        }
+
+        return ETCD_WTF;
+}
+
+
+/*
+ * This uses the same path and status checks as SET, but with a different HTTP
+ * command instead of data. Precondition and TTL are obviously not used in
+ * this case, though a conditional delete would be a cool feature for etcd. I
+ * think you can get a timed delete by doing a conditional set to the current
+ * value with a TTL, but I haven't actually tried it.
+ */
+etcd_result
+etcd_delete (etcd_session session_as_void, char *key)
+{
+        _etcd_session   *session = session_as_void;
+        etcd_server     *srv;
+        /* Stays ETCD_WTF if the server list turns out to be empty. */
+        etcd_result     res = ETCD_WTF;
+
+        /* A NULL value makes etcd_set_one issue an HTTP DELETE. */
+        for (srv = session->servers; srv->host; ++srv) {
+                res = etcd_set_one(session,key,NULL,NULL,0,srv,NULL);
+                if (res == ETCD_OK) {
+                        break;
+                }
+        }
+
+        return res;
+}
+
+
+/*
+ * Take (index_in == NULL) or renew (index_in != NULL) the lock named key
+ * with the given ttl, trying each server in turn.  On success *index_out, if
+ * index_out is non-NULL, receives the index string captured for an initial
+ * lock (NULL for a renewal); the caller must free it.
+ */
+etcd_result
+etcd_lock (etcd_session session_as_void, char *key, unsigned int ttl,
+           char *index_in, char **index_out)
+{
+        _etcd_session   *session = session_as_void;
+        etcd_server     *srv;
+        /* Stays ETCD_WTF if the server list turns out to be empty. */
+        etcd_result     res = ETCD_WTF;
+        char            *tmp = NULL;
+
+        for (srv = session->servers; srv->host; ++srv) {
+                res = etcd_set_one(session,key,"hack",index_in,ttl,srv,&tmp);
+                if (res == ETCD_OK) {
+                        break;
+                }
+        }
+
+        if ((res == ETCD_OK) && index_out) {
+                *index_out = tmp;
+        }
+        else {
+                /* Nobody is going to take ownership of the index string. */
+                free(tmp);
+        }
+
+        return res;
+}
+
+
+/*
+ * Release the lock named key that was taken with the given index, trying
+ * each server in turn until one accepts the request.
+ */
+etcd_result
+etcd_unlock (etcd_session session_as_void, char *key, char *index)
+{
+        _etcd_session   *session = session_as_void;
+        etcd_server     *srv;
+        /* Stays ETCD_WTF if the server list turns out to be empty. */
+        etcd_result     res = ETCD_WTF;
+        /* Passed only so that etcd_set_one uses the lock namespace. */
+        char            *tmp = NULL;
+
+        for (srv = session->servers; srv->host; ++srv) {
+                res = etcd_set_one(session,key,NULL,index,0,srv,&tmp);
+                if (res == ETCD_OK) {
+                        break;
+                }
+        }
+
+        return res;
+}
+/*
+ * curl write callback for leader queries.  Stores a newly allocated copy of
+ * the response body through the "char **" passed as stream.
+ */
+size_t
+store_leader (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+        size_t  total = size * nmemb;
+
+        /*
+         * The buffer curl hands us is not NUL-terminated, so copy exactly
+         * the bytes we were given instead of using strdup.
+         */
+        *((char **)stream) = strndup(ptr,total);
+        return total;
+}
+
+
+/*
+ * Ask each server of the session in turn for "leader" and return the first
+ * response body obtained (newly allocated), or NULL if none answered.
+ */
+char *
+etcd_leader (etcd_session session_as_void)
+{
+        _etcd_session   *sess = session_as_void;
+        etcd_server     *candidate = sess->servers;
+        char            *leader = NULL;
+
+        while (candidate->host) {
+                etcd_result rc = etcd_get_one(sess,"leader",candidate,"",NULL,
+                                              store_leader,&leader);
+                if ((rc == ETCD_OK) && leader) {
+                        return leader;
+                }
+                ++candidate;
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Free a server list: every host string up to the entry whose host is NULL,
+ * then the array itself.
+ */
+void
+free_sl (etcd_server *server_list)
+{
+        etcd_server     *entry = server_list;
+
+        while (entry->host) {
+                free(entry->host);
+                ++entry;
+        }
+        free(server_list);
+}
+
+
+/*
+ * Return the length of the leading run of text whose characters all have
+ * the same membership in cset: inside the set when result is 1, outside it
+ * when result is 0.
+ */
+int
+_count_matching (char *text, char *cset, int result)
+{
+        int     run = 0;
+
+        while (text[run]) {
+                int in_set = (strchr(cset,text[run]) != NULL);
+
+                if (in_set != result) {
+                        break;
+                }
+                ++run;
+        }
+
+        return run;
+}
+
+#define count_matching(t,cs) _count_matching(t,cs,1)
+#define count_nonmatching(t,cs) _count_matching(t,cs,0)
+
+
+/*
+ * Build a server list from a string of host[:port] entries separated by any
+ * of the SL_DELIM characters, and open a session on it.  Entries without a
+ * port get DEFAULT_ETCD_PORT.  Returns NULL if the string is NULL or holds
+ * no entries, if memory runs out, or if etcd_open fails; nothing is leaked
+ * in those cases.
+ */
+etcd_session
+etcd_open_str (char *server_names)
+{
+        char            *snp;
+        int             run_len;
+        int             host_len;
+        size_t          num_servers;
+        etcd_server     *server_list;
+        etcd_session    session;
+
+        if (!server_names) {
+                return NULL;
+        }
+
+        /*
+         * Yeah, we iterate over the string twice so we can allocate an
+         * appropriately sized array instead of turning it into a linked
+         * list.  Unfortunately this means we can't use strtok* which is
+         * destructive with no platform-independent way to reverse the
+         * destructive effects.
+         */
+
+        num_servers = 0;
+        snp = server_names;
+        while (*snp) {
+                run_len = count_nonmatching(snp,SL_DELIM);
+                if (!run_len) {
+                        snp += count_matching(snp,SL_DELIM);
+                        continue;
+                }
+                ++num_servers;
+                snp += run_len;
+        }
+
+        if (!num_servers) {
+                return NULL;
+        }
+
+        /* The extra zeroed entry is the host=NULL terminator. */
+        server_list = calloc(num_servers+1,sizeof(*server_list));
+        if (!server_list) {
+                return NULL;
+        }
+        num_servers = 0;
+
+        snp = server_names;
+        while (*snp) {
+                run_len = count_nonmatching(snp,SL_DELIM);
+                if (!run_len) {
+                        snp += count_matching(snp,SL_DELIM);
+                        continue;
+                }
+                /*
+                 * The colon search is not bounded by the current entry, so
+                 * clamp it; host_len then excludes any colon, including a
+                 * trailing one with no port after it.
+                 */
+                host_len = count_nonmatching(snp,":");
+                if (host_len > run_len) {
+                        host_len = run_len;
+                }
+                server_list[num_servers].host = strndup(snp,host_len);
+                if (!server_list[num_servers].host) {
+                        /*
+                         * The entries filled in so far are still terminated
+                         * by this NULL host, so free_sl releases them all.
+                         */
+                        free_sl(server_list);
+                        return NULL;
+                }
+                if ((run_len - host_len) > 1) {
+                        server_list[num_servers].port = (unsigned short)
+                                strtoul(snp+host_len+1,NULL,10);
+                }
+                else {
+                        server_list[num_servers].port = DEFAULT_ETCD_PORT;
+                }
+                ++num_servers;
+                snp += run_len;
+        }
+
+        session = etcd_open(server_list);
+        if (!session) {
+                free_sl(server_list);
+        }
+        return session;
+}
+
+
+/*
+ * Counterpart of etcd_open_str: release the server list held by the
+ * session, then close the session itself.
+ */
+void
+etcd_close_str (etcd_session session)
+{
+        _etcd_session   *sess = session;
+
+        free_sl(sess->servers);
+        etcd_close(session);
+}
diff --git a/xlators/cluster/nsr-server/src/etcd-api.h b/xlators/cluster/nsr-server/src/etcd-api.h
new file mode 100644
index 000000000..66275d40d
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-api.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2013, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Description of an etcd server. For now it just includes the name and
+ * port, but some day it might include other stuff like SSL certificate
+ * information.
+ */
+
+typedef enum {
+ ETCD_OK = 0,
+ ETCD_PROTOCOL_ERROR,
+ /* TBD: add other error categories here */
+ ETCD_WTF /* anything we can't easily categorize */
+} etcd_result;
+
+typedef struct {
+ char *host;
+ unsigned short port;
+} etcd_server;
+
+typedef void *etcd_session;
+
+/*
+ * etcd_open
+ *
+ * Establish a session to an etcd cluster, with automatic reconnection and
+ * so on.
+ *
+ * server_list
+ * Array of etcd_server structures, with the last having host=NULL. The
+ * caller is responsible for ensuring that this remains valid as long as
+ * the session exists.
+ */
+etcd_session etcd_open (etcd_server *server_list);
+
+
+/*
+ * etcd_open_str
+ *
+ * Same as etcd_open, except that the servers are specified as a list of
+ * host:port strings, separated by comma/semicolon or whitespace.
+ */
+etcd_session etcd_open_str (char *server_names);
+
+
+/*
+ * etcd_close
+ *
+ * Terminate a session, closing connections and freeing memory (or any other
+ * resources) associated with it.
+ */
+void etcd_close (etcd_session session);
+
+
+/*
+ * etcd_close_str
+ *
+ * Same as etcd_close, but also free the server list as etcd_open_str would
+ * have allocated it.
+ */
+void etcd_close_str (etcd_session session);
+
+
+/*
+ * etcd_get
+ *
+ * Fetch a key from one of the servers in a session. The return value is a
+ * newly allocated string, which must be freed by the caller.
+ *
+ * key
+ * The etcd key (path) to fetch.
+ */
+char * etcd_get (etcd_session session, char *key);
+
+
+/*
+ * etcd_watch
+ * Watch the set of keys matching a prefix.
+ *
+ * pfx
+ * The etcd key prefix (like a path) to watch.
+ *
+ * keyp
+ * Space for a pointer to the key that was added/modified/deleted.
+ *
+ * valuep
+ * Space for a pointer to the value if a key was added/modified. A delete
+ * is signified by this being set to NULL.
+ *
+ * index_in
+ * Pointer to an index to be used for *issuing* the watch request, or
+ * NULL for a watch without an index.
+ *
+ * index_out
+ * Pointer to space for an index *returned* by etcd, or NULL to mean don't
+ * bother.
+ *
+ * In normal usage, index_in will be NULL and index_out will be set to receive
+ * the index for the first watch. Subsequently, index_in will be set to
+ * provide the previous index (plus one) and index_out will be set to receive
+ * the next. It's entirely legitimate to point both at the same variable.
+ */
+
+etcd_result etcd_watch (etcd_session session, char *pfx,
+ char **keyp, char **valuep,
+ int *index_in, int *index_out);
+
+
+/*
+ * etcd_set
+ *
+ * Write a key, with optional TTL and/or previous value (as a precondition).
+ *
+ * key
+ * The etcd key (path) to set.
+ *
+ * value
+ * New value as a null-terminated string. Unlike etcd_get, we can derive
+ * the length ourselves instead of needing it to be passed in separately.
+ *
+ * precond
+ * Required previous value as a null-terminated string, or NULL to mean
+ * an unconditional set.
+ *
+ * ttl
+ * Time in seconds after which the value will automatically expire and be
+ * deleted, or zero to mean no auto-expiration.
+ */
+
+etcd_result etcd_set (etcd_session session, char *key, char *value,
+ char *precond, unsigned int ttl);
+
+
+/*
+ * etcd_delete
+ *
+ * Delete a key from one of the servers in a session.
+ *
+ * key
+ * The etcd key (path) to delete.
+ */
+
+etcd_result etcd_delete (etcd_session session, char *key);
+
+
+/*
+ * etcd_leader
+ *
+ * Get the identity of the current leader.
+ */
+
+char * etcd_leader (etcd_session session);
+
+/*
+ * etcd_lock
+ *
+ * Take or renew a lock - really a lease but the etcd folks call it a lock so
+ * we'll follow suit.
+ *
+ * key
+ * The path (in the "locks" namespace) for the lock.
+ *
+ * ttl
+ * Time in seconds for the lock.
+ *
+ * index_in (optional, indicates renewal)
+ * Lock index from previous lock call.
+ *
+ * index_out (only used for initial lock)
+ * Place for the new lock index. You must free this.
+ */
+
+etcd_result etcd_lock (etcd_session session_as_void, char *key,
+ unsigned int ttl, char *index_in, char **index_out);
+
+/*
+ * etcd_unlock
+ *
+ * Release a lock (see etcd_lock regarding terminology).
+ *
+ * key
+ * The path (in the "locks" namespace) for the lock.
+ *
+ * index
+ * Lock index from previous lock call.
+ */
+
+etcd_result etcd_unlock (etcd_session session_as_void, char *key,
+ char *index);
+
diff --git a/xlators/cluster/nsr-server/src/etcd-sim.c b/xlators/cluster/nsr-server/src/etcd-sim.c
new file mode 100644
index 000000000..d0bea12c7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-sim.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2014, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "mem-pool.h"
+
+/*
+ * Mock implementation of etcd
+ * The etcd file is simulated in /tmp/<server-names>
+ * Writes from Multiple writers are protected using file lock.
+*/
+
+#include "etcd-api.h"
+#define MAX_KEY_LEN 64
+#define MAX_VALUE_LEN 64
+#define MAX_EXPIRE_LEN 16
+
+/*
+ * Not implemented by the simulator: always returns NULL and ignores
+ * server_list.
+ */
+etcd_session
+etcd_open (etcd_server *server_list)
+{
+ return NULL;
+}
+
+typedef struct _etcd_sim_s {
+ char *path;
+} etcd_sim_t;
+
+/*
+ * Release a simulator session: its backing-file path and the session
+ * structure itself.  A NULL session is ignored.
+ */
+void
+etcd_close (etcd_session this)
+{
+        etcd_sim_t      *sim = this;
+
+        if (!sim) {
+                return;
+        }
+        free(sim->path);
+        free(sim);
+}
+
+
+/*
+ * Scan stream for the first unexpired "key value expiry" record whose key
+ * is exactly key.  Returns a newly allocated copy of its value, or NULL if
+ * there is no such record or memory runs out.  Lines that do not parse as a
+ * record are skipped.
+ */
+char *
+etcd_get_1 (FILE *stream, char *key)
+{
+        char            *line = NULL;
+        size_t          cap = 0;
+        char            *ret = NULL;
+        char            k[256];
+        char            s[256];
+        unsigned long   expires;
+
+        while (getline(&line,&cap,stream) != -1) {
+                /* Field widths keep over-long tokens inside k and s. */
+                if (sscanf(line,"%255s %255s %lu",k,s,&expires) != 3) {
+                        continue;
+                }
+                /* Exact match, so "foo" does not pick up "foobar". */
+                if (strcmp(k,key) != 0) {
+                        continue;
+                }
+                if ((unsigned long)time(NULL) > expires) {
+                        /* Keep looking for an unexpired entry. */
+                        continue;
+                }
+                ret = strdup(s);
+                break;
+        }
+
+        free(line);
+        return ret;
+}
+
+
+/*
+ * Look key up in the simulator's backing file under a shared file lock.
+ * Returns a newly allocated value, or NULL if the file cannot be opened or
+ * holds no unexpired record for key.
+ */
+char *
+etcd_get (etcd_session this, char *key)
+{
+        etcd_sim_t      *sim = (etcd_sim_t *)this;
+        int             fd;
+        FILE            *stream;
+        char            *retval;
+
+        /* open reports failure as -1; 0 is a perfectly valid descriptor. */
+        fd = open(sim->path,O_RDONLY);
+        if (fd < 0) {
+                return NULL;
+        }
+
+        stream = fdopen(fd,"r");
+        if (!stream) {
+                close(fd);
+                return NULL;
+        }
+        (void)flock(fd,LOCK_SH);
+        retval = etcd_get_1(stream,key);
+        (void)flock(fd,LOCK_UN);
+        fclose(stream); /* closes fd as well */
+
+        return retval;
+}
+
+
+/*
+ * Write a "key value expiry" record to stream.  If an unexpired record for
+ * exactly key already exists it is overwritten in place, otherwise the new
+ * record is appended.  Fails with ETCD_WTF if key or value is NULL, if
+ * precond is given and differs from the current unexpired value, if the
+ * record does not fit the formatting buffer, or on an I/O error.  A zero
+ * ttl means the record never expires.
+ */
+etcd_result
+etcd_set_1 (FILE *stream, char *key, char *value,
+            char *precond, unsigned int ttl)
+{
+        char            *line = NULL;
+        size_t          cap = 0;
+        ssize_t         got;
+        char            tp[255];
+        char            k[256];
+        char            s[256];
+        unsigned long   expires;
+        int             n;
+
+        if (!key || !value) {
+                return ETCD_WTF;
+        }
+
+        while ((got = getline(&line,&cap,stream)) != -1) {
+                /* Field widths keep over-long tokens inside k and s. */
+                if (sscanf(line,"%255s %255s %lu",k,s,&expires) != 3) {
+                        continue;
+                }
+                /* Exact match, so "foo" does not clobber "foobar". */
+                if (strcmp(k,key) != 0) {
+                        continue;
+                }
+                if ((unsigned long)time(NULL) > expires) {
+                        /* Keep looking for an unexpired entry. */
+                        continue;
+                }
+                /*
+                 * The only case in which we should fail here is if a
+                 * precondition was specified and does not match the
+                 * current (non-expired) value.
+                 */
+                if (precond && strcmp(precond,s)) {
+                        free(line);
+                        return ETCD_WTF;
+                }
+                /* Step back over this record so it gets overwritten. */
+                if (fseek(stream,-(long)got,SEEK_CUR) != 0) {
+                        free(line);
+                        return ETCD_WTF;
+                }
+                break;
+        }
+        free(line);
+
+        /* Negative widths left-justify, giving fixed-width records. */
+        n = snprintf(tp,sizeof(tp),"%*s %*s %*lu\n",
+                     -MAX_KEY_LEN, key, -MAX_VALUE_LEN, value,
+                     -MAX_EXPIRE_LEN,
+                     ttl ? (unsigned long)time(NULL) + ttl : ~0UL);
+        if ((n < 0) || ((size_t)n >= sizeof(tp))) {
+                return ETCD_WTF;
+        }
+        if (fwrite(tp,1,(size_t)n,stream) != (size_t)n) {
+                return ETCD_WTF;
+        }
+        fflush(stream);
+        fsync(fileno(stream));
+        return ETCD_OK;
+}
+
+
+/*
+ * Store key=value in the simulator's backing file under an exclusive file
+ * lock.  Returns ETCD_WTF if the file cannot be opened, otherwise whatever
+ * etcd_set_1 reports.
+ */
+etcd_result
+etcd_set (etcd_session this, char *key, char *value,
+          char *precond, unsigned int ttl)
+{
+        etcd_sim_t      *sim = (etcd_sim_t *)this;
+        int             fd;
+        FILE            *stream;
+        etcd_result     retval;
+
+        fd = open(sim->path,O_RDWR);
+        if (fd < 0) {
+                return ETCD_WTF;
+        }
+
+        stream = fdopen(fd,"r+");
+        if (!stream) {
+                close(fd);
+                return ETCD_WTF;
+        }
+        (void)flock(fd,LOCK_EX);
+        retval = etcd_set_1(stream,key,value,precond,ttl);
+        (void)flock(fd,LOCK_UN);
+        fclose(stream); /* closes fd as well */
+
+        return retval;
+}
+
+
+/*
+ * Create a simulator session backed by the file /tmp/<server_names>,
+ * creating the file if necessary.  Returns NULL if memory runs out or the
+ * file cannot be opened.
+ */
+etcd_session
+etcd_open_str (char *server_names)
+{
+        etcd_sim_t      *sim;
+        int             fd;
+
+        sim = calloc(1,sizeof(*sim));
+        if (!sim) {
+                return NULL;
+        }
+        /* On failure asprintf leaves the pointer undefined: don't free it. */
+        if (asprintf(&sim->path,"/tmp/%s",server_names) < 0) {
+                free(sim);
+                return NULL;
+        }
+
+        fd = open(sim->path, O_RDWR | O_CREAT, 0777);
+        if (fd == -1) {
+                free(sim->path);
+                free(sim);
+                return NULL;
+        }
+
+        close(fd);
+        return ((void *)sim);
+}
+
+
+/*
+ * Simulator sessions keep no separate server list, so this simply forwards
+ * to etcd_close.
+ */
+void
+etcd_close_str (etcd_session this)
+{
+ etcd_close(this);
+}
+
+/* Not implemented by the simulator: always returns ETCD_WTF. */
+etcd_result
+etcd_delete (etcd_session this, char *key)
+{
+ return ETCD_WTF;
+}
+
+/* Not implemented by the simulator: always returns NULL. */
+char *
+etcd_leader (etcd_session this_as_void)
+{
+ return NULL;
+}
+
+/*
+ * Not implemented by the simulator: always returns ETCD_WTF and leaves all
+ * output arguments untouched.
+ */
+etcd_result
+etcd_watch (etcd_session this, char *pfx, char **keyp, char **valuep,
+ int *index_in, int *index_out)
+{
+ return ETCD_WTF;
+}
+
+/*
+ * Simulated lock.  An initial lock (index_in == NULL) takes an exclusive
+ * flock on /var/tmp/<key>, creating the file if needed, and hands back the
+ * fixed index "42" through index_out when that is non-NULL.  A renewal
+ * (index_in != NULL) does nothing and succeeds.  ttl is ignored.
+ */
+etcd_result
+etcd_lock (etcd_session session_as_void, char *key, unsigned int ttl,
+           char *index_in, char **index_out)
+{
+        char    *path;
+        int     fd;
+
+        if (!index_in) {
+                if (gf_asprintf(&path,"/var/tmp/%s",key) < 0) {
+                        return ETCD_WTF;
+                }
+                fd = open(path,O_RDWR|O_CREAT,0666);
+                GF_FREE(path);
+                if (fd < 0) {
+                        return ETCD_WTF;
+                }
+                if (flock(fd,LOCK_EX) < 0) {
+                        close(fd);
+                        return ETCD_WTF;
+                }
+                /* index_out is optional, as in the real client. */
+                if (index_out) {
+                        *index_out = strdup("42");
+                        if (!*index_out) {
+                                /*
+                                 * Without an index the caller would retry
+                                 * as an initial lock, so give the lock up.
+                                 */
+                                close(fd);
+                                return ETCD_WTF;
+                        }
+                }
+        }
+
+        /*
+         * Yes, we leak an fd by not closing it here (and nobody else even
+         * knows about it).  That would be awful in any other context, but
+         * for test scripts it won't matter.
+         */
+        return ETCD_OK;
+}
+
diff --git a/xlators/cluster/nsr-server/src/gen-fops.py b/xlators/cluster/nsr-server/src/gen-fops.py
new file mode 100755
index 000000000..1639f489c
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/gen-fops.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+
+# This script generates the boilerplate versions of most fops and cbks in the
+# server. This allows the details of leadership-status checking, sequencing
+# between leader and followers (including fan-out), and basic error checking
+# to be centralized in one place, with per-operation code kept to a minimum.
+
+import sys
+import codegen
+
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_t\)"
+full_re = type_re + " *" + name_re
+fop_cg = codegen.CodeGenerator()
+fop_cg.skip = 2
+fop_cg.parse_decls(sys.argv[1],full_re)
+fop_cg.load_templates(sys.argv[2])
+
+# Use the multi-template feature to generate multiple callbacks from the same
+# parsed declarations.
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)"
+full_re = type_re + " *" + name_re
+cbk_cg = codegen.CodeGenerator()
+cbk_cg.skip = 5
+cbk_cg.parse_decls(sys.argv[1],full_re)
+cbk_cg.load_templates(sys.argv[2])
+
+# This is a nasty little trick to handle the case where a generated fop needs
+# a set of default arguments for the corresponding callback.
+fop_cg.make_defaults = cbk_cg.make_defaults
+
+# We need two types of templates. The first, for pure read operations, just
+# needs to do a simple am-i-leader check (augmented to allow dirty reads).
+# The second, for pure writes, needs to do fan-out to followers between those
+# initial checks and local execution. There are other operations that don't
+# fit neatly into either category - e.g. lock ops or fsync - so we'll just have
+# to handle those manually. The table thus includes entries only for those we
+# can categorize. The special cases, plus any new operations we've never even
+# heard of, aren't in there.
+#
+# Various keywords can be used to define/undefine preprocessor symbols used
+# in the templates, on a per-function basis. For example, if the keyword here
+# is "fsync" (lowercase word or abbreviation) that will cause NSR_CG_FSYNC
+# (prefix plus uppercase version) to be defined above all of the generated code
+# for that fop.
+
+fop_table = {
+ "access": "read",
+ "create": "write",
+ "discard": "write",
+# "entrylk": "read",
+ "fallocate": "write",
+# "fentrylk": "read",
+ "fgetxattr": "read",
+# "finodelk": "read",
+# "flush": "read",
+ "fremovexattr": "write",
+ "fsetattr": "write",
+ "fsetxattr": "write",
+ "fstat": "read",
+# "fsync": "read",
+# "fsyncdir": "read",
+ "ftruncate": "write",
+ "fxattrop": "write",
+ "getxattr": "read",
+# "inodelk": "read",
+ "link": "write",
+# "lk": "read",
+# "lookup": "read",
+ "mkdir": "write",
+ "mknod": "write",
+ "open": "write",
+ "opendir": "read",
+ "rchecksum": "read",
+ "readdir": "read",
+ "readdirp": "read",
+ "readlink": "read",
+ "readv": "read",
+ "removexattr": "write",
+ "rename": "write",
+ "rmdir": "write",
+ "setattr": "write",
+ "setxattr": "write",
+ "stat": "read",
+ "statfs": "read",
+ "symlink": "write",
+ "truncate": "write",
+ "unlink": "write",
+ "writev": "write,fsync,queue",
+ "xattrop": "write",
+}
+
+fops_done = []
+for x in sorted(fop_cg.decls.keys()):
+ if x in fop_table.keys():
+ info = fop_table[x].split(",")
+ kind = info[0]
+ flags = info[1:]
+ if ("fsync" in flags) or ("queue" in flags):
+ flags.append("need_fd")
+ for fname in flags:
+ print "#define NSR_CG_%s" % fname.upper()
+ cbk_cg.emit(x,kind+"-complete")
+ fop_cg.emit(x,kind+"-continue")
+ cbk_cg.emit(x,kind+"-fan-in")
+ fop_cg.emit(x,kind+"-dispatch")
+ fop_cg.emit(x,kind+"-fop")
+ for fname in flags:
+ print "#undef NSR_CG_%s" % fname.upper()
+ fops_done.append(x)
+ else:
+ print("/* No code emitted for %s */"%x)
+ print("")
+
+# Just for fun, emit the fops table too.
+print("struct xlator_fops fops = {")
+for x in fops_done:
+ print(" .%s = nsr_%s,"%(x,x))
+print("};")
diff --git a/xlators/cluster/nsr-server/src/leader.c b/xlators/cluster/nsr-server/src/leader.c
new file mode 100644
index 000000000..02a2609c8
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/leader.c
@@ -0,0 +1,138 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <regex.h>
+//#include <stdlib.h>
+#include <string.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+
+#ifndef NSR_SIM_ETCD
+#include "etcd-api.h"
+#endif
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+#define NSR_TTL 5
+
+/*
+ * Called once this brick has taken the leader lock.  Bumps the term stored
+ * under priv->term_key (conditional on the value just read), records the new
+ * term and leader status in priv, and then either lifts the I/O fence or, if
+ * reconciliation is enabled, raises it and notifies the recon code.  If the
+ * term cannot be updated, nothing in priv is changed.
+ */
+static void
+nsr_set_leader (xlator_t *this, etcd_session etcd)
+{
+        long             term = 0;
+        etcd_result      res;
+        nsr_private_t   *priv = this->private;
+        /* Room for any decimal long: under 3 digits per byte, sign, NUL. */
+        char             n_t[sizeof(long)*3+2];
+        char            *text = NULL;
+
+        gf_log (this->name, GF_LOG_INFO, "Just became leader");
+
+        text = etcd_get(etcd, priv->term_key);
+        if (text != NULL) {
+                term = strtol(text, NULL, 10);
+        }
+        snprintf(n_t,sizeof(n_t),"%ld",term+1);
+        /* The value just read doubles as the precondition for the update. */
+        res = etcd_set(etcd, priv->term_key,n_t,text,0);
+        /* etcd_get hands back an allocated string that we own. */
+        free(text);
+        if (res != ETCD_OK) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to set term");
+                return;
+        }
+        priv->leader = _gf_true;
+
+        priv->current_term = term + 1;
+
+        if (priv->nsr_recon_start == _gf_false) {
+                atomic_fetch_and(&(priv->fence_io), 0);
+                return;
+        }
+
+        // Move this inside recon notify???
+        atomic_fetch_or(&(priv->fence_io), 1);
+
+        nsr_recon_notify_event_set_leader(priv);
+
+        return;
+}
+
+/*
+ * Thread body that keeps trying to hold the leader lock for this brick.
+ * It opens the etcd session, then loops forever: acquire the lock, announce
+ * leadership through nsr_set_leader, and renew the lock once a second until
+ * a renewal fails.  Returns NULL only if the session cannot be opened.
+ */
+void *
+nsr_leader_thread (void *arg)
+{
+        xlator_t        *this = (xlator_t *) arg;
+        nsr_private_t   *priv = this->private;
+        etcd_result     res;
+        char            *index_in = NULL;
+        char            *index_out = NULL;
+
+        gf_log (this->name, GF_LOG_INFO,
+                "calling etcd_open_str on servers %s", priv->etcd_servers);
+
+        priv->etcd = etcd_open_str(priv->etcd_servers);
+        if (!(priv->etcd)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to open etcd session\n");
+                return NULL;
+        }
+
+        priv->leader_inited = 1;
+
+        for (;;) {
+                /* Not leader yet.  Try to become leader. */
+                for (;;) {
+                        res = etcd_lock (priv->etcd, priv->leader_key, NSR_TTL,
+                                         index_in, &index_out);
+                        if (res == ETCD_OK) {
+                                break;
+                        }
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "etcd_lock failed (%d)", res);
+                        sleep(1);
+                }
+                /* We're there.  Notify other parts of the code. */
+                nsr_set_leader(this,priv->etcd);
+                /* Try to retain leadership. */
+                index_in = index_out;
+                index_out = NULL;
+                for (;;) {
+                        res = etcd_lock (priv->etcd, priv->leader_key, NSR_TTL,
+                                         index_in, &index_out);
+                        if (index_out && (index_in != index_out)) {
+                                if (index_in) {
+                                        free(index_in);
+                                }
+                                index_in = index_out;
+                                index_out = NULL;
+                        }
+                        if (res != ETCD_OK) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "lost leadership (%d)", res);
+                                if (index_out) {
+                                        free(index_out);
+                                }
+                                break;
+                        }
+                        sleep(1);
+                }
+                /*
+                 * The old index died with the lock.  Drop it so that the
+                 * next attempt is a fresh acquisition rather than a
+                 * renewal of a lock we no longer hold.
+                 *
+                 * NOTE(review): priv->leader is not cleared here; confirm
+                 * whether something else resets it on loss of leadership.
+                 */
+                free(index_in);
+                index_in = NULL;
+                index_out = NULL;
+        }
+
+        /* Not reached: the loop above never exits. */
+        etcd_close_str(priv->etcd);
+        return NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/nsr-internal.h b/xlators/cluster/nsr-server/src/nsr-internal.h
new file mode 100644
index 000000000..72b61bfa5
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr-internal.h
@@ -0,0 +1,101 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+/* Extended-attribute key string for the leader marker. */
+#define LEADER_XATTR            "user.nsr.leader"
+/*
+ * Second entry of a translator's child list.  The argument is parenthesized
+ * so that any pointer-valued expression can be passed safely.
+ */
+#define SECOND_CHILD(xl)        ((xl)->children->next->xlator)
+
+/*
+ * Memory-accounting type IDs used as the last argument of GF_CALLOC in this
+ * translator.  Numbering continues after gf_common_mt_end; gf_mt_nsr_end is
+ * the sentinel passed (plus one) to xlator_mem_acct_init.
+ */
+enum {
+ gf_mt_nsr_private_t = gf_common_mt_end + 1,
+ gf_mt_nsr_fd_ctx_t,
+ gf_mt_nsr_inode_ctx_t,
+ gf_mt_nsr_dirty_t,
+ gf_mt_nsr_end
+};
+
+/* Kinds of events that can be queued for the recon notify thread. */
+typedef enum nsr_recon_notify_ev_id_t {
+ NSR_RECON_SET_LEADER = 1,
+ NSR_RECON_ADD_CHILD = 2
+} nsr_recon_notify_ev_id_t;
+
+/*
+ * One queued notify event.  The same type is embedded in the notify context
+ * as recon_head, whose list member serves as the queue head.
+ */
+typedef struct _nsr_recon_notify_ev_s {
+ nsr_recon_notify_ev_id_t id;
+ uint32_t index; // in case of add
+ struct list_head list;
+} nsr_recon_notify_ev_t;
+
+/* Per-translator private state, stored in this->private by nsr_init. */
+typedef struct {
+ char *etcd_servers; /* "etcd-servers" option; passed to etcd_open_str */
+ char *subvol_uuid; /* "subvol-uuid" option */
+ char *leader_key; /* "<subvol_uuid>:leader", the key given to etcd_lock */
+ char *term_key; /* "<subvol_uuid>:term" */
+ char *brick_uuid; /* strdup'ed text of a UUID generated in nsr_init */
+ gf_boolean_t leader; /* true while this brick acts as leader */
+ uint8_t up_children; /* cached result of nsr_count_up_kids */
+ uint8_t n_children; /* length of this->children, counted in nsr_init */
+ char *vol_file; /* path handed to glfs_set_volfile */
+ etcd_session etcd;
+ volatile unsigned int fence_io; /* set to 1 in nsr_init, cleared after reconciliation */
+ uint32_t current_term;
+#ifdef NSR_DEBUG
+ uint32_t leader_log_fd;
+#endif
+ volatile int recon_notify_inited; /* set to 1 by the recon notify thread; polled by nsr_init */
+ volatile int leader_inited; /* set to 1 by the leader thread; polled by nsr_init */
+ uint32_t kid_state; /* bitmask: bit i set when child i is up */
+ gf_lock_t dirty_lock; /* taken around accesses to dirty_fds */
+ struct list_head dirty_fds; /* fd contexts waiting for the flush thread */
+ gf_boolean_t nsr_recon_start; /* set once the reconciliation process was spawned */
+ void * recon_ctx; /* really a nsr_recon_notify_ctx_t * (recon_notify.c) */
+ volatile uint32_t ops_in_flight; /* polled by nsr_recon_add_child until it reaches 0 */
+ uint32_t index; /* reset to 0 after reconciliation ("changelog index") */
+ gf_lock_t index_lock;
+ double quorum_pct; /* "quorum-percent" option */
+} nsr_private_t;
+
+/* Per-request state hung off frame->local. */
+typedef struct {
+ call_stub_t *stub;
+ call_stub_t *qstub;
+ uint8_t call_count; /* callbacks still outstanding; the last one unwinds */
+ fd_t *fd;
+ struct list_head qlinks; /* nsr_fsync parks the fd's dirty entries here */
+} nsr_local_t;
+
+/*
+ * This should match whatever changelog returns on the pre-op for us to pass
+ * when we're ready for our post-op.
+ */
+typedef uint32_t log_id_t;
+
+/* One pending post-op, linked into an fd context's dirty_list. */
+typedef struct {
+ struct list_head links;
+ log_id_t id; /* not filled in yet (see the TBD in nsr_mark_fd_dirty) */
+} nsr_dirty_list_t;
+
+/* Per-fd context stored through __fd_ctx_set. */
+typedef struct {
+ fd_t *fd; /* ref taken by nsr_mark_fd_dirty, dropped by the flush thread */
+ struct list_head dirty_list; /* nsr_dirty_list_t entries for this fd */
+ struct list_head fd_list; /* linkage on priv->dirty_fds */
+} nsr_fd_ctx_t;
+
+/*
+ * Per-inode context stored through __inode_ctx_set.  The lock and both queue
+ * heads are initialized in nsr_get_inode_ctx; the users of the queues are not
+ * in this file (presumably the generated fop code -- confirm).
+ */
+typedef struct {
+ gf_lock_t lock;
+ uint32_t active;
+ struct list_head aqueue;
+ uint32_t pending;
+ struct list_head pqueue;
+} nsr_inode_ctx_t;
+
+/* Queue a NSR_RECON_SET_LEADER event for the recon notify thread. */
+void nsr_recon_notify_event_set_leader(nsr_private_t *priv);
+/* Queue a NSR_RECON_ADD_CHILD event for the child at the given index. */
+void nsr_recon_notify_event_add_child(nsr_private_t *priv, uint32_t index);
+/* Thread entry point; the argument is the xlator_t of this translator. */
+void* nsr_recon_notify_thread (void *this);
+
diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c
new file mode 100644
index 000000000..85eba09b5
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr.c
@@ -0,0 +1,812 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+#include "run.h"
+#include "common-utils.h"
+#include "syncop.h"
+
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+
+/* Path components used by nsr_init to build the reconciliation file names. */
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+/* Seconds the flush thread sleeps between passes over the dirty fds. */
+#define NSR_FLUSH_INTERVAL 5
+
+/*
+ * Return the NSR context attached to an inode, creating and attaching a
+ * fresh zeroed one if none exists yet.  Returns NULL if the allocation or the
+ * attach fails.
+ *
+ * Uses the double-underscore ctx accessors; presumably the caller is expected
+ * to hold inode->lock -- confirm against the callers.
+ */
+nsr_inode_ctx_t *
+nsr_get_inode_ctx (xlator_t *this, inode_t *inode)
+{
+        uint64_t         raw    = 0LL;
+        nsr_inode_ctx_t *ictx   = NULL;
+
+        /* Fast path: a context is already attached. */
+        if (__inode_ctx_get(inode,this,&raw) == 0) {
+                return (nsr_inode_ctx_t *)(long)raw;
+        }
+
+        ictx = GF_CALLOC (1, sizeof(*ictx), gf_mt_nsr_inode_ctx_t);
+        if (!ictx) {
+                return NULL;
+        }
+
+        raw = (uint64_t)(long)ictx;
+        if (__inode_ctx_set(inode,this,&raw) != 0) {
+                GF_FREE(ictx);
+                return NULL;
+        }
+
+        LOCK_INIT(&ictx->lock);
+        INIT_LIST_HEAD(&ictx->aqueue);
+        INIT_LIST_HEAD(&ictx->pqueue);
+
+        return ictx;
+}
+
+/*
+ * Return the NSR context attached to an fd, creating and attaching a fresh
+ * zeroed one (with both list heads initialized) if none exists yet.  Returns
+ * NULL if the allocation or the attach fails.
+ *
+ * nsr_mark_fd_dirty calls this with fd->lock held.
+ */
+nsr_fd_ctx_t *
+nsr_get_fd_ctx (xlator_t *this, fd_t *fd)
+{
+        uint64_t         raw    = 0LL;
+        nsr_fd_ctx_t    *fctx   = NULL;
+
+        /* Fast path: a context is already attached. */
+        if (__fd_ctx_get(fd,this,&raw) == 0) {
+                return (nsr_fd_ctx_t *)(long)raw;
+        }
+
+        fctx = GF_CALLOC (1, sizeof(*fctx), gf_mt_nsr_fd_ctx_t);
+        if (!fctx) {
+                return NULL;
+        }
+
+        if (__fd_ctx_set(fd,this,(uint64_t)fctx) != 0) {
+                GF_FREE(fctx);
+                return NULL;
+        }
+
+        INIT_LIST_HEAD(&fctx->dirty_list);
+        INIT_LIST_HEAD(&fctx->fd_list);
+
+        return fctx;
+}
+
+/*
+ * Record one pending post-op on local->fd and, if the fd is not already
+ * queued, take a reference on it and put its context on priv->dirty_fds so
+ * the flush thread will fsync it.
+ *
+ * Lock order: fd->lock is held throughout; priv->dirty_lock is taken inside
+ * it only for the list insertion.
+ */
+void
+nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local)
+{
+        fd_t                    *fd     = local->fd;
+        nsr_fd_ctx_t            *ctx_ptr;
+        nsr_dirty_list_t        *dirty;
+        nsr_private_t           *priv   = this->private;
+
+        /*
+         * TBD: don't do any of this for O_SYNC/O_DIRECT writes.
+         * Unfortunately, that optimization requires that we distinguish
+         * between writev and other "write" calls, saving the original flags
+         * and checking them in the callback.  Too much work for too little
+         * gain right now.
+         */
+
+        LOCK(&fd->lock);
+        ctx_ptr = nsr_get_fd_ctx(this,fd);
+        dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t);
+        if (ctx_ptr && dirty) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "marking fd %p as dirty (%p)", fd, dirty);
+                /* TBD: fill dirty->id from what changelog gave us */
+                list_add_tail(&dirty->links,&ctx_ptr->dirty_list);
+                if (list_empty(&ctx_ptr->fd_list)) {
+                        /* Add a ref so _release doesn't get called. */
+                        ctx_ptr->fd = fd_ref(fd);
+                        LOCK(&priv->dirty_lock);
+                        list_add_tail (&ctx_ptr->fd_list,
+                                       &priv->dirty_fds);
+                        UNLOCK(&priv->dirty_lock);
+                }
+        }
+        else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not mark %p dirty", fd);
+                /*
+                 * Do NOT free ctx_ptr here: nsr_get_fd_ctx has already
+                 * attached it to the fd (or it was attached earlier and may
+                 * be on priv->dirty_fds).  It is released by nsr_release;
+                 * freeing it here would leave a dangling pointer behind.
+                 */
+                if (dirty) {
+                        GF_FREE(dirty);
+                }
+        }
+        UNLOCK(&fd->lock);
+}
+
+/*
+ * Extended-attribute keys.  Only NSR_REP_COUNT_XATTR is referenced in this
+ * file (nsr_getxattr_special answers it locally); the others are presumably
+ * consumed by the code included below -- confirm.
+ */
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define NSR_INDEX_XATTR "trusted.nsr.index"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+#define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count"
+/* Textual include of a .c file, so it sees the definitions above. */
+#include "nsr-cg.c"
+
+/*
+ * Count how many of the first n_children bits are set in priv->kid_state,
+ * i.e. how many children are currently marked up.
+ *
+ * The mask is built from an unsigned constant and the scan is limited to the
+ * width of kid_state, so neither bit 31 nor a child count above that width
+ * can trigger an undefined shift.
+ */
+uint8_t
+nsr_count_up_kids (nsr_private_t *priv)
+{
+        const uint8_t   width   = (uint8_t)(sizeof(priv->kid_state) * 8);
+        uint8_t         limit   = priv->n_children;
+        uint8_t         retval  = 0;
+        uint8_t         i;
+
+        if (limit > width) {
+                limit = width;
+        }
+
+        for (i = 0; i < limit; ++i) {
+                if (priv->kid_state & ((uint32_t)1 << i)) {
+                        ++retval;
+                }
+        }
+
+        return retval;
+}
+
+/*
+ * The fsync machinery looks a lot like that for any write call, but there are
+ * some important differences that are easy to miss. First, we don't care
+ * about the xdata that shows whether the call came from a leader or
+ * reconciliation process. If we're the leader we fan out; if we're not we
+ * don't. Second, we don't wait for followers before we issue the local call.
+ * The code generation system could be updated to handle this, and still might
+ * if we need to implement other "almost identical" paths (e.g. for open), but
+ * a copy is more readable as long as it's just one.
+ */
+
+/*
+ * Completion callback shared by every fsync wound from nsr_fsync.  Decrements
+ * the outstanding-call counter under frame->lock; whichever callback brings
+ * it to zero unwinds the frame with its own result.  Always returns 0.
+ */
+int32_t
+nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+               struct iatt *postbuf, dict_t *xdata)
+{
+        nsr_local_t     *req            = frame->local;
+        uint8_t          remaining;
+
+        LOCK(&frame->lock);
+        remaining = --(req->call_count);
+        UNLOCK(&frame->lock);
+
+        if (remaining != 0) {
+                /* Other replies are still outstanding. */
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf,
+                             postbuf, xdata);
+        return 0;
+}
+
+/*
+ * Callback for the fsync issued to the local child.  Frees every dirty entry
+ * that nsr_fsync moved onto local->qlinks (logging a trace line per entry),
+ * then continues with the common completion handling in nsr_fsync_cbk.
+ */
+int32_t
+nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                     struct iatt *postbuf, dict_t *xdata)
+{
+        nsr_local_t             *req    = frame->local;
+        nsr_dirty_list_t        *entry;
+        nsr_dirty_list_t        *next;
+        int32_t                  rc;
+
+        list_for_each_entry_safe (entry, next, &req->qlinks, links) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "sending post-op on %p (%p)", req->fd, entry);
+                GF_FREE(entry);
+        }
+
+        rc = nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno,
+                            prebuf, postbuf, xdata);
+        return rc;
+}
+
+/*
+ * fsync fop.  Moves the fd's pending dirty entries onto the request, issues
+ * the fsync to the local child and, when this brick is the leader, to every
+ * remaining child as well.  The frame is unwound by whichever callback
+ * finishes last (see nsr_fsync_cbk).
+ *
+ * Unwinds with -1/ENOMEM if the per-request state cannot be allocated.
+ * Always returns 0.
+ */
+int32_t
+nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+           dict_t *xdata)
+{
+        nsr_private_t   *priv           = this->private;
+        nsr_local_t     *local;
+        uint64_t         ctx_int        = 0LL;
+        nsr_fd_ctx_t    *ctx_ptr;
+        xlator_list_t   *trav;
+        gf_boolean_t     is_leader;
+
+        local = mem_get0(this->local_pool);
+        if (!local) {
+                STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata);
+                return 0;
+        }
+        INIT_LIST_HEAD(&local->qlinks);
+        frame->local = local;
+
+        /* Move the dirty list from the fd to the fsync request. */
+        LOCK(&fd->lock);
+        if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+                ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+                list_splice_init (&ctx_ptr->dirty_list,
+                                  &local->qlinks);
+        }
+        UNLOCK(&fd->lock);
+
+        /*
+         * Sample the leader flag once: the expected reply count and the
+         * number of calls actually wound below must agree, otherwise the
+         * frame is unwound too early or never.
+         */
+        is_leader = priv->leader;
+
+        /* Issue the local call. */
+        local->call_count = is_leader ? priv->n_children : 1;
+        STACK_WIND (frame, nsr_fsync_local_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+                    fd, flags, xdata);
+
+        /* Issue remote calls if we're the leader. */
+        if (is_leader) {
+                for (trav = this->children->next; trav; trav = trav->next) {
+                        /* Each remote child, not the local one again. */
+                        STACK_WIND (frame, nsr_fsync_cbk,
+                                    trav->xlator,
+                                    trav->xlator->fops->fsync,
+                                    fd, flags, xdata);
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * getxattr fop installed by nsr_init.
+ *
+ * - Not the leader: unwinds with -1/EREMOTE.
+ * - Any name other than NSR_REP_COUNT_XATTR (including NULL): passed straight
+ *   through to the first child.
+ * - NSR_REP_COUNT_XATTR: answered locally with the current number of up
+ *   children; unwinds with -1/ENOMEM if the reply dict cannot be built.
+ *
+ * Always returns 0.
+ */
+int32_t
+nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ dict_t *result;
+ nsr_private_t *priv = this->private;
+
+ if (!priv->leader) {
+ STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL);
+ return 0;
+ }
+
+ if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) {
+ STACK_WIND_TAIL (frame,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+ }
+
+ result = dict_new();
+ if (!result) {
+ goto dn_failed;
+ }
+
+ /* Refresh the cached count as a side effect of answering. */
+ priv->up_children = nsr_count_up_kids(this->private);
+ if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,priv->up_children) != 0) {
+ goto dsu_failed;
+ }
+
+ STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL);
+ /* NOTE(review): destroyed unconditionally right after the unwind; confirm
+ * no upper layer keeps a reference (dict_unref may be the safer call). */
+ dict_destroy(result);
+ return 0;
+
+dsu_failed:
+ dict_destroy(result);
+dn_failed:
+ STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+/*
+ * Free every entry on an fd context's dirty list (logging a trace line for
+ * each) and leave the list head empty.  The flush thread calls this with the
+ * fd's lock held.
+ */
+void
+nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx)
+{
+        struct list_head        *head   = &fd_ctx->dirty_list;
+        nsr_dirty_list_t        *entry;
+        nsr_dirty_list_t        *next;
+
+        list_for_each_entry_safe (entry, next, head, links) {
+                gf_log (this->name, GF_LOG_TRACE,
+                        "sending post-op on %p (%p)", fd_ctx->fd, entry);
+                GF_FREE(entry);
+        }
+
+        /* The entries were freed in place, so reset the head explicitly. */
+        INIT_LIST_HEAD(head);
+}
+
+/*
+ * Background flush thread (ctx is the xlator_t).  Every NSR_FLUSH_INTERVAL
+ * seconds it takes over the current priv->dirty_fds list, fsyncs each fd on
+ * the first child, discards that fd's dirty entries, unlinks it from the
+ * list and drops the reference taken in nsr_mark_fd_dirty.  The loop never
+ * exits, so the final return is not reached.
+ */
+void *
+nsr_flush_thread (void *ctx)
+{
+ xlator_t *this = ctx;
+ nsr_private_t *priv = this->private;
+ struct list_head dirty_fds;
+ nsr_fd_ctx_t *fd_ctx;
+ nsr_fd_ctx_t *fd_tmp;
+ int ret;
+
+ for (;;) {
+ /*
+ * We have to be very careful to avoid lock inversions here, so
+ * we can't just hold priv->dirty_lock while we take and
+ * release locks for each fd. Instead, we only hold dirty_lock
+ * at the beginning of each iteration, as we (effectively) make
+ * a copy of the current list head and then clear the original.
+ * This leads to four scenarios for adding the first entry to
+ * an fd and potentially putting it on the global list.
+ *
+ * (1) While we're asleep. No lock contention, it just gets
+ * added and will be processed on the next iteration.
+ *
+ * (2) After we've made a local copy, but before we've started
+ * processing that fd. The new entry will be added to the
+ * fd (under its lock), and we'll process it on the current
+ * iteration.
+ *
+ * (3) While we're processing the fd. They'll block on the fd
+ * lock, then see that the list is empty and put it on the
+ * global list. We'll process it here on the next
+ * iteration.
+ *
+ * (4) While we're working, but after we've processed that fd.
+ * Same as (1) as far as that fd is concerned.
+ */
+ INIT_LIST_HEAD(&dirty_fds);
+ LOCK(&priv->dirty_lock);
+ list_splice_init(&priv->dirty_fds,&dirty_fds);
+ UNLOCK(&priv->dirty_lock);
+
+ list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) {
+ /* A failed fsync is only logged; the dirty entries are dropped anyway. */
+ ret = syncop_fsync(FIRST_CHILD(this),fd_ctx->fd,0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to fsync %p (%d)",
+ fd_ctx->fd, -ret);
+ }
+
+ LOCK(&fd_ctx->fd->lock);
+ nsr_flush_fd(this,fd_ctx);
+ list_del_init(&fd_ctx->fd_list);
+ UNLOCK(&fd_ctx->fd->lock);
+ /* Drop the ref that kept the fd alive while it was queued. */
+ fd_unref(fd_ctx->fd);
+ }
+
+ sleep(NSR_FLUSH_INTERVAL);
+ }
+
+ return NULL;
+}
+
+/*
+ * forget callback: detach this translator's context from the inode and free
+ * it if one was attached.  Always returns 0.
+ */
+int32_t
+nsr_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t        raw     = 0LL;
+
+        if ((inode_ctx_del(inode,this,&raw) != 0) || !raw) {
+                /* Nothing was attached. */
+                return 0;
+        }
+
+        GF_FREE((void *)(long)raw);
+        return 0;
+}
+
+/*
+ * release callback: detach this translator's context from the fd and free it
+ * if one was attached.  Always returns 0.
+ */
+int32_t
+nsr_release (xlator_t *this, fd_t *fd)
+{
+        uint64_t        raw     = 0LL;
+
+        if ((fd_ctx_del(fd,this,&raw) != 0) || !raw) {
+                /* Nothing was attached. */
+                return 0;
+        }
+
+        GF_FREE((void *)(long)raw);
+        return 0;
+}
+
+/* Callback table: both entries free the per-object context of this xlator. */
+struct xlator_cbks cbks = {
+ .forget = nsr_forget,
+ .release = nsr_release,
+};
+
+/*
+ * Reconfigure hook: re-reads the "leader" option into priv->leader and logs
+ * the new value.  Returns 0 on success, -1 if GF_OPTION_RECONF jumps to err.
+ */
+int
+nsr_reconfigure (xlator_t *this, dict_t *options)
+{
+ nsr_private_t *priv = this->private;
+
+ GF_OPTION_RECONF ("leader", priv->leader, options, bool, err);
+ gf_log (this->name, GF_LOG_INFO,
+ "reconfigure called. setting priv->leader to %d\n", priv->leader);
+ return 0;
+
+err:
+ return -1;
+}
+
+/*
+ * Return the zero-based position of kid in this->children, or -1 if it is
+ * not one of this translator's children.
+ */
+int
+nsr_get_child_index (xlator_t *this, xlator_t *kid)
+{
+        xlator_list_t   *node;
+        int              pos    = 0;
+
+        for (node = this->children; node; node = node->next, ++pos) {
+                if (node->xlator == kid) {
+                        return pos;
+                }
+        }
+
+        return -1;
+}
+
+/*
+ * Child notify handling is unreasonably FUBAR. Sometimes we'll get a
+ * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it.
+ * Other times we won't. Because it's effectively random (probably racy), we
+ * can't just maintain a count. We actually have to keep track of the state
+ * for each child separately, to filter out the bogus CHILD_DOWN events, and
+ * then generate counts on demand.
+ */
+/*
+ * Notify hook.  For CHILD_UP / CHILD_DOWN it sets or clears the child's bit
+ * in priv->kid_state and refreshes priv->up_children; CHILD_UP additionally
+ * queues an add-child event when the reconciliation process has been
+ * started.  Events from xlators that are not children (index < 0) change
+ * nothing.  Every event is then passed on to default_notify.
+ */
+int
+nsr_notify (xlator_t *this, int event, void *data, ...)
+{
+ nsr_private_t *priv = this->private;
+ int index;
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ index = nsr_get_child_index(this,data);
+ if (index >= 0) {
+ /* NOTE(review): kid_state is 32 bits wide, so this assumes index < 31. */
+ priv->kid_state |= (1 << index);
+ priv->up_children = nsr_count_up_kids(priv);
+ gf_log (this->name, GF_LOG_INFO,
+ "got CHILD_UP for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ priv->up_children);
+ if (priv->nsr_recon_start == _gf_true) {
+ nsr_recon_notify_event_add_child(priv, index);
+ }
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ index = nsr_get_child_index(this,data);
+ if (index >= 0) {
+ priv->kid_state &= ~(1 << index);
+ priv->up_children = nsr_count_up_kids(priv);
+ gf_log (this->name, GF_LOG_INFO,
+ "got CHILD_DOWN for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ priv->up_children);
+ }
+ break;
+ default:
+ ;
+ }
+
+ return default_notify(this,event,data);
+}
+
+
+/*
+ * Set up memory accounting for this translator's allocation types.
+ * Returns 0 on success, the xlator_mem_acct_init error otherwise, or -1 if
+ * this is NULL.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int     ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("nsr", this, out);
+
+        ret = xlator_mem_acct_init (this, gf_mt_nsr_end + 1);
+        if (ret != 0) {
+                /* Single literal: the old split one logged "initfailed". */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed");
+        }
+out:
+        return ret;
+}
+
+
+extern void *nsr_leader_thread (void *);
+
+/*
+ * Free the private structure and every string it owns.  Accepts NULL.
+ *
+ * etcd_servers and subvol_uuid are not freed: nsr_init obtains them through
+ * GF_OPTION_INIT rather than allocating them itself.
+ */
+void
+nsr_deallocate_priv (nsr_private_t *priv)
+{
+        if (!priv) {
+                return;
+        }
+
+        if (priv->leader_key) {
+                GF_FREE(priv->leader_key);
+        }
+
+        if (priv->term_key) {
+                GF_FREE(priv->term_key);
+        }
+
+        if (priv->vol_file) {
+                GF_FREE(priv->vol_file);
+        }
+
+        /* brick_uuid comes from plain strdup(), so it needs plain free(). */
+        free(priv->brick_uuid);
+
+        GF_FREE(priv);
+}
+
+
+/*
+ * Translator init.  Checks that there is a local and at least one remote
+ * subvolume, allocates this->private and fills it from the volfile options,
+ * builds the reconciliation volfile / pidfile / log paths, starts the flush
+ * thread, spawns the reconciliation process if priv->vol_file exists, and
+ * finally starts the recon-notify and leader threads, waiting for each to
+ * flag that it has initialized.
+ *
+ * Returns 0 on success.  On failure returns -1 with the private structure
+ * released and this->private reset to NULL.
+ *
+ * NOTE(review): the two wait loops still spin forever if a thread starts but
+ * exits before setting its *_inited flag; the threads currently have no way
+ * to report failure.  The local pool is also not released on failure.
+ */
+int32_t
+nsr_init (xlator_t *this)
+{
+        xlator_list_t   *remote;
+        xlator_list_t   *local;
+        nsr_private_t   *priv           = NULL;
+        xlator_list_t   *trav;
+        pthread_t        kid;
+        uuid_t           tmp_uuid;
+        char            *my_name        = NULL;
+        char            *morph_name     = NULL;
+        char            *recon_file     = NULL;
+        char            *recon_pid_file = NULL;
+        char            *ptr            = NULL;
+        char            *volname        = NULL;
+        extern xlator_t  global_xlator;
+        glusterfs_ctx_t *oldctx         = global_xlator.ctx;
+        runner_t         runner         = {0,};
+        int32_t          ret            = -1;
+        int32_t          result         = -1;
+        int              written        = 0;
+        size_t           path_len       = 0;
+        struct stat      buf;
+        char            *recon_log      = NULL;
+        char            *recon_log_dir  = NULL;
+
+        /*
+         * Any fop that gets special treatment has to be patched in here,
+         * because the compiled-in table is produced by the code generator and
+         * only contains generated functions.  Note that we have to go through
+         * this->fops because of some dynamic-linking strangeness; modifying
+         * the static table doesn't work.
+         */
+        this->fops->getxattr = nsr_getxattr_special;
+        this->fops->fsync = nsr_fsync;
+
+        local = this->children;
+        if (!local) {
+                gf_log (this->name, GF_LOG_ERROR, "no local subvolume");
+                goto err;
+        }
+
+        remote = local->next;
+        if (!remote) {
+                gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes");
+                goto err;
+        }
+
+        this->local_pool = mem_pool_new (nsr_local_t, 128);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create nsr_local_t pool");
+                goto err;
+        }
+
+        priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate priv");
+                goto err;
+        }
+
+        // set this so that unless leader election is done, IO is fenced
+        priv->fence_io = 1;
+
+        for (trav = this->children; trav; trav = trav->next) {
+                ++(priv->n_children);
+        }
+
+        LOCK_INIT(&priv->dirty_lock);
+        LOCK_INIT(&priv->index_lock);
+        INIT_LIST_HEAD(&priv->dirty_fds);
+
+        this->private = priv;
+
+        GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err);
+        if (!priv->etcd_servers) {
+                gf_log (this->name, GF_LOG_ERROR, "etcd servers not generated. ???");
+                goto err;
+        }
+
+        GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err);
+
+        GF_OPTION_INIT ("subvol-uuid", priv->subvol_uuid, str, err);
+        if (!priv->subvol_uuid) {
+                /* Same treatment as the other mandatory string options. */
+                gf_log (this->name, GF_LOG_ERROR, "subvol uuid not generated. ???");
+                goto err;
+        }
+        gf_log (this->name, GF_LOG_INFO, "subvol_uuid = %s", priv->subvol_uuid);
+        if (gf_asprintf(&priv->leader_key,"%s:leader",priv->subvol_uuid) <= 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not generate leader key");
+                goto err;
+        }
+        if (gf_asprintf(&priv->term_key,"%s:term",priv->subvol_uuid) <= 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not generate term key");
+                goto err;
+        }
+        uuid_generate(tmp_uuid);
+        priv->brick_uuid = strdup(uuid_utoa(tmp_uuid));
+        if (!priv->brick_uuid) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not allocate brick uuid");
+                goto err;
+        }
+        gf_log (this->name, GF_LOG_INFO, "brick_uuid = %s\n", priv->brick_uuid);
+
+        GF_OPTION_INIT ("my-name", my_name, str, err);
+        if (!my_name) {
+                gf_log (this->name, GF_LOG_ERROR, "brick name not generated. ???");
+                goto err;
+        }
+        GF_OPTION_INIT ("vol-name", volname, str, err);
+        if (!volname) {
+                gf_log (this->name, GF_LOG_ERROR, "vol name not generated. ???");
+                goto err;
+        }
+
+        /* morph_name is my_name with every '/' turned into '-'. */
+        morph_name = GF_CALLOC (1, strlen(my_name) + 1, gf_mt_nsr_private_t);
+        if (!morph_name) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not allocate brick name copy");
+                goto err;
+        }
+        strcpy(morph_name, my_name);
+
+        path_len = PATH_MAX + strlen(morph_name) + strlen("recon") + 1;
+        recon_file = GF_CALLOC (1, path_len, gf_mt_nsr_private_t);
+        recon_pid_file = GF_CALLOC (1, path_len, gf_mt_nsr_private_t);
+        if ((!recon_file) || (!recon_pid_file)) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+                goto err;
+        }
+        ptr = strchr (morph_name, '/');
+        while (ptr) {
+                *ptr = '-';
+                ptr = strchr (morph_name, '/');
+        }
+
+        /* Bounded formatting: volname is not limited by the buffer size. */
+        written = snprintf (recon_file, path_len,
+                            "/%s/%s/%s/%s/%s-nsr-recon.vol",
+                            GLUSTERD_DEFAULT_WORKDIR,
+                            GLUSTERD_VOLUME_DIR_PREFIX,
+                            volname,
+                            GLUSTERD_BRICK_INFO_DIR,
+                            morph_name);
+        if ((written < 0) || ((size_t)written >= path_len)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation file name too long");
+                goto err;
+        }
+
+        written = snprintf (recon_pid_file, path_len,
+                            "/%s/%s/%s/%s/%s-recon.pid",
+                            GLUSTERD_DEFAULT_WORKDIR,
+                            GLUSTERD_VOLUME_DIR_PREFIX,
+                            volname,
+                            "run",
+                            morph_name);
+        if ((written < 0) || ((size_t)written >= path_len)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation file name too long");
+                goto err;
+        }
+
+        priv->vol_file = GF_CALLOC (1, path_len, gf_mt_nsr_private_t);
+        if (!priv->vol_file) {
+                gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+                goto err;
+        }
+        written = snprintf (priv->vol_file, path_len,
+                            "%s/%s/%s/%s/con:%s",
+                            GLUSTERD_DEFAULT_WORKDIR,
+                            GLUSTERD_VOLUME_DIR_PREFIX,
+                            volname,
+                            GLUSTERD_BRICK_INFO_DIR,
+                            morph_name);
+        if ((written < 0) || ((size_t)written >= path_len)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "reconciliation file name too long");
+                goto err;
+        }
+
+        if (pthread_create(&kid,NULL,nsr_flush_thread,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not start flush thread");
+                /* TBD: treat this as a fatal error? */
+        }
+
+        // Start the recon process. Then start the leader thread.
+        /*
+         * REVIEW
+         * Logs belong in /var/log not /tmp.
+         */
+
+        ret = mkdir (NSR_LOG_DIR, 0777);
+        if (ret != 0) {
+                if (errno != EEXIST) {
+                        gf_log (this->name, GF_LOG_ERROR, "Couldn't create"
+                                " nsr log directory (%s)", strerror (errno));
+                        goto err;
+                }
+        }
+
+        recon_log_dir = GF_CALLOC (1, strlen (NSR_LOG_DIR) + strlen(morph_name)
+                                   + 2, gf_mt_nsr_private_t);
+        if (!recon_log_dir) {
+                gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log "
+                        "dir name");
+                goto err;
+        }
+        sprintf (recon_log_dir, "%s/%s", NSR_LOG_DIR, morph_name);
+        ret = mkdir (recon_log_dir, 0777);
+
+        if (ret != 0){
+                if (errno != EEXIST) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Couldn't create brick log dir (%s)",
+                                strerror (errno));
+                        goto err;
+                }
+        }
+
+        recon_log = GF_CALLOC (1, strlen (recon_log_dir)+
+                               strlen ("reconciliation.log") + 2,
+                               gf_mt_nsr_private_t);
+        if (!recon_log) {
+                gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log"
+                        " file name");
+                goto err;
+        }
+        sprintf (recon_log, "%s/reconciliation.log", recon_log_dir);
+
+        if (!stat(priv->vol_file, &buf)) {
+
+                runinit (&runner);
+                runner_add_args(&runner, SBIN_DIR "/glusterfs",
+                                "-f", recon_file,
+                                "-p", recon_pid_file,
+                                "-l", recon_log,
+                                NULL);
+                ret = runner_run (&runner);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not exec reconciliation process %s",
+                                SBIN_DIR "/glusterfs");
+                        goto err;
+                }
+
+                // TBD - convert this to make sure recon process runs
+                sleep(2);
+                priv->nsr_recon_start = _gf_true;
+        }
+
+        /*
+         * A thread that was never created will never set its flag, so bail
+         * out instead of waiting for it forever.
+         */
+        if (pthread_create(&kid,NULL,nsr_recon_notify_thread,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to start recon notify thread");
+                goto err;
+        }
+        while (priv->recon_notify_inited == 0) {
+                sleep(1);
+        }
+
+        if (pthread_create(&kid,NULL,nsr_leader_thread,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to start leader thread");
+                goto err;
+        }
+        while (priv->leader_inited == 0) {
+                sleep(1);
+        }
+
+        /*
+         * Calling glfs_new changes old->ctx, even if THIS still points
+         * to global_xlator.  That causes problems later in the main
+         * thread, when gf_log_dump_graph tries to use the FILE after
+         * we've mucked with it and gets a segfault in __fprintf_chk.
+         * We can avoid all that by undoing the damage before we
+         * continue.
+         */
+        global_xlator.ctx = oldctx;
+
+        result = 0;
+        goto out;
+
+err:
+        nsr_deallocate_priv(priv);
+        /* Don't leave a freed pointer behind for fini to free again. */
+        this->private = NULL;
+
+out:
+        /*
+         * The scratch names are only needed while the paths are built and
+         * the reconciliation process is launched (runner_run has returned by
+         * now), so release them on every path.
+         */
+        if (morph_name) {
+                GF_FREE(morph_name);
+        }
+        if (recon_file) {
+                GF_FREE(recon_file);
+        }
+        if (recon_pid_file) {
+                GF_FREE(recon_pid_file);
+        }
+        if (recon_log_dir) {
+                GF_FREE(recon_log_dir);
+        }
+        if (recon_log) {
+                GF_FREE(recon_log);
+        }
+        return result;
+}
+
+
+/*
+ * Translator teardown: release the private structure and clear the pointer
+ * so nothing can reach the freed memory through this->private afterwards.
+ */
+void
+nsr_fini (xlator_t *this)
+{
+        nsr_deallocate_priv(this->private);
+        this->private = NULL;
+}
+
+/* Lifecycle entry points of this translator. */
+class_methods_t class_methods = {
+ .init = nsr_init,
+ .fini = nsr_fini,
+ .reconfigure = nsr_reconfigure,
+ .notify = nsr_notify,
+};
+
+/*
+ * Volfile options understood by this translator.  nsr_init reads all of them
+ * except "leader", which is read by nsr_reconfigure.
+ */
+struct volume_options options[] = {
+        { .key = {"leader"},
+          .type = GF_OPTION_TYPE_BOOL,
+          .default_value = "false",
+          .description = "Start in the leader role.  This is only for "
+                         "bootstrapping the code, and should go away when we "
+                         "have real leader election."
+        },
+        { .key = {"vol-name"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "volume name"
+        },
+        { .key = {"my-name"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "brick name in form of host:/path"
+        },
+        { .key = {"etcd-servers"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "comma-separated list of etcd servers"
+        },
+        { .key = {"subvol-uuid"},
+          .type = GF_OPTION_TYPE_STR,
+          .description = "UUID for this NSR (sub)volume"
+        },
+        { .key = {"quorum-percent"},
+          .type = GF_OPTION_TYPE_PERCENT,
+          .default_value = "50.0",
+          .description = "percentage of rep_count-1 that must be up"
+        },
+        { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-server/src/recon_notify.c b/xlators/cluster/nsr-server/src/recon_notify.c
new file mode 100644
index 000000000..1c50de234
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/recon_notify.c
@@ -0,0 +1,389 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+
+
+/* State of the recon notify thread, reachable through priv->recon_ctx. */
+typedef struct _nsr_recon_notify_ctx_t {
+ nsr_recon_notify_ev_t recon_head; /* only its list member is used, as the queue head */
+ pthread_mutex_t recon_mutex; /* held by producers while they queue an event */
+ pthread_cond_t recon_cv; /* signalled whenever an event is queued */
+ char **hosts; // list of hosts ordered depending on child indices
+ uint32_t current_term;
+ uint32_t last_reconciled_term; /* term for which the leader write last succeeded */
+ glfs_t *fs; /* gfapi instance created in nsr_setup_recon */
+ glfs_fd_t *fd; /* handle on "/" used to write role messages */
+} nsr_recon_notify_ctx_t;
+
+/*
+ * Look up a string option in a translator's option dictionary.  Returns
+ * whatever dict_get_str returns; *value is filled in by that call.
+ */
+static int
+xlator_get_option (xlator_t *xl, char *key, char **value)
+{
+        int     rc;
+
+        GF_ASSERT (xl);
+
+        rc = dict_get_str (xl->options, key, value);
+        return rc;
+}
+
+/*
+ * Queue a NSR_RECON_SET_LEADER event and wake the recon notify thread.  If
+ * the event cannot be allocated it is logged and dropped.
+ */
+void nsr_recon_notify_event_set_leader(nsr_private_t *priv)
+{
+        nsr_recon_notify_ev_t   *ev;
+        nsr_recon_notify_ctx_t  *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+        ev = GF_CALLOC (1, sizeof (nsr_recon_notify_ev_t), 0);
+        if (!ev) {
+                /* Writing through a NULL event would crash the caller. */
+                gf_log ("nsr", GF_LOG_ERROR,
+                        "could not allocate set-leader notify event");
+                return;
+        }
+        ev->id = NSR_RECON_SET_LEADER;
+        INIT_LIST_HEAD(&(ev->list));
+
+        pthread_mutex_lock(&ctx->recon_mutex);
+        list_add_tail(&ev->list, &ctx->recon_head.list);
+        pthread_cond_signal(&ctx->recon_cv);
+        pthread_mutex_unlock(&ctx->recon_mutex);
+}
+
+/*
+ * Queue a NSR_RECON_ADD_CHILD event for the child at the given index and wake
+ * the recon notify thread.  If the event cannot be allocated it is logged and
+ * dropped.
+ */
+void nsr_recon_notify_event_add_child(nsr_private_t *priv, uint32_t index)
+{
+        nsr_recon_notify_ev_t   *ev;
+        nsr_recon_notify_ctx_t  *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+        ev = GF_CALLOC (1, sizeof (nsr_recon_notify_ev_t), 0);
+        if (!ev) {
+                /* Writing through a NULL event would crash the caller. */
+                gf_log ("nsr", GF_LOG_ERROR,
+                        "could not allocate add-child notify event");
+                return;
+        }
+        ev->id = NSR_RECON_ADD_CHILD;
+        ev->index = index;
+        INIT_LIST_HEAD(&(ev->list));
+
+        pthread_mutex_lock(&ctx->recon_mutex);
+        list_add_tail(&ev->list, &ctx->recon_head.list);
+        pthread_cond_signal(&ctx->recon_cv);
+        pthread_mutex_unlock(&ctx->recon_mutex);
+}
+
+
+/*
+ * Tell the reconciliation process that this brick is the leader for the
+ * current term.  Does nothing unless this brick is leader, the current term
+ * has not been reconciled yet, and more than half of the children are up.
+ * Otherwise it writes a role message naming every up child to ctx->fd and
+ * retries every 5 seconds until the write succeeds or leadership is lost.
+ * On success it records the reconciled term, resets priv->index and lifts
+ * the I/O fence.
+ */
+static void
+nsr_recon_set_leader (xlator_t *this)
+{
+
+ nsr_private_t *priv = this->private;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+ nsr_recon_role_t role;
+ xlator_t *old = this;
+ uint32_t i=0;
+
+ if (priv->leader != _gf_true)
+ return;
+
+ if (ctx->last_reconciled_term == priv->current_term)
+ return;
+
+ /*
+ * Quorum for reconciliation is not the same as quorum for I/O. Here,
+ * we require a true majority. The +1 is because we don't count
+ * ourselves as part of n_children or up_children.
+ *
+ * TBD: re-evaluate when to reconcile (including partial)
+ */
+ /* NOTE(review): the comment above mentions a +1 that the test below does not contain. */
+ if (priv->up_children <= (priv->n_children / 2))
+ return;
+
+ gf_log (this->name, GF_LOG_INFO,
+ "Sending message to do recon with %d nodes\n",
+ priv->up_children);
+
+ role.num = 0;
+ role.role = leader;
+ for (i = 0; i < priv->n_children; ++i) {
+ if (priv->kid_state & (1 << i)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Recon using host %s",
+ ctx->hosts[i]);
+ /* NOTE(review): unbounded copy; the size of role.info[].name is not visible here. */
+ strcpy(role.info[role.num].name, ctx->hosts[i]);
+ (role.num)++;
+ }
+ }
+
+ gf_log (this->name, GF_LOG_INFO,
+ "setting current term as %d", priv->current_term);
+ role.current_term = priv->current_term;
+ ENDIAN_CONVERSION_RR(role, _gf_false);
+
+ // inform the reconciliator that this is leader
+ // in the callback (once reconciliation is done),
+ // we will unfence the IOs.
+ // TBD - error handling later.
+ if (glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET) == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "doing lseek failed\n");
+ return;
+ }
+
+ /* glusterfs_this_set(old) is repeated after every gfapi call, presumably
+ * because those calls change THIS -- confirm. */
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "Writing to local node to set leader");
+ do {
+ if (priv->leader != _gf_true) {
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_ERROR, "no longer leader\n");
+ return;
+ }
+ if (glfs_write(ctx->fd, &role, sizeof(role), 0) == -1) {
+ /* Both branches below wait 5 seconds and retry; only the log text differs. */
+ if (errno == EAGAIN) {
+ // Wait for old reconciliation to bail out.
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_ERROR,
+ "write failed with retry. retrying after some time\n");
+ sleep(5);
+ continue;
+ }
+ else{
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_ERROR,
+ "doing write failed\n");
+ // This is because reconciliation has returned with error
+ // because some node has died in between.
+ // What should be done? Either we retry being leader
+ // or hook to CHILD_DOWN notification.
+ // Put that logic later. As of now we will just retry.
+ // This is easier.
+ sleep(5);
+ continue;
+ }
+ } else {
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO, "doing write with success\n");
+ break;
+ }
+ } while(1);
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "glfs_write returned. unfencing IO\n");
+
+ // TBD - error handling
+
+ ctx->last_reconciled_term = priv->current_term;
+ priv->index = 0; // reset changelog index
+ atomic_fetch_and(&(priv->fence_io), 0);
+
+ return;
+}
+
+/*
+ * Handle a child coming up while this brick is leader.  If the current term
+ * has not been reconciled yet, this simply retries the full leader
+ * reconciliation.  Otherwise it fences new I/O, waits for in-flight
+ * operations to drain, writes a "joiner" role message for the child at
+ * index, and lifts the fence again.
+ */
+static void
+nsr_recon_add_child (xlator_t *this, uint32_t index)
+{
+ nsr_private_t *priv = this->private;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+ nsr_recon_role_t role;
+ xlator_t *old = this;
+
+ if (priv->leader != _gf_true)
+ return;
+
+ // reconciliation still pending.
+ // Check if we have majority
+ if (ctx->last_reconciled_term != priv->current_term) {
+ nsr_recon_set_leader(this);
+ } else {
+ // Reconciliation done.
+ // new child joining the majority/
+ // Do reconciliation only fot this child but after fencing new IO and draining old IO
+ role.num = 1;
+ role.role = joiner;
+
+ atomic_fetch_or(&(priv->fence_io), 1);
+ /* Polls once a second until the in-flight counter reaches zero. */
+ while(priv->ops_in_flight) {
+ sleep(1);
+ }
+
+ /* NOTE(review): unbounded copy; the size of role.info[].name is not visible here. */
+ strcpy(role.info[0].name, ctx->hosts[index]);
+ role.current_term = priv->current_term;
+ ENDIAN_CONVERSION_RR(role, _gf_false);
+ /* NOTE(review): the results of this seek and of the write below are ignored. */
+ glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET);
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "Writing to local node to join %s\n", role.info[0].name);
+ glfs_write(ctx->fd, &role,
+ sizeof(role), 0);
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "Write to local node to set joiner returned\n");
+
+ // TBD - error handling
+ atomic_fetch_and(&(priv->fence_io), 0);
+ }
+
+ return;
+}
+
+/*
+ * Open the gfapi channel used to talk to the reconciliation process: create
+ * a glfs instance named after the subvolume UUID, point it at
+ * priv->vol_file, initialize it and open "/" read-write.
+ *
+ * Returns 0 on success, or immediately when the reconciliation process was
+ * not started; non-zero on any failure.
+ *
+ * NOTE(review): on failure ctx->fs is left allocated; nothing here tears it
+ * down.
+ */
+static uint32_t
+nsr_setup_recon (xlator_t *this)
+{
+ nsr_private_t *priv = this->private;
+ xlator_t *old = this;
+ uint32_t ret = 0;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+ if (priv->nsr_recon_start == _gf_false)
+ return 0;
+
+ ctx->fs = glfs_new(priv->subvol_uuid);
+ if (!ctx->fs) {
+ ret = 1;
+ gf_log (this->name, GF_LOG_ERROR, "failed to initialise glfs \n");
+ goto done;
+ }
+
+ /* THIS is reset after each gfapi call, presumably because those calls
+ * change it -- confirm. */
+ glusterfs_this_set(old);
+ ret = glfs_set_volfile(ctx->fs, priv->vol_file);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to set volfile \n");
+ goto done;
+ }
+
+ glusterfs_this_set(old);
+ /*
+ * REVIEW
+ * Logs belong in /var/log not /tmp.
+ */
+ glfs_set_logging (ctx->fs,"/tmp/glfs-log", 7);
+ if (glfs_init(ctx->fs) < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to init volfile \n");
+ ret = 1;
+ goto done;
+ }
+
+ glusterfs_this_set(old);
+ ctx->fd = glfs_open (ctx->fs, "/", O_RDWR);
+ if (ctx->fd == NULL) {
+ ret = 1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to open fd to communicate with recon process \n");
+ goto done;
+ }
+
+
+done:
+ glusterfs_this_set(old);
+ return ret;
+}
+
+
+/*
+ * Build ctx->hosts, one string per child in child-list order.  A
+ * protocol/client child becomes "<remote-host>:<remote-subvolume>"; any other
+ * child is taken to be the local brick and gets this translator's "my-name"
+ * option.
+ *
+ * NOTE(review): none of the allocations are checked before use, and when an
+ * option lookup fails the slot stays NULL (the GF_ASSERT only logs/aborts
+ * depending on the build) even though later code copies from it.
+ */
+static void
+nsr_setup_hosts(xlator_t *this)
+{
+ xlator_list_t *trav;
+ nsr_private_t *priv = this->private;
+ uint32_t i = 0;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+ /* Count and size arguments are in the opposite order from the other
+ * GF_CALLOC calls in this file; the total requested is the same. */
+ ctx->hosts = GF_CALLOC(sizeof(char *), priv->n_children, gf_mt_nsr_private_t);
+ // Iterate thru all the children
+ for (trav = this->children; trav; trav = trav->next) {
+ char *hostname = NULL, *vol = NULL;
+ int ret1 = 0, ret2 = 0, ret = 0;
+ xlator_t *xl = trav->xlator;
+ // If the child type is that of protocol/client
+ if (!strcmp(trav->xlator->type, "protocol/client")) {
+ ret1 = xlator_get_option (xl, "remote-host", &hostname);
+ ret2 = xlator_get_option (xl, "remote-subvolume", &vol);
+ if (!ret1 && !ret2) {
+ // add the name of that host to the hosts
+ /* +2 leaves room for the ':' separator and the terminating NUL. */
+ ctx->hosts[i] = GF_CALLOC(sizeof(char), strlen(hostname) + strlen(vol) + 2, 0);
+ strcpy(ctx->hosts[i], hostname);
+ strcat(ctx->hosts[i], ":");
+ strcat(ctx->hosts[i], vol);
+ gf_log (this->name, GF_LOG_INFO,
+ "adding hosts %s to recon notfiy list", ctx->hosts[i]);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "CANNOT FIND HOSTNAME FOR A CHILD");
+ GF_ASSERT(0);
+ }
+ // local brick
+ } else {
+ ret = xlator_get_option (this, "my-name", &hostname);
+ if (!ret) {
+ uint32_t len = strlen(hostname);
+ ctx->hosts[i] = GF_CALLOC(sizeof(char),
+ len+1,
+ gf_mt_nsr_private_t);
+ strcpy(ctx->hosts[i], hostname);
+ gf_log (this->name, GF_LOG_INFO,
+ "adding my host %s to recon notfiy list", ctx->hosts[i]);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "CANNOT FIND MY HOSTNAME");
+ GF_ASSERT(0);
+ }
+ }
+ i++;
+ }
+}
+
+/*
+ * Recon notify thread (arg is the xlator_t).  Allocates the notify context,
+ * builds the host table, opens the channel to the reconciliation process and
+ * then serves queued events forever: set-leader events go to
+ * nsr_recon_set_leader, add-child events to nsr_recon_add_child.
+ *
+ * Returns NULL if the context cannot be allocated or the channel cannot be
+ * set up; in both cases priv->recon_notify_inited is never set.
+ *
+ * Each batch of events is detached from the shared queue while the mutex is
+ * held and then processed (and freed) without it, so producers are never
+ * blocked behind a long reconciliation and never race with the traversal.
+ */
+void *
+nsr_recon_notify_thread (void *arg)
+{
+        xlator_t                *this   = (xlator_t *)arg;
+        nsr_private_t           *priv   = this->private;
+        nsr_recon_notify_ev_t   *ev;
+        nsr_recon_notify_ev_t   *ev_next;
+        nsr_recon_notify_ctx_t  *ctx;
+        struct list_head         batch;
+
+        priv->recon_ctx = GF_CALLOC(1, sizeof(nsr_recon_notify_ctx_t), gf_mt_nsr_private_t);
+        if (!priv->recon_ctx) {
+                gf_log (this->name, GF_LOG_ERROR, "calloc error");
+                return NULL;
+        }
+        ctx = priv->recon_ctx;
+
+        pthread_mutex_init(&(ctx->recon_mutex), NULL);
+        pthread_cond_init(&(ctx->recon_cv), NULL);
+        INIT_LIST_HEAD(&(ctx->recon_head.list));
+
+        nsr_setup_hosts(this);
+
+        if (nsr_setup_recon(this)) {
+                gf_log (this->name, GF_LOG_ERROR, "recon notify thread : initing glfs error");
+                return NULL;
+        }
+
+        priv->recon_notify_inited = 1;
+
+        while(1) {
+                INIT_LIST_HEAD(&batch);
+
+                pthread_mutex_lock(&ctx->recon_mutex);
+                while (list_empty(&(ctx->recon_head.list))) {
+                        pthread_cond_wait(&ctx->recon_cv, &ctx->recon_mutex);
+                }
+                /* Take everything queued so far; the shared queue is empty again. */
+                list_splice_init(&(ctx->recon_head.list), &batch);
+                pthread_mutex_unlock(&ctx->recon_mutex);
+
+                list_for_each_entry_safe(ev, ev_next, &batch, list) {
+                        list_del_init (&ev->list);
+
+                        if (ev->id == NSR_RECON_SET_LEADER) {
+                                gf_log (this->name, GF_LOG_INFO,
+                                        "got add leader notfiy event");
+                                nsr_recon_set_leader(this);
+                        } else if (ev->id == NSR_RECON_ADD_CHILD) {
+                                gf_log (this->name, GF_LOG_INFO,
+                                        "got add child notify event");
+                                nsr_recon_add_child(this, ev->index);
+                        }
+
+                        /* Events are allocated by the producers; we own them now. */
+                        GF_FREE(ev);
+                }
+        }
+
+        return NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/yajl.c b/xlators/cluster/nsr-server/src/yajl.c
new file mode 100644
index 000000000..54e6474fc
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_parse.h"
+#include "yajl_lex.h"
+#include "yajl_parser.h"
+#include "yajl_alloc.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+/*
+ * Translate a parser status code into a static English description.
+ * Values outside the three known codes yield "unknown".
+ */
+const char *
+yajl_status_to_string(yajl_status stat)
+{
+    if (stat == yajl_status_ok) {
+        return "ok, no error";
+    }
+    if (stat == yajl_status_client_canceled) {
+        return "client canceled parse";
+    }
+    if (stat == yajl_status_error) {
+        return "parse error";
+    }
+    /* anything else has no dedicated label */
+    return "unknown";
+}
+
+/*
+ * Allocate and initialise a parser handle.
+ *
+ * callbacks: table stored in the handle as-is.
+ * afs:       allocation routines; when NULL the defaults from
+ *            yajl_set_default_alloc_funcs() are used.  A non-NULL table
+ *            with any of malloc/realloc/free unset is rejected.
+ * ctx:       opaque pointer stored in the handle.
+ *
+ * Returns the new handle, or NULL when afs is incomplete or the handle
+ * itself cannot be allocated.
+ */
+yajl_handle
+yajl_alloc(const yajl_callbacks * callbacks,
+           yajl_alloc_funcs * afs,
+           void * ctx)
+{
+    yajl_handle hand = NULL;
+    yajl_alloc_funcs afsBuffer;
+
+    /* first order of business is to set up memory allocation routines */
+    if (afs != NULL) {
+        if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL)
+        {
+            return NULL;
+        }
+    } else {
+        yajl_set_default_alloc_funcs(&afsBuffer);
+        afs = &afsBuffer;
+    }
+
+    hand = (yajl_handle) YA_MALLOC(afs, sizeof(struct yajl_handle_t));
+    if (hand == NULL) {
+        /* out of memory: fail cleanly instead of writing through NULL */
+        return NULL;
+    }
+
+    /* copy in pointers to allocation routines; afs may point at the
+     * stack-local afsBuffer, so the handle needs its own copy */
+    memcpy((void *) &(hand->alloc), (void *) afs, sizeof(yajl_alloc_funcs));
+
+    hand->callbacks = callbacks;
+    hand->ctx = ctx;
+    hand->lexer = NULL;   /* allocated on first use by the parse entry points */
+    hand->bytesConsumed = 0;
+    hand->decodeBuf = yajl_buf_alloc(&(hand->alloc));
+    hand->flags = 0;
+    yajl_bs_init(hand->stateStack, &(hand->alloc));
+    yajl_bs_push(hand->stateStack, yajl_state_start);
+
+    return hand;
+}
+
+/*
+ * Switch a boolean parser option on or off.
+ *
+ * For a recognised option one int is read from the variadic arguments:
+ * non-zero sets the option's bit in h->flags, zero clears it.
+ *
+ * Returns 1 for a recognised option, 0 otherwise (flags left untouched).
+ */
+int
+yajl_config(yajl_handle h, yajl_option opt, ...)
+{
+    int recognised;
+    va_list args;
+
+    recognised = (opt == yajl_allow_comments ||
+                  opt == yajl_dont_validate_strings ||
+                  opt == yajl_allow_trailing_garbage ||
+                  opt == yajl_allow_multiple_values ||
+                  opt == yajl_allow_partial_values);
+
+    va_start(args, opt);
+    if (recognised) {
+        /* the variadic argument is only consumed for known options */
+        if (va_arg(args, int)) {
+            h->flags |= opt;
+        } else {
+            h->flags &= ~opt;
+        }
+    }
+    va_end(args);
+
+    return recognised;
+}
+
+/*
+ * Release a parser handle: its state stack, its decode buffer, the lexer
+ * if one was created, and finally the handle itself (through the
+ * handle's own allocation routines).
+ */
+void
+yajl_free(yajl_handle handle)
+{
+    yajl_bs_free(handle->stateStack);
+    yajl_buf_free(handle->decodeBuf);
+
+    /* the lexer only exists once a parse entry point has run */
+    if (handle->lexer != NULL) {
+        yajl_lex_free(handle->lexer);
+        handle->lexer = NULL;
+    }
+
+    YA_FREE(&(handle->alloc), handle);
+}
+
+/*
+ * Feed jsonTextLen bytes of jsonText to the parser.  The lexer is created
+ * on the first call, configured from the handle's current flags, and the
+ * status of yajl_do_parse() is returned.
+ */
+yajl_status
+yajl_parse(yajl_handle hand, const unsigned char * jsonText,
+           size_t jsonTextLen)
+{
+    /* lazy allocation of the lexer */
+    if (!hand->lexer) {
+        hand->lexer = yajl_lex_alloc(&(hand->alloc),
+                                     hand->flags & yajl_allow_comments,
+                                     !(hand->flags & yajl_dont_validate_strings));
+    }
+
+    return yajl_do_parse(hand, jsonText, jsonTextLen);
+}
+
+
+/*
+ * Signal end of input and return the status of yajl_do_finish().
+ */
+yajl_status
+yajl_complete_parse(yajl_handle hand)
+{
+    yajl_status outcome;
+
+    /* The lexer is normally created by the first call to parse.  If parse
+     * was never called, no data was provided at all, which is a
+     * "premature EOF" error unless yajl_allow_partial_values is set.
+     * Creating the lexer here is the simplest way to handle that case
+     * while keeping every other parser semantic (multiple values,
+     * partial values, etc). */
+    if (!hand->lexer) {
+        hand->lexer = yajl_lex_alloc(&(hand->alloc),
+                                     hand->flags & yajl_allow_comments,
+                                     !(hand->flags & yajl_dont_validate_strings));
+    }
+
+    outcome = yajl_do_finish(hand);
+    return outcome;
+}
+
+/*
+ * Produce an error description for the handle by delegating to
+ * yajl_render_error_string(); the arguments are forwarded unchanged
+ * (verbose goes last in the callee's parameter order).
+ */
+unsigned char *
+yajl_get_error(yajl_handle hand, int verbose,
+               const unsigned char * jsonText, size_t jsonTextLen)
+{
+    unsigned char * rendered =
+        yajl_render_error_string(hand, jsonText, jsonTextLen, verbose);
+
+    return rendered;
+}
+
+/*
+ * Return the handle's bytesConsumed counter, or 0 for a NULL handle.
+ */
+size_t
+yajl_get_bytes_consumed(yajl_handle hand)
+{
+    return (hand != NULL) ? hand->bytesConsumed : 0;
+}
+
+
+/*
+ * Free an error string using the allocation routines stored in the handle.
+ */
+void
+yajl_free_error(yajl_handle hand, unsigned char * str)
+{
+    /* use memory allocation functions if set */
+    yajl_alloc_funcs * funcs = &(hand->alloc);
+
+    YA_FREE(funcs, str);
+}
+
+/* XXX: add utility routines to parse from file */
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_common.h b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
new file mode 100644
index 000000000..49ca3a5cb
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_COMMON_H__
+#define __YAJL_COMMON_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Nesting limit.  yajl_gen.h names it as the bound behind the
+ * yajl_max_depth_exceeded generator status. */
+#define YAJL_MAX_DEPTH 128
+
+/* msft dll export gunk. To build a DLL on windows, you
+ * must define WIN32, YAJL_SHARED, and YAJL_BUILD. To use a shared
+ * DLL, you must define YAJL_SHARED and WIN32 */
+#if defined(WIN32) && defined(YAJL_SHARED)
+# ifdef YAJL_BUILD
+# define YAJL_API __declspec(dllexport)
+# else
+# define YAJL_API __declspec(dllimport)
+# endif
+#else
+/* GCC 3.3 or newer: mark API symbols with default visibility;
+ * any other compiler gets an empty YAJL_API. */
+# if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+# define YAJL_API __attribute__ ((visibility("default")))
+# else
+# define YAJL_API
+# endif
+#endif
+
+/** pointer to a malloc function, supporting client overriding memory
+ * allocation routines; ctx is the context pointer from yajl_alloc_funcs */
+typedef void * (*yajl_malloc_func)(void *ctx, size_t sz);
+
+/** pointer to a free function, supporting client overriding memory
+ * allocation routines */
+typedef void (*yajl_free_func)(void *ctx, void * ptr);
+
+/** pointer to a realloc function which can resize an allocation. */
+typedef void * (*yajl_realloc_func)(void *ctx, void * ptr, size_t sz);
+
+/** A structure which can be passed to yajl_*_alloc routines to allow the
+ * client to specify memory allocation functions to be used. */
+typedef struct
+{
+ /** pointer to a function that can allocate uninitialized memory */
+ yajl_malloc_func malloc;
+ /** pointer to a function that can resize memory allocations */
+ yajl_realloc_func realloc;
+ /** pointer to a function that can free memory allocated using
+ * the realloc or malloc members above */
+ yajl_free_func free;
+ /** a context pointer that will be passed to above allocation routines */
+ void * ctx;
+} yajl_alloc_funcs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_gen.h b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h
new file mode 100644
index 000000000..52fa99fc2
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_gen.h
+ * Interface to YAJL's JSON generation facilities.
+ */
+
+#include <yajl/yajl_common.h>
+
+#ifndef __YAJL_GEN_H__
+#define __YAJL_GEN_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /** generator status codes, returned by the yajl_gen_* functions */
+ typedef enum {
+ /** no error */
+ yajl_gen_status_ok = 0,
+ /** at a point where a map key is generated, a function other than
+ * yajl_gen_string was called */
+ yajl_gen_keys_must_be_strings,
+ /** YAJL's maximum generation depth was exceeded. see
+ * YAJL_MAX_DEPTH */
+ yajl_max_depth_exceeded,
+ /** A generator function (yajl_gen_XXX) was called while in an error
+ * state */
+ yajl_gen_in_error_state,
+ /** A complete JSON document has been generated */
+ yajl_gen_generation_complete,
+ /** yajl_gen_double was passed an invalid floating point value
+ * (infinity or NaN). */
+ yajl_gen_invalid_number,
+ /** A print callback was passed in, so there is no internal
+ * buffer to get from */
+ yajl_gen_no_buf,
+ /** returned from yajl_gen_string() when the yajl_gen_validate_utf8
+ * option is enabled and an invalid UTF8 string was passed by
+ * client code.
+ */
+ yajl_gen_invalid_string
+ } yajl_gen_status;
+
+ /** an opaque handle to a generator */
+ typedef struct yajl_gen_t * yajl_gen;
+
+ /** a callback used for "printing" the results. */
+ typedef void (*yajl_print_t)(void * ctx,
+ const char * str,
+ size_t len);
+
+ /** configuration parameters for the generator, these may be passed to
+ * yajl_gen_config() along with option specific argument(s). In general,
+ * all configuration parameters default to *off*. */
+ typedef enum {
+ /** generate indented (beautiful) output */
+ yajl_gen_beautify = 0x01,
+ /**
+ * Set an indent string which is used when yajl_gen_beautify
+ * is enabled. Maybe something like \\t or some number of
+ * spaces. The default is four spaces.
+ */
+ yajl_gen_indent_string = 0x02,
+ /**
+ * Set a function and context argument that should be used to
+ * output generated json. the function should conform to the
+ * yajl_print_t prototype while the context argument is a
+ * void * of your choosing.
+ *
+ * example:
+ * yajl_gen_config(g, yajl_gen_print_callback, myFunc, myVoidPtr);
+ */
+ yajl_gen_print_callback = 0x04,
+ /**
+ * Normally the generator does not validate that strings you
+ * pass to it via yajl_gen_string() are valid UTF8. Enabling
+ * this option will cause it to do so.
+ */
+ yajl_gen_validate_utf8 = 0x08,
+ /**
+ * the forward solidus (slash or '/' in human) is not required to be
+ * escaped in json text. By default, YAJL will not escape it in the
+ * interest of saving bytes. Setting this flag will cause YAJL to
+ * always escape '/' in generated JSON strings.
+ */
+ yajl_gen_escape_solidus = 0x10
+ } yajl_gen_option;
+
+ /** allow the modification of generator options subsequent to handle
+ * allocation (via yajl_gen_alloc)
+ * \returns zero in case of errors, non-zero otherwise
+ */
+ YAJL_API int yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...);
+
+ /** allocate a generator handle
+ * \param allocFuncs an optional pointer to a structure which allows
+ * the client to override the memory allocation
+ * used by yajl. May be NULL, in which case
+ * malloc/free/realloc will be used.
+ *
+ * \returns an allocated handle on success, NULL on failure (bad params)
+ */
+ YAJL_API yajl_gen yajl_gen_alloc(const yajl_alloc_funcs * allocFuncs);
+
+ /** free a generator handle */
+ YAJL_API void yajl_gen_free(yajl_gen handle);
+
+ YAJL_API yajl_gen_status yajl_gen_integer(yajl_gen hand, long long int number);
+ /** generate a floating point number. number may not be infinity or
+ * NaN, as these have no representation in JSON. In these cases the
+ * generator will return 'yajl_gen_invalid_number' */
+ YAJL_API yajl_gen_status yajl_gen_double(yajl_gen hand, double number);
+ YAJL_API yajl_gen_status yajl_gen_number(yajl_gen hand,
+ const char * num,
+ size_t len);
+ YAJL_API yajl_gen_status yajl_gen_string(yajl_gen hand,
+ const unsigned char * str,
+ size_t len);
+ YAJL_API yajl_gen_status yajl_gen_null(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_bool(yajl_gen hand, int boolean);
+ YAJL_API yajl_gen_status yajl_gen_map_open(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_map_close(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_array_open(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_array_close(yajl_gen hand);
+
+ /** access the null terminated generator buffer. If incrementally
+ * outputting JSON, one should call yajl_gen_clear to clear the
+ * buffer. This allows stream generation. */
+ YAJL_API yajl_gen_status yajl_gen_get_buf(yajl_gen hand,
+ const unsigned char ** buf,
+ size_t * len);
+
+ /** clear yajl's output buffer, but maintain all internal generation
+ * state. This function will not "reset" the generator state, and is
+ * intended to enable incremental JSON outputting. */
+ YAJL_API void yajl_gen_clear(yajl_gen hand);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_parse.h b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h
new file mode 100644
index 000000000..55c831101
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_parse.h
+ * Interface to YAJL's JSON stream parsing facilities.
+ */
+
+#include <yajl/yajl_common.h>
+
+#ifndef __YAJL_PARSE_H__
+#define __YAJL_PARSE_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /** status codes returned from this interface */
+ typedef enum {
+ /** no error was encountered */
+ yajl_status_ok,
+ /** a client callback returned zero, stopping the parse */
+ yajl_status_client_canceled,
+ /** An error occurred during the parse. Call yajl_get_error for
+ * more information about the encountered error */
+ yajl_status_error
+ } yajl_status;
+
+ /** attain a human readable, english, string for an error */
+ YAJL_API const char * yajl_status_to_string(yajl_status code);
+
+ /** an opaque handle to a parser */
+ typedef struct yajl_handle_t * yajl_handle;
+
+ /** yajl is an event driven parser. this means as json elements are
+ * parsed, you are called back to do something with the data. The
+ * functions in this table indicate the various events for which
+ * you will be called back. Each callback accepts a "context"
+ * pointer, this is the void * that was passed into yajl_alloc,
+ * which the client code may use to pass around context.
+ *
+ * All callbacks return an integer. If non-zero, the parse will
+ * continue. If zero, the parse will be canceled and
+ * yajl_status_client_canceled will be returned from the parse.
+ *
+ * \attention {
+ * A note about the handling of numbers:
+ *
+ * yajl will only convert numbers that can be represented in a
+ * double or a 64 bit (long long) int. All other numbers will
+ * be passed to the client in string form using the yajl_number
+ * callback. Furthermore, if yajl_number is not NULL, it will
+ * always be used to return numbers, that is yajl_integer and
+ * yajl_double will be ignored. If yajl_number is NULL but one
+ * of yajl_integer or yajl_double are defined, parsing of a
+ * number larger than is representable in a double or 64 bit
+ * integer will result in a parse error.
+ * }
+ */
+ typedef struct {
+ int (* yajl_null)(void * ctx);
+ int (* yajl_boolean)(void * ctx, int boolVal);
+ int (* yajl_integer)(void * ctx, long long integerVal);
+ int (* yajl_double)(void * ctx, double doubleVal);
+ /** A callback which passes the string representation of the number
+ * back to the client. Will be used for all numbers when present */
+ int (* yajl_number)(void * ctx, const char * numberVal,
+ size_t numberLen);
+
+ /** strings are returned as pointers into the JSON text when
+ * possible; as a result, they are _not_ null terminated, so
+ * always honour stringLen */
+ int (* yajl_string)(void * ctx, const unsigned char * stringVal,
+ size_t stringLen);
+
+ int (* yajl_start_map)(void * ctx);
+ int (* yajl_map_key)(void * ctx, const unsigned char * key,
+ size_t stringLen);
+ int (* yajl_end_map)(void * ctx);
+
+ int (* yajl_start_array)(void * ctx);
+ int (* yajl_end_array)(void * ctx);
+ } yajl_callbacks;
+
+ /** allocate a parser handle
+ * \param callbacks a yajl callbacks structure specifying the
+ * functions to call when different JSON entities
+ * are encountered in the input text. May be NULL,
+ * which is only useful for validation.
+ * \param afs memory allocation functions, may be NULL to use
+ * C runtime library routines (malloc and friends)
+ * \param ctx a context pointer that will be passed to callbacks.
+ */
+ YAJL_API yajl_handle yajl_alloc(const yajl_callbacks * callbacks,
+ yajl_alloc_funcs * afs,
+ void * ctx);
+
+
+ /** configuration parameters for the parser, these may be passed to
+ * yajl_config() along with option specific argument(s). In general,
+ * all configuration parameters default to *off*. */
+ typedef enum {
+ /** Ignore javascript style comments present in
+ * JSON input. Non-standard, but rather fun
+ * arguments: toggled off with integer zero, on otherwise.
+ *
+ * example:
+ * yajl_config(h, yajl_allow_comments, 1); // turn comment support on
+ */
+ yajl_allow_comments = 0x01,
+ /**
+ * By default the parser verifies that all strings in JSON input are
+ * valid UTF8 and emits a parse error if this is not so. Setting this
+ * option turns that check off. Validation makes parsing slightly
+ * more expensive (~7% depending on processor and compiler in use)
+ *
+ * example:
+ * yajl_config(h, yajl_dont_validate_strings, 1); // disable utf8 checking
+ */
+ yajl_dont_validate_strings = 0x02,
+ /**
+ * By default, upon calls to yajl_complete_parse(), yajl will
+ * ensure the entire input text was consumed and will raise an error
+ * otherwise. Enabling this flag will cause yajl to disable this
+ * check. This can be useful when parsing json out of a stream that
+ * contains more than a single JSON document.
+ */
+ yajl_allow_trailing_garbage = 0x04,
+ /**
+ * Allow multiple values to be parsed by a single handle. The
+ * entire text must be valid JSON, and values can be separated
+ * by any kind of whitespace. This flag will change the
+ * behavior of the parser, and cause it to continue parsing after
+ * a value is parsed, rather than transitioning into a
+ * complete state. This option can be useful when parsing multiple
+ * values from an input stream.
+ */
+ yajl_allow_multiple_values = 0x08,
+ /**
+ * When yajl_complete_parse() is called the parser will
+ * check that the top level value was completely consumed. I.E.,
+ * if called whilst in the middle of parsing a value
+ * yajl will enter an error state (premature EOF). Setting this
+ * flag suppresses that check and the corresponding error.
+ */
+ yajl_allow_partial_values = 0x10
+ } yajl_option;
+
+ /** allow the modification of parser options subsequent to handle
+ * allocation (via yajl_alloc)
+ * \returns zero in case of errors, non-zero otherwise
+ */
+ YAJL_API int yajl_config(yajl_handle h, yajl_option opt, ...);
+
+ /** free a parser handle */
+ YAJL_API void yajl_free(yajl_handle handle);
+
+ /** Parse some json!
+ * \param hand - a handle to the json parser allocated with yajl_alloc
+ * \param jsonText - a pointer to the UTF8 json text to be parsed
+ * \param jsonTextLength - the length, in bytes, of input text
+ */
+ YAJL_API yajl_status yajl_parse(yajl_handle hand,
+ const unsigned char * jsonText,
+ size_t jsonTextLength);
+
+ /** Parse any remaining buffered json.
+ * Since yajl is a stream-based parser, without an explicit end of
+ * input, yajl sometimes can't decide if content at the end of the
+ * stream is valid or not. For example, if "1" has been fed in,
+ * yajl can't know whether another digit is next or some character
+ * that would terminate the integer token.
+ *
+ * \param hand - a handle to the json parser allocated with yajl_alloc
+ */
+ YAJL_API yajl_status yajl_complete_parse(yajl_handle hand);
+
+ /** get an error string describing the state of the
+ * parse.
+ *
+ * If verbose is non-zero, the message will include the JSON
+ * text where the error occurred, along with an arrow pointing to
+ * the specific char.
+ *
+ * \returns A dynamically allocated string will be returned which should
+ * be freed with yajl_free_error
+ */
+ YAJL_API unsigned char * yajl_get_error(yajl_handle hand, int verbose,
+ const unsigned char * jsonText,
+ size_t jsonTextLength);
+
+ /**
+ * get the amount of data consumed from the last chunk passed to YAJL.
+ *
+ * In the case of a successful parse this can help you understand if
+ * the entire buffer was consumed (which will allow you to handle
+ * "junk at end of input").
+ *
+ * In the event an error is encountered during parsing, this function
+ * affords the client a way to get the offset into the most recent
+ * chunk where the error occurred. 0 will be returned if no error
+ * was encountered.
+ */
+ YAJL_API size_t yajl_get_bytes_consumed(yajl_handle hand);
+
+ /** free an error returned from yajl_get_error */
+ YAJL_API void yajl_free_error(yajl_handle hand, unsigned char * str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_tree.h b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h
new file mode 100644
index 000000000..8b377f636
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2010-2011 Florian Forster <ff at octo.it>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_tree.h
+ *
+ * Parses JSON data and returns the data in tree form.
+ *
+ * \author Florian Forster
+ * \date August 2010
+ *
+ * This interface makes quick parsing and extraction of
+ * smallish JSON docs trivial:
+ *
+ * \include example/parse_config.c
+ */
+
+#ifndef YAJL_TREE_H
+#define YAJL_TREE_H 1
+
+#include <yajl/yajl_common.h>
+
+/** Kinds of JSON value that a yajl_val_s node can hold. */
+typedef enum {
+    yajl_t_string = 1,  /* numbering starts at 1; later members count up */
+    yajl_t_number,
+    yajl_t_object,
+    yajl_t_array,
+    yajl_t_true,
+    yajl_t_false,
+    yajl_t_null,
+    /** Wildcard: not valid for yajl_val_s.type, but can be used as an
+     * argument to routines like yajl_tree_get(). */
+    yajl_t_any
+} yajl_type;
+
+#define YAJL_NUMBER_INT_VALID 0x01
+#define YAJL_NUMBER_DOUBLE_VALID 0x02
+
+/** A pointer to a node in the parse tree */
+typedef struct yajl_val_s * yajl_val;
+
+/**
+ * A JSON value representation capable of holding one of the seven
+ * types above. For "string", "number", "object", and "array"
+ * additional data is available in the union. The "YAJL_IS_*"
+ * and "YAJL_GET_*" macros below allow type checking and convenient
+ * value extraction.
+ */
+struct yajl_val_s
+{
+ /** Type of the value contained. Use the "YAJL_IS_*" macros to check for a
+ * specific type. */
+ yajl_type type;
+ /** Type-specific data. You may use the "YAJL_GET_*" macros to access these
+ * members. */
+ union
+ {
+ char * string;
+ struct {
+ long long i; /**< integer value, if representable. */
+ double d; /**< double value, if representable. */
+ char *r; /**< unparsed number in string form. */
+ /** Signals whether the \em i and \em d members are
+ * valid. See \c YAJL_NUMBER_INT_VALID and
+ * \c YAJL_NUMBER_DOUBLE_VALID. */
+ unsigned int flags;
+ } number;
+ struct {
+ const char **keys; /**< Array of keys */
+ yajl_val *values; /**< Array of values. */
+ size_t len; /**< Number of key-value-pairs. */
+ } object;
+ struct {
+ yajl_val *values; /**< Array of elements. */
+ size_t len; /**< Number of elements. */
+ } array;
+ } u;
+};
+
+/**
+ * Parse a string.
+ *
+ * Parses a null-terminated string containing JSON data and returns a pointer
+ * to the top-level value (root of the parse tree).
+ *
+ * \param input Pointer to a null-terminated utf8 string containing
+ * JSON data.
+ * \param error_buffer Pointer to a buffer in which an error message will
+ * be stored if \em yajl_tree_parse fails, or
+ * \c NULL. The buffer will be initialized before
+ * parsing, so its content will be destroyed even if
+ * \em yajl_tree_parse succeeds.
+ * \param error_buffer_size Size of the memory area pointed to by
+ * \em error_buffer. If \em error_buffer is
+ * \c NULL, this argument is ignored.
+ *
+ * \returns Pointer to the top-level value or \c NULL on error. The memory
+ * pointed to must be freed using \em yajl_tree_free. In case of an error, a
+ * null terminated message describing the error in more detail is stored in
+ * \em error_buffer if it is not \c NULL.
+ */
+YAJL_API yajl_val yajl_tree_parse (const char *input,
+ char *error_buffer, size_t error_buffer_size);
+
+/**
+ * Free a parse tree returned by "yajl_tree_parse".
+ *
+ * \param v Pointer to a JSON value returned by "yajl_tree_parse". Passing NULL
+ * is valid and results in a no-op.
+ */
+YAJL_API void yajl_tree_free (yajl_val v);
+
+/**
+ * Access a nested value inside a tree.
+ *
+ * \param parent the node under which you'd like to extract values.
+ * \param path A null terminated array of strings, each the name of an object key
+ * \param type the yajl_type of the object you seek, or yajl_t_any if any will do.
+ *
+ * \returns a pointer to the found value, or NULL if we came up empty.
+ *
+ * Future Ideas: it'd be nice to move path to a string and implement support for
+ * a teeny tiny micro language here, so you can extract array elements, do things
+ * like .first and .last, even .length. Inspiration from JSONPath and css selectors?
+ * No it wouldn't be fast, but that's not what this API is about.
+ */
+YAJL_API yajl_val yajl_tree_get(yajl_val parent, const char ** path, yajl_type type);
+
+/* Various convenience macros to check the type of a `yajl_val`.
+ * Every YAJL_IS_* macro accepts NULL and then evaluates to false.
+ * Note that each macro evaluates its argument more than once. */
+#define YAJL_IS_STRING(v) (((v) != NULL) && ((v)->type == yajl_t_string))
+#define YAJL_IS_NUMBER(v) (((v) != NULL) && ((v)->type == yajl_t_number))
+/* The validity flags live in the `number` member of the union; reading
+ * `u.flags` directly does not compile because the union has no such member. */
+#define YAJL_IS_INTEGER(v) (YAJL_IS_NUMBER(v) && ((v)->u.number.flags & YAJL_NUMBER_INT_VALID))
+#define YAJL_IS_DOUBLE(v) (YAJL_IS_NUMBER(v) && ((v)->u.number.flags & YAJL_NUMBER_DOUBLE_VALID))
+#define YAJL_IS_OBJECT(v) (((v) != NULL) && ((v)->type == yajl_t_object))
+#define YAJL_IS_ARRAY(v) (((v) != NULL) && ((v)->type == yajl_t_array ))
+#define YAJL_IS_TRUE(v) (((v) != NULL) && ((v)->type == yajl_t_true ))
+#define YAJL_IS_FALSE(v) (((v) != NULL) && ((v)->type == yajl_t_false ))
+#define YAJL_IS_NULL(v) (((v) != NULL) && ((v)->type == yajl_t_null ))
+
+/** Given a yajl_val_string return a ptr to the bare string it contains,
+ * or NULL if the value is not a string. */
+#define YAJL_GET_STRING(v) (YAJL_IS_STRING(v) ? (v)->u.string : NULL)
+
+/** Get the string representation of a number. You should check type first,
+ * perhaps using YAJL_IS_NUMBER */
+#define YAJL_GET_NUMBER(v) ((v)->u.number.r)
+
+/** Get the double representation of a number. You should check type first,
+ * perhaps using YAJL_IS_DOUBLE */
+#define YAJL_GET_DOUBLE(v) ((v)->u.number.d)
+
+/** Get the 64bit (long long) integer representation of a number. You should
+ * check type first, perhaps using YAJL_IS_INTEGER */
+#define YAJL_GET_INTEGER(v) ((v)->u.number.i)
+
+/** Get a pointer to a yajl_val_object or NULL if the value is not an object. */
+#define YAJL_GET_OBJECT(v) (YAJL_IS_OBJECT(v) ? &(v)->u.object : NULL)
+
+/** Get a pointer to a yajl_val_array or NULL if the value is not an array. */
+#define YAJL_GET_ARRAY(v) (YAJL_IS_ARRAY(v) ? &(v)->u.array : NULL)
+
+#endif /* YAJL_TREE_H */
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_version.h b/xlators/cluster/nsr-server/src/yajl/yajl_version.h
new file mode 100644
index 000000000..0fba9b8fc
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_version.h
@@ -0,0 +1,23 @@
+#ifndef YAJL_VERSION_H_
+#define YAJL_VERSION_H_
+
+#include <yajl/yajl_common.h>
+
+/* Version components of the bundled yajl sources (2.0.1). */
+#define YAJL_MAJOR 2
+#define YAJL_MINOR 0
+#define YAJL_MICRO 1
+
+/* Single-integer encoding: major * 10000 + minor * 100 + micro
+ * (20001 for the values above). */
+#define YAJL_VERSION ((YAJL_MAJOR * 10000) + (YAJL_MINOR * 100) + YAJL_MICRO)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int YAJL_API yajl_version(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* YAJL_VERSION_H_ */
+
diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.c b/xlators/cluster/nsr-server/src/yajl_alloc.c
new file mode 100644
index 000000000..276315af7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_alloc.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_alloc.c
+ * default memory allocation routines for yajl which use malloc/realloc and
+ * free
+ */
+
+#include "yajl_alloc.h"
+#include <stdlib.h>
+
+/* Default allocator: forwards to malloc(); the context pointer is unused. */
+static void * yajl_internal_malloc(void *ctx, size_t sz)
+{
+    void * block;
+
+    (void) ctx;
+    block = malloc(sz);
+    return block;
+}
+
+/* Default resizer: forwards to realloc(); the context pointer is unused. */
+static void * yajl_internal_realloc(void *ctx, void * previous,
+                                    size_t sz)
+{
+    void * resized;
+
+    (void) ctx;
+    resized = realloc(previous, sz);
+    return resized;
+}
+
+/* Default deallocator: forwards to free(); the context pointer is unused. */
+static void yajl_internal_free(void *ctx, void * ptr)
+{
+    (void) ctx;
+    free(ptr);
+    return;
+}
+
+/*
+ * Fill *yaf with the malloc/realloc/free wrappers defined in this file
+ * and a NULL context pointer.
+ */
+void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf)
+{
+    /* assignments follow the member order of yajl_alloc_funcs */
+    yaf->malloc  = yajl_internal_malloc;
+    yaf->realloc = yajl_internal_realloc;
+    yaf->free    = yajl_internal_free;
+    yaf->ctx     = NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.h b/xlators/cluster/nsr-server/src/yajl_alloc.h
new file mode 100644
index 000000000..a8a9e45e6
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_alloc.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_alloc.h
+ * default memory allocation routines for yajl which use malloc/realloc and
+ * free
+ */
+
+#ifndef __YAJL_ALLOC_H__
+#define __YAJL_ALLOC_H__
+
+#include "yajl/yajl_common.h"
+
+/*
+ * Convenience wrappers that call the hooks stored in the yajl_alloc_funcs
+ * pointed to by `afs`, passing afs->ctx as the first argument. Each macro
+ * expands `afs` twice, so pass an expression without side effects.
+ */
+#define YA_MALLOC(afs, sz) (afs)->malloc((afs)->ctx, (sz))
+#define YA_FREE(afs, ptr) (afs)->free((afs)->ctx, (ptr))
+#define YA_REALLOC(afs, ptr, sz) (afs)->realloc((afs)->ctx, (ptr), (sz))
+
+/* Fill *yaf with the default malloc/realloc/free based hooks and a NULL ctx. */
+void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_buf.c b/xlators/cluster/nsr-server/src/yajl_buf.c
new file mode 100644
index 000000000..0d249d364
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_buf.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_buf.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Size in bytes of the first allocation made for a buffer's data. */
+#define YAJL_BUF_INIT_SIZE 2048
+
+/* Growable byte buffer behind the opaque `yajl_buf` handle. */
+struct yajl_buf_t {
+ /* number of bytes currently allocated for `data` */
+ size_t len;
+ /* number of payload bytes stored in `data` (terminating 0 not counted) */
+ size_t used;
+ /* storage; stays NULL until the first append triggers an allocation */
+ unsigned char * data;
+ /* allocation hooks used for `data` and for the struct itself */
+ yajl_alloc_funcs * alloc;
+};
+
+/*
+ * Make sure `buf` has room for `want` more payload bytes plus the
+ * terminating 0 byte that yajl_buf_append() writes after the payload.
+ *
+ * The data area is allocated lazily on the first call and is then doubled
+ * until the request fits. This function has no way to report failure to
+ * its callers, so an allocation failure or a capacity that can no longer
+ * be doubled without wrapping size_t terminates the process via abort()
+ * instead of continuing with a NULL or undersized buffer.
+ */
+static
+void yajl_buf_ensure_available(yajl_buf buf, size_t want)
+{
+    size_t need;
+
+    assert(buf != NULL);
+
+    /* first call */
+    if (buf->data == NULL) {
+        buf->len = YAJL_BUF_INIT_SIZE;
+        buf->data = (unsigned char *) YA_MALLOC(buf->alloc, buf->len);
+        if (buf->data == NULL) abort();
+        buf->data[0] = 0;
+    }
+
+    need = buf->len;
+
+    /* ">=" rather than ">" keeps one spare byte for the terminating 0 */
+    while (want >= (need - buf->used)) {
+        /* doubling again would wrap around size_t */
+        if (need > ((size_t) -1) / 2) abort();
+        need <<= 1;
+    }
+
+    if (need != buf->len) {
+        /* keep the old pointer until the reallocation is known to be good */
+        unsigned char * grown =
+            (unsigned char *) YA_REALLOC(buf->alloc, buf->data, need);
+        if (grown == NULL) abort();
+        buf->data = grown;
+        buf->len = need;
+    }
+}
+
+/*
+ * Create an empty buffer whose memory is managed through `alloc`.
+ * The data area itself is allocated lazily by the first append.
+ *
+ * Returns the new buffer, or NULL when the allocation hook fails.
+ */
+yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc)
+{
+    yajl_buf b = YA_MALLOC(alloc, sizeof(struct yajl_buf_t));
+
+    /* do not memset through a failed allocation */
+    if (b == NULL) return NULL;
+
+    memset((void *) b, 0, sizeof(struct yajl_buf_t));
+    b->alloc = alloc;
+    return b;
+}
+
+/*
+ * Release the data area (if one was ever allocated) and then the buffer
+ * struct itself, both through the buffer's own allocation hooks.
+ * `buf` must not be NULL.
+ */
+void yajl_buf_free(yajl_buf buf)
+{
+    assert(buf != NULL);
+
+    if (buf->data != NULL) {
+        YA_FREE(buf->alloc, buf->data);
+    }
+
+    YA_FREE(buf->alloc, buf);
+}
+
+/*
+ * Append `len` bytes from `data` to the end of the buffer and re-terminate
+ * the contents with a 0 byte. Capacity is reserved even when `len` is 0,
+ * so the data area exists after the first call. `data` may only be NULL
+ * when `len` is 0.
+ */
+void yajl_buf_append(yajl_buf buf, const void * data, size_t len)
+{
+    unsigned char * tail;
+
+    yajl_buf_ensure_available(buf, len);
+
+    if (len == 0) {
+        return;
+    }
+
+    assert(data != NULL);
+
+    tail = buf->data + buf->used;
+    memcpy(tail, data, len);
+    buf->used += len;
+    buf->data[buf->used] = 0;
+}
+
+/*
+ * Drop the buffer's contents without releasing its storage. When a data
+ * area exists, its first byte is reset to 0 so it reads as an empty string.
+ */
+void yajl_buf_clear(yajl_buf buf)
+{
+    buf->used = 0;
+
+    if (buf->data != NULL) {
+        buf->data[0] = 0;
+    }
+}
+
+/*
+ * Return a read-only pointer to the first stored byte. The result is NULL
+ * while no data area has been allocated yet.
+ */
+const unsigned char * yajl_buf_data(yajl_buf buf)
+{
+    const unsigned char * contents = buf->data;
+
+    return contents;
+}
+
+/* Return the number of payload bytes stored (terminating 0 not counted). */
+size_t yajl_buf_len(yajl_buf buf)
+{
+    size_t stored = buf->used;
+
+    return stored;
+}
+
+/*
+ * Shorten the stored contents to `len` bytes. `len` must not exceed the
+ * current length. Only the length is updated; no 0 terminator is written
+ * at the new end.
+ */
+void
+yajl_buf_truncate(yajl_buf buf, size_t len)
+{
+    assert(buf->used >= len);
+
+    buf->used = len;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_buf.h b/xlators/cluster/nsr-server/src/yajl_buf.h
new file mode 100644
index 000000000..94929a519
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_buf.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_BUF_H__
+#define __YAJL_BUF_H__
+
+#include "yajl/yajl_common.h"
+#include "yajl_alloc.h"
+
+/*
+ * Implementation/performance notes. If this were moved to a header
+ * only implementation using #define's where possible we might be
+ * able to squeeze a little performance out of the guy by killing function
+ * call overhead. YMMV.
+ */
+
+/**
+ * yajl_buf is an opaque handle to a byte buffer that grows exponentially
+ * (its capacity is doubled whenever more room is needed). After every
+ * non-empty append the stored bytes are followed by a 0 byte.
+ */
+typedef struct yajl_buf_t * yajl_buf;
+
+/* allocate a new, empty buffer that uses `alloc` for all of its memory;
+ * the caller releases it with yajl_buf_free() */
+yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc);
+
+/* free the buffer and its contents; `buf` must not be NULL */
+void yajl_buf_free(yajl_buf buf);
+
+/* append `len` bytes starting at `data` to the end of the buffer */
+void yajl_buf_append(yajl_buf buf, const void * data, size_t len);
+
+/* empty the buffer; its storage is kept for reuse */
+void yajl_buf_clear(yajl_buf buf);
+
+/* get a pointer to the beginning of the buffer (NULL if nothing was ever
+ * appended) */
+const unsigned char * yajl_buf_data(yajl_buf buf);
+
+/* get the number of bytes stored in the buffer */
+size_t yajl_buf_len(yajl_buf buf);
+
+/* truncate the buffer to `len` bytes; `len` must not exceed the current
+ * length */
+void yajl_buf_truncate(yajl_buf buf, size_t len);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_bytestack.h b/xlators/cluster/nsr-server/src/yajl_bytestack.h
new file mode 100644
index 000000000..1fc50c470
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_bytestack.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * A header only implementation of a simple stack of bytes, used in YAJL
+ * to maintain parse state.
+ */
+
+#ifndef __YAJL_BYTESTACK_H__
+#define __YAJL_BYTESTACK_H__
+
+#include "yajl/yajl_common.h"
+
+/* Number of bytes by which yajl_bs_push() grows a full stack. */
+#define YAJL_BS_INC 128
+
+/* A stack of bytes; manipulated only through the yajl_bs_* macros below. */
+typedef struct yajl_bytestack_t
+{
+ /* storage for the stacked bytes; NULL until the first push */
+ unsigned char * stack;
+ /* number of bytes allocated for `stack` */
+ size_t size;
+ /* number of bytes currently on the stack */
+ size_t used;
+ /* allocation hooks used to grow and release `stack` */
+ yajl_alloc_funcs * yaf;
+} yajl_bytestack;
+
+/* initialize a bytestack: no storage, zero size and usage, and `_yaf` as
+ * the allocation hooks used by later pushes. `obs` is the struct itself
+ * (an lvalue), not a pointer to it. */
+#define yajl_bs_init(obs, _yaf) { \
+ (obs).stack = NULL; \
+ (obs).size = 0; \
+ (obs).used = 0; \
+ (obs).yaf = (_yaf); \
+ } \
+
+
+/* free a bytestack's storage, if any was allocated. The struct fields are
+ * left untouched, so the stack must be re-initialized before reuse.
+ * Expands to a bare `if` statement that already ends in a semicolon. */
+#define yajl_bs_free(obs) \
+ if ((obs).stack) (obs).yaf->free((obs).yaf->ctx, (obs).stack);
+
+/* evaluates to the byte on top of the stack; asserts that the stack is
+ * not empty. Uses assert(), which this header does not include itself
+ * (presumably supplied by the including .c file -- verify). */
+#define yajl_bs_current(obs) \
+ (assert((obs).used > 0), (obs).stack[(obs).used - 1])
+
+/* push `byte` onto the stack. When the stack is full, its storage is
+ * first grown by YAJL_BS_INC bytes through the realloc hook.
+ * NOTE(review): the realloc result is stored without a NULL check, so a
+ * failed reallocation is followed by a write through a NULL pointer. */
+#define yajl_bs_push(obs, byte) { \
+ if (((obs).size - (obs).used) == 0) { \
+ (obs).size += YAJL_BS_INC; \
+ (obs).stack = (obs).yaf->realloc((obs).yaf->ctx,\
+ (void *) (obs).stack, (obs).size);\
+ } \
+ (obs).stack[((obs).used)++] = (byte); \
+}
+
+/* removes the top item of the stack, returns nothing. There is no check
+ * for an empty stack: popping with used == 0 wraps `used` around. */
+#define yajl_bs_pop(obs) { ((obs).used)--; }
+
+/* overwrites the top item of the stack with `byte`. There is no check for
+ * an empty stack. The expansion already ends in a semicolon. */
+#define yajl_bs_set(obs, byte) \
+ (obs).stack[((obs).used) - 1] = (byte);
+
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_encode.c b/xlators/cluster/nsr-server/src/yajl_encode.c
new file mode 100644
index 000000000..9dc9a3e81
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_encode.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_encode.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/*
+ * Write the two upper-case hexadecimal digits of byte `c` to hexBuf[0]
+ * (high nibble) and hexBuf[1] (low nibble). No terminator is written.
+ */
+static void CharToHex(unsigned char c, char * hexBuf)
+{
+    static const char digits[] = "0123456789ABCDEF";
+    const unsigned int high = c >> 4;
+    const unsigned int low = c & 0x0F;
+
+    hexBuf[0] = digits[high];
+    hexBuf[1] = digits[low];
+}
+
+/*
+ * Emit `str` (`len` bytes) through `print` with JSON string escaping.
+ *
+ * Bytes that need no escaping are forwarded in runs. Before every escape
+ * the pending run is flushed (even when it is empty), and the final run
+ * is always flushed at the end. Control bytes below 32 that have no short
+ * escape are written as \u00XX. '/' is escaped only when `escape_solidus`
+ * is non-zero. The surrounding quotes are not written here.
+ */
+void
+yajl_string_encode(const yajl_print_t print,
+                   void * ctx,
+                   const unsigned char * str,
+                   size_t len,
+                   int escape_solidus)
+{
+    /* template for \u00XX; the XX digits are filled in per byte */
+    char hexBuf[7] = { '\\', 'u', '0', '0', 0, 0, 0 };
+    size_t beg = 0;     /* start of the pending unescaped run */
+    size_t end;         /* byte currently being examined */
+
+    for (end = 0; end < len; end++) {
+        const unsigned char ch = str[end];
+        const char * escaped = NULL;
+
+        switch (ch) {
+            case '\r': escaped = "\\r"; break;
+            case '\n': escaped = "\\n"; break;
+            case '\\': escaped = "\\\\"; break;
+            /* it is not required to escape a solidus in JSON:
+             * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
+             * specifically, this production from the grammar:
+             * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
+             */
+            case '/':
+                if (escape_solidus) {
+                    escaped = "\\/";
+                }
+                break;
+            case '"': escaped = "\\\""; break;
+            case '\f': escaped = "\\f"; break;
+            case '\b': escaped = "\\b"; break;
+            case '\t': escaped = "\\t"; break;
+            default:
+                if (ch < 32) {
+                    CharToHex(ch, hexBuf + 4);
+                    escaped = hexBuf;
+                }
+                break;
+        }
+
+        if (escaped == NULL) {
+            continue;
+        }
+
+        /* flush what came before, then the escape sequence itself */
+        print(ctx, (const char *) (str + beg), end - beg);
+        print(ctx, escaped, (unsigned int)strlen(escaped));
+        beg = end + 1;
+    }
+
+    print(ctx, (const char *) (str + beg), end - beg);
+}
+
+/*
+ * Shift the values of the four hexadecimal digits at `hex` into *val,
+ * most significant digit first. *val is not cleared beforehand; each
+ * digit shifts the existing value left by four bits. Upper and lower
+ * case letters are accepted; a non-hex character trips the assert.
+ */
+static void hexToDigit(unsigned int * val, const unsigned char * hex)
+{
+    const unsigned char * cursor;
+
+    for (cursor = hex; cursor != hex + 4; cursor++) {
+        unsigned char nibble = *cursor;
+
+        /* fold letters to upper case and close the gap between '9' and 'A' */
+        if (nibble >= 'A') {
+            nibble = (nibble & ~0x20) - 7;
+        }
+        nibble -= '0';
+        assert(!(nibble & 0xF0));
+        *val = (*val << 4) | nibble;
+    }
+}
+
+/*
+ * Encode `codepoint` as a 0-terminated UTF-8 sequence of one to four
+ * bytes in utf8Buf, which must have room for at least five bytes.
+ * Values of 0x200000 and above are replaced by "?".
+ */
+static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf)
+{
+    if (codepoint < 0x80) {
+        utf8Buf[0] = (char) codepoint;
+        utf8Buf[1] = 0;
+        return;
+    }
+
+    if (codepoint < 0x0800) {
+        utf8Buf[0] = (char) ((codepoint >> 6) | 0xC0);
+        utf8Buf[1] = (char) ((codepoint & 0x3F) | 0x80);
+        utf8Buf[2] = 0;
+        return;
+    }
+
+    if (codepoint < 0x10000) {
+        utf8Buf[0] = (char) ((codepoint >> 12) | 0xE0);
+        utf8Buf[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
+        utf8Buf[2] = (char) ((codepoint & 0x3F) | 0x80);
+        utf8Buf[3] = 0;
+        return;
+    }
+
+    if (codepoint < 0x200000) {
+        utf8Buf[0] = (char) ((codepoint >> 18) | 0xF0);
+        utf8Buf[1] = (char) (((codepoint >> 12) & 0x3F) | 0x80);
+        utf8Buf[2] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
+        utf8Buf[3] = (char) ((codepoint & 0x3F) | 0x80);
+        utf8Buf[4] = 0;
+        return;
+    }
+
+    /* too large to encode */
+    utf8Buf[0] = '?';
+    utf8Buf[1] = 0;
+}
+
+/*
+ * Decode the JSON string body `str` (`len` bytes, without the surrounding
+ * quotes) into `buf`, replacing escape sequences by the bytes they stand
+ * for. \uXXXX escapes are written as UTF-8, and a high surrogate followed
+ * by a second \uXXXX escape is combined into a single code point.
+ *
+ * A high surrogate that is not followed by a complete \uXXXX escape
+ * inside the string is replaced by "?" and decoding resumes with the
+ * byte right after the escape, so the following text is neither dropped
+ * nor misread. The lookahead for the second escape never reads past
+ * `len`.
+ *
+ * The input is expected to contain only well-formed escapes; anything
+ * else trips the assert in the default branch.
+ */
+void yajl_string_decode(yajl_buf buf, const unsigned char * str,
+                        size_t len)
+{
+    size_t beg = 0;
+    size_t end = 0;
+
+    while (end < len) {
+        if (str[end] == '\\') {
+            char utf8Buf[5];
+            const char * unescaped = "?";
+            yajl_buf_append(buf, str + beg, end - beg);
+            switch (str[++end]) {
+                case 'r': unescaped = "\r"; break;
+                case 'n': unescaped = "\n"; break;
+                case '\\': unescaped = "\\"; break;
+                case '/': unescaped = "/"; break;
+                case '"': unescaped = "\""; break;
+                case 'f': unescaped = "\f"; break;
+                case 'b': unescaped = "\b"; break;
+                case 't': unescaped = "\t"; break;
+                case 'u': {
+                    unsigned int codepoint = 0;
+                    hexToDigit(&codepoint, str + ++end);
+                    /* leave `end` on the last hex digit of this escape */
+                    end+=3;
+                    /* check if this is a surrogate */
+                    if ((codepoint & 0xFC00) == 0xD800) {
+                        /* the second half must be a complete "\uXXXX"
+                         * (6 bytes) that still lies inside the string */
+                        if (end + 6 < len &&
+                            str[end + 1] == '\\' && str[end + 2] == 'u') {
+                            unsigned int surrogate = 0;
+                            hexToDigit(&surrogate, str + end + 3);
+                            codepoint =
+                                (((codepoint & 0x3F) << 10) |
+                                 ((((codepoint >> 6) & 0xF) + 1) << 16) |
+                                 (surrogate & 0x3FF));
+                            /* now on the last hex digit of the pair */
+                            end += 6;
+                        } else {
+                            /* unpaired: `end` stays on the last hex digit
+                             * so the common tail resumes right after it */
+                            unescaped = "?";
+                            break;
+                        }
+                    }
+
+                    Utf32toUtf8(codepoint, utf8Buf);
+                    unescaped = utf8Buf;
+
+                    /* \u0000 yields an empty C string; append the 0 byte
+                     * explicitly because strlen() would drop it */
+                    if (codepoint == 0) {
+                        yajl_buf_append(buf, unescaped, 1);
+                        beg = ++end;
+                        continue;
+                    }
+
+                    break;
+                }
+                default:
+                    assert("this should never happen" == NULL);
+            }
+            yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped));
+            beg = ++end;
+        } else {
+            end++;
+        }
+    }
+    yajl_buf_append(buf, str + beg, end - beg);
+}
+
+/* step to the next byte; fail the validation if the input is exhausted */
+#define ADV_PTR s++; if (!(len--)) return 0;
+
+/*
+ * Structural UTF-8 check of the `len` bytes at `s`.
+ *
+ * Returns 1 when every lead byte is followed by the number of
+ * continuation bytes (10xxxxxx) that it announces, 0 otherwise. An empty
+ * input is valid even when `s` is NULL; a NULL `s` with a non-zero length
+ * is invalid. Only the bit patterns are inspected: overlong forms and
+ * surrogate code points are not rejected.
+ */
+int yajl_string_validate_utf8(const unsigned char * s, size_t len)
+{
+    if (!len) return 1;
+    if (!s) return 0;
+
+    for (; len--; s++) {
+        unsigned int trailing;  /* continuation bytes announced by *s */
+
+        if (*s <= 0x7f) {
+            trailing = 0;       /* single byte */
+        } else if ((*s >> 5) == 0x6) {
+            trailing = 1;       /* two byte */
+        } else if ((*s >> 4) == 0x0e) {
+            trailing = 2;       /* three byte */
+        } else if ((*s >> 3) == 0x1e) {
+            trailing = 3;       /* four byte */
+        } else {
+            return 0;
+        }
+
+        while (trailing--) {
+            ADV_PTR;
+            if ((*s >> 6) != 0x2) return 0;
+        }
+    }
+
+    return 1;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_encode.h b/xlators/cluster/nsr-server/src/yajl_encode.h
new file mode 100644
index 000000000..af1e8bbde
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_encode.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_ENCODE_H__
+#define __YAJL_ENCODE_H__
+
+#include "yajl_buf.h"
+#include "yajl/yajl_gen.h"
+
+/*
+ * Write `length` bytes of `str` through `printer` (called with `ctx`),
+ * escaping them for use inside a JSON string. '/' is escaped only when
+ * `escape_solidus` is non-zero. The surrounding quotes are not emitted.
+ */
+void yajl_string_encode(const yajl_print_t printer,
+ void * ctx,
+ const unsigned char * str,
+ size_t length,
+ int escape_solidus);
+
+/*
+ * Append the decoded form of the JSON string body `str` (`length` bytes)
+ * to `buf`, replacing escape sequences by the bytes they represent.
+ */
+void yajl_string_decode(yajl_buf buf, const unsigned char * str,
+ size_t length);
+
+/*
+ * Return 1 when the `len` bytes at `s` are structurally valid UTF-8,
+ * 0 otherwise. An empty input is considered valid.
+ */
+int yajl_string_validate_utf8(const unsigned char * s, size_t len);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_gen.c b/xlators/cluster/nsr-server/src/yajl_gen.c
new file mode 100644
index 000000000..73763a9e0
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_gen.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_gen.h"
+#include "yajl_buf.h"
+#include "yajl_encode.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include <stdarg.h>
+
+/*
+ * Per-nesting-level generator state. It decides which separator is
+ * written before the next value and which values are legal at that point.
+ */
+typedef enum {
+ /* nothing generated yet at the top level */
+ yajl_gen_start,
+ /* a map was just opened; the next item must be its first key */
+ yajl_gen_map_start,
+ /* inside a map, a key is expected next (a "," is written before it) */
+ yajl_gen_map_key,
+ /* inside a map, a value is expected next (a ":" is written before it) */
+ yajl_gen_map_val,
+ /* an array was just opened; no element written yet */
+ yajl_gen_array_start,
+ /* inside an array with at least one element (a "," precedes the next) */
+ yajl_gen_in_array,
+ /* the top-level value has been written */
+ yajl_gen_complete,
+ /* error state; generator calls return yajl_gen_in_error_state */
+ yajl_gen_error
+} yajl_gen_state;
+
+/* State of one JSON generator; `yajl_gen` is a pointer to this struct. */
+struct yajl_gen_t
+{
+ /* bitwise OR of the yajl_gen_option flags that are switched on */
+ unsigned int flags;
+ /* current nesting level; index of the active entry in `state` */
+ unsigned int depth;
+ /* text written once per nesting level when beautifying */
+ const char * indentString;
+ /* one state per nesting level, up to YAJL_MAX_DEPTH levels */
+ yajl_gen_state state[YAJL_MAX_DEPTH];
+ /* output sink; yajl_buf_append unless a print callback was configured */
+ yajl_print_t print;
+ /* first argument passed to `print`; the internal yajl_buf by default */
+ void * ctx; /* yajl_buf */
+ /* memory allocation routines */
+ yajl_alloc_funcs alloc;
+};
+
+/*
+ * Set one generator option. The variadic arguments depend on `opt`:
+ *
+ *   yajl_gen_beautify, yajl_gen_validate_utf8, yajl_gen_escape_solidus
+ *       int: non-zero switches the flag on, zero switches it off.
+ *   yajl_gen_indent_string
+ *       const char *: text written once per nesting level when
+ *       beautifying. It may contain whitespace only and is not copied, so
+ *       it must outlive the generator. A NULL or invalid string is
+ *       rejected and the previous indent string stays in effect.
+ *   yajl_gen_print_callback
+ *       yajl_print_t followed by void *: output callback and the context
+ *       passed to it. The internal buffer is released when it is being
+ *       replaced; a previously installed caller-owned context is never
+ *       freed here.
+ *
+ * Returns 1 on success, 0 for an unknown option or a rejected value.
+ */
+int
+yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...)
+{
+    int rv = 1;
+    va_list ap;
+    va_start(ap, opt);
+
+    switch(opt) {
+        case yajl_gen_beautify:
+        case yajl_gen_validate_utf8:
+        case yajl_gen_escape_solidus:
+            if (va_arg(ap, int)) g->flags |= opt;
+            else g->flags &= ~opt;
+            break;
+        case yajl_gen_indent_string: {
+            const char *indent = va_arg(ap, const char *);
+            const char *c;
+
+            if (indent == NULL) {
+                rv = 0;
+                break;
+            }
+            for (c = indent; *c; c++) {
+                if (*c != '\n'
+                    && *c != '\v'
+                    && *c != '\f'
+                    && *c != '\t'
+                    && *c != '\r'
+                    && *c != ' ')
+                {
+                    rv = 0;
+                    break;
+                }
+            }
+            /* only install a string that passed validation, so that
+             * indentString can never end up NULL */
+            if (rv) g->indentString = indent;
+            break;
+        }
+        case yajl_gen_print_callback:
+            /* ctx is only ours to free while the default sink is active */
+            if (g->print == (yajl_print_t)&yajl_buf_append && g->ctx != NULL) {
+                yajl_buf_free((yajl_buf)g->ctx);
+            }
+            g->print = va_arg(ap, const yajl_print_t);
+            g->ctx = va_arg(ap, void *);
+            break;
+        default:
+            rv = 0;
+    }
+
+    va_end(ap);
+
+    return rv;
+}
+
+
+
+/*
+ * Create a generator that writes into an internal buffer.
+ *
+ * `afs` supplies the allocation hooks; pass NULL to use the defaults.
+ * When hooks are given, malloc, realloc and free must all be set.
+ *
+ * Returns the new generator, or NULL when the hooks are incomplete or an
+ * allocation fails. Release the result with yajl_gen_free().
+ */
+yajl_gen
+yajl_gen_alloc(const yajl_alloc_funcs * afs)
+{
+    yajl_gen g = NULL;
+    yajl_alloc_funcs afsBuffer;
+
+    /* first order of business is to set up memory allocation routines */
+    if (afs != NULL) {
+        if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL)
+        {
+            return NULL;
+        }
+    } else {
+        yajl_set_default_alloc_funcs(&afsBuffer);
+        afs = &afsBuffer;
+    }
+
+    g = (yajl_gen) YA_MALLOC(afs, sizeof(struct yajl_gen_t));
+    if (!g) return NULL;
+
+    memset((void *) g, 0, sizeof(struct yajl_gen_t));
+    /* copy in pointers to allocation routines */
+    memcpy((void *) &(g->alloc), (void *) afs, sizeof(yajl_alloc_funcs));
+
+    g->print = (yajl_print_t)&yajl_buf_append;
+    g->ctx = yajl_buf_alloc(&(g->alloc));
+    if (g->ctx == NULL) {
+        /* no output buffer: hand back nothing rather than a generator
+         * that would fail on its first write */
+        YA_FREE(afs, g);
+        return NULL;
+    }
+    g->indentString = " ";
+
+    return g;
+}
+
+/*
+ * Destroy a generator. The output context is released only while the
+ * default buffer sink is installed; a context that came with a custom
+ * print callback is left alone.
+ */
+void
+yajl_gen_free(yajl_gen g)
+{
+    const int uses_internal_buf = (g->print == (yajl_print_t)&yajl_buf_append);
+
+    if (uses_internal_buf) {
+        yajl_buf_free((yajl_buf)g->ctx);
+    }
+
+    YA_FREE(&(g->alloc), g);
+}
+
+/*
+ * Write the separator that must precede the next item: "," (plus a
+ * newline when beautifying) before a non-first map key or array element,
+ * ":" (plus a space when beautifying) before a map value, nothing
+ * otherwise. Expects a generator named `g` in scope.
+ */
+#define INSERT_SEP \
+ if (g->state[g->depth] == yajl_gen_map_key || \
+ g->state[g->depth] == yajl_gen_in_array) { \
+ g->print(g->ctx, ",", 1); \
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); \
+ } else if (g->state[g->depth] == yajl_gen_map_val) { \
+ g->print(g->ctx, ":", 1); \
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, " ", 1); \
+ }
+
+/*
+ * When beautifying, write the indent string once per nesting level,
+ * except in front of a map value (which stays on the line of its key).
+ * Expects a generator named `g` in scope and a non-NULL g->indentString.
+ */
+#define INSERT_WHITESPACE \
+ if ((g->flags & yajl_gen_beautify)) { \
+ if (g->state[g->depth] != yajl_gen_map_val) { \
+ unsigned int _i; \
+ for (_i=0;_i<g->depth;_i++) \
+ g->print(g->ctx, \
+ g->indentString, \
+ (unsigned int)strlen(g->indentString)); \
+ } \
+ }
+
+/*
+ * Return yajl_gen_keys_must_be_strings from the enclosing function when
+ * the next item has to be a map key. Used by every generator call that
+ * emits something other than a string. Expects `g` in scope.
+ */
+#define ENSURE_NOT_KEY \
+ if (g->state[g->depth] == yajl_gen_map_key || \
+ g->state[g->depth] == yajl_gen_map_start) { \
+ return yajl_gen_keys_must_be_strings; \
+ } \
+
+/* check that we're not complete, or in error state. in a valid state
+ * to be generating. Otherwise the enclosing function returns
+ * yajl_gen_in_error_state or yajl_gen_generation_complete respectively.
+ * Expects a generator named `g` in scope. */
+#define ENSURE_VALID_STATE \
+ if (g->state[g->depth] == yajl_gen_error) { \
+ return yajl_gen_in_error_state;\
+ } else if (g->state[g->depth] == yajl_gen_complete) { \
+ return yajl_gen_generation_complete; \
+ }
+
+/*
+ * Enter one more nesting level, or return yajl_max_depth_exceeded from
+ * the enclosing function when that level would not fit in g->state.
+ * The depth is left unchanged on failure, so it always remains a valid
+ * index into g->state. Expects a generator named `g` in scope.
+ */
+#define INCREMENT_DEPTH \
+    if (g->depth + 1 >= YAJL_MAX_DEPTH) return yajl_max_depth_exceeded; \
+    else ++(g->depth);
+
+/*
+ * Leave the current nesting level. When there is no open container
+ * (depth is already 0) the enclosing function returns
+ * yajl_gen_generation_complete, which is a yajl_gen_status value, and the
+ * depth is left at 0 instead of wrapping around. Expects a generator
+ * named `g` in scope.
+ */
+#define DECREMENT_DEPTH \
+    if (g->depth == 0) return yajl_gen_generation_complete; \
+    else --(g->depth);
+
+/*
+ * Advance the state of the current nesting level after one item has been
+ * written: start -> complete, map_start/map_key -> map_val,
+ * map_val -> map_key, array_start -> in_array. Other states are left
+ * unchanged. Expects a generator named `g` in scope.
+ */
+#define APPENDED_ATOM \
+ switch (g->state[g->depth]) { \
+ case yajl_gen_start: \
+ g->state[g->depth] = yajl_gen_complete; \
+ break; \
+ case yajl_gen_map_start: \
+ case yajl_gen_map_key: \
+ g->state[g->depth] = yajl_gen_map_val; \
+ break; \
+ case yajl_gen_array_start: \
+ g->state[g->depth] = yajl_gen_in_array; \
+ break; \
+ case yajl_gen_map_val: \
+ g->state[g->depth] = yajl_gen_map_key; \
+ break; \
+ default: \
+ break; \
+ } \
+
+/*
+ * When beautifying, finish the output with a newline once the current
+ * level has reached the complete state. Expects `g` in scope.
+ */
+#define FINAL_NEWLINE \
+ if ((g->flags & yajl_gen_beautify) && g->state[g->depth] == yajl_gen_complete) \
+ g->print(g->ctx, "\n", 1);
+
+/*
+ * Emit `number` as a JSON integer. Not allowed where a map key is
+ * expected. Returns yajl_gen_status_ok, or the status produced by one of
+ * the state checks.
+ */
+yajl_gen_status
+yajl_gen_integer(yajl_gen g, long long int number)
+{
+    char text[32];
+
+    ENSURE_VALID_STATE;
+    ENSURE_NOT_KEY;
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+
+    sprintf(text, "%lld", number);
+    g->print(g->ctx, text, (unsigned int)strlen(text));
+
+    APPENDED_ATOM;
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+#ifdef WIN32
+#include <float.h>
+#define isnan _isnan
+#define isinf !_finite
+#endif
+
+/*
+ * Emit `number` as a JSON number with up to 20 significant digits.
+ * NaN and infinities are rejected with yajl_gen_invalid_number before
+ * anything is written. Not allowed where a map key is expected.
+ */
+yajl_gen_status
+yajl_gen_double(yajl_gen g, double number)
+{
+    char text[32];
+
+    ENSURE_VALID_STATE;
+    ENSURE_NOT_KEY;
+
+    if (isnan(number) || isinf(number)) {
+        return yajl_gen_invalid_number;
+    }
+
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+
+    sprintf(text, "%.20g", number);
+    g->print(g->ctx, text, (unsigned int)strlen(text));
+
+    APPENDED_ATOM;
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Emit the `l` bytes at `s` verbatim as a JSON number. The text is not
+ * checked here. Not allowed where a map key is expected.
+ */
+yajl_gen_status
+yajl_gen_number(yajl_gen g, const char * s, size_t l)
+{
+    ENSURE_VALID_STATE;
+    ENSURE_NOT_KEY;
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+
+    g->print(g->ctx, s, l);
+
+    APPENDED_ATOM;
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Emit the `len` bytes at `str` as a quoted, escaped JSON string. This is
+ * the only generator call accepted where a map key is expected.
+ *
+ * With yajl_gen_validate_utf8 set, the bytes are checked first (before
+ * any state check) and yajl_gen_invalid_string is returned for malformed
+ * input.
+ */
+yajl_gen_status
+yajl_gen_string(yajl_gen g, const unsigned char * str,
+                size_t len)
+{
+    /* XXX: This checking could be done a little faster, in the same pass
+     * as the string encoding */
+    if ((g->flags & yajl_gen_validate_utf8) &&
+        !yajl_string_validate_utf8(str, len)) {
+        return yajl_gen_invalid_string;
+    }
+
+    ENSURE_VALID_STATE;
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+
+    g->print(g->ctx, "\"", 1);
+    yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus);
+    g->print(g->ctx, "\"", 1);
+
+    APPENDED_ATOM;
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/* Emit the JSON literal null. Not allowed where a map key is expected. */
+yajl_gen_status
+yajl_gen_null(yajl_gen g)
+{
+    const char * literal = "null";
+
+    ENSURE_VALID_STATE;
+    ENSURE_NOT_KEY;
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+
+    g->print(g->ctx, literal, strlen(literal));
+
+    APPENDED_ATOM;
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Emit the JSON literal true (non-zero `boolean`) or false (zero).
+ * Not allowed where a map key is expected.
+ */
+yajl_gen_status
+yajl_gen_bool(yajl_gen g, int boolean)
+{
+    const char * literal;
+
+    if (boolean) {
+        literal = "true";
+    } else {
+        literal = "false";
+    }
+
+    ENSURE_VALID_STATE;
+    ENSURE_NOT_KEY;
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+
+    g->print(g->ctx, literal, (unsigned int)strlen(literal));
+
+    APPENDED_ATOM;
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Open a map: write any pending separator and indentation, enter a new
+ * nesting level in the map_start state and write "{" (followed by a
+ * newline when beautifying). Not allowed where a map key is expected.
+ */
+yajl_gen_status
+yajl_gen_map_open(yajl_gen g)
+{
+    ENSURE_VALID_STATE;
+    ENSURE_NOT_KEY;
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+    INCREMENT_DEPTH;
+
+    g->state[g->depth] = yajl_gen_map_start;
+    g->print(g->ctx, "{", 1);
+    if ((g->flags & yajl_gen_beautify)) {
+        g->print(g->ctx, "\n", 1);
+    }
+
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Close the current map: return to the enclosing nesting level, advance
+ * that level's state as for any completed value and write "}" (on its
+ * own indented line when beautifying).
+ */
+yajl_gen_status
+yajl_gen_map_close(yajl_gen g)
+{
+    ENSURE_VALID_STATE;
+    DECREMENT_DEPTH;
+
+    if ((g->flags & yajl_gen_beautify)) {
+        g->print(g->ctx, "\n", 1);
+    }
+
+    APPENDED_ATOM;
+    INSERT_WHITESPACE;
+    g->print(g->ctx, "}", 1);
+
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Open an array: write any pending separator and indentation, enter a
+ * new nesting level in the array_start state and write "[" (followed by
+ * a newline when beautifying). Not allowed where a map key is expected.
+ */
+yajl_gen_status
+yajl_gen_array_open(yajl_gen g)
+{
+    ENSURE_VALID_STATE;
+    ENSURE_NOT_KEY;
+    INSERT_SEP;
+    INSERT_WHITESPACE;
+    INCREMENT_DEPTH;
+
+    g->state[g->depth] = yajl_gen_array_start;
+    g->print(g->ctx, "[", 1);
+    if ((g->flags & yajl_gen_beautify)) {
+        g->print(g->ctx, "\n", 1);
+    }
+
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Close the current array: return to the enclosing nesting level,
+ * advance that level's state as for any completed value and write "]"
+ * (on its own indented line when beautifying).
+ */
+yajl_gen_status
+yajl_gen_array_close(yajl_gen g)
+{
+    ENSURE_VALID_STATE;
+    DECREMENT_DEPTH;
+
+    if ((g->flags & yajl_gen_beautify)) {
+        g->print(g->ctx, "\n", 1);
+    }
+
+    APPENDED_ATOM;
+    INSERT_WHITESPACE;
+    g->print(g->ctx, "]", 1);
+
+    FINAL_NEWLINE;
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Give access to the text generated so far: *buf receives a pointer to
+ * the internal buffer and *len its length in bytes. The memory stays
+ * owned by the generator. Returns yajl_gen_no_buf (leaving both outputs
+ * untouched) when a custom print callback is installed.
+ */
+yajl_gen_status
+yajl_gen_get_buf(yajl_gen g, const unsigned char ** buf,
+                 size_t * len)
+{
+    yajl_buf out;
+
+    if (g->print != (yajl_print_t)&yajl_buf_append) {
+        return yajl_gen_no_buf;
+    }
+
+    out = (yajl_buf)g->ctx;
+    *buf = yajl_buf_data(out);
+    *len = yajl_buf_len(out);
+    return yajl_gen_status_ok;
+}
+
+/*
+ * Discard the text accumulated in the internal buffer. Nothing happens
+ * when a custom print callback is installed. The generator state is not
+ * reset.
+ */
+void
+yajl_gen_clear(yajl_gen g)
+{
+    if (g->print != (yajl_print_t)&yajl_buf_append) {
+        return;
+    }
+
+    yajl_buf_clear((yajl_buf)g->ctx);
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_lex.c b/xlators/cluster/nsr-server/src/yajl_lex.c
new file mode 100644
index 000000000..b098e6a99
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_lex.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_lex.h"
+#include "yajl_buf.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef YAJL_LEXER_DEBUG
+/* Map a lexer token to a short label for the YAJL_LEXER_DEBUG trace.
+ *
+ * The labels mirror the enumerator names, so left/right brace and bracket
+ * tokens stay distinguishable in the trace. Any value without a case of
+ * its own is reported as "unknown". */
+static const char *
+tokToStr(yajl_tok tok)
+{
+    switch (tok) {
+        case yajl_tok_bool: return "bool";
+        case yajl_tok_colon: return "colon";
+        case yajl_tok_comma: return "comma";
+        case yajl_tok_eof: return "eof";
+        case yajl_tok_error: return "error";
+        case yajl_tok_left_brace: return "left_brace";
+        case yajl_tok_left_bracket: return "left_bracket";
+        case yajl_tok_null: return "null";
+        case yajl_tok_integer: return "integer";
+        case yajl_tok_double: return "double";
+        case yajl_tok_right_brace: return "right_brace";
+        case yajl_tok_right_bracket: return "right_bracket";
+        case yajl_tok_string: return "string";
+        case yajl_tok_string_with_escapes: return "string_with_escapes";
+        case yajl_tok_comment: return "comment";
+    }
+    return "unknown";
+}
+#endif
+
+/* Impact of the stream parsing feature on the lexer:
+ *
+ * YAJL supports stream parsing. That is, the ability to parse the first
+ * bits of a chunk of JSON before the last bits are available (still on
+ * the network or disk). This makes the lexer more complex. The
+ * responsibility of the lexer is to handle transparently the case where
+ * a chunk boundary falls in the middle of a token. This is
+ * accomplished via a buffer and a character reading abstraction.
+ *
+ * Overview of implementation
+ *
+ * When we lex to end of input string before end of token is hit, we
+ * copy all of the input text composing the token into our lexBuf.
+ *
+ * Every time we read a character, we do so through the readChar macro.
+ * readChar's responsibility is to handle pulling all chars from the buffer
+ * before pulling chars from input text
+ */
+
+/* Lexer state carried across calls to yajl_lex_lex. Zero-filled by
+ * yajl_lex_alloc before the individual fields are set. */
+struct yajl_lexer_t {
+ /* the overall line and char offset into the data.
+ * NOTE(review): within this file these are only zero-initialised and read
+ * back by yajl_lex_current_line / yajl_lex_current_char -- confirm whether
+ * anything else is expected to update them. */
+ size_t lineOff;
+ size_t charOff;
+
+ /* most recent error, reported through yajl_lex_get_error */
+ yajl_lex_error error;
+
+ /* an input buffer to handle the case where a token is spread over
+ * multiple chunks */
+ yajl_buf buf;
+
+ /* in the case where we have data in the lexBuf, bufOff holds
+ * the current offset into the lexBuf. */
+ size_t bufOff;
+
+ /* are we using the lex buf? */
+ unsigned int bufInUse;
+
+ /* shall we allow comments? */
+ unsigned int allowComments;
+
+ /* shall we validate utf8 inside strings? */
+ unsigned int validateUTF8;
+
+ /* allocation callbacks this lexer was created with; used again to
+ * release it in yajl_lex_free */
+ yajl_alloc_funcs * alloc;
+};
+
+/* readChar(lxr, txt, off): yield the next input byte and advance.
+ *
+ * While the lexer's carry-over buffer is in use and still holds unread
+ * bytes (bufOff < buffer length) the byte comes from that buffer and bufOff
+ * is advanced; otherwise it comes from txt[*off] and *off is advanced.
+ * The macro performs no bounds check on txt -- callers test *off against
+ * the text length before invoking it.
+ *
+ * Every use of the lxr argument is parenthesised so that an arbitrary
+ * expression can be passed safely. */
+#define readChar(lxr, txt, off) \
+    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ? \
+     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
+     ((txt)[(*(off))++]))
+
+/* unreadChar(lxr, off): step back one byte. The text offset is decremented
+ * when it is non-zero; otherwise the carry-over buffer offset is. */
+#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
+
+/* Allocate and initialise a lexer.
+ *
+ * alloc          allocation callbacks, used for the lexer and its buffer
+ *                and remembered for yajl_lex_free
+ * allowComments  non-zero to accept comments in the input
+ * validateUTF8   non-zero to validate UTF-8 sequences inside strings
+ *
+ * Returns the new lexer, or NULL when memory could not be obtained. The
+ * previous code passed the unchecked allocation result straight to
+ * memset(). */
+yajl_lexer
+yajl_lex_alloc(yajl_alloc_funcs * alloc,
+               unsigned int allowComments, unsigned int validateUTF8)
+{
+    yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
+
+    if (lxr == NULL) return NULL;
+
+    memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
+    lxr->buf = yajl_buf_alloc(alloc);
+    if (lxr->buf == NULL) {
+        /* NOTE(review): assumes yajl_buf_alloc reports failure by
+         * returning NULL -- confirm against yajl_buf.c */
+        YA_FREE(alloc, lxr);
+        return NULL;
+    }
+    lxr->allowComments = allowComments;
+    lxr->validateUTF8 = validateUTF8;
+    lxr->alloc = alloc;
+    return lxr;
+}
+
+/* Release a lexer created by yajl_lex_alloc: first its carry-over buffer,
+ * then the lexer itself through the allocator it was created with.
+ * Passing NULL is a no-op, in line with free() and with the NULL handling
+ * in yajl_lex_get_error. */
+void
+yajl_lex_free(yajl_lexer lxr)
+{
+    if (lxr == NULL) return;
+
+    yajl_buf_free(lxr->buf);
+    YA_FREE(lxr->alloc, lxr);
+}
+
+/* a lookup table, indexed by byte value, which lets us quickly determine
+ * the following properties of a character:
+ * VEC - valid escaped control char
+ * note. the solidus '/' may be escaped or not.
+ * IJC - invalid json char
+ * VHC - valid hex char
+ * NFP - needs further processing (from a string scanning perspective)
+ * NUC - needs utf8 checking when enabled (from a string scanning perspective)
+ *
+ * The flags are single bits so that several can be OR-ed into one entry
+ * and tested with a mask.
+ */
+#define VEC 0x01
+#define IJC 0x02
+#define VHC 0x04
+#define NFP 0x08
+#define NUC 0x10
+
+/* Layout, as encoded below:
+ * 0x00-0x1f are IJC;
+ * '"' (0x22) and '\\' (0x5c) carry NFP|VEC|IJC;
+ * '/', 'b', 'f', 'n', 'r' and 't' carry VEC ('u' is not flagged here and is
+ * special-cased by yajl_lex_string);
+ * '0'-'9', 'A'-'F' and 'a'-'f' carry VHC;
+ * 0x80-0xff carry NUC. */
+static const char charLookupTable[256] =
+{
+/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+
+/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
+/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC ,
+/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC ,
+/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 ,
+/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 ,
+
+/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 ,
+/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 ,
+/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 ,
+/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC
+};
+
+/** validate one variable-length UTF-8 sequence whose lead byte (curChar)
+ *  has already been consumed by the caller.
+ *
+ *  returns:
+ *    yajl_tok_string - the lead byte is plain ASCII, or every expected
+ *                      continuation byte was present and well formed
+ *    yajl_tok_eof    - the text offset reached jsonTextLen before all
+ *                      continuation bytes could be read
+ *    yajl_tok_error  - the lead byte is not a valid 1-4 byte lead, or a
+ *                      continuation byte does not have the form 10xxxxxx
+ *
+ *  NOTE: nothing is un-read here; on error the offset has been advanced
+ *  past every byte consumed, including the offending one. */
+#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
+
+static yajl_tok
+yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
+                   size_t jsonTextLen, size_t * offset,
+                   unsigned char curChar)
+{
+    unsigned int trailing;
+
+    /* classify the lead byte: how many continuation bytes must follow? */
+    if (curChar <= 0x7f) {
+        return yajl_tok_string;           /* 0xxxxxxx: single byte */
+    } else if ((curChar >> 5) == 0x6) {
+        trailing = 1;                     /* 110xxxxx */
+    } else if ((curChar >> 4) == 0x0e) {
+        trailing = 2;                     /* 1110xxxx */
+    } else if ((curChar >> 3) == 0x1e) {
+        trailing = 3;                     /* 11110xxx */
+    } else {
+        return yajl_tok_error;
+    }
+
+    /* each continuation byte must look like 10xxxxxx */
+    while (trailing > 0) {
+        UTF8_CHECK_EOF;
+        curChar = readChar(lexer, jsonText, offset);
+        if ((curChar >> 6) != 0x2) {
+            return yajl_tok_error;
+        }
+        trailing--;
+    }
+
+    return yajl_tok_string;
+}
+
+/* lex a string. input is the lexer, pointer to beginning of
+ * json text, and start of string (offset).
+ * a token is returned which has the following meanings:
+ * yajl_tok_string: lex of string was successful. offset points to
+ * terminating '"'.
+ * yajl_tok_eof: end of text was encountered before we could complete
+ * the lex.
+ * yajl_tok_error: embedded in the string were unallowable chars. offset
+ * points to the offending char
+ *
+ * STR_CHECK_EOF is the end-of-input guard used inside yajl_lex_string: it
+ * relies on that function's local `tok`, its `offset` / `jsonTextLen`
+ * parameters and its `finish_string_lex` label, so it cannot be used
+ * anywhere else.
+ */
+#define STR_CHECK_EOF \
+if (*offset >= jsonTextLen) { \
+ tok = yajl_tok_eof; \
+ goto finish_string_lex; \
+}
+
+/** count the leading run of bytes in buf that a string lex can skip
+ *  without looking at them individually.
+ *
+ *  A byte ends the run when its charLookupTable entry carries IJC or NFP,
+ *  or NUC when utf8check is non-zero. At most len bytes are examined.
+ *  Returns the length of the skippable run (len if nothing interesting
+ *  was found). */
+static size_t
+yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
+{
+    unsigned char interesting = IJC|NFP;
+    size_t i;
+
+    if (utf8check) {
+        interesting |= NUC;
+    }
+
+    for (i = 0; i < len; i++) {
+        if (charLookupTable[buf[i]] & interesting) {
+            break;
+        }
+    }
+
+    return i;
+}
+
+/* Lex the remainder of a string; the caller has already consumed the
+ * opening quote.
+ *
+ * Returns yajl_tok_string on reaching the closing quote,
+ * yajl_tok_string_with_escapes if at least one backslash escape was seen
+ * on the way, yajl_tok_eof if the input ran out first, and yajl_tok_error
+ * (with lexer->error set) on an invalid escape, an invalid \u hex digit, an
+ * invalid character or, when validateUTF8 is on, malformed UTF-8. */
+static yajl_tok
+yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ yajl_tok tok = yajl_tok_error;
+ int hasEscapes = 0;
+
+ for (;;) {
+ unsigned char curChar;
+
+ /* now jump into a faster scanning routine to skip as much
+ * of the buffers as possible */
+ {
+ const unsigned char * p;
+ size_t len;
+
+ /* same "is there unread carry-over data?" test as readChar */
+ if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
+ lexer->bufOff < yajl_buf_len(lexer->buf)))
+ {
+ p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
+ (lexer->bufOff));
+ len = yajl_buf_len(lexer->buf) - lexer->bufOff;
+ lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ else if (*offset < jsonTextLen)
+ {
+ p = jsonText + *offset;
+ len = jsonTextLen - *offset;
+ *offset += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ }
+
+ STR_CHECK_EOF;
+
+ curChar = readChar(lexer, jsonText, offset);
+
+ /* quote terminates */
+ if (curChar == '"') {
+ tok = yajl_tok_string;
+ break;
+ }
+ /* backslash escapes a set of control chars, */
+ else if (curChar == '\\') {
+ hasEscapes = 1;
+ STR_CHECK_EOF;
+
+ /* special case \u */
+ curChar = readChar(lexer, jsonText, offset);
+ if (curChar == 'u') {
+ unsigned int i = 0;
+
+ /* \u must be followed by exactly four hex digits */
+ for (i=0;i<4;i++) {
+ STR_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if (!(charLookupTable[curChar] & VHC)) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_hex_char;
+ goto finish_string_lex;
+ }
+ }
+ } else if (!(charLookupTable[curChar] & VEC)) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_escaped_char;
+ goto finish_string_lex;
+ }
+ }
+ /* when not validating UTF8 it's a simple table lookup to determine
+ * if the present character is invalid */
+ else if(charLookupTable[curChar] & IJC) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_json_char;
+ goto finish_string_lex;
+ }
+ /* when in validate UTF8 mode we need to do some extra work */
+ else if (lexer->validateUTF8) {
+ yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
+ offset, curChar);
+
+ if (t == yajl_tok_eof) {
+ tok = yajl_tok_eof;
+ goto finish_string_lex;
+ } else if (t == yajl_tok_error) {
+ lexer->error = yajl_lex_string_invalid_utf8;
+ goto finish_string_lex;
+ }
+ }
+ /* accept it, and move on */
+ }
+ finish_string_lex:
+ /* tell our buddy, the parser, whether he needs to process this string
+ * again */
+ if (hasEscapes && tok == yajl_tok_string) {
+ tok = yajl_tok_string_with_escapes;
+ }
+
+ return tok;
+}
+
+/* End-of-input guard for functions that have `offset` and `jsonTextLen`
+ * in scope and return a yajl_tok. */
+#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
+
+/* Lex a number starting at its first character (an optional '-' followed
+ * by digits, an optional fraction and an optional exponent).
+ *
+ * Returns yajl_tok_integer, or yajl_tok_double when a fraction or exponent
+ * is present. Returns yajl_tok_eof whenever the input ends before a
+ * character that terminates the number has been seen, and yajl_tok_error
+ * (with lexer->error set) when a required digit is missing. On success the
+ * terminating character is pushed back with unreadChar. */
+static yajl_tok
+yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ /** XXX: numbers are the only entities in json that we must lex
+ * _beyond_ in order to know that they are complete. There
+ * is an ambiguous case for integers at EOF. */
+
+ unsigned char c;
+
+ yajl_tok tok = yajl_tok_integer;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* optional leading minus */
+ if (c == '-') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ /* a single zero, or a series of integers */
+ if (c == '0') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } else if (c >= '1' && c <= '9') {
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c >= '0' && c <= '9');
+ } else {
+ /* neither '0' nor '1'-'9': no integer part was found */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_minus;
+ return yajl_tok_error;
+ }
+
+ /* optional fraction (indicates this is floating point) */
+ if (c == '.') {
+ int numRd = 0;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ while (c >= '0' && c <= '9') {
+ numRd++;
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ /* at least one digit must follow the decimal point */
+ if (!numRd) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_decimal;
+ return yajl_tok_error;
+ }
+ tok = yajl_tok_double;
+ }
+
+ /* optional exponent (indicates this is floating point) */
+ if (c == 'e' || c == 'E') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* optional sign */
+ if (c == '+' || c == '-') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ if (c >= '0' && c <= '9') {
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c >= '0' && c <= '9');
+ } else {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_exponent;
+ return yajl_tok_error;
+ }
+ tok = yajl_tok_double;
+ }
+
+ /* we always go "one too far" */
+ unreadChar(lexer, offset);
+
+ return tok;
+}
+
+/* Lex a comment; the caller has already consumed the leading '/'.
+ *
+ * A second '/' starts a line comment that runs through the next '\n';
+ * a '*' starts a block comment that runs through the next "*" "/" pair.
+ * Any other character sets lexer->error to yajl_lex_invalid_char.
+ *
+ * Returns yajl_tok_comment once the whole comment has been consumed,
+ * yajl_tok_eof if the input ends first, and yajl_tok_error for a
+ * malformed opening. */
+static yajl_tok
+yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
+                 size_t jsonTextLen, size_t * offset)
+{
+    unsigned char ch;
+
+    RETURN_IF_EOF;
+    ch = readChar(lexer, jsonText, offset);
+
+    /* either slash or star expected */
+    if (ch != '/' && ch != '*') {
+        lexer->error = yajl_lex_invalid_char;
+        return yajl_tok_error;
+    }
+
+    if (ch == '/') {
+        /* line comment: discard everything up to and including '\n' */
+        for (;;) {
+            RETURN_IF_EOF;
+            ch = readChar(lexer, jsonText, offset);
+            if (ch == '\n') {
+                return yajl_tok_comment;
+            }
+        }
+    }
+
+    /* block comment: discard until a '*' is directly followed by '/' */
+    for (;;) {
+        RETURN_IF_EOF;
+        ch = readChar(lexer, jsonText, offset);
+        if (ch != '*') {
+            continue;
+        }
+
+        RETURN_IF_EOF;
+        ch = readChar(lexer, jsonText, offset);
+        if (ch == '/') {
+            return yajl_tok_comment;
+        }
+
+        /* not the terminator; re-examine this char, it may be a '*' */
+        unreadChar(lexer, offset);
+    }
+}
+
+/* Lex the next token from jsonText starting at *offset.
+ *
+ * On return *offset has been advanced past what was consumed. For tokens
+ * other than eof/error, *outBuf / *outLen describe the token text: normally
+ * a slice of jsonText, or the lexer's internal buffer when the token was
+ * carried over from an earlier chunk. For string tokens the surrounding
+ * quotes are excluded.
+ *
+ * When the chunk ends in the middle of a token, yajl_tok_eof is returned
+ * and the partial token text is saved in the lexer so the next call can
+ * continue from it. On yajl_tok_error, lexer->error holds the reason.
+ *
+ * Note the token naming used by this lexer: '{' and '}' produce
+ * yajl_tok_left_bracket / yajl_tok_right_bracket, while '[' and ']'
+ * produce yajl_tok_left_brace / yajl_tok_right_brace. */
+yajl_tok
+yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset,
+ const unsigned char ** outBuf, size_t * outLen)
+{
+ yajl_tok tok = yajl_tok_error;
+ unsigned char c;
+ size_t startOffset = *offset;
+
+ *outBuf = NULL;
+ *outLen = 0;
+
+ for (;;) {
+ assert(*offset <= jsonTextLen);
+
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+
+ c = readChar(lexer, jsonText, offset);
+
+ switch (c) {
+ case '{':
+ tok = yajl_tok_left_bracket;
+ goto lexed;
+ case '}':
+ tok = yajl_tok_right_bracket;
+ goto lexed;
+ case '[':
+ tok = yajl_tok_left_brace;
+ goto lexed;
+ case ']':
+ tok = yajl_tok_right_brace;
+ goto lexed;
+ case ',':
+ tok = yajl_tok_comma;
+ goto lexed;
+ case ':':
+ tok = yajl_tok_colon;
+ goto lexed;
+ case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
+ /* skip whitespace: move the token start along with it */
+ startOffset++;
+ break;
+ case 't': {
+ /* the remaining characters must spell "true" */
+ const char * want = "rue";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_bool;
+ goto lexed;
+ }
+ case 'f': {
+ /* the remaining characters must spell "false" */
+ const char * want = "alse";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_bool;
+ goto lexed;
+ }
+ case 'n': {
+ /* the remaining characters must spell "null" */
+ const char * want = "ull";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_null;
+ goto lexed;
+ }
+ case '"': {
+ tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ goto lexed;
+ }
+ case '-':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ /* integer parsing wants to start from the beginning */
+ unreadChar(lexer, offset);
+ tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ goto lexed;
+ }
+ case '/':
+ /* hey, look, a probable comment! If comments are disabled
+ * it's an error. */
+ if (!lexer->allowComments) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_unallowed_comment;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ /* if comments are enabled, then we should try to lex
+ * the thing. possible outcomes are
+ * - successful lex (tok_comment, which means continue),
+ * - malformed comment opening (slash not followed by
+ * '*' or '/') (tok_error)
+ * - eof hit. (tok_eof) */
+ tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ if (tok == yajl_tok_comment) {
+ /* "error" is silly, but that's the initial
+ * state of tok. guilty until proven innocent. */
+ tok = yajl_tok_error;
+ /* the comment is discarded entirely: drop any carried-over
+ * text and restart the token after it */
+ yajl_buf_clear(lexer->buf);
+ lexer->bufInUse = 0;
+ startOffset = *offset;
+ break;
+ }
+ /* hit error or eof, bail */
+ goto lexed;
+ default:
+ lexer->error = yajl_lex_invalid_char;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ }
+
+
+ lexed:
+ /* need to append to buffer if the buffer is in use or
+ * if it's an EOF token */
+ if (tok == yajl_tok_eof || lexer->bufInUse) {
+ if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
+ lexer->bufInUse = 1;
+ yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
+ lexer->bufOff = 0;
+
+ /* a completed token that spanned chunks is returned from the
+ * internal buffer, which is then released for the next token */
+ if (tok != yajl_tok_eof) {
+ *outBuf = yajl_buf_data(lexer->buf);
+ *outLen = yajl_buf_len(lexer->buf);
+ lexer->bufInUse = 0;
+ }
+ } else if (tok != yajl_tok_error) {
+ *outBuf = jsonText + startOffset;
+ *outLen = *offset - startOffset;
+ }
+
+ /* special case for strings. skip the quotes. */
+ if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
+ {
+ assert(*outLen >= 2);
+ (*outBuf)++;
+ *outLen -= 2;
+ }
+
+
+#ifdef YAJL_LEXER_DEBUG
+ if (tok == yajl_tok_error) {
+ printf("lexical error: %s\n",
+ yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
+ } else if (tok == yajl_tok_eof) {
+ printf("EOF hit\n");
+ } else {
+ printf("lexed %s: '", tokToStr(tok));
+ fwrite(*outBuf, 1, *outLen, stdout);
+ printf("'\n");
+ }
+#endif
+
+ return tok;
+}
+
+const char *
+yajl_lex_error_to_string(yajl_lex_error error)
+{
+ switch (error) {
+ case yajl_lex_e_ok:
+ return "ok, no error";
+ case yajl_lex_string_invalid_utf8:
+ return "invalid bytes in UTF8 string.";
+ case yajl_lex_string_invalid_escaped_char:
+ return "inside a string, '\\' occurs before a character "
+ "which it may not.";
+ case yajl_lex_string_invalid_json_char:
+ return "invalid character inside string.";
+ case yajl_lex_string_invalid_hex_char:
+ return "invalid (non-hex) character occurs after '\\u' inside "
+ "string.";
+ case yajl_lex_invalid_char:
+ return "invalid char in json text.";
+ case yajl_lex_invalid_string:
+ return "invalid string in json text.";
+ case yajl_lex_missing_integer_after_exponent:
+ return "malformed number, a digit is required after the exponent.";
+ case yajl_lex_missing_integer_after_decimal:
+ return "malformed number, a digit is required after the "
+ "decimal point.";
+ case yajl_lex_missing_integer_after_minus:
+ return "malformed number, a digit is required after the "
+ "minus sign.";
+ case yajl_lex_unallowed_comment:
+ return "probable comment found in input text, comments are "
+ "not enabled.";
+ }
+ return "unknown error code";
+}
+
+
+/** allows access to more specific information about the lexical
+ * error when yajl_lex_lex returns yajl_tok_error. */
+yajl_lex_error
+yajl_lex_get_error(yajl_lexer lexer)
+{
+ if (lexer == NULL) return (yajl_lex_error) -1;
+ return lexer->error;
+}
+
+/* Return the lexer's stored line offset (the lineOff field). */
+size_t yajl_lex_current_line(yajl_lexer lexer)
+{
+    const size_t line = lexer->lineOff;
+
+    return line;
+}
+
+/* Return the lexer's stored character offset (the charOff field). */
+size_t yajl_lex_current_char(yajl_lexer lexer)
+{
+    const size_t column = lexer->charOff;
+
+    return column;
+}
+
+/* Report which token comes next without consuming it.
+ *
+ * The lexer's carry-over state (buffer length, buffer offset and the
+ * in-use flag) is snapshotted, a normal lex is run, and the snapshot is
+ * put back afterwards. `offset` is taken by value, so the caller's
+ * position is untouched as well. The token text is discarded. */
+yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
+                       size_t jsonTextLen, size_t offset)
+{
+    const unsigned int savedInUse = lexer->bufInUse;
+    const size_t savedOff = lexer->bufOff;
+    const size_t savedLen = yajl_buf_len(lexer->buf);
+    const unsigned char * ignoredText;
+    size_t ignoredLen;
+    yajl_tok next;
+
+    next = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
+                        &ignoredText, &ignoredLen);
+
+    /* undo whatever the lex did to the carry-over buffer */
+    yajl_buf_truncate(lexer->buf, savedLen);
+    lexer->bufInUse = savedInUse;
+    lexer->bufOff = savedOff;
+
+    return next;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_lex.h b/xlators/cluster/nsr-server/src/yajl_lex.h
new file mode 100644
index 000000000..cbaae0c13
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_lex.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_LEX_H__
+#define __YAJL_LEX_H__
+
+#include "yajl/yajl_common.h"
+
+/* Token kinds produced by the lexer.
+ *
+ * Naming note: in yajl_lex.c, '{' / '}' are reported as
+ * yajl_tok_left_bracket / yajl_tok_right_bracket and '[' / ']' as
+ * yajl_tok_left_brace / yajl_tok_right_brace. */
+typedef enum {
+ yajl_tok_bool,
+ yajl_tok_colon,
+ yajl_tok_comma,
+ /* input ended before a complete token was seen */
+ yajl_tok_eof,
+ /* lexical error; details via yajl_lex_get_error */
+ yajl_tok_error,
+ yajl_tok_left_brace,
+ yajl_tok_left_bracket,
+ yajl_tok_null,
+ yajl_tok_right_brace,
+ yajl_tok_right_bracket,
+
+ /* we differentiate between integers and doubles to allow the
+ * parser to interpret the number without re-scanning */
+ yajl_tok_integer,
+ yajl_tok_double,
+
+ /* we differentiate between strings which require further processing,
+ * and strings that do not */
+ yajl_tok_string,
+ yajl_tok_string_with_escapes,
+
+ /* comment tokens are not currently returned to the parser, ever */
+ yajl_tok_comment
+} yajl_tok;
+
+typedef struct yajl_lexer_t * yajl_lexer;
+
+yajl_lexer yajl_lex_alloc(yajl_alloc_funcs * alloc,
+ unsigned int allowComments,
+ unsigned int validateUTF8);
+
+void yajl_lex_free(yajl_lexer lexer);
+
+/**
+ * run/continue a lex. "offset" is an input/output parameter.
+ * It should be initialized to zero for a
+ * new chunk of target text, and upon subsequent calls with the same
+ * target text should be passed with the value of the previous invocation.
+ *
+ * the client may be interested in the value of offset when an error is
+ * returned from the lexer. This allows the client to render useful
+ * error messages.
+ *
+ * When you pass the next chunk of data, offset should be reinitialized
+ * to zero.
+ *
+ * Finally, the output buffer is usually just a pointer into the jsonText,
+ * however in cases where the entity being lexed spans multiple chunks,
+ * the lexer will buffer the entity and the data returned will be
+ * a pointer into that buffer.
+ *
+ * This behavior is abstracted from client code except for the performance
+ * implications which require that the client choose a reasonable chunk
+ * size to get adequate performance.
+ */
+yajl_tok yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset,
+ const unsigned char ** outBuf, size_t * outLen);
+
+/** have a peek at the next token, but don't move the lexer forward */
+yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t offset);
+
+
+/* Reasons the lexer can report alongside yajl_tok_error. Each code has a
+ * matching message in yajl_lex_error_to_string. */
+typedef enum {
+ yajl_lex_e_ok = 0,
+ /* errors raised while lexing a string */
+ yajl_lex_string_invalid_utf8,
+ yajl_lex_string_invalid_escaped_char,
+ yajl_lex_string_invalid_json_char,
+ yajl_lex_string_invalid_hex_char,
+ /* unexpected character, or a misspelt true/false/null literal */
+ yajl_lex_invalid_char,
+ yajl_lex_invalid_string,
+ /* errors raised while lexing a number */
+ yajl_lex_missing_integer_after_decimal,
+ yajl_lex_missing_integer_after_exponent,
+ yajl_lex_missing_integer_after_minus,
+ /* a '/' was seen while comments are disabled */
+ yajl_lex_unallowed_comment
+} yajl_lex_error;
+
+/** translate a lexer error code into a static human-readable message */
+const char * yajl_lex_error_to_string(yajl_lex_error error);
+
+/** allows access to more specific information about the lexical
+ * error when yajl_lex_lex returns yajl_tok_error. */
+yajl_lex_error yajl_lex_get_error(yajl_lexer lexer);
+
+/** get the current offset into the most recently lexed json string.
+ * NOTE(review): the accompanying yajl_lex.c does not appear to define
+ * this function -- confirm before relying on it. */
+size_t yajl_lex_current_offset(yajl_lexer lexer);
+
+/** get the number of lines lexed by this lexer instance.
+ * NOTE(review): yajl_lex.c only zero-initialises and reads back the
+ * underlying counter -- confirm whether it is ever advanced. */
+size_t yajl_lex_current_line(yajl_lexer lexer);
+
+/** get the number of chars lexed by this lexer instance since the last
+ * \n or \r.
+ * NOTE(review): same caveat as yajl_lex_current_line. */
+size_t yajl_lex_current_char(yajl_lexer lexer);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_parser.c b/xlators/cluster/nsr-server/src/yajl_parser.c
new file mode 100644
index 000000000..bf9ef24ef
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_parser.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_parse.h"
+#include "yajl_lex.h"
+#include "yajl_parser.h"
+#include "yajl_encode.h"
+#include "yajl_bytestack.h"
+
+#include <stdlib.h>
+#include <limits.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+
+/* Largest accumulator value that can still be multiplied by ten without
+ * overflowing a long long. */
+#define MAX_VALUE_TO_MULTIPLY (LLONG_MAX / 10)
+
+/*
+ * Locale-independent decimal integer parser with strtol-like semantics.
+ *
+ * number: text to parse, not necessarily NUL terminated.
+ * length: number of bytes available at `number`.
+ *
+ * An optional leading '-' and/or '+' is accepted. Digits are consumed until
+ * `length` bytes have been read or a non-digit byte is found, whichever
+ * comes first. Returns the parsed value. When the magnitude does not fit in
+ * LLONG_MAX, errno is set to ERANGE and LLONG_MAX or LLONG_MIN is returned
+ * depending on the sign. errno is left untouched on success, so callers
+ * must clear it before the call if they intend to test it afterwards.
+ */
+long long
+yajl_parse_integer(const unsigned char *number, unsigned int length)
+{
+    const unsigned char *pos = number;
+    const unsigned char *end = number + length;
+    long long ret = 0;
+    long sign = 1;
+
+    /* never look at bytes beyond the caller-supplied length */
+    if (pos < end && *pos == '-') { pos++; sign = -1; }
+    if (pos < end && *pos == '+') { pos++; }
+
+    while (pos < end) {
+        int digit;
+
+        /* stop at the first non-digit, like strtol does */
+        if (*pos < '0' || *pos > '9') break;
+        digit = *pos - '0';
+
+        /* ret * 10 must not exceed LLONG_MAX */
+        if (ret > MAX_VALUE_TO_MULTIPLY) {
+            errno = ERANGE;
+            return sign == 1 ? LLONG_MAX : LLONG_MIN;
+        }
+        ret *= 10;
+        /* ret + digit must not exceed LLONG_MAX */
+        if (LLONG_MAX - ret < digit) {
+            errno = ERANGE;
+            return sign == 1 ? LLONG_MAX : LLONG_MIN;
+        }
+        ret += digit;
+        pos++;
+    }
+
+    return sign * ret;
+}
+
+/*
+ * Build a human readable description of the error recorded in `hand`.
+ *
+ * The first line is "<kind> error[: <detail>]\n" where kind is "parse",
+ * "lexical" or "unknown" depending on the state on top of the state stack.
+ * When `verbose` is non-zero, a second line holding up to 30 bytes of
+ * jsonText on either side of hand->bytesConsumed (newlines replaced by
+ * spaces) and a third line with a marker arrow are appended.
+ *
+ * The string is allocated with the handle's allocator. NULL is returned
+ * when an allocation fails.
+ */
+unsigned char *
+yajl_render_error_string(yajl_handle hand, const unsigned char * jsonText,
+                         size_t jsonTextLen, int verbose)
+{
+    const char * arrow = " (right here) ------^\n";
+    const size_t errPos = hand->bytesConsumed;
+    const char * kind;
+    const char * detail = NULL;
+    unsigned char * result;
+    size_t headerLen;
+
+    switch (yajl_bs_current(hand->stateStack)) {
+        case yajl_state_parse_error:
+            kind = "parse";
+            detail = hand->parseError;
+            break;
+        case yajl_state_lexical_error:
+            kind = "lexical";
+            detail = yajl_lex_error_to_string(yajl_lex_get_error(hand->lexer));
+            break;
+        default:
+            kind = "unknown";
+            break;
+    }
+
+    /* header line: the two extra bytes hold the newline and the NUL */
+    headerLen = strlen(kind) + strlen(" error");
+    if (detail != NULL) {
+        headerLen += strlen(": ") + strlen(detail);
+    }
+    result = (unsigned char *) YA_MALLOC(&(hand->alloc), headerLen + 2);
+    if (!result) return NULL;
+    result[0] = 0;
+    strcat((char *) result, kind);
+    strcat((char *) result, " error");
+    if (detail != NULL) {
+        strcat((char *) result, ": ");
+        strcat((char *) result, detail);
+    }
+    strcat((char *) result, "\n");
+
+    if (!verbose) {
+        return result;
+    }
+
+    {
+        /* at most 10 + 60 (or 40 - pos + pos + 30) bytes, plus "\n\0" */
+        char context[72];
+        char * combined;
+        size_t pad = 10;
+        size_t from = 0;
+        size_t to = errPos + 30;
+        size_t n = 0;
+
+        /* left-pad so that the offending byte lines up with the arrow */
+        if (errPos < 30) {
+            pad = 40 - errPos;
+        } else {
+            from = errPos - 30;
+        }
+        if (to > jsonTextLen) {
+            to = jsonTextLen;
+        }
+
+        while (n < pad) {
+            context[n++] = ' ';
+        }
+        while (from < to) {
+            const unsigned char c = jsonText[from++];
+            context[n++] = (c == '\n' || c == '\r') ? ' ' : (char) c;
+        }
+        assert(n <= 71);
+        context[n++] = '\n';
+        context[n] = 0;
+
+        combined = (char *)
+            YA_MALLOC(&(hand->alloc), (unsigned int)(strlen((char *) result) +
+                                                     strlen((char *) context) +
+                                                     strlen(arrow) + 1));
+        if (combined) {
+            combined[0] = 0;
+            strcat(combined, (char *) result);
+            strcat(combined, context);
+            strcat(combined, arrow);
+        }
+        /* the short form is dropped even when the long one is unavailable */
+        YA_FREE(&(hand->alloc), result);
+        result = (unsigned char *) combined;
+    }
+
+    return result;
+}
+
+/*
+ * Check for client cancellation: when the callback result `x` is zero,
+ * record a parse error on the handle named `hand` in the enclosing scope
+ * and return yajl_status_client_canceled from the enclosing function.
+ * Wrapped in do/while(0) so that the macro behaves as a single statement.
+ */
+#define _CC_CHK(x)                                                    \
+    do {                                                              \
+        if (!(x)) {                                                   \
+            yajl_bs_set(hand->stateStack, yajl_state_parse_error);    \
+            hand->parseError =                                        \
+                "client cancelled parse via callback return value";   \
+            return yajl_status_client_canceled;                       \
+        }                                                             \
+    } while (0)
+
+
+/*
+ * Signal end of input to the parser.
+ *
+ * A single space is pushed through yajl_do_parse() first (presumably so
+ * that a token still pending in the lexer gets terminated); any failure of
+ * that call is returned as is. Afterwards the state on top of the stack
+ * decides the outcome: a finished value is ok, an error state is an error,
+ * and anything else is a "premature EOF" parse error unless
+ * yajl_allow_partial_values is set.
+ */
+yajl_status
+yajl_do_finish(yajl_handle hand)
+{
+    const yajl_status flushed =
+        yajl_do_parse(hand, (const unsigned char *) " ", 1);
+
+    if (flushed != yajl_status_ok) {
+        return flushed;
+    }
+
+    switch (yajl_bs_current(hand->stateStack)) {
+        case yajl_state_got_value:
+        case yajl_state_parse_complete:
+            return yajl_status_ok;
+        case yajl_state_parse_error:
+        case yajl_state_lexical_error:
+            return yajl_status_error;
+        default:
+            break;
+    }
+
+    /* input ended in the middle of a value */
+    if (hand->flags & yajl_allow_partial_values) {
+        return yajl_status_ok;
+    }
+    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+    hand->parseError = "premature EOF";
+    return yajl_status_error;
+}
+
+/*
+ * Run the parser state machine over one chunk of JSON text.
+ *
+ * hand->bytesConsumed is reset to 0 on entry and then advanced by the
+ * lexer; after an error it serves as the error offset. Tokens are pulled
+ * from the lexer and dispatched on the state on top of hand->stateStack,
+ * invoking the client callbacks that are set.
+ *
+ * Returns yajl_status_ok when the chunk is exhausted or parsing is
+ * complete, yajl_status_error once the stack top is an error state, and
+ * yajl_status_client_canceled (via _CC_CHK) when a callback returns zero.
+ */
+yajl_status
+yajl_do_parse(yajl_handle hand, const unsigned char * jsonText,
+              size_t jsonTextLen)
+{
+    yajl_tok tok;
+    const unsigned char * buf;
+    size_t bufLen;
+    size_t * offset = &(hand->bytesConsumed);
+
+    *offset = 0;
+
+  around_again:
+    switch (yajl_bs_current(hand->stateStack)) {
+        case yajl_state_parse_complete:
+            if (hand->flags & yajl_allow_multiple_values) {
+                yajl_bs_set(hand->stateStack, yajl_state_got_value);
+                goto around_again;
+            }
+            if (!(hand->flags & yajl_allow_trailing_garbage)) {
+                if (*offset != jsonTextLen) {
+                    tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+                                       offset, &buf, &bufLen);
+                    if (tok != yajl_tok_eof) {
+                        yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+                        hand->parseError = "trailing garbage";
+                    }
+                    goto around_again;
+                }
+            }
+            return yajl_status_ok;
+        case yajl_state_lexical_error:
+        case yajl_state_parse_error:
+            return yajl_status_error;
+        case yajl_state_start:
+        case yajl_state_got_value:
+        case yajl_state_map_need_val:
+        case yajl_state_array_need_val:
+        case yajl_state_array_start: {
+            /* for arrays and maps, we advance the state for this
+             * depth, then push the state of the next depth.
+             * If an error occurs during the parsing of the nesting
+             * entity, the state at this level will not matter.
+             * a state that needs pushing will be anything other
+             * than state_start */
+
+            yajl_state stateToPush = yajl_state_start;
+
+            tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+                               offset, &buf, &bufLen);
+
+            switch (tok) {
+                case yajl_tok_eof:
+                    return yajl_status_ok;
+                case yajl_tok_error:
+                    yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+                    goto around_again;
+                case yajl_tok_string:
+                    if (hand->callbacks && hand->callbacks->yajl_string) {
+                        _CC_CHK(hand->callbacks->yajl_string(hand->ctx,
+                                                             buf, bufLen));
+                    }
+                    break;
+                case yajl_tok_string_with_escapes:
+                    if (hand->callbacks && hand->callbacks->yajl_string) {
+                        yajl_buf_clear(hand->decodeBuf);
+                        yajl_string_decode(hand->decodeBuf, buf, bufLen);
+                        _CC_CHK(hand->callbacks->yajl_string(
+                                    hand->ctx, yajl_buf_data(hand->decodeBuf),
+                                    yajl_buf_len(hand->decodeBuf)));
+                    }
+                    break;
+                case yajl_tok_bool:
+                    if (hand->callbacks && hand->callbacks->yajl_boolean) {
+                        _CC_CHK(hand->callbacks->yajl_boolean(hand->ctx,
+                                                              *buf == 't'));
+                    }
+                    break;
+                case yajl_tok_null:
+                    if (hand->callbacks && hand->callbacks->yajl_null) {
+                        _CC_CHK(hand->callbacks->yajl_null(hand->ctx));
+                    }
+                    break;
+                case yajl_tok_left_bracket:
+                    if (hand->callbacks && hand->callbacks->yajl_start_map) {
+                        _CC_CHK(hand->callbacks->yajl_start_map(hand->ctx));
+                    }
+                    stateToPush = yajl_state_map_start;
+                    break;
+                case yajl_tok_left_brace:
+                    if (hand->callbacks && hand->callbacks->yajl_start_array) {
+                        _CC_CHK(hand->callbacks->yajl_start_array(hand->ctx));
+                    }
+                    stateToPush = yajl_state_array_start;
+                    break;
+                case yajl_tok_integer:
+                    if (hand->callbacks) {
+                        if (hand->callbacks->yajl_number) {
+                            _CC_CHK(hand->callbacks->yajl_number(
+                                        hand->ctx,(const char *) buf, bufLen));
+                        } else if (hand->callbacks->yajl_integer) {
+                            long long int i = 0;
+                            /* clear errno so that a stale ERANGE is not
+                             * mistaken for an overflow of this number */
+                            errno = 0;
+                            i = yajl_parse_integer(buf, bufLen);
+                            if ((i == LLONG_MIN || i == LLONG_MAX) &&
+                                errno == ERANGE)
+                            {
+                                yajl_bs_set(hand->stateStack,
+                                            yajl_state_parse_error);
+                                hand->parseError = "integer overflow" ;
+                                /* try to restore error offset */
+                                if (*offset >= bufLen) *offset -= bufLen;
+                                else *offset = 0;
+                                goto around_again;
+                            }
+                            _CC_CHK(hand->callbacks->yajl_integer(hand->ctx,
+                                                                  i));
+                        }
+                    }
+                    break;
+                case yajl_tok_double:
+                    if (hand->callbacks) {
+                        if (hand->callbacks->yajl_number) {
+                            _CC_CHK(hand->callbacks->yajl_number(
+                                        hand->ctx, (const char *) buf, bufLen));
+                        } else if (hand->callbacks->yajl_double) {
+                            double d = 0.0;
+                            yajl_buf_clear(hand->decodeBuf);
+                            yajl_buf_append(hand->decodeBuf, buf, bufLen);
+                            buf = yajl_buf_data(hand->decodeBuf);
+                            /* strtod only sets errno on failure, so it has
+                             * to be cleared before it can be trusted */
+                            errno = 0;
+                            d = strtod((char *) buf, NULL);
+                            if ((d == HUGE_VAL || d == -HUGE_VAL) &&
+                                errno == ERANGE)
+                            {
+                                yajl_bs_set(hand->stateStack,
+                                            yajl_state_parse_error);
+                                hand->parseError = "numeric (floating point) "
+                                    "overflow";
+                                /* try to restore error offset */
+                                if (*offset >= bufLen) *offset -= bufLen;
+                                else *offset = 0;
+                                goto around_again;
+                            }
+                            _CC_CHK(hand->callbacks->yajl_double(hand->ctx,
+                                                                 d));
+                        }
+                    }
+                    break;
+                case yajl_tok_right_brace: {
+                    if (yajl_bs_current(hand->stateStack) ==
+                        yajl_state_array_start)
+                    {
+                        if (hand->callbacks &&
+                            hand->callbacks->yajl_end_array)
+                        {
+                            _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx));
+                        }
+                        yajl_bs_pop(hand->stateStack);
+                        goto around_again;
+                    }
+                    /* intentional fall-through */
+                }
+                case yajl_tok_colon:
+                case yajl_tok_comma:
+                case yajl_tok_right_bracket:
+                    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+                    hand->parseError =
+                        "unallowed token at this point in JSON text";
+                    goto around_again;
+                default:
+                    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+                    hand->parseError = "invalid token, internal error";
+                    goto around_again;
+            }
+            /* got a value. transition depends on the state we're in. */
+            {
+                yajl_state s = yajl_bs_current(hand->stateStack);
+                if (s == yajl_state_start || s == yajl_state_got_value) {
+                    yajl_bs_set(hand->stateStack, yajl_state_parse_complete);
+                } else if (s == yajl_state_map_need_val) {
+                    yajl_bs_set(hand->stateStack, yajl_state_map_got_val);
+                } else {
+                    yajl_bs_set(hand->stateStack, yajl_state_array_got_val);
+                }
+            }
+            if (stateToPush != yajl_state_start) {
+                yajl_bs_push(hand->stateStack, stateToPush);
+            }
+
+            goto around_again;
+        }
+        case yajl_state_map_start:
+        case yajl_state_map_need_key: {
+            /* only difference between these two states is that in
+             * start '}' is valid, whereas in need_key, we've parsed
+             * a comma, and a string key _must_ follow */
+            tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+                               offset, &buf, &bufLen);
+            switch (tok) {
+                case yajl_tok_eof:
+                    return yajl_status_ok;
+                case yajl_tok_error:
+                    yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+                    goto around_again;
+                case yajl_tok_string_with_escapes:
+                    if (hand->callbacks && hand->callbacks->yajl_map_key) {
+                        yajl_buf_clear(hand->decodeBuf);
+                        yajl_string_decode(hand->decodeBuf, buf, bufLen);
+                        buf = yajl_buf_data(hand->decodeBuf);
+                        bufLen = yajl_buf_len(hand->decodeBuf);
+                    }
+                    /* intentional fall-through */
+                case yajl_tok_string:
+                    if (hand->callbacks && hand->callbacks->yajl_map_key) {
+                        _CC_CHK(hand->callbacks->yajl_map_key(hand->ctx, buf,
+                                                              bufLen));
+                    }
+                    yajl_bs_set(hand->stateStack, yajl_state_map_sep);
+                    goto around_again;
+                case yajl_tok_right_bracket:
+                    if (yajl_bs_current(hand->stateStack) ==
+                        yajl_state_map_start)
+                    {
+                        if (hand->callbacks && hand->callbacks->yajl_end_map) {
+                            _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx));
+                        }
+                        yajl_bs_pop(hand->stateStack);
+                        goto around_again;
+                    }
+                    /* '}' right after a comma: fall through to the error */
+                default:
+                    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+                    hand->parseError =
+                        "invalid object key (must be a string)";
+                    goto around_again;
+            }
+        }
+        case yajl_state_map_sep: {
+            tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+                               offset, &buf, &bufLen);
+            switch (tok) {
+                case yajl_tok_colon:
+                    yajl_bs_set(hand->stateStack, yajl_state_map_need_val);
+                    goto around_again;
+                case yajl_tok_eof:
+                    return yajl_status_ok;
+                case yajl_tok_error:
+                    yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+                    goto around_again;
+                default:
+                    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+                    hand->parseError = "object key and value must "
+                        "be separated by a colon (':')";
+                    goto around_again;
+            }
+        }
+        case yajl_state_map_got_val: {
+            tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+                               offset, &buf, &bufLen);
+            switch (tok) {
+                case yajl_tok_right_bracket:
+                    if (hand->callbacks && hand->callbacks->yajl_end_map) {
+                        _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx));
+                    }
+                    yajl_bs_pop(hand->stateStack);
+                    goto around_again;
+                case yajl_tok_comma:
+                    yajl_bs_set(hand->stateStack, yajl_state_map_need_key);
+                    goto around_again;
+                case yajl_tok_eof:
+                    return yajl_status_ok;
+                case yajl_tok_error:
+                    yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+                    goto around_again;
+                default:
+                    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+                    hand->parseError = "after key and value, inside map, "
+                        "I expect ',' or '}'";
+                    /* try to restore error offset */
+                    if (*offset >= bufLen) *offset -= bufLen;
+                    else *offset = 0;
+                    goto around_again;
+            }
+        }
+        case yajl_state_array_got_val: {
+            tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+                               offset, &buf, &bufLen);
+            switch (tok) {
+                case yajl_tok_right_brace:
+                    if (hand->callbacks && hand->callbacks->yajl_end_array) {
+                        _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx));
+                    }
+                    yajl_bs_pop(hand->stateStack);
+                    goto around_again;
+                case yajl_tok_comma:
+                    yajl_bs_set(hand->stateStack, yajl_state_array_need_val);
+                    goto around_again;
+                case yajl_tok_eof:
+                    return yajl_status_ok;
+                case yajl_tok_error:
+                    yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+                    goto around_again;
+                default:
+                    yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+                    hand->parseError =
+                        "after array element, I expect ',' or ']'";
+                    goto around_again;
+            }
+        }
+    }
+
+    /* every state either returns or jumps back to around_again */
+    abort();
+    return yajl_status_error;
+}
+
+
diff --git a/xlators/cluster/nsr-server/src/yajl_parser.h b/xlators/cluster/nsr-server/src/yajl_parser.h
new file mode 100644
index 000000000..53409731a
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_parser.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_PARSER_H__
+#define __YAJL_PARSER_H__
+
+#include "yajl/yajl_parse.h"
+#include "yajl_bytestack.h"
+#include "yajl_buf.h"
+#include "yajl_lex.h"
+
+
+/* Parser states kept on the handle's state stack. yajl_do_parse() switches
+ * on the state at the top of the stack; a nested map or array pushes its
+ * own state (map_start / array_start) and pops it when it is closed.
+ * parse_error and lexical_error make yajl_do_parse() return
+ * yajl_status_error. */
+typedef enum {
+ yajl_state_start = 0,
+ yajl_state_parse_complete,
+ yajl_state_parse_error,
+ yajl_state_lexical_error,
+ yajl_state_map_start,
+ yajl_state_map_sep,
+ yajl_state_map_need_val,
+ yajl_state_map_got_val,
+ yajl_state_map_need_key,
+ yajl_state_array_start,
+ yajl_state_array_got_val,
+ yajl_state_array_need_val,
+ yajl_state_got_value,
+} yajl_state;
+
+/* Parser handle: client callbacks plus all state carried between calls. */
+struct yajl_handle_t {
+ /* client callbacks; may be NULL, as may individual members */
+ const yajl_callbacks * callbacks;
+ /* opaque pointer handed back to every callback */
+ void * ctx;
+ yajl_lexer lexer;
+ /* static message describing the last parse error */
+ const char * parseError;
+ /* the number of bytes consumed from the last client buffer;
+ * in the case of an error this can be used as the error offset */
+ size_t bytesConsumed;
+ /* temporary storage for decoded strings */
+ yajl_buf decodeBuf;
+ /* a stack of states. access with yajl_state_XXX routines */
+ yajl_bytestack stateStack;
+ /* memory allocation routines */
+ yajl_alloc_funcs alloc;
+ /* bitfield */
+ unsigned int flags;
+};
+
+yajl_status
+yajl_do_parse(yajl_handle handle, const unsigned char * jsonText,
+ size_t jsonTextLen);
+
+yajl_status
+yajl_do_finish(yajl_handle handle);
+
+unsigned char *
+yajl_render_error_string(yajl_handle hand, const unsigned char * jsonText,
+ size_t jsonTextLen, int verbose);
+
+/* A little built in integer parsing routine with the same semantics as strtol
+ * that's unaffected by LOCALE. */
+long long
+yajl_parse_integer(const unsigned char *number, unsigned int length);
+
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_tree.c b/xlators/cluster/nsr-server/src/yajl_tree.c
new file mode 100644
index 000000000..1a69134e7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_tree.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2010-2011 Florian Forster <ff at octo.it>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "yajl/yajl_tree.h"
+#include "yajl/yajl_parse.h"
+
+#include "yajl_parser.h"
+
+#ifdef WIN32
+#define snprintf sprintf_s
+#endif
+
+#define STATUS_CONTINUE 1
+#define STATUS_ABORT 0
+
+/* One entry of the stack of containers that are still being filled. */
+struct stack_elem_s;
+typedef struct stack_elem_s stack_elem_t;
+struct stack_elem_s
+{
+ /* for objects: key that has been read and is waiting for its value;
+ * NULL otherwise (see context_add_value) */
+ char * key;
+ /* the object or array under construction */
+ yajl_val value;
+ /* entry below this one, NULL at the bottom of the stack */
+ stack_elem_t *next;
+};
+
+/* State shared by the tree-building callbacks of yajl_tree_parse(). */
+struct context_s
+{
+ /* innermost open container, NULL when no container is open */
+ stack_elem_t *stack;
+ /* finished top-level value, set once a value arrives with an empty stack */
+ yajl_val root;
+ /* caller supplied buffer for error messages (may be NULL) and its size;
+ * written by RETURN_ERROR */
+ char *errbuf;
+ size_t errbuf_size;
+};
+typedef struct context_s context_t;
+
+/*
+ * Format an error message into the context's error buffer (when one was
+ * supplied) and return `retval` from the enclosing function.
+ * Wrapped in do/while(0) so that "RETURN_ERROR(...);" is exactly one
+ * statement and stays safe inside unbraced if/else bodies.
+ */
+#define RETURN_ERROR(ctx,retval,...) do {                               \
+        if ((ctx)->errbuf != NULL)                                      \
+            snprintf ((ctx)->errbuf, (ctx)->errbuf_size, __VA_ARGS__);  \
+        return (retval);                                                \
+    } while (0)
+
+/* Allocate a zero-filled value node of the given type; NULL when out of
+ * memory. */
+static yajl_val value_alloc (yajl_type type)
+{
+    yajl_val fresh = calloc (1, sizeof (*fresh));
+
+    if (fresh != NULL)
+        fresh->type = type;
+
+    return (fresh);
+}
+
+/* Release an object node: every key string, every value (recursively),
+ * both arrays and the node itself. Does nothing for non-object values. */
+static void yajl_object_free (yajl_val v)
+{
+    size_t idx = 0;
+
+    if (!YAJL_IS_OBJECT(v))
+        return;
+
+    while (idx < v->u.object.len)
+    {
+        free((char *) v->u.object.keys[idx]);
+        v->u.object.keys[idx] = NULL;
+
+        yajl_tree_free (v->u.object.values[idx]);
+        v->u.object.values[idx] = NULL;
+
+        idx++;
+    }
+
+    free((void*) v->u.object.keys);
+    free(v->u.object.values);
+    free(v);
+}
+
+/* Release an array node: every element (recursively), the element array
+ * and the node itself. Does nothing for non-array values. */
+static void yajl_array_free (yajl_val v)
+{
+    size_t idx = 0;
+
+    if (!YAJL_IS_ARRAY(v))
+        return;
+
+    while (idx < v->u.array.len)
+    {
+        yajl_tree_free (v->u.array.values[idx]);
+        v->u.array.values[idx] = NULL;
+        idx++;
+    }
+
+    free(v->u.array.values);
+    free(v);
+}
+
+/*
+ * Parsing nested objects and arrays is implemented using a stack. When a new
+ * object or array starts (a curly or a square opening bracket is read), an
+ * appropriate value is pushed on the stack. When the end of the object is
+ * reached (an appropriate closing bracket has been read), the value is popped
+ * off the stack and added to the enclosing object using "context_add_value".
+ */
+/* Push `v` as the new innermost open container. Returns 0 on success and
+ * ENOMEM (after recording a message) when the stack entry cannot be
+ * allocated. */
+static int context_push(context_t *ctx, yajl_val v)
+{
+    stack_elem_t *entry = calloc (1, sizeof (*entry));
+
+    if (entry == NULL)
+        RETURN_ERROR (ctx, ENOMEM, "Out of memory");
+
+    /* only composite values may be nested below an existing entry */
+    assert ((ctx->stack == NULL)
+            || YAJL_IS_OBJECT (v)
+            || YAJL_IS_ARRAY (v));
+
+    entry->value = v;
+    entry->next = ctx->stack;
+    ctx->stack = entry;
+
+    return (0);
+}
+
+/* Remove the innermost stack entry and hand back the container it held.
+ * Returns NULL (after recording a message) when the stack is empty. */
+static yajl_val context_pop(context_t *ctx)
+{
+    stack_elem_t *top = ctx->stack;
+    yajl_val popped;
+
+    if (top == NULL)
+        RETURN_ERROR (ctx, NULL, "context_pop: "
+                      "Bottom of stack reached prematurely");
+
+    ctx->stack = top->next;
+    popped = top->value;
+    free (top);
+
+    return (popped);
+}
+
+/* Append the pair (key, value) to the object `obj`, growing both parallel
+ * arrays by one slot. Returns 0 on success; on allocation failure a
+ * message is recorded, ENOMEM is returned and neither `key` nor `value`
+ * has been stored. */
+static int object_add_keyval(context_t *ctx,
+                             yajl_val obj, char *key, yajl_val value)
+{
+    const char **grown_keys;
+    yajl_val *grown_values;
+    size_t slot;
+
+    /* We're checking for NULL in "context_add_value" or its callers. */
+    assert (ctx != NULL);
+    assert (obj != NULL);
+    assert (key != NULL);
+    assert (value != NULL);
+
+    /* We're assuring that "obj" is an object in "context_add_value". */
+    assert(YAJL_IS_OBJECT(obj));
+
+    slot = obj->u.object.len;
+
+    grown_keys = realloc((void *) obj->u.object.keys,
+                         sizeof(*(obj->u.object.keys)) * (slot + 1));
+    if (grown_keys == NULL)
+        RETURN_ERROR(ctx, ENOMEM, "Out of memory");
+    obj->u.object.keys = grown_keys;
+
+    grown_values = realloc(obj->u.object.values,
+                           sizeof (*obj->u.object.values) * (slot + 1));
+    if (grown_values == NULL)
+        RETURN_ERROR(ctx, ENOMEM, "Out of memory");
+    obj->u.object.values = grown_values;
+
+    grown_keys[slot] = key;
+    grown_values[slot] = value;
+    obj->u.object.len = slot + 1;
+
+    return (0);
+}
+
+/* Append `value` to `array`, growing the element array by one slot.
+ * Returns 0 on success; on allocation failure a message is recorded,
+ * ENOMEM is returned and `value` has not been stored. */
+static int array_add_value (context_t *ctx,
+                            yajl_val array, yajl_val value)
+{
+    yajl_val *grown;
+    size_t slot;
+
+    /* We're checking for NULL pointers in "context_add_value" or its
+     * callers. */
+    assert (ctx != NULL);
+    assert (array != NULL);
+    assert (value != NULL);
+
+    /* "context_add_value" will only call us with array values. */
+    assert(YAJL_IS_ARRAY(array));
+
+    slot = array->u.array.len;
+    grown = realloc(array->u.array.values,
+                    sizeof(*(array->u.array.values)) * (slot + 1));
+    if (grown == NULL)
+        RETURN_ERROR(ctx, ENOMEM, "Out of memory");
+
+    grown[slot] = value;
+    array->u.array.values = grown;
+    array->u.array.len = slot + 1;
+
+    return 0;
+}
+
+/*
+ * Add a value to the value on top of the stack or the "root" member in the
+ * context if the end of the parsing process is reached.
+ */
+/*
+ * Attach a finished value to whatever is currently being built.
+ *
+ * Returns 0 on success. On failure a message is recorded, a non-zero errno
+ * style code (EINVAL or ENOMEM) is returned and `v` has NOT been taken
+ * over, i.e. it still belongs to the caller.
+ */
+static int context_add_value (context_t *ctx, yajl_val v)
+{
+    stack_elem_t *top;
+
+    /* We're checking for NULL values in all the calling functions. */
+    assert (ctx != NULL);
+    assert (v != NULL);
+
+    /*
+     * There are three valid states in which this function may be called:
+     *   - There is no value on the stack => This is the only value. This is
+     *     the last step done when parsing a document. We assign the value to
+     *     the "root" member and return.
+     *   - The value on the stack is an object. In this case store the key on
+     *     the stack or, if the key has already been read, add key and value
+     *     to the object.
+     *   - The value on the stack is an array. In this case simply add the
+     *     value and return.
+     */
+    top = ctx->stack;
+    if (top == NULL)
+    {
+        assert (ctx->root == NULL);
+        ctx->root = v;
+        return (0);
+    }
+
+    if (YAJL_IS_OBJECT (top->value))
+    {
+        char *key;
+        int status;
+
+        if (top->key == NULL)
+        {
+            if (!YAJL_IS_STRING (v))
+                RETURN_ERROR (ctx, EINVAL, "context_add_value: "
+                              "Object key is not a string (%#04x)",
+                              v->type);
+
+            /* steal the string as the pending key, drop the wrapper */
+            top->key = v->u.string;
+            v->u.string = NULL;
+            free(v);
+            return (0);
+        }
+
+        key = top->key;
+        top->key = NULL;
+        status = object_add_keyval (ctx, top->value, key, v);
+        if (status != 0)
+        {
+            /* the key was detached above and not stored in the object:
+             * release it here, nobody else can reach it any more */
+            free (key);
+        }
+        return (status);
+    }
+
+    if (YAJL_IS_ARRAY (top->value))
+    {
+        return (array_add_value (ctx, top->value, v));
+    }
+
+    RETURN_ERROR (ctx, EINVAL, "context_add_value: Cannot add value to "
+                  "a value of type %#04x (not a composite type)",
+                  top->value->type);
+}
+
+/* yajl callback for strings and map keys: copy the bytes into a new
+ * NUL-terminated string value and attach it. Returns STATUS_CONTINUE on
+ * success and STATUS_ABORT on failure; on failure nothing is leaked. */
+static int handle_string (void *ctx,
+                          const unsigned char *string, size_t string_length)
+{
+    yajl_val v;
+
+    v = value_alloc (yajl_t_string);
+    if (v == NULL)
+        RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+    v->u.string = malloc (string_length + 1);
+    if (v->u.string == NULL)
+    {
+        free (v);
+        RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+    }
+    memcpy(v->u.string, string, string_length);
+    v->u.string[string_length] = 0;
+
+    if (context_add_value (ctx, v) != 0)
+    {
+        /* the value was not taken over, so it is still ours to free */
+        free (v->u.string);
+        free (v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/*
+ * yajl callback for numbers: keep the raw text in u.number.r and try to
+ * interpret it both as an integer and as a double.
+ *
+ * YAJL_NUMBER_INT_VALID is set when the text is an optional '-' followed
+ * by digits only and fits into a long long. YAJL_NUMBER_DOUBLE_VALID is
+ * set when strtod consumes the whole text without a range error.
+ * Returns STATUS_CONTINUE on success and STATUS_ABORT on failure; on
+ * failure nothing is leaked.
+ */
+static int handle_number (void *ctx, const char *string, size_t string_length)
+{
+    yajl_val v;
+    char *endptr;
+    const char *p;
+    int looks_like_integer;
+
+    v = value_alloc(yajl_t_number);
+    if (v == NULL)
+        RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+    v->u.number.r = malloc(string_length + 1);
+    if (v->u.number.r == NULL)
+    {
+        free(v);
+        RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory");
+    }
+    memcpy(v->u.number.r, string, string_length);
+    v->u.number.r[string_length] = 0;
+
+    v->u.number.flags = 0;
+
+    /* yajl_parse_integer reports no end pointer, so check the syntax here */
+    p = v->u.number.r;
+    if (*p == '-')
+        p++;
+    looks_like_integer = (*p != 0);
+    while (*p >= '0' && *p <= '9')
+        p++;
+    looks_like_integer = looks_like_integer && (*p == 0);
+
+    errno = 0;
+    v->u.number.i = yajl_parse_integer((const unsigned char *) v->u.number.r,
+                                       strlen(v->u.number.r));
+    if ((errno == 0) && looks_like_integer)
+        v->u.number.flags |= YAJL_NUMBER_INT_VALID;
+
+    endptr = NULL;
+    errno = 0;
+    v->u.number.d = strtod(v->u.number.r, &endptr);
+    if ((errno == 0) && (endptr != NULL) && (*endptr == 0))
+        v->u.number.flags |= YAJL_NUMBER_DOUBLE_VALID;
+
+    if (context_add_value(ctx, v) != 0)
+    {
+        /* the value was not taken over, so it is still ours to free */
+        free(v->u.number.r);
+        free(v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/* yajl callback for '{': create an empty object and make it the innermost
+ * open container. Returns STATUS_CONTINUE on success and STATUS_ABORT on
+ * failure; on failure nothing is leaked. */
+static int handle_start_map (void *ctx)
+{
+    yajl_val v;
+
+    v = value_alloc(yajl_t_object);
+    if (v == NULL)
+        RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+    v->u.object.keys = NULL;
+    v->u.object.values = NULL;
+    v->u.object.len = 0;
+
+    if (context_push (ctx, v) != 0)
+    {
+        /* not on the stack, so nobody else will ever free it */
+        free (v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/* yajl callback for '}': pop the finished object and attach it to its
+ * parent (or make it the root). Returns STATUS_CONTINUE on success and
+ * STATUS_ABORT on failure; on failure the popped object is released. */
+static int handle_end_map (void *ctx)
+{
+    yajl_val v;
+
+    v = context_pop (ctx);
+    if (v == NULL)
+        return (STATUS_ABORT);
+
+    if (context_add_value (ctx, v) != 0)
+    {
+        /* already off the stack and not attached anywhere: free it here */
+        yajl_tree_free (v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/* yajl callback for '[': create an empty array and make it the innermost
+ * open container. Returns STATUS_CONTINUE on success and STATUS_ABORT on
+ * failure; on failure nothing is leaked. */
+static int handle_start_array (void *ctx)
+{
+    yajl_val v;
+
+    v = value_alloc(yajl_t_array);
+    if (v == NULL)
+        RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+    v->u.array.values = NULL;
+    v->u.array.len = 0;
+
+    if (context_push (ctx, v) != 0)
+    {
+        /* not on the stack, so nobody else will ever free it */
+        free (v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/* yajl callback for ']': pop the finished array and attach it to its
+ * parent (or make it the root). Returns STATUS_CONTINUE on success and
+ * STATUS_ABORT on failure; on failure the popped array is released. */
+static int handle_end_array (void *ctx)
+{
+    yajl_val v;
+
+    v = context_pop (ctx);
+    if (v == NULL)
+        return (STATUS_ABORT);
+
+    if (context_add_value (ctx, v) != 0)
+    {
+        /* already off the stack and not attached anywhere: free it here */
+        yajl_tree_free (v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/* yajl callback for true/false: attach a yajl_t_true or yajl_t_false
+ * value. Returns STATUS_CONTINUE on success and STATUS_ABORT on failure;
+ * on failure nothing is leaked. */
+static int handle_boolean (void *ctx, int boolean_value)
+{
+    yajl_val v;
+
+    v = value_alloc (boolean_value ? yajl_t_true : yajl_t_false);
+    if (v == NULL)
+        RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+    if (context_add_value (ctx, v) != 0)
+    {
+        /* the value was not taken over, so it is still ours to free */
+        free (v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/* yajl callback for null: attach a yajl_t_null value. Returns
+ * STATUS_CONTINUE on success and STATUS_ABORT on failure; on failure
+ * nothing is leaked. */
+static int handle_null (void *ctx)
+{
+    yajl_val v;
+
+    v = value_alloc (yajl_t_null);
+    if (v == NULL)
+        RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+    if (context_add_value (ctx, v) != 0)
+    {
+        /* the value was not taken over, so it is still ours to free */
+        free (v);
+        return (STATUS_ABORT);
+    }
+
+    return (STATUS_CONTINUE);
+}
+
+/*
+ * Public functions
+ */
+/*
+ * Parse a NUL-terminated JSON document (comments allowed) into a tree.
+ *
+ * input:             the JSON text.
+ * error_buffer:      optional buffer receiving a NUL-terminated error
+ *                    message; it is zeroed on entry. May be NULL.
+ * error_buffer_size: size of error_buffer in bytes.
+ *
+ * Returns the root of the tree, to be released with yajl_tree_free(), or
+ * NULL on failure. Nothing allocated during a failed parse is kept.
+ */
+yajl_val yajl_tree_parse (const char *input,
+                          char *error_buffer, size_t error_buffer_size)
+{
+    static const yajl_callbacks callbacks =
+        {
+            /* null        = */ handle_null,
+            /* boolean     = */ handle_boolean,
+            /* integer     = */ NULL,
+            /* double      = */ NULL,
+            /* number      = */ handle_number,
+            /* string      = */ handle_string,
+            /* start map   = */ handle_start_map,
+            /* map key     = */ handle_string,
+            /* end map     = */ handle_end_map,
+            /* start array = */ handle_start_array,
+            /* end array   = */ handle_end_array
+        };
+
+    yajl_handle handle;
+    yajl_status status;
+    size_t input_len;
+    context_t ctx = { NULL, NULL, NULL, 0 };
+
+    ctx.errbuf = error_buffer;
+    ctx.errbuf_size = error_buffer_size;
+
+    if (error_buffer != NULL)
+        memset (error_buffer, 0, error_buffer_size);
+
+    handle = yajl_alloc (&callbacks, NULL, &ctx);
+    if (handle == NULL) {
+        if (error_buffer != NULL && error_buffer_size > 0)
+            snprintf(error_buffer, error_buffer_size, "Out of memory");
+        return NULL;
+    }
+    yajl_config(handle, yajl_allow_comments, 1);
+
+    input_len = strlen (input);
+    status = yajl_parse(handle, (const unsigned char *) input, input_len);
+    /* only finish a parse that went well: finishing re-enters the parser,
+     * which resets the consumed-bytes counter used as the error offset */
+    if (status == yajl_status_ok)
+        status = yajl_complete_parse (handle);
+
+    if (status != yajl_status_ok) {
+        if (error_buffer != NULL && error_buffer_size > 0) {
+            unsigned char *msg = yajl_get_error(handle, 1,
+                                                (const unsigned char *) input,
+                                                input_len);
+            if (msg != NULL) {
+                snprintf(error_buffer, error_buffer_size, "%s", (char *) msg);
+                /* the message is owned by us and must be handed back */
+                yajl_free_error(handle, msg);
+            }
+        }
+
+        /* drop everything built so far: containers that are still open
+         * (with their pending keys) and a possibly finished root */
+        while (ctx.stack != NULL) {
+            free (ctx.stack->key);
+            yajl_tree_free (context_pop (&ctx));
+        }
+        yajl_tree_free (ctx.root);
+
+        yajl_free (handle);
+        return NULL;
+    }
+
+    yajl_free (handle);
+    return (ctx.root);
+}
+
+/*
+ * Walk a tree along a NULL-terminated array of object keys.
+ *
+ * n:    node to start from.
+ * path: NULL-terminated list of keys, one per nesting level.
+ * type: required type of the node found, or yajl_t_any.
+ *
+ * Returns the node reached, or NULL when `path` is NULL, a key is missing,
+ * an intermediate node is not an object, or the type does not match.
+ */
+yajl_val yajl_tree_get(yajl_val n, const char ** path, yajl_type type)
+{
+    if (!path) return NULL;
+    while (n && *path) {
+        size_t i;
+        size_t len;
+
+        if (n->type != yajl_t_object) return NULL;
+        /* remember the length of THIS object: `n` is replaced by the child
+         * on a match, and the child need not be an object (or non-NULL) */
+        len = n->u.object.len;
+        for (i = 0; i < len; i++) {
+            if (!strcmp(*path, n->u.object.keys[i])) {
+                n = n->u.object.values[i];
+                break;
+            }
+        }
+        if (i == len) return NULL;
+        path++;
+    }
+    if (n && type != yajl_t_any && type != n->type) n = NULL;
+    return n;
+}
+
+/* Release a value and everything reachable from it. NULL is accepted and
+ * ignored. */
+void yajl_tree_free (yajl_val v)
+{
+    if (v == NULL)
+        return;
+
+    if (YAJL_IS_STRING(v))
+    {
+        free(v->u.string);
+    }
+    else if (YAJL_IS_NUMBER(v))
+    {
+        free(v->u.number.r);
+    }
+    else if (YAJL_GET_OBJECT(v))
+    {
+        /* the helper also frees the node itself */
+        yajl_object_free(v);
+        return;
+    }
+    else if (YAJL_GET_ARRAY(v))
+    {
+        /* the helper also frees the node itself */
+        yajl_array_free(v);
+        return;
+    }
+
+    /* strings and numbers after their payload, and the payload-free
+     * yajl_t_true / yajl_t_false / yajl_t_null nodes */
+    free(v);
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_version.c b/xlators/cluster/nsr-server/src/yajl_version.c
new file mode 100644
index 000000000..0671da722
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_version.c
@@ -0,0 +1,7 @@
+#include <yajl/yajl_version.h>
+
+/* Return the YAJL_VERSION constant this file was compiled against. */
+int yajl_version(void)
+{
+ return YAJL_VERSION;
+}
+
diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am
index 2d151422a..4268d6f03 100644
--- a/xlators/cluster/stripe/src/Makefile.am
+++ b/xlators/cluster/stripe/src/Makefile.am
@@ -1,4 +1,3 @@
-
xlator_LTLIBRARIES = stripe.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
diff --git a/xlators/cluster/stripe/src/stripe-helpers.c b/xlators/cluster/stripe/src/stripe-helpers.c
index a83abdc72..3c12809d6 100644
--- a/xlators/cluster/stripe/src/stripe-helpers.c
+++ b/xlators/cluster/stripe/src/stripe-helpers.c
@@ -13,6 +13,7 @@
#include "stripe.h"
#include "byte-order.h"
#include "mem-types.h"
+#include "logging.h"
void
stripe_local_wipe (stripe_local_t *local)
@@ -223,6 +224,7 @@ stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local,
if (ret < 0) {
gf_log (this->name, GF_LOG_WARNING,
"dict_serialize failed (%s)", strerror (-ret));
+ GF_FREE(buf);
ret = -1;
goto out;
}
@@ -259,8 +261,8 @@ stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local,
goto out;
}
- (void) snprintf (stripe_size_str, 20, "%ld",
- (local->fctx) ? local->fctx->stripe_size : 0);
+ (void) snprintf (stripe_size_str, 20, "%"PRId64,
+ (long long) (local->fctx) ? local->fctx->stripe_size : 0);
/* extra bytes for decorations (brackets and <>'s) */
padding = strlen (this->name) + strlen (STRIPE_PATHINFO_HEADER)
@@ -280,6 +282,7 @@ stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Cannot aggregate pathinfo list");
+ GF_FREE(pathinfo_serz);
goto out;
}
@@ -502,7 +505,7 @@ set_default_block_size (stripe_private_t *priv, char *num)
GF_VALIDATE_OR_GOTO (THIS->name, num, out);
- if (gf_string2bytesize (num, &priv->block_size) != 0) {
+ if (gf_string2bytesize_uint64 (num, &priv->block_size) != 0) {
gf_log (THIS->name, GF_LOG_ERROR,
"invalid number format \"%s\"", num);
goto out;
@@ -552,7 +555,7 @@ set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data)
if (ret)
goto out;
}
- if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) {
+ if (gf_string2bytesize_uint64 (num, &stripe_opt->block_size) != 0) {
gf_log (this->name, GF_LOG_ERROR,
"invalid number format \"%s\"", num);
goto out;
@@ -672,4 +675,3 @@ uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count,
return size;
}
-
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
index dadd3fec5..0ebea8168 100644
--- a/xlators/cluster/stripe/src/stripe.c
+++ b/xlators/cluster/stripe/src/stripe.c
@@ -1053,6 +1053,9 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
op_errno = ENOMEM;
goto err;
}
+
+ frame->local = local;
+
local->op_ret = -1;
loc_copy (&local->loc, oldloc);
loc_copy (&local->loc2, newloc);
@@ -1066,8 +1069,6 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
local->fctx = fctx;
}
- frame->local = local;
-
STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator,
trav->xlator->fops->rename, oldloc, newloc, NULL);
@@ -2879,15 +2880,15 @@ stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict
goto err;
}
+ frame->local = local;
+
inode_ctx_get(fd->inode, this, (uint64_t *) &fctx);
if (!fctx) {
op_errno = EINVAL;
goto err;
}
local->fctx = fctx;
-
local->op_ret = -1;
- frame->local = local;
local->call_count = priv->child_count;
while (trav) {
@@ -3774,6 +3775,502 @@ err:
+/*
+ * Reply handler for one per-stripe fallocate request wound by
+ * stripe_fallocate().
+ *
+ * @frame is the per-request frame; its local points at the originating
+ * frame through orig_frame. The reply is folded into the originating
+ * frame's local (mlocal), and the originating frame is unwound once the
+ * number of replies equals mlocal->wind_count and mlocal->unwind is set.
+ * The per-request frame is destroyed on every path. Always returns 0.
+ */
int32_t
+stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ /* NOTE(review): when frame itself is NULL this still reaches
+ * STRIPE_STACK_DESTROY(frame) below - confirm the macro accepts NULL. */
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ /* NOTE(review): the lock taken belongs to this reply's frame, while the
+ * state updated below belongs to mframe - confirm that replies arriving
+ * on different per-request frames are really serialized by it. */
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ /* keep the latest successful iatts as the base of the reply */
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ /* block counts are summed over all replies */
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ /* the reported size is the largest corrected size seen */
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ /* (the stored status is overwritten only while it is still 0) */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ /* last expected reply: publish the aggregated size/blocks and unwind
+ * the originating frame (xdata is not propagated) */
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+/*
+ * Split a fallocate request for @len bytes starting at @offset into
+ * per-stripe pieces and wind one fallocate to the child owning each piece.
+ *
+ * Each piece travels on its own copy of @frame whose local points back at
+ * @frame through orig_frame; stripe_fallocate_cbk aggregates the replies
+ * and unwinds @frame after the final piece (local->unwind set) has been
+ * answered.
+ *
+ * The running offset and the remaining byte count are kept in off_t: @len
+ * is a size_t, and 32-bit counters truncate requests of 2 GiB or more.
+ *
+ * Always returns 0. Failures detected here are reported by unwinding
+ * @frame with -1 and an errno.
+ */
+int32_t
+stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                 off_t offset, size_t len, dict_t *xdata)
+{
+        stripe_local_t  *local          = NULL;
+        stripe_fd_ctx_t *fctx           = NULL;
+        int32_t          op_errno       = 1;
+        int32_t          idx            = 0;
+        off_t            offset_offset  = 0;
+        off_t            remaining_size = 0;
+        off_t            fill_size      = 0;
+        uint64_t         stripe_size    = 0;
+        uint64_t         tmp_fctx       = 0;
+        off_t            dest_offset    = 0;
+        call_frame_t    *fframe         = NULL;
+        stripe_local_t  *flocal         = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        inode_ctx_get (fd->inode, this, &tmp_fctx);
+        if (!tmp_fctx) {
+                op_errno = EINVAL;
+                goto err;
+        }
+        fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+        stripe_size = fctx->stripe_size;
+
+        STRIPE_VALIDATE_FCTX (fctx, err);
+
+        remaining_size = len;
+
+        local = mem_get0 (this->local_pool);
+        if (!local) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+        frame->local = local;
+        local->stripe_size = stripe_size;
+        local->fctx = fctx;
+
+        if (!stripe_size) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Wrong stripe size for the file");
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        while (1) {
+                fframe = copy_frame(frame);
+                if (!fframe) {
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+                flocal = mem_get0(this->local_pool);
+                if (!flocal) {
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+                flocal->orig_frame = frame;
+                fframe->local = flocal;
+
+                /* send fallocate request to the associated child node */
+                idx = (((offset + offset_offset) /
+                        local->stripe_size) % fctx->stripe_count);
+
+                /* bytes left in the stripe unit holding the current offset,
+                 * capped at what is still to be allocated */
+                fill_size = (local->stripe_size -
+                             ((offset + offset_offset) % local->stripe_size));
+                if (fill_size > remaining_size)
+                        fill_size = remaining_size;
+
+                remaining_size -= fill_size;
+
+                /* account for this piece before winding: the callback
+                 * compares its reply count against wind_count */
+                local->wind_count++;
+                if (remaining_size == 0)
+                        local->unwind = 1;
+
+                dest_offset = offset + offset_offset;
+                if (fctx->stripe_coalesce)
+                        dest_offset = coalesced_offset(dest_offset,
+                                        local->stripe_size, fctx->stripe_count);
+
+                /*
+                 * TODO: Create a separate handler for coalesce mode that sends a
+                 * single fallocate per-child (since the ranges are linear).
+                 */
+                STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx],
+                           fctx->xl_array[idx]->fops->fallocate, fd, mode,
+                           dest_offset, fill_size, xdata);
+
+                offset_offset += fill_size;
+                if (remaining_size == 0)
+                        break;
+        }
+
+        return 0;
+err:
+        /* NOTE(review): if an allocation fails after earlier pieces were
+         * already wound, @frame is unwound here while their replies are
+         * still outstanding - confirm the callbacks tolerate that. */
+        if (fframe)
+                STRIPE_STACK_DESTROY(fframe);
+
+        STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+
+/*
+ * Reply handler for one per-stripe discard request wound by
+ * stripe_discard().
+ *
+ * @frame is the per-request frame; its local points at the originating
+ * frame through orig_frame. The reply is folded into the originating
+ * frame's local (mlocal), and the originating frame is unwound once the
+ * number of replies equals mlocal->wind_count and mlocal->unwind is set.
+ * The per-request frame is destroyed on every path. Always returns 0.
+ */
+int32_t
+stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ /* NOTE(review): when frame itself is NULL this still reaches
+ * STRIPE_STACK_DESTROY(frame) below - confirm the macro accepts NULL. */
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ /* NOTE(review): the lock taken belongs to this reply's frame, while the
+ * state updated below belongs to mframe - confirm that replies arriving
+ * on different per-request frames are really serialized by it. */
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ /* keep the latest successful iatts as the base of the reply */
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ /* block counts are summed over all replies */
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ /* the reported size is the largest corrected size seen */
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ /* (the stored status is overwritten only while it is still 0) */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ /* last expected reply: publish the aggregated size/blocks and unwind
+ * the originating frame (xdata is not propagated) */
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+/*
+ * Split a discard request for @len bytes starting at @offset into
+ * per-stripe pieces and wind one discard to the child owning each piece.
+ *
+ * Each piece travels on its own copy of @frame whose local points back at
+ * @frame through orig_frame; stripe_discard_cbk aggregates the replies and
+ * unwinds @frame after the final piece (local->unwind set) has been
+ * answered.
+ *
+ * The running offset and the remaining byte count are kept in off_t: @len
+ * is a size_t, and 32-bit counters truncate requests of 2 GiB or more.
+ *
+ * Always returns 0. Failures detected here are reported by unwinding
+ * @frame with -1 and an errno.
+ */
+int32_t
+stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               size_t len, dict_t *xdata)
+{
+        stripe_local_t  *local          = NULL;
+        stripe_fd_ctx_t *fctx           = NULL;
+        int32_t          op_errno       = 1;
+        int32_t          idx            = 0;
+        off_t            offset_offset  = 0;
+        off_t            remaining_size = 0;
+        off_t            fill_size      = 0;
+        uint64_t         stripe_size    = 0;
+        uint64_t         tmp_fctx       = 0;
+        off_t            dest_offset    = 0;
+        call_frame_t    *fframe         = NULL;
+        stripe_local_t  *flocal         = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        inode_ctx_get (fd->inode, this, &tmp_fctx);
+        if (!tmp_fctx) {
+                op_errno = EINVAL;
+                goto err;
+        }
+        fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+        stripe_size = fctx->stripe_size;
+
+        STRIPE_VALIDATE_FCTX (fctx, err);
+
+        remaining_size = len;
+
+        local = mem_get0 (this->local_pool);
+        if (!local) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+        frame->local = local;
+        local->stripe_size = stripe_size;
+        local->fctx = fctx;
+
+        if (!stripe_size) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Wrong stripe size for the file");
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        while (1) {
+                fframe = copy_frame(frame);
+                if (!fframe) {
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+                flocal = mem_get0(this->local_pool);
+                if (!flocal) {
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+                flocal->orig_frame = frame;
+                fframe->local = flocal;
+
+                /* send discard request to the associated child node */
+                idx = (((offset + offset_offset) /
+                        local->stripe_size) % fctx->stripe_count);
+
+                /* bytes left in the stripe unit holding the current offset,
+                 * capped at what is still to be discarded */
+                fill_size = (local->stripe_size -
+                             ((offset + offset_offset) % local->stripe_size));
+                if (fill_size > remaining_size)
+                        fill_size = remaining_size;
+
+                remaining_size -= fill_size;
+
+                /* account for this piece before winding: the callback
+                 * compares its reply count against wind_count */
+                local->wind_count++;
+                if (remaining_size == 0)
+                        local->unwind = 1;
+
+                dest_offset = offset + offset_offset;
+                if (fctx->stripe_coalesce)
+                        dest_offset = coalesced_offset(dest_offset,
+                                        local->stripe_size, fctx->stripe_count);
+
+                /*
+                 * TODO: Create a separate handler for coalesce mode that sends a
+                 * single discard per-child (since the ranges are linear).
+                 */
+                STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx],
+                           fctx->xl_array[idx]->fops->discard, fd, dest_offset,
+                           fill_size, xdata);
+
+                offset_offset += fill_size;
+                if (remaining_size == 0)
+                        break;
+        }
+
+        return 0;
+err:
+        /* NOTE(review): if an allocation fails after earlier pieces were
+         * already wound, @frame is unwound here while their replies are
+         * still outstanding - confirm the callbacks tolerate that. */
+        if (fframe)
+                STRIPE_STACK_DESTROY(fframe);
+
+        STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Reply handler for one per-stripe zerofill request wound by
+ * stripe_zerofill().
+ *
+ * @frame is the per-request frame (asserted non-NULL); its local points at
+ * the originating frame through orig_frame. The reply is folded into the
+ * originating frame's local (mlocal), and the originating frame is unwound
+ * once the number of replies equals mlocal->wind_count and mlocal->unwind
+ * is set. The per-request frame is destroyed on every path. Always
+ * returns 0.
+ */
+int32_t
+stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ GF_ASSERT (frame);
+
+ if (!this || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ /* NOTE(review): the lock taken belongs to this reply's frame, while the
+ * state updated below belongs to mframe - confirm that replies arriving
+ * on different per-request frames are really serialized by it. */
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ /* keep the latest successful iatts as the base of the reply */
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ /* block counts are summed over all replies */
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ /* the reported size is the largest corrected size seen */
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ /* (the stored status is overwritten only while it is still 0) */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ /* last expected reply: publish the aggregated size/blocks and unwind
+ * the originating frame (xdata is not propagated) */
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+/*
+ * Split a zerofill request for @len bytes starting at @offset into
+ * per-stripe pieces and wind one zerofill to the child owning each piece.
+ *
+ * Each piece travels on its own copy of @frame whose local points back at
+ * @frame through orig_frame; stripe_zerofill_cbk aggregates the replies
+ * and unwinds @frame after the final piece (local->unwind set) has been
+ * answered.
+ *
+ * The running offset and the remaining byte count are kept in off_t to
+ * match @len; 32-bit counters truncate requests of 2 GiB or more.
+ *
+ * Always returns 0. Failures detected here are reported by unwinding
+ * @frame with -1 and an errno.
+ */
+int32_t
+stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                off_t len, dict_t *xdata)
+{
+        stripe_local_t  *local          = NULL;
+        stripe_fd_ctx_t *fctx           = NULL;
+        int32_t          op_errno       = 1;
+        int32_t          idx            = 0;
+        off_t            offset_offset  = 0;
+        off_t            remaining_size = 0;
+        off_t            fill_size      = 0;
+        uint64_t         stripe_size    = 0;
+        uint64_t         tmp_fctx       = 0;
+        off_t            dest_offset    = 0;
+        call_frame_t    *fframe         = NULL;
+        stripe_local_t  *flocal         = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+        VALIDATE_OR_GOTO (fd->inode, err);
+
+        inode_ctx_get (fd->inode, this, &tmp_fctx);
+        if (!tmp_fctx) {
+                op_errno = EINVAL;
+                goto err;
+        }
+        fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+        stripe_size = fctx->stripe_size;
+
+        STRIPE_VALIDATE_FCTX (fctx, err);
+
+        remaining_size = len;
+
+        local = mem_get0 (this->local_pool);
+        if (!local) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+        frame->local = local;
+        local->stripe_size = stripe_size;
+        local->fctx = fctx;
+
+        if (!stripe_size) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Wrong stripe size for the file");
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        while (1) {
+                fframe = copy_frame(frame);
+                if (!fframe) {
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+                flocal = mem_get0(this->local_pool);
+                if (!flocal) {
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+                flocal->orig_frame = frame;
+                fframe->local = flocal;
+
+                /* child that owns the stripe unit holding the current offset */
+                idx = (((offset + offset_offset) /
+                        local->stripe_size) % fctx->stripe_count);
+
+                /* bytes left in that stripe unit, capped at what is still
+                 * to be zeroed */
+                fill_size = (local->stripe_size -
+                             ((offset + offset_offset) % local->stripe_size));
+                if (fill_size > remaining_size)
+                        fill_size = remaining_size;
+
+                remaining_size -= fill_size;
+
+                /* account for this piece before winding: the callback
+                 * compares its reply count against wind_count */
+                local->wind_count++;
+                if (remaining_size == 0)
+                        local->unwind = 1;
+
+                dest_offset = offset + offset_offset;
+                if (fctx->stripe_coalesce)
+                        dest_offset = coalesced_offset(dest_offset,
+                                                       local->stripe_size,
+                                                       fctx->stripe_count);
+
+                STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx],
+                           fctx->xl_array[idx]->fops->zerofill, fd,
+                           dest_offset, fill_size, xdata);
+                offset_offset += fill_size;
+                if (remaining_size == 0)
+                        break;
+        }
+
+        return 0;
+err:
+        /* NOTE(review): if an allocation fails after earlier pieces were
+         * already wound, @frame is unwound here while their replies are
+         * still outstanding - confirm the callbacks tolerate that. */
+        if (fframe)
+                STRIPE_STACK_DESTROY(fframe);
+
+        STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+int32_t
stripe_release (xlator_t *this, fd_t *fd)
{
return 0;
@@ -3947,6 +4444,37 @@ stripe_setxattr_cbk (call_frame_t *frame, void *cookie,
return 0;
}
+#ifdef HAVE_BD_XLATOR
+/*
+ * Per-key visitor handed to dict_foreach(): sets the gf_boolean_t that
+ * @data points at to true when XATTR_IS_BD (key) holds. A NULL @data is
+ * ignored. Always returns 0.
+ */
+int
+stripe_is_bd (dict_t *this, char *key, data_t *value, void *data)
+{
+        gf_boolean_t *found = data;
+
+        if (found != NULL && XATTR_IS_BD (key))
+                *found = _gf_true;
+
+        return 0;
+}
+
+/*
+ * Report whether any key of @dict satisfies XATTR_IS_BD, as decided by
+ * stripe_is_bd(). A NULL @dict yields false.
+ */
+static inline gf_boolean_t
+stripe_setxattr_is_bd (dict_t *dict)
+{
+        gf_boolean_t found = _gf_false;
+
+        if (dict != NULL)
+                dict_foreach (dict, stripe_is_bd, &found);
+
+        return found;
+}
+#else
+#define stripe_setxattr_is_bd(dict) _gf_false
+#endif
+
int
stripe_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
@@ -3956,6 +4484,7 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
stripe_private_t *priv = NULL;
stripe_local_t *local = NULL;
int i = 0;
+ gf_boolean_t is_bd = _gf_false;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3978,11 +4507,15 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
local->wind_count = priv->child_count;
local->op_ret = local->op_errno = 0;
+ is_bd = stripe_setxattr_is_bd (dict);
+
/**
* Set xattrs for directories on all subvolumes. Additionally
- * this power is only given to a special client.
+ * this power is only given to a special client. Bd xlator
+ * also needs xattrs for regular files (ie LVs)
*/
- if ((frame->root->pid == GF_CLIENT_PID_GSYNCD) && IA_ISDIR (loc->inode->ia_type)) {
+ if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) &&
+ IA_ISDIR (loc->inode->ia_type)) || is_bd) {
for (i = 0; i < priv->child_count; i++, trav = trav->next) {
STACK_WIND (frame, stripe_setxattr_cbk,
trav->xlator, trav->xlator->fops->setxattr,
@@ -4013,21 +4546,21 @@ stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie,
int
-stripe_is_lockinfo (dict_t *this,
- char *key,
- data_t *value,
- void *data)
+stripe_is_special_key (dict_t *this,
+ char *key,
+ data_t *value,
+ void *data)
{
- gf_boolean_t *is_lockinfo = NULL;
+ gf_boolean_t *is_special = NULL;
if (data == NULL) {
goto out;
}
- is_lockinfo = data;
+ is_special = data;
- if (XATTR_IS_LOCKINFO (key))
- *is_lockinfo = _gf_true;
+ if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key))
+ *is_special = _gf_true;
out:
return 0;
@@ -4095,7 +4628,7 @@ out:
return ret;
}
-inline gf_boolean_t
+static inline gf_boolean_t
stripe_fsetxattr_is_special (dict_t *dict)
{
gf_boolean_t is_spl = _gf_false;
@@ -4104,7 +4637,7 @@ stripe_fsetxattr_is_special (dict_t *dict)
goto out;
}
- dict_foreach (dict, stripe_is_lockinfo, &is_spl);
+ dict_foreach (dict, stripe_is_special_key, &is_spl);
out:
return is_spl;
@@ -4353,7 +4886,7 @@ unlock:
if (!local_entry)
break;
- if (!IA_ISREG (local_entry->d_stat.ia_type)) {
+ if (!IA_ISREG (local_entry->d_stat.ia_type) || !local_entry->inode) {
LOCK (&frame->lock);
{
local->wind_count--;
@@ -4547,7 +5080,7 @@ reconfigure (xlator_t *this, dict_t *options)
goto unlock;
}
- if (gf_string2bytesize (opt->default_value, &priv->block_size)){
+ if (gf_string2bytesize_uint64 (opt->default_value, &priv->block_size)){
gf_log (this->name, GF_LOG_ERROR,
"Unable to set default block-size ");
ret = -1;
@@ -4654,7 +5187,7 @@ init (xlator_t *this)
ret = -1;
goto unlock;
}
- if (gf_string2bytesize (opt->default_value, &priv->block_size)){
+ if (gf_string2bytesize_uint64 (opt->default_value, &priv->block_size)){
gf_log (this->name, GF_LOG_ERROR,
"Unable to set default block-size ");
ret = -1;
@@ -4866,10 +5399,10 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie,
xattr->pos = cky;
xattr->xattr_value = gf_memdup (xattr_val,
- xattr->xattr_value,
xattr->xattr_len);
- local->xattr_total_len += xattr->xattr_len + 1;
+ if (xattr->xattr_value != NULL)
+ local->xattr_total_len += xattr->xattr_len + 1;
}
}
out:
@@ -4969,7 +5502,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, name,
local, stripe_getxattr_unwind,
sub_volumes, priv->child_count,
- MARKER_UUID_TYPE, priv->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -4991,9 +5525,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
- if (name &&
- ((strncmp (name, GF_XATTR_PATHINFO_KEY,
- strlen (GF_XATTR_PATHINFO_KEY)) == 0))) {
+ if (name && (XATTR_IS_PATHINFO (name))) {
if (IA_ISREG (loc->inode->ia_type)) {
ret = inode_ctx_get (loc->inode, this,
(uint64_t *) &local->fctx);
@@ -5048,6 +5580,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
local->marker.call_count,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -5068,7 +5601,7 @@ err:
return 0;
}
-inline gf_boolean_t
+static inline gf_boolean_t
stripe_is_special_xattr (const char *name)
{
gf_boolean_t is_spl = _gf_false;
@@ -5079,8 +5612,7 @@ stripe_is_special_xattr (const char *name)
if (!strncmp (name, GF_XATTR_LOCKINFO_KEY,
strlen (GF_XATTR_LOCKINFO_KEY))
- || !strncmp (name, GF_XATTR_PATHINFO_KEY,
- strlen (GF_XATTR_PATHINFO_KEY)))
+ || XATTR_IS_PATHINFO (name))
is_spl = _gf_true;
out:
return is_spl;
@@ -5221,6 +5753,9 @@ struct xlator_fops fops = {
.removexattr = stripe_removexattr,
.fremovexattr = stripe_fremovexattr,
.readdirp = stripe_readdirp,
+ .fallocate = stripe_fallocate,
+ .discard = stripe_discard,
+ .zerofill = stripe_zerofill,
};
struct xlator_cbks cbks = {
@@ -5246,10 +5781,10 @@ struct volume_options options[] = {
},
{ .key = {"coalesce"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "false",
- .description = "Enable coalesce mode to flatten striped files as "
- "stored on the server (i.e., eliminate holes caused "
- "by the traditional format)."
+ .default_value = "true",
+ .description = "Enable/Disable coalesce mode to flatten striped "
+ "files as stored on the server (i.e., eliminate holes "
+ "caused by the traditional format)."
},
{ .key = {NULL} },
};
diff --git a/xlators/debug/io-stats/src/Makefile.am b/xlators/debug/io-stats/src/Makefile.am
index 332d79015..dff294cd8 100644
--- a/xlators/debug/io-stats/src/Makefile.am
+++ b/xlators/debug/io-stats/src/Makefile.am
@@ -9,7 +9,9 @@ io_stats_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = io-stats-mem-types.h
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index 63bb8fa90..5b4c833fb 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -36,6 +36,7 @@
#include <stdarg.h>
#include "defaults.h"
#include "logging.h"
+#include "cli1-xdr.h"
#define MAX_LIST_MEMBERS 100
@@ -508,7 +509,7 @@ out:
return 0;
}
-inline int
+static inline int
ios_stats_cleanup (xlator_t *this, inode_t *inode)
{
@@ -916,8 +917,19 @@ ios_dump_args_init (struct ios_dump_args *args, ios_dump_type_t type,
return ret;
}
+/*
+ * Zero the whole of @stats, then record *@now as its started_at time.
+ * Both pointers are asserted non-NULL.
+ */
+static void
+ios_global_stats_clear (struct ios_global_stats *stats, struct timeval *now)
+{
+        GF_ASSERT (stats);
+        GF_ASSERT (now);
+
+        memset (stats, 0, sizeof (struct ios_global_stats));
+        stats->started_at = *now;
+}
+
int
-io_stats_dump (xlator_t *this, struct ios_dump_args *args)
+io_stats_dump (xlator_t *this, struct ios_dump_args *args,
+ gf1_cli_info_op op, gf_boolean_t is_peek)
{
struct ios_conf *conf = NULL;
struct ios_global_stats cumulative = {0, };
@@ -935,18 +947,32 @@ io_stats_dump (xlator_t *this, struct ios_dump_args *args)
gettimeofday (&now, NULL);
LOCK (&conf->lock);
{
- cumulative = conf->cumulative;
- incremental = conf->incremental;
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_CUMULATIVE)
+ cumulative = conf->cumulative;
+
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_INCREMENTAL) {
+ incremental = conf->incremental;
+ increment = conf->increment;
- increment = conf->increment++;
+ if (!is_peek) {
+ increment = conf->increment++;
- memset (&conf->incremental, 0, sizeof (conf->incremental));
- conf->incremental.started_at = now;
+ ios_global_stats_clear (&conf->incremental,
+ &now);
+ }
+ }
}
UNLOCK (&conf->lock);
- io_stats_dump_global (this, &cumulative, &now, -1, args);
- io_stats_dump_global (this, &incremental, &now, increment, args);
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_CUMULATIVE)
+ io_stats_dump_global (this, &cumulative, &now, -1, args);
+
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_INCREMENTAL)
+ io_stats_dump_global (this, &incremental, &now, increment, args);
return 0;
}
@@ -1724,6 +1750,40 @@ io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+/*
+ * Reply handler for fallocate: runs UPDATE_PROFILE_STATS for the FALLOCATE
+ * fop on @frame, then unwinds with every reply argument passed through
+ * unchanged. Always returns 0.
+ */
int
+io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, FALLOCATE);
+ STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+
+/*
+ * Reply handler for discard: runs UPDATE_PROFILE_STATS for the DISCARD fop
+ * on @frame, then unwinds with every reply argument passed through
+ * unchanged. Always returns 0.
+ */
+int
+io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, DISCARD);
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+/*
+ * Reply handler for zerofill: runs UPDATE_PROFILE_STATS for the ZEROFILL
+ * fop on @frame, then unwinds with every reply argument passed through
+ * unchanged. Always returns 0.
+ */
+int
+io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, ZEROFILL);
+ STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
@@ -2170,10 +2230,10 @@ conditional_dump (dict_t *dict, char *key, data_t *value, void *data)
gf_log (this->name, GF_LOG_ERROR, "failed to open %s "
"for writing", filename);
return -1;
- }
+ }
(void) ios_dump_args_init (&args, IOS_DUMP_TYPE_FILE,
logfp);
- io_stats_dump (this, &args);
+ io_stats_dump (this, &args, GF_CLI_INFO_ALL, _gf_false);
fclose (logfp);
}
return 0;
@@ -2392,6 +2452,45 @@ io_stats_fstat (call_frame_t *frame, xlator_t *this,
+/*
+ * fallocate entry point: runs START_FOP_LATENCY on @frame (presumably
+ * stamping the start time for latency accounting - confirm against the
+ * macro), then winds the request unchanged to the first child, with
+ * io_stats_fallocate_cbk as the reply handler. Always returns 0.
+ */
int
+io_stats_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_fallocate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+
+ return 0;
+}
+
+
+/*
+ * discard entry point: runs START_FOP_LATENCY on @frame (presumably
+ * stamping the start time for latency accounting - confirm against the
+ * macro), then winds the request unchanged to the first child, with
+ * io_stats_discard_cbk as the reply handler. Always returns 0.
+ */
+int
+io_stats_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+
+ return 0;
+}
+
+/*
+ * zerofill entry point: runs START_FOP_LATENCY on @frame (presumably
+ * stamping the start time for latency accounting - confirm against the
+ * macro), then winds the request unchanged to the first child, with
+ * io_stats_zerofill_cbk as the reply handler. Always returns 0.
+ */
+int
+io_stats_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+
+ return 0;
+}
+
+
+int
io_stats_lk (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
{
@@ -2538,15 +2637,42 @@ ios_destroy_top_stats (struct ios_conf *conf)
return;
}
+/*
+ * Reset the cumulative and incremental statistics of @conf, both stamped
+ * with the current time, and rewind conf->increment to 0. The reset is
+ * done under conf->lock.
+ *
+ * Returns 0 on success, or -1 (with nothing modified) when gettimeofday()
+ * fails.
+ */
+static int
+io_stats_clear (struct ios_conf *conf)
+{
+        struct timeval stamp;
+
+        GF_ASSERT (conf);
+
+        if (gettimeofday (&stamp, NULL) != 0)
+                return -1;
+
+        LOCK (&conf->lock);
+        {
+                ios_global_stats_clear (&conf->cumulative, &stamp);
+                ios_global_stats_clear (&conf->incremental, &stamp);
+                conf->increment = 0;
+        }
+        UNLOCK (&conf->lock);
+
+        return 0;
+}
+
int
reconfigure (xlator_t *this, dict_t *options)
{
struct ios_conf *conf = NULL;
int ret = -1;
char *sys_log_str = NULL;
+ char *log_format_str = NULL;
+ char *logger_str = NULL;
int sys_log_level = -1;
char *log_str = NULL;
int log_level = -1;
+ int log_format = -1;
+ int logger = -1;
if (!this || !this->private)
goto out;
@@ -2574,6 +2700,18 @@ reconfigure (xlator_t *this, dict_t *options)
gf_log_set_loglevel (log_level);
}
+ GF_OPTION_RECONF ("logger", logger_str, options, str, out);
+ if (logger_str) {
+ logger = gf_check_logger (logger_str);
+ gf_log_set_logger (logger);
+ }
+
+ GF_OPTION_RECONF ("log-format", log_format_str, options, str, out);
+ if (log_format_str) {
+ log_format = gf_check_log_format (log_format_str);
+ gf_log_set_logformat (log_format);
+ }
+
ret = 0;
out:
gf_log (this->name, GF_LOG_DEBUG, "reconfigure returning %d", ret);
@@ -2605,6 +2743,10 @@ init (xlator_t *this)
{
struct ios_conf *conf = NULL;
char *sys_log_str = NULL;
+ char *logger_str = NULL;
+ char *log_format_str = NULL;
+ int logger = -1;
+ int log_format = -1;
int sys_log_level = -1;
char *log_str = NULL;
int log_level = -1;
@@ -2663,6 +2805,19 @@ init (xlator_t *this)
gf_log_set_loglevel (log_level);
}
+ GF_OPTION_INIT ("logger", logger_str, str, out);
+ if (logger_str) {
+ logger = gf_check_logger (logger_str);
+ gf_log_set_logger (logger);
+ }
+
+ GF_OPTION_INIT ("log-format", log_format_str, str, out);
+ if (log_format_str) {
+ log_format = gf_check_log_format (log_format_str);
+ gf_log_set_logformat (log_format);
+ }
+
+
this->private = conf;
ret = 0;
out:
@@ -2700,10 +2855,11 @@ notify (xlator_t *this, int32_t event, void *data, ...)
struct ios_dump_args args = {0};
dict_t *output = NULL;
dict_t *dict = NULL;
- int32_t top_op = 0;
+ int32_t op = 0;
int32_t list_cnt = 0;
double throughput = 0;
double time = 0;
+ gf_boolean_t is_peek = _gf_false;
va_list ap;
dict = data;
@@ -2714,7 +2870,7 @@ notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_TRANSLATOR_INFO:
ret = dict_get_str_boolean (dict, "clear-stats", _gf_false);
if (ret) {
- ret = dict_set_int32 (output, "top-op", top_op);
+ ret = dict_set_int32 (output, "top-op", op);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set top-op in dict");
@@ -2734,15 +2890,15 @@ notify (xlator_t *this, int32_t event, void *data, ...)
goto out;
}
- ret = dict_get_int32 (dict, "top-op", &top_op);
+ ret = dict_get_int32 (dict, "top-op", &op);
if (!ret) {
ret = dict_get_int32 (dict, "list-cnt", &list_cnt);
- if (top_op > IOS_STATS_TYPE_NONE &&
- top_op < IOS_STATS_TYPE_MAX)
+ if (op > IOS_STATS_TYPE_NONE &&
+ op < IOS_STATS_TYPE_MAX)
ret = io_stats_dump_stats_to_dict (this, output,
- top_op, list_cnt);
- if (top_op == IOS_STATS_TYPE_READ_THROUGHPUT ||
- top_op == IOS_STATS_TYPE_WRITE_THROUGHPUT) {
+ op, list_cnt);
+ if (op == IOS_STATS_TYPE_READ_THROUGHPUT ||
+ op == IOS_STATS_TYPE_WRITE_THROUGHPUT) {
ret = dict_get_double (dict, "throughput",
&throughput);
if (!ret) {
@@ -2763,9 +2919,41 @@ notify (xlator_t *this, int32_t event, void *data, ...)
}
} else {
- (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_DICT,
- output);
- ret = io_stats_dump (this, &args);
+ ret = dict_get_int32 (dict, "info-op", &op);
+ if (ret || op < GF_CLI_INFO_ALL ||
+ GF_CLI_INFO_CLEAR < op)
+ op = GF_CLI_INFO_ALL;
+
+ ret = dict_set_int32 (output, "info-op", op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set info-op in dict");
+ goto out;
+ }
+
+ if (GF_CLI_INFO_CLEAR == op) {
+ ret = io_stats_clear (this->private);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to clear info stats");
+
+ ret = dict_set_int32 (output, "stats-cleared",
+ ret ? 0 : 1);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set stats-cleared"
+ " in dict");
+ }
+ else {
+ ret = dict_get_str_boolean (dict, "peek",
+ _gf_false);
+ if (-1 != ret)
+ is_peek = ret;
+
+ (void) ios_dump_args_init (&args,
+ IOS_DUMP_TYPE_DICT, output);
+ ret = io_stats_dump (this, &args, op, is_peek);
+ }
}
break;
default:
@@ -2817,6 +3005,9 @@ struct xlator_fops fops = {
.fxattrop = io_stats_fxattrop,
.setattr = io_stats_setattr,
.fsetattr = io_stats_fsetattr,
+ .fallocate = io_stats_fallocate,
+ .discard = io_stats_discard,
+ .zerofill = io_stats_zerofill,
};
struct xlator_cbks cbks = {
@@ -2869,6 +3060,40 @@ struct volume_options options[] = {
.value = { "DEBUG", "WARNING", "ERROR", "INFO",
"CRITICAL", "NONE", "TRACE"}
},
+ { .key = {"logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = { GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}
+ },
+ { .key = {"client-logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOGGER_GLUSTER_LOG,
+ .description = "Changes the logging sub-system to log to, for the "
+ "clients",
+ .value = { GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}
+ },
+ { .key = {"brick-logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOGGER_GLUSTER_LOG,
+ .description = "Changes the logging sub-system to log to, for the "
+ "bricks",
+ .value = { GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}
+ },
+ { .key = {"log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = { GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}
+ },
+ { .key = {"client-log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+ .description = "Changes log format for the clients",
+ .value = { GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}
+ },
+ { .key = {"brick-log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+ .description = "Changes the log format for the bricks",
+ .value = { GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}
+ },
{ .key = {NULL} },
};
diff --git a/xlators/debug/trace/src/trace.c b/xlators/debug/trace/src/trace.c
index a9c11babe..1efd50e65 100644
--- a/xlators/debug/trace/src/trace.c
+++ b/xlators/debug/trace/src/trace.c
@@ -18,6 +18,44 @@
* Very helpful translator for debugging.
*/
+static inline void
+trace_stat_to_str(struct iatt *buf, char *str)
+{
+ char atime_buf[256] = {0,};
+ char mtime_buf[256] = {0,};
+ char ctime_buf[256] = {0,};
+ uint64_t ia_time = 0;
+
+ if (!buf)
+ return;
+
+ ia_time = buf->ia_atime;
+ strftime (atime_buf, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+ ia_time = buf->ia_mtime;
+ strftime (mtime_buf, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+
+ ia_time = buf->ia_ctime;
+ strftime (ctime_buf, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+
+ snprintf (str, sizeof (str),
+ "gfid=%s ino=%"PRIu64", mode=%o, "
+ "nlink=%"GF_PRI_NLINK", uid=%u, "
+ "gid=%u, size=%"PRIu64", "
+ "blocks=%"PRIu64", atime=%s, "
+ "mtime=%s, ctime=%s",
+ uuid_utoa (buf->ia_gfid),
+ buf->ia_ino,
+ st_mode_from_ia (buf->ia_prot, buf->ia_type),
+ buf->ia_nlink, buf->ia_uid,
+ buf->ia_gid, buf->ia_size,
+ buf->ia_blocks, atime_buf,
+ mtime_buf, ctime_buf);
+}
+
+
int
dump_history_trace (circular_buffer_t *cb, void *data)
{
@@ -3134,7 +3172,7 @@ init (xlator_t *this)
gf_log (this->name, GF_LOG_DEBUG, "logging to history %s",
(conf->log_history == _gf_true)?"enabled":"disabled");
- history = eh_new (history_size, _gf_false);
+ history = eh_new (history_size, _gf_false, NULL);
if (!history) {
gf_log (this->name, GF_LOG_ERROR, "event history cannot be "
"initialized");
diff --git a/xlators/debug/trace/src/trace.h b/xlators/debug/trace/src/trace.h
index 045eefb36..62d1bc9c9 100644
--- a/xlators/debug/trace/src/trace.h
+++ b/xlators/debug/trace/src/trace.h
@@ -59,40 +59,3 @@ typedef struct {
"%s", _string); \
} \
} while (0);
-
-#define trace_stat_to_str(buf, statstr) \
- do { \
- char atime_buf[256] = {0,}; \
- char mtime_buf[256] = {0,}; \
- char ctime_buf[256] = {0,}; \
- uint64_t ia_time = 0; \
- \
- if (!buf) \
- break; \
- \
- ia_time = buf->ia_atime; \
- strftime (atime_buf, 256, "[%b %d %H:%M:%S]", \
- localtime ((time_t *)&ia_time)); \
- \
- ia_time = buf->ia_mtime; \
- strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", \
- localtime ((time_t *)&ia_time)); \
- \
- ia_time = buf->ia_ctime; \
- strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", \
- localtime ((time_t *)&ia_time)); \
- \
- snprintf (statstr, sizeof (statstr), \
- "gfid=%s ino=%"PRIu64", mode=%o, " \
- "nlink=%"GF_PRI_NLINK", uid=%u, " \
- "gid=%u, size=%"PRIu64", " \
- "blocks=%"PRIu64", atime=%s, " \
- "mtime=%s, ctime=%s", \
- uuid_utoa (buf->ia_gfid), buf->ia_ino, \
- st_mode_from_ia (buf->ia_prot, \
- buf->ia_type), \
- buf->ia_nlink, buf->ia_uid, \
- buf->ia_gid, buf->ia_size, \
- buf->ia_blocks, atime_buf, \
- mtime_buf, ctime_buf); \
- } while (0);
diff --git a/xlators/encryption/Makefile.am b/xlators/encryption/Makefile.am
index 2cbde680f..36efc6698 100644
--- a/xlators/encryption/Makefile.am
+++ b/xlators/encryption/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = rot-13
+SUBDIRS = rot-13 crypt
CLEANFILES =
diff --git a/xlators/encryption/crypt/Makefile.am b/xlators/encryption/crypt/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/encryption/crypt/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/encryption/crypt/src/Makefile.am b/xlators/encryption/crypt/src/Makefile.am
new file mode 100644
index 000000000..b13f65043
--- /dev/null
+++ b/xlators/encryption/crypt/src/Makefile.am
@@ -0,0 +1,24 @@
+if ENABLE_CRYPT_XLATOR
+
+xlator_LTLIBRARIES = crypt.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption
+
+crypt_la_LDFLAGS = -module -avoid-version
+
+crypt_la_SOURCES = keys.c data.c metadata.c atom.c crypt.c
+# Libraries go in LIBADD (not LDFLAGS) so that they follow the objects on
+# the link line; otherwise linkers using --as-needed may drop them.
+crypt_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+	-lssl -lcrypto
+
+noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
+else
+
+# DIST is not an Automake primary, so "noinst_DIST" was a plain unused
+# variable; EXTRA_DIST is the variable that ships files in the tarball.
+EXTRA_DIST = keys.c data.c metadata.c atom.c crypt.c
+noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h
+
+endif \ No newline at end of file
diff --git a/xlators/encryption/crypt/src/atom.c b/xlators/encryption/crypt/src/atom.c
new file mode 100644
index 000000000..1ec41495c
--- /dev/null
+++ b/xlators/encryption/crypt/src/atom.c
@@ -0,0 +1,962 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+
+/*
+ * Glossary
+ *
+ *
+ * cblock (or cipher block)
+ *         A logical unit in a file. The cblock size is defined as the
+ *         number of bits in an input (or output) block of the block
+ *         cipher (*). Cipher block size is a property of the cipher
+ *         algorithm. E.g. the cblock size is 64 bits for DES, 128 bits
+ *         for AES, etc.
+ *
+ * atomic cipher algorithm
+ *         A cipher algorithm which requires some chunks of text to be
+ *         padded at the left and (or) right side before the cipher
+ *         transform.
+ *
+ * block (atom)
+ *         Minimal chunk of a file's data which doesn't require padding.
+ *         We'll consider logical units in a file of block size
+ *         (atom size).
+ *
+ * cipher algorithm with EOF issue
+ *         An atomic cipher algorithm which requires the last incomplete
+ *         cblock in a file to be padded with some data (usually zeros).
+ *
+ * operation which forms a gap at the beginning
+ *         Reading/writing from an offset which is not aligned to the
+ *         atom size.
+ *
+ * operation which forms a gap at the end
+ *         Reading/writing count bytes starting from offset off, so that
+ *         off+count is not aligned to atom_size.
+ *
+ * head block
+ *         The first atom affected by an operation which forms a gap at
+ *         the beginning, or (and) at the end.
+ *         Comment: a head block has at least one gap (either at the
+ *         beginning, or at the end).
+ *
+ * tail block
+ *         The last atom, different from head, affected by an operation
+ *         which forms a gap at the end.
+ *         Comment: a tail block has exactly one gap (at the end).
+ *
+ * partial block
+ *         Head or tail block.
+ *
+ * full block
+ *         Block without gaps.
+ *
+ *
+ * (*) Recommendation for Block Cipher Modes of Operation
+ *     Methods and Techniques
+ *     NIST Special Publication 800-38A Edition 2001
+ */
+
+/*
+ * atom->offset_at()
+ */
+/* Offset of the head block: the aligned_offset recorded in @conf. */
+static off_t offset_at_head(struct avec_config *conf)
+{
+        off_t head_offset = conf->aligned_offset;
+
+        return head_offset;
+}
+
+/* atom->offset_at() for the head block of a hole; @object is not used. */
+static off_t offset_at_hole_head(call_frame_t *frame,
+                                 struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return offset_at_head(hole);
+}
+
+/* atom->offset_at() for the head block of data; @object is not used. */
+static off_t offset_at_data_head(call_frame_t *frame,
+                                 struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return offset_at_head(data);
+}
+
+
+/*
+ * Offset of the tail block: the aligned offset, plus one atom when the
+ * head has a gap at the beginning (off_in_head != 0), plus the bytes
+ * covered by all full blocks.
+ */
+static off_t offset_at_tail(struct avec_config *conf,
+                            struct object_cipher_info *object)
+{
+        if (!conf->off_in_head)
+                return conf->aligned_offset +
+                        (conf->nr_full_blocks << get_atom_bits(object));
+
+        return conf->aligned_offset +
+                get_atom_size(object) +
+                (conf->nr_full_blocks << get_atom_bits(object));
+}
+
+/* atom->offset_at() for the tail block of a hole. */
+static off_t offset_at_hole_tail(call_frame_t *frame,
+                                 struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return offset_at_tail(hole, object);
+}
+
+
+/* atom->offset_at() for the tail block of data. */
+static off_t offset_at_data_tail(call_frame_t *frame,
+                                 struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return offset_at_tail(data, object);
+}
+
+/*
+ * Offset of the first full block: the aligned offset, plus one atom when
+ * the head has a gap at the beginning (off_in_head != 0).
+ */
+static off_t offset_at_full(struct avec_config *conf,
+                            struct object_cipher_info *object)
+{
+        if (!conf->off_in_head)
+                return conf->aligned_offset;
+
+        return conf->aligned_offset + get_atom_size(object);
+}
+
+/* atom->offset_at() for the full blocks of data. */
+static off_t offset_at_data_full(call_frame_t *frame,
+                                 struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return offset_at_full(data, object);
+}
+
+/* atom->offset_at() for the full blocks of a hole. */
+static off_t offset_at_hole_full(call_frame_t *frame,
+                                 struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return offset_at_full(hole, object);
+}
+
+/*
+ * atom->io_size_nopad()
+ */
+
+/*
+ * Number of bytes of the head block actually covered by the operation:
+ * the atom size minus the gap at the beginning and, when the head is the
+ * only block and off_in_tail is non-zero, minus the gap at the end.
+ */
+static uint32_t io_size_nopad_head(struct avec_config *conf,
+                                   struct object_cipher_info *object)
+{
+        uint32_t lead_gap;
+        uint32_t trail_gap = 0;
+
+        check_head_block(conf);
+
+        lead_gap = conf->off_in_head;
+
+        /* a gap at the end exists only if nothing follows the head */
+        if (!has_tail_block(conf) && !has_full_blocks(conf) &&
+            conf->off_in_tail != 0)
+                trail_gap = get_atom_size(object) - conf->off_in_tail;
+
+        return get_atom_size(object) - (lead_gap + trail_gap);
+}
+
+/* Unpadded I/O size of the tail block: conf->off_in_tail. */
+static uint32_t io_size_nopad_tail(struct avec_config *conf,
+                                   struct object_cipher_info *object)
+{
+        uint32_t covered;
+
+        check_tail_block(conf);
+        covered = conf->off_in_tail;
+
+        return covered;
+}
+
+/* Unpadded I/O size of a full block: the whole atom. */
+static uint32_t io_size_nopad_full(struct avec_config *conf,
+                                   struct object_cipher_info *object)
+{
+        uint32_t atom_size;
+
+        check_full_block(conf);
+        atom_size = get_atom_size(object);
+
+        return atom_size;
+}
+
+/* atom->io_size_nopad() for the head block of data. */
+static uint32_t io_size_nopad_data_head(call_frame_t *frame,
+                                        struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return io_size_nopad_head(data, object);
+}
+
+/* atom->io_size_nopad() for the head block of a hole. */
+static uint32_t io_size_nopad_hole_head(call_frame_t *frame,
+                                        struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return io_size_nopad_head(hole, object);
+}
+
+/* atom->io_size_nopad() for the tail block of data. */
+static uint32_t io_size_nopad_data_tail(call_frame_t *frame,
+                                        struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return io_size_nopad_tail(data, object);
+}
+
+/* atom->io_size_nopad() for the tail block of a hole. */
+static uint32_t io_size_nopad_hole_tail(call_frame_t *frame,
+                                        struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return io_size_nopad_tail(hole, object);
+}
+
+/* atom->io_size_nopad() for a full block of data. */
+static uint32_t io_size_nopad_data_full(call_frame_t *frame,
+                                        struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return io_size_nopad_full(data, object);
+}
+
+/* atom->io_size_nopad() for a full block of a hole. */
+static uint32_t io_size_nopad_hole_full(call_frame_t *frame,
+                                        struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return io_size_nopad_full(hole, object);
+}
+
+/* Offset inside the head block: conf->off_in_head. */
+static uint32_t offset_in_head(struct avec_config *conf)
+{
+        uint32_t in_block;
+
+        check_cursor_head(conf);
+        in_block = conf->off_in_head;
+
+        return in_block;
+}
+
+/* atom->offset_in() for a tail block: always 0; both arguments are unused. */
+static uint32_t offset_in_tail(call_frame_t *frame,
+                               struct object_cipher_info *object)
+{
+        const uint32_t tail_start = 0;
+
+        return tail_start;
+}
+
+/*
+ * Byte offset of the full block under the cursor, relative to the first
+ * full block.  The cursor is reduced by one when a head block exists.
+ */
+static uint32_t offset_in_full(struct avec_config *conf,
+                               struct object_cipher_info *object)
+{
+        check_cursor_full(conf);
+
+        return (has_head_block(conf) ? conf->cursor - 1 : conf->cursor)
+                << get_atom_bits(object);
+}
+
+/* atom->offset_in() for the head block of data; @object is not used. */
+static uint32_t offset_in_data_head(call_frame_t *frame,
+                                    struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return offset_in_head(data);
+}
+
+/* atom->offset_in() for the head block of a hole; @object is not used. */
+static uint32_t offset_in_hole_head(call_frame_t *frame,
+                                    struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return offset_in_head(hole);
+}
+
+/* atom->offset_in() for the full blocks of data. */
+static uint32_t offset_in_data_full(call_frame_t *frame,
+                                    struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return offset_in_full(data, object);
+}
+
+/* atom->offset_in() for the full blocks of a hole. */
+static uint32_t offset_in_hole_full(call_frame_t *frame,
+                                    struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return offset_in_full(hole, object);
+}
+
+/*
+ * atom->rmw()
+ */
+/*
+ * Completion of the "read" step of a read-modify-write sequence for the
+ * partial (head or tail) block described by @atom.
+ *
+ * Pre-conditions:
+ * @vec (@count vectors, @op_ret bytes in total) contains plain text of
+ * the latest version.
+ *
+ * Uptodate gaps of the partial block with this plain text, append EOF
+ * padding if the cipher requires it, encrypt the whole block and wind
+ * the write of the result to the first child.
+ *
+ * @op_ret and @op_errno are recorded in the local.  Nothing is written
+ * when @op_ret is negative, or when too few bytes were read to fill a
+ * gap (the local then gets op_ret = -1, op_errno = EIO).
+ *
+ * Always returns 0.
+ */
+static int32_t rmw_partial_block(call_frame_t *frame,
+                                 void *cookie,
+                                 xlator_t *this,
+                                 int32_t op_ret,
+                                 int32_t op_errno,
+                                 struct iovec *vec,
+                                 int32_t count,
+                                 struct iatt *stbuf,
+                                 struct iobref *iobref,
+                                 struct rmw_atom *atom)
+{
+        size_t was_read = 0;
+        uint64_t file_size;
+        off_t offset;      /* offset of the partial block in the file */
+        size_t io_size;    /* size of the encrypted block to write */
+        crypt_local_t *local = frame->local;
+        struct object_cipher_info *object = &local->info->cinfo;
+
+        struct iovec *partial = atom->get_iovec(frame, 0);
+        struct avec_config *conf = atom->get_config(frame);
+        end_writeback_handler_t end_writeback_partial_block;
+#if DEBUG_CRYPT
+        gf_boolean_t check_last_cblock = _gf_false;
+#endif
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+
+        if (op_ret < 0)
+                goto exit;
+
+        file_size = local->cur_file_size;
+        was_read = op_ret;
+
+        if (atom->locality == HEAD_ATOM && conf->off_in_head) {
+                /*
+                 * head atom with a non-uptodate gap
+                 * at the beginning
+                 *
+                 * fill the gap with plain text of the
+                 * latest version.
+                 */
+                int32_t i;
+                int32_t copied = 0;
+                int32_t to_gap; /* amount of data needed to uptodate
+                                   the gap at the beginning */
+
+                to_gap = conf->off_in_head;
+
+                if (was_read < to_gap) {
+                        if (file_size >
+                            offset_at_head(conf) + was_read) {
+                                /*
+                                 * It is impossible to uptodate
+                                 * head block: too few bytes have
+                                 * been read from disk, so that
+                                 * partial write is impossible.
+                                 *
+                                 * It could happen because of many
+                                 * reasons: IO errors, (meta)data
+                                 * corruption in the local file system,
+                                 * etc.
+                                 */
+                                gf_log(this->name, GF_LOG_WARNING,
+                                       "Can not uptodate a gap at the beginning");
+                                local->op_ret = -1;
+                                local->op_errno = EIO;
+                                goto exit;
+                        }
+                        /* short read at EOF: only that much can be filled */
+                        to_gap = was_read;
+                }
+                /*
+                 * uptodate the gap at the beginning
+                 */
+                for (i = 0; i < count && copied < to_gap; i++) {
+                        int32_t to_copy;
+
+                        to_copy = vec[i].iov_len;
+                        if (to_copy > to_gap - copied)
+                                to_copy = to_gap - copied;
+
+                        /*
+                         * append after what previous vectors provided;
+                         * copying to iov_base itself would overwrite
+                         * them when the read returned several vectors
+                         */
+                        memcpy((char *)partial->iov_base + copied,
+                               vec[i].iov_base, to_copy);
+                        copied += to_copy;
+                }
+        }
+        if (atom->locality == TAIL_ATOM ||
+            (!has_tail_block(conf) && conf->off_in_tail)) {
+                /*
+                 * tail atom, or head atom with a non-uptodate
+                 * gap at the end.
+                 *
+                 * fill the gap at the end of the block
+                 * with plain text of the latest version.
+                 * Pad the result, (if needed)
+                 */
+                int32_t i;
+                int32_t to_gap;
+                int copied;
+                off_t off_in_tail;
+                int32_t to_copy;
+
+                off_in_tail = conf->off_in_tail;
+                to_gap = conf->gap_in_tail;
+
+                if (to_gap && was_read < off_in_tail + to_gap) {
+                        /*
+                         * It is impossible to uptodate
+                         * the gap at the end: too few bytes
+                         * have been read from disk, so that
+                         * partial write is impossible.
+                         *
+                         * It could happen because of many
+                         * reasons: IO errors, (meta)data
+                         * corruption in the local file system,
+                         * etc.
+                         */
+                        gf_log(this->name, GF_LOG_WARNING,
+                               "Can not uptodate a gap at the end");
+                        local->op_ret = -1;
+                        local->op_errno = EIO;
+                        goto exit;
+                }
+                /*
+                 * uptodate the gap at the end: walk the vectors
+                 * backwards, filling the gap from its end
+                 */
+                copied = 0;
+                to_copy = to_gap;
+                for (i = count - 1; i >= 0 && to_copy > 0; i--) {
+                        uint32_t from_vec, off_in_vec;
+
+                        off_in_vec = 0;
+                        from_vec = vec[i].iov_len;
+                        if (from_vec > to_copy) {
+                                off_in_vec = from_vec - to_copy;
+                                from_vec = to_copy;
+                        }
+                        memcpy((char *)partial->iov_base +
+                               off_in_tail + to_gap - copied - from_vec,
+                               (char *)vec[i].iov_base + off_in_vec,
+                               from_vec);
+
+                        gf_log(this->name, GF_LOG_DEBUG,
+                               "uptodate %d bytes at tail. Offset at target(source): %d(%d)",
+                               (int)from_vec,
+                               (int)(off_in_tail + to_gap - copied - from_vec),
+                               (int)off_in_vec);
+
+                        copied += from_vec;
+                        to_copy -= from_vec;
+                }
+                partial->iov_len = off_in_tail + to_gap;
+
+                if (object_alg_should_pad(object)) {
+                        int32_t resid = 0;
+                        resid = partial->iov_len & (object_alg_blksize(object) - 1);
+                        if (resid) {
+                                /*
+                                 * append a new EOF padding
+                                 */
+                                local->eof_padding_size =
+                                        object_alg_blksize(object) - resid;
+
+                                gf_log(this->name, GF_LOG_DEBUG,
+                                       "set padding size %d",
+                                       local->eof_padding_size);
+
+                                /*
+                                 * NOTE(review): the padding bytes are
+                                 * set to 1 although the debug message
+                                 * below says "zeros" -- confirm which
+                                 * one is intended before changing it.
+                                 */
+                                memset((char *)partial->iov_base +
+                                       partial->iov_len,
+                                       1,
+                                       local->eof_padding_size);
+                                partial->iov_len += local->eof_padding_size;
+#if DEBUG_CRYPT
+                                gf_log(this->name, GF_LOG_DEBUG,
+                                       "pad cblock with %d zeros:",
+                                       local->eof_padding_size);
+                                dump_cblock(this,
+                                            (unsigned char *)partial->iov_base +
+                                            partial->iov_len - object_alg_blksize(object));
+                                check_last_cblock = _gf_true;
+#endif
+                        }
+                }
+        }
+        offset = atom->offset_at(frame, object);
+        /*
+         * encrypt the whole block
+         */
+        encrypt_aligned_iov(object,
+                            partial,
+                            1,
+                            offset);
+#if DEBUG_CRYPT
+        if (check_last_cblock == _gf_true) {
+                gf_log(this->name, GF_LOG_DEBUG,
+                       "encrypt last cblock with offset %llu",
+                       (unsigned long long)offset);
+                dump_cblock(this, (unsigned char *)partial->iov_base +
+                            partial->iov_len - object_alg_blksize(object));
+        }
+#endif
+        io_size = iovec_get_size(partial, 1);
+        set_local_io_params_writev(frame, object, atom,
+                                   offset,
+                                   io_size);
+        /*
+         * write the whole block to disk
+         */
+        end_writeback_partial_block = dispatch_end_writeback(local->fop);
+        conf->cursor ++;
+
+        /*
+         * log before winding: the write may complete inside STACK_WIND,
+         * after which the frame and its local must not be touched
+         */
+        gf_log("crypt", GF_LOG_DEBUG,
+               "submit partial block: %d bytes from %d offset",
+               (int)io_size,
+               (int)offset);
+
+        STACK_WIND(frame,
+                   end_writeback_partial_block,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->writev,
+                   local->fd,
+                   partial,
+                   1,
+                   offset,
+                   local->flags,
+                   local->iobref_data,
+                   local->xdata);
+ exit:
+        return 0;
+}
+
+/*
+ * Perform a (read-)modify-write sequence for the partial block of
+ * locality @ltype in the currently active setup.
+ * This should be performed only after approval
+ * of upper server-side manager, i.e. the caller
+ * needs to make sure this is his turn to rmw.
+ *
+ * The read is wound to this translator itself; atom->rmw() receives the
+ * result and submits the write.  If the dict carrying the file size can
+ * not be prepared, a warning is logged and nothing is submitted.
+ */
+void submit_partial(call_frame_t *frame,
+                    xlator_t *this,
+                    fd_t *fd,
+                    atom_locality_type ltype)
+{
+        int32_t ret;
+        dict_t *dict;
+        struct rmw_atom *atom;
+        crypt_local_t *local = frame->local;
+        struct object_cipher_info *object = &local->info->cinfo;
+
+        atom = atom_by_types(local->active_setup, ltype);
+        /*
+         * To perform the "read" component of the read-modify-write
+         * sequence the crypt translator does stack_wind to itself.
+         *
+         * Pass current file size to crypt_readv()
+         */
+        dict = dict_new();
+        if (!dict) {
+                /*
+                 * FIXME: Handle the error
+                 */
+                gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict");
+                return;
+        }
+        ret = dict_set(dict,
+                       FSIZE_XATTR_PREFIX,
+                       data_from_uint64(local->cur_file_size));
+        if (ret) {
+                /*
+                 * FIXME: Handle the error
+                 *
+                 * The dict is released exactly once, at the exit label;
+                 * dropping the reference here as well was a double unref.
+                 */
+                gf_log("crypt", GF_LOG_WARNING, "Can not set dict");
+                goto exit;
+        }
+        STACK_WIND(frame,
+                   atom->rmw,
+                   this,
+                   this->fops->readv, /* crypt_readv */
+                   fd,
+                   atom->count_to_uptodate(frame, object), /* count */
+                   atom->offset_at(frame, object), /* offset to read from */
+                   0,
+                   dict);
+ exit:
+        dict_unref(dict);
+}
+
+/*
+ * submit blocks of FULL_ATOM type
+ *
+ * Encrypts the full blocks of the active setup and winds their writes to
+ * the first child.  In ordered mode exactly one block, selected by the
+ * cursor, is submitted per call; otherwise all conf->nr_full_blocks
+ * blocks are submitted in groups of at most MAX_IOVEC blocks.
+ */
+void submit_full(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+ struct rmw_atom *atom = atom_by_types(local->active_setup, FULL_ATOM);
+ uint32_t count; /* total number of full blocks to submit */
+ uint32_t granularity; /* number of blocks to submit in one iteration */
+
+ uint64_t off_in_file; /* start offset in the file, bytes */
+ uint32_t off_in_atom; /* start offset in the atom, blocks */
+ uint32_t blocks_written = 0; /* blocks written for this submit */
+
+ struct avec_config *conf = atom->get_config(frame);
+ end_writeback_handler_t end_writeback_full_block;
+ /*
+ * Write full blocks by groups of granularity size.
+ */
+ end_writeback_full_block = dispatch_end_writeback(local->fop);
+
+ if (is_ordered_mode(frame)) {
+ uint32_t skip = has_head_block(conf) ? 1 : 0;
+ count = 1;
+ granularity = 1;
+ /*
+ * calculate start offset using cursor value;
+ * here we should take into account head block,
+ * which corresponds to cursor value 0.
+ */
+ off_in_file = atom->offset_at(frame, object) +
+ ((conf->cursor - skip) << get_atom_bits(object));
+ off_in_atom = conf->cursor - skip;
+ }
+ else {
+ /*
+ * in parallel mode
+ */
+ count = conf->nr_full_blocks;
+ granularity = MAX_IOVEC;
+ off_in_file = atom->offset_at(frame, object);
+ off_in_atom = 0;
+ }
+ while (count) {
+ uint32_t blocks_to_write = count;
+
+ if (blocks_to_write > granularity)
+ blocks_to_write = granularity;
+ if (conf->type == HOLE_ATOM)
+ /*
+ * reset iovec before encryption
+ */
+ memset(atom->get_iovec(frame, 0)->iov_base,
+ 0,
+ get_atom_size(object));
+ /*
+ * encrypt the group
+ */
+ encrypt_aligned_iov(object,
+ atom->get_iovec(frame,
+ off_in_atom +
+ blocks_written),
+ blocks_to_write,
+ off_in_file + (blocks_written <<
+ get_atom_bits(object)));
+
+ set_local_io_params_writev(frame, object, atom,
+ off_in_file + (blocks_written << get_atom_bits(object)),
+ blocks_to_write << get_atom_bits(object));
+
+ /* the cursor is advanced before the write is wound */
+ conf->cursor += blocks_to_write;
+
+ STACK_WIND(frame,
+ end_writeback_full_block,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev,
+ local->fd,
+ atom->get_iovec(frame, off_in_atom + blocks_written),
+ blocks_to_write,
+ off_in_file + (blocks_written << get_atom_bits(object)),
+ local->flags,
+ local->iobref_data ? local->iobref_data : local->iobref,
+ local->xdata);
+
+ /* NOTE(review): the offset is cast to int for logging, so it is
+ truncated in the message for offsets that do not fit in an int */
+ gf_log("crypt", GF_LOG_DEBUG, "submit %d full blocks from %d offset",
+ blocks_to_write,
+ (int)(off_in_file + (blocks_written << get_atom_bits(object))));
+
+ count -= blocks_to_write;
+ blocks_written += blocks_to_write;
+ }
+ return;
+}
+
+/* atom->rmw() of the data head block: readv callback, @xdata is unused. */
+static int32_t rmw_data_head(call_frame_t *frame, void *cookie,
+                             xlator_t *this,
+                             int32_t op_ret, int32_t op_errno,
+                             struct iovec *vec, int32_t count,
+                             struct iatt *stbuf, struct iobref *iobref,
+                             dict_t *xdata)
+{
+        struct rmw_atom *head = atom_by_types(DATA_ATOM, HEAD_ATOM);
+
+        return rmw_partial_block(frame, cookie, this, op_ret, op_errno,
+                                 vec, count, stbuf, iobref, head);
+}
+
+/* atom->rmw() of the data tail block: readv callback, @xdata is unused. */
+static int32_t rmw_data_tail(call_frame_t *frame, void *cookie,
+                             xlator_t *this,
+                             int32_t op_ret, int32_t op_errno,
+                             struct iovec *vec, int32_t count,
+                             struct iatt *stbuf, struct iobref *iobref,
+                             dict_t *xdata)
+{
+        struct rmw_atom *tail = atom_by_types(DATA_ATOM, TAIL_ATOM);
+
+        return rmw_partial_block(frame, cookie, this, op_ret, op_errno,
+                                 vec, count, stbuf, iobref, tail);
+}
+
+/* atom->rmw() of the hole head block: readv callback, @xdata is unused. */
+static int32_t rmw_hole_head(call_frame_t *frame, void *cookie,
+                             xlator_t *this,
+                             int32_t op_ret, int32_t op_errno,
+                             struct iovec *vec, int32_t count,
+                             struct iatt *stbuf, struct iobref *iobref,
+                             dict_t *xdata)
+{
+        struct rmw_atom *head = atom_by_types(HOLE_ATOM, HEAD_ATOM);
+
+        return rmw_partial_block(frame, cookie, this, op_ret, op_errno,
+                                 vec, count, stbuf, iobref, head);
+}
+
+/* atom->rmw() of the hole tail block: readv callback, @xdata is unused. */
+static int32_t rmw_hole_tail(call_frame_t *frame, void *cookie,
+                             xlator_t *this,
+                             int32_t op_ret, int32_t op_errno,
+                             struct iovec *vec, int32_t count,
+                             struct iatt *stbuf, struct iobref *iobref,
+                             dict_t *xdata)
+{
+        struct rmw_atom *tail = atom_by_types(HOLE_ATOM, TAIL_ATOM);
+
+        return rmw_partial_block(frame, cookie, this, op_ret, op_errno,
+                                 vec, count, stbuf, iobref, tail);
+}
+
+/*
+ * atom->count_to_uptodate()
+ */
+/*
+ * Number of bytes to read in order to uptodate the head block: the whole
+ * atom when the head is the only block (acount == 1) and off_in_tail is
+ * non-zero, otherwise only the gap at the beginning.
+ */
+static uint32_t count_to_uptodate_head(struct avec_config *conf,
+                                       struct object_cipher_info *object)
+{
+        if (conf->acount != 1 || !conf->off_in_tail)
+                /* there is no need to read the whole head block */
+                return conf->off_in_head;
+
+        return get_atom_size(object);
+}
+
+/* Number of bytes to read for the tail block: always the whole atom. */
+static uint32_t count_to_uptodate_tail(struct avec_config *conf,
+                                       struct object_cipher_info *object)
+{
+        uint32_t whole_atom = get_atom_size(object);
+
+        return whole_atom;
+}
+
+/* atom->count_to_uptodate() for the head block of data. */
+static uint32_t count_to_uptodate_data_head(call_frame_t *frame,
+                                            struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return count_to_uptodate_head(data, object);
+}
+
+/* atom->count_to_uptodate() for the tail block of data. */
+static uint32_t count_to_uptodate_data_tail(call_frame_t *frame,
+                                            struct object_cipher_info *object)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return count_to_uptodate_tail(data, object);
+}
+
+/* atom->count_to_uptodate() for the head block of a hole. */
+static uint32_t count_to_uptodate_hole_head(call_frame_t *frame,
+                                            struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return count_to_uptodate_head(hole, object);
+}
+
+/* atom->count_to_uptodate() for the tail block of a hole. */
+static uint32_t count_to_uptodate_hole_tail(call_frame_t *frame,
+                                            struct object_cipher_info *object)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return count_to_uptodate_tail(hole, object);
+}
+
+/* atom->get_config() */
+
+/* atom->get_config(): the data_conf member of the frame's crypt local. */
+static struct avec_config *get_config_data(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+
+        return &local->data_conf;
+}
+
+/* atom->get_config(): the hole_conf member of the frame's crypt local. */
+static struct avec_config *get_config_hole(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+
+        return &local->hole_conf;
+}
+
+/*
+ * atom->get_iovec()
+ */
+/* First vector of the hole configuration; @count is ignored. */
+static struct iovec *get_iovec_hole_head(call_frame_t *frame,
+                                         uint32_t count)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return &hole->avec[0];
+}
+
+/*
+ * Vector of the hole's full block: the one after the head when
+ * off_in_head is non-zero, else the first one.  @count is ignored.
+ */
+static struct iovec *get_iovec_hole_full(call_frame_t *frame,
+                                         uint32_t count)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        if (hole->off_in_head)
+                return hole->avec + 1;
+
+        return hole->avec;
+}
+
+/* Last vector of the hole's pool (index blocks_in_pool - 1); @count is ignored. */
+static inline struct iovec *get_iovec_hole_tail(call_frame_t *frame,
+                                                uint32_t count)
+{
+        struct avec_config *hole = get_hole_conf(frame);
+
+        return &hole->avec[hole->blocks_in_pool - 1];
+}
+
+/* First vector of the data configuration; @count is ignored. */
+static inline struct iovec *get_iovec_data_head(call_frame_t *frame,
+                                                uint32_t count)
+{
+        struct avec_config *data = get_data_conf(frame);
+
+        return &data->avec[0];
+}
+
+/*
+ * Vector of the @count-th full data block: full blocks start after the
+ * head vector when off_in_head is non-zero, else at the first vector.
+ */
+static inline struct iovec *get_iovec_data_full(call_frame_t *frame,
+                                                uint32_t count)
+{
+        struct avec_config *data = get_data_conf(frame);
+        struct iovec *first_full = data->avec + (data->off_in_head ? 1 : 0);
+
+        return first_full + count;
+}
+
+/*
+ * Vector of the data tail block: it follows the head vector (present
+ * when off_in_head is non-zero) and all nr_full_blocks full-block
+ * vectors.  @count is ignored.
+ */
+static inline struct iovec *get_iovec_data_tail(call_frame_t *frame,
+                                                uint32_t count)
+{
+        struct avec_config *data = get_data_conf(frame);
+        struct iovec *first_full = data->avec + (data->off_in_head ? 1 : 0);
+
+        return first_full + data->nr_full_blocks;
+}
+
+/*
+ * Dispatch table of atom methods, indexed by data type (DATA_ATOM or
+ * HOLE_ATOM) and locality (HEAD_ATOM, TAIL_ATOM or FULL_ATOM); it is
+ * accessed through atom_by_types().
+ *
+ * FULL_ATOM entries set neither .rmw nor .count_to_uptodate, so those
+ * members are NULL for them.  Both TAIL_ATOM entries share
+ * offset_in_tail().
+ */
+static struct rmw_atom atoms[LAST_DATA_TYPE][LAST_LOCALITY_TYPE] = {
+ [DATA_ATOM][HEAD_ATOM] =
+ { .locality = HEAD_ATOM,
+ .rmw = rmw_data_head,
+ .offset_at = offset_at_data_head,
+ .offset_in = offset_in_data_head,
+ .get_iovec = get_iovec_data_head,
+ .io_size_nopad = io_size_nopad_data_head,
+ .count_to_uptodate = count_to_uptodate_data_head,
+ .get_config = get_config_data
+ },
+ [DATA_ATOM][TAIL_ATOM] =
+ { .locality = TAIL_ATOM,
+ .rmw = rmw_data_tail,
+ .offset_at = offset_at_data_tail,
+ .offset_in = offset_in_tail,
+ .get_iovec = get_iovec_data_tail,
+ .io_size_nopad = io_size_nopad_data_tail,
+ .count_to_uptodate = count_to_uptodate_data_tail,
+ .get_config = get_config_data
+ },
+ [DATA_ATOM][FULL_ATOM] =
+ { .locality = FULL_ATOM,
+ .offset_at = offset_at_data_full,
+ .offset_in = offset_in_data_full,
+ .get_iovec = get_iovec_data_full,
+ .io_size_nopad = io_size_nopad_data_full,
+ .get_config = get_config_data
+ },
+ [HOLE_ATOM][HEAD_ATOM] =
+ { .locality = HEAD_ATOM,
+ .rmw = rmw_hole_head,
+ .offset_at = offset_at_hole_head,
+ .offset_in = offset_in_hole_head,
+ .get_iovec = get_iovec_hole_head,
+ .io_size_nopad = io_size_nopad_hole_head,
+ .count_to_uptodate = count_to_uptodate_hole_head,
+ .get_config = get_config_hole
+ },
+ [HOLE_ATOM][TAIL_ATOM] =
+ { .locality = TAIL_ATOM,
+ .rmw = rmw_hole_tail,
+ .offset_at = offset_at_hole_tail,
+ .offset_in = offset_in_tail,
+ .get_iovec = get_iovec_hole_tail,
+ .io_size_nopad = io_size_nopad_hole_tail,
+ .count_to_uptodate = count_to_uptodate_hole_tail,
+ .get_config = get_config_hole
+ },
+ [HOLE_ATOM][FULL_ATOM] =
+ { .locality = FULL_ATOM,
+ .offset_at = offset_at_hole_full,
+ .offset_in = offset_in_hole_full,
+ .get_iovec = get_iovec_hole_full,
+ .io_size_nopad = io_size_nopad_hole_full,
+ .get_config = get_config_hole
+ }
+};
+
+/*
+ * Look up the atom descriptor for the given data type and locality.
+ * The arguments index the atoms[][] table directly, without range checks.
+ */
+struct rmw_atom *atom_by_types(atom_data_type data,
+                               atom_locality_type locality)
+{
+        struct rmw_atom *row = atoms[data];
+
+        return row + locality;
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/crypt-common.h b/xlators/encryption/crypt/src/crypt-common.h
new file mode 100644
index 000000000..7c212ad5d
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt-common.h
@@ -0,0 +1,141 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CRYPT_COMMON_H__
+#define __CRYPT_COMMON_H__
+
+#define INVAL_SUBVERSION_NUMBER (0xff)
+#define CRYPT_INVAL_OP (GF_FOP_NULL)
+
+#define CRYPTO_FORMAT_PREFIX "trusted.glusterfs.crypt.att.cfmt"
+#define FSIZE_XATTR_PREFIX "trusted.glusterfs.crypt.att.size"
+#define SUBREQ_PREFIX "trusted.glusterfs.crypt.msg.sreq"
+#define FSIZE_MSG_PREFIX "trusted.glusterfs.crypt.msg.size"
+#define DE_MSG_PREFIX "trusted.glusterfs.crypt.msg.dent"
+#define REQUEST_ID_PREFIX "trusted.glusterfs.crypt.msg.rqid"
+#define MSGFLAGS_PREFIX "trusted.glusterfs.crypt.msg.xfgs"
+
+
+/* messages for crypt_open() */
+#define MSGFLAGS_REQUEST_MTD_RLOCK 1 /* take read lock and don't unlock */
+#define MSGFLAGS_REQUEST_MTD_WLOCK 2 /* take write lock and don't unlock */
+
+#define AES_BLOCK_BITS (4) /* AES_BLOCK_SIZE == 1 << AES_BLOCK_BITS */
+
+/* a statement that does nothing */
+#define noop do {; } while (0)
+/*
+ * compile-time assertion: when (cond) evaluates to 0 the switch gets two
+ * "case 0" labels, which does not compile
+ */
+#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } })
+/* (y)-1 converted to the type of x; a bit mask when y is a power of two */
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+/* round x up to a multiple of y; correct only when y is a power of two */
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+
+/*
+ * Format of file's metadata: a one-byte loader id followed by the
+ * metadata itself.  The struct is packed, so no padding separates the
+ * two members.
+ */
+struct crypt_format {
+ uint8_t loader_id; /* version of metadata loader */
+ uint8_t versioned[0]; /* file's metadata of specific version */
+} __attribute__((packed));
+
+/* cipher algorithm identifiers */
+typedef enum {
+ AES_CIPHER_ALG,
+ LAST_CIPHER_ALG /* number of algorithms, not an algorithm itself */
+} cipher_alg_t;
+
+/* cipher mode identifiers */
+typedef enum {
+ XTS_CIPHER_MODE,
+ LAST_CIPHER_MODE /* number of modes, not a mode itself */
+} cipher_mode_t;
+
+/* metadata loader identifiers */
+typedef enum {
+ MTD_LOADER_V1,
+ LAST_MTD_LOADER /* number of loaders, not a loader itself */
+} mtd_loader_id;
+
+/* Set the MSGFLAGS_REQUEST_MTD_RLOCK bit in *@flags. */
+static inline void msgflags_set_mtd_rlock(uint32_t *flags)
+{
+        *flags = *flags | MSGFLAGS_REQUEST_MTD_RLOCK;
+}
+
+/* Set the MSGFLAGS_REQUEST_MTD_WLOCK bit in *@flags. */
+static inline void msgflags_set_mtd_wlock(uint32_t *flags)
+{
+        *flags = *flags | MSGFLAGS_REQUEST_MTD_WLOCK;
+}
+
+/* Non-zero when the MSGFLAGS_REQUEST_MTD_RLOCK bit is set in *@flags. */
+static inline gf_boolean_t msgflags_check_mtd_rlock(uint32_t *flags)
+{
+        uint32_t rlock_bit = *flags & MSGFLAGS_REQUEST_MTD_RLOCK;
+
+        return rlock_bit;
+}
+
+/*
+ * Tell whether the MSGFLAGS_REQUEST_MTD_WLOCK bit is set in *@flags.
+ *
+ * The result is normalized to 0 or 1: the raw masked value is 2, which
+ * is not a valid boolean and does not compare equal to a "true" of 1.
+ */
+static inline gf_boolean_t msgflags_check_mtd_wlock(uint32_t *flags)
+{
+        return (*flags & MSGFLAGS_REQUEST_MTD_WLOCK) != 0;
+}
+
+/* True when either the read-lock or the write-lock request bit is set. */
+static inline gf_boolean_t msgflags_check_mtd_lock(uint32_t *flags)
+{
+        if (msgflags_check_mtd_rlock(flags))
+                return 1;
+
+        return msgflags_check_mtd_wlock(flags) ? 1 : 0;
+}
+
+/*
+ * Number of logical blocks of (1 << @blkbits) bytes touched, possibly
+ * only partially, by @count bytes starting at offset @start.
+ *
+ * The index of the last byte is computed as start + count - 1 in
+ * unsigned arithmetic, so the result is meaningful for @count >= 1.
+ */
+static inline off_t logical_blocks_occupied(uint64_t start, off_t count,
+                                            int blkbits)
+{
+        uint64_t first_block = start >> blkbits;
+        uint64_t last_block  = (start + count - 1) >> blkbits;
+
+        return last_block - first_block + 1;
+}
+
+/*
+ * Non-zero when the bytes at offsets @off1 and @off2 belong to the same
+ * logical block of (1 << @blkbits) bytes.
+ */
+static inline int in_same_lblock(uint64_t off1, uint64_t off2,
+                                 int blkbits)
+{
+        uint64_t block1 = off1 >> blkbits;
+        uint64_t block2 = off2 >> blkbits;
+
+        return block1 == block2;
+}
+
+/* Log the first 16 bytes at @buf in hex, at debug level, under this->name. */
+static inline void dump_cblock(xlator_t *this, unsigned char *buf)
+{
+        const unsigned char *b = buf;
+
+        gf_log(this->name, GF_LOG_DEBUG,
+               "dump cblock: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x",
+               b[0], b[1], b[2], b[3],
+               b[4], b[5], b[6], b[7],
+               b[8], b[9], b[10], b[11],
+               b[12], b[13], b[14], b[15]);
+}
+
+#endif /* __CRYPT_COMMON_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/crypt-mem-types.h b/xlators/encryption/crypt/src/crypt-mem-types.h
new file mode 100644
index 000000000..2eab921fc
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt-mem-types.h
@@ -0,0 +1,44 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __CRYPT_MEM_TYPES_H__
+#define __CRYPT_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+/*
+ * Memory accounting types of the crypt translator; the values start
+ * right after gf_common_mt_end.
+ */
+enum gf_crypt_mem_types_ {
+ gf_crypt_mt_priv = gf_common_mt_end + 1,
+ gf_crypt_mt_inode,
+ gf_crypt_mt_data,
+ gf_crypt_mt_mtd,
+ gf_crypt_mt_loc,
+ gf_crypt_mt_iatt,
+ gf_crypt_mt_key,
+ gf_crypt_mt_iovec,
+ gf_crypt_mt_char,
+ gf_crypt_mt_end, /* end marker, keep last */
+};
+
+#endif /* __CRYPT_MEM_TYPES_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
+
+
+
diff --git a/xlators/encryption/crypt/src/crypt.c b/xlators/encryption/crypt/src/crypt.c
new file mode 100644
index 000000000..13b1bd962
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt.c
@@ -0,0 +1,4522 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <ctype.h>
+#include <sys/uio.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "logging.h"
+#include "defaults.h"
+
+#include "crypt-common.h"
+#include "crypt.h"
+
+static void init_inode_info_head(struct crypt_inode_info *info, fd_t *fd);
+static int32_t init_inode_info_tail(struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+static int32_t prepare_for_submit_hole(call_frame_t *frame, xlator_t *this,
+ uint64_t from, off_t size);
+static int32_t load_file_size(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata);
+static void do_ordered_submit(call_frame_t *frame, xlator_t *this,
+ atom_data_type dtype);
+static void do_parallel_submit(call_frame_t *frame, xlator_t *this,
+ atom_data_type dtype);
+static void put_one_call_open(call_frame_t *frame);
+static void put_one_call_readv(call_frame_t *frame, xlator_t *this);
+static void put_one_call_writev(call_frame_t *frame, xlator_t *this);
+static void put_one_call_ftruncate(call_frame_t *frame, xlator_t *this);
+static void free_avec(struct iovec *avec, char **pool, int blocks_in_pool);
+static void free_avec_data(crypt_local_t *local);
+static void free_avec_hole(crypt_local_t *local);
+
+/*
+ * Obtain a crypt_local_t from the translator's local pool (mem_get0),
+ * record @fop in it, initialize its three locks and attach it to
+ * @frame. Returns NULL, after logging, when the pool has nothing to give.
+ */
+static crypt_local_t *crypt_alloc_local(call_frame_t *frame, xlator_t *this,
+                                        glusterfs_fop_t fop)
+{
+        crypt_local_t *new_local = mem_get0(this->local_pool);
+
+        if (new_local == NULL) {
+                gf_log(this->name, GF_LOG_ERROR, "out of memory");
+                return NULL;
+        }
+        new_local->fop = fop;
+
+        LOCK_INIT(&new_local->hole_lock);
+        LOCK_INIT(&new_local->call_lock);
+        LOCK_INIT(&new_local->rw_count_lock);
+
+        frame->local = new_local;
+        return new_local;
+}
+
+/*
+ * Look up the crypt-specific info stored in the context of @inode.
+ * Returns NULL (with a warning logged) if the context can not be read
+ * or holds no info.
+ */
+struct crypt_inode_info *get_crypt_inode_info(inode_t *inode, xlator_t *this)
+{
+        uint64_t ctx_value = 0;
+        struct crypt_inode_info *found;
+
+        if (inode_ctx_get(inode, this, &ctx_value) == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Can not get inode info");
+                return NULL;
+        }
+        found = (struct crypt_inode_info *)(long)ctx_value;
+        if (!found)
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Can not obtain inode info");
+        return found;
+}
+
+/*
+ * Return the inode info cached in @local; on first use fetch it from
+ * the context of local->fd->inode and cache the result.
+ */
+static struct crypt_inode_info *local_get_inode_info(crypt_local_t *local,
+                                                     xlator_t *this)
+{
+        if (!local->info)
+                local->info = get_crypt_inode_info(local->fd->inode, this);
+        return local->info;
+}
+
+/*
+ * Allocate a zeroed crypt_inode_info and cache it in @local->info.
+ * With DEBUG_CRYPT a private copy of @loc is attached as well.
+ *
+ * Returns the new info, or NULL on failure; in the latter case
+ * local->op_ret / local->op_errno are set to -1 / ENOMEM on every
+ * failure path (the debug-only paths used to leave them untouched).
+ */
+static struct crypt_inode_info *alloc_inode_info(crypt_local_t *local,
+                                                 loc_t *loc)
+{
+        struct crypt_inode_info *info;
+
+        /* calloc-style allocation: no additional memset() is needed */
+        info = GF_CALLOC(1, sizeof(*info), gf_crypt_mt_inode);
+        if (!info) {
+                gf_log ("crypt", GF_LOG_WARNING,
+                        "Can not allocate inode info");
+                goto nomem;
+        }
+#if DEBUG_CRYPT
+        info->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+        if (!info->loc) {
+                gf_log("crypt", GF_LOG_WARNING, "Can not allocate loc");
+                GF_FREE(info);
+                goto nomem;
+        }
+        if (loc_copy(info->loc, loc)){
+                GF_FREE(info->loc);
+                GF_FREE(info);
+                goto nomem;
+        }
+#endif /* DEBUG_CRYPT */
+
+        local->info = info;
+        return info;
+ nomem:
+        local->op_ret = -1;
+        local->op_errno = ENOMEM;
+        return NULL;
+}
+
+/*
+ * Release @info: with DEBUG_CRYPT drop the attached loc copy first,
+ * then wipe the structure and free it.
+ */
+static void free_inode_info(struct crypt_inode_info *info)
+{
+#if DEBUG_CRYPT
+        loc_wipe(info->loc);
+        GF_FREE(info->loc);
+#endif
+        /* scrub the contents before the memory is handed back */
+        memset(info, 0, sizeof *info);
+        GF_FREE(info);
+}
+
+/*
+ * Detach the crypt info from the context of @inode and free it.
+ * Always returns 0.
+ */
+int crypt_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t stored = 0;
+        struct crypt_inode_info *info;
+
+        if (inode_ctx_del (inode, this, &stored) != 0)
+                return 0;
+
+        info = (struct crypt_inode_info *)(long)stored;
+        free_inode_info(info);
+        return 0;
+}
+
+#if DEBUG_CRYPT
+/*
+ * Debug-only sanity checks of a completed read of @read bytes delivered
+ * in @vec/@count, against the stat data in @stbuf. Violations are only
+ * logged (DEBUG level); nothing is returned or modified.
+ */
+static void check_read(call_frame_t *frame, xlator_t *this, int32_t read,
+ struct iovec *vec, int32_t count, struct iatt *stbuf)
+{
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = get_object_cinfo(local->info);
+ struct avec_config *conf = &local->data_conf;
+ /* remainder of ia_size modulo cipher block size
+ (presumably the block size is a power of two - TODO confirm) */
+ uint32_t resid = stbuf->ia_size & (object_alg_blksize(object) - 1);
+
+ if (read <= 0)
+ return;
+ /* op_ret must match the total length of the returned vector */
+ if (read != iovec_get_size(vec, count))
+ gf_log ("crypt", GF_LOG_DEBUG,
+ "op_ret differs from amount of read bytes");
+
+ /* padded algorithms must deliver whole cipher blocks */
+ if (object_alg_should_pad(object) && (read & (object_alg_blksize(object) - 1)))
+ gf_log ("crypt", GF_LOG_DEBUG,
+ "bad amount of read bytes (!= 0 mod(cblock size))");
+
+ /* the read must not go past ia_size rounded up to a cipher block */
+ if (conf->aligned_offset + read >
+ stbuf->ia_size + (resid ? object_alg_blksize(object) - resid : 0))
+ gf_log ("crypt", GF_LOG_DEBUG,
+ "bad amount of read bytes (too large))");
+
+}
+
+#define PT_BYTES_TO_DUMP (32)
+/*
+ * Debug helper: log up to PT_BYTES_TO_DUMP leading bytes of the first
+ * component of @avec as a string. Does nothing when @avec is NULL.
+ */
+static void dump_plain_text(crypt_local_t *local, struct iovec *avec)
+{
+        size_t to_dump;
+        char str[PT_BYTES_TO_DUMP + 1];
+
+        if (!avec)
+                return;
+        to_dump = avec->iov_len;
+        if (to_dump > PT_BYTES_TO_DUMP)
+                to_dump = PT_BYTES_TO_DUMP;
+        memcpy(str, avec->iov_base, to_dump);
+        /*
+         * Terminate with NUL. The previous code stored the digit '0'
+         * here, which left the buffer unterminated for "%s".
+         */
+        str[to_dump] = '\0';
+        gf_log("crypt", GF_LOG_DEBUG, "Read file: %s", str);
+}
+
+/*
+ * Data config invariant: the number of vector components equals the
+ * number of full blocks plus one for each of head and tail, if present.
+ */
+static int32_t data_conf_invariant(struct avec_config *conf)
+{
+        return ((has_head_block(conf) ? 1 : 0) +
+                (has_tail_block(conf) ? 1 : 0) +
+                conf->nr_full_blocks) == conf->acount;
+}
+
+/*
+ * Hole config invariant: the pool holds one block for each of head,
+ * tail and "full blocks", whichever of them are present.
+ */
+static int32_t hole_conf_invariant(struct avec_config *conf)
+{
+        return ((has_head_block(conf) ? 1 : 0) +
+                (has_tail_block(conf) ? 1 : 0) +
+                (has_full_blocks(conf) ? 1 : 0)) == conf->blocks_in_pool;
+}
+
+/*
+ * Debug helper: verify the invariant matching the type of @conf and
+ * log a DEBUG message when it does not hold. A config of unknown type
+ * is always reported as bad.
+ */
+static void crypt_check_conf(struct avec_config *conf)
+{
+        const char *kind = "unknown";
+        int32_t ok = 0;
+
+        if (conf->type == DATA_ATOM) {
+                kind = "data";
+                ok = data_conf_invariant(conf);
+        } else if (conf->type == HOLE_ATOM) {
+                kind = "hole";
+                ok = hole_conf_invariant(conf);
+        }
+        if (ok)
+                return;
+        gf_log("crypt", GF_LOG_DEBUG, "bad %s conf", kind);
+}
+
+/*
+ * Debug-only check that buf->ia_size equals the file size tracked in
+ * frame->local rounded up to the cipher block size. A mismatch is only
+ * logged at DEBUG level.
+ */
+static void check_buf(call_frame_t *frame, xlator_t *this, struct iatt *buf)
+{
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+ uint64_t local_file_size;
+
+ /* pick the size to compare against, depending on the operation */
+ switch(local->fop) {
+ case GF_FOP_FTRUNCATE:
+ return;
+ case GF_FOP_WRITE:
+ local_file_size = local->new_file_size;
+ break;
+ case GF_FOP_READ:
+ /* reads issued by the crypt translator itself are not checked */
+ if (parent_is_crypt_xlator(frame, this))
+ return;
+ local_file_size = local->cur_file_size;
+ break;
+ default:
+ gf_log("crypt", GF_LOG_DEBUG, "bad file operation");
+ return;
+ }
+ if (buf->ia_size != round_up(local_file_size,
+ object_alg_blksize(object)))
+ gf_log("crypt", GF_LOG_DEBUG,
+ "bad ia_size in buf (%llu), should be %llu",
+ (unsigned long long)buf->ia_size,
+ (unsigned long long)round_up(local_file_size,
+ object_alg_blksize(object)));
+}
+
+#else
+#define check_read(frame, this, op_ret, vec, count, stbuf) noop
+#define dump_plain_text(local, avec) noop
+#define crypt_check_conf(conf) noop
+#define check_buf(frame, this, buf) noop
+#endif /* DEBUG_CRYPT */
+
+/*
+ * Pre-conditions:
+ * @vec represents a ciphertext of expanded size and
+ * aligned offset.
+ *
+ * Compound a temporal vector @avec with block-aligned
+ * components, decrypt and fix it up to represent a chunk
+ * of data corresponding to the original size and offset.
+ * Pass the result to the next translator.
+ */
+/*
+ * readv callback of the child translator: record the result in
+ * frame->local, build the block-aligned vector, decrypt it and trim it
+ * down to the range originally requested. Completion is always
+ * signalled through put_one_call_readv(). Returns 0.
+ */
+int32_t crypt_readv_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = &local->data_conf;
+ struct object_cipher_info *object = &local->info->cinfo;
+
+ struct iovec *avec;
+ uint32_t i;
+ uint32_t to_vec;
+ uint32_t to_user;
+
+ /* debug-only checks; both expand to "noop" without DEBUG_CRYPT */
+ check_buf(frame, this, stbuf);
+ check_read(frame, this, op_ret, vec, count, stbuf);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->iobref = iobref_ref(iobref);
+
+ /* report the file size tracked in local instead of the one in stbuf */
+ local->buf = *stbuf;
+ local->buf.ia_size = local->cur_file_size;
+
+ /* failure or nothing read: pass the result through as is */
+ if (op_ret <= 0 || count == 0 || vec[0].iov_len == 0)
+ goto put_one_call;
+
+ /* read starts at or beyond EOF: report zero bytes */
+ if (conf->orig_offset >= local->cur_file_size) {
+ local->op_ret = 0;
+ goto put_one_call;
+ }
+ /*
+ * correct config params with real file size
+ * and actual amount of bytes read
+ */
+ set_config_offsets(frame, this,
+ conf->orig_offset, op_ret, DATA_ATOM, 0);
+
+ if (conf->orig_offset + conf->orig_size > local->cur_file_size)
+ conf->orig_size = local->cur_file_size - conf->orig_offset;
+ /*
+ * calculate amount of data to be returned
+ * to user.
+ */
+ to_user = op_ret;
+ if (conf->aligned_offset + to_user <= conf->orig_offset) {
+ gf_log(this->name, GF_LOG_WARNING, "Incomplete read");
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto put_one_call;
+ }
+ /*
+ * NOTE(review): the check above implies aligned_offset <=
+ * orig_offset, so (aligned_offset - orig_offset) is not positive
+ * and this subtraction enlarges to_user; the same expression is
+ * applied to avec[0] below. Confirm the intended sign against
+ * set_config_offsets()/set_config_avec_data().
+ */
+ to_user -= (conf->aligned_offset - conf->orig_offset);
+
+ if (to_user > conf->orig_size)
+ to_user = conf->orig_size;
+ local->rw_count = to_user;
+
+ op_errno = set_config_avec_data(this, local,
+ conf, object, vec, count);
+ if (op_errno) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto put_one_call;
+ }
+ avec = conf->avec;
+#if DEBUG_CRYPT
+ if (conf->off_in_tail != 0 &&
+ conf->off_in_tail < object_alg_blksize(object) &&
+ object_alg_should_pad(object))
+ gf_log(this->name, GF_LOG_DEBUG, "Bad offset in tail %d",
+ conf->off_in_tail);
+ if (iovec_get_size(vec, count) != 0 &&
+ in_same_lblock(conf->orig_offset + iovec_get_size(vec, count) - 1,
+ local->cur_file_size - 1,
+ object_alg_blkbits(object))) {
+ gf_log(this->name, GF_LOG_DEBUG, "Compound last cblock");
+ dump_cblock(this,
+ (unsigned char *)(avec[conf->acount - 1].iov_base) +
+ avec[conf->acount - 1].iov_len - object_alg_blksize(object));
+ dump_cblock(this,
+ (unsigned char *)(vec[count - 1].iov_base) +
+ vec[count - 1].iov_len - object_alg_blksize(object));
+ }
+#endif
+ decrypt_aligned_iov(object, avec,
+ conf->acount, conf->aligned_offset);
+ /*
+ * pass proper plain data to user
+ */
+ avec[0].iov_base += (conf->aligned_offset - conf->orig_offset);
+ avec[0].iov_len -= (conf->aligned_offset - conf->orig_offset);
+
+ /* shrink the components so that they sum up to to_user bytes */
+ to_vec = to_user;
+ for (i = 0; i < conf->acount; i++) {
+ if (avec[i].iov_len > to_vec)
+ avec[i].iov_len = to_vec;
+ to_vec -= avec[i].iov_len;
+ }
+ put_one_call:
+ put_one_call_readv(frame, this);
+ return 0;
+}
+
+/*
+ * fgetxattr callback on the read path: take the regular file size from
+ * @dict (key FSIZE_XATTR_PREFIX), store it in local->cur_file_size and
+ * wind the actual readv for the aligned, expanded range.
+ * On failure the error is recorded in local and the read is completed
+ * via put_one_call_readv(). Returns 0.
+ */
+static int32_t do_readv(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict,
+ dict_t *xdata)
+{
+ data_t *data;
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto error;
+ /*
+ * extract regular file size
+ */
+ data = dict_get(dict, FSIZE_XATTR_PREFIX);
+ if (!data) {
+ gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+ op_errno = EIO;
+ goto error;
+ }
+ local->cur_file_size = data_to_uint64(data);
+
+ /* one call is taken for the readv below; crypt_readv_cbk puts it */
+ get_one_call(frame);
+ STACK_WIND(frame,
+ crypt_readv_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readv,
+ local->fd,
+ /*
+ * FIXME: read amount can be reduced
+ */
+ local->data_conf.expanded_size,
+ local->data_conf.aligned_offset,
+ local->flags,
+ local->xdata);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ /* take a call just to hand it to the common completion path */
+ get_one_call(frame);
+ put_one_call_readv(frame, this);
+ return 0;
+}
+
+/*
+ * finodelk callback on the read path. Once the lock is granted, ask the
+ * child for the FSIZE_XATTR_PREFIX xattr (continued in do_readv()).
+ * If locking failed, drop the references held in local and unwind the
+ * readv with the error. Returns 0.
+ */
+static int32_t crypt_readv_finodelk_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto error;
+ /*
+ * An access has been granted,
+ * retrieve file size
+ */
+ STACK_WIND(frame,
+ do_readv,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ local->fd,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ error:
+ /* no lock is held here, so unwind directly */
+ fd_unref(local->fd);
+ if (local->xdata)
+ dict_unref(local->xdata);
+ STACK_UNWIND_STRICT(readv,
+ frame,
+ -1,
+ op_errno,
+ NULL,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+ return 0;
+}
+
+/*
+ * fstat callback used for zero-sized reads: save the stat result in
+ * local->buf and wind a getxattr for FSIZE_XATTR_PREFIX with
+ * load_file_size() as callback. On fstat failure the readv is unwound
+ * with the error. Returns 0.
+ */
+static int32_t readv_trivial_completion(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *buf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "stat failed (%d)", op_errno);
+ goto error;
+ }
+ local->buf = *buf;
+ /*
+ * NOTE(review): crypt_readv() jumps to its "trivial" label before
+ * local->fd is set and never fills local->loc - confirm that
+ * local->loc is valid here.
+ */
+ STACK_WIND(frame,
+ load_file_size,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ local->loc,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno,
+ NULL, 0, NULL, NULL, NULL);
+ return 0;
+}
+
+/*
+ * readv entry point of the crypt translator.
+ *
+ * - zero-sized reads are completed via fstat (readv_trivial_completion);
+ * - when the parent frame belongs to the crypt translator itself (the
+ *   read part of a read-modify-write / read-prune-write sequence), the
+ *   file size is taken from @xdata and the read is wound at once;
+ * - otherwise a read lock on the whole file is requested first and the
+ *   sequence continues in crypt_readv_finodelk_cbk().
+ *
+ * On any setup error the fd reference taken here is dropped and the
+ * readv is unwound with -1 and the error code. Returns 0.
+ */
+int32_t crypt_readv(call_frame_t *frame,
+                    xlator_t *this,
+                    fd_t *fd,
+                    size_t size,
+                    off_t offset,
+                    uint32_t flags, dict_t *xdata)
+{
+        int32_t ret;
+        crypt_local_t *local;
+        struct crypt_inode_info *info;
+        struct gf_flock lock = {0, };
+
+#if DEBUG_CRYPT
+        gf_log("crypt", GF_LOG_DEBUG, "reading %d bytes from offset %llu",
+               (int)size, (long long)offset);
+        if (parent_is_crypt_xlator(frame, this))
+                gf_log("crypt", GF_LOG_DEBUG, "parent is crypt");
+#endif
+        local = crypt_alloc_local(frame, this, GF_FOP_READ);
+        if (!local) {
+                ret = ENOMEM;
+                goto error;
+        }
+        if (size == 0)
+                goto trivial;
+
+        local->fd = fd_ref(fd);
+        local->flags = flags;
+
+        info = local_get_inode_info(local, this);
+        if (info == NULL) {
+                ret = EINVAL;
+                goto error;
+        }
+        if (!object_alg_atomic(&info->cinfo)) {
+                ret = EINVAL;
+                goto error;
+        }
+        set_config_offsets(frame, this, offset, size,
+                           DATA_ATOM, 0);
+        if (parent_is_crypt_xlator(frame, this)) {
+                data_t *data;
+                /*
+                 * We are called by crypt_writev (or crypt_ftruncate)
+                 * to perform the "read" component of the read-modify-write
+                 * (or read-prune-write) sequence for some atom;
+                 *
+                 * don't ask for access:
+                 * it has already been acquired
+                 *
+                 * Retrieve current file size
+                 */
+                if (!xdata) {
+                        gf_log("crypt", GF_LOG_WARNING,
+                               "Regular file size hasn't been passed");
+                        ret = EIO;
+                        goto error;
+                }
+                data = dict_get(xdata, FSIZE_XATTR_PREFIX);
+                if (!data) {
+                        gf_log("crypt", GF_LOG_WARNING,
+                               "Regular file size not found");
+                        ret = EIO;
+                        goto error;
+                }
+                local->old_file_size =
+                        local->cur_file_size = data_to_uint64(data);
+
+                get_one_call(frame);
+                STACK_WIND(frame,
+                           crypt_readv_cbk,
+                           FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->readv,
+                           local->fd,
+                           /*
+                            * FIXME: read amount can be reduced
+                            */
+                           local->data_conf.expanded_size,
+                           local->data_conf.aligned_offset,
+                           flags,
+                           NULL);
+                return 0;
+        }
+        if (xdata)
+                local->xdata = dict_ref(xdata);
+
+        /* shared lock over the whole file */
+        lock.l_len = 0;
+        lock.l_start = 0;
+        lock.l_type = F_RDLCK;
+        lock.l_whence = SEEK_SET;
+
+        STACK_WIND(frame,
+                   crypt_readv_finodelk_cbk,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->finodelk,
+                   this->name,
+                   fd,
+                   F_SETLKW,
+                   &lock,
+                   NULL);
+        return 0;
+ trivial:
+        STACK_WIND(frame,
+                   readv_trivial_completion,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fstat,
+                   fd,
+                   NULL);
+        return 0;
+ error:
+        /*
+         * Drop the fd reference on every error exit. Previously the two
+         * error exits of the parent-is-crypt branch skipped this and
+         * leaked the reference.
+         */
+        if (local && local->fd) {
+                fd_unref(local->fd);
+                local->fd = NULL;
+        }
+        STACK_UNWIND_STRICT(readv,
+                            frame,
+                            -1,
+                            ret,
+                            NULL,
+                            0,
+                            NULL,
+                            NULL,
+                            NULL);
+        return 0;
+}
+
+/*
+ * Fill the per-iteration write parameters of frame->local: the padded
+ * offset and size as passed in, the unpadded ones as reported by @atom,
+ * and the resulting file size. The on-disk file size is flagged for
+ * update only when the unpadded range ends beyond the current size.
+ */
+void set_local_io_params_writev(call_frame_t *frame,
+                                struct object_cipher_info *object,
+                                struct rmw_atom *atom,
+                                off_t io_offset,
+                                uint32_t io_size)
+{
+        crypt_local_t *local = frame->local;
+
+        local->io_offset = io_offset;
+        local->io_size = io_size;
+
+        local->io_offset_nopad =
+                atom->offset_at(frame, object) + atom->offset_in(frame, object);
+        gf_log("crypt", GF_LOG_DEBUG,
+               "set nopad offset to %llu",
+               (unsigned long long)local->io_offset_nopad);
+
+        local->io_size_nopad = atom->io_size_nopad(frame, object);
+        gf_log("crypt", GF_LOG_DEBUG,
+               "set nopad size to %llu",
+               (unsigned long long)local->io_size_nopad);
+
+        /*
+         * NOTE: eof_padding_size is 0 for all full atoms;
+         * For head and tail atoms it will be set up at rmw_partial block()
+         */
+        local->update_disk_file_size = 0;
+        local->new_file_size = local->cur_file_size;
+
+        /* the write stays within the file: nothing else to update */
+        if (local->io_offset_nopad + local->io_size_nopad <=
+            local->cur_file_size)
+                return;
+
+        local->new_file_size = local->io_offset_nopad + local->io_size_nopad;
+        gf_log("crypt", GF_LOG_DEBUG,
+               "set new file size to %llu",
+               (unsigned long long)local->new_file_size);
+        local->update_disk_file_size = 1;
+}
+
+/*
+ * Set up EOF padding, new file size and the "update disk file size"
+ * flag in frame->local for a truncate to data_conf.orig_offset.
+ */
+void set_local_io_params_ftruncate(call_frame_t *frame,
+                                   struct object_cipher_info *object)
+{
+        uint32_t resid;
+        crypt_local_t *local = frame->local;
+        struct avec_config *conf = &local->data_conf;
+
+        resid = conf->orig_offset & (object_alg_blksize(object) - 1);
+        if (resid == 0) {
+                /*
+                 * block-aligned truncate:
+                 * file size will be updated
+                 * in this ->ftruncate stack
+                 */
+                local->eof_padding_size = 0;
+                local->new_file_size = conf->orig_offset;
+                local->update_disk_file_size = 1;
+                return;
+        }
+        /*
+         * unaligned truncate:
+         * file size will be updated
+         * in the ->writev() stack,
+         * when submitting file tail
+         */
+        local->eof_padding_size = object_alg_blksize(object) - resid;
+        local->new_file_size = conf->aligned_offset;
+        local->update_disk_file_size = 0;
+}
+
+/* Submit the head atom of the active config as a partial block. */
+static inline void submit_head(call_frame_t *frame, xlator_t *this)
+{
+        submit_partial(frame, this,
+                       ((crypt_local_t *)frame->local)->fd, HEAD_ATOM);
+}
+
+/* Submit the tail atom of the active config as a partial block. */
+static inline void submit_tail(call_frame_t *frame, xlator_t *this)
+{
+        submit_partial(frame, this,
+                       ((crypt_local_t *)frame->local)->fd, TAIL_ATOM);
+}
+
+/*
+ * Submit the next piece of the hole config. Hole conversion always
+ * means an appended write, hence it is done in ordered fashion.
+ */
+static void submit_hole(call_frame_t *frame, xlator_t *this)
+{
+        do_ordered_submit(frame, this, HOLE_ATOM);
+}
+
+/*
+ * Submit the data config: step by step in ordered mode; otherwise a
+ * warning is logged and all atoms are submitted in parallel after the
+ * matching number of calls has been taken.
+ */
+static void submit_data(call_frame_t *frame, xlator_t *this)
+{
+        if (!is_ordered_mode(frame)) {
+                gf_log("crypt", GF_LOG_WARNING, "Bad submit mode");
+                get_nr_calls(frame, nr_calls_data(frame));
+                do_parallel_submit(frame, this, DATA_ATOM);
+                return;
+        }
+        do_ordered_submit(frame, this, DATA_ATOM);
+}
+
+/*
+ * helpers called by writev_cbk, ftruncate_cbk in ordered mode
+ */
+
+/* A hole has to be submitted iff a hole vector has been set up. */
+static inline int32_t should_submit_hole(crypt_local_t *local)
+{
+        return local->hole_conf.avec != NULL;
+}
+
+/*
+ * Tell whether more components of the hole config are to be submitted.
+ *
+ * For a write with a hole tail block the last component is skipped:
+ * that part of the hole fits into a data block and will be converted
+ * as a gap filled by zeros in the data head block.
+ */
+static inline int32_t should_resume_submit_hole(crypt_local_t *local)
+{
+        struct avec_config *hole = &local->hole_conf;
+
+        if (local->fop != GF_FOP_WRITE || !has_tail_block(hole))
+                return hole->cursor < hole->acount;
+
+        return hole->cursor < hole->acount - 1;
+}
+
+/*
+ * In ordered mode: tell whether components of the data config are
+ * still pending. Parallel writes are never resumed.
+ */
+static inline int32_t should_resume_submit_data(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+        struct avec_config *data = &local->data_conf;
+
+        if (!is_ordered_mode(frame))
+                return 0;
+
+        return data->cursor < data->acount;
+}
+
+/* Data follows the hole iff a data vector has been set up. */
+static inline int32_t should_submit_data_after_hole(crypt_local_t *local)
+{
+        struct avec_config *data = &local->data_conf;
+
+        return data->avec != NULL;
+}
+
+/*
+ * Save @prebuf and @postbuf in frame->local, replacing their ia_size
+ * with the current and the new file size tracked in local; then make
+ * the new size the current one.
+ * Both @prebuf and @postbuf are dereferenced and must not be NULL.
+ */
+static void update_local_file_params(call_frame_t *frame,
+                                     xlator_t *this,
+                                     struct iatt *prebuf,
+                                     struct iatt *postbuf)
+{
+        crypt_local_t *local = frame->local;
+
+        check_buf(frame, this, postbuf);
+
+        local->prebuf = *prebuf;
+        local->prebuf.ia_size = local->cur_file_size;
+
+        local->postbuf = *postbuf;
+        local->postbuf.ia_size = local->new_file_size;
+
+        local->cur_file_size = local->new_file_size;
+}
+
+/*
+ * writev callback of one write iteration issued on behalf of writev
+ * (also used as crypt_writev_cbk). Records the result, accounts the
+ * written bytes and, in ordered mode, submits the next piece of hole
+ * or data. The call taken for this iteration is always put at the end.
+ * Returns 0.
+ */
+static int32_t end_writeback_writev(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret <= 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "writev iteration failed");
+ goto put_one_call;
+ }
+ /*
+ * op_ret includes paddings (atom's head, atom's tail and EOF)
+ */
+ if (op_ret < local->io_size) {
+ /* short write: local->op_ret keeps the raw op_ret set above */
+ gf_log(this->name, GF_LOG_WARNING,
+ "Incomplete writev iteration");
+ goto put_one_call;
+ }
+ /* do not report the EOF padding as written data */
+ op_ret -= local->eof_padding_size;
+ local->op_ret = op_ret;
+
+ update_local_file_params(frame, this, prebuf, postbuf);
+
+ if (data_write_in_progress(local)) {
+
+ /* rw_count accumulates the written bytes under its own lock */
+ LOCK(&local->rw_count_lock);
+ local->rw_count += op_ret;
+ UNLOCK(&local->rw_count_lock);
+
+ if (should_resume_submit_data(frame))
+ submit_data(frame, this);
+ }
+ else {
+ /*
+ * hole conversion is going on;
+ * don't take into account written zeros
+ */
+ if (should_resume_submit_hole(local))
+ submit_hole(frame, this);
+
+ else if (should_submit_data_after_hole(local))
+ submit_data(frame, this);
+ }
+ put_one_call:
+ put_one_call_writev(frame, this);
+ return 0;
+}
+
+#define crypt_writev_cbk end_writeback_writev
+
+#define HOLE_WRITE_CHUNK_BITS 12
+#define HOLE_WRITE_CHUNK_SIZE (1 << HOLE_WRITE_CHUNK_BITS)
+
+/*
+ * Convert hole of size @size at offset @off to
+ * zeros and prepare respective iovecs for submit.
+ * The hole lock should be held.
+ *
+ * Pre-conditions:
+ * @local->file_size is set and valid.
+ */
+int32_t prepare_for_submit_hole(call_frame_t *frame, xlator_t *this,
+                                uint64_t off, off_t size)
+{
+        crypt_local_t *local = frame->local;
+        struct object_cipher_info *object = &local->info->cinfo;
+        struct avec_config *hole = &local->hole_conf;
+        int32_t err;
+
+        /* offsets first (with gap setup), then the hole vector itself */
+        set_config_offsets(frame, this, off, size, HOLE_ATOM, 1);
+
+        err = set_config_avec_hole(this, local, hole, object, local->fop);
+        crypt_check_conf(hole);
+
+        return err;
+}
+
+/*
+ * Prepare for submit of @size bytes at offset @from, taken from
+ * @vec/@vec_count: set up the data config offsets (a gap is set up
+ * when @setup_gap is non-zero) and build the data vector.
+ *
+ * Returns the result of set_config_avec_data() (0 on success).
+ */
+int32_t prepare_for_submit_data(call_frame_t *frame, xlator_t *this,
+                                off_t from, int32_t size, struct iovec *vec,
+                                int32_t vec_count, int32_t setup_gap)
+{
+        /* signed, to match the return type and prepare_for_submit_hole() */
+        int32_t ret;
+        crypt_local_t *local = frame->local;
+        struct object_cipher_info *object = &local->info->cinfo;
+
+        set_config_offsets(frame, this, from, size,
+                           DATA_ATOM, setup_gap);
+
+        ret = set_config_avec_data(this, local,
+                                   &local->data_conf, object, vec, vec_count);
+        crypt_check_conf(&local->data_conf);
+
+        return ret;
+}
+
+/*
+ * Free the vector @avec together with the @pool array. Nothing is done
+ * when @avec is NULL. @blocks_in_pool is not used: only the two arrays
+ * themselves are released here, not the blocks @pool points to.
+ */
+static void free_avec(struct iovec *avec,
+                      char **pool, int blocks_in_pool)
+{
+        if (avec != NULL) {
+                GF_FREE(pool);
+                GF_FREE(avec);
+        }
+}
+
+/* Free the vector and the pool array of the data config of @local. */
+static void free_avec_data(crypt_local_t *local)
+{
+        /* plain call: "return <expr>;" is not valid ISO C in a void function */
+        free_avec(local->data_conf.avec,
+                  local->data_conf.pool,
+                  local->data_conf.blocks_in_pool);
+}
+
+/* Free the vector and the pool array of the hole config of @local. */
+static void free_avec_hole(crypt_local_t *local)
+{
+        /* plain call: "return <expr>;" is not valid ISO C in a void function */
+        free_avec(local->hole_conf.avec,
+                  local->hole_conf.pool,
+                  local->hole_conf.blocks_in_pool);
+}
+
+
+/*
+ * Make @dtype the active setup and submit every part its config has -
+ * head, full blocks, tail - without waiting for completions in between.
+ */
+static void do_parallel_submit(call_frame_t *frame, xlator_t *this,
+                               atom_data_type dtype)
+{
+        crypt_local_t *local = frame->local;
+        struct avec_config *active;
+
+        local->active_setup = dtype;
+        active = conf_by_type(frame, dtype);
+
+        if (has_head_block(active)) {
+                submit_head(frame, this);
+        }
+        if (has_full_blocks(active)) {
+                submit_full(frame, this);
+        }
+        if (has_tail_block(active)) {
+                submit_tail(frame, this);
+        }
+}
+
+/*
+ * Make @dtype the active setup and submit exactly one pending part of
+ * its config - head, else full blocks, else tail - taking one call for
+ * it. If nothing is pending, only a debug message is logged.
+ */
+static void do_ordered_submit(call_frame_t *frame, xlator_t *this,
+                              atom_data_type dtype)
+{
+        crypt_local_t *local = frame->local;
+        struct avec_config *active;
+
+        local->active_setup = dtype;
+        active = conf_by_type(frame, dtype);
+
+        if (should_submit_head_block(active)) {
+                get_one_call_nolock(frame);
+                submit_head(frame, this);
+                return;
+        }
+        if (should_submit_full_block(active)) {
+                get_one_call_nolock(frame);
+                submit_full(frame, this);
+                return;
+        }
+        if (should_submit_tail_block(active)) {
+                get_one_call_nolock(frame);
+                submit_tail(frame, this);
+                return;
+        }
+        gf_log("crypt", GF_LOG_DEBUG,
+               "nothing has been submitted in ordered mode");
+}
+
+/*
+ * fgetxattr callback on the write path: take the regular file size
+ * from @dict (key FSIZE_XATTR_PREFIX), set up a hole config when the
+ * write starts beyond EOF, and start submitting (hole first, if any).
+ *
+ * Every failure - including a failed fgetxattr itself - is recorded in
+ * local->op_ret / local->op_errno before the write is completed through
+ * put_one_call_writev(). Returns 0.
+ */
+static int32_t do_writev(call_frame_t *frame,
+                         void *cookie,
+                         xlator_t *this,
+                         int32_t op_ret,
+                         int32_t op_errno,
+                         dict_t *dict,
+                         dict_t *xdata)
+{
+        data_t *data;
+        crypt_local_t *local = frame->local;
+        struct object_cipher_info *object = &local->info->cinfo;
+
+        /* same guard as in do_readv(): don't touch @dict on failure */
+        if (op_ret < 0)
+                goto error;
+        /*
+         * extract regular file size
+         */
+        data = dict_get(dict, FSIZE_XATTR_PREFIX);
+        if (!data) {
+                gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+                op_errno = EIO;
+                goto error;
+        }
+        local->old_file_size = local->cur_file_size = data_to_uint64(data);
+
+        set_gap_at_end(frame, object, &local->data_conf, DATA_ATOM);
+
+        if (local->cur_file_size < local->data_conf.orig_offset) {
+                /*
+                 * Set up hole config
+                 */
+                op_errno = prepare_for_submit_hole(frame,
+                                this,
+                                local->cur_file_size,
+                                local->data_conf.orig_offset - local->cur_file_size);
+                if (op_errno)
+                        goto error;
+        }
+        if (should_submit_hole(local))
+                submit_hole(frame, this);
+        else
+                submit_data(frame, this);
+        return 0;
+ error:
+        /*
+         * Record the failure in local. Previously only the local copies
+         * of op_ret/op_errno were set on the "size not found" path, so
+         * the result stored by the finodelk callback stayed in place.
+         */
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+
+        get_one_call_nolock(frame);
+        put_one_call_writev(frame, this);
+        return 0;
+}
+
+/*
+ * finodelk callback on the write path. The lock result is stored in
+ * local; on success the FSIZE_XATTR_PREFIX xattr is requested from the
+ * child (continued in do_writev()), on failure the write is completed
+ * through put_one_call_writev(). Returns 0.
+ */
+static int32_t crypt_writev_finodelk_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto error;
+ /*
+ * An access has been granted,
+ * retrieve file size first
+ */
+ STACK_WIND(frame,
+ do_writev,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ local->fd,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ error:
+ /* take a call just to hand it to the common completion path */
+ get_one_call_nolock(frame);
+ put_one_call_writev(frame, this);
+ return 0;
+}
+
+/*
+ * fstat callback used for zero-sized writes: store the result in local
+ * and complete the write through put_one_call_writev().
+ *
+ * @buf is copied into local->prebuf/postbuf only when fstat succeeded
+ * and a buffer was actually passed; on failure the callback may be
+ * invoked without one, so it must not be dereferenced blindly.
+ * Returns 0.
+ */
+static int32_t writev_trivial_completion(call_frame_t *frame,
+                                         void *cookie,
+                                         xlator_t *this,
+                                         int32_t op_ret,
+                                         int32_t op_errno,
+                                         struct iatt *buf,
+                                         dict_t *dict)
+{
+        crypt_local_t *local = frame->local;
+
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+
+        if (op_ret >= 0 && buf != NULL) {
+                local->prebuf = *buf;
+                local->postbuf = *buf;
+        }
+        /* nothing was written: both sizes are the current file size */
+        local->prebuf.ia_size = local->cur_file_size;
+        local->postbuf.ia_size = local->cur_file_size;
+
+        get_one_call(frame);
+        put_one_call_writev(frame, this);
+        return 0;
+}
+
+/*
+ * writev entry point of the crypt translator.
+ *
+ * - zero-sized writes are completed via fstat
+ *   (writev_trivial_completion);
+ * - when the parent frame belongs to the crypt translator itself, the
+ *   file size is taken from @xdata and data is submitted at once;
+ * - otherwise a write lock on the whole file is requested first and the
+ *   sequence continues in crypt_writev_finodelk_cbk().
+ *
+ * On a setup error the references taken here are dropped and the
+ * writev is unwound with -1 and the error code. Returns 0.
+ */
+int crypt_writev(call_frame_t *frame,
+                 xlator_t *this,
+                 fd_t *fd,
+                 struct iovec *vec,
+                 int32_t count,
+                 off_t offset,
+                 uint32_t flags,
+                 struct iobref *iobref,
+                 dict_t *xdata)
+{
+        int32_t ret;
+        crypt_local_t *local;
+        struct crypt_inode_info *info;
+        struct gf_flock lock = {0, };
+#if DEBUG_CRYPT
+        gf_log ("crypt", GF_LOG_DEBUG, "writing %d bytes from offset %llu",
+                (int)iovec_get_size(vec, count), (long long)offset);
+#endif
+        local = crypt_alloc_local(frame, this, GF_FOP_WRITE);
+        if (!local) {
+                ret = ENOMEM;
+                goto error;
+        }
+        local->fd = fd_ref(fd);
+
+        if (iobref)
+                local->iobref = iobref_ref(iobref);
+        /*
+         * to update real file size on the server
+         */
+        local->xattr = dict_new();
+        if (!local->xattr) {
+                ret = ENOMEM;
+                goto error;
+        }
+        local->flags = flags;
+
+        info = local_get_inode_info(local, this);
+        if (info == NULL) {
+                ret = EINVAL;
+                goto error;
+        }
+        if (!object_alg_atomic(&info->cinfo)) {
+                ret = EINVAL;
+                goto error;
+        }
+        if (iovec_get_size(vec, count) == 0)
+                goto trivial;
+
+        ret = prepare_for_submit_data(frame, this, offset,
+                                      iovec_get_size(vec, count),
+                                      vec, count, 0 /* don't setup gap
+                                                       in tail: we don't
+                                                       know file size yet */);
+        if (ret)
+                goto error;
+
+        if (parent_is_crypt_xlator(frame, this)) {
+                data_t *data;
+                /*
+                 * we are called by shrinking crypt_ftruncate(),
+                 * which doesn't perform hole conversion;
+                 *
+                 * don't ask for access:
+                 * it has already been acquired
+                 */
+
+                /*
+                 * extract file size
+                 */
+                if (!xdata) {
+                        gf_log("crypt", GF_LOG_WARNING,
+                               "Regular file size hasn't been passed");
+                        ret = EIO;
+                        goto error;
+                }
+                data = dict_get(xdata, FSIZE_XATTR_PREFIX);
+                if (!data) {
+                        gf_log("crypt", GF_LOG_WARNING,
+                               "Regular file size not found");
+                        ret = EIO;
+                        goto error;
+                }
+                local->old_file_size =
+                        local->cur_file_size = data_to_uint64(data);
+
+                submit_data(frame, this);
+                return 0;
+        }
+        if (xdata)
+                local->xdata = dict_ref(xdata);
+        /*
+         * lock the file and retrieve its size
+         */
+        lock.l_len = 0;
+        lock.l_start = 0;
+        lock.l_type = F_WRLCK;
+        lock.l_whence = SEEK_SET;
+
+        STACK_WIND(frame,
+                   crypt_writev_finodelk_cbk,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->finodelk,
+                   this->name,
+                   fd,
+                   F_SETLKW,
+                   &lock,
+                   NULL);
+        return 0;
+ trivial:
+        STACK_WIND(frame,
+                   writev_trivial_completion,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fstat,
+                   fd,
+                   NULL);
+        return 0;
+ error:
+        if (local) {
+                if (local->fd)
+                        fd_unref(local->fd);
+                if (local->iobref)
+                        iobref_unref(local->iobref);
+                if (local->xdata)
+                        dict_unref(local->xdata);
+                if (local->xattr)
+                        dict_unref(local->xattr);
+                /*
+                 * local->info is NOT freed here: it was looked up in
+                 * the inode context (local_get_inode_info) and is still
+                 * stored there; it gets released by crypt_forget().
+                 * Freeing it on this path would leave a dangling
+                 * pointer in the inode context.
+                 */
+        }
+
+        STACK_UNWIND_STRICT(writev, frame, -1, ret, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Set up the data config offsets for a prune at @offset: zero bytes of
+ * data and no gap setup (since we prune, there is no gap in the tail
+ * to bring up to date). Always returns 0.
+ */
+int32_t prepare_for_prune(call_frame_t *frame, xlator_t *this, uint64_t offset)
+{
+        set_config_offsets(frame, this, offset, 0, DATA_ATOM, 0);
+        return 0;
+}
+
+/*
+ * Finish the read-prune-modify sequence
+ *
+ * Can be invoked as
+ * 1) ->ftruncate_cbk() for cblock-aligned, or trivial prune
+ * 2) ->writev_cbk() for non-cblock-aligned prune
+ */
+
+/*
+ * Final callback of a prune: store the result in local, refresh the
+ * file parameters when the operation succeeded, and complete the
+ * ftruncate through put_one_call_ftruncate(). Returns 0.
+ *
+ * @prebuf/@postbuf are looked at only on success: on failure they can
+ * be NULL (crypt_writev(), which this callback is wound to, unwinds
+ * its errors with NULL buffers).
+ */
+static int32_t prune_complete(call_frame_t *frame,
+                              void *cookie,
+                              xlator_t *this,
+                              int32_t op_ret,
+                              int32_t op_errno,
+                              struct iatt *prebuf,
+                              struct iatt *postbuf,
+                              dict_t *xdata)
+{
+        crypt_local_t *local = frame->local;
+
+        local->op_ret = op_ret;
+        local->op_errno = op_errno;
+
+        if (op_ret >= 0)
+                update_local_file_params(frame, this, prebuf, postbuf);
+
+        put_one_call_ftruncate(frame, this);
+        return 0;
+}
+
+/*
+ * This is called as ->ftruncate_cbk()
+ *
+ * Perform the "write" component of the
+ * read-prune-write sequence.
+ *
+ * submit the rest of the file
+ */
+/*
+ * ftruncate callback of a non-aligned prune: the file has been cut at
+ * the aligned offset, now write back the saved head of the last block
+ * by winding writev to the crypt translator itself, passing the current
+ * file size in a dict. prune_complete() finishes the sequence.
+ *
+ * Any failure (of the ftruncate itself, of dict allocation or of
+ * setting the key) is recorded in local->op_ret / local->op_errno and
+ * the ftruncate is completed through put_one_call_ftruncate().
+ * Returns 0.
+ */
+static int32_t prune_submit_file_tail(call_frame_t *frame,
+                                      void *cookie,
+                                      xlator_t *this,
+                                      int32_t op_ret,
+                                      int32_t op_errno,
+                                      struct iatt *prebuf,
+                                      struct iatt *postbuf,
+                                      dict_t *xdata)
+{
+        crypt_local_t *local = frame->local;
+        struct avec_config *conf = &local->data_conf;
+        dict_t *dict;
+
+        /*
+         * The truncate failed: report it. Previously local->op_ret
+         * kept the (positive) result of the preceding read here.
+         */
+        if (op_ret < 0)
+                goto error;
+
+        if (local->xdata) {
+                dict_unref(local->xdata);
+                local->xdata = NULL;
+        }
+        if (xdata)
+                local->xdata = dict_ref(xdata);
+
+        dict = dict_new();
+        if (!dict) {
+                op_errno = ENOMEM;
+                goto error;
+        }
+
+        update_local_file_params(frame, this, prebuf, postbuf);
+        local->new_file_size = conf->orig_offset;
+
+        /*
+         * The rest of the file is a partial block and, hence,
+         * should be written via RMW sequence, so the crypt xlator
+         * does STACK_WIND to itself.
+         *
+         * Pass current file size to crypt_writev()
+         */
+        if (dict_set(dict,
+                     FSIZE_XATTR_PREFIX,
+                     data_from_uint64(local->cur_file_size))) {
+                gf_log("crypt", GF_LOG_WARNING,
+                       "can not set key to update file size");
+                dict_unref(dict);
+                /* report a real errno instead of dict_set()'s result */
+                op_errno = ENOMEM;
+                goto error;
+        }
+        gf_log("crypt", GF_LOG_DEBUG,
+               "passing current file size (%llu) to crypt_writev",
+               (unsigned long long)local->cur_file_size);
+        /*
+         * Padding will be filled with
+         * zeros by rmw_partial_block()
+         */
+        STACK_WIND(frame,
+                   prune_complete,
+                   this,
+                   this->fops->writev, /* crypt_writev */
+                   local->fd,
+                   &local->vec,
+                   1,
+                   conf->aligned_offset, /* offset to write from */
+                   0,
+                   local->iobref,
+                   dict);
+
+        dict_unref(dict);
+        return 0;
+ error:
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+
+        put_one_call_ftruncate(frame, this);
+        return 0;
+}
+
+/*
+ * This is called as a callback of ->writev() invoked in behalf
+ * of ftruncate(): it can be
+ * 1) ordered writes issued by hole conversion in the case of
+ * expanded truncate, or
+ * 2) an rmw partial data block issued by non-cblock-aligned
+ * prune.
+ */
+/*
+ * Record the result of one write iteration issued on behalf of
+ * ftruncate, refresh the file parameters on success and, while hole
+ * conversion is going on, submit the next piece of the hole. The call
+ * taken for this iteration is always put at the end. Returns 0.
+ */
+int32_t end_writeback_ftruncate(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ /*
+ * if nothing has been written,
+ * then it must be an error
+ */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ /* on failure the stat buffers are not looked at */
+ if (op_ret < 0)
+ goto put_one_call;
+
+ update_local_file_params(frame, this, prebuf, postbuf);
+
+ if (data_write_in_progress(local))
+ /* case (2) */
+ goto put_one_call;
+ /* case (1) */
+ if (should_resume_submit_hole(local))
+ submit_hole(frame, this);
+ /*
+ * case of hole, when we shouldn't resume
+ */
+ put_one_call:
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+/*
+ * The "prune" and "write" components of the
+ * read-prune-write sequence.
+ *
+ * Called as ->readv_cbk()
+ *
+ * Pre-conditions:
+ * @vec contains the latest atom of the file
+ * (plain text)
+ *
+ * The first conf->off_in_head bytes of what has been read are
+ * saved in local->vec, then the file is truncated at the aligned
+ * offset. On any failure the result is stored in local and the
+ * call is released via put_one_call_ftruncate().
+ */
+static int32_t prune_write(call_frame_t *frame,
+			   void *cookie,
+			   xlator_t *this,
+			   int32_t op_ret,
+			   int32_t op_errno,
+			   struct iovec *vec,
+			   int32_t count,
+			   struct iatt *stbuf,
+			   struct iobref *iobref,
+			   dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+	struct avec_config *conf = &local->data_conf;
+	size_t filled = 0;
+	int32_t idx;
+
+	local->op_ret = op_ret;
+	local->op_errno = op_errno;
+	if (op_ret == -1)
+		goto put_one_call;
+
+	/*
+	 * At first, uptodate head block: the read has to
+	 * provide at least off_in_head bytes
+	 */
+	if (iovec_get_size(vec, count) < conf->off_in_head) {
+		gf_log(this->name, GF_LOG_WARNING,
+		       "Failed to uptodate head block for prune");
+		local->op_ret = -1;
+		local->op_errno = EIO;
+		goto put_one_call;
+	}
+	local->vec.iov_len = conf->off_in_head;
+	local->vec.iov_base = GF_CALLOC(1, local->vec.iov_len,
+					gf_crypt_mt_data);
+	if (local->vec.iov_base == NULL) {
+		gf_log(this->name, GF_LOG_WARNING,
+		       "Failed to calloc head block for prune");
+		local->op_ret = -1;
+		local->op_errno = ENOMEM;
+		goto put_one_call;
+	}
+	/*
+	 * gather the head block from the components
+	 * of the read vector
+	 */
+	for (idx = 0; idx < count; idx++) {
+		size_t room = local->vec.iov_len - filled;
+		size_t chunk = vec[idx].iov_len;
+
+		if (chunk > room)
+			chunk = room;
+		memcpy((char *)local->vec.iov_base + filled,
+		       vec[idx].iov_base,
+		       chunk);
+		filled += chunk;
+		if (filled == local->vec.iov_len)
+			break;
+	}
+	/*
+	 * perform prune with aligned offset
+	 * (i.e. at this step we prune a bit
+	 * more than it is needed)
+	 */
+	STACK_WIND(frame,
+		   prune_submit_file_tail,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->ftruncate,
+		   local->fd,
+		   conf->aligned_offset,
+		   local->xdata);
+	return 0;
+ put_one_call:
+	put_one_call_ftruncate(frame, this);
+	return 0;
+}
+
+/*
+ * Perform a read-prune-write sequence.
+ *
+ * If the new size is cblock-aligned, then the file body is simply
+ * cut by ->ftruncate() of the child. Otherwise the latest atom is
+ * read via crypt_readv() and the sequence is continued by
+ * prune_write().
+ *
+ * Returns 0 if a request has been submitted, or a positive errno
+ * if nothing has been submitted. In the last case no call
+ * reference is held on the frame, so the caller can account the
+ * failure in the same way as for any other preparation failure.
+ * NOTE(review): this assumes that get_one_call_nolock() and
+ * put_one_call_ftruncate() maintain a counter of outstanding
+ * calls - confirm against their definitions.
+ */
+int32_t read_prune_write(call_frame_t *frame, xlator_t *this)
+{
+	int32_t ret = 0;
+	dict_t *dict = NULL;
+	crypt_local_t *local = frame->local;
+	struct avec_config *conf = &local->data_conf;
+	struct object_cipher_info *object = &local->info->cinfo;
+
+	set_local_io_params_ftruncate(frame, object);
+
+	if ((conf->orig_offset & (object_alg_blksize(object) - 1)) == 0) {
+		/*
+		 * cblock-aligned prune:
+		 * we don't need read and write components,
+		 * just cut file body
+		 */
+		gf_log("crypt", GF_LOG_DEBUG,
+		       "prune without RMW (at offset %llu",
+		       (unsigned long long)conf->orig_offset);
+
+		get_one_call_nolock(frame);
+		STACK_WIND(frame,
+			   prune_complete,
+			   FIRST_CHILD(this),
+			   FIRST_CHILD(this)->fops->ftruncate,
+			   local->fd,
+			   conf->orig_offset,
+			   local->xdata);
+		return 0;
+	}
+	gf_log("crypt", GF_LOG_DEBUG,
+	       "prune with RMW (at offset %llu",
+	       (unsigned long long)conf->orig_offset);
+	/*
+	 * We are about to perform the "read" component of the
+	 * read-prune-write sequence. It means that we need to
+	 * read encrypted data from disk and decrypt it.
+	 * So, the crypt translator does STACK_WIND to itself.
+	 *
+	 * Pass current file size to crypt_readv()
+	 */
+	dict = dict_new();
+	if (!dict) {
+		gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict");
+		ret = ENOMEM;
+		goto exit;
+	}
+	if (dict_set(dict,
+		     FSIZE_XATTR_PREFIX,
+		     data_from_uint64(local->cur_file_size))) {
+		gf_log("crypt", GF_LOG_WARNING, "Can not set dict");
+		/*
+		 * the status of dict_set() is not an errno value,
+		 * so don't hand it to the caller as op_errno
+		 */
+		ret = ENOMEM;
+		goto exit;
+	}
+	/*
+	 * Take the call reference only when the request is
+	 * really going to be submitted
+	 */
+	get_one_call_nolock(frame);
+	STACK_WIND(frame,
+		   prune_write,
+		   this,
+		   this->fops->readv, /* crypt_readv */
+		   local->fd,
+		   get_atom_size(object), /* bytes to read */
+		   conf->aligned_offset, /* offset to read from */
+		   0,
+		   dict);
+ exit:
+	if (dict)
+		dict_unref(dict);
+	return ret;
+}
+
+/*
+ * Shrink the file to @offset.
+ *
+ * File prune is more complicated than expand.
+ * First we need to read the latest atom to not lose info
+ * needed for proper update. Also we need to make sure that
+ * every component of read-prune-write sequence leaves data
+ * consistent.
+ *
+ * Non-cblock aligned prune is performed as read-prune-write
+ * sequence:
+ *
+ * 1) read the latest atom;
+ * 2) perform cblock-aligned prune;
+ * 3) issue a write request for the end-of-file.
+ *
+ * Returns the non-zero status of prepare_for_prune() if the
+ * preparation fails, otherwise the status of read_prune_write().
+ */
+int32_t prune_file(call_frame_t *frame, xlator_t *this, uint64_t offset)
+{
+	int32_t status = prepare_for_prune(frame, this, offset);
+
+	if (status != 0)
+		return status;
+	return read_prune_write(frame, this);
+}
+
+/*
+ * Grow the file up to @offset: the gap between the old file size
+ * and @offset is submitted as a hole.
+ *
+ * Returns 0 if the hole has been submitted, otherwise the non-zero
+ * status of prepare_for_submit_hole().
+ */
+int32_t expand_file(call_frame_t *frame, xlator_t *this,
+		    uint64_t offset)
+{
+	crypt_local_t *local = frame->local;
+	int32_t status;
+
+	status = prepare_for_submit_hole(frame, this,
+					 local->old_file_size,
+					 offset - local->old_file_size);
+	if (status == 0)
+		submit_hole(frame, this);
+	return status;
+}
+
+/*
+ * Called as ->fstat_cbk() for a "trivial" ftruncate, i.e. when
+ * the requested size is equal to the current file size.
+ *
+ * The stat data (if any) is reported as both, pre- and post-
+ * attributes, with the size replaced by the current file size
+ * kept in local.
+ */
+static int32_t ftruncate_trivial_completion(call_frame_t *frame,
+					    void *cookie,
+					    xlator_t *this,
+					    int32_t op_ret,
+					    int32_t op_errno,
+					    struct iatt *buf,
+					    dict_t *dict)
+{
+	crypt_local_t *local = frame->local;
+
+	local->op_ret = op_ret;
+	local->op_errno = op_errno;
+	/*
+	 * a failed fstat may come without stat data,
+	 * so don't dereference @buf blindly
+	 */
+	if (buf != NULL) {
+		local->prebuf = *buf;
+		local->postbuf = *buf;
+	}
+	local->prebuf.ia_size = local->cur_file_size;
+	local->postbuf.ia_size = local->cur_file_size;
+
+	get_one_call(frame);
+	put_one_call_ftruncate(frame, this);
+	return 0;
+}
+
+/*
+ * Called as ->fgetxattr_cbk(): the regular file size has been
+ * requested from the child. Depending on how the requested size
+ * (local->data_conf.orig_offset) relates to the current one,
+ * the file is pruned, expanded, or left as is ("trivial"
+ * ftruncate, which only needs fresh stat data).
+ *
+ * Only a negative @op_ret is treated as a failure of the
+ * fgetxattr, as in the other callbacks of this translator.
+ */
+static int32_t do_ftruncate(call_frame_t *frame,
+			    void *cookie,
+			    xlator_t *this,
+			    int32_t op_ret,
+			    int32_t op_errno,
+			    dict_t *dict,
+			    dict_t *xdata)
+{
+	data_t *data;
+	crypt_local_t *local = frame->local;
+
+	if (op_ret < 0)
+		goto error;
+	/*
+	 * extract regular file size
+	 */
+	data = dict_get(dict, FSIZE_XATTR_PREFIX);
+	if (!data) {
+		gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+		op_errno = EIO;
+		goto error;
+	}
+	local->old_file_size = local->cur_file_size = data_to_uint64(data);
+
+	if (local->data_conf.orig_offset == local->cur_file_size) {
+#if DEBUG_CRYPT
+		gf_log("crypt", GF_LOG_DEBUG,
+		       "trivial ftruncate (current file size %llu)",
+		       (unsigned long long)local->cur_file_size);
+#endif
+		goto trivial;
+	}
+	else if (local->data_conf.orig_offset < local->cur_file_size) {
+#if DEBUG_CRYPT
+		gf_log("crypt", GF_LOG_DEBUG, "prune from %llu to %llu",
+		       (unsigned long long)local->cur_file_size,
+		       (unsigned long long)local->data_conf.orig_offset);
+#endif
+		op_errno = prune_file(frame,
+				      this,
+				      local->data_conf.orig_offset);
+	}
+	else {
+#if DEBUG_CRYPT
+		gf_log("crypt", GF_LOG_DEBUG, "expand from %llu to %llu",
+		       (unsigned long long)local->cur_file_size,
+		       (unsigned long long)local->data_conf.orig_offset);
+#endif
+		op_errno = expand_file(frame,
+				       this,
+				       local->data_conf.orig_offset);
+	}
+	/* non-zero means that nothing has been submitted */
+	if (op_errno)
+		goto error;
+	return 0;
+ trivial:
+	STACK_WIND(frame,
+		   ftruncate_trivial_completion,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->fstat,
+		   local->fd,
+		   NULL);
+	return 0;
+ error:
+	/*
+	 * finish with ftruncate
+	 */
+	local->op_ret = -1;
+	local->op_errno = op_errno;
+
+	get_one_call_nolock(frame);
+	put_one_call_ftruncate(frame, this);
+	return 0;
+}
+
+/*
+ * Called as ->finodelk_cbk() of the lock request issued by
+ * crypt_ftruncate(). When the lock has been granted, the file
+ * size is requested and the work is continued by do_ftruncate();
+ * otherwise ftruncate is finished with the result of the lock
+ * request.
+ */
+static int32_t crypt_ftruncate_finodelk_cbk(call_frame_t *frame,
+					    void *cookie,
+					    xlator_t *this,
+					    int32_t op_ret,
+					    int32_t op_errno,
+					    dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+
+	local->op_ret = op_ret;
+	local->op_errno = op_errno;
+
+	if (op_ret >= 0) {
+		/*
+		 * An access has been granted,
+		 * retrieve file size first
+		 */
+		STACK_WIND(frame,
+			   do_ftruncate,
+			   FIRST_CHILD(this),
+			   FIRST_CHILD(this)->fops->fgetxattr,
+			   local->fd,
+			   FSIZE_XATTR_PREFIX,
+			   NULL);
+		return 0;
+	}
+	/* the lock hasn't been granted */
+	get_one_call_nolock(frame);
+	put_one_call_ftruncate(frame, this);
+	return 0;
+}
+
+/*
+ * ftruncate is performed in 2 steps:
+ * . receive file size;
+ * . expand or prune file.
+ *
+ * Both steps are performed under a write lock on the whole file,
+ * which is requested here; the work is continued by
+ * crypt_ftruncate_finodelk_cbk().
+ * If the preparation fails, the fop is unwound with ENOMEM
+ * (allocation failure) or EINVAL (no inode info, or the cipher
+ * algorithm of the object is not atomic).
+ */
+static int32_t crypt_ftruncate(call_frame_t *frame,
+			       xlator_t *this,
+			       fd_t *fd,
+			       off_t offset,
+			       dict_t *xdata)
+{
+	struct gf_flock lock = {0, };
+	struct crypt_inode_info *info;
+	crypt_local_t *local;
+	int32_t ret;
+
+	local = crypt_alloc_local(frame, this, GF_FOP_FTRUNCATE);
+	if (local == NULL) {
+		ret = ENOMEM;
+		goto error;
+	}
+	local->xattr = dict_new();
+	if (local->xattr == NULL) {
+		ret = ENOMEM;
+		goto error;
+	}
+	local->fd = fd_ref(fd);
+
+	info = local_get_inode_info(local, this);
+	if (info == NULL || !object_alg_atomic(&info->cinfo)) {
+		ret = EINVAL;
+		goto error;
+	}
+	local->data_conf.orig_offset = offset;
+	if (xdata)
+		local->xdata = dict_ref(xdata);
+
+	/* write lock which covers the whole file */
+	lock.l_type = F_WRLCK;
+	lock.l_whence = SEEK_SET;
+	lock.l_start = 0;
+	lock.l_len = 0;
+
+	STACK_WIND(frame,
+		   crypt_ftruncate_finodelk_cbk,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->finodelk,
+		   this->name,
+		   fd,
+		   F_SETLKW,
+		   &lock,
+		   NULL);
+	return 0;
+ error:
+	/* release whatever has been acquired so far */
+	if (local) {
+		if (local->fd)
+			fd_unref(fd);
+		if (local->xdata)
+			dict_unref(xdata);
+		if (local->xattr)
+			dict_unref(local->xattr);
+		if (local->info)
+			free_inode_info(local->info);
+	}
+	STACK_UNWIND_STRICT(ftruncate, frame, -1, ret, NULL, NULL, NULL);
+	return 0;
+}
+
+/*
+ * ->flush_cbk(): the last step of crypt_truncate().
+ * Unwinds truncate with the result of the flush and with the
+ * attributes and xdata which were saved in local.
+ */
+int32_t truncate_end(call_frame_t *frame,
+		     void *cookie,
+		     xlator_t *this,
+		     int32_t op_ret,
+		     int32_t op_errno,
+		     dict_t *xdata)
+{
+	crypt_local_t *saved = frame->local;
+
+	STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno,
+			    &saved->prebuf, &saved->postbuf,
+			    saved->xdata);
+	return 0;
+}
+
+/*
+ * ->ftruncate_cbk(): the second step of crypt_truncate().
+ *
+ * On success the attributes are saved and the fd, which was
+ * created for this truncate, is flushed; truncate_end() completes
+ * the operation.
+ *
+ * On failure truncate is unwound right here with the error of
+ * ftruncate: crypt_ftruncate() passes NULL attributes when it
+ * fails early, so @prebuf and @postbuf must not be dereferenced,
+ * and the error must not be replaced with the result of a flush.
+ */
+int32_t truncate_flush(call_frame_t *frame,
+		       void *cookie,
+		       xlator_t *this,
+		       int32_t op_ret,
+		       int32_t op_errno,
+		       struct iatt *prebuf,
+		       struct iatt *postbuf,
+		       dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+	fd_t *fd = local->fd;
+
+	if (op_ret < 0) {
+		/* drop the fd of this truncate and report the error */
+		fd_unref(fd);
+		STACK_UNWIND_STRICT(truncate,
+				    frame,
+				    op_ret,
+				    op_errno, NULL, NULL, NULL);
+		return 0;
+	}
+	local->prebuf = *prebuf;
+	local->postbuf = *postbuf;
+
+	STACK_WIND(frame,
+		   truncate_end,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->flush,
+		   fd,
+		   NULL);
+	fd_unref(fd);
+	return 0;
+}
+
+/*
+ * Is called as ->open_cbk(): the first step of crypt_truncate().
+ *
+ * If the open has failed, the fd created by crypt_truncate()
+ * (it is kept in local->fd) is released and truncate is unwound
+ * with the error of the open. The @fd argument is not used for
+ * that, because crypt_open() passes NULL there when it fails
+ * before winding.
+ */
+static int32_t truncate_begin(call_frame_t *frame,
+			      void *cookie,
+			      xlator_t *this,
+			      int32_t op_ret,
+			      int32_t op_errno,
+			      fd_t *fd,
+			      dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+
+	if (op_ret < 0) {
+		fd_unref(local->fd);
+		STACK_UNWIND_STRICT(truncate,
+				    frame,
+				    op_ret,
+				    op_errno, NULL, NULL, NULL);
+		return 0;
+	}
+	/*
+	 * crypt_truncate() is implemented via crypt_ftruncate(),
+	 * so the crypt xlator does STACK_WIND to itself here
+	 */
+	STACK_WIND(frame,
+		   truncate_flush,
+		   this,
+		   this->fops->ftruncate, /* crypt_ftruncate */
+		   fd,
+		   local->offset,
+		   NULL);
+	return 0;
+}
+
+/*
+ * crypt_truncate() is implemented via crypt_ftruncate() as a
+ * sequence crypt_open() - crypt_ftruncate() - truncate_flush()
+ *
+ * A private fd is created for the inode of @loc and kept in
+ * local->fd for the next steps. If local can not be allocated,
+ * truncate is unwound with ENOMEM (like the other fops of this
+ * translator do); if the fd can not be created - with EINVAL.
+ *
+ * NOTE(review): @xdata is stored in local without taking a
+ * reference and is handed to the unwind by truncate_end() -
+ * confirm that the caller keeps it alive for that long.
+ */
+int32_t crypt_truncate(call_frame_t *frame,
+		       xlator_t *this,
+		       loc_t *loc,
+		       off_t offset,
+		       dict_t *xdata)
+{
+	int32_t ret = EINVAL;
+	fd_t *fd;
+	crypt_local_t *local;
+
+#if DEBUG_CRYPT
+	gf_log(this->name, GF_LOG_DEBUG,
+	       "truncate file %s at offset %llu",
+	       loc->path, (unsigned long long)offset);
+#endif
+	local = crypt_alloc_local(frame, this, GF_FOP_TRUNCATE);
+	if (!local) {
+		ret = ENOMEM;
+		goto error;
+	}
+	fd = fd_create(loc->inode, frame->root->pid);
+	if (!fd) {
+		gf_log(this->name, GF_LOG_ERROR, "Can not create fd");
+		goto error;
+	}
+	local->fd = fd;
+	local->offset = offset;
+	local->xdata = xdata;
+	STACK_WIND(frame,
+		   truncate_begin,
+		   this,
+		   this->fops->open, /* crypt_open() */
+		   loc,
+		   O_RDWR,
+		   fd,
+		   NULL);
+	return 0;
+ error:
+	STACK_UNWIND_STRICT(truncate, frame, -1, ret, NULL, NULL, NULL);
+	return 0;
+}
+
+/*
+ * Pick the end-of-writeback handler for @fop.
+ * Only write and ftruncate issue writebacks; for any other fop
+ * a warning is logged and NULL is returned.
+ */
+end_writeback_handler_t dispatch_end_writeback(glusterfs_fop_t fop)
+{
+	if (fop == GF_FOP_WRITE)
+		return end_writeback_writev;
+	if (fop == GF_FOP_FTRUNCATE)
+		return end_writeback_ftruncate;
+
+	gf_log("crypt", GF_LOG_WARNING, "Bad wb operation %d", fop);
+	return NULL;
+}
+
+/*
+ * true, if the caller needs metadata string
+ *
+ * Returns 0 if @xdata is NULL or carries no message flags, -1 if
+ * the flags have a wrong size, otherwise the result of
+ * msgflags_check_mtd_lock() for the flags found in @xdata.
+ */
+static int32_t is_custom_mtd(dict_t *xdata)
+{
+	data_t *data;
+	uint32_t flags;
+
+	if (!xdata)
+		return 0;
+
+	data = dict_get(xdata, MSGFLAGS_PREFIX);
+	if (!data)
+		return 0;
+	if (data->len != sizeof(uint32_t)) {
+		gf_log("crypt", GF_LOG_WARNING,
+		       "Bad msgflags size (%d)", data->len);
+		return -1;
+	}
+	/*
+	 * The value is an opaque byte buffer: copy the flags out
+	 * instead of reading them through a casted pointer, which
+	 * would rely on the alignment of the buffer
+	 */
+	memcpy(&flags, data->data, sizeof(flags));
+	return msgflags_check_mtd_lock(&flags);
+}
+
+/*
+ * Called as ->finodelk_cbk() of the unlock request issued by
+ * crypt_open_tail(): the last step of open.
+ *
+ * The unlock is issued for failed opens too (load_mtd_open()
+ * goes through crypt_open_tail() on its error paths), so the
+ * result of the unlock is recorded only if nothing has failed
+ * before. Otherwise a successful unlock would turn a failed open
+ * into a successful one.
+ */
+static int32_t crypt_open_done(call_frame_t *frame,
+			       void *cookie,
+			       xlator_t *this,
+			       int32_t op_ret,
+			       int32_t op_errno, dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+
+	if (op_ret < 0)
+		gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)",
+		       op_errno);
+	if (local->op_ret >= 0) {
+		/* no earlier error to preserve */
+		local->op_ret = op_ret;
+		local->op_errno = op_errno;
+	}
+	put_one_call_open(frame);
+	return 0;
+}
+
+/*
+ * Issue an unlock request (F_UNLCK over the whole file) for
+ * local->fd in the domain of this translator.
+ * The open is completed by crypt_open_done().
+ */
+static void crypt_open_tail(call_frame_t *frame, xlator_t *this)
+{
+	crypt_local_t *local = frame->local;
+	struct gf_flock lock = {0, };
+
+	lock.l_pid = 0;
+	lock.l_len = 0;
+	lock.l_start = 0;
+	lock.l_whence = SEEK_SET;
+	lock.l_type = F_UNLCK;
+
+	STACK_WIND(frame,
+		   crypt_open_done,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->finodelk,
+		   this->name,
+		   local->fd,
+		   F_SETLKW,
+		   &lock,
+		   NULL);
+}
+
+/*
+ * load private inode info at open time
+ * called as ->fgetxattr_cbk()
+ *
+ * @dict is expected to carry the format string (metadata) under
+ * CRYPTO_FORMAT_PREFIX. The inode info is taken from the inode
+ * context, or allocated and put there if it hasn't been cached.
+ * The result is stored in local->op_ret and local->op_errno.
+ *
+ * On every path, including the failed ones, the open proceeds to
+ * crypt_open_tail() (unlock), or, if the caller has asked for the
+ * metadata string, directly to put_one_call_open().
+ */
+static int load_mtd_open(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict,
+ dict_t *xdata)
+{
+ int32_t ret;
+ gf_boolean_t upload_info;
+ data_t *mtd;
+ uint64_t value = 0;
+ struct crypt_inode_info *info;
+ crypt_local_t *local = frame->local;
+ crypt_private_t *priv = this->private;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ /* nothing is loaded for symlinks */
+ if (local->fd->inode->ia_type == IA_IFLNK)
+ goto exit;
+ if (op_ret < 0)
+ goto exit;
+ /*
+ * first, check for cached info
+ */
+ ret = inode_ctx_get(local->fd->inode, this, &value);
+ if (ret != -1) {
+ info = (struct crypt_inode_info *)(long)value;
+ if (info == NULL) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Inode info expected, but not found");
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto exit;
+ }
+ /*
+ * info has been found in the cache
+ */
+ upload_info = _gf_false;
+ }
+ else {
+ /*
+ * info hasn't been found in the cache.
+ */
+ info = alloc_inode_info(local, local->loc);
+ if (!info) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto exit;
+ }
+ init_inode_info_head(info, local->fd);
+ upload_info = _gf_true;
+ }
+ /*
+ * extract metadata
+ */
+ mtd = dict_get(dict, CRYPTO_FORMAT_PREFIX);
+ if (!mtd) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ gf_log (this->name, GF_LOG_WARNING,
+ "Format string wasn't found");
+ goto exit;
+ }
+ /*
+ * authenticate metadata against the path
+ */
+ ret = open_format((unsigned char *)mtd->data,
+ mtd->len,
+ local->loc,
+ info,
+ get_master_cinfo(priv),
+ local,
+ upload_info);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ret;
+ goto exit;
+ }
+ if (upload_info) {
+ /*
+ * complete the new info and put it to the inode context
+ */
+ ret = init_inode_info_tail(info, get_master_cinfo(priv));
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ret;
+ goto exit;
+ }
+ ret = inode_ctx_put(local->fd->inode,
+ this, (uint64_t)(long)info);
+ if (ret == -1) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto exit;
+ }
+ }
+ if (local->custom_mtd) {
+ /*
+ * pass the metadata string to the customer
+ */
+ ret = dict_set_static_bin(local->xdata,
+ CRYPTO_FORMAT_PREFIX,
+ mtd->data,
+ mtd->len);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ret;
+ goto exit;
+ }
+ }
+ exit:
+ /*
+ * NOTE(review): with custom_mtd the unlock request is not
+ * issued here; presumably the lock is left for the caller
+ * who asked for the metadata string - confirm.
+ */
+ if (!local->custom_mtd)
+ crypt_open_tail(frame, this);
+ else
+ put_one_call_open(frame);
+ return 0;
+}
+
+/*
+ * Called as ->finodelk_cbk() of the lock request issued by
+ * crypt_open_cbk(). When the lock has been granted, the format
+ * string is requested from the child (load_mtd_open() is the
+ * callback); otherwise open is finished with the result of the
+ * lock request.
+ */
+static int32_t crypt_open_finodelk_cbk(call_frame_t *frame,
+				       void *cookie,
+				       xlator_t *this,
+				       int32_t op_ret,
+				       int32_t op_errno,
+				       dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+
+	local->op_ret = op_ret;
+	local->op_errno = op_errno;
+
+	if (op_ret >= 0) {
+		STACK_WIND(frame,
+			   load_mtd_open,
+			   FIRST_CHILD(this),
+			   FIRST_CHILD(this)->fops->fgetxattr,
+			   local->fd,
+			   CRYPTO_FORMAT_PREFIX,
+			   NULL);
+		return 0;
+	}
+	gf_log(this->name, GF_LOG_WARNING, "finodelk (LOCK) failed");
+	put_one_call_open(frame);
+	return 0;
+}
+
+/*
+ * Called as ->open_cbk() of the child.
+ *
+ * Saves the response xdata (or creates a new dict, if the caller
+ * has asked for the metadata string and there is no xdata) and
+ * requests a lock on the whole file: a write lock if the metadata
+ * string has been asked for, a read lock otherwise. The metadata
+ * is loaded and verified against the pathname after the lock has
+ * been granted.
+ * Symlinks and failed opens are completed right away.
+ */
+static int32_t crypt_open_cbk(call_frame_t *frame,
+			      void *cookie,
+			      xlator_t *this,
+			      int32_t op_ret,
+			      int32_t op_errno,
+			      fd_t *fd,
+			      dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+	struct gf_flock lock = {0, };
+
+	local->op_ret = op_ret;
+	local->op_errno = op_errno;
+
+	if (local->fd->inode->ia_type == IA_IFLNK || op_ret < 0)
+		goto exit;
+
+	if (xdata) {
+		local->xdata = dict_ref(xdata);
+	} else if (local->custom_mtd) {
+		/* a dict to pass the metadata string in */
+		local->xdata = dict_new();
+		if (local->xdata == NULL) {
+			local->op_ret = -1;
+			local->op_errno = ENOMEM;
+			gf_log ("crypt", GF_LOG_ERROR,
+				"Can not get new dict for mtd string");
+			goto exit;
+		}
+	}
+	lock.l_whence = SEEK_SET;
+	lock.l_start = 0;
+	lock.l_len = 0;
+	if (local->custom_mtd)
+		lock.l_type = F_WRLCK;
+	else
+		lock.l_type = F_RDLCK;
+
+	STACK_WIND(frame,
+		   crypt_open_finodelk_cbk,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->finodelk,
+		   this->name,
+		   fd,
+		   F_SETLKW,
+		   &lock,
+		   NULL);
+	return 0;
+ exit:
+	put_one_call_open(frame);
+	return 0;
+}
+
+/*
+ * ->open() of the crypt translator.
+ *
+ * Keeps a copy of @loc and a reference to @fd in local, adjusts
+ * the open flags and winds open to the child; the work is
+ * continued by crypt_open_cbk().
+ *
+ * If the preparation fails, everything acquired here is released
+ * and open is unwound with ENOMEM (allocation or copy failure)
+ * or EINVAL (bad message flags in @xdata).
+ */
+static int32_t crypt_open(call_frame_t *frame,
+			  xlator_t *this,
+			  loc_t *loc,
+			  int32_t flags,
+			  fd_t *fd,
+			  dict_t *xdata)
+{
+	int32_t ret = ENOMEM;
+	crypt_local_t *local;
+
+	local = crypt_alloc_local(frame, this, GF_FOP_OPEN);
+	if (!local)
+		goto error;
+	local->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+	if (!local->loc) {
+		ret = ENOMEM;
+		goto error;
+	}
+	memset(local->loc, 0, sizeof(*local->loc));
+	if (loc_copy(local->loc, loc)) {
+		GF_FREE(local->loc);
+		/* don't leave a dangling pointer in local */
+		local->loc = NULL;
+		/*
+		 * the status of loc_copy() is not assumed to be
+		 * an errno value
+		 */
+		ret = ENOMEM;
+		goto error;
+	}
+	local->fd = fd_ref(fd);
+
+	ret = is_custom_mtd(xdata);
+	if (ret < 0) {
+		/* drop the reference taken above */
+		fd_unref(local->fd);
+		local->fd = NULL;
+		loc_wipe(local->loc);
+		GF_FREE(local->loc);
+		local->loc = NULL;
+		ret = EINVAL;
+		goto error;
+	}
+	local->custom_mtd = ret;
+
+	if ((flags & O_ACCMODE) == O_WRONLY)
+		/*
+		 * we can't open O_WRONLY, because
+		 * we need to do read-modify-write
+		 */
+		flags = (flags & ~O_ACCMODE) | O_RDWR;
+	/*
+	 * Make sure that our translated offsets
+	 * and counts won't be ignored
+	 */
+	flags &= ~O_APPEND;
+	get_one_call_nolock(frame);
+	STACK_WIND(frame,
+		   crypt_open_cbk,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->open,
+		   loc,
+		   flags,
+		   fd,
+		   xdata);
+	return 0;
+ error:
+	STACK_UNWIND_STRICT(open,
+			    frame,
+			    -1,
+			    ret,
+			    NULL,
+			    NULL);
+	return 0;
+}
+
+/*
+ * Complete initialization of the inode info: call ->set_private()
+ * of the data cipher selected by the algorithm and the mode of
+ * the object.
+ *
+ * Returns 0 on success, otherwise the status of ->set_private().
+ */
+static int32_t init_inode_info_tail(struct crypt_inode_info *info,
+				    struct master_cipher_info *master)
+{
+	struct object_cipher_info *object = &info->cinfo;
+	int32_t status;
+
+#if DEBUG_CRYPT
+	gf_log("crypt", GF_LOG_DEBUG, "Init inode info for object %s",
+	       uuid_utoa(info->oid));
+#endif
+	status = data_cipher_algs[object->o_alg][object->o_mode].set_private(info,
+									     master);
+	if (status)
+		gf_log("crypt", GF_LOG_ERROR, "Set private info failed");
+	return status;
+}
+
+/*
+ * Init inode info at ->create() time:
+ * the object id is copied from @data (data->len bytes), the
+ * cipher parameters of the object are inherited from @master.
+ */
+static void init_inode_info_create(struct crypt_inode_info *info,
+				   struct master_cipher_info *master,
+				   data_t *data)
+{
+	struct object_cipher_info *object = &info->cinfo;
+
+	info->nr_minor = CRYPT_XLATOR_ID;
+	memcpy(info->oid, data->data, data->len);
+
+	object->o_dkey_size = master->m_dkey_size;
+	object->o_block_bits = master->m_block_bits;
+	object->o_mode = master->m_mode;
+	object->o_alg = master->m_alg;
+}
+
+/*
+ * Set the object id of @info to the gfid of the inode of @fd
+ */
+static void init_inode_info_head(struct crypt_inode_info *info, fd_t *fd)
+{
+	inode_t *inode = fd->inode;
+
+	memcpy(info->oid, inode->gfid, sizeof(uuid_t));
+}
+
+/*
+ * Called as ->finodelk_cbk() of the unlock request issued by
+ * crypt_create_tail(): the last step of create.
+ *
+ * Completes the inode info and puts it to the inode context;
+ * the info is freed if any of these steps (or the unlock itself)
+ * fails. Then create is unwound and the references kept in local
+ * (fd, inode, xdata) are dropped.
+ */
+static int32_t crypt_create_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ crypt_private_t *priv = this->private;
+ crypt_local_t *local = frame->local;
+ struct crypt_inode_info *info = local->info;
+ /* local copies, which are used after the unwind */
+ fd_t *local_fd = local->fd;
+ dict_t *local_xdata = local->xdata;
+ inode_t *local_inode = local->inode;
+
+ if (op_ret < 0) {
+ free_inode_info(info);
+ goto unwind;
+ }
+ op_errno = init_inode_info_tail(info, get_master_cinfo(priv));
+ if (op_errno) {
+ op_ret = -1;
+ free_inode_info(info);
+ goto unwind;
+ }
+ /*
+ * FIXME: drop major subversion number
+ */
+ op_ret = inode_ctx_put(local->fd->inode, this, (uint64_t)(long)info);
+ if (op_ret == -1) {
+ op_errno = EIO;
+ free_inode_info(info);
+ goto unwind;
+ }
+ unwind:
+ free_format(local);
+ STACK_UNWIND_STRICT(create,
+ frame,
+ op_ret,
+ op_errno,
+ local_fd,
+ local_inode,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ local_xdata);
+ /* drop the references with the copies made above */
+ fd_unref(local_fd);
+ inode_unref(local_inode);
+ if (local_xdata)
+ dict_unref(local_xdata);
+ return 0;
+}
+
+/*
+ * Called as ->fsetxattr_cbk(): the xattr dict built by
+ * crypt_create() has been sent to the child.
+ *
+ * The dict is released in any case. On success an unlock request
+ * over the whole file is issued, and crypt_create_done() completes
+ * the create. On failure the inode info and the format are freed,
+ * create is unwound with the error, and the references kept in
+ * local (fd, inode, xdata) are dropped.
+ */
+static int crypt_create_tail(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ struct gf_flock lock = {0, };
+ crypt_local_t *local = frame->local;
+ /* local copies, which are used after the unwind */
+ fd_t *local_fd = local->fd;
+ dict_t *local_xdata = local->xdata;
+ inode_t *local_inode = local->inode;
+
+ /* the xattr dict is not needed anymore */
+ dict_unref(local->xattr);
+
+ if (op_ret < 0)
+ goto error;
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ lock.l_pid = 0;
+
+ STACK_WIND(frame,
+ crypt_create_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ error:
+ free_inode_info(local->info);
+ free_format(local);
+
+ STACK_UNWIND_STRICT(create,
+ frame,
+ op_ret,
+ op_errno,
+ local_fd,
+ local_inode,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ local_xdata);
+
+ /* drop the references with the copies made above */
+ fd_unref(local_fd);
+ inode_unref(local_inode);
+ if (local_xdata)
+ dict_unref(local_xdata);
+ return 0;
+}
+
+/*
+ * Called as ->finodelk_cbk() of the lock request issued by
+ * crypt_create_cbk(). When the lock has been granted, the xattr
+ * dict prepared by crypt_create() is sent to the child;
+ * crypt_create_tail() continues.
+ *
+ * If the lock hasn't been granted, everything that local holds
+ * for this create is released, including the inode reference
+ * taken by crypt_create_cbk(), and create is unwound with the
+ * error of the lock request.
+ */
+static int32_t crypt_create_finodelk_cbk(call_frame_t *frame,
+					 void *cookie,
+					 xlator_t *this,
+					 int32_t op_ret,
+					 int32_t op_errno,
+					 dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+	struct crypt_inode_info *info = local->info;
+
+	if (op_ret < 0)
+		goto error;
+
+	STACK_WIND(frame,
+		   crypt_create_tail,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->fsetxattr,
+		   local->fd,
+		   local->xattr, /* CRYPTO_FORMAT_PREFIX */
+		   0,
+		   NULL);
+	return 0;
+ error:
+	free_inode_info(info);
+	free_format(local);
+	fd_unref(local->fd);
+	dict_unref(local->xattr);
+	if (local->xdata)
+		dict_unref(local->xdata);
+	/* don't leak the reference to the new inode */
+	if (local->inode)
+		inode_unref(local->inode);
+
+	STACK_UNWIND_STRICT(create,
+			    frame,
+			    op_ret,
+			    op_errno,
+			    NULL,
+			    NULL,
+			    NULL,
+			    NULL,
+			    NULL,
+			    NULL);
+	return 0;
+}
+
+/*
+ * Called as ->create_cbk() of the child.
+ *
+ * Create and store crypt-specific format on disk;
+ * Populate cache with private inode info.
+ *
+ * Here only the first step is made: the results of the create
+ * are saved in local and a write lock on the whole file is
+ * requested. If the create has failed, everything prepared by
+ * crypt_create() is released and the error is passed up.
+ */
+static int32_t crypt_create_cbk(call_frame_t *frame,
+				void *cookie,
+				xlator_t *this,
+				int32_t op_ret,
+				int32_t op_errno,
+				fd_t *fd,
+				inode_t *inode,
+				struct iatt *buf,
+				struct iatt *preparent,
+				struct iatt *postparent,
+				dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+	struct crypt_inode_info *info = local->info;
+	struct gf_flock lock = {0, };
+
+	if (op_ret < 0) {
+		free_inode_info(info);
+		free_format(local);
+		fd_unref(local->fd);
+		dict_unref(local->xattr);
+
+		STACK_UNWIND_STRICT(create,
+				    frame,
+				    op_ret,
+				    op_errno,
+				    NULL, NULL, NULL,
+				    NULL, NULL, NULL);
+		return 0;
+	}
+	/* keep the results for the unwind */
+	if (xdata)
+		local->xdata = dict_ref(xdata);
+	local->inode = inode_ref(inode);
+	local->buf = *buf;
+	local->prebuf = *preparent;
+	local->postbuf = *postparent;
+
+	/* write lock which covers the whole file */
+	lock.l_type = F_WRLCK;
+	lock.l_whence = SEEK_SET;
+	lock.l_start = 0;
+	lock.l_len = 0;
+
+	STACK_WIND(frame,
+		   crypt_create_finodelk_cbk,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->finodelk,
+		   this->name,
+		   local->fd,
+		   F_SETLKW,
+		   &lock,
+		   NULL);
+	return 0;
+}
+
+/*
+ * ->create() of the crypt translator.
+ *
+ * Prepares the inode info and the format string of the new
+ * object, packs the format and the zero file size into
+ * local->xattr, and winds create to the child;
+ * crypt_create_cbk() continues.
+ *
+ * If the preparation fails, everything allocated here is
+ * released and create is unwound with a positive errno:
+ * ENOMEM for allocation and dict failures, EINVAL for a missing
+ * or malformed "gfid-req" in @xdata, or the status of the format
+ * helpers.
+ */
+static int32_t crypt_create(call_frame_t *frame,
+			    xlator_t *this,
+			    loc_t *loc,
+			    int32_t flags,
+			    mode_t mode,
+			    mode_t umask,
+			    fd_t *fd,
+			    dict_t *xdata)
+{
+	int ret;
+	data_t *data;
+	crypt_local_t *local;
+	crypt_private_t *priv;
+	struct master_cipher_info *master;
+	struct crypt_inode_info *info;
+
+	priv = this->private;
+	master = get_master_cinfo(priv);
+
+	if (master_alg_atomic(master)) {
+		/*
+		 * We can't open O_WRONLY, because we
+		 * need to do read-modify-write.
+		 */
+		if ((flags & O_ACCMODE) == O_WRONLY)
+			flags = (flags & ~O_ACCMODE) | O_RDWR;
+		/*
+		 * Make sure that our translated offsets
+		 * and counts won't be ignored
+		 */
+		flags &= ~O_APPEND;
+	}
+	local = crypt_alloc_local(frame, this, GF_FOP_CREATE);
+	if (!local) {
+		ret = ENOMEM;
+		goto error;
+	}
+	data = dict_get(xdata, "gfid-req");
+	if (!data) {
+		ret = EINVAL;
+		gf_log("crypt", GF_LOG_WARNING, "gfid not found");
+		goto error;
+	}
+	if (data->len != sizeof(uuid_t)) {
+		ret = EINVAL;
+		gf_log("crypt", GF_LOG_WARNING,
+		       "bad gfid size (%d), should be %d",
+		       (int)data->len, (int)sizeof(uuid_t));
+		goto error;
+	}
+	info = alloc_inode_info(local, loc);
+	if (!info){
+		ret = ENOMEM;
+		goto error;
+	}
+	/*
+	 * NOTE:
+	 * format has to be created BEFORE
+	 * proceeding to the untrusted server
+	 */
+	ret = alloc_format_create(local);
+	if (ret) {
+		free_inode_info(info);
+		goto error;
+	}
+	init_inode_info_create(info, master, data);
+
+	ret = create_format(local->format,
+			    loc,
+			    info,
+			    master);
+	if (ret) {
+		free_inode_info(info);
+		/* the format has been allocated above */
+		free_format(local);
+		goto error;
+	}
+	local->xattr = dict_new();
+	if (!local->xattr) {
+		free_inode_info(info);
+		free_format(local);
+		/* ret is zero here, don't unwind with it */
+		ret = ENOMEM;
+		goto error;
+	}
+	/*
+	 * The status of the dict helpers is not an errno value,
+	 * so their failures are reported as ENOMEM
+	 */
+	if (dict_set_static_bin(local->xattr,
+				CRYPTO_FORMAT_PREFIX,
+				local->format,
+				new_format_size())) {
+		dict_unref(local->xattr);
+		free_inode_info(info);
+		free_format(local);
+		ret = ENOMEM;
+		goto error;
+	}
+	if (dict_set(local->xattr, FSIZE_XATTR_PREFIX, data_from_uint64(0))) {
+		dict_unref(local->xattr);
+		free_inode_info(info);
+		free_format(local);
+		ret = ENOMEM;
+		goto error;
+	}
+	local->fd = fd_ref(fd);
+
+	STACK_WIND(frame,
+		   crypt_create_cbk,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->create,
+		   loc,
+		   flags,
+		   mode,
+		   umask,
+		   fd,
+		   xdata);
+	return 0;
+ error:
+	gf_log("crypt", GF_LOG_WARNING, "can not create file");
+	STACK_UNWIND_STRICT(create,
+			    frame,
+			    -1,
+			    ret,
+			    NULL, NULL, NULL,
+			    NULL, NULL, NULL);
+	return 0;
+}
+
+/*
+ * Callback of dict_foreach_fnmatch(): remove the matched @key
+ * from @dict. @value and @data are not used.
+ *
+ * FIXME: this should depend on the version of format string
+ */
+static int32_t filter_crypt_xattr(dict_t *dict,
+				  char *key, data_t *value, void *data)
+{
+	(void)value;
+	(void)data;
+
+	dict_del(dict, key);
+	return 0;
+}
+
+/*
+ * ->fsetxattr(): the keys which match "trusted.glusterfs.crypt*"
+ * are removed from @dict, then the request is passed to the
+ * child as is.
+ */
+static int32_t crypt_fsetxattr(call_frame_t *frame,
+			       xlator_t *this,
+			       fd_t *fd,
+			       dict_t *dict,
+			       int32_t flags, dict_t *xdata)
+{
+	/* the caller is not allowed to set xattrs of the translator */
+	dict_foreach_fnmatch(dict,
+			     "trusted.glusterfs.crypt*",
+			     filter_crypt_xattr,
+			     NULL);
+
+	STACK_WIND(frame, default_fsetxattr_cbk,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->fsetxattr,
+		   fd, dict, flags, xdata);
+	return 0;
+}
+
+/*
+ * ->setxattr(): the keys which match "trusted.glusterfs.crypt*"
+ * are removed from @dict, then the request is passed to the
+ * child as is.
+ *
+ * TBD: verify file metadata before wind
+ */
+static int32_t crypt_setxattr(call_frame_t *frame,
+			      xlator_t *this,
+			      loc_t *loc,
+			      dict_t *dict,
+			      int32_t flags, dict_t *xdata)
+{
+	/* the caller is not allowed to set xattrs of the translator */
+	dict_foreach_fnmatch(dict,
+			     "trusted.glusterfs.crypt*",
+			     filter_crypt_xattr,
+			     NULL);
+
+	STACK_WIND(frame, default_setxattr_cbk,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->setxattr,
+		   loc, dict, flags, xdata);
+	return 0;
+}
+
+/*
+ * called as flush_cbk()
+ *
+ * The last step of a link operation: store the result of the
+ * flush in local and call the unwind handler of the fop.
+ * ENOENT reported for a symlink is not considered as a failure.
+ */
+static int32_t linkop_end(call_frame_t *frame,
+			  void *cookie,
+			  xlator_t *this,
+			  int32_t op_ret,
+			  int32_t op_errno,
+			  dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+	linkop_unwind_handler_t unwind_fn =
+		linkop_unwind_dispatch(local->fop);
+
+	if (op_ret < 0 &&
+	    op_errno == ENOENT &&
+	    local->loc->inode->ia_type == IA_IFLNK) {
+		/* ignore the missing object behind a symlink */
+		local->op_ret = 0;
+		local->op_errno = 0;
+	} else {
+		local->op_ret = op_ret;
+		local->op_errno = op_errno;
+	}
+	unwind_fn(frame);
+	return 0;
+}
+
+/*
+ * Called as ->link_cbk() of the child.
+ *
+ * On success the results of the link are saved in local (the
+ * request xdata kept there is replaced with the response one)
+ * and local->fd is flushed to unpin inode on the server;
+ * linkop_end() completes the operation.
+ * On failure link is unwound with the error.
+ */
+static int32_t link_flush(call_frame_t *frame,
+			  void *cookie,
+			  xlator_t *this,
+			  int32_t op_ret,
+			  int32_t op_errno,
+			  inode_t *inode,
+			  struct iatt *buf,
+			  struct iatt *preparent,
+			  struct iatt *postparent, dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+
+	if (op_ret < 0) {
+		local->op_ret = -1;
+		local->op_errno = op_errno;
+		link_unwind(frame);
+		return 0;
+	}
+	/* replace the xdata kept in local */
+	if (local->xdata) {
+		dict_unref(local->xdata);
+		local->xdata = NULL;
+	}
+	if (xdata)
+		local->xdata = dict_ref(xdata);
+
+	local->inode = inode_ref(inode);
+	local->buf = *buf;
+	local->prebuf = *preparent;
+	local->postbuf = *postparent;
+
+	STACK_WIND(frame,
+		   linkop_end,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->flush,
+		   local->fd,
+		   NULL);
+	return 0;
+}
+
+/*
+ * Unwind handler of link: release what local holds for the
+ * operation and unwind with the result stored in local.
+ * If there is no local, link is unwound with ENOMEM.
+ */
+void link_unwind(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ dict_t *xdata;
+ dict_t *xattr;
+ inode_t *inode;
+
+ if (!local) {
+ STACK_UNWIND_STRICT(link,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL);
+ return;
+ }
+ /* local copies, which are used after the unwind */
+ xdata = local->xdata;
+ xattr = local->xattr;
+ inode = local->inode;
+
+ if (local->loc){
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ if (local->newloc) {
+ loc_wipe(local->newloc);
+ GF_FREE(local->newloc);
+ }
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->format)
+ GF_FREE(local->format);
+
+ STACK_UNWIND_STRICT(link,
+ frame,
+ local->op_ret,
+ local->op_errno,
+ inode,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ xdata);
+ /* drop the references with the copies made above */
+ if (xdata)
+ dict_unref(xdata);
+ if (xattr)
+ dict_unref(xattr);
+ if (inode)
+ inode_unref(inode);
+}
+
+/*
+ * Wind handler of link: pass the request saved in local
+ * (loc, newloc, xdata) to the child; link_flush() is the callback.
+ */
+void link_wind(call_frame_t *frame, xlator_t *this)
+{
+	crypt_local_t *saved = frame->local;
+
+	STACK_WIND(frame, link_flush,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->link,
+		   saved->loc, saved->newloc, saved->xdata);
+}
+
+/*
+ * unlink()
+ *
+ * Called as ->unlink_cbk() of the child.
+ * On success the parent attributes are saved in local (the
+ * request xdata kept there is replaced with the response one)
+ * and local->fd is flushed; linkop_end() completes the operation.
+ * On failure unlink is unwound with the error.
+ */
+static int32_t unlink_flush(call_frame_t *frame,
+			    void *cookie,
+			    xlator_t *this,
+			    int32_t op_ret,
+			    int32_t op_errno,
+			    struct iatt *preparent,
+			    struct iatt *postparent, dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+
+	if (op_ret < 0) {
+		local->op_ret = -1;
+		local->op_errno = op_errno;
+		unlink_unwind(frame);
+		return 0;
+	}
+	local->prebuf = *preparent;
+	local->postbuf = *postparent;
+
+	/* replace the xdata kept in local */
+	if (local->xdata) {
+		dict_unref(local->xdata);
+		local->xdata = NULL;
+	}
+	if (xdata)
+		local->xdata = dict_ref(xdata);
+
+	STACK_WIND(frame,
+		   linkop_end,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->flush,
+		   local->fd,
+		   NULL);
+	return 0;
+}
+
+/*
+ * Unwind handler of unlink: release what local holds for the
+ * operation and unwind with the result stored in local.
+ * If there is no local, unlink is unwound with ENOMEM.
+ */
+void unlink_unwind(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ dict_t *xdata;
+ dict_t *xattr;
+
+ if (!local) {
+ STACK_UNWIND_STRICT(unlink,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL,
+ NULL);
+ return;
+ }
+ /* local copies, which are used after the unwind */
+ xdata = local->xdata;
+ xattr = local->xattr;
+ if (local->loc){
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->format)
+ GF_FREE(local->format);
+
+ STACK_UNWIND_STRICT(unlink,
+ frame,
+ local->op_ret,
+ local->op_errno,
+ &local->prebuf,
+ &local->postbuf,
+ xdata);
+ /* drop the references with the copies made above */
+ if (xdata)
+ dict_unref(xdata);
+ if (xattr)
+ dict_unref(xattr);
+}
+
+/*
+ * Wind handler of unlink: pass the request saved in local
+ * (loc, flags, xdata) to the child; unlink_flush() is the callback.
+ */
+void unlink_wind(call_frame_t *frame, xlator_t *this)
+{
+	crypt_local_t *saved = frame->local;
+
+	STACK_WIND(frame, unlink_flush,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->unlink,
+		   saved->loc, saved->flags, saved->xdata);
+}
+
+/*
+ * Unwind handler of rename: release what local holds for the
+ * operation and unwind with the result stored in local.
+ * If there is no local, rename is unwound with ENOMEM.
+ */
+void rename_unwind(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ dict_t *xdata;
+ dict_t *xattr;
+ struct iatt *prenewparent;
+ struct iatt *postnewparent;
+
+ if (!local) {
+ STACK_UNWIND_STRICT(rename,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL);
+ return;
+ }
+ /* local copies, which are used after the unwind */
+ xdata = local->xdata;
+ xattr = local->xattr;
+ prenewparent = local->prenewparent;
+ postnewparent = local->postnewparent;
+
+ if (local->loc){
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ if (local->newloc){
+ loc_wipe(local->newloc);
+ GF_FREE(local->newloc);
+ }
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->format)
+ GF_FREE(local->format);
+
+ STACK_UNWIND_STRICT(rename,
+ frame,
+ local->op_ret,
+ local->op_errno,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ prenewparent,
+ postnewparent,
+ xdata);
+ /* release the rest with the copies made above */
+ if (xdata)
+ dict_unref(xdata);
+ if (xattr)
+ dict_unref(xattr);
+ if (prenewparent)
+ GF_FREE(prenewparent);
+ if (postnewparent)
+ GF_FREE(postnewparent);
+}
+
+/*
+ * called as flush_cbk()
+ *
+ * The last step of rename: store the result of the flush in
+ * local and unwind.
+ */
+static int32_t rename_end(call_frame_t *frame,
+			  void *cookie,
+			  xlator_t *this,
+			  int32_t op_ret,
+			  int32_t op_errno,
+			  dict_t *xdata)
+{
+	crypt_local_t *saved = frame->local;
+
+	saved->op_errno = op_errno;
+	saved->op_ret = op_ret;
+
+	rename_unwind(frame);
+	return 0;
+}
+
+/*
+ * Called as ->rename_cbk() of the child.
+ *
+ * On success the results of the rename are saved in local (the
+ * request xdata kept there is replaced with the response one,
+ * the attributes of the new parent are copied to allocated
+ * buffers) and local->fd is flushed; rename_end() completes the
+ * operation.
+ * On failure, including a failed allocation, rename is unwound
+ * with the error.
+ */
+static int32_t rename_flush(call_frame_t *frame,
+			    void *cookie,
+			    xlator_t *this,
+			    int32_t op_ret,
+			    int32_t op_errno,
+			    struct iatt *buf,
+			    struct iatt *preoldparent,
+			    struct iatt *postoldparent,
+			    struct iatt *prenewparent,
+			    struct iatt *postnewparent,
+			    dict_t *xdata)
+{
+	crypt_local_t *local = frame->local;
+
+	if (op_ret < 0)
+		goto error;
+	/*
+	 * there can be no xdata in local, so check it before
+	 * dropping, as the callbacks of link and unlink do
+	 */
+	if (local->xdata) {
+		dict_unref(local->xdata);
+		local->xdata = NULL;
+	}
+	if (xdata)
+		local->xdata = dict_ref(xdata);
+
+	local->buf = *buf;
+	local->prebuf = *preoldparent;
+	local->postbuf = *postoldparent;
+	if (prenewparent) {
+		local->prenewparent = GF_CALLOC(1, sizeof(*prenewparent),
+						gf_crypt_mt_iatt);
+		if (!local->prenewparent) {
+			op_errno = ENOMEM;
+			goto error;
+		}
+		*local->prenewparent = *prenewparent;
+	}
+	if (postnewparent) {
+		local->postnewparent = GF_CALLOC(1, sizeof(*postnewparent),
+						 gf_crypt_mt_iatt);
+		if (!local->postnewparent) {
+			op_errno = ENOMEM;
+			goto error;
+		}
+		*local->postnewparent = *postnewparent;
+	}
+	STACK_WIND(frame,
+		   rename_end,
+		   FIRST_CHILD(this),
+		   FIRST_CHILD(this)->fops->flush,
+		   local->fd,
+		   NULL);
+	return 0;
+ error:
+	local->op_ret = -1;
+	local->op_errno = op_errno;
+	rename_unwind(frame);
+	return 0;
+}
+
+/*
+ * Pass the rename request on to the first child.
+ * The reply is delivered to rename_flush().
+ */
+void rename_wind(call_frame_t *frame, xlator_t *this)
+{
+        crypt_local_t *ctx = frame->local;
+
+        STACK_WIND(frame, rename_flush,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+                   ctx->loc, ctx->newloc, ctx->xdata);
+}
+
+/*
+ * Callback of the finodelk (unlock) request issued by do_linkop().
+ * Records the result, then either winds the actual link operation
+ * or, when the unlock failed, unwinds it.
+ */
+static int32_t __do_linkop(call_frame_t *frame, void *cookie,
+                           xlator_t *this, int32_t op_ret,
+                           int32_t op_errno, dict_t *xdata)
+{
+        crypt_local_t *ctx = frame->local;
+        linkop_wind_handler_t do_wind = linkop_wind_dispatch(ctx->fop);
+        linkop_unwind_handler_t do_unwind = linkop_unwind_dispatch(ctx->fop);
+
+        ctx->op_ret = op_ret;
+        ctx->op_errno = op_errno;
+
+        if (op_ret < 0) {
+                gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)",
+                       op_errno);
+                do_unwind(frame);
+                return 0;
+        }
+        do_wind(frame, this);
+        return 0;
+}
+
+/*
+ * Callback of the setxattr request issued by linkop_begin().
+ * On failure the link operation is unwound; otherwise an unlock
+ * request (finodelk with F_UNLCK) for local->fd is wound and its
+ * reply is handled by __do_linkop().
+ */
+static int32_t do_linkop(call_frame_t *frame, void *cookie,
+                         xlator_t *this, int32_t op_ret,
+                         int32_t op_errno, dict_t *xdata)
+{
+        struct gf_flock unlock = {0, };
+        crypt_local_t *ctx = frame->local;
+        linkop_unwind_handler_t do_unwind;
+
+        do_unwind = linkop_unwind_dispatch(ctx->fop);
+        ctx->op_ret = op_ret;
+        ctx->op_errno = op_errno;
+
+        if (op_ret < 0) {
+                do_unwind(frame);
+                return 0;
+        }
+        unlock.l_type = F_UNLCK;
+        unlock.l_whence = SEEK_SET;
+        unlock.l_start = 0;
+        unlock.l_len = 0;
+        unlock.l_pid = 0;
+
+        STACK_WIND(frame, __do_linkop,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->finodelk,
+                   this->name, ctx->fd, F_SETLKW, &unlock, NULL);
+        return 0;
+}
+
+/*
+ * Update the metadata string (against the new pathname);
+ * submit the result
+ */
+/*
+ * Callback of the open wound by linkop().
+ *
+ * For symlinks the metadata update is skipped and the link operation
+ * is wound right away. Otherwise the old metadata string is taken
+ * from @xdata, a new format string is built for the operation
+ * selected by local->fop, and the result is submitted with setxattr
+ * (reply handled by do_linkop()).
+ *
+ * Every failure unwinds the link operation with op_ret = -1 and a
+ * positive errno value in op_errno. Always returns 0.
+ */
+static int32_t linkop_begin(call_frame_t *frame,
+                            void *cookie,
+                            xlator_t *this,
+                            int32_t op_ret,
+                            int32_t op_errno,
+                            fd_t *fd,
+                            dict_t *xdata)
+{
+        gf_boolean_t upload_info;
+        crypt_local_t *local = frame->local;
+        crypt_private_t *priv = this->private;
+        struct crypt_inode_info *info;
+        data_t *old_mtd;
+        uint32_t new_mtd_size;
+        uint64_t value = 0;
+        void (*unwind_fn)(call_frame_t *frame);
+        void (*wind_fn)(call_frame_t *frame, xlator_t *this);
+        mtd_op_t mop;
+
+        wind_fn = linkop_wind_dispatch(local->fop);
+        unwind_fn = linkop_unwind_dispatch(local->fop);
+        mop = linkop_mtdop_dispatch(local->fop);
+
+        if (local->fd->inode->ia_type == IA_IFLNK)
+                goto wind;
+        if (op_ret < 0)
+                /*
+                 * verification failed
+                 */
+                goto error;
+
+        old_mtd = dict_get(xdata, CRYPTO_FORMAT_PREFIX);
+        if (!old_mtd) {
+                op_errno = EIO;
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Metadata string wasn't found");
+                goto error;
+        }
+        new_mtd_size = format_size(mop, old_mtd->len);
+        op_errno = alloc_format(local, new_mtd_size);
+        if (op_errno)
+                goto error;
+        /*
+         * check for cached info
+         */
+        op_ret = inode_ctx_get(fd->inode, this, &value);
+        if (op_ret != -1) {
+                info = (struct crypt_inode_info *)(long)value;
+                if (info == NULL) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Inode info was not found");
+                        op_errno = EINVAL;
+                        goto error;
+                }
+                /*
+                 * info was found in the cache
+                 */
+                local->info = info;
+                upload_info = _gf_false;
+        }
+        else {
+                /*
+                 * info wasn't found in the cache;
+                 */
+                info = alloc_inode_info(local, local->loc);
+                if (!info) {
+                        /*
+                         * op_errno is 0 at this point (alloc_format
+                         * succeeded), so it has to be set explicitly
+                         */
+                        op_errno = ENOMEM;
+                        goto error;
+                }
+                init_inode_info_head(info, fd);
+                local->info = info;
+                upload_info = _gf_true;
+        }
+        op_errno = open_format((unsigned char *)old_mtd->data,
+                               old_mtd->len,
+                               local->loc,
+                               info,
+                               get_master_cinfo(priv),
+                               local,
+                               upload_info);
+        if (op_errno)
+                goto error;
+        if (upload_info == _gf_true) {
+                op_errno = init_inode_info_tail(info,
+                                                get_master_cinfo(priv));
+                if (op_errno)
+                        goto error;
+                op_errno = inode_ctx_put(fd->inode, this,
+                                         (uint64_t)(long)(info));
+                if (op_errno == -1) {
+                        op_errno = EIO;
+                        goto error;
+                }
+        }
+        /*
+         * update the format string (append/update/cut a MAC)
+         */
+        op_errno = update_format(local->format,
+                                 (unsigned char *)old_mtd->data,
+                                 old_mtd->len,
+                                 local->mac_idx,
+                                 mop,
+                                 local->newloc,
+                                 info,
+                                 get_master_cinfo(priv),
+                                 local);
+        if (op_errno)
+                goto error;
+        /*
+         * store the new format string on the server
+         */
+        if (new_mtd_size) {
+                if (dict_set_static_bin(local->xattr,
+                                        CRYPTO_FORMAT_PREFIX,
+                                        local->format,
+                                        new_mtd_size)) {
+                        /*
+                         * the dict status is not an errno value;
+                         * report the failure as ENOMEM
+                         */
+                        op_errno = ENOMEM;
+                        goto error;
+                }
+        }
+        STACK_WIND(frame,
+                   do_linkop,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->setxattr,
+                   local->loc,
+                   local->xattr,
+                   0,
+                   NULL);
+        return 0;
+ wind:
+        wind_fn(frame, this);
+        return 0;
+ error:
+        local->op_ret = -1;
+        local->op_errno = op_errno;
+        unwind_fn(frame);
+        return 0;
+}
+
+/*
+ * Set up the frame-local state for a link operation (@op): take a
+ * reference on @xdata, create a descriptor for oldloc->inode, store
+ * private copies of @oldloc and (when given) @newloc, and create the
+ * dictionary used later to submit the updated metadata.
+ *
+ * Returns 0 on success and ENOMEM on failure. On failure everything
+ * acquired here is released and the matching fields of the local
+ * state are reset to NULL, so that the unwind handler called
+ * afterwards does not release them a second time. The local state
+ * itself may be missing (frame->local == NULL) when its allocation
+ * failed.
+ */
+static int32_t linkop_grab_local(call_frame_t *frame,
+                                 xlator_t *this,
+                                 loc_t *oldloc,
+                                 loc_t *newloc,
+                                 int flags, dict_t *xdata,
+                                 glusterfs_fop_t op)
+{
+        int32_t ret = ENOMEM;
+        fd_t *fd;
+        crypt_local_t *local;
+
+        local = crypt_alloc_local(frame, this, op);
+        if (!local)
+                goto error;
+        if (xdata)
+                local->xdata = dict_ref(xdata);
+
+        fd = fd_create(oldloc->inode, frame->root->pid);
+        if (!fd) {
+                gf_log(this->name, GF_LOG_ERROR, "Can not create fd");
+                goto error;
+        }
+        local->fd = fd;
+        local->flags = flags;
+
+        /* GF_CALLOC returns zeroed memory, no extra memset is needed */
+        local->loc = GF_CALLOC(1, sizeof(*oldloc), gf_crypt_mt_loc);
+        if (!local->loc)
+                goto error;
+        if (loc_copy(local->loc, oldloc))
+                goto error;
+
+        if (newloc) {
+                local->newloc = GF_CALLOC(1, sizeof(*newloc), gf_crypt_mt_loc);
+                if (!local->newloc)
+                        goto error;
+                if (loc_copy(local->newloc, newloc))
+                        goto error;
+        }
+        local->xattr = dict_new();
+        if (!local->xattr) {
+                gf_log(this->name, GF_LOG_ERROR, "Can not create dict");
+                goto error;
+        }
+        return 0;
+
+error:
+        if (local) {
+                if (local->xdata)
+                        dict_unref(local->xdata);
+                local->xdata = NULL;
+                if (local->fd)
+                        fd_unref(local->fd);
+                local->fd = NULL;
+                if (local->loc) {
+                        loc_wipe(local->loc);
+                        GF_FREE(local->loc);
+                }
+                local->loc = NULL;
+                if (local->newloc) {
+                        loc_wipe(local->newloc);
+                        GF_FREE(local->newloc);
+                }
+                local->newloc = NULL;
+                local->op_ret = -1;
+                local->op_errno = ret;
+        }
+
+        return ret;
+}
+
+/*
+ * read and verify locked metadata against the old pathname (via open);
+ * update the metadata string in accordance with the new pathname;
+ * submit modified metadata;
+ * wind;
+ */
+/*
+ * Common entry point of link, unlink and rename (@op selects which).
+ *
+ * Sets up the frame-local state and winds an open of @oldloc to this
+ * translator itself with a request for the write-locked metadata
+ * string; the reply is handled by linkop_begin().
+ *
+ * On failure the operation is unwound through the handler selected
+ * by linkop_unwind_dispatch(). The local state can be missing at that
+ * point (its allocation may have failed), so it is only updated when
+ * present. Always returns 0.
+ */
+static int32_t linkop(call_frame_t *frame,
+                      xlator_t *this,
+                      loc_t *oldloc,
+                      loc_t *newloc,
+                      int flags,
+                      dict_t *xdata,
+                      glusterfs_fop_t op)
+{
+        int32_t ret;
+        dict_t *dict;
+        crypt_local_t *local;
+        void (*unwind_fn)(call_frame_t *frame);
+
+        unwind_fn = linkop_unwind_dispatch(op);
+
+        ret = linkop_grab_local(frame, this, oldloc, newloc, flags, xdata, op);
+        local = frame->local;
+        if (ret)
+                goto error;
+        dict = dict_new();
+        if (!dict) {
+                gf_log(this->name, GF_LOG_ERROR, "Can not create dict");
+                ret = ENOMEM;
+                goto error;
+        }
+        /*
+         * Set a message to crypt_open() that we need
+         * locked metadata string.
+         * All link operations (link, unlink, rename)
+         * need write lock
+         */
+        msgflags_set_mtd_wlock(&local->msgflags);
+        if (dict_set_static_bin(dict,
+                                MSGFLAGS_PREFIX,
+                                &local->msgflags,
+                                sizeof(local->msgflags))) {
+                gf_log(this->name, GF_LOG_ERROR, "Can not set dict");
+                dict_unref(dict);
+                /* the dict status is not an errno value */
+                ret = ENOMEM;
+                goto error;
+        }
+        /*
+         * verify metadata against the old pathname
+         * and retrieve locked metadata string
+         */
+        STACK_WIND(frame,
+                   linkop_begin,
+                   this,
+                   this->fops->open, /* crypt_open() */
+                   oldloc,
+                   O_RDWR,
+                   local->fd,
+                   dict);
+        dict_unref(dict);
+        return 0;
+ error:
+        if (local) {
+                local->op_ret = -1;
+                local->op_errno = ret;
+        }
+        unwind_fn(frame);
+        return 0;
+}
+
+/* link fop: handled by the common link-operation path, no flags */
+static int32_t crypt_link(call_frame_t *frame, xlator_t *this,
+                          loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+        const int no_flags = 0;
+
+        return linkop(frame, this, oldloc, newloc, no_flags, xdata,
+                      GF_FOP_LINK);
+}
+
+/* unlink fop: handled by the common link-operation path, no new location */
+static int32_t crypt_unlink(call_frame_t *frame, xlator_t *this,
+                            loc_t *loc, int flags, dict_t *xdata)
+{
+        loc_t *no_newloc = NULL;
+
+        return linkop(frame, this, loc, no_newloc, flags, xdata,
+                      GF_FOP_UNLINK);
+}
+
+/* rename fop: handled by the common link-operation path, no flags */
+static int32_t crypt_rename(call_frame_t *frame, xlator_t *this,
+                            loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+        const int no_flags = 0;
+
+        return linkop(frame, this, oldloc, newloc, no_flags, xdata,
+                      GF_FOP_RENAME);
+}
+
+/*
+ * Drop one call of an open operation; when put_one_call() reports
+ * true, unwind the open and release the descriptor, the xdata and
+ * the location kept in the local state.
+ */
+static void put_one_call_open(call_frame_t *frame)
+{
+        crypt_local_t *ctx = frame->local;
+        fd_t *held_fd;
+        loc_t *held_loc;
+        dict_t *held_xdata;
+
+        if (!put_one_call(ctx))
+                return;
+
+        held_fd = ctx->fd;
+        held_loc = ctx->loc;
+        held_xdata = ctx->xdata;
+
+        STACK_UNWIND_STRICT(open, frame,
+                            ctx->op_ret, ctx->op_errno,
+                            held_fd, held_xdata);
+        fd_unref(held_fd);
+        if (held_xdata)
+                dict_unref(held_xdata);
+        loc_wipe(held_loc);
+        GF_FREE(held_loc);
+}
+
+/*
+ * Final stage of readv. Called directly, or as the callback of the
+ * unlock request issued by crypt_readv_done().
+ *
+ * An unlock failure is reported to the user only when the read
+ * itself had not already failed. Everything released after the
+ * unwind is captured in automatic variables first.
+ */
+static int32_t __crypt_readv_done(call_frame_t *frame, void *cookie,
+                                  xlator_t *this, int32_t op_ret,
+                                  int32_t op_errno, dict_t *xdata)
+{
+        crypt_local_t *ctx = frame->local;
+        fd_t *held_fd = ctx->fd;
+        dict_t *held_xdata = ctx->xdata;
+        struct iobref *held_iobref = ctx->iobref;
+        struct iobref *held_iobref_data = ctx->iobref_data;
+        /* read deals with data configs only */
+        struct iovec *vec = ctx->data_conf.avec;
+        char **blk_pool = ctx->data_conf.pool;
+        int nr_blocks = ctx->data_conf.blocks_in_pool;
+
+        if (op_ret < 0) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "readv unlock failed (%d)", op_errno);
+                if (ctx->op_ret >= 0) {
+                        ctx->op_ret = op_ret;
+                        ctx->op_errno = op_errno;
+                }
+        }
+        dump_plain_text(ctx, vec);
+
+        gf_log("crypt", GF_LOG_DEBUG,
+               "readv: ret_to_user: %d, iovec len: %d, ia_size: %llu",
+               (int)(ctx->rw_count > 0 ? ctx->rw_count : ctx->op_ret),
+               (int)(ctx->rw_count > 0 ? iovec_get_size(vec, ctx->data_conf.acount) : 0),
+               (unsigned long long)ctx->buf.ia_size);
+
+        STACK_UNWIND_STRICT(readv, frame,
+                            ctx->rw_count > 0 ? ctx->rw_count : ctx->op_ret,
+                            ctx->op_errno,
+                            vec,
+                            vec ? ctx->data_conf.acount : 0,
+                            &ctx->buf,
+                            held_iobref,
+                            held_xdata);
+
+        free_avec(vec, blk_pool, nr_blocks);
+        fd_unref(held_fd);
+        if (held_xdata)
+                dict_unref(held_xdata);
+        if (held_iobref)
+                iobref_unref(held_iobref);
+        if (held_iobref_data)
+                iobref_unref(held_iobref_data);
+        return 0;
+}
+
+/*
+ * Finish a read: when the parent frame belongs to the crypt
+ * translator the file is left locked (the parent unlocks it),
+ * otherwise an unlock request is wound first. Both paths end in
+ * __crypt_readv_done().
+ */
+static void crypt_readv_done(call_frame_t *frame, xlator_t *this)
+{
+        crypt_local_t *ctx = frame->local;
+        struct gf_flock unlock = {0, };
+
+        if (parent_is_crypt_xlator(frame, this)) {
+                /*
+                 * don't unlock (it will be done by the parent)
+                 */
+                __crypt_readv_done(frame, NULL, this, 0, 0, NULL);
+                return;
+        }
+        unlock.l_type = F_UNLCK;
+        unlock.l_whence = SEEK_SET;
+        unlock.l_start = 0;
+        unlock.l_len = 0;
+        unlock.l_pid = 0;
+
+        STACK_WIND(frame, __crypt_readv_done,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->finodelk,
+                   this->name, ctx->fd, F_SETLKW, &unlock, NULL);
+}
+
+/*
+ * Drop one call of a readv operation; finish the read when
+ * put_one_call() reports true.
+ */
+static void put_one_call_readv(call_frame_t *frame, xlator_t *this)
+{
+        crypt_local_t *ctx = frame->local;
+
+        if (!put_one_call(ctx))
+                return;
+        crypt_readv_done(frame, this);
+}
+
+/*
+ * Final stage of writev. Called directly, or as the callback of the
+ * unlock request issued by crypt_writev_done().
+ *
+ * Computes the number of bytes to report to the user, releases the
+ * write resources and unwinds.
+ */
+static int32_t __crypt_writev_done(call_frame_t *frame, void *cookie,
+                                   xlator_t *this, int32_t op_ret,
+                                   int32_t op_errno, dict_t *xdata)
+{
+        crypt_local_t *ctx = frame->local;
+        fd_t *held_fd = ctx->fd;
+        dict_t *held_xdata = ctx->xdata;
+        int32_t ret_to_user;
+
+        if (ctx->xattr)
+                dict_unref(ctx->xattr);
+        /*
+         * Calculate the amount of bytes to be returned to the user.
+         * Paddings that have been written as a part of an atom are
+         * subtracted: first the head padding ...
+         */
+        if (ctx->rw_count == 0) {
+                /*
+                 * Nothing has been written, it must be an error
+                 */
+                ret_to_user = ctx->op_ret;
+        }
+        else if (ctx->rw_count <= ctx->data_conf.off_in_head) {
+                gf_log("crypt", GF_LOG_WARNING, "Incomplete write");
+                ret_to_user = 0;
+        }
+        else {
+                ret_to_user = ctx->rw_count -
+                        ctx->data_conf.off_in_head;
+        }
+        /*
+         * ... then the tail padding
+         */
+        if (ret_to_user > ctx->data_conf.orig_size)
+                ret_to_user = ctx->data_conf.orig_size;
+
+        if (ctx->iobref)
+                iobref_unref(ctx->iobref);
+        if (ctx->iobref_data)
+                iobref_unref(ctx->iobref_data);
+        free_avec_data(ctx);
+        free_avec_hole(ctx);
+
+        gf_log("crypt", GF_LOG_DEBUG,
+               "writev: ret_to_user: %d", ret_to_user);
+
+        STACK_UNWIND_STRICT(writev, frame,
+                            ret_to_user, ctx->op_errno,
+                            &ctx->prebuf, &ctx->postbuf,
+                            held_xdata);
+        fd_unref(held_fd);
+        if (held_xdata)
+                dict_unref(held_xdata);
+        return 0;
+}
+
+/*
+ * Called directly, or as the callback of the fsetxattr that updates
+ * the file size. A failed size update is only logged. When the parent
+ * frame belongs to the crypt translator the file is left locked,
+ * otherwise an unlock request is wound; both paths end in
+ * __crypt_writev_done().
+ */
+static int32_t crypt_writev_done(call_frame_t *frame, void *cookie,
+                                 xlator_t *this, int32_t op_ret,
+                                 int32_t op_errno, dict_t *xdata)
+{
+        crypt_local_t *ctx = frame->local;
+        struct gf_flock unlock = {0, };
+
+        if (op_ret < 0)
+                gf_log("crypt", GF_LOG_WARNING, "can not update file size");
+
+        if (parent_is_crypt_xlator(frame, this)) {
+                /*
+                 * don't unlock (it will be done by the parent)
+                 */
+                __crypt_writev_done(frame, NULL, this, 0, 0, NULL);
+                return 0;
+        }
+        unlock.l_type = F_UNLCK;
+        unlock.l_whence = SEEK_SET;
+        unlock.l_start = 0;
+        unlock.l_len = 0;
+        unlock.l_pid = 0;
+
+        STACK_WIND(frame, __crypt_writev_done,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->finodelk,
+                   this->name, ctx->fd, F_SETLKW, &unlock, NULL);
+        return 0;
+}
+
+/*
+ * Drop one call of a writev operation. When put_one_call() reports
+ * true the write is finished: if the on-disk file size has to be
+ * updated, it is submitted with fsetxattr first (reply handled by
+ * crypt_writev_done()), otherwise crypt_writev_done() is called
+ * directly.
+ */
+static void put_one_call_writev(call_frame_t *frame, xlator_t *this)
+{
+        crypt_local_t *ctx = frame->local;
+        int32_t rc;
+
+        if (!put_one_call(ctx))
+                return;
+
+        if (!ctx->update_disk_file_size) {
+                crypt_writev_done(frame, NULL, this, 0, 0, NULL);
+                return;
+        }
+        /*
+         * update file size, unlock the file and unwind
+         */
+        rc = dict_set(ctx->xattr,
+                      FSIZE_XATTR_PREFIX,
+                      data_from_uint64(ctx->cur_file_size));
+        if (rc) {
+                gf_log("crypt", GF_LOG_WARNING,
+                       "can not set key to update file size");
+                crypt_writev_done(frame, NULL, this, 0, 0, NULL);
+                return;
+        }
+        gf_log("crypt", GF_LOG_DEBUG,
+               "Updating disk file size to %llu",
+               (unsigned long long)ctx->cur_file_size);
+        STACK_WIND(frame, crypt_writev_done,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr,
+                   ctx->fd,
+                   ctx->xattr, /* FSIZE_XATTR_PREFIX was set above */
+                   0,
+                   NULL);
+}
+
+/*
+ * Callback of the unlock request issued by crypt_ftruncate_done():
+ * release the truncate resources and unwind.
+ *
+ * An unlock failure is reported to the user only when the truncate
+ * itself had not already failed.
+ */
+static int32_t __crypt_ftruncate_done(call_frame_t *frame, void *cookie,
+                                      xlator_t *this, int32_t op_ret,
+                                      int32_t op_errno, dict_t *xdata)
+{
+        crypt_local_t *ctx = frame->local;
+        /* released after the unwind */
+        fd_t *held_fd = ctx->fd;
+        dict_t *held_xdata = ctx->xdata;
+        char *held_iobase = ctx->vec.iov_base;
+
+        if (op_ret < 0) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "ftruncate unlock failed (%d)", op_errno);
+                if (ctx->op_ret >= 0) {
+                        ctx->op_ret = op_ret;
+                        ctx->op_errno = op_errno;
+                }
+        }
+        if (ctx->iobref_data)
+                iobref_unref(ctx->iobref_data);
+        free_avec_data(ctx);
+        free_avec_hole(ctx);
+
+        gf_log("crypt", GF_LOG_DEBUG,
+               "ftruncate, return to user: presize=%llu, postsize=%llu",
+               (unsigned long long)ctx->prebuf.ia_size,
+               (unsigned long long)ctx->postbuf.ia_size);
+
+        STACK_UNWIND_STRICT(ftruncate, frame,
+                            ctx->op_ret < 0 ? -1 : 0,
+                            ctx->op_errno,
+                            &ctx->prebuf, &ctx->postbuf,
+                            held_xdata);
+        fd_unref(held_fd);
+        if (held_xdata)
+                dict_unref(held_xdata);
+        if (held_iobase)
+                GF_FREE(held_iobase);
+        return 0;
+}
+
+/*
+ * Called directly by put_one_call_ftruncate(), or as the callback of
+ * the fsetxattr that updates the file size. Drops the xattr
+ * dictionary, logs a failed size update, and winds the unlock
+ * request whose reply is handled by __crypt_ftruncate_done().
+ *
+ * Always returns 0.
+ */
+static int32_t crypt_ftruncate_done(call_frame_t *frame,
+                                    void *cookie,
+                                    xlator_t *this,
+                                    int32_t op_ret,
+                                    int32_t op_errno,
+                                    dict_t *xdata)
+{
+        crypt_local_t *local = frame->local;
+        struct gf_flock lock = {0, };
+
+        /* guarded like the writev counterpart: xattr may be unset */
+        if (local->xattr)
+                dict_unref(local->xattr);
+        if (op_ret < 0)
+                gf_log("crypt", GF_LOG_WARNING, "can not update file size");
+
+        lock.l_type = F_UNLCK;
+        lock.l_whence = SEEK_SET;
+        lock.l_start = 0;
+        lock.l_len = 0;
+        lock.l_pid = 0;
+
+        STACK_WIND(frame,
+                   __crypt_ftruncate_done,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->finodelk,
+                   this->name,
+                   local->fd,
+                   F_SETLKW,
+                   &lock,
+                   NULL);
+        return 0;
+}
+
+/*
+ * Drop one call of an ftruncate operation. When put_one_call()
+ * reports true the truncate is finished: if the on-disk file size
+ * has to be updated, it is submitted with fsetxattr first (reply
+ * handled by crypt_ftruncate_done()), otherwise
+ * crypt_ftruncate_done() is called directly.
+ */
+static void put_one_call_ftruncate(call_frame_t *frame, xlator_t *this)
+{
+        crypt_local_t *ctx = frame->local;
+        int32_t rc;
+
+        if (!put_one_call(ctx))
+                return;
+
+        if (!ctx->update_disk_file_size) {
+                crypt_ftruncate_done(frame, NULL, this, 0, 0, NULL);
+                return;
+        }
+        /*
+         * update file size, unlock the file and unwind
+         */
+        rc = dict_set(ctx->xattr,
+                      FSIZE_XATTR_PREFIX,
+                      data_from_uint64(ctx->cur_file_size));
+        if (rc) {
+                gf_log("crypt", GF_LOG_WARNING,
+                       "can not set key to update file size");
+                crypt_ftruncate_done(frame, NULL, this, 0, 0, NULL);
+                return;
+        }
+        gf_log("crypt", GF_LOG_DEBUG,
+               "Updating disk file size to %llu",
+               (unsigned long long)ctx->cur_file_size);
+        STACK_WIND(frame, crypt_ftruncate_done,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr,
+                   ctx->fd,
+                   ctx->xattr, /* FSIZE_XATTR_PREFIX was set above */
+                   0,
+                   NULL);
+}
+
+/*
+ * load regular file size for some FOPs
+ */
+/*
+ * Callback of the (f)getxattr that fetches FSIZE_XATTR_PREFIX.
+ *
+ * On success the size found in @dict replaces ia_size of the
+ * attributes kept in the local state. Then the descriptor and the
+ * location of the local state are released and the fop recorded in
+ * local->fop (fstat, stat, lookup or readv) is unwound. A missing
+ * size value turns the result into -1/EIO.
+ *
+ * The xdata and inode references of the local state are dropped
+ * exactly once, after the unwind. Always returns 0.
+ */
+static int32_t load_file_size(call_frame_t *frame,
+                              void *cookie,
+                              xlator_t *this,
+                              int32_t op_ret,
+                              int32_t op_errno,
+                              dict_t *dict,
+                              dict_t *xdata)
+{
+        data_t *data;
+        crypt_local_t *local = frame->local;
+
+        dict_t *local_xdata = local->xdata;
+        inode_t *local_inode = local->inode;
+
+        if (op_ret < 0)
+                goto unwind;
+        /*
+         * load regular file size
+         */
+        data = dict_get(dict, FSIZE_XATTR_PREFIX);
+        if (!data) {
+                /*
+                 * local->xdata must not be released here: it is still
+                 * passed to the unwind below and is released after it
+                 */
+                gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+                op_ret = -1;
+                op_errno = EIO;
+                goto unwind;
+        }
+        local->buf.ia_size = data_to_uint64(data);
+
+        gf_log(this->name, GF_LOG_DEBUG,
+               "FOP %d: Translate regular file to %llu",
+               local->fop,
+               (unsigned long long)local->buf.ia_size);
+ unwind:
+        if (local->fd)
+                fd_unref(local->fd);
+        if (local->loc) {
+                loc_wipe(local->loc);
+                GF_FREE(local->loc);
+        }
+        switch (local->fop) {
+        case GF_FOP_FSTAT:
+                STACK_UNWIND_STRICT(fstat,
+                                    frame,
+                                    op_ret,
+                                    op_errno,
+                                    op_ret >= 0 ? &local->buf : NULL,
+                                    local->xdata);
+                break;
+        case GF_FOP_STAT:
+                STACK_UNWIND_STRICT(stat,
+                                    frame,
+                                    op_ret,
+                                    op_errno,
+                                    op_ret >= 0 ? &local->buf : NULL,
+                                    local->xdata);
+                break;
+        case GF_FOP_LOOKUP:
+                STACK_UNWIND_STRICT(lookup,
+                                    frame,
+                                    op_ret,
+                                    op_errno,
+                                    op_ret >= 0 ? local->inode : NULL,
+                                    op_ret >= 0 ? &local->buf : NULL,
+                                    local->xdata,
+                                    op_ret >= 0 ? &local->postbuf : NULL);
+                break;
+        case GF_FOP_READ:
+                STACK_UNWIND_STRICT(readv,
+                                    frame,
+                                    op_ret,
+                                    op_errno,
+                                    NULL,
+                                    0,
+                                    op_ret >= 0 ? &local->buf : NULL,
+                                    NULL,
+                                    NULL);
+                break;
+        default:
+                gf_log(this->name, GF_LOG_WARNING,
+                       "Improper file operation %d", local->fop);
+        }
+        if (local_xdata)
+                dict_unref(local_xdata);
+        if (local_inode)
+                inode_unref(local_inode);
+        return 0;
+}
+
+/*
+ * Common callback of stat and fstat.
+ *
+ * For a successfully stat'ed regular file the reply is stored in the
+ * local state and the file size is requested with (f)getxattr; that
+ * reply is handled by load_file_size(). Failures and other file
+ * types are unwound right away with the reply as received.
+ */
+static int32_t crypt_stat_common_cbk(call_frame_t *frame, void *cookie,
+                                     xlator_t *this, int32_t op_ret,
+                                     int32_t op_errno,
+                                     struct iatt *buf, dict_t *xdata)
+{
+        crypt_local_t *ctx = frame->local;
+
+        if (op_ret < 0 || !IA_ISREG(buf->ia_type))
+                goto unwind;
+
+        ctx->buf = *buf;
+        if (xdata)
+                ctx->xdata = dict_ref(xdata);
+
+        switch (ctx->fop) {
+        case GF_FOP_FSTAT:
+                STACK_WIND(frame, load_file_size,
+                           FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->fgetxattr,
+                           ctx->fd, FSIZE_XATTR_PREFIX, NULL);
+                break;
+        case GF_FOP_STAT:
+                STACK_WIND(frame, load_file_size,
+                           FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->getxattr,
+                           ctx->loc, FSIZE_XATTR_PREFIX, NULL);
+                break;
+        default:
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Improper file operation %d", ctx->fop);
+        }
+        return 0;
+ unwind:
+        /* nothing more to load: drop the descriptor and the location */
+        if (ctx->fd)
+                fd_unref(ctx->fd);
+        if (ctx->loc) {
+                loc_wipe(ctx->loc);
+                GF_FREE(ctx->loc);
+        }
+        switch (ctx->fop) {
+        case GF_FOP_FSTAT:
+                STACK_UNWIND_STRICT(fstat, frame, op_ret, op_errno,
+                                    op_ret >= 0 ? buf : NULL,
+                                    op_ret >= 0 ? xdata : NULL);
+                break;
+        case GF_FOP_STAT:
+                STACK_UNWIND_STRICT(stat, frame, op_ret, op_errno,
+                                    op_ret >= 0 ? buf : NULL,
+                                    op_ret >= 0 ? xdata : NULL);
+                break;
+        default:
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Improper file operation %d", ctx->fop);
+        }
+        return 0;
+}
+
+/*
+ * fstat fop: keep a reference to @fd in the local state and wind
+ * fstat to the first child; the reply is handled by
+ * crypt_stat_common_cbk(). Unwinds with -1/ENOMEM when the local
+ * state can not be allocated.
+ */
+static int32_t crypt_fstat(call_frame_t *frame, xlator_t *this,
+                           fd_t *fd, dict_t *xdata)
+{
+        crypt_local_t *ctx = crypt_alloc_local(frame, this, GF_FOP_FSTAT);
+
+        if (!ctx) {
+                STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, NULL, NULL);
+                return 0;
+        }
+        ctx->fd = fd_ref(fd);
+        STACK_WIND(frame, crypt_stat_common_cbk,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat,
+                   fd, xdata);
+        return 0;
+}
+
+/*
+ * stat fop: keep a private copy of @loc in the local state and wind
+ * stat to the first child; the reply is handled by
+ * crypt_stat_common_cbk(). Any setup failure is unwound with
+ * -1/ENOMEM.
+ */
+static int32_t crypt_stat(call_frame_t *frame, xlator_t *this,
+                          loc_t *loc, dict_t *xdata)
+{
+        int32_t rc;
+        crypt_local_t *ctx;
+
+        ctx = crypt_alloc_local(frame, this, GF_FOP_STAT);
+        if (!ctx)
+                goto error;
+
+        ctx->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+        if (!ctx->loc)
+                goto error;
+        memset(ctx->loc, 0, sizeof(*ctx->loc));
+
+        rc = loc_copy(ctx->loc, loc);
+        if (rc) {
+                GF_FREE(ctx->loc);
+                goto error;
+        }
+        STACK_WIND(frame, crypt_stat_common_cbk,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->stat,
+                   loc, xdata);
+        return 0;
+ error:
+        STACK_UNWIND_STRICT(stat, frame, -1, ENOMEM, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Callback of lookup.
+ *
+ * For a successfully looked up regular file the reply is stored in
+ * the local state, the gfid of the reply is copied into the saved
+ * location, and the file size is requested with getxattr (reply
+ * handled by load_file_size()). Failures and other file types are
+ * unwound right away with the reply as received.
+ */
+static int32_t crypt_lookup_cbk(call_frame_t *frame, void *cookie,
+                                xlator_t *this, int32_t op_ret,
+                                int32_t op_errno, inode_t *inode,
+                                struct iatt *buf, dict_t *xdata,
+                                struct iatt *postparent)
+{
+        crypt_local_t *ctx = frame->local;
+
+        if (op_ret < 0 || !IA_ISREG(buf->ia_type)) {
+                loc_wipe(ctx->loc);
+                GF_FREE(ctx->loc);
+                STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno,
+                                    inode, buf, xdata, postparent);
+                return 0;
+        }
+        ctx->inode = inode_ref(inode);
+        ctx->buf = *buf;
+        ctx->postbuf = *postparent;
+        if (xdata)
+                ctx->xdata = dict_ref(xdata);
+        uuid_copy(ctx->loc->gfid, buf->ia_gfid);
+
+        STACK_WIND(frame, load_file_size,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->getxattr,
+                   ctx->loc, FSIZE_XATTR_PREFIX, NULL);
+        return 0;
+}
+
+/*
+ * lookup fop: keep a private copy of @loc in the local state and
+ * wind lookup to the first child; the reply is handled by
+ * crypt_lookup_cbk(). Any setup failure is unwound with -1/ENOMEM.
+ */
+static int32_t crypt_lookup(call_frame_t *frame, xlator_t *this,
+                            loc_t *loc, dict_t *xdata)
+{
+        int32_t rc;
+        crypt_local_t *ctx;
+
+        ctx = crypt_alloc_local(frame, this, GF_FOP_LOOKUP);
+        if (!ctx)
+                goto error;
+
+        ctx->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+        if (!ctx->loc)
+                goto error;
+        memset(ctx->loc, 0, sizeof(*ctx->loc));
+
+        rc = loc_copy(ctx->loc, loc);
+        if (rc) {
+                GF_FREE(ctx->loc);
+                goto error;
+        }
+        gf_log(this->name, GF_LOG_DEBUG, "Lookup %s", loc->path);
+        STACK_WIND(frame, crypt_lookup_cbk,
+                   FIRST_CHILD(this), FIRST_CHILD(this)->fops->lookup,
+                   loc, xdata);
+        return 0;
+ error:
+        STACK_UNWIND_STRICT(lookup, frame, -1, ENOMEM,
+                            NULL, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * for every regular directory entry find its real file size
+ * and update stat's buf properly
+ */
+static int32_t crypt_readdirp_cbk(call_frame_t *frame, void *cookie,
+                                  xlator_t *this, int32_t op_ret,
+                                  int32_t op_errno,
+                                  gf_dirent_t *entries, dict_t *xdata)
+{
+        gf_dirent_t *dent = NULL;
+        data_t *fsize;
+
+        if (op_ret >= 0) {
+                list_for_each_entry (dent, (&entries->list), list) {
+                        /* only regular files carry a size attribute */
+                        if (!IA_ISREG(dent->d_stat.ia_type))
+                                continue;
+                        fsize = dict_get(dent->dict, FSIZE_XATTR_PREFIX);
+                        if (!fsize) {
+                                /* one missing size fails the whole reply */
+                                gf_log("crypt", GF_LOG_WARNING,
+                                       "Regular file size of direntry not found");
+                                op_errno = EIO;
+                                op_ret = -1;
+                                break;
+                        }
+                        dent->d_stat.ia_size = data_to_uint64(fsize);
+                }
+        }
+        STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+        return 0;
+}
+
+/*
+ * ->readdirp() fills in-core inodes, so we need to set proper
+ * file sizes for all directory entries of the parent @fd.
+ * Actual updates take place in ->crypt_readdirp_cbk()
+ */
+/*
+ * readdirp fop: add FSIZE_XATTR_PREFIX to the request xdata (a new
+ * dictionary is created when the caller passed none) and wind
+ * readdirp to the first child; the reply is handled by
+ * crypt_readdirp_cbk().
+ *
+ * On failure the call is unwound with -1 and ENOMEM. Always
+ * returns 0.
+ */
+static int32_t crypt_readdirp(call_frame_t *frame, xlator_t *this,
+                              fd_t *fd, size_t size, off_t offset,
+                              dict_t *xdata)
+{
+        int32_t ret = ENOMEM;
+
+        if (!xdata) {
+                xdata = dict_new();
+                if (!xdata)
+                        goto error;
+        }
+        else
+                dict_ref(xdata);
+        /*
+         * make sure that we'll have real file sizes at ->readdirp_cbk()
+         */
+        if (dict_set(xdata, FSIZE_XATTR_PREFIX, data_from_uint64(0))) {
+                /*
+                 * the dict_set() status is not an errno value, so
+                 * ret keeps ENOMEM for the unwind
+                 */
+                dict_unref(xdata);
+                goto error;
+        }
+        STACK_WIND(frame,
+                   crypt_readdirp_cbk,
+                   FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp,
+                   fd,
+                   size,
+                   offset,
+                   xdata);
+        dict_unref(xdata);
+        return 0;
+ error:
+        STACK_UNWIND_STRICT(readdirp, frame, -1, ret, NULL, NULL);
+        return 0;
+}
+
+/*
+ * access fop: not supported; logs a warning and unwinds with
+ * -1/EPERM.
+ */
+static int32_t crypt_access(call_frame_t *frame, xlator_t *this,
+                            loc_t *loc, int32_t mask, dict_t *xdata)
+{
+        gf_log(this->name, GF_LOG_WARNING,
+               "NFS mounts of encrypted volumes are unsupported");
+
+        STACK_UNWIND_STRICT(access, frame, -1, EPERM, NULL);
+        return 0;
+}
+
+/*
+ * Read the "block-size" option (from @options on reconfigure, from
+ * the translator options otherwise) and store its base-2 logarithm
+ * in the master cipher info. Accepted sizes: 512, 1024, 2048, 4096.
+ *
+ * Returns 0 on success, -1 on a bad or unsupported value.
+ */
+int32_t master_set_block_size (xlator_t *this, crypt_private_t *priv,
+                               dict_t *options)
+{
+        uint64_t block_size = 0;
+        int bits;
+        struct master_cipher_info *master = get_master_cinfo(priv);
+
+        if (options != NULL)
+                GF_OPTION_RECONF("block-size", block_size, options,
+                                 size_uint64, error);
+        else
+                GF_OPTION_INIT("block-size", block_size, size_uint64, error);
+
+        switch (block_size) {
+        case 512:
+                bits = 9;
+                break;
+        case 1024:
+                bits = 10;
+                break;
+        case 2048:
+                bits = 11;
+                break;
+        case 4096:
+                bits = 12;
+                break;
+        default:
+                gf_log("crypt", GF_LOG_ERROR,
+                       "FATAL: unsupported block size %llu",
+                       (unsigned long long)block_size);
+                goto error;
+        }
+        master->m_block_bits = bits;
+        return 0;
+ error:
+        return -1;
+}
+
+/* Select the cipher algorithm of the master info: always AES. */
+int32_t master_set_alg(xlator_t *this, crypt_private_t *priv)
+{
+        get_master_cinfo(priv)->m_alg = AES_CIPHER_ALG;
+        return 0;
+}
+
+/* Select the cipher mode of the master info: always XTS. */
+int32_t master_set_mode(xlator_t *this, crypt_private_t *priv)
+{
+        get_master_cinfo(priv)->m_mode = XTS_CIPHER_MODE;
+        return 0;
+}
+
+/*
+ * set key size in bits to the master info
+ * Pre-conditions: cipher mode in the master info is uptodate.
+ */
+static int master_set_data_key_size (xlator_t *this, crypt_private_t *priv,
+                                     dict_t *options)
+{
+        uint64_t key_size = 0;
+        struct master_cipher_info *master = get_master_cinfo(priv);
+
+        if (options != NULL)
+                GF_OPTION_RECONF("data-key-size", key_size, options,
+                                 uint64, error);
+        else
+                GF_OPTION_INIT("data-key-size", key_size, uint64, error);
+
+        /* let the selected algorithm/mode validate the size */
+        if (data_cipher_algs[master->m_alg][master->m_mode].check_key(key_size)) {
+                gf_log("crypt", GF_LOG_ERROR,
+                       "FATAL: wrong bin key size %llu for alg %d mode %d",
+                       (unsigned long long)key_size,
+                       (int)master->m_alg,
+                       (int)master->m_mode);
+                goto error;
+        }
+        master->m_dkey_size = key_size;
+        return 0;
+ error:
+        return -1;
+}
+
+/*
+ * Return non-zero when the character at @s is a hexadecimal digit
+ * in lower case ('0'-'9', 'a'-'f'); upper-case digits are rejected.
+ */
+static int is_hex(char *s) {
+        return ('0' <= *s && *s <= '9') || ('a' <= *s && *s <= 'f');
+}
+
+/*
+ * Convert @hex_size hexadecimal characters of @src into
+ * @hex_size / 2 bytes stored at @dst.
+ *
+ * Returns 0 on success, -1 (after logging) when a character is not
+ * a lower-case hexadecimal digit or a pair can not be converted.
+ */
+static int parse_hex_buf(xlator_t *this, char *src, unsigned char *dst,
+                         int hex_size)
+{
+        int i;
+        /* "%x" stores through a pointer to unsigned int */
+        unsigned int hex_byte = 0;
+
+        for (i = 0; i < (hex_size / 2); i++) {
+                if (!is_hex(src + i*2) || !is_hex(src + i*2 + 1)) {
+                        gf_log("crypt", GF_LOG_ERROR,
+                               "FATAL: not hex symbol in key");
+                        return -1;
+                }
+                if (sscanf(src + i*2, "%2x", &hex_byte) != 1) {
+                        gf_log("crypt", GF_LOG_ERROR,
+                               "FATAL: can not parse hex key");
+                        return -1;
+                }
+                dst[i] = hex_byte & 0xff;
+        }
+        return 0;
+}
+
+/*
+ * Parse options;
+ * install master volume key
+ */
+/*
+ * Install the master volume key.
+ *
+ * The "master-key" option names a file; its first
+ * 2 * MASTER_VOL_KEY_SIZE characters are parsed as lower-case
+ * hexadecimal digits and the resulting MASTER_VOL_KEY_SIZE bytes are
+ * copied into the master cipher info.
+ *
+ * Returns 0 on success, -1 on failure (missing option, unreadable or
+ * too short file, non-hex content). Both temporary key buffers are
+ * cleared on every path that may have filled them.
+ * NOTE(review): a plain memset() of a buffer that is not read again
+ * may be elided by the compiler - consider a non-elidable wipe.
+ */
+int32_t master_set_master_vol_key(xlator_t *this, crypt_private_t *priv)
+{
+        int32_t ret;
+        FILE *file = NULL;
+
+        int32_t key_size;
+        char *opt_key_file_pathname = NULL;
+
+        unsigned char bin_buf[MASTER_VOL_KEY_SIZE];
+        char hex_buf[2 * MASTER_VOL_KEY_SIZE];
+
+        struct master_cipher_info *master = get_master_cinfo(priv);
+        /*
+         * extract master key passed via option
+         */
+        GF_OPTION_INIT("master-key", opt_key_file_pathname, path, bad_key);
+
+        if (!opt_key_file_pathname) {
+                gf_log(this->name, GF_LOG_ERROR, "FATAL: missing master key");
+                return -1;
+        }
+        gf_log(this->name, GF_LOG_DEBUG, "handling file key %s",
+               opt_key_file_pathname);
+
+        file = fopen(opt_key_file_pathname, "r");
+        if (file == NULL) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "FATAL: can not open file with master key");
+                return -1;
+        }
+        /*
+         * extract hex key
+         */
+        key_size = fread(hex_buf, 1, sizeof(hex_buf), file);
+        if (key_size < sizeof(hex_buf)) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "FATAL: master key is too short");
+                goto bad_key;
+        }
+        ret = parse_hex_buf(this, hex_buf, bin_buf, key_size);
+        if (ret)
+                goto bad_key;
+        memcpy(master->m_key, bin_buf, MASTER_VOL_KEY_SIZE);
+        memset(hex_buf, 0, sizeof(hex_buf));
+        fclose(file);
+
+        memset(bin_buf, 0, sizeof(bin_buf));
+        return 0;
+ bad_key:
+        gf_log(this->name, GF_LOG_ERROR, "FATAL: bad master key");
+        if (file)
+                fclose(file);
+        /* the hex text may hold (part of) the key as well */
+        memset(hex_buf, 0, sizeof(hex_buf));
+        memset(bin_buf, 0, sizeof(bin_buf));
+        return -1;
+}
+
+/*
+ * Derive volume key for object-id authentication
+ */
+int32_t master_set_nmtd_vol_key(xlator_t *this, crypt_private_t *priv)
+{
+        struct master_cipher_info *master = get_master_cinfo(priv);
+
+        /* the result of the derivation is passed through unchanged */
+        return get_nmtd_vol_key(master);
+}
+
+/*
+ * Fill the master cipher info of the translator: algorithm, mode,
+ * block size, data key size, master volume key and the derived
+ * volume key, in this order. Stops at the first step that fails and
+ * returns its result; returns 0 when every step succeeds.
+ */
+int32_t crypt_init_xlator(xlator_t *this)
+{
+        crypt_private_t *priv = this->private;
+        int32_t ret;
+
+        ret = master_set_alg(this, priv);
+        if (!ret)
+                ret = master_set_mode(this, priv);
+        if (!ret)
+                ret = master_set_block_size(this, priv, NULL);
+        if (!ret)
+                ret = master_set_data_key_size(this, priv, NULL);
+        if (!ret)
+                ret = master_set_master_vol_key(this, priv);
+        if (!ret)
+                ret = master_set_nmtd_vol_key(this, priv);
+        return ret;
+}
+
+/*
+ * Allocate the zero-filled private data of the translator.
+ * Returns 0 on success, ENOMEM otherwise.
+ */
+static int32_t crypt_alloc_private(xlator_t *this)
+{
+        this->private = GF_CALLOC(1, sizeof(crypt_private_t), gf_crypt_mt_priv);
+        if (this->private)
+                return 0;
+
+        gf_log("crypt", GF_LOG_ERROR,
+               "Can not allocate memory for private data");
+        return ENOMEM;
+}
+
+/*
+ * Clear and release the private data of the translator.
+ * this->private is reset to NULL, so a repeated call (for example
+ * fini() after a failed init()) is a no-op instead of a double free.
+ */
+static void crypt_free_private(xlator_t *this)
+{
+        crypt_private_t *priv = this->private;
+        if (priv) {
+                /* the private data holds key material: wipe it first */
+                memset(priv, 0, sizeof(*priv));
+                GF_FREE(priv);
+                this->private = NULL;
+        }
+}
+
+/*
+ * Initialize memory accounting of the translator for the crypt
+ * memory types. Returns -1 when @this is NULL, otherwise the result
+ * of xlator_mem_acct_init() (0 on success).
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int ret = -1;
+
+        if (!this)
+                return ret;
+
+        ret = xlator_mem_acct_init (this, gf_crypt_mt_end);
+
+        if (ret != 0) {
+                /* literals are concatenated: keep the separating blank */
+                gf_log(this->name, GF_LOG_ERROR, "Memory accounting init "
+                       "failed");
+                return ret;
+        }
+
+        return ret;
+}
+
+/*
+ * Apply changed options: re-read the block size and the data key
+ * size from @options.
+ *
+ * Returns 0 on success; -1 when an argument is missing or an option
+ * can not be applied.
+ */
+int32_t reconfigure (xlator_t *this, dict_t *options)
+{
+        int32_t ret = -1;
+        crypt_private_t *priv = NULL;
+
+        GF_VALIDATE_OR_GOTO ("crypt", this, error);
+        GF_VALIDATE_OR_GOTO (this->name, this->private, error);
+        GF_VALIDATE_OR_GOTO (this->name, options, error);
+
+        priv = this->private;
+
+        ret = master_set_block_size(this, priv, options);
+        if (ret) {
+                /* log under the translator name, not a literal string */
+                gf_log(this->name, GF_LOG_ERROR,
+                       "Failed to reconfigure block size");
+                goto error;
+        }
+        ret = master_set_data_key_size(this, priv, options);
+        if (ret) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "Failed to reconfigure data key size");
+                goto error;
+        }
+        return 0;
+ error:
+        return ret;
+}
+
+/*
+ * Translator entry point: check the graph position (exactly one
+ * child is required, a missing parent is only warned about),
+ * allocate and fill the private data and create the pool of local
+ * structures.
+ *
+ * Returns 0 on success and a non-zero value on failure; the private
+ * data is released when a later step fails.
+ */
+int32_t init(xlator_t *this)
+{
+        int32_t ret;
+
+        if (!this->children || this->children->next) {
+                gf_log ("crypt", GF_LOG_ERROR,
+                        "FATAL: crypt should have exactly one child");
+                return EINVAL;
+        }
+        if (!this->parents)
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+
+        ret = crypt_alloc_private(this);
+        if (ret)
+                return ret;
+
+        ret = crypt_init_xlator(this);
+        if (ret)
+                goto error;
+
+        this->local_pool = mem_pool_new(crypt_local_t, 64);
+        if (this->local_pool) {
+                gf_log ("crypt", GF_LOG_INFO, "crypt xlator loaded");
+                return 0;
+        }
+        gf_log(this->name, GF_LOG_ERROR,
+               "failed to create local_t's memory pool");
+        ret = ENOMEM;
+ error:
+        crypt_free_private(this);
+        return ret;
+}
+
+/* Translator exit point: release the private data. */
+void
+fini (xlator_t *this)
+{
+        crypt_free_private(this);
+}
+
+/*
+ * File operations implemented by the crypt translator.
+ */
+struct xlator_fops fops = {
+ .readv = crypt_readv,
+ .writev = crypt_writev,
+ .truncate = crypt_truncate,
+ .ftruncate = crypt_ftruncate,
+ .setxattr = crypt_setxattr,
+ .fsetxattr = crypt_fsetxattr,
+ .link = crypt_link,
+ .unlink = crypt_unlink,
+ .rename = crypt_rename,
+ .open = crypt_open,
+ .create = crypt_create,
+ .stat = crypt_stat,
+ .fstat = crypt_fstat,
+ .lookup = crypt_lookup,
+ .readdirp = crypt_readdirp,
+ .access = crypt_access
+};
+
+/*
+ * Callbacks implemented by the crypt translator: only forget.
+ */
+struct xlator_cbks cbks = {
+ .forget = crypt_forget
+};
+
+/*
+ * Options of the crypt translator.
+ *
+ * "block-size" is given in bytes: master_set_block_size() accepts
+ * 512, 1024, 2048 and 4096 and stores the matching number of bits
+ * (9..12).
+ */
+struct volume_options options[] = {
+        { .key  = {"master-key"},
+          .type = GF_OPTION_TYPE_PATH,
+          .description = "Pathname of regular file which contains master volume key"
+        },
+        { .key  = {"data-key-size"},
+          .type = GF_OPTION_TYPE_SIZET,
+          .description = "Data key size (bits)",
+          .min  = 256,
+          .max  = 512,
+          .default_value = "256",
+        },
+        { .key  = {"block-size"},
+          .type = GF_OPTION_TYPE_SIZET,
+          .description = "Atom size (bytes)",
+          .min  = 512,
+          .max  = 4096,
+          .default_value = "4096"
+        },
+        { .key  = {NULL} },
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/crypt.h b/xlators/encryption/crypt/src/crypt.h
new file mode 100644
index 000000000..eb7291f13
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt.h
@@ -0,0 +1,908 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CRYPT_H__
+#define __CRYPT_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <openssl/aes.h>
+#include <openssl/evp.h>
+#include <openssl/sha.h>
+#include <openssl/hmac.h>
+#include <openssl/cmac.h>
+#include <openssl/modes.h>
+#include "crypt-mem-types.h"
+#include "compat.h"
+
+/* value returned by crypt_xlator_id() */
+#define CRYPT_XLATOR_ID (0)
+
+/* MAX_IOVEC is 1 << MAX_IOVEC_BITS, i.e. 8 */
+#define MAX_IOVEC_BITS (3)
+#define MAX_IOVEC (1 << MAX_IOVEC_BITS)
+#define KEY_FACTOR_BITS (6)
+
+/*
+ * DEBUG_CRYPT enables the checks guarded by "#if DEBUG_CRYPT";
+ * TRIVIAL_TFM together with DEBUG_CRYPT makes cipher_data() return
+ * without transforming anything.
+ */
+#define DEBUG_CRYPT (0)
+#define TRIVIAL_TFM (0)
+
+/* presumably bounds of log2(atom size): 1 << 9 = 512, 1 << 12 = 4096 */
+#define CRYPT_MIN_BLOCK_BITS (9)
+#define CRYPT_MAX_BLOCK_BITS (12)
+
+/* array sizes of m_key and m_nmtd_key in struct master_cipher_info */
+#define MASTER_VOL_KEY_SIZE (32)
+#define NMTD_VOL_KEY_SIZE (16)
+
+/*
+ * Provide loff_t on the platforms named below.
+ * NOTE(review): the NetBSD definition is signed (off_t) while the
+ * Darwin one is unsigned (uint64_t) -- confirm this is intended.
+ */
+#if defined(__NetBSD__)
+typedef off_t loff_t;
+#endif
+
+#if defined(GF_DARWIN_HOST_OS)
+typedef uint64_t loff_t;
+#endif
+
+/*
+ * Descriptor of one key type; crypt_keys[] holds one per
+ * crypt_key_type value.
+ */
+struct crypt_key {
+ uint32_t len; /* presumably in bits: master_key_size() returns len >> 3 */
+ const char *label;
+};
+
+/*
+ * Add new key types to the end of this
+ * enumeration but before LAST_KEY_TYPE
+ * (LAST_KEY_TYPE is the size of the crypt_keys[] array).
+ */
+typedef enum {
+ MASTER_VOL_KEY,
+ NMTD_VOL_KEY,
+ NMTD_LINK_KEY,
+ EMTD_FILE_KEY,
+ DATA_FILE_KEY_256,
+ DATA_FILE_KEY_512,
+ LAST_KEY_TYPE
+}crypt_key_type;
+
+/*
+ * Input and output of one key derivation: a parent key, the
+ * requested child key size, the fixed input data and the buffer
+ * receiving the child keying material.
+ */
+struct kderive_context {
+ const unsigned char *pkey;/* parent key */
+ uint32_t pkey_len; /* parent key size, bits */
+ uint32_t ckey_len; /* child key size, bits */
+ unsigned char *fid; /* fixed input data, NIST 800-108, 5.1 */
+ uint32_t fid_len; /* fid len, bytes */
+ unsigned char *out; /* contains child keying material */
+ uint32_t out_len; /* out len, bytes */
+};
+
+/*
+ * Kind of content an atom carries; conf_by_type() maps DATA_ATOM to
+ * the data config and HOLE_ATOM to the hole config of a frame.
+ */
+typedef enum {
+ DATA_ATOM,
+ HOLE_ATOM,
+ LAST_DATA_TYPE
+}atom_data_type;
+
+/*
+ * Position of an atom within an aligned vector (see
+ * has_head_block(), has_tail_block() and has_full_blocks()).
+ */
+typedef enum {
+ HEAD_ATOM,
+ TAIL_ATOM,
+ FULL_ATOM,
+ LAST_LOCALITY_TYPE
+}atom_locality_type;
+
+/*
+ * Metadata operations. linkop_mtdop_dispatch() maps link to
+ * MTD_APPEND, unlink to MTD_CUT and rename to MTD_OVERWRITE, and
+ * returns MTD_LAST_OP for any other fop.
+ */
+typedef enum {
+ MTD_CREATE,
+ MTD_APPEND,
+ MTD_OVERWRITE,
+ MTD_CUT,
+ MTD_LAST_OP
+} mtd_op_t;
+
+/*
+ * Two key/block-function pairs. encrypt_aes_xts() fills an
+ * XTS128_CONTEXT through these members (key1/block1 for the data,
+ * key2/block2 for the tweak) and passes it to
+ * CRYPTO_xts128_encrypt().
+ * NOTE(review): presumably has to match the layout expected by the
+ * OpenSSL version in use -- confirm.
+ */
+struct xts128_context {
+ void *key1, *key2;
+ block128_f block1,block2;
+};
+
+/*
+ * Cipher attributes and key schedules of one object; o_alg and
+ * o_mode select the entry of data_cipher_algs[][] that is used.
+ */
+struct object_cipher_info {
+ cipher_alg_t o_alg;
+ cipher_mode_t o_mode;
+ uint32_t o_block_bits; /* get_atom_size() returns 1 << o_block_bits */
+ uint32_t o_dkey_size; /* raw data key size in bits */
+ union {
+ struct {
+ unsigned char ivec[16]; /* tweak, loaded by set_iv_aes_xts() */
+ AES_KEY dkey[2]; /* indexed by AES_ENCRYPT / AES_DECRYPT */
+ AES_KEY tkey; /* key used for tweaking */
+ XTS128_CONTEXT xts;
+ } aes_xts;
+ } u;
+};
+
+/*
+ * Volume-wide cipher settings and keys; embedded in
+ * crypt_private_t.
+ */
+struct master_cipher_info {
+ /*
+ * attributes inherited by newly created regular files
+ */
+ cipher_alg_t m_alg;
+ cipher_mode_t m_mode;
+ uint32_t m_block_bits;
+ uint32_t m_dkey_size; /* raw key size in bits */
+ /*
+ * master key
+ */
+ unsigned char m_key[MASTER_VOL_KEY_SIZE];
+ /*
+ * volume key for oid authentication
+ */
+ unsigned char m_nmtd_key[NMTD_VOL_KEY_SIZE];
+};
+
+/*
+ * Per-inode crypt information.
+ * This info does not change during the file's lifetime.
+ */
+struct crypt_inode_info {
+#if DEBUG_CRYPT
+ loc_t *loc; /* pathname that the file has been
+ opened, or created with */
+#endif
+ uint16_t nr_minor;
+ uuid_t oid;
+ struct object_cipher_info cinfo; /* returned by get_object_cinfo() */
+};
+
+/*
+ * Private data of the translator.
+ * This should be located in secure memory.
+ */
+typedef struct {
+ struct master_cipher_info master; /* returned by get_master_cinfo() */
+} crypt_private_t;
+
+/* Return the master cipher info embedded in the private data @priv. */
+static inline struct master_cipher_info *get_master_cinfo(crypt_private_t *priv)
+{
+        struct master_cipher_info *master = &priv->master;
+
+        return master;
+}
+
+/* Return the object cipher info embedded in the inode info @info. */
+static inline struct object_cipher_info *get_object_cinfo(struct crypt_inode_info
+                                                          *info)
+{
+        struct object_cipher_info *object = &info->cinfo;
+
+        return object;
+}
+
+/*
+ * this describes layouts and properties
+ * of atoms in an aligned vector
+ */
+struct avec_config {
+ uint32_t atom_size;
+ atom_data_type type;
+ size_t orig_size;
+ off_t orig_offset;
+ size_t expanded_size;
+ off_t aligned_offset;
+
+ uint32_t off_in_head; /* align_iov_by_atoms() zero-fills this many
+ * leading bytes of the first block and
+ * copies the data after them */
+ uint32_t off_in_tail;
+ uint32_t gap_in_tail;
+ uint32_t nr_full_blocks;
+
+ struct iovec *avec; /* aligned vector */
+ uint32_t acount; /* number of avec components. The same
+ * as number of occupied logical blocks */
+ char **pool;
+ uint32_t blocks_in_pool;
+ uint32_t cursor; /* makes sense only for ordered writes,
+ * so there are no races on this counter.
+ *
+ * Cursor is per-config object, we don't
+ * reset cursor for atoms of different
+ * localities (head, tail, full)
+ */
+};
+
+
+/*
+ * Per-call state of the crypt translator; reached through
+ * frame->local (see get_data_conf() and get_hole_conf()).
+ */
+typedef struct {
+ glusterfs_fop_t fop; /* code of FOP this local info built for */
+ fd_t *fd;
+ inode_t *inode;
+ loc_t *loc;
+ int32_t mac_idx;
+ loc_t *newloc;
+ int32_t flags;
+ int32_t wbflags;
+ struct crypt_inode_info *info;
+ struct iobref *iobref;
+ struct iobref *iobref_data; /* collects the blocks handed out by
+ data_alloc_block() */
+ off_t offset;
+
+ uint64_t old_file_size; /* per FOP, retrieved under lock held */
+ uint64_t cur_file_size; /* per iteration, before issuing IOs */
+ uint64_t new_file_size; /* per iteration, after issuing IOs */
+
+ uint64_t io_offset; /* offset of IOs issued per iteration */
+ uint64_t io_offset_nopad; /* offset of user's data in the atom */
+ uint32_t io_size; /* size of IOs issued per iteration */
+ uint32_t io_size_nopad; /* size of user's data in the IOs */
+ uint32_t eof_padding_size; /* size of EOF padding in the IOs */
+
+ gf_lock_t call_lock; /* protect nr_calls from many cbks */
+ int32_t nr_calls; /* raised by get_one_call()/get_nr_calls(),
+ dropped by put_one_call() */
+
+ atom_data_type active_setup; /* which setup (hole or data)
+ is currently active */
+ /* data setup */
+ struct avec_config data_conf;
+
+ /* hole setup */
+ int hole_conv_in_proggress;
+ gf_lock_t hole_lock; /* protect hole config from many cbks */
+ int hole_handled;
+ struct avec_config hole_conf;
+ struct iatt buf;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ struct iatt *prenewparent;
+ struct iatt *postnewparent;
+ int32_t op_ret;
+ int32_t op_errno;
+ int32_t rw_count; /* total read or written */
+ gf_lock_t rw_count_lock; /* protect the counter above */
+ unsigned char *format; /* for create, update format string */
+ uint32_t format_size;
+ uint32_t msgflags; /* messages for crypt_open() */
+ dict_t *xdata;
+ dict_t *xattr;
+ struct iovec vec; /* contains last file's atom for
+ read-prune-write sequence */
+ gf_boolean_t custom_mtd;
+ /*
+ * the next 3 fields are used by readdir and friends
+ */
+ gf_dirent_t *de; /* directory entry */
+ char *de_path; /* pathname of directory entry */
+ uint32_t de_prefix_len; /* length of the parent's pathname */
+ gf_dirent_t *entries;
+
+ uint32_t update_disk_file_size:1;
+} crypt_local_t;
+
+/*
+ * This represents a (read)modify-write atom: a locality tag plus
+ * the methods specific to that kind of atom. Instances are looked
+ * up with atom_by_types().
+ */
+struct rmw_atom {
+ atom_locality_type locality;
+ /*
+ * read-modify-write sequence of the atom
+ */
+ int32_t (*rmw)(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata);
+ /*
+ * offset of the logical block in a file
+ */
+ loff_t (*offset_at)(call_frame_t *frame,
+ struct object_cipher_info *object);
+ /*
+ * IO offset in an atom
+ */
+ uint32_t (*offset_in)(call_frame_t *frame,
+ struct object_cipher_info *object);
+ /*
+ * number of bytes of plain text of this atom that user
+ * wants to read/write.
+ * It can be smaller than atom_size in the case of head
+ * or tail atoms.
+ */
+ uint32_t (*io_size_nopad)(call_frame_t *frame,
+ struct object_cipher_info *object);
+ /*
+ * which iovec represents the atom
+ */
+ struct iovec *(*get_iovec)(call_frame_t *frame, uint32_t count);
+ /*
+ * how many bytes of a partial block should be brought up to
+ * date by reading from disk.
+ * This is used to perform a read component of RMW (read-modify-write).
+ */
+ uint32_t (*count_to_uptodate)(call_frame_t *frame, struct object_cipher_info *object);
+ /*
+ * which avec_config of the frame this atom works on
+ */
+ struct avec_config *(*get_config)(call_frame_t *frame);
+};
+
+/*
+ * Description of one (algorithm, mode) pair; the entries live in
+ * data_cipher_algs[][] and are selected by o_alg/o_mode or
+ * m_alg/m_mode.
+ */
+struct data_cipher_alg {
+ gf_boolean_t atomic; /* true means that algorithm requires
+ to pad data before cipher transform */
+ gf_boolean_t should_pad; /* true means that algorithm requires
+ to pad the end of file with extra-data */
+ uint32_t blkbits; /* blksize = 1 << blkbits */
+ /*
+ * any preliminary sanity checks go here
+ */
+ int32_t (*init)(void);
+ /*
+ * set alg-mode specific inode info
+ */
+ int32_t (*set_private)(struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+ /*
+ * check alg-mode specific data key
+ */
+ int32_t (*check_key)(uint32_t key_size);
+ /*
+ * load the IV for data at @offset; cipher_data() calls this
+ * right before ->encrypt()
+ */
+ void (*set_iv)(off_t offset, struct object_cipher_info *object);
+ /*
+ * transform @length bytes from @from to @to; @enc selects the
+ * direction (non-zero encrypts, zero decrypts)
+ */
+ int32_t (*encrypt)(const unsigned char *from, unsigned char *to,
+ size_t length, off_t offset, const int enc,
+ struct object_cipher_info *object);
+};
+
+/*
+ * version-dependent metadata loader
+ * (one entry of mtd_loaders[] per loader version)
+ */
+struct crypt_mtd_loader {
+ /*
+ * return core format size
+ */
+ size_t (*format_size)(mtd_op_t op, size_t old_size);
+ /*
+ * pack version-specific metadata of an object
+ * at ->create()
+ */
+ int32_t (*create_format)(unsigned char *wire,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+ /*
+ * extract version-specific metadata of an object
+ * at ->open() time
+ */
+ int32_t (*open_format)(unsigned char *wire,
+ int32_t len,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local,
+ gf_boolean_t load_info);
+ /*
+ * build the @new format string from the @old one for the
+ * metadata operation @op
+ */
+ int32_t (*update_format)(unsigned char *new,
+ unsigned char *old,
+ size_t old_len,
+ int32_t mac_idx,
+ mtd_op_t op,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local);
+};
+
+/* handler type returned by dispatch_end_writeback() */
+typedef int32_t (*end_writeback_handler_t)(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata);
+/* handler types returned by linkop_wind_dispatch() and linkop_unwind_dispatch() */
+typedef void (*linkop_wind_handler_t)(call_frame_t *frame, xlator_t *this);
+typedef void (*linkop_unwind_handler_t)(call_frame_t *frame);
+
+
+/* Declarations */
+
+/* keys.c */
+extern struct crypt_key crypt_keys[LAST_KEY_TYPE];
+int32_t get_nmtd_vol_key(struct master_cipher_info *master);
+int32_t get_nmtd_link_key(loc_t *loc,
+ struct master_cipher_info *master,
+ unsigned char *result);
+int32_t get_emtd_file_key(struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ unsigned char *result);
+int32_t get_data_file_key(struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ uint32_t keysize,
+ unsigned char *key);
+/* data.c */
+extern struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG][LAST_CIPHER_MODE];
+void encrypt_aligned_iov(struct object_cipher_info *object,
+ struct iovec *vec,
+ int count,
+ off_t off);
+void decrypt_aligned_iov(struct object_cipher_info *object,
+ struct iovec *vec,
+ int count,
+ off_t off);
+int32_t align_iov_by_atoms(xlator_t *this,
+ crypt_local_t *local,
+ struct object_cipher_info *object,
+ struct iovec *vec /* input vector */,
+ int32_t count /* number of vec components */,
+ struct iovec *avec /* aligned vector */,
+ char **blocks /* pool of blocks */,
+ uint32_t *blocks_allocated,
+ struct avec_config *conf);
+int32_t set_config_avec_data(xlator_t *this,
+ crypt_local_t *local,
+ struct avec_config *conf,
+ struct object_cipher_info *object,
+ struct iovec *vec,
+ int32_t vec_count);
+int32_t set_config_avec_hole(xlator_t *this,
+ crypt_local_t *local,
+ struct avec_config *conf,
+ struct object_cipher_info *object,
+ glusterfs_fop_t fop);
+void set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object,
+ struct avec_config *conf, atom_data_type dtype);
+void set_config_offsets(call_frame_t *frame,
+ xlator_t *this,
+ uint64_t offset,
+ uint64_t count,
+ atom_data_type dtype,
+ int32_t setup_gap_in_tail);
+
+/* metadata.c */
+extern struct crypt_mtd_loader mtd_loaders [LAST_MTD_LOADER];
+
+int32_t alloc_format(crypt_local_t *local, size_t size);
+int32_t alloc_format_create(crypt_local_t *local);
+void free_format(crypt_local_t *local);
+size_t format_size(mtd_op_t op, size_t old_size);
+size_t new_format_size(void);
+int32_t open_format(unsigned char *str, int32_t len, loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master, crypt_local_t *local,
+ gf_boolean_t load_info);
+int32_t update_format(unsigned char *new, unsigned char *old,
+ size_t old_len, int32_t mac_idx, mtd_op_t op, loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local);
+int32_t create_format(unsigned char *wire,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+
+/* atom.c */
+struct rmw_atom *atom_by_types(atom_data_type data,
+ atom_locality_type locality);
+void submit_partial(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ atom_locality_type ltype);
+void submit_full(call_frame_t *frame, xlator_t *this);
+
+/* crypt.c */
+
+end_writeback_handler_t dispatch_end_writeback(glusterfs_fop_t fop);
+static size_t iovec_get_size(struct iovec *vec, uint32_t count);
+void set_local_io_params_writev(call_frame_t *frame,
+ struct object_cipher_info *object,
+ struct rmw_atom *atom, off_t io_offset,
+ uint32_t io_size);
+void link_wind(call_frame_t *frame, xlator_t *this);
+void unlink_wind(call_frame_t *frame, xlator_t *this);
+void link_unwind(call_frame_t *frame);
+void unlink_unwind(call_frame_t *frame);
+void rename_wind(call_frame_t *frame, xlator_t *this);
+void rename_unwind(call_frame_t *frame);
+
+/* Inline functions */
+
+/*
+ * Return the total number of bytes described by the first @count
+ * components of @vec, i.e. the sum of their iov_len fields.
+ * Returns 0 when @count is 0.
+ */
+static inline size_t iovec_get_size(struct iovec *vec, uint32_t count)
+{
+        uint32_t i; /* same type as @count: no signed/unsigned comparison */
+        size_t total = 0;
+
+        for (i = 0; i < count; i++)
+                total += vec[i].iov_len;
+        return total;
+}
+
+/* Return the identifier of this translator (CRYPT_XLATOR_ID). */
+static inline int32_t crypt_xlator_id(void)
+{
+        const int32_t id = CRYPT_XLATOR_ID;
+
+        return id;
+}
+
+/* Return the id of the metadata loader in use (MTD_LOADER_V1). */
+static inline mtd_loader_id current_mtd_loader(void)
+{
+        const mtd_loader_id current = MTD_LOADER_V1;
+
+        return current;
+}
+
+/* Return the length of the MASTER_VOL_KEY entry of crypt_keys[] divided by 8. */
+static inline uint32_t master_key_size (void)
+{
+        const struct crypt_key *key = &crypt_keys[MASTER_VOL_KEY];
+
+        return key->len >> 3;
+}
+
+/* Return the length of the NMTD_VOL_KEY entry of crypt_keys[] divided by 8. */
+static inline uint32_t nmtd_vol_key_size (void)
+{
+        const struct crypt_key *key = &crypt_keys[NMTD_VOL_KEY];
+
+        return key->len >> 3;
+}
+
+/* Return the blkbits attribute of the (@alg, @mode) entry of data_cipher_algs. */
+static inline uint32_t alg_mode_blkbits(cipher_alg_t alg,
+                                        cipher_mode_t mode)
+{
+        const struct data_cipher_alg *entry = &data_cipher_algs[alg][mode];
+
+        return entry->blkbits;
+}
+
+/* Return the block size of (@alg, @mode): 1 << alg_mode_blkbits(). */
+static inline uint32_t alg_mode_blksize(cipher_alg_t alg,
+                                        cipher_mode_t mode)
+{
+        uint32_t bits = alg_mode_blkbits(alg, mode);
+
+        return 1 << bits;
+}
+
+/* Return the atomic attribute of the (@alg, @mode) entry of data_cipher_algs. */
+static inline gf_boolean_t alg_mode_atomic(cipher_alg_t alg,
+                                           cipher_mode_t mode)
+{
+        const struct data_cipher_alg *entry = &data_cipher_algs[alg][mode];
+
+        return entry->atomic;
+}
+
+/* Return the should_pad attribute of the (@alg, @mode) entry of data_cipher_algs. */
+static inline gf_boolean_t alg_mode_should_pad(cipher_alg_t alg,
+                                               cipher_mode_t mode)
+{
+        const struct data_cipher_alg *entry = &data_cipher_algs[alg][mode];
+
+        return entry->should_pad;
+}
+
+/* Block size of the algorithm/mode configured in the master info @mr. */
+static inline uint32_t master_alg_blksize(struct master_cipher_info *mr)
+{
+        cipher_alg_t alg = mr->m_alg;
+        cipher_mode_t mode = mr->m_mode;
+
+        return alg_mode_blksize(alg, mode);
+}
+
+/* blkbits of the algorithm/mode configured in the master info @mr. */
+static inline uint32_t master_alg_blkbits(struct master_cipher_info *mr)
+{
+        cipher_alg_t alg = mr->m_alg;
+        cipher_mode_t mode = mr->m_mode;
+
+        return alg_mode_blkbits(alg, mode);
+}
+
+/* atomic attribute of the algorithm/mode configured in the master info @mr. */
+static inline gf_boolean_t master_alg_atomic(struct master_cipher_info *mr)
+{
+        cipher_alg_t alg = mr->m_alg;
+        cipher_mode_t mode = mr->m_mode;
+
+        return alg_mode_atomic(alg, mode);
+}
+
+/* should_pad attribute of the algorithm/mode configured in the master info @mr. */
+static inline gf_boolean_t master_alg_should_pad(struct master_cipher_info *mr)
+{
+        cipher_alg_t alg = mr->m_alg;
+        cipher_mode_t mode = mr->m_mode;
+
+        return alg_mode_should_pad(alg, mode);
+}
+
+/* Block size of the algorithm/mode recorded in the object info @ob. */
+static inline uint32_t object_alg_blksize(struct object_cipher_info *ob)
+{
+        cipher_alg_t alg = ob->o_alg;
+        cipher_mode_t mode = ob->o_mode;
+
+        return alg_mode_blksize(alg, mode);
+}
+
+/* blkbits of the algorithm/mode recorded in the object info @ob. */
+static inline uint32_t object_alg_blkbits(struct object_cipher_info *ob)
+{
+        cipher_alg_t alg = ob->o_alg;
+        cipher_mode_t mode = ob->o_mode;
+
+        return alg_mode_blkbits(alg, mode);
+}
+
+/* atomic attribute of the algorithm/mode recorded in the object info @ob. */
+static inline gf_boolean_t object_alg_atomic(struct object_cipher_info *ob)
+{
+        cipher_alg_t alg = ob->o_alg;
+        cipher_mode_t mode = ob->o_mode;
+
+        return alg_mode_atomic(alg, mode);
+}
+
+/* should_pad attribute of the algorithm/mode recorded in the object info @ob. */
+static inline gf_boolean_t object_alg_should_pad(struct object_cipher_info *ob)
+{
+        cipher_alg_t alg = ob->o_alg;
+        cipher_mode_t mode = ob->o_mode;
+
+        return alg_mode_should_pad(alg, mode);
+}
+
+/* Return m_dkey_size (kept in bits) of @master divided by 8. */
+static inline uint32_t aes_raw_key_size(struct master_cipher_info *master)
+{
+        uint32_t bits = master->m_dkey_size;
+
+        return bits >> 3;
+}
+
+/* Return the hole config kept in the crypt local data of @frame. */
+static inline struct avec_config *get_hole_conf(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+
+        return &local->hole_conf;
+}
+
+/* Return the data config kept in the crypt local data of @frame. */
+static inline struct avec_config *get_data_conf(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+
+        return &local->data_conf;
+}
+
+/* Return o_block_bits of @object. */
+static inline int32_t get_atom_bits (struct object_cipher_info *object)
+{
+        int32_t bits = object->o_block_bits;
+
+        return bits;
+}
+
+/* Return the atom size of @object: 1 << get_atom_bits(). */
+static inline int32_t get_atom_size (struct object_cipher_info *object)
+{
+        int32_t bits = get_atom_bits(object);
+
+        return 1 << bits;
+}
+
+/*
+ * Return 1 if @conf has a head block: either off_in_head is set,
+ * or the vector consists of a single atom and off_in_tail is set.
+ * Return 0 otherwise.
+ */
+static inline int32_t has_head_block(struct avec_config *conf)
+{
+        if (conf->off_in_head)
+                return 1;
+        return conf->acount == 1 && conf->off_in_tail;
+}
+
+/*
+ * Return 1 if @conf has a tail block: off_in_tail is set and the
+ * vector has more than one atom. Return 0 otherwise.
+ */
+static inline int32_t has_tail_block(struct avec_config *conf)
+{
+        if (!conf->off_in_tail)
+                return 0;
+        return conf->acount > 1;
+}
+
+/* Return nr_full_blocks of @conf (non-zero means there are full blocks). */
+static inline int32_t has_full_blocks(struct avec_config *conf)
+{
+        int32_t nr_full = conf->nr_full_blocks;
+
+        return nr_full;
+}
+
+/* Return 1 if @conf has a head block and the cursor is at position 0. */
+static inline int32_t should_submit_head_block(struct avec_config *conf)
+{
+        if (!has_head_block(conf))
+                return 0;
+        return conf->cursor == 0;
+}
+
+/* Return 1 if @conf has a tail block and the cursor is at the last atom. */
+static inline int32_t should_submit_tail_block(struct avec_config *conf)
+{
+        if (!has_tail_block(conf))
+                return 0;
+        return conf->cursor == conf->acount - 1;
+}
+
+/*
+ * Return 1 if @conf has full blocks and the cursor points at one
+ * of them. Full blocks start at position 1 when there is a head
+ * block, at position 0 otherwise.
+ */
+static inline int32_t should_submit_full_block(struct avec_config *conf)
+{
+        uint32_t first = has_head_block(conf) ? 1 : 0;
+        uint32_t end = first + conf->nr_full_blocks;
+
+        if (!has_full_blocks(conf))
+                return 0;
+        if (conf->cursor < first)
+                return 0;
+        return conf->cursor < end;
+}
+
+#if DEBUG_CRYPT
+/*
+ * Debug check: log when the algorithm of @object pads data and
+ * @len is not a multiple of its block size.
+ */
+static inline void crypt_check_input_len(size_t len,
+                                         struct object_cipher_info *object)
+{
+        if (!object_alg_should_pad(object))
+                return;
+        if (len & (object_alg_blksize(object) - 1))
+                gf_log ("crypt", GF_LOG_DEBUG, "bad input len: %d", (int)len);
+}
+
+/* Debug check: log when @conf has no head block. */
+static inline void check_head_block(struct avec_config *conf)
+{
+        if (has_head_block(conf))
+                return;
+        gf_log("crypt", GF_LOG_DEBUG, "not a head atom");
+}
+
+/* Debug check: log when @conf has no tail block. */
+static inline void check_tail_block(struct avec_config *conf)
+{
+        if (has_tail_block(conf))
+                return;
+        gf_log("crypt", GF_LOG_DEBUG, "not a tail atom");
+}
+
+/* Debug check: log when @conf has no full blocks. */
+static inline void check_full_block(struct avec_config *conf)
+{
+        if (has_full_blocks(conf))
+                return;
+        gf_log("crypt", GF_LOG_DEBUG, "not a full atom");
+}
+
+/*
+ * Debug check for head atom methods: log when @conf has no head
+ * block, otherwise log when the cursor is not at position 0.
+ */
+static inline void check_cursor_head(struct avec_config *conf)
+{
+        if (!has_head_block(conf)) {
+                gf_log("crypt",
+                       GF_LOG_DEBUG, "Illegal call of head atom method");
+                return;
+        }
+        if (conf->cursor != 0)
+                gf_log("crypt",
+                       GF_LOG_DEBUG, "Cursor (%d) is not at head atom",
+                       conf->cursor);
+}
+
+/*
+ * Debug check for full atom methods. Two independent checks: log
+ * when @conf has no full blocks, and log when the cursor still
+ * points at the head block.
+ */
+static inline void check_cursor_full(struct avec_config *conf)
+{
+        int at_head = has_head_block(conf) && (conf->cursor == 0);
+
+        if (!has_full_blocks(conf))
+                gf_log("crypt",
+                       GF_LOG_DEBUG, "Illegal call of full atom method");
+        if (at_head)
+                gf_log("crypt",
+                       GF_LOG_DEBUG, "Cursor is not at full atom");
+}
+
+/*
+ * Placeholder invariant check of @local: currently always returns 0.
+ * FIXME: use avec->iov_len to check setup
+ */
+static inline int data_local_invariant(crypt_local_t *local)
+{
+ return 0;
+}
+
+#else
+#define crypt_check_input_len(len, object) noop
+#define check_head_block(conf) noop
+#define check_tail_block(conf) noop
+#define check_full_block(conf) noop
+#define check_cursor_head(conf) noop
+#define check_cursor_full(conf) noop
+
+#endif /* DEBUG_CRYPT */
+
+/*
+ * Return the config of @frame that corresponds to @dtype: the hole
+ * config for HOLE_ATOM, the data config for DATA_ATOM. Any other
+ * value is logged and NULL is returned.
+ */
+static inline struct avec_config *conf_by_type(call_frame_t *frame,
+                                               atom_data_type dtype)
+{
+        if (dtype == HOLE_ATOM)
+                return get_hole_conf(frame);
+        if (dtype == DATA_ATOM)
+                return get_data_conf(frame);
+
+        gf_log("crypt", GF_LOG_DEBUG, "bad atom type");
+        return NULL;
+}
+
+/* Number of calls needed for the head block of @conf: 1 if present, else 0. */
+static inline uint32_t nr_calls_head(struct avec_config *conf)
+{
+        if (has_head_block(conf))
+                return 1;
+        return 0;
+}
+
+/* Number of calls needed for the tail block of @conf: 1 if present, else 0. */
+static inline uint32_t nr_calls_tail(struct avec_config *conf)
+{
+        if (has_tail_block(conf))
+                return 1;
+        return 0;
+}
+
+/*
+ * Number of calls needed for the full blocks of @conf.
+ * For a hole config this is the value of has_full_blocks(); for a
+ * data config it is logical_blocks_occupied(0, nr_full_blocks,
+ * MAX_IOVEC_BITS), or 0 when there are no full blocks. Any other
+ * config type is logged and yields 0.
+ */
+static inline uint32_t nr_calls_full(struct avec_config *conf)
+{
+        if (conf->type == HOLE_ATOM)
+                return has_full_blocks(conf);
+
+        if (conf->type == DATA_ATOM) {
+                if (!has_full_blocks(conf))
+                        return 0;
+                return logical_blocks_occupied(0,
+                                               conf->nr_full_blocks,
+                                               MAX_IOVEC_BITS);
+        }
+        gf_log("crypt", GF_LOG_DEBUG, "bad atom data type");
+        return 0;
+}
+
+/* Total number of calls needed for @conf: head + tail + full. */
+static inline uint32_t nr_calls(struct avec_config *conf)
+{
+        uint32_t total = nr_calls_head(conf);
+
+        total += nr_calls_tail(conf);
+        total += nr_calls_full(conf);
+        return total;
+}
+
+/* Number of calls needed for the data config of @frame. */
+static inline uint32_t nr_calls_data(call_frame_t *frame)
+{
+        struct avec_config *conf = get_data_conf(frame);
+
+        return nr_calls(conf);
+}
+
+/* Number of calls needed for the hole config of @frame. */
+static inline uint32_t nr_calls_hole(call_frame_t *frame)
+{
+        struct avec_config *conf = get_hole_conf(frame);
+
+        return nr_calls(conf);
+}
+
+/*
+ * Increment the call counter of @frame's local data by one.
+ * Does not take call_lock itself.
+ */
+static inline void get_one_call_nolock(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+
+        local->nr_calls += 1;
+}
+
+/* Increment the call counter of @frame's local data by one under call_lock. */
+static inline void get_one_call(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+        gf_lock_t *lock = &local->call_lock;
+
+        LOCK(lock);
+        get_one_call_nolock(frame);
+        UNLOCK(lock);
+}
+
+/*
+ * Add @nr to the call counter of @frame's local data.
+ * Does not take call_lock itself.
+ */
+static inline void get_nr_calls_nolock(call_frame_t *frame, int32_t nr)
+{
+        crypt_local_t *local = frame->local;
+
+        local->nr_calls = local->nr_calls + nr;
+}
+
+/* Add @nr to the call counter of @frame's local data under call_lock. */
+static inline void get_nr_calls(call_frame_t *frame, int32_t nr)
+{
+        crypt_local_t *local = frame->local;
+        gf_lock_t *lock = &local->call_lock;
+
+        LOCK(lock);
+        get_nr_calls_nolock(frame, nr);
+        UNLOCK(lock);
+}
+
+/*
+ * Decrement the call counter of @local under call_lock.
+ * Returns 1 if the counter reached zero, 0 otherwise.
+ */
+static inline int put_one_call(crypt_local_t *local)
+{
+        int is_last;
+
+        LOCK(&local->call_lock);
+        local->nr_calls--;
+        is_last = (local->nr_calls == 0) ? 1 : 0;
+        UNLOCK(&local->call_lock);
+
+        return is_last;
+}
+
+/*
+ * Return non-zero if the data described by the data config of
+ * @frame ends beyond old_file_size.
+ */
+static inline int is_appended_write(call_frame_t *frame)
+{
+        crypt_local_t *local = frame->local;
+        struct avec_config *conf = &local->data_conf;
+
+        return conf->orig_offset + conf->orig_size > local->old_file_size;
+}
+
+/*
+ * Report whether IOs of @frame are issued in ordered mode.
+ * Currently always returns 1; the finer-grained condition is
+ * compiled out by "#if 0" below.
+ */
+static inline int is_ordered_mode(call_frame_t *frame)
+{
+#if 0
+ crypt_local_t *local = frame->local;
+ return local->fop == GF_FOP_FTRUNCATE ||
+ (local->fop == GF_FOP_WRITE && is_appended_write(frame));
+#endif
+ return 1;
+}
+
+/* Return 1 if the cursor of the hole config of @local has reached acount. */
+static inline int32_t hole_conv_completed(crypt_local_t *local)
+{
+        return local->hole_conf.cursor == local->hole_conf.acount;
+}
+
+/* Return 1 if the active setup of @local is the data setup (DATA_ATOM). */
+static inline int32_t data_write_in_progress(crypt_local_t *local)
+{
+        atom_data_type active = local->active_setup;
+
+        return active == DATA_ATOM;
+}
+
+/* Return 1 if the parent frame of @frame belongs to the translator @this. */
+static inline int32_t parent_is_crypt_xlator(call_frame_t *frame,
+                                             xlator_t *this)
+{
+        xlator_t *owner = frame->parent->this;
+
+        return owner == this;
+}
+
+/*
+ * Return the wind handler for the link operation @fop (link,
+ * unlink or rename). Any other fop is logged and NULL is returned.
+ */
+static inline linkop_wind_handler_t linkop_wind_dispatch(glusterfs_fop_t fop)
+{
+        if (fop == GF_FOP_LINK)
+                return link_wind;
+        if (fop == GF_FOP_UNLINK)
+                return unlink_wind;
+        if (fop == GF_FOP_RENAME)
+                return rename_wind;
+
+        gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop);
+        return NULL;
+}
+
+/*
+ * Return the unwind handler for the link operation @fop (link,
+ * unlink or rename). Any other fop is logged and NULL is returned.
+ */
+static inline linkop_unwind_handler_t linkop_unwind_dispatch(glusterfs_fop_t fop)
+{
+        if (fop == GF_FOP_LINK)
+                return link_unwind;
+        if (fop == GF_FOP_UNLINK)
+                return unlink_unwind;
+        if (fop == GF_FOP_RENAME)
+                return rename_unwind;
+
+        gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop);
+        return NULL;
+}
+
+/*
+ * Map the link operation @fop to a metadata operation:
+ * link -> MTD_APPEND, unlink -> MTD_CUT, rename -> MTD_OVERWRITE.
+ * Any other fop is logged and MTD_LAST_OP is returned.
+ */
+static inline mtd_op_t linkop_mtdop_dispatch(glusterfs_fop_t fop)
+{
+        if (fop == GF_FOP_LINK)
+                return MTD_APPEND;
+        if (fop == GF_FOP_UNLINK)
+                return MTD_CUT;
+        if (fop == GF_FOP_RENAME)
+                return MTD_OVERWRITE;
+
+        gf_log("crypt", GF_LOG_WARNING, "Bad link operation %d", fop);
+        return MTD_LAST_OP;
+}
+
+#endif /* __CRYPT_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/data.c b/xlators/encryption/crypt/src/data.c
new file mode 100644
index 000000000..762fa554a
--- /dev/null
+++ b/xlators/encryption/crypt/src/data.c
@@ -0,0 +1,769 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <openssl/crypto.h>
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+
+/*
+ * Load the XTS tweak for the data located at file @offset into the
+ * IV buffer of @object.
+ *
+ * Only the low 8 bytes of the 16-byte ivec are written here; the
+ * high 8 bytes keep their previous content (set_private_aes_xts()
+ * zeroes the whole buffer when the object is set up).
+ */
+static void set_iv_aes_xts(off_t offset, struct object_cipher_info *object)
+{
+        unsigned char *ivec = object->u.aes_xts.ivec;
+        uint64_t tweak;
+
+        /* convert the tweak into a little-endian byte
+         * array (IEEE P1619/D16, May 2007, section 5.1)
+         */
+        tweak = htole64(offset);
+
+        /*
+         * ivec is a plain byte array: copy the bytes instead of
+         * storing through a casted uint64_t pointer, which would
+         * break the aliasing rules and assume 8-byte alignment.
+         */
+        memcpy(ivec, &tweak, sizeof(tweak));
+
+        /* ivec is padded with zeroes */
+}
+
+/*
+ * Expand @raw_key of @key_size bits into an encryption and a
+ * decryption key schedule, stored at keys[AES_ENCRYPT] and
+ * keys[AES_DECRYPT].
+ *
+ * Returns 0 on success, otherwise the non-zero value reported by
+ * AES_set_encrypt_key() or AES_set_decrypt_key().
+ */
+static int32_t aes_set_keys_common(unsigned char *raw_key, uint32_t key_size,
+                                   AES_KEY *keys)
+{
+        int32_t rc;
+
+        rc = AES_set_encrypt_key(raw_key, key_size, &keys[AES_ENCRYPT]);
+        if (rc != 0) {
+                gf_log("crypt", GF_LOG_ERROR, "Set encrypt key failed");
+                return rc;
+        }
+
+        rc = AES_set_decrypt_key(raw_key, key_size, &keys[AES_DECRYPT]);
+        if (rc != 0) {
+                gf_log("crypt", GF_LOG_ERROR, "Set decrypt key failed");
+                return rc;
+        }
+        return 0;
+}
+
+/*
+ * set private cipher info for xts mode
+ *
+ * Retrieves the data keying material of the object described by
+ * @info and installs the AES key schedules used by
+ * encrypt_aes_xts(): the first half of the material becomes the
+ * data key (encrypt and decrypt schedules), the second half becomes
+ * the key used to encrypt tweaks.
+ *
+ * Returns 0 on success, ENOMEM if the temporary key buffer can not
+ * be allocated, otherwise the non-zero value reported by
+ * get_data_file_key(), aes_set_keys_common() or
+ * AES_set_encrypt_key().
+ *
+ * The temporary buffer holds raw key material, so it is wiped with
+ * OPENSSL_cleanse() before it is released, on every exit path.
+ */
+static int32_t set_private_aes_xts(struct crypt_inode_info *info,
+                                   struct master_cipher_info *master)
+{
+        int ret;
+        struct object_cipher_info *object = get_object_cinfo(info);
+        unsigned char *data_key;
+        uint32_t subkey_size;
+
+        /* init tweak value */
+        memset(object->u.aes_xts.ivec, 0, 16);
+
+        /*
+         * o_dkey_size is expressed in bits, so this buffer is larger
+         * than the raw key; the size is left untouched because
+         * get_data_file_key() is handed the same value.
+         */
+        data_key = GF_CALLOC(1, object->o_dkey_size, gf_crypt_mt_key);
+        if (!data_key)
+                return ENOMEM;
+
+        /*
+         * retrieve data keying material
+         */
+        ret = get_data_file_key(info, master, object->o_dkey_size, data_key);
+        if (ret) {
+                gf_log("crypt", GF_LOG_ERROR, "Failed to retrieve data key");
+                goto out;
+        }
+        /*
+         * parse compound xts key
+         */
+        subkey_size = object->o_dkey_size >> 4; /* (xts-key-size-in-bytes / 2) */
+        /*
+         * install key for data encryption
+         */
+        ret = aes_set_keys_common(data_key,
+                                  subkey_size << 3, object->u.aes_xts.dkey);
+        if (ret)
+                goto out;
+        /*
+         * set up key used to encrypt tweaks
+         */
+        ret = AES_set_encrypt_key(data_key + subkey_size,
+                                  object->o_dkey_size / 2,
+                                  &object->u.aes_xts.tkey);
+        if (ret < 0)
+                gf_log("crypt", GF_LOG_ERROR, "Set tweak key failed");
+ out:
+        /* do not leave raw key material behind in freed memory */
+        OPENSSL_cleanse(data_key, object->o_dkey_size);
+        GF_FREE(data_key);
+        return ret;
+}
+
+/*
+ * Sanity check for the aes-xts mode: asserts through cassert() that
+ * AES_BLOCK_SIZE equals 1 << AES_BLOCK_BITS. Always returns 0.
+ */
+static int32_t aes_xts_init(void)
+{
+ cassert(AES_BLOCK_SIZE == (1 << AES_BLOCK_BITS));
+ return 0;
+}
+
+/*
+ * Check a data key size for the aes-xts mode.
+ * Returns 0 if @keysize is 256 or 512, -1 otherwise.
+ */
+static int32_t check_key_aes_xts(uint32_t keysize)
+{
+        if (keysize == 256 || keysize == 512)
+                return 0;
+        return -1;
+}
+
+/*
+ * Transform @length bytes from @from to @to in aes-xts mode.
+ * A non-zero @enc encrypts, zero decrypts. The tweak is taken from
+ * the ivec of @object, so the IV has to be loaded beforehand;
+ * @offset itself is not used here.
+ *
+ * Returns the value of CRYPTO_xts128_encrypt().
+ */
+static int32_t encrypt_aes_xts(const unsigned char *from,
+                               unsigned char *to, size_t length,
+                               off_t offset, const int enc,
+                               struct object_cipher_info *object)
+{
+        XTS128_CONTEXT ctx;
+        const int dir = enc ? AES_ENCRYPT : AES_DECRYPT;
+
+        /* data key and block function depend on the direction */
+        ctx.key1 = &object->u.aes_xts.dkey[dir];
+        ctx.block1 = enc ? (block128_f)AES_encrypt : (block128_f)AES_decrypt;
+
+        /* the tweak is always encrypted */
+        ctx.key2 = &object->u.aes_xts.tkey;
+        ctx.block2 = (block128_f)AES_encrypt;
+
+        return CRYPTO_xts128_encrypt(&ctx,
+                                     object->u.aes_xts.ivec,
+                                     from,
+                                     to,
+                                     length, enc);
+}
+
+/*
+ * Cipher input chunk @from of length @len;
+ * @to: result of cipher transform;
+ * @off: offset in a file (must be cblock-aligned);
+ * @enc: direction of the transform, handed to the algorithm's
+ * ->encrypt() method.
+ *
+ * The IV is loaded for @off before the transform. The function
+ * cannot report errors to its caller, so a non-zero result of
+ * ->encrypt() is logged instead of being silently dropped.
+ */
+static void cipher_data(struct object_cipher_info *object,
+                        char *from,
+                        char *to,
+                        off_t off,
+                        size_t len,
+                        const int enc)
+{
+        struct data_cipher_alg *alg;
+        int32_t ret;
+
+        crypt_check_input_len(len, object);
+
+#if TRIVIAL_TFM && DEBUG_CRYPT
+        return;
+#endif
+        alg = &data_cipher_algs[object->o_alg][object->o_mode];
+
+        alg->set_iv(off, object);
+        ret = alg->encrypt((const unsigned char *)from,
+                           (unsigned char *)to,
+                           len,
+                           off,
+                           enc,
+                           object);
+        if (ret)
+                gf_log("crypt", GF_LOG_ERROR,
+                       "Cipher transform failed (offset %lld, length %llu)",
+                       (long long)off, (unsigned long long)len);
+}
+
+#define MAX_CIPHER_CHUNK (1 << 30)
+
+/*
+ * Do cipher (encryption/decryption) transform of a
+ * continuous region of memory.
+ *
+ * @from: data to transform;
+ * @to: where to put the result;
+ * @off: offset in a file, should be block-aligned
+ * for atomic cipher modes and ksize-aligned
+ * for other modes);
+ * @len: a number of bytes to transform;
+ * @dir: direction of transform (encrypt/decrypt).
+ *
+ * The region is processed in chunks of at most MAX_CIPHER_CHUNK
+ * bytes.
+ */
+static void cipher_region(struct object_cipher_info *object,
+                          char *from,
+                          char *to,
+                          off_t off,
+                          size_t len,
+                          int dir)
+{
+        size_t chunk;
+
+        for (; len > 0; len -= chunk) {
+                chunk = (len > MAX_CIPHER_CHUNK) ? MAX_CIPHER_CHUNK : len;
+
+                /* this will reset IV */
+                cipher_data(object, from, to, off, chunk, dir);
+
+                from += chunk;
+                to += chunk;
+                off += chunk;
+        }
+}
+
+/*
+ * Do cipher transform (encryption/decryption) of
+ * plaintext/ciphertext represented by @vec.
+ *
+ * Pre-conditions: @vec represents a continuous piece
+ * of data in a file at offset @off to be ciphered
+ * (encrypted/decrypted).
+ * @count is the number of vec's components. All the
+ * components must be block-aligned, the caller is
+ * responsible for this. @dir is "direction" of
+ * transform (encrypt/decrypt).
+ *
+ * Every component is transformed in place. The number of bytes
+ * already processed is kept in an off_t so that it cannot be
+ * truncated the way an int accumulator of size_t lengths could.
+ */
+static void cipher_aligned_iov(struct object_cipher_info *object,
+                               struct iovec *vec,
+                               int count,
+                               off_t off,
+                               int32_t dir)
+{
+        int i;
+        off_t done = 0; /* bytes of @vec processed so far */
+
+        for (i = 0; i < count; i++) {
+                cipher_region(object,
+                              vec[i].iov_base,
+                              vec[i].iov_base,
+                              off + done,
+                              vec[i].iov_len,
+                              dir);
+                done += vec[i].iov_len;
+        }
+}
+
+/*
+ * Encrypt in place the @count block-aligned components of @vec,
+ * which represent file data starting at offset @off.
+ */
+void encrypt_aligned_iov(struct object_cipher_info *object,
+                         struct iovec *vec,
+                         int count,
+                         off_t off)
+{
+        const int32_t dir = 1; /* encrypt */
+
+        cipher_aligned_iov(object, vec, count, off, dir);
+}
+
+/*
+ * Decrypt in place the @count block-aligned components of @vec,
+ * which represent file data starting at offset @off.
+ */
+void decrypt_aligned_iov(struct object_cipher_info *object,
+                         struct iovec *vec,
+                         int count,
+                         off_t off)
+{
+        const int32_t dir = 0; /* decrypt */
+
+        cipher_aligned_iov(object, vec, count, off, dir);
+}
+
+#if DEBUG_CRYPT
+/*
+ * Debug helper: concatenate the @count components of @vec into
+ * @buf, leaving out the first @skip bytes of the first component.
+ * @buf must be large enough for the result.
+ */
+static void compound_stream(struct iovec *vec, int count, char *buf, off_t skip)
+{
+        char *dst = buf;
+        int idx;
+
+        for (idx = 0; idx < count; idx++) {
+                size_t chunk = vec[idx].iov_len - skip;
+
+                memcpy(dst, (char *)vec[idx].iov_base + skip, chunk);
+                dst += chunk;
+                /* only the first component is shortened */
+                skip = 0;
+        }
+}
+
+/*
+ * Debug helper: verify that the aligned vector @avec (@acnt
+ * components), with the first @off_in_head bytes left out,
+ * carries the same bytes as the input vector @vec (@cnt
+ * components). Mismatches and allocation failures are only logged.
+ */
+static void check_iovecs(struct iovec *vec, int cnt,
+                         struct iovec *avec, int acnt, uint32_t off_in_head)
+{
+        char *plain;
+        char *aligned;
+        uint32_t size = iovec_get_size(vec, cnt);
+        uint32_t asize = iovec_get_size(avec, acnt) - off_in_head;
+
+        if (size != asize) {
+                gf_log("crypt", GF_LOG_DEBUG, "size %d is not eq asize %d",
+                       size, asize);
+                return;
+        }
+        plain = GF_CALLOC(1, size, gf_crypt_mt_data);
+        if (!plain) {
+                gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream ");
+                return;
+        }
+        aligned = GF_CALLOC(1, asize, gf_crypt_mt_data);
+        if (!aligned) {
+                GF_FREE(plain);
+                gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream ");
+                return;
+        }
+        /* flatten both vectors and compare them byte by byte */
+        compound_stream(vec, cnt, plain, 0);
+        compound_stream(avec, acnt, aligned, off_in_head);
+        if (memcmp(plain, aligned, size) != 0)
+                gf_log("crypt", GF_LOG_DEBUG, "chunks of different data");
+
+        GF_FREE(plain);
+        GF_FREE(aligned);
+}
+
+#else
+#define check_iovecs(vec, count, avec, avecn, off) noop
+#endif /* DEBUG_CRYPT */
+
+/*
+ * Allocate a block of @block_size bytes from the iobuf pool of
+ * @this and attach it to local->iobref_data (created on first use).
+ *
+ * Returns the memory of the block, or NULL if the iobuf or the
+ * iobref can not be obtained, or if the iobuf can not be added to
+ * the iobref.
+ *
+ * iobref_add() takes its own reference on the iobuf (libglusterfs
+ * iobuf API), so the reference obtained from iobuf_get2() is
+ * dropped before returning. After that the block stays alive for
+ * as long as local->iobref_data does. Without the unref the iobuf
+ * would never go back to the pool.
+ */
+static char *data_alloc_block(xlator_t *this, crypt_local_t *local,
+                              int32_t block_size)
+{
+        struct iobuf *iobuf = NULL;
+        char *ptr = NULL;
+
+        iobuf = iobuf_get2(this->ctx->iobuf_pool, block_size);
+        if (!iobuf) {
+                gf_log("crypt", GF_LOG_ERROR,
+                       "Failed to get iobuf");
+                return NULL;
+        }
+        if (!local->iobref_data) {
+                local->iobref_data = iobref_new();
+                if (!local->iobref_data) {
+                        gf_log("crypt", GF_LOG_ERROR,
+                               "Failed to get iobref");
+                        goto out;
+                }
+        }
+        if (iobref_add(local->iobref_data, iobuf)) {
+                /* nobody else references the iobuf: do not hand it out */
+                gf_log("crypt", GF_LOG_ERROR,
+                       "Failed to add iobuf to iobref");
+                goto out;
+        }
+        ptr = iobuf->ptr;
+ out:
+        /* drop the reference taken by iobuf_get2() */
+        iobuf_unref(iobuf);
+        return ptr;
+}
+
+/*
+ * Compound @avec, which represent the same data
+ * chunk as @vec, but has aligned components of
+ * specified block size. Alloc blocks, if needed.
+ * In particular, incomplete head and tail blocks
+ * must be allocated.
+ * Put number of allocated blocks to @num_blocks.
+ *
+ * Example:
+ *
+ * input: data chunk represented by 4 components
+ * [AB],[BC],[CD],[DE];
+ * output: 5 logical blocks (0, 1, 2, 3, 4).
+ *
+ * A B C D E
+ * *-----*+------*-+---*----+--------+-*
+ * | || | | | | | |
+ * *-+-----+*------+-*---+----*--------*-+------*
+ * 0 1 2 3 4
+ *
+ * 0 - incomplete compound (head);
+ * 1, 2 - full compound;
+ * 3 - full non-compound (the case of reuse);
+ * 4 - incomplete non-compound (tail).
+ */
+/*
+ * Walk @vec and fill @avec with components which are aligned by atoms
+ * (block_size), copying data to newly allocated blocks where the
+ * memory of a @vec component can not be reused in place.
+ *
+ * Every block allocated here is stored in @blocks at the index
+ * *@blocks_allocated, which is then incremented.
+ *
+ * Returns 0 on success, -ENOMEM if a block can not be allocated.
+ *
+ * NOTE(review): neither @avec nor @blocks is bounds-checked here;
+ * presumably the caller sizes both by conf->acount - confirm.
+ */
+int32_t align_iov_by_atoms(xlator_t *this,
+ crypt_local_t *local,
+ struct object_cipher_info *object,
+ struct iovec *vec /* input vector */,
+ int32_t count /* number of vec components */,
+ struct iovec *avec /* aligned vector */,
+ char **blocks /* pool of blocks */,
+ uint32_t *blocks_allocated,
+ struct avec_config *conf)
+{
+ int vecn = 0; /* number of the current component in vec */
+ int avecn = 0; /* number of the current component in avec */
+ off_t vec_off = 0; /* offset in the current vec component,
+ * i.e. the number of bytes have already
+ * been copied */
+ int32_t block_size = get_atom_size(object);
+ size_t to_process; /* number of vec's bytes to copy and(or) re-use */
+ int32_t off_in_head = conf->off_in_head;
+
+ to_process = iovec_get_size(vec, count);
+
+ /* each iteration produces exactly one component of avec */
+ while (to_process > 0) {
+ if (off_in_head ||
+ vec[vecn].iov_len - vec_off < block_size) {
+ /*
+ * less than block_size:
+ * the case of incomplete (head or tail),
+ * or compound block
+ */
+ size_t copied = 0;
+ /*
+ * populate the pool with a new block
+ */
+ blocks[*blocks_allocated] = data_alloc_block(this,
+ local,
+ block_size);
+ if (!blocks[*blocks_allocated])
+ return -ENOMEM;
+ /*
+ * only the gap before the data in the head block
+ * is zeroed; bytes past (off_in_head + copied) are
+ * left as returned by data_alloc_block()
+ */
+ memset(blocks[*blocks_allocated], 0, off_in_head);
+ /*
+ * fill the block with vec components
+ */
+ do {
+ size_t to_copy;
+
+ /* the rest of the current component ... */
+ to_copy = vec[vecn].iov_len - vec_off;
+ /* ... limited by the room after the head gap */
+ if (to_copy > block_size - off_in_head)
+ to_copy = block_size - off_in_head;
+
+ memcpy(blocks[*blocks_allocated] + off_in_head + copied,
+ vec[vecn].iov_base + vec_off,
+ to_copy);
+
+ copied += to_copy;
+ to_process -= to_copy;
+
+ vec_off += to_copy;
+ if (vec_off == vec[vecn].iov_len) {
+ /* finished with this vecn */
+ vec_off = 0;
+ vecn++;
+ }
+ } while (copied < (block_size - off_in_head) && to_process > 0);
+ /*
+ * update avec
+ */
+ avec[avecn].iov_len = off_in_head + copied;
+ avec[avecn].iov_base = blocks[*blocks_allocated];
+
+ (*blocks_allocated)++;
+ /* only the very first block can have a head gap */
+ off_in_head = 0;
+ } else {
+ /*
+ * the rest of the current vec component
+ * is not less than block_size, so reuse
+ * the memory buffer of the component.
+ */
+ size_t to_reuse;
+ to_reuse = (to_process > block_size ?
+ block_size :
+ to_process);
+ avec[avecn].iov_len = to_reuse;
+ avec[avecn].iov_base = vec[vecn].iov_base + vec_off;
+
+ vec_off += to_reuse;
+ if (vec_off == vec[vecn].iov_len) {
+ /* finished with this vecn */
+ vec_off = 0;
+ vecn++;
+ }
+ to_process -= to_reuse;
+ }
+ avecn++;
+ }
+ /* debug-only consistency check, a no-op unless DEBUG_CRYPT is set */
+ check_iovecs(vec, count, avec, avecn, conf->off_in_head);
+ return 0;
+}
+
+/*
+ * Allocate and set up the aligned vector (and the pool of blocks
+ * which backs it) for data submission; both are stored in @conf.
+ * Pre-condition: @conf is set.
+ *
+ * Returns 0 on success, ENOMEM if an allocation fails here, or the
+ * error reported by align_iov_by_atoms().
+ */
+int32_t set_config_avec_data(xlator_t *this,
+			     crypt_local_t *local,
+			     struct avec_config *conf,
+			     struct object_cipher_info *object,
+			     struct iovec *vec,
+			     int32_t vec_count)
+{
+	struct iovec *aligned;
+	char **blocks;
+	uint32_t nr_blocks = 0;
+	int32_t err = 0;
+
+	conf->type = DATA_ATOM;
+
+	aligned = GF_CALLOC(conf->acount, sizeof(*aligned), gf_crypt_mt_iovec);
+	if (aligned == NULL)
+		return ENOMEM;
+	blocks = GF_CALLOC(conf->acount, sizeof(blocks), gf_crypt_mt_char);
+	if (blocks == NULL) {
+		GF_FREE(aligned);
+		return ENOMEM;
+	}
+	if (vec != NULL) {
+		err = align_iov_by_atoms(this, local, object, vec, vec_count,
+					 aligned, blocks, &nr_blocks, conf);
+	} else {
+		/*
+		 * degenerated case: no data, a single block is submitted
+		 */
+		blocks[0] = data_alloc_block(this, local,
+					     get_atom_size(object));
+		if (blocks[0] != NULL) {
+			nr_blocks = 1;
+			aligned->iov_base = blocks[0];
+			aligned->iov_len = conf->off_in_tail;
+		} else {
+			err = ENOMEM;
+		}
+	}
+	if (err) {
+		/* only the arrays are released here */
+		GF_FREE(aligned);
+		GF_FREE(blocks);
+		return err;
+	}
+	conf->avec = aligned;
+	conf->pool = blocks;
+	conf->blocks_in_pool = nr_blocks;
+	return 0;
+}
+
+/*
+ * allocate and setup aligned vector for hole submission
+ *
+ * Returns 0 on success, and also when nothing has to be set up (the
+ * hole fits into a data atom of a write, or @fop is not supported);
+ * in the latter cases conf->avec and conf->pool are not assigned here.
+ * Returns ENOMEM if an allocation fails.
+ */
+int32_t set_config_avec_hole(xlator_t *this,
+ crypt_local_t *local,
+ struct avec_config *conf,
+ struct object_cipher_info *object,
+ glusterfs_fop_t fop)
+{
+ uint32_t i, idx;
+ struct iovec *avec;
+ char **pool;
+ uint32_t num_blocks;
+ uint32_t blocks_in_pool = 0;
+
+ conf->type = HOLE_ATOM;
+
+ /*
+ * all the full blocks are represented by one buffer
+ * (see the comment about re-use below)
+ */
+ num_blocks = conf->acount -
+ (conf->nr_full_blocks ? conf->nr_full_blocks - 1 : 0);
+
+ switch (fop) {
+ case GF_FOP_WRITE:
+ /*
+ * hole goes before data
+ */
+ if (num_blocks == 1 && conf->off_in_tail != 0)
+ /*
+ * we won't submit a hole which fits into
+ * a data atom: this part of hole will be
+ * submitted with data write
+ */
+ return 0;
+ break;
+ case GF_FOP_FTRUNCATE:
+ /*
+ * expanding truncate, hole goes after data,
+ * and will be submitted in any case.
+ */
+ break;
+ default:
+ gf_log("crypt", GF_LOG_WARNING,
+ "bad file operation %d", fop);
+ return 0;
+ }
+ avec = GF_CALLOC(num_blocks, sizeof(*avec), gf_crypt_mt_iovec);
+ if (!avec)
+ return ENOMEM;
+ pool = GF_CALLOC(num_blocks, sizeof(pool), gf_crypt_mt_char);
+ if (!pool) {
+ GF_FREE(avec);
+ return ENOMEM;
+ }
+ for (i = 0; i < num_blocks; i++) {
+ pool[i] = data_alloc_block(this, local, get_atom_size(object));
+ if (pool[i] == NULL)
+ goto free;
+ blocks_in_pool++;
+ }
+ if (has_head_block(conf)) {
+ /* set head block */
+ idx = 0;
+ avec[idx].iov_base = pool[idx];
+ avec[idx].iov_len = get_atom_size(object);
+ /* the hole occupies the block from off_in_head to its end */
+ memset(avec[idx].iov_base + conf->off_in_head,
+ 0,
+ get_atom_size(object) - conf->off_in_head);
+ }
+ if (has_tail_block(conf)) {
+ /* set tail block */
+ idx = num_blocks - 1;
+ avec[idx].iov_base = pool[idx];
+ avec[idx].iov_len = get_atom_size(object);
+ /* the hole occupies the first off_in_tail bytes of the block */
+ memset(avec[idx].iov_base, 0, conf->off_in_tail);
+ }
+ if (has_full_blocks(conf)) {
+ /* set full block */
+ idx = conf->off_in_head ? 1 : 0;
+ avec[idx].iov_base = pool[idx];
+ avec[idx].iov_len = get_atom_size(object);
+ /*
+ * since we re-use the buffer,
+ * zeroes will be set every time
+ * before encryption, see submit_full()
+ */
+ }
+ conf->avec = avec;
+ conf->pool = pool;
+ conf->blocks_in_pool = blocks_in_pool;
+ return 0;
+ free:
+ /*
+ * only the arrays are released here; the blocks which have
+ * already been allocated were attached to local->iobref_data
+ * by data_alloc_block()
+ */
+ GF_FREE(avec);
+ GF_FREE(pool);
+ return ENOMEM;
+}
+
+/*
+ * A helper for setting up config of partial atoms, i.e. the ones
+ * which participate in a read-modify-write sequence.
+ *
+ * Calculates the precise amount of "extra-bytes" that should be
+ * uptodated at the end of the partial (not necessarily tail!) block
+ * and stores it in conf->gap_in_tail.
+ *
+ * Pre-condition: local->old_file_size is valid!
+ * @conf contains setup, which is enough for correct calculation
+ * of has_tail_block(), ->get_offset().
+ */
+void set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object,
+		    struct avec_config *conf, atom_data_type dtype)
+{
+	crypt_local_t *local = frame->local;
+	uint64_t old_size = local->old_file_size;
+	struct rmw_atom *atom = atom_by_types(dtype,
+					      has_tail_block(conf) ?
+					      TAIL_ATOM : HEAD_ATOM);
+	/* number of bytes of the old file which fall into the atom */
+	uint32_t valid = 0;
+
+	if (old_size > atom->offset_at(frame, object)) {
+		valid = old_size - atom->offset_at(frame, object);
+		if (valid > get_atom_size(object))
+			valid = get_atom_size(object);
+	}
+	/*
+	 * everything below off_in_tail gets overwritten,
+	 * so only the rest (if any) has to be uptodated
+	 */
+	conf->gap_in_tail = (valid > conf->off_in_tail ?
+			     valid - conf->off_in_tail : 0);
+}
+
+/*
+ * Fill struct avec_config with the layout of offsets for the region
+ * of @count bytes which starts at @offset.
+ *
+ * The config is the data config of @frame if @dtype is DATA_ATOM,
+ * the hole config otherwise. The start of the region is rounded
+ * down, and its end is rounded up to the atom boundary.
+ *
+ * If @set_gap is not zero and the region has an incomplete tail,
+ * the gap at the end is calculated as well (see set_gap_at_end()).
+ */
+void set_config_offsets(call_frame_t *frame,
+			xlator_t *this,
+			uint64_t offset,
+			uint64_t count,
+			atom_data_type dtype,
+			int32_t set_gap)
+{
+	crypt_local_t *local;
+	struct object_cipher_info *object;
+	struct avec_config *conf;
+	uint32_t resid;
+
+	uint32_t atom_size;
+	uint32_t atom_bits;
+
+	size_t orig_size;
+	off_t orig_offset;
+	size_t expanded_size;
+	off_t aligned_offset;
+
+	uint32_t off_in_head = 0;
+	uint32_t off_in_tail = 0;
+	uint32_t nr_full_blocks;
+	/*
+	 * wide enough to hold expanded_size of regions which are
+	 * larger than 2G (e.g. a hole of an expanding truncate)
+	 */
+	int64_t size_full_blocks;
+
+	uint32_t acount; /* number of aligned components to write.
+			  * The same as number of occupied logical
+			  * blocks (atoms)
+			  */
+	local = frame->local;
+	object = &local->info->cinfo;
+	conf = (dtype == DATA_ATOM ?
+		get_data_conf(frame) : get_hole_conf(frame));
+
+	orig_offset = offset;
+	orig_size = count;
+
+	atom_size = get_atom_size(object);
+	atom_bits = get_atom_bits(object);
+
+	/*
+	 * Round-down the start,
+	 * round-up the end.
+	 */
+	resid = offset & (uint64_t)(atom_size - 1);
+
+	if (resid)
+		off_in_head = resid;
+	aligned_offset = offset - off_in_head;
+	expanded_size = orig_size + off_in_head;
+
+	/* calculate tail,
+	   expand size forward */
+	resid = (offset + orig_size) & (uint64_t)(atom_size - 1);
+
+	if (resid) {
+		off_in_tail = resid;
+		expanded_size += (atom_size - off_in_tail);
+	}
+	/*
+	 * calculate number of occupied blocks
+	 */
+	acount = expanded_size >> atom_bits;
+	/*
+	 * calculate number of full blocks:
+	 * drop the incomplete head and tail blocks. If head and
+	 * tail share one and the same block, it is dropped once.
+	 */
+	size_full_blocks = expanded_size;
+	if (off_in_head)
+		size_full_blocks -= atom_size;
+	if (off_in_tail && size_full_blocks > 0)
+		size_full_blocks -= atom_size;
+	nr_full_blocks = size_full_blocks >> atom_bits;
+
+	conf->atom_size = atom_size;
+	conf->orig_size = orig_size;
+	conf->orig_offset = orig_offset;
+	conf->expanded_size = expanded_size;
+	conf->aligned_offset = aligned_offset;
+
+	conf->off_in_head = off_in_head;
+	conf->off_in_tail = off_in_tail;
+	conf->nr_full_blocks = nr_full_blocks;
+	conf->acount = acount;
+	/*
+	 * Finally, calculate precise amount of
+	 * "extra-bytes" that should be uptodated
+	 * at the end.
+	 * Only if RMW is expected.
+	 */
+	if (off_in_tail && set_gap)
+		set_gap_at_end(frame, object, conf, dtype);
+}
+
+/*
+ * Table of data cipher implementations, indexed by cipher algorithm
+ * and cipher mode. Only the slot of AES in XTS mode is populated;
+ * every other slot of the table is zero-initialized.
+ */
+struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG][LAST_CIPHER_MODE] = {
+ [AES_CIPHER_ALG][XTS_CIPHER_MODE] =
+ { .atomic = _gf_true,
+ .should_pad = _gf_true,
+ .blkbits = AES_BLOCK_BITS,
+ .init = aes_xts_init,
+ .set_private = set_private_aes_xts,
+ .check_key = check_key_aes_xts,
+ .set_iv = set_iv_aes_xts,
+ .encrypt = encrypt_aes_xts
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/keys.c b/xlators/encryption/crypt/src/keys.c
new file mode 100644
index 000000000..4a1d3bb5a
--- /dev/null
+++ b/xlators/encryption/crypt/src/keys.c
@@ -0,0 +1,302 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+
+/* Key hierarchy
+
+ +----------------+
+ | MASTER_VOL_KEY |
+ +-------+--------+
+ |
+ |
+ +----------------+----------------+
+ | | |
+ | | |
+ +-------+------+ +-------+-------+ +------+--------+
+ | NMTD_VOL_KEY | | EMTD_FILE_KEY | | DATA_FILE_KEY |
+ +-------+------+ +---------------+ +---------------+
+ |
+ |
+ +-------+-------+
+ | NMTD_LINK_KEY |
+ +---------------+
+
+ */
+
+#if DEBUG_CRYPT
+/*
+ * Debug-only sanity check: complain if the key derivation
+ * is going to run zero iterations of the PRF.
+ */
+static void check_prf_iters(uint32_t num_iters)
+{
+	if (num_iters != 0)
+		return;
+	gf_log ("crypt", GF_LOG_DEBUG,
+		"bad number of prf iterations : %d", num_iters);
+}
+#else
+#define check_prf_iters(num_iters) noop
+#endif /* DEBUG_CRYPT */
+
+/*
+ * all-zero object id: passed as the id-context
+ * when the per-volume NMTD key is derived
+ */
+unsigned char crypt_fake_oid[16] = {0};
+
+/*
+ * derive key in the counter mode using
+ * sha256-based HMAC as PRF, see
+ * NIST Special Publication 800-108, 5.1)
+ */
+
+#define PRF_OUTPUT_SIZE SHA256_DIGEST_LENGTH
+
+/*
+ * Prepare @ctx for derivation of a child key of the given @type from
+ * the parent key @pkey: allocate the output buffer and compose the
+ * fixed input data of the KDF.
+ *
+ * @pkey is not copied, so it has to stay valid until the derivation
+ * is finished. The counter field at the start of the fixed input data
+ * is left zeroed here, it is set for every iteration by
+ * kderive_update().
+ *
+ * Returns 0 on success, ENOMEM if a buffer can not be allocated.
+ */
+static int32_t kderive_init(struct kderive_context *ctx,
+			    const unsigned char *pkey, /* parent key */
+			    uint32_t pkey_size, /* parent key size */
+			    const unsigned char *idctx, /* id-context */
+			    uint32_t idctx_size,
+			    crypt_key_type type /* type of child key */)
+{
+	unsigned char *pos;
+	uint32_t llen = strlen(crypt_keys[type].label);
+	uint32_t be_ckey_len;
+	/*
+	 * Compound the fixed input data for KDF:
+	 * [i]_2 || Label || 0x00 || Id-Context || [L]_2),
+	 * NIST SP 800-108, 5.1
+	 */
+	ctx->fid_len =
+		sizeof(uint32_t) +
+		llen +
+		1 +
+		idctx_size +
+		sizeof(uint32_t);
+
+	ctx->fid = GF_CALLOC(ctx->fid_len, 1, gf_crypt_mt_key);
+	if (!ctx->fid)
+		return ENOMEM;
+	/* the output is produced by whole portions of the PRF */
+	ctx->out_len = round_up(crypt_keys[type].len >> 3,
+				PRF_OUTPUT_SIZE);
+	ctx->out = GF_CALLOC(ctx->out_len, 1, gf_crypt_mt_key);
+	if (!ctx->out) {
+		GF_FREE(ctx->fid);
+		return ENOMEM;
+	}
+	ctx->pkey = pkey;
+	ctx->pkey_len = pkey_size;
+	ctx->ckey_len = crypt_keys[type].len;
+
+	pos = ctx->fid;
+
+	/* counter will be set up in kderive_update() */
+	pos += sizeof(uint32_t);
+
+	memcpy(pos, crypt_keys[type].label, llen);
+	pos += llen;
+
+	/* set up zero octet */
+	*pos = 0;
+	pos += 1;
+
+	memcpy(pos, idctx, idctx_size);
+	pos += idctx_size;
+
+	/*
+	 * @pos is not aligned in general (it depends on the lengths of
+	 * the label and of the id-context), so the length of the child
+	 * key is stored by bytes
+	 */
+	be_ckey_len = htobe32(ctx->ckey_len);
+	memcpy(pos, &be_ckey_len, sizeof(be_ckey_len));
+
+	return 0;
+}
+
+/*
+ * Run the PRF (HMAC based on sha256, keyed with the parent key) once
+ * per every PRF_OUTPUT_SIZE bytes of ctx->out. The big-endian number
+ * of the iteration (counted from 0) is put to the start of the fixed
+ * input data before every run.
+ *
+ * The values returned by the HMAC_*() calls are not checked.
+ */
+static void kderive_update(struct kderive_context *ctx)
+{
+	HMAC_CTX hctx;
+	uint32_t iter;
+	uint32_t nr_iters = ctx->out_len / PRF_OUTPUT_SIZE;
+	uint32_t *counter = (uint32_t *)ctx->fid;
+	unsigned char *chunk;
+
+	check_prf_iters(nr_iters);
+
+	HMAC_CTX_init(&hctx);
+	for (iter = 0, chunk = ctx->out;
+	     iter < nr_iters;
+	     iter++, chunk += PRF_OUTPUT_SIZE) {
+		/*
+		 * update the iteration number in the fid
+		 */
+		*counter = htobe32(iter);
+		HMAC_Init_ex(&hctx,
+			     ctx->pkey, ctx->pkey_len >> 3,
+			     EVP_sha256(),
+			     NULL);
+		HMAC_Update(&hctx, ctx->fid, ctx->fid_len);
+		HMAC_Final(&hctx, chunk, NULL);
+	}
+	HMAC_CTX_cleanup(&hctx);
+}
+
+/*
+ * Copy the derived key (ctx->ckey_len bits) to @child,
+ * release the buffers of @ctx and zero the context.
+ */
+static void kderive_final(struct kderive_context *ctx, unsigned char *child)
+{
+	size_t key_bytes = ctx->ckey_len >> 3;
+
+	memcpy(child, ctx->out, key_bytes);
+
+	GF_FREE(ctx->fid);
+	GF_FREE(ctx->out);
+
+	memset(ctx, 0, sizeof(*ctx));
+}
+
+/*
+ * derive per-volume key for object ids authentication
+ *
+ * The key is derived from the master key with the fake (zero) object
+ * id as id-context, and is stored to master->m_nmtd_key.
+ * Returns 0 on success, or the error of kderive_init().
+ */
+int32_t get_nmtd_vol_key(struct master_cipher_info *master)
+{
+	struct kderive_context kctx;
+	int32_t err;
+
+	err = kderive_init(&kctx,
+			   master->m_key,
+			   master_key_size(),
+			   crypt_fake_oid, sizeof(uuid_t), NMTD_VOL_KEY);
+	if (err == 0) {
+		kderive_update(&kctx);
+		kderive_final(&kctx, master->m_nmtd_key);
+	}
+	return err;
+}
+
+/*
+ * derive per-link key for authentication of non-encrypted
+ * meta-data (nmtd)
+ *
+ * The key is derived from the per-volume NMTD key with the pathname
+ * loc->path (without the terminating zero) as id-context, and is
+ * stored to @result.
+ * Returns 0 on success, or the error of kderive_init().
+ */
+int32_t get_nmtd_link_key(loc_t *loc,
+			  struct master_cipher_info *master,
+			  unsigned char *result)
+{
+	struct kderive_context kctx;
+	int32_t err;
+
+	err = kderive_init(&kctx,
+			   master->m_nmtd_key,
+			   nmtd_vol_key_size(),
+			   (const unsigned char *)loc->path,
+			   strlen(loc->path), NMTD_LINK_KEY);
+	if (err == 0) {
+		kderive_update(&kctx);
+		kderive_final(&kctx, result);
+	}
+	return err;
+}
+
+/*
+ * derive per-file key for encryption and authentication
+ * of encrypted part of metadata (emtd)
+ *
+ * The key is derived from the master key with the object id
+ * info->oid as id-context, and is stored to @result.
+ * Returns 0 on success, or the error of kderive_init().
+ */
+int32_t get_emtd_file_key(struct crypt_inode_info *info,
+			  struct master_cipher_info *master,
+			  unsigned char *result)
+{
+	struct kderive_context kctx;
+	int32_t err;
+
+	err = kderive_init(&kctx,
+			   master->m_key,
+			   master_key_size(),
+			   info->oid, sizeof(uuid_t), EMTD_FILE_KEY);
+	if (err == 0) {
+		kderive_update(&kctx);
+		kderive_final(&kctx, result);
+	}
+	return err;
+}
+
+/*
+ * Map the size of a data key (in bits) to the type of the key.
+ *
+ * Returns 0 and sets *@type for the sizes 256 and 512;
+ * returns ENOTSUP (and leaves *@type untouched) for any other size.
+ */
+static int32_t data_key_type_by_size(uint32_t keysize, crypt_key_type *type)
+{
+	if (keysize == 256) {
+		*type = DATA_FILE_KEY_256;
+		return 0;
+	}
+	if (keysize == 512) {
+		*type = DATA_FILE_KEY_512;
+		return 0;
+	}
+	gf_log("crypt", GF_LOG_ERROR, "Unsupported data key size %d",
+	       keysize);
+	return ENOTSUP;
+}
+
+/*
+ * derive per-file key for data encryption
+ *
+ * The key of @keysize bits is derived from the master key with the
+ * object id info->oid as id-context, and is stored to @key.
+ * Returns 0 on success, ENOTSUP for an unsupported @keysize, or the
+ * error of kderive_init().
+ */
+int32_t get_data_file_key(struct crypt_inode_info *info,
+			  struct master_cipher_info *master,
+			  uint32_t keysize,
+			  unsigned char *key)
+{
+	struct kderive_context kctx;
+	crypt_key_type ktype;
+	int32_t err;
+
+	err = data_key_type_by_size(keysize, &ktype);
+	if (err == 0)
+		err = kderive_init(&kctx,
+				   master->m_key,
+				   master_key_size(),
+				   info->oid, sizeof(uuid_t), ktype);
+	if (err)
+		return err;
+
+	kderive_update(&kctx);
+	kderive_final(&kctx, key);
+	return 0;
+}
+
+/*
+ * Descriptors of all the key types, indexed by the type of a key:
+ * .len is the length of the key in bits, .label is the label which
+ * is mixed in when a key of this type is derived (see kderive_init()).
+ *
+ * NOTE: Don't change existing keys: it will break compatibility;
+ */
+struct crypt_key crypt_keys[LAST_KEY_TYPE] = {
+ [MASTER_VOL_KEY] =
+ { .len = MASTER_VOL_KEY_SIZE << 3,
+ .label = "volume-master",
+ },
+ [NMTD_VOL_KEY] =
+ { .len = NMTD_VOL_KEY_SIZE << 3,
+ .label = "volume-nmtd-key-generation"
+ },
+ [NMTD_LINK_KEY] =
+ { .len = 128,
+ .label = "link-nmtd-authentication"
+ },
+ [EMTD_FILE_KEY] =
+ { .len = 128,
+ .label = "file-emtd-encryption-and-auth"
+ },
+ [DATA_FILE_KEY_256] =
+ { .len = 256,
+ .label = "file-data-encryption-256"
+ },
+ [DATA_FILE_KEY_512] =
+ { .len = 512,
+ .label = "file-data-encryption-512"
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/metadata.c b/xlators/encryption/crypt/src/metadata.c
new file mode 100644
index 000000000..36b14c055
--- /dev/null
+++ b/xlators/encryption/crypt/src/metadata.c
@@ -0,0 +1,605 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+#include "metadata.h"
+
+/*
+ * Allocate a zeroed buffer of @size bytes for the format string and
+ * record the size in local->format_size.
+ * If @size is zero, only the size is recorded and local->format is
+ * left untouched.
+ *
+ * Returns 0 on success, ENOMEM if the buffer can not be allocated.
+ */
+int32_t alloc_format(crypt_local_t *local, size_t size)
+{
+	if (size == 0) {
+		local->format_size = 0;
+		return 0;
+	}
+	local->format = GF_CALLOC(1, size, gf_crypt_mt_mtd);
+	if (local->format == NULL)
+		return ENOMEM;
+
+	local->format_size = size;
+	return 0;
+}
+
+/*
+ * Allocate the format string for a newly created file.
+ * Returns whatever alloc_format() returns.
+ */
+int32_t alloc_format_create(crypt_local_t *local)
+{
+ return alloc_format(local, new_format_size());
+}
+
+/*
+ * Release the format string of @local.
+ * The pointer is reset, so that a repeated call
+ * (or a stale access) does not hit the freed memory.
+ */
+void free_format(crypt_local_t *local)
+{
+	GF_FREE(local->format);
+	local->format = NULL;
+}
+
+/*
+ * Check compatibility with extracted metadata
+ *
+ * Returns 0 if the attributes loaded to @info are supported,
+ * EINVAL otherwise.
+ */
+static int32_t check_file_metadata(struct crypt_inode_info *info)
+{
+	struct object_cipher_info *object = &info->cinfo;
+
+	if (info->nr_minor != CRYPT_XLATOR_ID) {
+		gf_log("crypt", GF_LOG_WARNING,
+		       "unsupported minor subversion %d", info->nr_minor);
+		return EINVAL;
+	}
+	/*
+	 * LAST_CIPHER_ALG and LAST_CIPHER_MODE are the dimensions of
+	 * the table data_cipher_algs[][], so they are already out of
+	 * the range of valid ids
+	 */
+	if (object->o_alg >= LAST_CIPHER_ALG) {
+		gf_log("crypt", GF_LOG_WARNING,
+		       "unsupported cipher algorithm %d",
+		       object->o_alg);
+		return EINVAL;
+	}
+	if (object->o_mode >= LAST_CIPHER_MODE) {
+		gf_log("crypt", GF_LOG_WARNING,
+		       "unsupported cipher mode %d",
+		       object->o_mode);
+		return EINVAL;
+	}
+	if (object->o_block_bits < CRYPT_MIN_BLOCK_BITS ||
+	    object->o_block_bits > CRYPT_MAX_BLOCK_BITS) {
+		gf_log("crypt", GF_LOG_WARNING, "unsupported block bits %d",
+		       object->o_block_bits);
+		return EINVAL;
+	}
+	/* TBD: check data key size */
+	return 0;
+}
+
+/*
+ * Size of the v1-specific part of the format string after the
+ * operation @op is applied to the string of @old_size bytes.
+ * Zero is returned when cutting a string which holds a single
+ * per-link MAC, and for an unknown operation.
+ */
+static size_t format_size_v1(mtd_op_t op, size_t old_size)
+{
+	if (op == MTD_CREATE)
+		return sizeof(struct mtd_format_v1);
+	if (op == MTD_OVERWRITE)
+		return old_size;
+	if (op == MTD_APPEND)
+		return old_size + NMTD_8_MAC_SIZE;
+	if (op == MTD_CUT)
+		return (old_size > sizeof(struct mtd_format_v1) ?
+			old_size - NMTD_8_MAC_SIZE : 0);
+
+	gf_log("crypt", GF_LOG_WARNING, "Bad mtd operation");
+	return 0;
+}
+
+/*
+ * Calculate size of the updated format string.
+ * Returned zero means that we don't need to update the format string.
+ *
+ * The loader gets @old_size without the common header. If @old_size
+ * is smaller than the header, this difference wraps around; the v1
+ * loader ignores it for MTD_CREATE.
+ */
+size_t format_size(mtd_op_t op, size_t old_size)
+{
+	size_t old_body = old_size - sizeof(struct crypt_format);
+	size_t new_body;
+
+	new_body = mtd_loaders[current_mtd_loader()].format_size(op, old_body);
+	if (new_body == 0)
+		return 0;
+	return new_body + sizeof(struct crypt_format);
+}
+
+/*
+ * Size of the format string of a newly created file (nr_links = 1).
+ * The old size passed to format_size() is just a placeholder here.
+ */
+size_t new_format_size(void)
+{
+ return format_size(MTD_CREATE, 0);
+}
+
+/*
+ * Calculate per-link MAC by pathname
+ *
+ * The MAC is a CMAC (based on AES-128-CBC) of the NMTD of @info,
+ * keyed with the per-link key which is derived for @loc. The full
+ * MAC is stored to @result. @fmt is not used here.
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+static int32_t calc_link_mac_v1(struct mtd_format_v1 *fmt,
+				loc_t *loc,
+				unsigned char *result,
+				struct crypt_inode_info *info,
+				struct master_cipher_info *master)
+{
+	unsigned char link_key[16];
+	CMAC_CTX *cmac;
+	size_t mac_len;
+
+	if (get_nmtd_link_key(loc, master, link_key)) {
+		gf_log("crypt", GF_LOG_ERROR, "Can not get nmtd link key");
+		return -1;
+	}
+	cmac = CMAC_CTX_new();
+	if (cmac == NULL) {
+		gf_log("crypt", GF_LOG_ERROR, "CMAC_CTX_new failed");
+		return -1;
+	}
+	if (!CMAC_Init(cmac, link_key, sizeof(link_key),
+		       EVP_aes_128_cbc(), 0)) {
+		gf_log("crypt", GF_LOG_ERROR, "CMAC_Init failed");
+		goto fail;
+	}
+	if (!CMAC_Update(cmac, get_NMTD_V1(info), SIZE_OF_NMTD_V1)) {
+		gf_log("crypt", GF_LOG_ERROR, "CMAC_Update failed");
+		goto fail;
+	}
+	if (!CMAC_Final(cmac, result, &mac_len)) {
+		gf_log("crypt", GF_LOG_ERROR, "CMAC_Final failed");
+		goto fail;
+	}
+	CMAC_CTX_free(cmac);
+	return 0;
+ fail:
+	CMAC_CTX_free(cmac);
+	return -1;
+}
+
+/*
+ * Create per-link MAC of index @idx by pathname
+ *
+ * The MAC is calculated for @loc, truncated to SIZE_OF_NMTD_V1_MAC
+ * bytes and stored to the slot @idx of the array of MACs of @fmt.
+ * Returns 0 on success, -1 if the MAC can not be calculated.
+ */
+static int32_t create_link_mac_v1(struct mtd_format_v1 *fmt,
+				  uint32_t idx,
+				  loc_t *loc,
+				  struct crypt_inode_info *info,
+				  struct master_cipher_info *master)
+{
+	unsigned char full_mac[16];
+	unsigned char *slot = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC;
+
+	if (calc_link_mac_v1(fmt, loc, full_mac, info, master) != 0)
+		return -1;
+
+	memcpy(slot, full_mac, SIZE_OF_NMTD_V1_MAC);
+	return 0;
+}
+
+/*
+ * Pack the v1-specific part of metadata of a new file to @wire:
+ * fill the attributes from @info and @master, encrypt them in place
+ * (AES-GCM keyed with the per-file EMTD key, the object id is used as
+ * IV, the id of the loader is authenticated as additional data),
+ * store the MAC of the encrypted part, and create the first per-link
+ * MAC for @loc.
+ *
+ * Returns 0 on success, non-zero on any failure.
+ */
+static int32_t create_format_v1(unsigned char *wire,
+				loc_t *loc,
+				struct crypt_inode_info *info,
+				struct master_cipher_info *master)
+{
+	int32_t ret;
+	struct mtd_format_v1 *fmt;
+	unsigned char mtd_key[16];
+	AES_KEY EMTD_KEY;
+	uint32_t ad;
+	GCM128_CONTEXT *gctx;
+
+	fmt = (struct mtd_format_v1 *)wire;
+
+	fmt->minor_id = info->nr_minor;
+	fmt->alg_id = AES_CIPHER_ALG;
+	fmt->dkey_factor = master->m_dkey_size >> KEY_FACTOR_BITS;
+	fmt->block_bits = master->m_block_bits;
+	fmt->mode_id = master->m_mode;
+	/*
+	 * retrieve the key for the encrypted part of metadata;
+	 * the per-link key is derived by create_link_mac_v1()
+	 */
+	ret = get_emtd_file_key(info, master, mtd_key);
+	if (ret)
+		return ret;
+
+	ret = AES_set_encrypt_key(mtd_key, sizeof(mtd_key)*8, &EMTD_KEY);
+	if (ret < 0) {
+		gf_log("crypt", GF_LOG_ERROR, "Can not set encrypt key");
+		return ret;
+	}
+	gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt);
+	if (!gctx) {
+		gf_log("crypt", GF_LOG_ERROR, "Can not alloc gcm context");
+		return ENOMEM;
+	}
+	CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t));
+
+	ad = htole32(MTD_LOADER_V1);
+	ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad));
+	if (ret) {
+		gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed");
+		CRYPTO_gcm128_release(gctx);
+		return ret;
+	}
+	ret = CRYPTO_gcm128_encrypt(gctx,
+				    get_EMTD_V1(fmt),
+				    get_EMTD_V1(fmt),
+				    SIZE_OF_EMTD_V1);
+	if (ret) {
+		gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_encrypt failed");
+		CRYPTO_gcm128_release(gctx);
+		return ret;
+	}
+	/*
+	 * set MAC of encrypted part of metadata
+	 */
+	CRYPTO_gcm128_tag(gctx, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC);
+	CRYPTO_gcm128_release(gctx);
+	/*
+	 * set the first MAC of non-encrypted part of metadata
+	 */
+	return create_link_mac_v1(fmt, 0, loc, info, master);
+}
+
+/*
+ * Called by fops:
+ * ->create();
+ * ->link();
+ *
+ * Pack common and version-specific parts of file's metadata:
+ * the id of the current loader goes to the common header, the rest
+ * of @wire is filled by the loader.
+ * Pre-conditions: @info contains valid object-id.
+ */
+int32_t create_format(unsigned char *wire,
+		      loc_t *loc,
+		      struct crypt_inode_info *info,
+		      struct master_cipher_info *master)
+{
+	struct crypt_format *hdr = (struct crypt_format *)wire;
+	unsigned char *body = wire + sizeof(struct crypt_format);
+
+	hdr->loader_id = current_mtd_loader();
+
+	return mtd_loaders[current_mtd_loader()].create_format(body, loc,
+							       info, master);
+}
+
+/*
+ * Append or overwrite per-link mac of @mac_idx index
+ * in accordance with the new pathname
+ *
+ * The old format string (@old_size bytes) is copied to @new, then the
+ * MAC for @loc is written to the slot @mac_idx of @new.
+ * NOTE(review): when appending, the slot lies past the copied bytes,
+ * so @new presumably has room for one more MAC - confirm with caller.
+ * @local is not used here.
+ */
+int32_t appov_link_mac_v1(unsigned char *new,
+ unsigned char *old,
+ uint32_t old_size,
+ int32_t mac_idx,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local)
+{
+ memcpy(new, old, old_size);
+ return create_link_mac_v1((struct mtd_format_v1 *)new, mac_idx,
+ loc, info, master);
+}
+
+/*
+ * Cut per-link mac of @mac_idx index
+ *
+ * Everything that precedes the MAC in @old, and everything that
+ * follows it, is copied to @new back to back. Always returns 0.
+ * Only @new, @old, @old_size and @mac_idx are used here.
+ */
+static int32_t cut_link_mac_v1(unsigned char *new,
+			       unsigned char *old,
+			       uint32_t old_size,
+			       int32_t mac_idx,
+			       loc_t *loc,
+			       struct crypt_inode_info *info,
+			       struct master_cipher_info *master,
+			       crypt_local_t *local)
+{
+	/*
+	 * struct mtd_format_v1 already includes the MAC of index 0,
+	 * so the MAC of @mac_idx starts at @before and ends at @after
+	 */
+	size_t before = sizeof(struct mtd_format_v1) +
+		NMTD_8_MAC_SIZE * (mac_idx - 1);
+	size_t after = sizeof(struct mtd_format_v1) +
+		NMTD_8_MAC_SIZE * mac_idx;
+
+	memcpy(new, old, before);
+	memcpy(new + before, old + after, old_size - after);
+	return 0;
+}
+
+/*
+ * Build the v1-specific part of the updated format string in @new
+ * from the old one (@old, @old_len bytes) in accordance with @op:
+ *
+ * MTD_APPEND:    add a MAC for @loc after the existing ones
+ *                (@mac_idx passed by the caller is ignored);
+ * MTD_OVERWRITE: replace the MAC of @mac_idx by the MAC for @loc;
+ * MTD_CUT:       remove the MAC of @mac_idx.
+ *
+ * Returns 0 on success, -1 on failure or for an unknown operation.
+ */
+int32_t update_format_v1(unsigned char *new,
+			 unsigned char *old,
+			 size_t old_len,
+			 int32_t mac_idx, /* of old name */
+			 mtd_op_t op,
+			 loc_t *loc,
+			 struct crypt_inode_info *info,
+			 struct master_cipher_info *master,
+			 crypt_local_t *local)
+{
+	if (op == MTD_APPEND)
+		/* index of the first slot after the existing MACs */
+		mac_idx = 1 + (old_len - sizeof(struct mtd_format_v1))/8;
+
+	if (op == MTD_APPEND || op == MTD_OVERWRITE)
+		return appov_link_mac_v1(new, old, old_len, mac_idx,
+					 loc, info, master, local);
+	if (op == MTD_CUT)
+		return cut_link_mac_v1(new, old, old_len, mac_idx,
+				       loc, info, master, local);
+
+	gf_log("crypt", GF_LOG_ERROR, "Bad mtd operation %d", op);
+	return -1;
+}
+
+/*
+ * Called by fops:
+ *
+ * ->link()
+ * ->unlink()
+ * ->rename()
+ *
+ * Copy the common header of the format string from @old to @new and
+ * let the current loader update the version-specific part.
+ * Nothing is done (and 0 is returned) if @new is NULL.
+ */
+int32_t update_format(unsigned char *new,
+		      unsigned char *old,
+		      size_t old_len,
+		      int32_t mac_idx,
+		      mtd_op_t op,
+		      loc_t *loc,
+		      struct crypt_inode_info *info,
+		      struct master_cipher_info *master,
+		      crypt_local_t *local)
+{
+	const size_t hdr_size = sizeof(struct crypt_format);
+
+	if (new == NULL)
+		return 0;
+
+	memcpy(new, old, hdr_size);
+
+	return mtd_loaders[current_mtd_loader()].update_format(new + hdr_size,
+							       old + hdr_size,
+							       old_len - hdr_size,
+							       mac_idx, op,
+							       loc, info,
+							       master, local);
+}
+
+/*
+ * Perform preliminary checks of found metadata
+ * Return < 0 on errors;
+ * Return number of object-id MACs (>= 1) on success
+ *
+ * @len is the size of the v1-specific part of the format string.
+ * @wire is not inspected here.
+ */
+int32_t check_format_v1(uint32_t len, unsigned char *wire)
+{
+	uint32_t nr_links;
+
+	if (len < sizeof(struct mtd_format_v1)) {
+		gf_log("crypt", GF_LOG_ERROR,
+		       "v1-loader: bad metadata size %d", len);
+		goto error;
+	}
+	/* what is left is the array of additional per-link MACs */
+	len -= sizeof(struct mtd_format_v1);
+	if (len % sizeof(nmtd_8_mac_t)) {
+		gf_log("crypt", GF_LOG_ERROR,
+		       "v1-loader: bad metadata format");
+		goto error;
+	}
+	nr_links = 1 + len / sizeof(nmtd_8_mac_t);
+	if (nr_links > _POSIX_LINK_MAX)
+		goto error;
+	return nr_links;
+ error:
+	/*
+	 * must be negative: a positive value would be
+	 * taken by the caller for a number of MACs
+	 */
+	return -EIO;
+}
+
+/*
+ * Verify per-link MAC specified by index @idx
+ *
+ * The MAC is calculated for @loc and compared with the one which is
+ * stored in the slot @idx of @fmt.
+ *
+ * return:
+ * -1 on errors;
+ * 0 on failed verification;
+ * 1 on successful verification
+ */
+static int32_t verify_link_mac_v1(struct mtd_format_v1 *fmt,
+				  uint32_t idx /* index of the mac to verify */,
+				  loc_t *loc,
+				  struct crypt_inode_info *info,
+				  struct master_cipher_info *master)
+{
+	unsigned char expected[16];
+	unsigned char *stored = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC;
+
+	if (calc_link_mac_v1(fmt, loc, expected, info, master) != 0)
+		return -1;
+
+	return memcmp(expected, stored, SIZE_OF_NMTD_V1_MAC) == 0 ? 1 : 0;
+}
+
+/*
+ * Lookup per-link MAC by pathname.
+ *
+ * The first @nr_macs MACs of @fmt are verified against @loc
+ * one by one, the search stops at the first match or error.
+ *
+ * return index of the MAC, if it was found;
+ * return < 0 on errors, or if the MAC wasn't found
+ */
+static int32_t lookup_link_mac_v1(struct mtd_format_v1 *fmt,
+				  uint32_t nr_macs,
+				  loc_t *loc,
+				  struct crypt_inode_info *info,
+				  struct master_cipher_info *master)
+{
+	uint32_t slot = 0;
+
+	while (slot < nr_macs) {
+		int32_t res = verify_link_mac_v1(fmt, slot, loc, info, master);
+
+		if (res < 0)
+			return res;
+		if (res > 0)
+			return slot;
+		slot++;
+	}
+	return -ENOENT;
+}
+
+/*
+ * Extract version-specific part of metadata
+ *
+ * The per-link MAC which matches @loc is looked up first, and its
+ * index is stored to local->mac_idx. If @load_info is set, the
+ * encrypted part of metadata is then decrypted in place in @wire,
+ * its MAC is verified, and the attributes are loaded to @info and
+ * checked for compatibility.
+ *
+ * Returns 0 on success, non-zero on any failure.
+ */
+static int32_t open_format_v1(unsigned char *wire,
+ int32_t len,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local,
+ gf_boolean_t load_info)
+{
+ int32_t ret;
+ int32_t num_nmtd_macs;
+ struct mtd_format_v1 *fmt;
+ unsigned char mtd_key[16];
+ AES_KEY EMTD_KEY;
+ GCM128_CONTEXT *gctx;
+ uint32_t ad;
+ emtd_8_mac_t gmac;
+ struct object_cipher_info *object;
+
+ /*
+ * NOTE(review): only values <= 0 are treated as a failure here;
+ * confirm that check_format_v1() reports errors that way.
+ */
+ num_nmtd_macs = check_format_v1(len, wire);
+ if (num_nmtd_macs <= 0)
+ return EIO;
+ fmt = (struct mtd_format_v1 *)wire;
+
+ ret = lookup_link_mac_v1(fmt, num_nmtd_macs, loc, info, master);
+ if (ret < 0) {
+ gf_log("crypt", GF_LOG_ERROR, "NMTD verification failed");
+ return EINVAL;
+ }
+ local->mac_idx = ret;
+ if (load_info == _gf_false)
+ /* the case of partial open */
+ return 0;
+
+ object = &info->cinfo;
+
+ ret = get_emtd_file_key(info, master, mtd_key);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, "Can not retrieve metadata key");
+ return ret;
+ }
+ /*
+ * decrypt encrypted meta-data
+ */
+ ret = AES_set_encrypt_key(mtd_key, sizeof(mtd_key)*8, &EMTD_KEY);
+ if (ret < 0) {
+ gf_log("crypt", GF_LOG_ERROR, "Can not set encrypt key");
+ return ret;
+ }
+ gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt);
+ if (!gctx) {
+ gf_log("crypt", GF_LOG_ERROR, "Can not alloc gcm context");
+ return ENOMEM;
+ }
+ /* the same IV and additional data as in create_format_v1() */
+ CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t));
+
+ ad = htole32(MTD_LOADER_V1);
+ ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad));
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed");
+ CRYPTO_gcm128_release(gctx);
+ return ret;
+ }
+ /* in-place: the EMTD in @wire is plain text after this call */
+ ret = CRYPTO_gcm128_decrypt(gctx,
+ get_EMTD_V1(fmt),
+ get_EMTD_V1(fmt),
+ SIZE_OF_EMTD_V1);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_decrypt failed");
+ CRYPTO_gcm128_release(gctx);
+ return ret;
+ }
+ /*
+ * verify metadata: the calculated tag
+ * has to match the stored one
+ */
+ CRYPTO_gcm128_tag(gctx, gmac, sizeof(gmac));
+ CRYPTO_gcm128_release(gctx);
+ if (memcmp(gmac, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC)) {
+ gf_log("crypt", GF_LOG_ERROR, "EMTD verification failed");
+ return EINVAL;
+ }
+ /*
+ * load verified metadata to the private part of inode
+ */
+ info->nr_minor = fmt->minor_id;
+
+ object->o_alg = fmt->alg_id;
+ object->o_dkey_size = fmt->dkey_factor << KEY_FACTOR_BITS;
+ object->o_block_bits = fmt->block_bits;
+ object->o_mode = fmt->mode_id;
+
+ return check_file_metadata(info);
+}
+
+/*
+ * perform metadata authentication against @loc->path;
+ * extract crypt-specific attributes and populate @info
+ * with them (optional)
+ *
+ * @str is the format string of @len bytes: the common header, which
+ * is followed by the part specific to the loader named in the header.
+ *
+ * Returns EIO if @len is negative or too small for the common header,
+ * EINVAL if the loader is not supported; otherwise whatever the
+ * loader returns.
+ */
+int32_t open_format(unsigned char *str,
+		    int32_t len,
+		    loc_t *loc,
+		    struct crypt_inode_info *info,
+		    struct master_cipher_info *master,
+		    crypt_local_t *local,
+		    gf_boolean_t load_info)
+{
+	struct crypt_format *fmt;
+	/*
+	 * a negative @len has to be rejected explicitly: compared
+	 * with sizeof() it would be converted to a huge unsigned
+	 * value and pass the check
+	 */
+	if (len < 0 || (size_t)len < sizeof(*fmt)) {
+		gf_log("crypt", GF_LOG_ERROR, "Bad core format");
+		return EIO;
+	}
+	fmt = (struct crypt_format *)str;
+
+	if (fmt->loader_id >= LAST_MTD_LOADER) {
+		gf_log("crypt", GF_LOG_ERROR,
+		       "Unsupported loader id %d", fmt->loader_id);
+		return EINVAL;
+	}
+	str += sizeof(*fmt);
+	len -= sizeof(*fmt);
+
+	return mtd_loaders[fmt->loader_id].open_format(str,
+						       len,
+						       loc,
+						       info,
+						       master,
+						       local,
+						       load_info);
+}
+
+/*
+ * Table of metadata loaders, indexed by the id of a loader.
+ * Only the loader of version "v1" is defined here.
+ */
+struct crypt_mtd_loader mtd_loaders [LAST_MTD_LOADER] = {
+ [MTD_LOADER_V1] =
+ {.format_size = format_size_v1,
+ .create_format = create_format_v1,
+ .open_format = open_format_v1,
+ .update_format = update_format_v1
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/metadata.h b/xlators/encryption/crypt/src/metadata.h
new file mode 100644
index 000000000..a92f149ef
--- /dev/null
+++ b/xlators/encryption/crypt/src/metadata.h
@@ -0,0 +1,74 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __METADATA_H__
+#define __METADATA_H__
+
+/* Size, in bytes, of one MAC of the non-encrypted / encrypted metadata */
+#define NMTD_8_MAC_SIZE (8)
+#define EMTD_8_MAC_SIZE (8)
+
+/* Fixed-size byte arrays, each holding exactly one such MAC */
+typedef uint8_t nmtd_8_mac_t[NMTD_8_MAC_SIZE];
+typedef uint8_t emtd_8_mac_t[EMTD_8_MAC_SIZE] ;
+
+/*
+ * Version "v1" of file's metadata.
+ * Metadata of this version has 4 components:
+ *
+ * 1) EMTD (Encrypted part of MeTaData);
+ * 2) NMTD (Non-encrypted part of MeTaData);
+ * 3) EMTD_MAC; (EMTD Message Authentication Code);
+ * 4) Array of per-link NMTD MACs (for every (hard)link it includes
+ * exactly one MAC)
+ *
+ * The structure is packed: the five one-byte EMTD fields are laid out
+ * back to back and are immediately followed by the two 8-byte MACs.
+ * Only the first per-link MAC is declared as a member.
+ */
+struct mtd_format_v1 {
+ /* EMTD, encrypted part of meta-data */
+ uint8_t alg_id; /* cipher algorithm id (only AES for now) */
+ uint8_t mode_id; /* cipher mode id; (only XTS for now) */
+ uint8_t block_bits; /* encoded block size */
+ uint8_t minor_id; /* client translator id */
+ uint8_t dkey_factor; /* encoded size of the data key */
+ /* MACs */
+ emtd_8_mac_t gmac; /* MAC of the encrypted meta-data, 8 bytes */
+ nmtd_8_mac_t omac; /* per-link MACs of the non-encrypted
+ * meta-data: at least one such MAC is always
+ * present */
+} __attribute__((packed));
+
+/*
+ * NMTD, the non-encrypted part of metadata of version "v1"
+ * is file's gfid, which is generated on trusted machines.
+ *
+ * SIZE_OF_EMTD_V1 is the distance from the first EMTD field (alg_id) to
+ * the first MAC (gmac), i.e. the size of the encrypted fields only.
+ * The two *_MAC sizes are the sizes of one MAC of the respective kind.
+ */
+#define SIZE_OF_NMTD_V1 (sizeof(uuid_t))
+#define SIZE_OF_EMTD_V1 (offsetof(struct mtd_format_v1, gmac) - \
+ offsetof(struct mtd_format_v1, alg_id))
+#define SIZE_OF_NMTD_V1_MAC (NMTD_8_MAC_SIZE)
+#define SIZE_OF_EMTD_V1_MAC (EMTD_8_MAC_SIZE)
+
+/* Return a pointer to the start of the EMTD part (its first field, alg_id) */
+static inline unsigned char *get_EMTD_V1(struct mtd_format_v1 *format)
+{
+ return &format->alg_id;
+}
+
+/* Return the NMTD of version "v1", which is taken from @info->oid */
+static inline unsigned char *get_NMTD_V1(struct crypt_inode_info *info)
+{
+ return info->oid;
+}
+
+/* Return a pointer to the MAC of the encrypted metadata (gmac) */
+static inline unsigned char *get_EMTD_V1_MAC(struct mtd_format_v1 *format)
+{
+ return format->gmac;
+}
+
+/* Return a pointer to the first per-link MAC of the non-encrypted metadata */
+static inline unsigned char *get_NMTD_V1_MAC(struct mtd_format_v1 *format)
+{
+ return format->omac;
+}
+
+#endif /* __METADATA_H__ */
diff --git a/xlators/encryption/rot-13/src/rot-13.c b/xlators/encryption/rot-13/src/rot-13.c
index b9ac29a72..1bcfe0192 100644
--- a/xlators/encryption/rot-13/src/rot-13.c
+++ b/xlators/encryption/rot-13/src/rot-13.c
@@ -150,6 +150,7 @@ init (xlator_t *this)
if (gf_string2boolean (data->data, &priv->encrypt_write) == -1) {
gf_log (this->name, GF_LOG_ERROR,
"encrypt-write takes only boolean options");
+ GF_FREE (priv);
return -1;
}
}
@@ -159,6 +160,7 @@ init (xlator_t *this)
if (gf_string2boolean (data->data, &priv->decrypt_read) == -1) {
gf_log (this->name, GF_LOG_ERROR,
"decrypt-read takes only boolean options");
+ GF_FREE (priv);
return -1;
}
}
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index 6a73301d7..1fdd474c2 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -1,4 +1,4 @@
-SUBDIRS = locks quota read-only mac-compat quiesce marker index \
- protect $(GLUPY_SUBDIR) # trash path-converter # filter
+SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier \
+ protect compress changelog gfid-access $(GLUPY_SUBDIR) qemu-block # trash path-converter # filter
CLEANFILES =
diff --git a/xlators/storage/bd_map/Makefile.am b/xlators/features/barrier/Makefile.am
index a985f42a8..a985f42a8 100644
--- a/xlators/storage/bd_map/Makefile.am
+++ b/xlators/features/barrier/Makefile.am
diff --git a/xlators/features/barrier/src/Makefile.am b/xlators/features/barrier/src/Makefile.am
new file mode 100644
index 000000000..8859be328
--- /dev/null
+++ b/xlators/features/barrier/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = barrier.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+barrier_la_LDFLAGS = -module -avoid-version
+
+barrier_la_SOURCES = barrier.c
+
+barrier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = barrier.h barrier-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/barrier/src/barrier-mem-types.h b/xlators/features/barrier/src/barrier-mem-types.h
new file mode 100644
index 000000000..36647a669
--- /dev/null
+++ b/xlators/features/barrier/src/barrier-mem-types.h
@@ -0,0 +1,20 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BARRIER_MEM_TYPES_H__
+#define __BARRIER_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+/*
+ * Memory-accounting types of the barrier translator; numbering continues
+ * right after the common types. gf_barrier_mt_end marks the end of the list.
+ */
+enum gf_barrier_mem_types_ {
+ gf_barrier_mt_priv_t = gf_common_mt_end + 1,
+ gf_barrier_mt_end
+};
+#endif
diff --git a/xlators/features/barrier/src/barrier.c b/xlators/features/barrier/src/barrier.c
new file mode 100644
index 000000000..5edb9cdd3
--- /dev/null
+++ b/xlators/features/barrier/src/barrier.c
@@ -0,0 +1,658 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "barrier.h"
+#include "defaults.h"
+#include "call-stub.h"
+
+#include "statedump.h"
+
+/*
+ * Reply path of writev. BARRIER_FOP_CBK queues the reply as a call stub
+ * while the barrier is enabled and unwinds it to the parent otherwise;
+ * either way it ends with a jump to the 'out' label.
+ */
+int32_t
+barrier_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ BARRIER_FOP_CBK (writev, out, frame, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of fremovexattr: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (fremovexattr, out, frame, this, op_ret, op_errno,
+ xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of removexattr: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (removexattr, out, frame, this, op_ret, op_errno,
+ xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of truncate: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (truncate, out, frame, this, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of ftruncate: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (ftruncate, out, frame, this, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of rename: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ BARRIER_FOP_CBK (rename, out, frame, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of rmdir: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (rmdir, out, frame, this, op_ret, op_errno, preparent,
+ postparent, xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of unlink: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (unlink, out, frame, this, op_ret, op_errno, preparent,
+ postparent, xdata);
+out:
+ return 0;
+}
+
+/*
+ * Reply path of fsync: the reply is handed to BARRIER_FOP_CBK, which
+ * queues it while the barrier is enabled and unwinds it otherwise.
+ */
+int32_t
+barrier_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (fsync, out, frame, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+out:
+ return 0;
+}
+
+/*
+ * writev entry point. Only writes carrying O_SYNC are routed through
+ * barrier_writev_cbk (and may therefore be barriered); every other write
+ * is tail-wound to the child without a callback of this translator.
+ */
+int32_t
+barrier_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+                struct iobref *iobref, dict_t *xdata)
+{
+        if (flags & O_SYNC) {
+                STACK_WIND (frame, barrier_writev_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->writev, fd, vector, count,
+                            off, flags, iobref, xdata);
+        } else {
+                STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+                                 FIRST_CHILD(this)->fops->writev,
+                                 fd, vector, count, off, flags, iobref, xdata);
+        }
+
+        return 0;
+}
+
+/*
+ * Wind fremovexattr to the first child with barrier_fremovexattr_cbk
+ * installed, so that the reply passes through the barrier.
+ */
+int32_t
+barrier_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_fremovexattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
+/*
+ * Wind removexattr to the first child with barrier_removexattr_cbk
+ * installed, so that the reply passes through the barrier.
+ */
+int32_t
+barrier_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_removexattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+/*
+ * Wind truncate to the first child with barrier_truncate_cbk installed,
+ * so that the reply passes through the barrier.
+ */
+int32_t
+barrier_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_truncate_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+/*
+ * Wind rename to the first child with barrier_rename_cbk installed,
+ * so that the reply passes through the barrier.
+ */
+int32_t
+barrier_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_rename_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+/*
+ * Wind rmdir to the first child with barrier_rmdir_cbk installed,
+ * so that the reply passes through the barrier.
+ */
+int
+barrier_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_rmdir_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rmdir,
+ loc, flags, xdata);
+ return 0;
+}
+
+/*
+ * Wind unlink to the first child with barrier_unlink_cbk installed,
+ * so that the reply passes through the barrier.
+ */
+int32_t
+barrier_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflag, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_unlink_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->unlink,
+ loc, xflag, xdata);
+ return 0;
+}
+
+/*
+ * Wind ftruncate to the first child with barrier_ftruncate_cbk installed,
+ * so that the reply passes through the barrier.
+ */
+int32_t
+barrier_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_ftruncate_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+/*
+ * Wind fsync to the first child with barrier_fsync_cbk installed,
+ * so that the reply passes through the barrier.
+ */
+int32_t
+barrier_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t flags, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_fsync_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsync,
+ fd, flags, xdata);
+ return 0;
+}
+
+/*
+ * Detach the first stub from @queue and return it; returns NULL when
+ * @queue is empty. The translator's private data is only asserted to
+ * exist, it is not otherwise used.
+ */
+call_stub_t *
+__barrier_dequeue (xlator_t *this, struct list_head *queue)
+{
+        barrier_priv_t *conf = this->private;
+        call_stub_t    *head = NULL;
+
+        GF_ASSERT (conf);
+
+        if (!list_empty (queue)) {
+                head = list_entry (queue->next, call_stub_t, list);
+                list_del_init (&head->list);
+        }
+
+        return head;
+}
+
+/*
+ * Resume, in queue order, every stub found on @queue until it is empty.
+ */
+void
+barrier_dequeue_all (xlator_t *this, struct list_head *queue)
+{
+        call_stub_t *next = NULL;
+
+        gf_log (this->name, GF_LOG_INFO, "Dequeuing all the barriered fops");
+
+        /* TODO: Start the below task in a new thread */
+        for (;;) {
+                next = __barrier_dequeue (this, queue);
+                if (next == NULL)
+                        break;
+                call_resume (next);
+        }
+
+        gf_log (this->name, GF_LOG_INFO, "Dequeuing the barriered fops is "
+                "finished");
+}
+
+/*
+ * Timer callback (registered by __barrier_enable): switches the barrier
+ * off under the lock and then resumes everything that was queued.
+ * @data is the xlator the timer was armed for.
+ */
+void
+barrier_timeout (void *data)
+{
+        xlator_t         *this    = data;
+        barrier_priv_t   *priv    = NULL;
+        struct list_head  pending = {0,};
+
+        THIS = this;
+        priv = this->private;
+
+        INIT_LIST_HEAD (&pending);
+
+        gf_log (this->name, GF_LOG_CRITICAL, "Disabling barrier because of "
+                "the barrier timeout.");
+
+        /* move the queued stubs to the local list while holding the lock */
+        LOCK (&priv->lock);
+        {
+                __barrier_disable (this, &pending);
+        }
+        UNLOCK (&priv->lock);
+
+        /* resume them without holding the lock */
+        barrier_dequeue_all (this, &pending);
+}
+
+/*
+ * Append @stub to the translator's queue and account for it in
+ * queue_size.
+ */
+void
+__barrier_enqueue (xlator_t *this, call_stub_t *stub)
+{
+        barrier_priv_t *conf = this->private;
+
+        GF_ASSERT (conf);
+
+        list_add_tail (&stub->list, &conf->queue);
+        conf->queue_size++;
+}
+
+/*
+ * Switch barriering off: cancel a pending timeout timer, move all queued
+ * stubs from the translator's queue to @queue and reset the bookkeeping.
+ * Resuming the stubs is left to the caller.
+ */
+void
+__barrier_disable (xlator_t *this, struct list_head *queue)
+{
+        barrier_priv_t *conf = this->private;
+
+        GF_ASSERT (conf);
+
+        if (conf->timer != NULL) {
+                /* the result of the cancellation is deliberately ignored */
+                (void) gf_timer_call_cancel (this->ctx, conf->timer);
+                conf->timer = NULL;
+        }
+
+        list_splice_init (&conf->queue, queue);
+        conf->queue_size = 0;
+        conf->barrier_enabled = _gf_false;
+}
+
+/*
+ * Switch barriering on: arm the timeout timer (barrier_timeout fires
+ * after priv->timeout) and mark the barrier as enabled.
+ *
+ * Returns 0 on success, -1 when the timer could not be registered; in
+ * that case the barrier stays disabled.
+ */
+int
+__barrier_enable (xlator_t *this, barrier_priv_t *priv)
+{
+        priv->timer = gf_timer_call_after (this->ctx, priv->timeout,
+                                           barrier_timeout, (void *) this);
+        if (priv->timer == NULL) {
+                gf_log (this->name, GF_LOG_CRITICAL, "Couldn't add barrier "
+                        "timeout event.");
+                return -1;
+        }
+
+        priv->barrier_enabled = _gf_true;
+        return 0;
+}
+
+/*
+ * Event handler of the translator.
+ *
+ * GF_EVENT_TRANSLATOR_OP carries a dict (@data) whose "barrier" boolean
+ * requests the barrier to be switched on or off. Requesting the state the
+ * barrier is already in is logged as an error. Every event, including the
+ * translator op, is forwarded to default_notify().
+ *
+ * Returns 0 on success. For GF_EVENT_TRANSLATOR_OP it returns -1 when the
+ * option cannot be read, when the barrier already is in the requested
+ * state or when enabling it fails.
+ */
+int
+notify (xlator_t *this, int event, void *data, ...)
+{
+        barrier_priv_t   *priv            = NULL;
+        dict_t           *dict            = NULL;
+        gf_boolean_t      past            = _gf_false;
+        int               ret             = -1;
+        gf_boolean_t      barrier_enabled = _gf_false;
+        struct list_head  queue           = {0,};
+
+        priv = this->private;
+        GF_ASSERT (priv);
+        INIT_LIST_HEAD (&queue);
+
+        switch (event) {
+        case GF_EVENT_TRANSLATOR_OP:
+        {
+                dict = data;
+                GF_OPTION_RECONF ("barrier", barrier_enabled, dict,
+                                  bool, out);
+
+                LOCK (&priv->lock);
+                {
+                        past = priv->barrier_enabled;
+
+                        switch (past) {
+                        case _gf_false:
+                                if (barrier_enabled) {
+                                        ret = __barrier_enable (this,priv);
+                                        if (ret)
+                                                goto unlock;
+                                } else {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "Already disabled.");
+                                        goto unlock;
+                                }
+                                break;
+
+                        case _gf_true:
+                                if (!barrier_enabled) {
+                                        __barrier_disable(this, &queue);
+                                } else {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "Already enabled");
+                                        goto unlock;
+                                }
+                                break;
+                        }
+                        ret = 0;
+                }
+unlock:
+                UNLOCK (&priv->lock);
+
+                /* stubs collected by __barrier_disable are resumed unlocked */
+                if (!list_empty (&queue))
+                        barrier_dequeue_all (this, &queue);
+
+                /*
+                 * The event is still propagated, but without falling into
+                 * the default case: that case resets ret to 0 and would
+                 * hide a failure of the operation above from the caller.
+                 */
+                default_notify (this, event, data);
+                break;
+        }
+        default:
+        {
+                default_notify (this, event, data);
+                ret = 0;
+                break;
+        }
+        }
+out:
+        return ret;
+}
+
+/*
+ * Apply changed values of the "barrier" and "barrier-timeout" options.
+ *
+ * The barrier is enabled or disabled only when the requested state differs
+ * from the current one; stubs released by disabling it are resumed after
+ * the lock has been dropped.
+ *
+ * Returns 0 on success, -1 when an option cannot be read or the barrier
+ * cannot be enabled (the stored timeout is left unchanged in that case).
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        barrier_priv_t   *priv            = NULL;
+        gf_boolean_t      past            = _gf_false;
+        int               ret             = -1;
+        gf_boolean_t      barrier_enabled = _gf_false;
+        uint32_t          timeout         = 0;
+        struct timespec   old_timeout     = {0,};
+        struct list_head  queue           = {0,};
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        GF_OPTION_RECONF ("barrier", barrier_enabled, options, bool, out);
+        GF_OPTION_RECONF ("barrier-timeout", timeout, options, time, out);
+
+        INIT_LIST_HEAD (&queue);
+
+        LOCK (&priv->lock);
+        {
+                past = priv->barrier_enabled;
+
+                /*
+                 * Store the new timeout before the barrier is possibly
+                 * enabled: __barrier_enable() arms its timer with
+                 * priv->timeout and would otherwise use the stale value.
+                 */
+                old_timeout = priv->timeout;
+                priv->timeout.tv_sec = timeout;
+
+                switch (past) {
+                case _gf_false:
+                        if (barrier_enabled) {
+                                ret = __barrier_enable (this, priv);
+                                if (ret) {
+                                        /* failed: keep the old setting */
+                                        priv->timeout = old_timeout;
+                                        goto unlock;
+                                }
+                        }
+                        break;
+
+                case _gf_true:
+                        if (!barrier_enabled) {
+                                __barrier_disable (this, &queue);
+                        }
+                        break;
+                }
+                ret = 0;
+        }
+unlock:
+        UNLOCK (&priv->lock);
+
+        if (!list_empty (&queue))
+                barrier_dequeue_all (this, &queue);
+
+out:
+        return ret;
+}
+
+/*
+ * Set up memory accounting for this translator's memory types.
+ * Returns whatever xlator_mem_acct_init() returns; a non-zero result is
+ * logged as an error.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int status = xlator_mem_acct_init (this, gf_barrier_mt_end + 1);
+
+        if (status != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Memory accounting "
+                        "initialization failed.");
+        }
+
+        return status;
+}
+
+/*
+ * Translator constructor: checks that exactly one child is configured,
+ * allocates the private state, reads the "barrier" and "barrier-timeout"
+ * options and enables the barrier right away when requested.
+ *
+ * Returns 0 on success and -1 on failure; on failure nothing allocated
+ * here is left behind and this->private is not set.
+ */
+int
+init (xlator_t *this)
+{
+        int             ret     = -1;
+        barrier_priv_t *priv    = NULL;
+        uint32_t        timeout = 0;
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "'barrier' not configured with exactly one child");
+                goto out;
+        }
+
+        if (!this->parents)
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_barrier_mt_priv_t);
+        if (!priv)
+                goto out;
+
+        LOCK_INIT (&priv->lock);
+
+        GF_OPTION_INIT ("barrier", priv->barrier_enabled, bool, out);
+        GF_OPTION_INIT ("barrier-timeout", timeout, time, out);
+        priv->timeout.tv_sec = timeout;
+
+        INIT_LIST_HEAD (&priv->queue);
+
+        if (priv->barrier_enabled) {
+                ret = __barrier_enable (this, priv);
+                if (ret == -1)
+                        goto out;
+        }
+
+        this->private = priv;
+        ret = 0;
+out:
+        /*
+         * Any failure after the allocation (bad option value, timer
+         * registration) must release the private state, otherwise it leaks.
+         */
+        if (ret && priv) {
+                LOCK_DESTROY (&priv->lock);
+                GF_FREE (priv);
+        }
+        return ret;
+}
+
+/*
+ * Translator destructor: disables the barrier, resumes whatever was still
+ * queued and releases the private state. Does nothing when the translator
+ * has no private state.
+ */
+void
+fini (xlator_t *this)
+{
+        barrier_priv_t   *priv    = this->private;
+        struct list_head  pending = {0,};
+
+        if (priv == NULL)
+                return;
+
+        INIT_LIST_HEAD (&pending);
+
+        gf_log (this->name, GF_LOG_INFO, "Disabling barriering and dequeuing "
+                "all the queued fops");
+        LOCK (&priv->lock);
+        {
+                __barrier_disable (this, &pending);
+        }
+        UNLOCK (&priv->lock);
+
+        if (!list_empty (&pending))
+                barrier_dequeue_all (this, &pending);
+
+        this->private = NULL;
+
+        LOCK_DESTROY (&priv->lock);
+        GF_FREE (priv);
+}
+
+/*
+ * Write one queued stub to the statedump: its fop name, the gfid of its
+ * loc and, when set, the loc's path and name. Keys are built from @prefix.
+ *
+ * NOTE(review): stub->args.loc is read for every stub; confirm that loc is
+ * populated for the callback stubs queued by BARRIER_FOP_CBK.
+ */
+static void
+barrier_dump_stub (call_stub_t *stub, char *prefix)
+{
+ char key[GF_DUMP_MAX_BUF_LEN] = {0,};
+
+ gf_proc_dump_build_key (key, prefix, "fop");
+ gf_proc_dump_write (key, "%s", gf_fop_list[stub->fop]);
+
+ gf_proc_dump_build_key (key, prefix, "gfid");
+ gf_proc_dump_write (key, "%s", uuid_utoa (stub->args.loc.gfid));
+
+ if (stub->args.loc.path) {
+ gf_proc_dump_build_key (key, prefix, "path");
+ gf_proc_dump_write (key, "%s", stub->args.loc.path);
+ }
+ if (stub->args.loc.name) {
+ gf_proc_dump_build_key (key, prefix, "name");
+ gf_proc_dump_write (key, "%s", stub->args.loc.name);
+ }
+
+ return;
+}
+
+/*
+ * Dump every stub of priv->queue into its own statedump section named
+ * "stub.<index>", counting from 0 in queue order.
+ */
+static void
+__barrier_dump_queue (barrier_priv_t *priv)
+{
+        char         section[GF_DUMP_MAX_BUF_LEN] = {0,};
+        call_stub_t *entry                        = NULL;
+        int          idx                          = 0;
+
+        GF_VALIDATE_OR_GOTO ("barrier", priv, out);
+
+        list_for_each_entry (entry, &priv->queue, list) {
+                snprintf (section, sizeof (section), "stub.%d", idx);
+                idx++;
+                gf_proc_dump_add_section (section);
+                barrier_dump_stub (entry, section);
+        }
+
+out:
+        return;
+}
+
+/*
+ * Statedump hook: writes the barrier state (enabled flag, timeout and,
+ * while the barrier is enabled, the queue size and the queued stubs).
+ *
+ * Returns 0 on success (also when there is no private state to dump) and
+ * -1 when @this is NULL.
+ */
+int
+barrier_dump_priv (xlator_t *this)
+{
+        int             ret                      = -1;
+        char            key[GF_DUMP_MAX_BUF_LEN] = {0,};
+        barrier_priv_t *priv                     = NULL;
+
+        GF_VALIDATE_OR_GOTO ("barrier", this, out);
+
+        priv = this->private;
+        if (!priv)
+                return 0;
+
+        gf_proc_dump_build_key (key, "xlator.features.barrier", "priv");
+        gf_proc_dump_add_section (key);
+
+        LOCK (&priv->lock);
+        {
+                gf_proc_dump_build_key (key, "barrier", "enabled");
+                gf_proc_dump_write (key, "%d", priv->barrier_enabled);
+                gf_proc_dump_build_key (key, "barrier", "timeout");
+                /* tv_sec is a time_t: cast so that it matches PRId64 */
+                gf_proc_dump_write (key, "%"PRId64,
+                                    (int64_t) priv->timeout.tv_sec);
+                if (priv->barrier_enabled) {
+                        gf_proc_dump_build_key (key, "barrier", "queue_size");
+                        /* queue_size is unsigned */
+                        gf_proc_dump_write (key, "%u", priv->queue_size);
+                        __barrier_dump_queue (priv);
+                }
+        }
+        UNLOCK (&priv->lock);
+
+        /* everything was written: report success instead of the initial -1 */
+        ret = 0;
+out:
+        return ret;
+}
+
+/*
+ * File operations implemented by this translator. Each entry winds the
+ * call to the child with a barrier_*_cbk callback on the reply path
+ * (writev only does so for O_SYNC writes).
+ */
+struct xlator_fops fops = {
+
+ /* Barrier Class fops */
+ .rmdir = barrier_rmdir,
+ .unlink = barrier_unlink,
+ .rename = barrier_rename,
+ .removexattr = barrier_removexattr,
+ .fremovexattr = barrier_fremovexattr,
+ .truncate = barrier_truncate,
+ .ftruncate = barrier_ftruncate,
+ .fsync = barrier_fsync,
+
+ /* Writes with only O_SYNC flag */
+ .writev = barrier_writev,
+};
+
+/* Statedump hooks: only the private state is dumped (barrier_dump_priv) */
+struct xlator_dumpops dumpops = {
+ .priv = barrier_dump_priv,
+};
+
+/* No callbacks are set; the table is left zero-initialized */
+struct xlator_cbks cbks;
+
+/*
+ * Options understood by this translator: "barrier" (boolean, default "off")
+ * and "barrier-timeout" (time, default "120"). The list is terminated by an
+ * entry with a NULL key.
+ */
+struct volume_options options[] = {
+ { .key = {"barrier"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "When \"on\", blocks acknowledgements to application "
+ "for file operations such as rmdir, rename, unlink, "
+ "removexattr, fremovexattr, truncate, ftruncate, "
+ "write (with O_SYNC), fsync. It is turned \"off\" by "
+ "default."
+ },
+ { .key = {"barrier-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .default_value = "120",
+ .description = "After 'timeout' seconds since the time 'barrier' "
+ "option was set to \"on\", acknowledgements to file "
+ "operations are no longer blocked and previously "
+ "blocked acknowledgements are sent to the application"
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/features/barrier/src/barrier.h b/xlators/features/barrier/src/barrier.h
new file mode 100644
index 000000000..8face9f65
--- /dev/null
+++ b/xlators/features/barrier/src/barrier.h
@@ -0,0 +1,91 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BARRIER_H__
+#define __BARRIER_H__
+
+#include "barrier-mem-types.h"
+#include "xlator.h"
+#include "timer.h"
+#include "call-stub.h"
+
+/*
+ * Assign @value to @to while holding @lock. @lock is the lock object
+ * itself (the macro takes its address); @to and @value are expanded
+ * without parentheses, so pass simple expressions.
+ */
+#define BARRIER_SAFE_ASSIGN(lock, to, value) \
+ do { \
+ LOCK (&(lock)); \
+ { \
+ to = value; \
+ } \
+ UNLOCK (&(lock)); \
+ } while (0)
+
+/*
+ * Common body of the barrier_*_cbk callbacks.
+ *
+ * Under the translator lock: when the barrier is enabled, the reply
+ * (@params) is wrapped into a callback stub that resumes through
+ * default_<fop>_cbk_resume and is appended to the barrier queue. When the
+ * stub cannot be created, the barrier is disabled and the stubs queued so
+ * far are collected in a local list.
+ *
+ * After the lock is dropped:
+ *  - a queued reply needs nothing more: jump to @label;
+ *  - after a failed stub creation the collected stubs are resumed and the
+ *    current reply is unwound;
+ *  - with the barrier disabled the reply is unwound immediately.
+ * Every path ends with "goto @label", so @label must exist in the caller.
+ *
+ * The expansion defines a label named 'unlock' and locals named '_priv',
+ * '_stub', '_barrier_enabled' and 'queue'; it can therefore be used only
+ * once per function and those names must not clash with the caller's.
+ */
+#define BARRIER_FOP_CBK(fop_name, label, frame, this, params ...) \
+ do { \
+ barrier_priv_t *_priv = NULL; \
+ call_stub_t *_stub = NULL; \
+ gf_boolean_t _barrier_enabled= _gf_false; \
+ struct list_head queue = {0, }; \
+ \
+ INIT_LIST_HEAD (&queue); \
+ \
+ _priv = this->private; \
+ GF_ASSERT (_priv); \
+ \
+ LOCK (&_priv->lock); \
+ { \
+ if (_priv->barrier_enabled) { \
+ _barrier_enabled = _priv->barrier_enabled;\
+ \
+ _stub = fop_##fop_name##_cbk_stub \
+ (frame, \
+ default_##fop_name##_cbk_resume,\
+ params); \
+ if (!_stub) { \
+ __barrier_disable (this, &queue);\
+ goto unlock; \
+ } \
+ \
+ __barrier_enqueue (this, _stub); \
+ } \
+ } \
+unlock: \
+ UNLOCK (&_priv->lock); \
+ \
+ if (_stub) \
+ goto label; \
+ \
+ if (_barrier_enabled && !_stub) { \
+ gf_log (this->name, GF_LOG_CRITICAL, \
+ "Failed to barrier FOPs, disabling " \
+ "barrier. FOP: %s, ERROR: %s", \
+ #fop_name, strerror (ENOMEM)); \
+ barrier_dequeue_all (this, &queue); \
+ } \
+ \
+ STACK_UNWIND_STRICT (fop_name, frame, params); \
+ goto label; \
+ } while (0)
+
+/* Private state of one barrier translator instance */
+typedef struct {
+ /* pending timeout timer; NULL when none is armed */
+ gf_timer_t *timer;
+ /* whether replies are currently being queued */
+ gf_boolean_t barrier_enabled;
+ /* taken around reads and updates of barrier_enabled and the queue */
+ gf_lock_t lock;
+ /* call stubs of the replies held back while the barrier is enabled */
+ struct list_head queue;
+ /* delay after which barrier_timeout disables the barrier */
+ struct timespec timeout;
+ /* number of stubs on 'queue' */
+ uint32_t queue_size;
+} barrier_priv_t;
+
+int __barrier_enable (xlator_t *this, barrier_priv_t *priv);
+void __barrier_enqueue (xlator_t *this, call_stub_t *stub);
+void __barrier_disable (xlator_t *this, struct list_head *queue);
+void barrier_timeout (void *data);
+void barrier_dequeue_all (xlator_t *this, struct list_head *queue);
+call_stub_t *__barrier_dequeue (xlator_t *this, struct list_head *queue);
+
+#endif
diff --git a/xlators/features/changelog/Makefile.am b/xlators/features/changelog/Makefile.am
new file mode 100644
index 000000000..153bb6850
--- /dev/null
+++ b/xlators/features/changelog/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src lib
+
+CLEANFILES =
diff --git a/xlators/features/changelog/lib/Makefile.am b/xlators/features/changelog/lib/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/features/changelog/lib/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/changelog/lib/examples/c/get-changes.c b/xlators/features/changelog/lib/examples/c/get-changes.c
new file mode 100644
index 000000000..14562585a
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/c/get-changes.c
@@ -0,0 +1,87 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/**
+ * get set of new changes every 10 seconds (just print the file names)
+ *
+ * Compile it using:
+ * gcc -o getchanges `pkg-config --cflags libgfchangelog` get-changes.c \
+ * `pkg-config --libs libgfchangelog`
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <limits.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <errno.h>
+
+#include "changelog.h"
+
+/* Print @fn followed by the strerror() text for the current errno */
+#define handle_error(fn) \
+ printf ("%s (reason: %s)\n", fn, strerror (errno))
+
+/*
+ * Example consumer: registers with the changelog of a hard-coded brick,
+ * then scans for new changelog files every 10 seconds, prints the name of
+ * each one and marks it as processed.
+ *
+ * Exits with a non-zero status when registration or a scan fails.
+ */
+int
+main (int argc, char ** argv)
+{
+        int     i                = 0;
+        int     ret              = 0;
+        ssize_t nr_changes       = 0;
+        ssize_t changes          = 0;
+        char    fbuf[PATH_MAX]   = {0,};
+
+        /* get changes for brick "/home/vshankar/export/yow/yow-1" */
+        ret = gf_changelog_register ("/home/vshankar/export/yow/yow-1",
+                                     "/tmp/scratch", "/tmp/change.log", 9, 5);
+        if (ret) {
+                handle_error ("register failed");
+                goto out;
+        }
+
+        while (1) {
+                i = 0;
+                nr_changes = gf_changelog_scan ();
+                if (nr_changes < 0) {
+                        handle_error ("scan(): ");
+                        /* make the failure visible in the exit status */
+                        ret = -1;
+                        break;
+                }
+
+                if (nr_changes == 0)
+                        goto next;
+
+                /* nr_changes is a ssize_t, hence %zd */
+                printf ("Got %zd changelog files\n", nr_changes);
+
+                while ( (changes =
+                         gf_changelog_next_change (fbuf, PATH_MAX)) > 0) {
+                        printf ("changelog file [%d]: %s\n", ++i, fbuf);
+
+                        /* process changelog */
+                        /* ... */
+                        /* ... */
+                        /* ... */
+                        /* done processing */
+
+                        ret = gf_changelog_done (fbuf);
+                        if (ret)
+                                handle_error ("gf_changelog_done");
+                }
+
+                if (changes == -1)
+                        handle_error ("gf_changelog_next_change");
+
+        next:
+                sleep (10);
+        }
+
+ out:
+        return ret;
+}
diff --git a/xlators/features/changelog/lib/examples/python/changes.py b/xlators/features/changelog/lib/examples/python/changes.py
new file mode 100644
index 000000000..d21db8eab
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/python/changes.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+
+import os
+import sys
+import time
+import libgfchangelog
+
+cl = libgfchangelog.Changes()
+
+def get_changes(brick, scratch_dir, log_file, log_level, interval):
+ change_list = []
+ try:
+ cl.cl_register(brick, scratch_dir, log_file, log_level)
+ while True:
+ cl.cl_scan()
+ change_list = cl.cl_getchanges()
+ if change_list:
+ print change_list
+ for change in change_list:
+ print('done with %s' % (change))
+ cl.cl_done(change)
+ time.sleep(interval)
+ except OSError:
+ ex = sys.exc_info()[1]
+ print ex
+
+if __name__ == '__main__':
+ if len(sys.argv) != 5:
+ print("usage: %s <brick> <scratch-dir> <log-file> <fetch-interval>"
+ % (sys.argv[0]))
+ sys.exit(1)
+ get_changes(sys.argv[1], sys.argv[2], sys.argv[3], 9, int(sys.argv[4]))
diff --git a/xlators/features/changelog/lib/examples/python/libgfchangelog.py b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
new file mode 100644
index 000000000..68ec3baf1
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
@@ -0,0 +1,64 @@
+import os
+from ctypes import *
+from ctypes.util import find_library
+
+class Changes(object):
+ libgfc = CDLL(find_library("gfchangelog"), use_errno=True)
+
+ @classmethod
+ def geterrno(cls):
+ return get_errno()
+
+ @classmethod
+ def raise_oserr(cls):
+ errn = cls.geterrno()
+ raise OSError(errn, os.strerror(errn))
+
+ @classmethod
+ def _get_api(cls, call):
+ return getattr(cls.libgfc, call)
+
+ @classmethod
+ def cl_register(cls, brick, path, log_file, log_level, retries = 0):
+ ret = cls._get_api('gf_changelog_register')(brick, path,
+ log_file, log_level, retries)
+ if ret == -1:
+ cls.raise_oserr()
+
+ @classmethod
+ def cl_scan(cls):
+ ret = cls._get_api('gf_changelog_scan')()
+ if ret == -1:
+ cls.raise_oserr()
+
+ @classmethod
+ def cl_startfresh(cls):
+ ret = cls._get_api('gf_changelog_start_fresh')()
+ if ret == -1:
+ cls.raise_oserr()
+
+ @classmethod
+ def cl_getchanges(cls):
+ """ remove hardcoding for path name length """
+ def clsort(f):
+ return f.split('.')[-1]
+ changes = []
+ buf = create_string_buffer('\0', 4096)
+ call = cls._get_api('gf_changelog_next_change')
+
+ while True:
+ ret = call(buf, 4096)
+ if ret in (0, -1):
+ break;
+ changes.append(buf.raw[:ret-1])
+ if ret == -1:
+ cls.raise_oserr()
+ # cleanup tracker
+ cls.cl_startfresh()
+ return sorted(changes, key=clsort)
+
+ @classmethod
+ def cl_done(cls, clfile):
+ ret = cls._get_api('gf_changelog_done')(clfile)
+ if ret == -1:
+ cls.raise_oserr()
diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am
new file mode 100644
index 000000000..28d5a70aa
--- /dev/null
+++ b/xlators/features/changelog/lib/src/Makefile.am
@@ -0,0 +1,38 @@
+libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \
+ -DDATADIR=\"$(localstatedir)\"
+
+libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \
+ -I../../../src/ -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/xlators/features/changelog/src \
+ -DDATADIR=\"$(localstatedir)\"
+
+libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(GF_GLUSTERFS_LIBS)
+
+libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) -version-info $(LIBGFCHANGELOG_LT_VERSION)
+
+libgfchangelogdir = $(includedir)/glusterfs/gfchangelog
+lib_LTLIBRARIES = libgfchangelog.la
+
+CONTRIB_BUILDDIR = $(top_builddir)/contrib
+
+libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-process.c \
+ gf-changelog-helpers.c gf-history-changelog.c \
+ $(CONTRIBDIR)/uuid/clear.c \
+ $(CONTRIBDIR)/uuid/copy.c $(CONTRIBDIR)/uuid/gen_uuid.c \
+ $(CONTRIBDIR)/uuid/pack.c $(CONTRIBDIR)/uuid/parse.c \
+ $(CONTRIBDIR)/uuid/unparse.c $(CONTRIBDIR)/uuid/uuid_time.c \
+ $(CONTRIBDIR)/uuid/compare.c $(CONTRIBDIR)/uuid/isnull.c \
+ $(CONTRIBDIR)/uuid/unpack.c
+
+noinst_HEADERS = gf-changelog-helpers.h $(CONTRIBDIR)/uuid/uuidd.h \
+ $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \
+ $(CONTRIB_BUILDDIR)/uuid/uuid_types.h
+
+libgfchangelog_HEADERS = changelog.h
+
+CLEANFILES =
+CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h
+
+$(top_builddir)/libglusterfs/src/libglusterfs.la:
+ $(MAKE) -C $(top_builddir)/libglusterfs/src/ all
diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h
new file mode 100644
index 000000000..5cddfb583
--- /dev/null
+++ b/xlators/features/changelog/lib/src/changelog.h
@@ -0,0 +1,31 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GF_CHANGELOG_H
+#define _GF_CHANGELOG_H
+
+/* API set */
+
+/*
+ * Register as a consumer of the changelogs of the brick at @brick_path.
+ * @scratch_dir, @log_file, @log_level and @max_reconnects configure the
+ * library (presumably working directory, logging and reconnect attempts;
+ * confirm against the implementation). Returns 0 on success.
+ */
+int
+gf_changelog_register (char *brick_path, char *scratch_dir,
+                       char *log_file, int log_level, int max_reconnects);
+
+/*
+ * Scan for new changelog files. Returns the number of files found, or a
+ * negative value on error.
+ */
+ssize_t
+gf_changelog_scan (void);
+
+/* Restart the iteration done by gf_changelog_next_change(); -1 on error */
+int
+gf_changelog_start_fresh (void);
+
+/*
+ * Copy the path of the next changelog file into @bufptr (at most @maxlen
+ * bytes). Returns a positive value while a path was produced, 0 when
+ * there are no more files and -1 on error.
+ */
+ssize_t
+gf_changelog_next_change (char *bufptr, size_t maxlen);
+
+/* Mark the changelog file @file as processed; non-zero on error */
+int
+gf_changelog_done (char *file);
+
+#endif
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
new file mode 100644
index 000000000..1eef8bf04
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
@@ -0,0 +1,180 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+
+#include "changelog-mem-types.h"
+#include "gf-changelog-helpers.h"
+
+/**
+ * Issue a single read(2) of at most @bufsize bytes from @fd into @buffer
+ * and hand back whatever read(2) returned.
+ */
+ssize_t
+gf_changelog_read_path (int fd, char *buffer, size_t bufsize)
+{
+        ssize_t nread = read (fd, buffer, bufsize);
+
+        return nread;
+}
+
+/**
+ * Write @len bytes of @buffer to @fd, looping over short writes.
+ *
+ * A write(2) interrupted by a signal (EINTR) is retried instead of being
+ * reported as a short write. Any other failure stops the loop.
+ *
+ * Returns the number of bytes actually written; callers compare it
+ * against @len to detect failure.
+ */
+size_t
+gf_changelog_write (int fd, char *buffer, size_t len)
+{
+        ssize_t size   = 0;
+        size_t  writen = 0;
+
+        while (writen < len) {
+                size = write (fd, buffer + writen, len - writen);
+                if (size < 0 && errno == EINTR)
+                        continue; /* nothing was written, try again */
+                if (size <= 0)
+                        break;
+
+                writen += size;
+        }
+
+        return writen;
+}
+
+/**
+ * Percent-encode the NUL terminated string @s into @enc.
+ *
+ * @estr is a 256 entry table indexed by byte value: a non-zero entry is
+ * emitted in place of the byte, a zero entry makes the byte come out as
+ * "%XX" (two upper-case hex digits). @enc is NUL terminated after every
+ * input byte; it is left untouched when @s is empty. The caller provides
+ * an output buffer that is large enough.
+ */
+void
+gf_rfc3986_encode (unsigned char *s, char *enc, char *estr)
+{
+        static const char hexdigits[] = "0123456789ABCDEF";
+        unsigned char     ch          = 0;
+
+        while ((ch = *s++) != '\0') {
+                if (estr[ch]) {
+                        *enc++ = estr[ch];
+                } else {
+                        *enc++ = '%';
+                        *enc++ = hexdigits[ch >> 4];
+                        *enc++ = hexdigits[ch & 0x0F];
+                }
+                *enc = '\0';
+        }
+}
+
+/**
+ * thread safe version of readline with buffering
+ * (taken from Unix Network Programming Volume I, W.R. Stevens)
+ *
+ * This is favoured over fgets() as we'd need to ftruncate()
+ * (see gf_changelog_scan() API) to record new changelog files.
+ * Stream open functions do not have a truncate-like API (although
+ * that can be done via @fflush(fp), @ftruncate(fd) and @fseek(fp),
+ * but this involves mixing POSIX file descriptors and stream FILE *).
+ *
+ * NOTE: This implementation still does not work with more than one fd
+ * per thread being used to perform gf_readline(), as the buffer is
+ * kept per thread and not per fd. For this very reason it's not
+ * made a part of libglusterfs.
+ */
+
+/* key of the per-thread readline buffer and its one-time init guard */
+static pthread_key_t  rl_key;
+static pthread_once_t rl_once = PTHREAD_ONCE_INIT;
+
+/* key destructor: releases the buffer stored under rl_key */
+static void
+readline_destructor (void *ptr)
+{
+        GF_FREE (ptr);
+}
+
+/* pthread_once() routine: creates rl_key with the destructor above */
+static void
+readline_once (void)
+{
+        (void) pthread_key_create (&rl_key, readline_destructor);
+}
+
+/**
+ * Hand out one buffered byte through @ptr, refilling the buffer in @tsd
+ * from @fd (up to MAXLINE bytes) when it is exhausted.
+ *
+ * Returns 1 when a byte was stored, 0 on end of file and -1 when read(2)
+ * fails.
+ */
+static ssize_t
+my_read (read_line_t *tsd, int fd, char *ptr)
+{
+        if (tsd->rl_cnt <= 0) {
+                tsd->rl_cnt = read (fd, tsd->rl_buf, MAXLINE);
+                if (tsd->rl_cnt < 0)
+                        return -1;
+                if (tsd->rl_cnt == 0)
+                        return 0;
+                tsd->rl_bufptr = tsd->rl_buf;
+        }
+
+        *ptr = *tsd->rl_bufptr;
+        tsd->rl_bufptr++;
+        tsd->rl_cnt--;
+
+        return 1;
+}
+
+/**
+ * Fetch the calling thread's readline buffer into @tsd, allocating and
+ * registering it under rl_key on first use.
+ *
+ * Returns 0 on success. On failure -1 is returned; a buffer that could
+ * not be registered is freed again (it would otherwise leak, as the key
+ * destructor never sees it) and *@tsd is reset to NULL.
+ */
+static int
+gf_readline_init_once (read_line_t **tsd)
+{
+        if (pthread_once (&rl_once, readline_once) != 0)
+                return -1;
+
+        *tsd = pthread_getspecific (rl_key);
+        if (*tsd)
+                return 0;
+
+        *tsd = GF_CALLOC (1, sizeof (**tsd),
+                          gf_changelog_mt_libgfchangelog_rl_t);
+        if (!*tsd)
+                return -1;
+
+        if (pthread_setspecific (rl_key, *tsd) != 0) {
+                GF_FREE (*tsd);
+                *tsd = NULL;
+                return -1;
+        }
+
+        return 0;
+}
+
+/**
+ * Read one line from @fd into @vptr, storing at most @maxlen - 1 bytes
+ * followed by a terminating NUL. The newline, when present, is kept.
+ *
+ * Returns:
+ *  - the number of bytes stored (newline included) when a newline ended
+ *    the line
+ *  - the number of bytes stored when end of file was hit (0 if none)
+ *  - @maxlen when the buffer filled up before a newline was seen (only
+ *    @maxlen - 1 bytes are stored in that case)
+ *  - -1 on a read error, a NULL buffer or a zero @maxlen (there is no
+ *    room for even the terminating NUL)
+ */
+ssize_t
+gf_readline (int fd, void *vptr, size_t maxlen)
+{
+        size_t       n   = 0;
+        ssize_t      rc  = 0; /* my_read() returns a signed value */
+        char         c   = ' ';
+        char        *ptr = vptr;
+        read_line_t *tsd = NULL;
+
+        if (!vptr || maxlen == 0)
+                return -1;
+
+        if (gf_readline_init_once (&tsd))
+                return -1;
+
+        for (n = 1; n < maxlen; n++) {
+                rc = my_read (tsd, fd, &c);
+                if (rc < 0)
+                        return -1;
+
+                if (rc == 0) { /* end of file */
+                        *ptr = '\0';
+                        return (n - 1);
+                }
+
+                *ptr++ = c;
+                if (c == '\n')
+                        break;
+        }
+
+        *ptr = '\0';
+        return n;
+}
+
+/**
+ * lseek(2) on @fd that also discards the calling thread's readline
+ * buffer, so that the next gf_readline() reads from the new position.
+ *
+ * Returns the resulting offset, or -1 on failure.
+ */
+off_t
+gf_lseek (int fd, off_t offset, int whence)
+{
+        read_line_t *tsd = NULL;
+        off_t        pos = 0;
+
+        if (gf_readline_init_once (&tsd) != 0)
+                return -1;
+
+        pos = lseek (fd, offset, whence);
+        if (pos == -1)
+                return -1;
+
+        /* drop whatever is still buffered */
+        tsd->rl_bufptr = tsd->rl_buf;
+        tsd->rl_cnt    = 0;
+
+        return pos;
+}
+
+/**
+ * ftruncate(2) @fd to @length and discard the calling thread's readline
+ * buffer.
+ *
+ * @length is now honoured (it used to be ignored, the file was always
+ * truncated to zero bytes); callers passing 0 see no change.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+gf_ftruncate (int fd, off_t length)
+{
+        read_line_t *tsd = NULL;
+
+        if (gf_readline_init_once (&tsd))
+                return -1;
+
+        if (ftruncate (fd, length))
+                return -1;
+
+        /* buffered data may refer to bytes that no longer exist */
+        tsd->rl_cnt = 0;
+        tsd->rl_bufptr = tsd->rl_buf;
+
+        return 0;
+}
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
new file mode 100644
index 000000000..fa0edabf0
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
@@ -0,0 +1,102 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GF_CHANGELOG_HELPERS_H
+#define _GF_CHANGELOG_HELPERS_H
+
+#include <unistd.h>
+#include <dirent.h>
+#include <limits.h>
+#include <pthread.h>
+
+#include <xlator.h>
+
+#define GF_CHANGELOG_TRACKER "tracker"
+
+#define GF_CHANGELOG_CURRENT_DIR ".current"
+#define GF_CHANGELOG_PROCESSED_DIR ".processed"
+#define GF_CHANGELOG_PROCESSING_DIR ".processing"
+#define GF_CHANGELOG_HISTORY_DIR ".history"
+
+#ifndef MAXLINE
+#define MAXLINE 4096
+#endif
+
+/**
+ * Append @len bytes from @ptr to @ascii at offset @off and advance @off.
+ * @off must be a modifiable lvalue. @len is evaluated exactly once, so
+ * expressions such as strlen (...) are not computed twice. No bounds
+ * checking is done; the caller guarantees that @ascii has room.
+ */
+#define GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, len) do {             \
+                size_t gfc_fill_len_ = (len);                           \
+                memcpy ((ascii) + (off), (ptr), gfc_fill_len_);         \
+                (off) += gfc_fill_len_;                                 \
+        } while (0)
+
+/* buffering state used by gf_readline(); one instance is kept per thread */
+typedef struct read_line {
+ /* number of buffered bytes not yet handed out */
+ int rl_cnt;
+ /* next byte of rl_buf to hand out */
+ char *rl_bufptr;
+ /* data read from the fd, at most MAXLINE bytes at a time */
+ char rl_buf[MAXLINE];
+} read_line_t;
+
+/* state of a registered changelog consumer */
+typedef struct gf_changelog {
+ xlator_t *this;
+
+ /* 'processing' directory stream */
+ DIR *gfc_dir;
+
+ /* fd to the tracker file (-1 when not open) */
+ int gfc_fd;
+
+ /* number of connection attempts made to the changelog socket */
+ int gfc_connretries;
+
+ /* path of the changelog notification socket */
+ char gfc_sockpath[UNIX_PATH_MAX];
+
+ /* brick path given at registration time */
+ char gfc_brickpath[PATH_MAX];
+
+ /* socket for receiving notifications (-1 when not connected) */
+ int gfc_sockfd;
+
+ /* resolved scratch directory, allocated by realpath() */
+ char *gfc_working_dir;
+
+ /* RFC 3986 string encoding table, indexed by byte value */
+ char rfc3986[256];
+
+ /* sub-directories of the working directory, with trailing '/' */
+ char gfc_current_dir[PATH_MAX];
+ char gfc_processed_dir[PATH_MAX];
+ char gfc_processing_dir[PATH_MAX];
+
+ /* thread running gf_changelog_process() */
+ pthread_t gfc_changelog_processor;
+
+ /* Holds gfc for History API */
+ struct gf_changelog *hist_gfc;
+} gf_changelog_t;
+
+/* connect to the changelog notification socket of gfc's brick */
+int
+gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc);
+
+/* thread routine: receives changelog paths and processes them */
+void *
+gf_changelog_process (void *data);
+
+/* single read(2) of up to bufsize bytes */
+ssize_t
+gf_changelog_read_path (int fd, char *buffer, size_t bufsize);
+
+/* percent-encode s into enc using the lookup table estr */
+void
+gf_rfc3986_encode (unsigned char *s, char *enc, char *estr);
+
+/* write len bytes, looping over short writes; returns bytes written */
+size_t
+gf_changelog_write (int fd, char *buffer, size_t len);
+
+/* buffered, per-thread readline */
+ssize_t
+gf_readline (int fd, void *vptr, size_t maxlen);
+
+/* ftruncate(2) that also resets the readline buffer */
+int
+gf_ftruncate (int fd, off_t length);
+
+/* lseek(2) that also resets the readline buffer */
+off_t
+gf_lseek (int fd, off_t offset, int whence);
+
+
+#endif
diff --git a/xlators/features/changelog/lib/src/gf-changelog-process.c b/xlators/features/changelog/lib/src/gf-changelog-process.c
new file mode 100644
index 000000000..df7204931
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-process.c
@@ -0,0 +1,571 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+#include <pthread.h>
+
+#include "uuid.h"
+#include "globals.h"
+#include "glusterfs.h"
+
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+
+extern int byebye;
+
+/**
+ * number of gfid records after fop number
+ *
+ * Indexed by fop number. The array only extends up to the largest fop
+ * listed here and unlisted fops in between are zero, so an index must be
+ * range checked before it is used.
+ */
+int nr_gfids[] = {
+ [GF_FOP_MKNOD] = 1,
+ [GF_FOP_MKDIR] = 1,
+ [GF_FOP_UNLINK] = 1,
+ [GF_FOP_RMDIR] = 1,
+ [GF_FOP_SYMLINK] = 1,
+ [GF_FOP_RENAME] = 2,
+ [GF_FOP_LINK] = 1,
+ [GF_FOP_CREATE] = 1,
+};
+
+/* textual form of a binary gfid, as produced by uuid_utoa() */
+static char *
+binary_to_ascii (uuid_t uuid)
+{
+        char *text = uuid_utoa (uuid);
+
+        return text;
+}
+
+/* identity conversion: hands @ptr back unchanged */
+static char *
+conv_noop (char *ptr)
+{
+        return ptr;
+}
+
+/*
+ * Parsing helpers. Several of them contain a bare 'break' and are
+ * therefore only meaningful directly inside a switch or loop body: on
+ * error they set the error flag and leave that enclosing construct.
+ */
+
+/* flag an error and break unless the byte at ptr + plen is a NUL */
+#define VERIFY_SEPARATOR(ptr, plen, perr) \
+ { \
+ if (*(ptr + plen) != '\0') { \
+ perr = 1; \
+ break; \
+ } \
+ }
+
+/* advance the cursor by 'bytes' and shrink the remaining byte count */
+#define MOVER_MOVE(mover, nleft, bytes) \
+ { \
+ mover += bytes; \
+ nleft -= bytes; \
+ } \
+
+/* check the separator after a 'le' byte field at mov, then convert the
+ * field with fn; a NULL result flags an error and breaks */
+#define PARSE_GFID(mov, ptr, le, fn, perr) \
+ { \
+ VERIFY_SEPARATOR (mov, le, perr); \
+ ptr = fn (mov); \
+ if (!ptr) { \
+ perr = 1; \
+ break; \
+ } \
+ }
+
+/* append the string pt to buf at offset of, then move the cursor by le */
+#define FILL_AND_MOVE(pt, buf, of, mo, nl, le) \
+ { \
+ GF_CHANGELOG_FILL_BUFFER (pt, buf, of, strlen (pt)); \
+ MOVER_MOVE (mo, nl, le); \
+ }
+
+
+/* copy a binary gfid from the cursor, convert it to text and step over
+ * it; a NULL conversion result flags an error and breaks */
+#define PARSE_GFID_MOVE(ptr, uuid, mover, nleft, perr) \
+ { \
+ memcpy (uuid, mover, sizeof (uuid_t)); \
+ ptr = binary_to_ascii (uuid); \
+ if (!ptr) { \
+ perr = 1; \
+ break; \
+ } \
+ MOVER_MOVE (mover, nleft, sizeof (uuid_t)); \
+ } \
+
+/* size of one decoded line; parenthesized so it is safe in any expression */
+#define LINE_BUFSIZE (3 * PATH_MAX) /* enough buffer for extra chars too */
+
+/**
+ * using mmap() makes parsing easy. fgets() cannot be used here as
+ * the binary gfid could contain a line-feed (0x0A), in that case fgets()
+ * would read an incomplete line and parsing would fail. using POSIX fds
+ * would result in additional code to maintain state in case of partial
+ * reads of data (where multiple entries do not fit entirely in the buffer).
+ *
+ * mmap() gives the flexibility of pointing to an offset in the file
+ * without us worrying about reading it in memory (VM does that for us for
+ * free).
+ */
+
+/**
+ * binary decoder: turn every record of the changelog @from_fd (whose size
+ * is taken from @stbuf), starting at @start_offset, into one line of the
+ * form "<type> <gfid>[<name>]\n" written to @to_fd.
+ *
+ * All accesses are kept inside the mapping and inside the line buffer; a
+ * record that does not fit is treated as a parse error.
+ *
+ * Returns 0 when the whole file was converted, -1 otherwise.
+ */
+static int
+gf_changelog_parse_binary (xlator_t *this,
+                           gf_changelog_t *gfc, int from_fd, int to_fd,
+                           size_t start_offset, struct stat *stbuf)
+
+{
+        int     ret           = -1;
+        off_t   off           = 0;
+        off_t   nleft         = 0;
+        uuid_t  uuid          = {0,};
+        char   *ptr           = NULL;
+        char   *bname_start   = NULL;
+        char   *bname_end     = NULL;
+        char   *mover         = NULL;
+        char   *start         = NULL;
+        char    current_mover = ' ';
+        size_t  blen          = 0;
+        int     parse_err     = 0;
+        int     write_err     = 0;
+        char    ascii[LINE_BUFSIZE] = {0,};
+
+        nleft = stbuf->st_size;
+
+        start = (char *) mmap (NULL, nleft,
+                               PROT_READ, MAP_PRIVATE, from_fd, 0);
+        if (start == (char *) MAP_FAILED) { /* mmap() never returns NULL */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "mmap() error (reason: %s)", strerror (errno));
+                goto out;
+        }
+
+        mover = start;
+
+        MOVER_MOVE (mover, nleft, start_offset);
+
+        while (nleft > 0) {
+
+                off = blen = 0;
+                ptr = bname_start = bname_end = NULL;
+
+                current_mover = *mover;
+
+                switch (current_mover) {
+                case 'D':
+                case 'M':
+                        MOVER_MOVE (mover, nleft, 1);
+                        if (nleft < (off_t) sizeof (uuid_t)) {
+                                /* truncated record */
+                                parse_err = 1;
+                                break;
+                        }
+                        PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err);
+
+                        break;
+
+                case 'E':
+                        MOVER_MOVE (mover, nleft, 1);
+                        if (nleft < (off_t) sizeof (uuid_t)) {
+                                /* truncated record */
+                                parse_err = 1;
+                                break;
+                        }
+                        PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err);
+
+                        /* look for the line-feed that ends the record,
+                         * giving up at a NUL or at the end of the map */
+                        bname_start = mover;
+                        for (bname_end = mover;
+                             bname_end < mover + nleft; bname_end++) {
+                                if (*bname_end == '\n' || *bname_end == '\0')
+                                        break;
+                        }
+                        if ( (bname_end == mover + nleft)
+                             || (*bname_end != '\n') ) {
+                                parse_err = 1;
+                                break;
+                        }
+
+                        blen = bname_end - bname_start;
+                        MOVER_MOVE (mover, nleft, blen);
+
+                        break;
+
+                default:
+                        parse_err = 1;
+                }
+
+                if (parse_err)
+                        break;
+
+                /* type + ' ' + gfid + name + '\n' has to fit in ascii[] */
+                if (strlen (ptr) + blen + 3 > sizeof (ascii)) {
+                        parse_err = 1;
+                        break;
+                }
+
+                GF_CHANGELOG_FILL_BUFFER (&current_mover, ascii, off, 1);
+                GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1);
+                GF_CHANGELOG_FILL_BUFFER (ptr, ascii, off, strlen (ptr));
+                if (blen)
+                        GF_CHANGELOG_FILL_BUFFER (bname_start,
+                                                  ascii, off, blen);
+                GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1);
+
+                if (gf_changelog_write (to_fd, ascii, off) != (size_t) off) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "processing binary changelog failed due to "
+                                " error in writing ascii change (reason: %s)",
+                                strerror (errno));
+                        write_err = 1;
+                        break;
+                }
+
+                /* step over the record separator */
+                MOVER_MOVE (mover, nleft, 1);
+        }
+
+        if ( (nleft == 0) && (!parse_err) && (!write_err))
+                ret = 0;
+
+        if (munmap (start, stbuf->st_size))
+                gf_log (this->name, GF_LOG_ERROR,
+                        "munmap() error (reason: %s)", strerror (errno));
+ out:
+        return ret;
+}
+
+/**
+ * ascii decoder:
+ *     - separate out one entry from another
+ *     - use fop name rather than fop number
+ *
+ * Records of the changelog @from_fd (size taken from @stbuf) are read
+ * starting at @start_offset; every record becomes one line written to
+ * @to_fd. Names following the fop are RFC 3986 encoded using the table
+ * in @gfc.
+ *
+ * A fop number outside the nr_gfids[] table and a record that does not
+ * fit in the line buffer are treated as parse errors.
+ *
+ * Returns 0 when the whole file was converted, -1 otherwise.
+ */
+static int
+gf_changelog_parse_ascii (xlator_t *this,
+                          gf_changelog_t *gfc, int from_fd, int to_fd,
+                          size_t start_offset, struct stat *stbuf)
+{
+        int           ng            = 0;
+        int           ret           = -1;
+        int           fop           = 0;
+        int           len           = 0;
+        off_t         off           = 0;
+        off_t         nleft         = 0;
+        char         *ptr           = NULL;
+        char         *eptr          = NULL;
+        char         *start         = NULL;
+        char         *mover         = NULL;
+        int           parse_err     = 0;
+        int           write_err     = 0;
+        char          current_mover = ' ';
+        char          ascii[LINE_BUFSIZE] = {0,};
+        const char   *fopname       = NULL;
+        const int     nr_fops       = sizeof (nr_gfids) / sizeof (nr_gfids[0]);
+
+        nleft = stbuf->st_size;
+
+        start = (char *) mmap (NULL, nleft,
+                               PROT_READ, MAP_PRIVATE, from_fd, 0);
+        if (start == (char *) MAP_FAILED) { /* mmap() never returns NULL */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "mmap() error (reason: %s)", strerror (errno));
+                goto out;
+        }
+
+        mover = start;
+
+        MOVER_MOVE (mover, nleft, start_offset);
+
+        while (nleft > 0) {
+                off = 0;
+                current_mover = *mover;
+
+                GF_CHANGELOG_FILL_BUFFER (&current_mover, ascii, off, 1);
+                GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1);
+
+                switch (current_mover) {
+                case 'D':
+                case 'M':
+                        MOVER_MOVE (mover, nleft, 1);
+
+                        /* target gfid */
+                        PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN,
+                                    conv_noop, parse_err);
+                        FILL_AND_MOVE(ptr, ascii, off,
+                                      mover, nleft, UUID_CANONICAL_FORM_LEN);
+                        break;
+
+                case 'E':
+                        MOVER_MOVE (mover, nleft, 1);
+
+                        /* target gfid */
+                        PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN,
+                                    conv_noop, parse_err);
+                        FILL_AND_MOVE (ptr, ascii, off,
+                                       mover, nleft, UUID_CANONICAL_FORM_LEN);
+                        FILL_AND_MOVE (" ", ascii, off,
+                                       mover, nleft, 1);
+
+                        /* fop */
+                        len = strlen (mover);
+                        VERIFY_SEPARATOR (mover, len, parse_err);
+
+                        fop = atoi (mover);
+                        if ( (fop < 0) || (fop >= nr_fops) ) {
+                                /* would index past nr_gfids[] */
+                                parse_err = 1;
+                                break;
+                        }
+                        if ( (fopname = gf_fop_list[fop]) == NULL) {
+                                parse_err = 1;
+                                break;
+                        }
+
+                        MOVER_MOVE (mover, nleft, len);
+
+                        len = strlen (fopname);
+                        GF_CHANGELOG_FILL_BUFFER (fopname, ascii, off, len);
+
+                        /* pargfid + bname */
+                        ng = nr_gfids[fop];
+                        while (ng-- > 0) {
+                                MOVER_MOVE (mover, nleft, 1);
+                                len = strlen (mover);
+                                GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1);
+
+                                PARSE_GFID (mover, ptr, len,
+                                            conv_noop, parse_err);
+                                /* worst case every byte turns into "%XX",
+                                 * plus room for the terminating NUL */
+                                eptr = calloc (3 * strlen (ptr) + 1, 1);
+                                if (!eptr) {
+                                        parse_err = 1;
+                                        break;
+                                }
+
+                                gf_rfc3986_encode ((unsigned char *) ptr,
+                                                   eptr, gfc->rfc3986);
+                                /* keep room for a following ' ' or '\n' */
+                                if ((size_t) off + strlen (eptr) + 2
+                                    > sizeof (ascii)) {
+                                        free (eptr);
+                                        parse_err = 1;
+                                        break;
+                                }
+                                FILL_AND_MOVE (eptr, ascii, off,
+                                               mover, nleft, len);
+                                free (eptr);
+                        }
+
+                        break;
+                default:
+                        parse_err = 1;
+                }
+
+                if (parse_err)
+                        break;
+
+                GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1);
+
+                if (gf_changelog_write (to_fd, ascii, off) != (size_t) off) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "processing ascii changelog failed due to "
+                                " wrror in writing change (reason: %s)",
+                                strerror (errno));
+                        write_err = 1;
+                        break;
+                }
+
+                /* step over the record separator */
+                MOVER_MOVE (mover, nleft, 1);
+
+        }
+
+        if ( (nleft == 0) && (!parse_err) && (!write_err))
+                ret = 0;
+
+        if (munmap (start, stbuf->st_size))
+                gf_log (this->name, GF_LOG_ERROR,
+                        "munmap() error (reason: %s)", strerror (errno));
+
+ out:
+        return ret;
+}
+
+#define COPY_BUFSIZE  8192
+/**
+ * Copy everything that can be read from @from_fd to @to_fd, in chunks of
+ * COPY_BUFSIZE bytes.
+ *
+ * Returns 0 once end of file is reached, -1 when a read or a write fails.
+ */
+static int
+gf_changelog_copy (xlator_t *this, int from_fd, int to_fd)
+{
+        char    chunk[COPY_BUFSIZE+1] = {0,};
+        ssize_t nread                 = 0;
+
+        for (;;) {
+                nread = read (from_fd, chunk, COPY_BUFSIZE);
+                if (nread <= 0)
+                        return (nread < 0) ? -1 : 0;
+
+                if (gf_changelog_write (to_fd, chunk, nread) != nread) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "error processing ascii changlog");
+                        return -1;
+                }
+        }
+}
+
+/**
+ * Determine the encoding of the changelog @from_fd from its header and
+ * run the matching decoder, writing the result to @to_fd.
+ *
+ * When the header is all the file contains, *@zerob is set to 1 and -1 is
+ * returned. An unknown or invalid encoding also yields -1; otherwise the
+ * decoder's result is returned.
+ */
+static int
+gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd,
+                     int to_fd, struct stat *stbuf, int *zerob)
+{
+        int    encoding     = -1;
+        size_t elen         = 0;
+        char   buffer[1024] = {0,};
+
+        CHANGELOG_GET_ENCODING (from_fd, buffer, 1024, encoding, elen);
+
+        /* unknown or unsupported encoding */
+        if ( (encoding == -1) || !CHANGELOG_VALID_ENCODING (encoding) )
+                return -1;
+
+        /* nothing but the header */
+        if (elen == stbuf->st_size) {
+                *zerob = 1;
+                return -1;
+        }
+
+        /**
+         * start processing after the header
+         */
+        lseek (from_fd, elen, SEEK_SET);
+
+        /**
+         * the binary decoder ideally should have been a part of
+         * changelog-encoders.c (ie. part of the changelog translator).
+         */
+        if (encoding == CHANGELOG_ENCODE_BINARY)
+                return gf_changelog_parse_binary (this, gfc, from_fd,
+                                                  to_fd, elen, stbuf);
+
+        if (encoding == CHANGELOG_ENCODE_ASCII)
+                return gf_changelog_parse_ascii (this, gfc, from_fd,
+                                                 to_fd, elen, stbuf);
+
+        return gf_changelog_copy (this, from_fd, to_fd);
+}
+
+/**
+ * Decode the changelog @from_path into the .current directory and, when
+ * decoding succeeds, move the result to the .processing directory. A
+ * changelog holding nothing but its header is removed from .current; the
+ * result is then that of the unlink.
+ *
+ * Returns 0 on success and non-zero on failure. Failing to stat/open the
+ * source (or finding that it is not a regular file) and failing to create
+ * the destination are reported as -1; they used to be reported as success
+ * because the return value of a successful stat() was left in place.
+ */
+static int
+gf_changelog_consume (xlator_t *this, gf_changelog_t *gfc, char *from_path)
+{
+        int         ret   = -1;
+        int         fd1   = -1;
+        int         fd2   = -1;
+        int         zerob = 0;
+        struct stat stbuf = {0,};
+        char        dest[PATH_MAX]    = {0,};
+        char        to_path[PATH_MAX] = {0,};
+
+        ret = stat (from_path, &stbuf);
+        if (ret || !S_ISREG(stbuf.st_mode)) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "stat failed on changelog file: %s", from_path);
+                goto out;
+        }
+
+        fd1 = open (from_path, O_RDONLY);
+        if (fd1 < 0) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "cannot open changelog file: %s (reason: %s)",
+                        from_path, strerror (errno));
+                goto out;
+        }
+
+        (void) snprintf (to_path, PATH_MAX, "%s%s",
+                         gfc->gfc_current_dir, basename (from_path));
+        (void) snprintf (dest, PATH_MAX, "%s%s",
+                         gfc->gfc_processing_dir, basename (from_path));
+
+        fd2 = open (to_path, O_CREAT | O_TRUNC | O_RDWR,
+                    S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        if (fd2 < 0) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "cannot create ascii changelog file %s (reason %s)",
+                        to_path, strerror (errno));
+                goto close_fd;
+        }
+
+        ret = gf_changelog_decode (this, gfc, fd1, fd2, &stbuf, &zerob);
+
+        close (fd2);
+
+        if (!ret) {
+                /* move it to processing on a successful decode */
+                ret = rename (to_path, dest);
+                if (ret)
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "error moving %s to processing dir"
+                                " (reason: %s)", to_path,
+                                strerror (errno));
+        }
+
+        /* remove it from .current if it's an empty file */
+        if (zerob) {
+                ret = unlink (to_path);
+                if (ret)
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not unlink %s (reason: %s)",
+                                to_path, strerror (errno));
+        }
+
+ close_fd:
+        close (fd1);
+
+ out:
+        return ret;
+}
+
+/**
+ * Walk the first @readlen bytes at @path, which hold changelog file names
+ * each terminated by a NUL, and consume every complete name.
+ *
+ * Returns a pointer to the first byte that was not consumed (the start of
+ * a partial name, or the end of the data), or NULL as soon as consuming a
+ * changelog fails.
+ */
+static char *
+gf_changelog_ext_change (xlator_t *this,
+                         gf_changelog_t *gfc, char *path, size_t readlen)
+{
+        int     ret    = 0;
+        size_t  seen   = 0;
+        char   *cursor = NULL;
+
+        for (cursor = path; seen < readlen; seen++, cursor++) {
+                if (*cursor != '\0')
+                        continue;
+
+                /* [path, cursor] is one complete file name */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "processing changelog: %s", path);
+                ret = gf_changelog_consume (this, gfc, path);
+                if (ret)
+                        break;
+
+                path = cursor + 1;
+        }
+
+        return (ret) ? NULL : path;
+}
+
+/**
+ * Thread routine of the changelog processor; @data is the gf_changelog_t
+ * of the registered consumer.
+ *
+ * Changelog paths are read from the notification socket and consumed. A
+ * trailing partial path is moved to the front of the buffer and completed
+ * by the next read. When the peer closes the connection a reconnect is
+ * attempted (unless only one connection attempt is configured); if that
+ * is not possible 'byebye' is set and the thread ends.
+ */
+void *
+gf_changelog_process (void *data)
+{
+        gf_changelog_t *gfc                 = (gf_changelog_t *) data;
+        xlator_t       *this                = gfc->this;
+        char            from_path[PATH_MAX] = {0,};
+        char           *rest                = NULL;
+        ssize_t         nread               = 0;
+        ssize_t         pending             = 0;
+
+        pthread_detach (pthread_self());
+
+        while (1) {
+                nread = gf_changelog_read_path (gfc->gfc_sockfd,
+                                                from_path + pending,
+                                                PATH_MAX - pending);
+                if (nread < 0)
+                        continue; /* ignore it for now */
+
+                if (nread == 0) { /* close() from the changelog translator */
+                        gf_log (this->name, GF_LOG_INFO, "close from changelog"
+                                " notification translator.");
+
+                        if ( (gfc->gfc_connretries != 1)
+                             && !gf_changelog_notification_init (this, gfc) )
+                                continue;
+
+                        byebye = 1;
+                        break;
+                }
+
+                nread += pending;
+                rest = gf_changelog_ext_change (this, gfc, from_path, nread);
+                if (!rest) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not extract changelog filename");
+                        continue;
+                }
+
+                /* keep an incomplete trailing path for the next round */
+                pending = (from_path + nread) - rest;
+                if (pending)
+                        memmove (from_path, rest, pending);
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "byebye (%d) from processing thread...", byebye);
+        return NULL;
+}
diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c
new file mode 100644
index 000000000..0827f2cac
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog.c
@@ -0,0 +1,571 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <string.h>
+
+#include "globals.h"
+#include "glusterfs.h"
+#include "logging.h"
+
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+
+int byebye = 0;
+
+/**
+ * Release what @gfc holds: the notification socket, the tracker fd, the
+ * 'processing' directory stream and the working directory string. @gfc
+ * itself is not freed.
+ */
+static void
+gf_changelog_cleanup (gf_changelog_t *gfc)
+{
+        if (gfc->gfc_sockfd != -1)      /* socket */
+                close (gfc->gfc_sockfd);
+
+        if (gfc->gfc_fd != -1)          /* tracker fd */
+                close (gfc->gfc_fd);
+
+        if (gfc->gfc_dir)               /* processing dir */
+                closedir (gfc->gfc_dir);
+
+        /* allocated by realpath; free (NULL) is a no-op */
+        free (gfc->gfc_working_dir);
+}
+
+/**
+ * Library constructor: creates a glusterfs context, initialises the
+ * globals with it and attaches it to THIS. Nothing is attached when
+ * either step fails.
+ */
+void
+__attribute__ ((constructor)) gf_changelog_ctor (void)
+{
+        glusterfs_ctx_t *ctx = glusterfs_ctx_new ();
+
+        if (!ctx)
+                return;
+
+        if (glusterfs_globals_init (ctx) != 0) {
+                free (ctx);
+                return;
+        }
+
+        THIS->ctx = ctx;
+}
+
+/**
+ * Library destructor: tears down the registered consumer (history state
+ * first) found in THIS->private, then destroys the context's lock and
+ * frees the context.
+ */
+void
+__attribute__ ((destructor)) gf_changelog_dtor (void)
+{
+        xlator_t        *this = THIS;
+        glusterfs_ctx_t *ctx  = NULL;
+        gf_changelog_t  *gfc  = NULL;
+
+        if (!this)
+                return;
+
+        gfc = this->private;
+        ctx = this->ctx;
+
+        if (gfc) {
+                if (gfc->hist_gfc) {
+                        gf_changelog_cleanup(gfc->hist_gfc);
+                        GF_FREE (gfc->hist_gfc);
+                }
+                gf_changelog_cleanup (gfc);
+                GF_FREE (gfc);
+        }
+
+        if (!ctx)
+                return;
+
+        pthread_mutex_destroy (&ctx->lock);
+        free (ctx);
+}
+
+
+/**
+ * Create the .current, .processed and .processing directories below the
+ * working directory of @gfc, open the .processing directory stream and
+ * open (creating it if needed) the tracker file.
+ *
+ * Returns 0 on success, non-zero on failure. On failure nothing that has
+ * already been closed here stays referenced from @gfc: a later
+ * gf_changelog_cleanup() must not close the directory stream again.
+ */
+static int
+gf_changelog_open_dirs (gf_changelog_t *gfc)
+{
+        int  ret        = -1;
+        int  saved      = 0;
+        DIR *dir        = NULL;
+        int  tracker_fd = 0;
+        char tracker_path[PATH_MAX] = {0,};
+
+        (void) snprintf (gfc->gfc_current_dir, PATH_MAX,
+                         "%s/"GF_CHANGELOG_CURRENT_DIR"/",
+                         gfc->gfc_working_dir);
+        ret = mkdir_p (gfc->gfc_current_dir, 0600, _gf_false);
+        if (ret)
+                goto out;
+
+        (void) snprintf (gfc->gfc_processed_dir, PATH_MAX,
+                         "%s/"GF_CHANGELOG_PROCESSED_DIR"/",
+                         gfc->gfc_working_dir);
+        ret = mkdir_p (gfc->gfc_processed_dir, 0600, _gf_false);
+        if (ret)
+                goto out;
+
+        (void) snprintf (gfc->gfc_processing_dir, PATH_MAX,
+                         "%s/"GF_CHANGELOG_PROCESSING_DIR"/",
+                         gfc->gfc_working_dir);
+        ret = mkdir_p (gfc->gfc_processing_dir, 0600, _gf_false);
+        if (ret)
+                goto out;
+
+        dir = opendir (gfc->gfc_processing_dir);
+        if (!dir) {
+                gf_log ("", GF_LOG_ERROR,
+                        "opendir() error [reason: %s]", strerror (errno));
+                /* ret is still 0 from the last mkdir_p() */
+                ret = -1;
+                goto out;
+        }
+
+        gfc->gfc_dir = dir;
+
+        (void) snprintf (tracker_path, PATH_MAX,
+                         "%s/"GF_CHANGELOG_TRACKER, gfc->gfc_working_dir);
+
+        tracker_fd = open (tracker_path, O_CREAT | O_APPEND | O_RDWR,
+                           S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+        if (tracker_fd < 0) {
+                saved = errno; /* keep the reason of the open() failure */
+                closedir (gfc->gfc_dir);
+                gfc->gfc_dir = NULL; /* avoid a second closedir() later */
+                errno = saved;
+                ret = -1;
+                goto out;
+        }
+
+        gfc->gfc_fd = tracker_fd;
+        ret = 0;
+ out:
+        return ret;
+}
+
+/**
+ * Connect @gfc to the changelog notification socket of its brick,
+ * closing a previous connection first. Up to gfc_connretries attempts
+ * are made, two seconds apart.
+ *
+ * @this is ignored; the xlator stored in @gfc is used.
+ *
+ * Returns 0 on success with gfc_sockfd set to the connected socket.
+ * Returns -1 on failure; gfc_sockfd is then -1, so that a socket closed
+ * here is never closed a second time during cleanup.
+ */
+int
+gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc)
+{
+        int                ret    = 0;
+        int                len    = 0;
+        int                tries  = 0;
+        int                sockfd = 0;
+        struct sockaddr_un remote = {0,};
+
+        this = gfc->this;
+
+        if (gfc->gfc_sockfd != -1) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "Reconnecting...");
+                close (gfc->gfc_sockfd);
+                gfc->gfc_sockfd = -1;
+        }
+
+        sockfd = socket (AF_UNIX, SOCK_STREAM, 0);
+        if (sockfd < 0) {
+                ret = -1;
+                goto out;
+        }
+
+        CHANGELOG_MAKE_SOCKET_PATH (gfc->gfc_brickpath,
+                                    gfc->gfc_sockpath, UNIX_PATH_MAX);
+        gf_log (this->name, GF_LOG_INFO,
+                "connecting to changelog socket: %s (brick: %s)",
+                gfc->gfc_sockpath, gfc->gfc_brickpath);
+
+        remote.sun_family = AF_UNIX;
+        len = snprintf (remote.sun_path, sizeof (remote.sun_path),
+                        "%s", gfc->gfc_sockpath);
+        if ( (len < 0) || ((size_t) len >= sizeof (remote.sun_path)) ) {
+                /* the socket path does not fit in sun_path */
+                close (sockfd);
+                errno = ENAMETOOLONG;
+                ret = -1;
+                goto out;
+        }
+
+        len = strlen (remote.sun_path) + sizeof (remote.sun_family);
+
+        while (tries < gfc->gfc_connretries) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "connection attempt %d/%d...",
+                        tries + 1, gfc->gfc_connretries);
+
+                /* initiate a connect */
+                if (connect (sockfd, (struct sockaddr *) &remote, len) == 0) {
+                        gfc->gfc_sockfd = sockfd;
+                        break;
+                }
+
+                tries++;
+                sleep (2);
+        }
+
+        if (tries == gfc->gfc_connretries) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not connect to changelog socket!"
+                        " bailing out...");
+                close (sockfd);
+                ret = -1;
+        } else
+                gf_log (this->name, GF_LOG_INFO,
+                        "connection successful");
+
+ out:
+        return ret;
+}
+
+/**
+ * @API
+ * gf_changelog_done() - move the changelog @file to the 'processed'
+ * directory.
+ *
+ * @file has to resolve to a path inside the working directory. The
+ * working directory must match on a path component boundary, so that a
+ * sibling such as "<workdir>-old/x" is not accepted.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+gf_changelog_done (char *file)
+{
+        int             ret    = -1;
+        size_t          wlen   = 0;
+        char           *buffer = NULL;
+        xlator_t       *this   = NULL;
+        gf_changelog_t *gfc    = NULL;
+        char            to_path[PATH_MAX] = {0,};
+
+        errno = EINVAL;
+
+        this = THIS;
+        if (!this)
+                goto out;
+
+        gfc = (gf_changelog_t *) this->private;
+        if (!gfc)
+                goto out;
+
+        if (!file || !strlen (file))
+                goto out;
+
+        /* make sure 'file' is inside ->gfc_working_dir */
+        buffer = realpath (file, NULL);
+        if (!buffer)
+                goto out;
+
+        wlen = strlen (gfc->gfc_working_dir);
+        if (strncmp (gfc->gfc_working_dir, buffer, wlen)) {
+                errno = EINVAL;
+                goto out;
+        }
+
+        /* the prefix has to end where a path component ends */
+        if ( wlen && (gfc->gfc_working_dir[wlen - 1] != '/')
+             && (buffer[wlen] != '/') && (buffer[wlen] != '\0') ) {
+                errno = EINVAL;
+                goto out;
+        }
+
+        (void) snprintf (to_path, PATH_MAX, "%s%s",
+                         gfc->gfc_processed_dir, basename (buffer));
+        gf_log (this->name, GF_LOG_DEBUG,
+                "moving %s to processed directory", file);
+        ret = rename (buffer, to_path);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "cannot move %s to %s (reason: %s)",
+                        file, to_path, strerror (errno));
+                goto out;
+        }
+
+        ret = 0;
+
+ out:
+        if (buffer)
+                free (buffer); /* allocated by realpath() */
+        return ret;
+}
+
+/**
+ * @API
+ * for a set of changelogs, start from the beginning: the tracker file is
+ * truncated.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+gf_changelog_start_fresh ()
+{
+        xlator_t       *this = THIS;
+        gf_changelog_t *gfc  = NULL;
+
+        if (!this)
+                return -1;
+
+        errno = EINVAL;
+
+        gfc = (gf_changelog_t *) this->private;
+        if (!gfc || gf_ftruncate (gfc->gfc_fd, 0))
+                return -1;
+
+        return 0;
+}
+
+/**
+ * @API
+ * return the next changelog file entry. zero means all changelogs
+ * consumed.
+ *
+ * The next line of the tracker file is copied into @bufptr without its
+ * last byte (the line-feed) and NUL terminated. At most @maxlen bytes of
+ * @bufptr are used; @maxlen is capped at PATH_MAX, the size of the
+ * intermediate buffer.
+ *
+ * Returns the length of the line as read (line-feed included), 0 when
+ * there are no more entries and -1 on failure.
+ */
+ssize_t
+gf_changelog_next_change (char *bufptr, size_t maxlen)
+{
+        ssize_t         size       = 0;
+        int             tracker_fd = 0;
+        xlator_t       *this       = NULL;
+        gf_changelog_t *gfc        = NULL;
+        char            buffer[PATH_MAX] = {0,};
+
+        errno = EINVAL;
+
+        this = THIS;
+        if (!this)
+                goto out;
+
+        gfc = (gf_changelog_t *) this->private;
+        if (!gfc)
+                goto out;
+
+        if (!bufptr || maxlen == 0)
+                goto out;
+
+        /* never read more than the local buffer can hold */
+        if (maxlen > sizeof (buffer))
+                maxlen = sizeof (buffer);
+
+        tracker_fd = gfc->gfc_fd;
+
+        size = gf_readline (tracker_fd, buffer, maxlen);
+        if (size < 0)
+                goto out;
+        if (size == 0)
+                return 0;
+
+        /* size <= maxlen, so both writes stay inside the caller's buffer */
+        memcpy (bufptr, buffer, size - 1);
+        bufptr[size - 1] = '\0';
+
+        return size;
+
+ out:
+        return -1;
+}
+
+/**
+ * @API
+ * gf_changelog_scan() - scan and generate a list of change entries
+ *
+ * calling this api multiple times (without calling gf_changelog_done())
+ * would result in new changelog(s) being refreshed in the tracker file.
+ * This call also acts as a cancellation point for the consumer.
+ *
+ * The tracker file is truncated and one line per entry of the
+ * 'processing' directory is written to it.
+ *
+ * Returns the number of entries on success. Returns -1 on failure, with
+ * errno set to ECONNREFUSED when the processing thread has gone away; a
+ * failing readdir_r(), an over-long path or a failed write is also
+ * reported as failure.
+ */
+ssize_t
+gf_changelog_scan ()
+{
+        int             ret        = 0;
+        int             n          = 0;
+        int             failed     = 0;
+        int             tracker_fd = 0;
+        long            name_max   = 0;
+        size_t          len        = 0;
+        xlator_t       *this       = NULL;
+        size_t          nr_entries = 0;
+        gf_changelog_t *gfc        = NULL;
+        struct dirent  *entryp     = NULL;
+        struct dirent  *result     = NULL;
+        char            buffer[PATH_MAX] = {0,};
+
+        this = THIS;
+        if (!this)
+                goto out;
+
+        gfc = (gf_changelog_t *) this->private;
+        if (!gfc)
+                goto out;
+
+        /**
+         * do we need to protect 'byebye' with locks? worst, the
+         * consumer would get notified during next scan().
+         */
+        if (byebye) {
+                errno = ECONNREFUSED;
+                goto out;
+        }
+
+        errno = EINVAL;
+
+        tracker_fd = gfc->gfc_fd;
+
+        if (gf_ftruncate (tracker_fd, 0))
+                goto out;
+
+        /* pathconf() yields -1 when there is no limit or on error */
+        name_max = pathconf (gfc->gfc_processing_dir, _PC_NAME_MAX);
+        if (name_max < 0)
+                name_max = NAME_MAX;
+
+        len = offsetof(struct dirent, d_name) + name_max + 1;
+        entryp = GF_CALLOC (1, len,
+                            gf_changelog_mt_libgfchangelog_dirent_t);
+        if (!entryp)
+                goto out;
+
+        rewinddir (gfc->gfc_dir);
+        while (1) {
+                ret = readdir_r (gfc->gfc_dir, entryp, &result);
+                if (ret) {
+                        failed = 1;
+                        break;
+                }
+                if (!result) /* end of directory */
+                        break;
+
+                if ( !strcmp (basename (entryp->d_name), ".")
+                     || !strcmp (basename (entryp->d_name), "..") )
+                        continue;
+
+                nr_entries++;
+
+                n = snprintf (buffer, sizeof (buffer), "%s%s\n",
+                              gfc->gfc_processing_dir, entryp->d_name);
+                if ( (n < 0) || ((size_t) n >= sizeof (buffer)) ) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "changelog path too long: %s%s",
+                                gfc->gfc_processing_dir, entryp->d_name);
+                        failed = 1;
+                        break;
+                }
+
+                if (gf_changelog_write (tracker_fd, buffer, n) != (size_t) n) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "error writing changelog filename"
+                                " to tracker file");
+                        failed = 1;
+                        break;
+                }
+        }
+
+        GF_FREE (entryp);
+
+        if (!failed) {
+                if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1)
+                        return nr_entries;
+        }
+ out:
+        return -1;
+}
+
+/**
+ * @API
+ * gf_changelog_register() - register a client for updates.
+ *
+ * @brick_path:     brick whose changelogs are to be consumed
+ * @scratch_dir:    existing directory used as working directory
+ * @log_file:       log file passed to gf_log_init()
+ * @log_level:      log level, -1 selects GF_LOG_INFO
+ * @max_reconnects: connection attempts, values <= 0 mean one attempt
+ *
+ * Sets up the working directories (also those of the History API below
+ * <scratch_dir>/.history), connects to the changelog socket and starts
+ * the processing thread.
+ *
+ * Returns 0 on success. On failure -1 is returned, everything set up so
+ * far is released and errno holds the reason.
+ */
+int
+gf_changelog_register (char *brick_path, char *scratch_dir,
+                       char *log_file, int log_level, int max_reconnects)
+{
+        int             i    = 0;
+        int             ret  = -1;
+        int             errn = 0;
+        xlator_t       *this = NULL;
+        gf_changelog_t *gfc  = NULL;
+        char            hist_scratch_dir[PATH_MAX] = {0,};
+
+        this = THIS;
+        if (!this || !this->ctx)
+                goto out;
+
+        if (!brick_path || !scratch_dir) {
+                errno = EINVAL;
+                goto out;
+        }
+
+        errno = ENOMEM;
+
+        gfc = GF_CALLOC (1, sizeof (*gfc),
+                         gf_changelog_mt_libgfchangelog_t);
+        if (!gfc)
+                goto out;
+
+        gfc->this = this;
+
+        gfc->gfc_dir = NULL;
+        gfc->gfc_fd = gfc->gfc_sockfd = -1;
+
+        gfc->gfc_working_dir = realpath (scratch_dir, NULL);
+        if (!gfc->gfc_working_dir) {
+                errn = errno;
+                goto cleanup;
+        }
+
+        /* Begin: Changes for History API */
+        gfc->hist_gfc = NULL;
+
+        gfc->hist_gfc = GF_CALLOC (1, sizeof (*gfc),
+                                   gf_changelog_mt_libgfchangelog_t);
+        if (!gfc->hist_gfc) {
+                errn = ENOMEM;
+                goto cleanup;
+        }
+
+        gfc->hist_gfc->gfc_dir = NULL;
+        gfc->hist_gfc->gfc_fd = gfc->hist_gfc->gfc_sockfd = -1;
+        gfc->hist_gfc->this = NULL;
+
+        (void) snprintf (hist_scratch_dir, PATH_MAX,
+                         "%s/"GF_CHANGELOG_HISTORY_DIR"/",
+                         gfc->gfc_working_dir);
+
+        ret = mkdir_p (hist_scratch_dir, 0600, _gf_false);
+        if (ret) {
+                errn = errno;
+                goto cleanup;
+        }
+
+        gfc->hist_gfc->gfc_working_dir = realpath (hist_scratch_dir, NULL);
+        if (!gfc->hist_gfc->gfc_working_dir) {
+                errn = errno;
+                goto cleanup;
+        }
+
+        ret = gf_changelog_open_dirs (gfc->hist_gfc);
+        if (ret) {
+                errn = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not create entries in history scratch dir");
+                goto cleanup;
+        }
+
+        /* snprintf() always terminates, strncpy() may not */
+        (void) snprintf (gfc->hist_gfc->gfc_brickpath,
+                         sizeof (gfc->hist_gfc->gfc_brickpath),
+                         "%s", brick_path);
+
+        for (i=0; i < 256; i++) {
+                gfc->hist_gfc->rfc3986[i] =
+                        (isalnum(i) || i == '~' ||
+                         i == '-' || i == '.' || i == '_') ? i : 0;
+        }
+        /* End: Changes for History API*/
+
+        ret = gf_changelog_open_dirs (gfc);
+        if (ret) {
+                errn = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not create entries in scratch dir");
+                goto cleanup;
+        }
+
+        /* passing ident as NULL means to use default ident for syslog */
+        if (gf_log_init (this->ctx, log_file, NULL)) {
+                errn = errno;
+                goto cleanup;
+        }
+
+        gf_log_set_loglevel ((log_level == -1) ? GF_LOG_INFO :
+                             log_level);
+
+        gfc->gfc_connretries = (max_reconnects <= 0) ? 1 : max_reconnects;
+        (void) snprintf (gfc->gfc_brickpath, sizeof (gfc->gfc_brickpath),
+                         "%s", brick_path);
+
+        /* the processing thread encodes names with this table, so it has
+         * to be complete before the thread is started */
+        for (i=0; i < 256; i++) {
+                gfc->rfc3986[i] =
+                        (isalnum(i) || i == '~' ||
+                         i == '-' || i == '.' || i == '_') ? i : 0;
+        }
+
+        ret = gf_changelog_notification_init (this, gfc);
+        if (ret) {
+                errn = errno;
+                goto cleanup;
+        }
+
+        ret = gf_thread_create (&gfc->gfc_changelog_processor,
+                                NULL, gf_changelog_process, gfc);
+        if (ret) {
+                errn = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "error creating changelog processor thread"
+                        " new changes won't be recorded!!!");
+                goto cleanup;
+        }
+
+        ret = 0;
+        this->private = gfc;
+
+        goto out;
+
+ cleanup:
+        if (gfc->hist_gfc) {
+                gf_changelog_cleanup (gfc->hist_gfc);
+                GF_FREE (gfc->hist_gfc);
+        }
+        gf_changelog_cleanup (gfc);
+        GF_FREE (gfc);
+        this->private = NULL;
+        /* some callees above report failure with ret still 0 */
+        ret = -1;
+        errno = errn;
+
+ out:
+        return ret;
+}
diff --git a/xlators/features/changelog/lib/src/gf-history-changelog.c b/xlators/features/changelog/lib/src/gf-history-changelog.c
new file mode 100644
index 000000000..bfc4cd37d
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-history-changelog.c
@@ -0,0 +1,274 @@
+#include <errno.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <string.h>
+
+#include "globals.h"
+#include "glusterfs.h"
+#include "logging.h"
+
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+
+/**
+ * @API
+ * gf_history_changelog_done:
+ *    Move a processed history changelog file from the .processing
+ *    directory to the .processed directory.
+ *
+ * ARGUMENTS:
+ *    file(IN): path to the processed history changelog file; once
+ *              resolved by realpath() it must start with the history
+ *              working directory.
+ *
+ * RETURN VALUE:
+ *    0: On success.
+ *   -1: On error, with errno describing the failure.
+ */
+int
+gf_history_changelog_done (char *file)
+{
+        int             ret               = -1;
+        int             len               = 0;
+        int             errn              = 0;
+        char           *buffer            = NULL;
+        xlator_t       *this              = NULL;
+        gf_changelog_t *gfc               = NULL;
+        gf_changelog_t *hist_gfc          = NULL;
+        char            to_path[PATH_MAX] = {0,};
+
+        errno = EINVAL;
+
+        this = THIS;
+        if (!this)
+                goto out;
+
+        gfc = (gf_changelog_t *) this->private;
+        if (!gfc)
+                goto out;
+
+        hist_gfc = gfc->hist_gfc;
+        if (!hist_gfc)
+                goto out;
+
+        if (!file || !strlen (file))
+                goto out;
+
+        /* make sure 'file' is inside ->gfc_working_dir */
+        buffer = realpath (file, NULL);
+        if (!buffer)
+                goto out;
+
+        if (strncmp (hist_gfc->gfc_working_dir,
+                     buffer, strlen (hist_gfc->gfc_working_dir))) {
+                /* realpath() may have modified errno even though it
+                 * succeeded; report the rejected path explicitly */
+                errno = EINVAL;
+                goto out;
+        }
+
+        len = snprintf (to_path, sizeof (to_path), "%s%s",
+                        hist_gfc->gfc_processed_dir, basename (buffer));
+        if ((len < 0) || ((size_t) len >= sizeof (to_path))) {
+                /* never rename onto a silently truncated destination */
+                errno = ENAMETOOLONG;
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "moving %s to processed directory", file);
+        ret = rename (buffer, to_path);
+        if (ret) {
+                /* keep rename()'s error: logging must not hide it */
+                errn = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "cannot move %s to %s (reason: %s)",
+                        file, to_path, strerror (errn));
+                errno = errn;
+                goto out;
+        }
+
+ out:
+        errn = errno;
+        free (buffer); /* allocated by realpath(); free(NULL) is a no-op */
+        errno = errn;
+        return ret;
+}
+/**
+ * @API
+ * gf_history_changelog_start_fresh:
+ *    For a set of history changelogs, start again from the beginning
+ *    by truncating the history tracker fd to zero length.
+ *
+ * RETURN VALUES:
+ *    0: On success.
+ *   -1: On error.
+ */
+int
+gf_history_changelog_start_fresh ()
+{
+        xlator_t       *xl      = NULL;
+        gf_changelog_t *parent  = NULL;
+        gf_changelog_t *history = NULL;
+
+        xl = THIS;
+        if (!xl)
+                return -1; /* errno is deliberately left untouched here */
+
+        errno = EINVAL;
+
+        parent = (gf_changelog_t *) xl->private;
+        if (!parent)
+                return -1;
+
+        history = parent->hist_gfc;
+        if (!history)
+                return -1;
+
+        /* a non-zero result from gf_ftruncate() is reported as -1 */
+        return gf_ftruncate (history->gfc_fd, 0) ? -1 : 0;
+}
+
+
+/*
+ * @API
+ * gf_history_changelog_next_change:
+ *    Return the next history changelog file entry. Zero means all
+ *    history changelogs are consumed.
+ *
+ * ARGUMENTS:
+ *    bufptr(OUT): Path to unprocessed history changelog file
+ *                 from tracker file; NUL-terminated on success.
+ *    maxlen(IN): Usually PATH_MAX; larger values are clamped to the
+ *                size of the internal staging buffer (PATH_MAX).
+ *
+ * RETURN VALUES:
+ *    size: On success (value returned by gf_readline()).
+ *    -1  : On error.
+ */
+ssize_t
+gf_history_changelog_next_change (char *bufptr, size_t maxlen)
+{
+        ssize_t         size             = 0;
+        int             tracker_fd       = 0;
+        xlator_t       *this             = NULL;
+        gf_changelog_t *gfc              = NULL;
+        gf_changelog_t *hist_gfc         = NULL;
+        char            buffer[PATH_MAX] = {0,};
+
+        errno = EINVAL;
+
+        if (!bufptr)
+                goto out;
+
+        this = THIS;
+        if (!this)
+                goto out;
+
+        gfc = (gf_changelog_t *) this->private;
+        if (!gfc)
+                goto out;
+
+        hist_gfc = gfc->hist_gfc;
+        if (!hist_gfc)
+                goto out;
+
+        tracker_fd = hist_gfc->gfc_fd;
+
+        /* gf_readline() fills the local buffer: never let it be asked
+         * for more bytes than that buffer can hold */
+        if (maxlen > sizeof (buffer))
+                maxlen = sizeof (buffer);
+
+        size = gf_readline (tracker_fd, buffer, maxlen);
+        if (size < 0)
+                goto out;
+        if (size == 0)
+                return 0;
+
+        /* copy everything but the last byte read (presumably the line
+         * terminator) and NUL-terminate the caller's buffer; the
+         * terminator used to be written into the local buffer, leaving
+         * the caller's string unterminated */
+        memcpy (bufptr, buffer, size - 1);
+        bufptr[size - 1] = '\0';
+
+        return size;
+
+ out:
+        return -1;
+}
+
+/*
+ * @API
+ * gf_history_changelog_scan:
+ *    Scan and generate a list of change entries.
+ *    Calling this API multiple times (without calling
+ *    gf_history_changelog_done()) results in the changelog(s) being
+ *    refreshed in the tracker file.
+ *    This call also acts as a cancellation point for the consumer.
+ *
+ * RETURN VALUES:
+ *    nr_entries: On success.
+ *    -1        : On error.
+ */
+ssize_t
+gf_history_changelog_scan ()
+{
+        int             ret              = 0;
+        int             tracker_fd       = 0;
+        long            name_max         = 0;
+        size_t          len              = 0;
+        size_t          off              = 0;
+        size_t          dir_len          = 0;
+        xlator_t       *this             = NULL;
+        size_t          nr_entries       = 0;
+        gf_changelog_t *gfc              = NULL;
+        gf_changelog_t *hist_gfc         = NULL;
+        struct dirent  *entryp           = NULL;
+        struct dirent  *result           = NULL;
+        char            buffer[PATH_MAX] = {0,};
+
+        this = THIS;
+        if (!this)
+                goto out;
+
+        gfc = (gf_changelog_t *) this->private;
+        if (!gfc)
+                goto out;
+
+        hist_gfc = gfc->hist_gfc;
+        if (!hist_gfc)
+                goto out;
+
+        errno = EINVAL;
+
+        tracker_fd = hist_gfc->gfc_fd;
+
+        if (gf_ftruncate (tracker_fd, 0))
+                goto out;
+
+        /* pathconf() returns -1 on error or when there is no limit;
+         * using that value directly would undersize the dirent buffer
+         * handed to readdir_r() */
+        name_max = pathconf (hist_gfc->gfc_processing_dir, _PC_NAME_MAX);
+        if (name_max < 0)
+                name_max = PATH_MAX;
+
+        len = offsetof (struct dirent, d_name) + name_max + 1;
+        entryp = GF_CALLOC (1, len,
+                            gf_changelog_mt_libgfchangelog_dirent_t);
+        if (!entryp)
+                goto out;
+
+        dir_len = strlen (hist_gfc->gfc_processing_dir);
+
+        rewinddir (hist_gfc->gfc_dir);
+        while (1) {
+                ret = readdir_r (hist_gfc->gfc_dir, entryp, &result);
+                if (ret || !result)
+                        break;
+
+                if ( !strcmp (basename (entryp->d_name), ".")
+                     || !strcmp (basename (entryp->d_name), "..") )
+                        continue;
+
+                nr_entries++;
+
+                /* "<processing dir><name>\n" must fit the staging buffer */
+                if (dir_len + strlen (entryp->d_name) + 1 > sizeof (buffer)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "changelog path too long for tracker file");
+                        break;
+                }
+
+                GF_CHANGELOG_FILL_BUFFER (hist_gfc->gfc_processing_dir,
+                                          buffer, off, dir_len);
+                GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer,
+                                          off, strlen (entryp->d_name));
+                GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1);
+
+                if (gf_changelog_write (tracker_fd, buffer, off) != off) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "error writing changelog filename"
+                                " to tracker file");
+                        break;
+                }
+                off = 0;
+        }
+
+        GF_FREE (entryp);
+
+        /* success only when the directory was read to its end: a
+         * readdir_r() failure (ret != 0) must not be reported as a
+         * completed scan */
+        if (!ret && !result) {
+                if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1)
+                        return nr_entries;
+        }
+ out:
+        return -1;
+}
diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am
new file mode 100644
index 000000000..525ce97dc
--- /dev/null
+++ b/xlators/features/changelog/src/Makefile.am
@@ -0,0 +1,21 @@
+xlator_LTLIBRARIES = changelog.la
+# changelog.la is installed below the version-specific features translator directory
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+# headers listed here are distributed with the sources but never installed
+noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \
+ changelog-misc.h changelog-encoders.h changelog-notifier.h \
+ changelog-fops.h policy/changelog-policy.h
+# libtool: build a dlopen()-able module without a version suffix
+changelog_la_LDFLAGS = -module -avoid-version
+# translator sources, including the policy implementations kept under policy/
+changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \
+ changelog-encoders.c changelog-notifier.c changelog-default-fops.c \
+ policy/changelog-policy-default.c policy/changelog-policy-replication.c
+changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+# include paths for libglusterfs and policy/, plus 64-bit file offsets and GNU extensions
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -Ipolicy/ -fPIC -D_FILE_OFFSET_BITS=64 \
+ -D_GNU_SOURCE -D$(GF_HOST_OS) -shared -DDATADIR=\"$(localstatedir)\"
+# all warnings on top of the project-wide C flags
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+# nothing besides the default build products needs cleaning
+CLEANFILES =
diff --git a/xlators/features/changelog/src/changelog-default-fops.c b/xlators/features/changelog/src/changelog-default-fops.c
new file mode 100644
index 000000000..59749905e
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-default-fops.c
@@ -0,0 +1,561 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "changelog-encoders.h"
+
+/** FOPS */
+
+/* default rmdir: stage a record (fop number + parent-gfid/name entry) and attach it to frame->local; 0 on success */
+int32_t
+changelog_default_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ /* 2 == fop + entry; local stays NULL when initialisation did not produce one */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ /* next changelog_opt_t slot: the entry being removed (parent gfid + name) */
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+ /* publish the local on the frame only once it is completely filled */
+ frame->local = local;
+ ret = 0;
+ /* every failure path arrives here with ret != 0 and releases the partial local */
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default unlink: stage a record (fop number + parent-gfid/name entry) and attach it to frame->local; 0 on success */
+int32_t
+changelog_default_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ /* 2 == fop + entry; local stays NULL when initialisation did not produce one */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ /* next changelog_opt_t slot: the entry being unlinked (parent gfid + name) */
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+ /* publish the local on the frame only once it is completely filled */
+ frame->local = local;
+ ret = 0;
+ /* every failure path arrives here with ret != 0 and releases the partial local */
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default rename: stage a record (fop number + old entry + new entry) and attach it to frame->local; 0 on success */
+int32_t
+changelog_default_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ /* 3 == fop + oldloc + newloc */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->inode->gfid, 3);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ /* second slot: source entry (old parent gfid + old name) */
+ co++;
+ CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+ /* third slot: destination entry (new parent gfid + new name) */
+ co++;
+ CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 3);
+ /* publish the local on the frame only once it is completely filled */
+ frame->local = local;
+ ret = 0;
+ /* every failure path arrives here with ret != 0 and releases the partial local */
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * default link: stage a changelog record made of the fop number and
+ * the new entry (parent gfid + name of @newloc), keyed by the gfid
+ * found in @oldloc, and attach it to frame->local.
+ *
+ * Returns 0 on success and -1 on failure; the partially built local is
+ * released on failure.
+ */
+int32_t
+changelog_default_link (call_frame_t *frame,
+                        xlator_t *this, loc_t *oldloc,
+                        loc_t *newloc, dict_t *xdata)
+{
+        /* -1 like every other default fop: this used to start at 1,
+         * so failures were reported with a positive code */
+        int                ret      = -1;
+        size_t             xtra_len = 0;
+        changelog_opt_t   *co       = NULL;
+        changelog_local_t *local    = NULL;
+
+        /* 2 == fop + new entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->gfid, 2);
+        if (!local)
+                goto out;
+
+        co = changelog_get_usable_buffer (local);
+        if (!co)
+                goto out;
+
+        CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+        /* next slot: the newly created link (parent gfid + name) */
+        co++;
+        CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+                              entry_fn, entry_free_fn, xtra_len, out);
+
+        changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/* default mkdir: stage a record (fop number + parent-gfid/name entry) for the gfid requested in xdata; 0 on success */
+int32_t
+changelog_default_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ /* the gfid for the new object is read from the "gfid-req" key of xdata */
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+ /* ret is 0 after the successful lookup: re-arm the failure code for the steps below */
+ ret = -1;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ /* next changelog_opt_t slot: the entry being created (parent gfid + name) */
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+ /* every failure path arrives here with ret != 0 and releases the partial local */
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default symlink: stage a record (fop number + parent-gfid/name entry) for the gfid requested in xdata; 0 on success */
+int32_t
+changelog_default_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ /* the gfid for the new object is read from the "gfid-req" key of xdata */
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+ /* ret is 0 after the successful lookup: re-arm the failure code for the steps below */
+ ret = -1;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ /* next changelog_opt_t slot: the entry being created (parent gfid + name); linkname is not recorded */
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+ /* every failure path arrives here with ret != 0 and releases the partial local */
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * default mknod: stage a changelog record made of the fop number and
+ * the new entry (parent gfid + name), keyed by the gfid requested
+ * through the "gfid-req" key of @xdata, and attach it to frame->local.
+ *
+ * Returns 0 on success and non-zero on failure; the partially built
+ * local is released on failure.
+ */
+int32_t
+changelog_default_mknod (call_frame_t *frame,
+                         xlator_t *this, loc_t *loc,
+                         mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
+{
+        int                ret      = -1;
+        uuid_t             gfid     = {0,};
+        void              *uuid_req = NULL;
+        size_t             xtra_len = 0;
+        changelog_opt_t   *co       = NULL;
+        changelog_local_t *local    = NULL;
+
+        ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "failed to get gfid from dict");
+                goto out;
+        }
+        uuid_copy (gfid, uuid_req);
+
+        /* ret is 0 here: re-arm the failure code for the steps below */
+        ret = -1;
+        /* 2 == fop + entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+        if (!local)
+                goto out;
+
+        /* the buffer lives in the freshly initialised local;
+         * frame->local is only assigned at the end of this function */
+        co = changelog_get_usable_buffer (local);
+        if (!co)
+                goto out;
+
+        CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+        /* next slot: the entry being created (parent gfid + name) */
+        co++;
+        CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+                              entry_fn, entry_free_fn, xtra_len, out);
+
+        changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+        frame->local = local;
+        ret = 0;
+
+ out:
+        if (ret)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/* default create: stage a record (fop number + parent-gfid/name entry) for the gfid requested in xdata; 0 on success */
+int32_t
+changelog_default_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ changelog_opt_t *co = NULL;
+ size_t xtra_len = 0;
+ changelog_local_t *local = NULL;
+ /* the gfid for the new object is read from the "gfid-req" key of xdata */
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ /* init with two extra records (fop + entry); ret is re-armed because the lookup left it at 0 */
+ ret = -1;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ /* next changelog_opt_t slot: the entry being created (parent gfid + name) */
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+ /* every failure path arrives here with ret != 0 and releases the partial local */
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default fsetattr: runs CHANGELOG_INIT on frame->local with fd's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default setattr: runs CHANGELOG_INIT on frame->local with loc's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default fremovexattr: runs CHANGELOG_INIT on frame->local with fd's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default removexattr: runs CHANGELOG_INIT on frame->local with loc's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default setxattr: runs CHANGELOG_INIT on frame->local with loc's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default fsetxattr: runs CHANGELOG_INIT on frame->local with fd's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default truncate: runs CHANGELOG_INIT on frame->local with loc's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_truncate (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default ftruncate: runs CHANGELOG_INIT on frame->local with fd's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_ftruncate (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default writev: runs CHANGELOG_INIT on frame->local with fd's inode and gfid (0: presumably no extra records) */
+int32_t
+changelog_default_writev (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/** COPS */
+
+/**
+ * default open: opening a changelog is expressed as a roll-over event
+ * carrying @name as both the changelog name and its "oname", the
+ * current time (seconds) as roll key and @last as the finale flag.
+ * @cpriv is not used.
+ *
+ * Returns the result of changelog_inject_single_event(), or -1 when
+ * the time cannot be read or @name is unusable (errno is then EINVAL
+ * for a NULL name, ENAMETOOLONG for one that does not fit).
+ */
+int
+changelog_default_cops_open (xlator_t *this,
+                             changelog_priv_t *priv,
+                             void *cpriv, char *name, gf_boolean_t last)
+{
+        changelog_log_data_t       cld = {0,};
+        changelog_rollover_data_t *crd = NULL;
+        struct timeval             tv  = {0,};
+
+        crd = &cld.cld_roll;
+
+        if (!name) {
+                errno = EINVAL;
+                return -1;
+        }
+
+        /* the name is strcpy()'d into both fields below, so it must fit.
+         * NOTE(review): sizeof assumes both fields are char arrays
+         * embedded in the struct -- they are written right after the
+         * struct is zero-initialised, so they cannot be bare pointers;
+         * confirm against changelog_rollover_data_t */
+        if ((strlen (name) >= sizeof (crd->crd_changelog_name))
+            || (strlen (name) >= sizeof (crd->crd_changelog_oname))) {
+                errno = ENAMETOOLONG;
+                return -1;
+        }
+
+        cld.cld_type = CHANGELOG_TYPE_ROLLOVER;
+
+        if (gettimeofday (&tv, NULL))
+                return -1;
+
+        crd->crd_prealloc_size = 0; /* no preallocation */
+        crd->crd_finale = last;
+        crd->crd_use_suffix = _gf_true;
+        crd->crd_roll_key = (unsigned long) tv.tv_sec;
+
+        (void) strcpy (crd->crd_changelog_name, name);
+        (void) strcpy (crd->crd_changelog_oname, name);
+
+        /* inject a roll-over event */
+        return changelog_inject_single_event (this, priv, NULL, &cld);
+}
+
+int
+changelog_default_cops_rollover (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ char *name, gf_boolean_t last)
+{
+ return changelog_default_cops_open (this, priv, cpriv, name, last); /* roll-over is the same operation as open: arguments are forwarded unchanged */
+}
+
+int
+changelog_default_cops_sync (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv)
+{
+ changelog_log_data_t cld = {0,};
+ /* a sync is requested by injecting a single event of type FSYNC; cpriv is not used */
+ cld.cld_type = CHANGELOG_TYPE_FSYNC;
+ return changelog_inject_single_event (this, priv, NULL, &cld);
+}
+
+/**
+ * write to the changelog: delegates to @changelog_update(), which implements
+ * inode version checking and the rest; cpriv is unused and 0 is always returned.
+ */
+int
+changelog_default_cops_write (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local, changelog_log_type type)
+{
+ changelog_update (this, priv, local, type);
+ return 0;
+}
+
+/**
+ * default get_offset: @cpriv is read as a pointer to an off_t and the
+ * value stored there is returned; the other arguments are unused.
+ */
+off_t
+changelog_default_cops_get_offset (xlator_t *this,
+                                   changelog_priv_t *priv, void *cpriv,
+                                   changelog_local_t *local)
+{
+        off_t *cursor = cpriv;
+
+        return *cursor;
+}
+
+/**
+ * default set_offset: @cpriv is treated as a pointer to an off_t,
+ * which is advanced by @bytes; the other arguments are unused.
+ */
+void
+changelog_default_cops_set_offset (xlator_t *this,
+                                   changelog_priv_t *priv, void *cpriv,
+                                   changelog_local_t *local, off_t bytes)
+{
+        off_t *cursor = cpriv;
+
+        *cursor = *cursor + bytes;
+}
+
+/**
+ * default reset_offset: @cpriv is treated as a pointer to an off_t,
+ * which is set back to zero; the other arguments are unused.
+ */
+void
+changelog_default_cops_reset_offset (xlator_t *this, changelog_priv_t *priv,
+                                     void *cpriv, changelog_local_t *local)
+{
+        off_t *cursor = cpriv;
+
+        *cursor = 0;
+}
+
+/**
+ * roll-over takes care of close and open, so an explicit close always fails: errno = ENOTSUP, returns -1
+ */
+int
+changelog_default_cops_close (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv)
+{
+ errno = ENOTSUP;
+ return -1;
+}
+
+int
+changelog_default_cops_read (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv, char *buffer)
+{
+ errno = ENOTSUP; /* the default ops provide no read: buffer is never touched */
+ return -1;
+}
+
+/**
+ * no purging of changelogs: always fails with errno = ENOTSUP and returns -1
+ */
+int
+changelog_default_cops_unlink (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv, char *name)
+{
+ errno = ENOTSUP;
+ return -1;
+}
diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c
new file mode 100644
index 000000000..ecd598e4d
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-encoders.c
@@ -0,0 +1,182 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "changelog-encoders.h"
+
+/**
+ * Serialise one changelog entry (struct changelog_entry_fields) into
+ * @buffer as "<uuid>/<basename>".  With @encode set the uuid is written
+ * in the textual form produced by uuid_utoa(); otherwise its raw
+ * sizeof (uuid_t) bytes are copied.  Returns the number of bytes
+ * accounted for in @buffer.
+ */
+size_t
+entry_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+        struct changelog_entry_fields *fields   = data;
+        size_t                         written  = 0;
+        char                          *uuid_str = NULL;
+
+        if (!encode) {
+                CHANGELOG_FILL_BUFFER (buffer, written,
+                                       fields->cef_uuid, sizeof (uuid_t));
+        } else {
+                uuid_str = uuid_utoa (fields->cef_uuid);
+                CHANGELOG_FILL_BUFFER (buffer, written,
+                                       uuid_str, strlen (uuid_str));
+        }
+
+        CHANGELOG_FILL_BUFFER (buffer, written, "/", 1);
+        CHANGELOG_FILL_BUFFER (buffer, written,
+                               fields->cef_bname, strlen (fields->cef_bname));
+
+        return written;
+}
+
+/**
+ * Serialise a fop number (glusterfs_fop_t pointed to by @data) into
+ * @buffer: as decimal text when @encode is set, as its raw bytes
+ * otherwise.  Returns the number of bytes accounted for in @buffer.
+ */
+size_t
+fop_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+        /* room for any 32-bit value printed with "%d", sign and NUL
+         * included; the previous 10 bytes made snprintf() truncate
+         * larger values silently */
+        char            buf[sizeof ("-2147483648")] = {0,};
+        size_t          bufsz                       = 0;
+        glusterfs_fop_t fop                         = 0;
+
+        fop = *(glusterfs_fop_t *) data;
+
+        if (encode) {
+                (void) snprintf (buf, sizeof (buf), "%d", fop);
+                CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+        } else {
+                CHANGELOG_FILL_BUFFER (buffer, bufsz, &fop, sizeof (fop));
+        }
+
+        return bufsz;
+}
+
+/**
+ * Release the basename held by the entry record of a changelog_opt_t.
+ * A NULL @data is ignored.
+ */
+void
+entry_free_fn (void *data)
+{
+        changelog_opt_t *opt = data;
+
+        if (opt) {
+                GF_FREE (opt->co_entry.cef_bname);
+        }
+}
+
+/**
+ * try to write all data in one shot: append every extra record of @cwd to @buffer at *@off, NUL-separated, and advance *@off
+ */
+
+static inline void
+changelog_encode_write_xtra (changelog_write_data_t *cwd,
+ char *buffer, size_t *off, gf_boolean_t encode)
+{
+ int i = 0;
+ size_t offset = 0;
+ void *data = NULL;
+ changelog_opt_t *co = NULL;
+
+ offset = *off;
+ /* cwd_ptr is walked as cwd_xtra_records consecutive changelog_opt_t slots */
+ co = (changelog_opt_t *) cwd->cwd_ptr;
+
+ for (; i < cwd->cwd_xtra_records; i++, co++) {
+ if (i)
+ CHANGELOG_FILL_BUFFER (buffer, offset, "\0", 1);
+ /* NOTE(review): there is no default case, so an unknown co_type reuses the previous record's data (NULL on the first) */
+ switch (co->co_type) {
+ case CHANGELOG_OPT_REC_FOP:
+ data = &co->co_fop;
+ break;
+ case CHANGELOG_OPT_REC_ENTRY:
+ data = &co->co_entry;
+ break;
+ case CHANGELOG_OPT_REC_NAME:
+ data = co->co_entry.cef_bname;
+ break;
+ case CHANGELOG_OPT_REC_ULL:
+ data = &co->co_number;
+ break;
+ case CHANGELOG_OPT_REC_UUID:
+ data = &co->co_uuid;
+ break;
+ case CHANGELOG_OPT_REC_INT32:
+ data = &co->co_int32;
+ break;
+ case CHANGELOG_OPT_REC_UINT32:
+ data = &co->co_uint32;
+ break;
+ }
+ /* a converter writes at buffer + offset and returns the number of bytes it produced */
+ if (co->co_convert)
+ offset += co->co_convert (data,
+ buffer + offset, encode);
+ else /* no conversion: write it out as it is */
+ CHANGELOG_FILL_BUFFER (buffer, offset,
+ data, co->co_len);
+ }
+
+ *off = offset;
+}
+
+int
+changelog_encode_ascii (xlator_t *this,
+ changelog_local_t *local, changelog_log_data_t *cld)
+{
+ size_t off = 0;
+ size_t gfid_len = 0;
+ char *gfid_str = NULL;
+ char *buffer = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_write_data_t *cwd = NULL;
+ /* text encoding of one change: optional gfid header, then the extra records, handed to changelog_write_change() */
+ priv = this->private;
+ cwd = &cld->cld_wdata;
+ /* the gfid is written in its textual form */
+ gfid_str = uuid_utoa (cwd->cwd_gfid);
+ gfid_len = strlen (gfid_str);
+
+ /* extra bytes for decorations */
+ buffer = alloca (gfid_len + cwd->cwd_ptr_len + 100);
+ if (!priv->no_gfid_hdr)
+ CHANGELOG_STORE_ASCII (priv, buffer,
+ off, gfid_str, gfid_len, cld);
+ /* extra records are converted with encode == _gf_true and closed with a NUL byte */
+ if (cwd->cwd_xtra_records) {
+ changelog_encode_write_xtra (cwd, buffer, &off, _gf_true);
+ CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1);
+ }
+
+ return changelog_write_change (this, priv,
+ local, buffer, off);
+}
+
+int
+changelog_encode_binary (xlator_t *this,
+ changelog_local_t *local, changelog_log_data_t *cld)
+{
+ size_t off = 0;
+ char *buffer = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_write_data_t *cwd = NULL;
+ /* binary encoding of one change: optional raw-gfid header, then the extra records, handed to changelog_write_change() */
+ priv = this->private;
+ cwd = &cld->cld_wdata;
+
+ /* extra bytes for decorations */
+ buffer = alloca (sizeof (uuid_t) + cwd->cwd_ptr_len + 100);
+ if (!priv->no_gfid_hdr)
+ CHANGELOG_STORE_BINARY (priv, buffer, off, cwd->cwd_gfid, cld);
+ /* extra records are written with encode == _gf_false and closed with a NUL byte */
+ if (cwd->cwd_xtra_records) {
+ changelog_encode_write_xtra (cwd, buffer, &off, _gf_false);
+ CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1);
+ }
+
+ return changelog_write_change (this, priv,
+ local, buffer, off);
+}
diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h
new file mode 100644
index 000000000..2a96ba4dd
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-encoders.h
@@ -0,0 +1,48 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_ENCODERS_H
+#define _CHANGELOG_ENCODERS_H
+
+#include "xlator.h"
+#include "defaults.h"
+
+#include "changelog-helpers.h"
+
+/*
+ * Append the text header of a change to @buf at @off: one byte taken
+ * from priv->maps[] for the change type, the textual gfid (@gfid_len
+ * bytes) and a NUL separator.  @off is advanced by
+ * CHANGELOG_FILL_BUFFER.  The destination is the @buf argument; the
+ * macro no longer captures a caller variable named 'buffer'.
+ */
+#define CHANGELOG_STORE_ASCII(priv, buf, off, gfid, gfid_len, cld) do { \
+                CHANGELOG_FILL_BUFFER (buf, off,                        \
+                                       (priv)->maps[(cld)->cld_type], 1); \
+                CHANGELOG_FILL_BUFFER (buf,                             \
+                                       off, gfid, gfid_len);            \
+                CHANGELOG_FILL_BUFFER (buf, off, "\0", 1);              \
+        } while (0)
+
+/*
+ * Append the binary header of a change to @buf at @off: one byte taken
+ * from priv->maps[] for the change type, the raw gfid
+ * (sizeof (uuid_t) bytes) and a NUL separator.  @off is advanced by
+ * CHANGELOG_FILL_BUFFER.  The destination is the @buf argument; the
+ * macro no longer captures a caller variable named 'buffer'.
+ */
+#define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld) do {          \
+                CHANGELOG_FILL_BUFFER (buf, off,                        \
+                                       (priv)->maps[(cld)->cld_type], 1); \
+                CHANGELOG_FILL_BUFFER (buf,                             \
+                                       off, gfid, sizeof (uuid_t));     \
+                CHANGELOG_FILL_BUFFER (buf, off, "\0", 1);              \
+        } while (0)
+
+size_t /* converter for entry records: writes "<uuid>/<basename>", returns bytes produced */
+entry_fn (void *data, char *buffer, gf_boolean_t encode);
+size_t /* converter for fop records: decimal text when encode is set, raw bytes otherwise */
+fop_fn (void *data, char *buffer, gf_boolean_t encode);
+void /* releases the basename of an entry record; NULL is ignored */
+entry_free_fn (void *data);
+int /* encodes one change with a raw gfid header and writes it out */
+changelog_encode_binary (xlator_t *,
+ changelog_local_t *, changelog_log_data_t *);
+int /* encodes one change with a textual gfid header and writes it out */
+changelog_encode_ascii (xlator_t *,
+ changelog_local_t *, changelog_log_data_t *);
+
+#endif /* _CHANGELOG_ENCODERS_H */
diff --git a/xlators/features/changelog/src/changelog-fops.h b/xlators/features/changelog/src/changelog-fops.h
new file mode 100644
index 000000000..597327be3
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-fops.h
@@ -0,0 +1,157 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_FOPS_H
+#define _CHANGELOG_FOPS_H
+
+/* FOPS: default per-fop hooks, collected in changelog_default_fops below */
+
+int32_t
+changelog_default_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata);
+int32_t
+changelog_default_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata);
+int32_t
+changelog_default_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata);
+int32_t
+changelog_default_link (call_frame_t *frame,
+ xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata);
+int32_t
+changelog_default_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata);
+int32_t
+changelog_default_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata);
+int32_t
+changelog_default_mknod (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t dev, mode_t umask, dict_t *xdata);
+int32_t
+changelog_default_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata);
+int32_t
+changelog_default_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+int32_t
+changelog_default_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+int32_t
+changelog_default_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata);
+int32_t
+changelog_default_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata);
+int32_t
+changelog_default_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata);
+int32_t
+changelog_default_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+int32_t
+changelog_default_truncate (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata);
+int32_t
+changelog_default_ftruncate (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata);
+int32_t
+changelog_default_writev (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata);
+
+/* COPS: default changelog operations, collected in changelog_default_cops below */
+int
+changelog_default_cops_open (xlator_t *, changelog_priv_t *,
+ void *, char*, gf_boolean_t);
+
+int
+changelog_default_cops_close (xlator_t *, changelog_priv_t *, void *);
+
+int
+changelog_default_cops_sync (xlator_t *this,
+ changelog_priv_t *priv, void *);
+
+int
+changelog_default_cops_rollover (xlator_t *,
+ changelog_priv_t *, void *,
+ char *, gf_boolean_t);
+int
+changelog_default_cops_write (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, changelog_log_type);
+
+int
+changelog_default_cops_read (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+int
+changelog_default_cops_unlink (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+off_t
+changelog_default_cops_get_offset (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *);
+
+void
+changelog_default_cops_set_offset (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, off_t );
+
+void
+changelog_default_cops_reset_offset (xlator_t *, changelog_priv_t *,
+ void *, changelog_local_t *);
+
+
+GF_UNUSED static struct xlator_fops changelog_default_fops = { /* one changelog_default_* hook per listed fop */
+ .mknod = changelog_default_mknod,
+ .mkdir = changelog_default_mkdir,
+ .create = changelog_default_create,
+ .symlink = changelog_default_symlink,
+ .writev = changelog_default_writev,
+ .truncate = changelog_default_truncate,
+ .ftruncate = changelog_default_ftruncate,
+ .link = changelog_default_link,
+ .rename = changelog_default_rename,
+ .unlink = changelog_default_unlink,
+ .rmdir = changelog_default_rmdir,
+ .setattr = changelog_default_setattr,
+ .fsetattr = changelog_default_fsetattr,
+ .setxattr = changelog_default_setxattr,
+ .fsetxattr = changelog_default_fsetxattr,
+ .removexattr = changelog_default_removexattr,
+ .fremovexattr = changelog_default_fremovexattr,
+};
+
+GF_UNUSED static struct changelog_ops changelog_default_cops = { /* default changelog operations; read, close and unlink always fail with ENOTSUP */
+ .open = changelog_default_cops_open,
+ .sync = changelog_default_cops_sync,
+ .read = changelog_default_cops_read,
+ .close = changelog_default_cops_close,
+ .write = changelog_default_cops_write,
+ .unlink = changelog_default_cops_unlink,
+ .rollover = changelog_default_cops_rollover,
+ .get_offset = changelog_default_cops_get_offset,
+ .set_offset = changelog_default_cops_set_offset,
+ .reset_offset = changelog_default_cops_reset_offset,
+};
+
+#endif /* _CHANGELOG_FOPS_H */
diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c
new file mode 100644
index 000000000..ad4fe4013
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-helpers.c
@@ -0,0 +1,719 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+#include "iobuf.h"
+
+#include "changelog-helpers.h"
+#include "changelog-mem-types.h"
+
+#include <pthread.h>
+
+/**
+ * Cancel the thread @thr_id and wait for it to terminate.
+ *
+ * pthread_cancel() and pthread_join() report failure through their return
+ * value and do not set errno, so the returned code is what gets logged.
+ * Failures are only logged; nothing is returned to the caller.
+ */
+void
+changelog_thread_cleanup (xlator_t *this, pthread_t thr_id)
+{
+ int ret = 0;
+ void *retval = NULL;
+
+ /* send a cancel request to the thread */
+ ret = pthread_cancel (thr_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not cancel thread (reason: %s)",
+ strerror (ret));
+ goto out;
+ }
+
+ /* join succeeded but the thread did not end via cancellation => ret == 0 */
+ ret = pthread_join (thr_id, &retval);
+ if (ret || (retval != PTHREAD_CANCELED)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "cancel request not adhered as expected"
+ " (reason: %s)",
+ ret ? strerror (ret) : "thread exited without being cancelled");
+ }
+
+ out:
+ return;
+}
+
+inline void *
+changelog_get_usable_buffer (changelog_local_t *local)
+{
+ changelog_write_data_t *cwd = &local->cld.cld_wdata;
+
+ if (!cwd->cwd_iobuf)
+ return NULL;
+
+ return cwd->cwd_ptr;
+}
+
+inline void
+changelog_set_usable_record_and_length (changelog_local_t *local,
+ size_t len, int xr)
+{
+ changelog_write_data_t *cwd = &local->cld.cld_wdata;
+
+ cwd->cwd_ptr_len = len;
+ cwd->cwd_xtra_records = xr;
+}
+
+void
+changelog_local_cleanup (xlator_t *xl, changelog_local_t *local)
+{
+ int i = 0;
+ changelog_opt_t *co = NULL;
+ changelog_write_data_t *cwd = NULL;
+
+ if (!local)
+ return;
+
+ cwd = &local->cld.cld_wdata;
+
+ /* cleanup dynamic allocation for extra records */
+ if (cwd->cwd_xtra_records) {
+ co = (changelog_opt_t *) cwd->cwd_ptr;
+ for (; i < cwd->cwd_xtra_records; i++, co++)
+ if (co->co_free)
+ co->co_free (co);
+ }
+
+ CHANGELOG_IOBUF_UNREF (cwd->cwd_iobuf);
+
+ if (local->inode)
+ inode_unref (local->inode);
+
+ mem_put (local);
+}
+
+/**
+ * Write exactly @len bytes from @buffer to @fd, looping over short writes.
+ * A write() interrupted by a signal (EINTR) is retried instead of being
+ * treated as a failure.
+ *
+ * Returns 0 when all @len bytes were written, non-zero (1) otherwise.
+ */
+inline int
+changelog_write (int fd, char *buffer, size_t len)
+{
+ ssize_t size = 0;
+ size_t writen = 0;
+
+ while (writen < len) {
+ size = write (fd,
+ buffer + writen, len - writen);
+ /* interrupted before anything was written: just try again */
+ if ((size < 0) && (errno == EINTR))
+ continue;
+ if (size <= 0)
+ break;
+
+ writen += size;
+ }
+
+ return (writen != len);
+}
+
+static int
+changelog_rollover_changelog (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_rollover_data_t *crd)
+{
+ int ret = -1;
+ int notify = 0;
+ char *bname = NULL;
+ char ofile[PATH_MAX] = {0,};
+ char nfile[PATH_MAX] = {0,};
+
+ if (priv->changelog_fd != -1) {
+ close (priv->changelog_fd);
+ priv->changelog_fd = -1;
+ }
+
+ /**
+ * no rolling-over of changelogs, policy implementer choose
+ * to do the heavy-lifting of having distinct changelog name.
+ *
+ * NOTE: This implies libgfchangelog would not be notified
+ (well, we could, but lets not do that now...)
+ */
+ if (!crd->crd_use_suffix)
+ return 0;
+
+ (void) snprintf (ofile, PATH_MAX,
+ "%s/%s", priv->changelog_dir,
+ crd->crd_changelog_oname);
+ (void) snprintf (nfile, PATH_MAX, "%s/%s.%lu",
+ priv->changelog_dir,
+ crd->crd_changelog_name, crd->crd_roll_key);
+
+ ret = rename (ofile, nfile);
+ if (!ret)
+ notify = 1;
+
+ if (ret && (errno == ENOENT)) {
+ ret = 0;
+ }
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error renaming %s -> %s (reason %s)",
+ ofile, nfile, strerror (errno));
+ }
+
+ if (notify) {
+ bname = basename (nfile);
+ gf_log (this->name, GF_LOG_DEBUG, "notifying: %s", bname);
+ ret = changelog_write (priv->wfd, bname, strlen (bname) + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to send file name to notify thread"
+ " (reason: %s)", strerror (errno));
+ }
+ }
+
+ return ret;
+}
+
+int
+changelog_open (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local, changelog_rollover_data_t *crd)
+{
+ int fd = 0;
+ int ret = -1;
+ int flags = 0;
+ char buffer[1024] = {0,};
+ char changelog_path[PATH_MAX] = {0,};
+
+ (void) snprintf (changelog_path, PATH_MAX,
+ "%s/%s", priv->changelog_dir,
+ crd->crd_changelog_name);
+
+ flags |= (O_CREAT | O_RDWR);
+ if (priv->fsync_interval == 0)
+ flags |= O_SYNC;
+
+ fd = open (changelog_path, flags,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ if (fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unable to open/create changelog file %s"
+ " (reason: %s). change-logging will be"
+ " inactive", changelog_path, strerror (errno));
+ goto out;
+ }
+
+ priv->changelog_fd = fd;
+ CHANGELOG_INVOKE_CFOP (this, priv, reset_offset, local);
+
+ /* preallocate if required */
+ if (crd->crd_prealloc_size > 0) {
+ ret = posix_fallocate (priv->changelog_fd,
+ 0, crd->crd_prealloc_size);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to preallocate %llu bytes",
+ (unsigned long long) crd->crd_prealloc_size);
+ }
+ }
+
+ (void) snprintf (buffer, 1024, CHANGELOG_HEADER,
+ CHANGELOG_VERSION_MAJOR,
+ CHANGELOG_VERSION_MINOR,
+ priv->encode_mode);
+ ret = changelog_write_change (this, priv,
+ local, buffer, strlen (buffer));
+ if (ret) {
+ close (priv->changelog_fd);
+ priv->changelog_fd = -1;
+ goto out;
+ }
+
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+static int
+changelog_start_next_change (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_data_t *cld)
+{
+ int ret = 0;
+ changelog_rollover_data_t *crd = &cld->cld_roll;
+
+ ret = changelog_rollover_changelog (this, priv, crd);
+
+ if (!ret && !crd->crd_finale)
+ ret = changelog_open (this, priv, local, crd);
+ return ret;
+}
+
+/**
+ * return the length of entry
+ */
+inline size_t
+changelog_entry_length ()
+{
+ return sizeof (changelog_log_data_t);
+}
+
+int
+changelog_write_change (xlator_t *this, changelog_priv_t *priv,
+ changelog_local_t *local, char *buffer, size_t len)
+{
+ int ret = -1;
+ off_t offset = 0;
+ ssize_t size = 0;
+ size_t writen = 0;
+
+ offset = CHANGELOG_INVOKE_CFOP (this, priv, get_offset, local);
+
+ while (writen < len) {
+ size = pwrite (priv->changelog_fd,
+ buffer + writen, len - writen, offset + writen);
+ if (size <= 0)
+ break;
+
+ writen += size;
+ }
+
+ if (writen == len) {
+ ret = 0;
+ CHANGELOG_INVOKE_CFOP (this, priv, set_offset, local, writen);
+ }
+
+ return ret;
+}
+
+/**
+ * Handle a single changelog event according to cld->cld_type:
+ * - rollover events are handed to changelog_start_next_change();
+ * - fsync events call fsync() on priv->changelog_fd;
+ * - every other event is passed to the configured encoder
+ * (priv->ce->encode).
+ *
+ * Non-rollover events are silently dropped (return 0) when no changelog
+ * is open (priv->changelog_fd == -1). Otherwise the return value of the
+ * step that was executed is returned; failures are logged here.
+ */
+inline int
+changelog_handle_change (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local, changelog_log_data_t *cld)
+{
+ int ret = 0;
+
+ if (CHANGELOG_TYPE_IS_ROLLOVER (cld->cld_type)) {
+ ret = changelog_start_next_change (this, priv, local, cld);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Problem rolling over changelog(s)");
+ goto out;
+ }
+
+ /**
+ * case when there is reconfigure done (disabling changelog) and there
+ * are still fops that have updates in progress.
+ */
+ if (priv->changelog_fd == -1)
+ return 0;
+
+ if (CHANGELOG_TYPE_IS_FSYNC (cld->cld_type)) {
+ ret = fsync (priv->changelog_fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsync failed (reason: %s)",
+ strerror (errno));
+ }
+ goto out;
+ }
+
+ ret = priv->ce->encode (this, local, cld);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error writing changelog to disk");
+ }
+
+ out:
+ return ret;
+}
+
+static inline void
+changelog_local_init_defaults (changelog_local_t *local,
+ uuid_t gfid, struct iobuf *iobuf)
+{
+ changelog_write_data_t *cwd = &(local->cld.cld_wdata);
+
+ uuid_copy (cwd->cwd_gfid, gfid);
+ cwd->cwd_iobuf = iobuf;
+ cwd->cwd_xtra_records = 0; /* set by the caller */
+}
+
+/**
+ * Allocate and initialise a changelog_local_t from this->local_pool.
+ *
+ * @inode: inode the fop operates on; a reference is taken when non-NULL.
+ * May be NULL only when @update_flag is true.
+ * @gfid: copied into the write data of the new local.
+ * @xtra_records: number of optional records; when non-zero an iobuf of
+ * xtra_records * CHANGELOG_OPT_RECORD_LEN bytes is obtained.
+ * @update_flag: stored as local->update_no_check.
+ *
+ * Returns the new local, or NULL when @inode is missing while
+ * @update_flag is false, or when the iobuf/local allocation fails.
+ */
+changelog_local_t *
+changelog_local_init (xlator_t *this, inode_t *inode,
+ uuid_t gfid, int xtra_records,
+ gf_boolean_t update_flag)
+{
+ changelog_local_t *local = NULL;
+ struct iobuf *iobuf = NULL;
+
+ /**
+ * Relax the presence of inode if @update_flag is true.
+ * The caller (implementation of the fop) needs to be careful to
+ * not blindly use local->inode.
+ */
+ if (!update_flag && !inode) {
+ gf_log_callingfn (this->name, GF_LOG_WARNING,
+ "inode needed for version checking !!!");
+ goto out;
+ }
+
+ if (xtra_records) {
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool,
+ xtra_records * CHANGELOG_OPT_RECORD_LEN);
+ if (!iobuf)
+ goto out;
+ }
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ /* do not leak the iobuf obtained above */
+ CHANGELOG_IOBUF_UNREF (iobuf);
+ goto out;
+ }
+
+ local->update_no_check = update_flag;
+
+ (void) changelog_local_init_defaults (local, gfid, iobuf);
+
+ if (inode)
+ local->inode = inode_ref (inode);
+
+ out:
+ return local;
+}
+
+int
+changelog_forget (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_addr = 0;
+ changelog_inode_ctx_t *ctx = NULL;
+
+ inode_ctx_del (inode, this, &ctx_addr);
+ if (!ctx_addr)
+ return 0;
+
+ ctx = (changelog_inode_ctx_t *) (long) ctx_addr;
+ GF_FREE (ctx);
+
+ return 0;
+}
+
+int
+changelog_inject_single_event (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_data_t *cld)
+{
+ return priv->cd.dispatchfn (this, priv,
+ priv->cd.cd_data, local, cld);
+}
+
+/**
+ * TODO: these threads have many things in common (wake up after
+ * a certain time etc..). move them into a separate routine.
+ */
+void *
+changelog_rollover (void *data)
+{
+ int ret = 0;
+ char *cname = NULL;
+ xlator_t *this = NULL;
+ struct timeval tv = {0,};
+ changelog_time_slice_t *slice = NULL;
+ changelog_priv_t *priv = data;
+
+ this = priv->cr.this;
+ slice = &priv->slice;
+
+ while (1) {
+ tv.tv_sec = priv->rollover_time;
+ tv.tv_usec = 0;
+
+ ret = select (0, NULL, NULL, NULL, &tv);
+ if (ret)
+ continue;
+
+ LOCK (&priv->lock);
+ {
+ cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp);
+ ret = CHANGELOG_INVOKE_CFOP (this, priv, rollover,
+ cname, _gf_false);
+ if (!ret)
+ SLICE_VERSION_UPDATE (slice);
+ }
+ UNLOCK (&priv->lock);
+ }
+
+ return NULL;
+}
+
+void *
+changelog_fsync_thread (void *data)
+{
+ int ret = 0;
+ xlator_t *this = NULL;
+ struct timeval tv = {0,};
+ changelog_priv_t *priv = data;
+
+ this = priv->cf.this;
+
+ while (1) {
+ tv.tv_sec = priv->fsync_interval;
+ tv.tv_usec = 0;
+
+ ret = select (0, NULL, NULL, NULL, &tv);
+ if (ret)
+ continue;
+
+ ret = CHANGELOG_INVOKE_CFOP (this, priv, sync);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to inject fsync event");
+ }
+
+ return NULL;
+}
+
+/* macros for inode/changelog version checks */
+
+#define INODE_VERSION_UPDATE(priv, inode, iver, slice, type) do { \
+ LOCK (&inode->lock); \
+ { \
+ LOCK (&priv->lock); \
+ { \
+ *iver = slice->changelog_version[type]; \
+ } \
+ UNLOCK (&priv->lock); \
+ } \
+ UNLOCK (&inode->lock); \
+ } while (0)
+
+#define INODE_VERSION_EQUALS_SLICE(priv, ver, slice, type, upd) do { \
+ LOCK (&priv->lock); \
+ { \
+ upd = (ver == slice->changelog_version[type]) \
+ ? _gf_false : _gf_true; \
+ } \
+ UNLOCK (&priv->lock); \
+ } while (0)
+
+static int
+__changelog_inode_ctx_set (xlator_t *this,
+ inode_t *inode, changelog_inode_ctx_t *ctx)
+{
+ uint64_t ctx_addr = (uint64_t) ctx;
+ return __inode_ctx_set (inode, this, &ctx_addr);
+}
+
+/**
+ * one shot routine to get the address and the value of a inode version
+ * for a particular type.
+ */
+static changelog_inode_ctx_t *
+__changelog_inode_ctx_get (xlator_t *this,
+ inode_t *inode, unsigned long **iver,
+ unsigned long *version, changelog_log_type type)
+{
+ int ret = 0;
+ uint64_t ctx_addr = 0;
+ changelog_inode_ctx_t *ctx = NULL;
+
+ ret = __inode_ctx_get (inode, this, &ctx_addr);
+ if (ret < 0)
+ ctx_addr = 0;
+ if (ctx_addr != 0) {
+ ctx = (changelog_inode_ctx_t *) (long)ctx_addr;
+ goto out;
+ }
+
+ ctx = GF_CALLOC (1, sizeof (*ctx), gf_changelog_mt_inode_ctx_t);
+ if (!ctx)
+ goto out;
+
+ ret = __changelog_inode_ctx_set (this, inode, ctx);
+ if (ret) {
+ GF_FREE (ctx);
+ ctx = NULL;
+ }
+
+ out:
+ if (ctx && iver && version) {
+ *iver = CHANGELOG_INODE_VERSION_TYPE (ctx, type);
+ *version = **iver;
+ }
+
+ return ctx;
+}
+
+static changelog_inode_ctx_t *
+changelog_inode_ctx_get (xlator_t *this,
+ inode_t *inode, unsigned long **iver,
+ unsigned long *version, changelog_log_type type)
+{
+ changelog_inode_ctx_t *ctx = NULL;
+
+ LOCK (&inode->lock);
+ {
+ ctx = __changelog_inode_ctx_get (this,
+ inode, iver, version, type);
+ }
+ UNLOCK (&inode->lock);
+
+ return ctx;
+}
+
+/**
+ * This is the main update routine. Locking has been made granular so as to
+ * maximize parallelism of fops - I'll try to explain it below using execution
+ * timelines.
+ *
+ * Basically, the contention is between multiple execution threads of this
+ * routine and the roll-over thread. So, instead of having a big lock, we hold
+ * granular locks: inode->lock and priv->lock. Now I'll explain what happens
+ * when there is an update and a roll-over at just about the same time.
+ * NOTE:
+ * - the dispatcher itself synchronizes updates via its own lock
+ * - the slice version is incremented by the roll-over thread
+ *
+ * Case 1: When the rollover thread wins before the inode version can be
+ * compared with the slice version.
+ *
+ * [updater] | [rollover]
+ * |
+ * | <SLICE: 1, 1, 1>
+ * <changelog_update> |
+ * <changelog_inode_ctx_get> |
+ * <CTX: 1, 1, 1> |
+ * | <dispatch-rollover-event>
+ * | LOCK (&priv->lock)
+ * | <SLICE_VERSION_UPDATE>
+ * | <SLICE: 2, 2, 2>
+ * | UNLOCK (&priv->lock)
+ * |
+ * LOCK (&priv->lock) |
+ * <INODE_VERSION_EQUALS_SLICE> |
+ * I: 1 <-> S: 2 |
+ * update: true |
+ * UNLOCK (&priv->lock) |
+ * |
+ * <if update == true> |
+ * <dispatch-update-event> |
+ * <INODE_VERSION_UPDATE> |
+ * LOCK (&inode->lock) |
+ * LOCK (&priv->lock) |
+ * <CTX: 2, 1, 1> |
+ * UNLOCK (&priv->lock) |
+ * UNLOCK (&inode->lock) |
+ *
+ * Therefore, the change gets recorded in the next change (no lost change). If
+ * the slice version was ahead of the inode version (say I:1, S: 2), then
+ * anyway the comparison would result in a update (I: 1, S: 3).
+ *
+ * If the rollover time is too short, then there is another contention when the
+ * updater tries to bring up inode version to the slice version (this is also
+ * the case when the roll-over thread wakes up during INODE_VERSION_UPDATE).
+ *
+ * <CTX: 1, 1, 1> | <SLICE: 2, 2, 2>
+ * |
+ * |
+ * <dispatch-update-event> |
+ * <INODE_VERSION_UPDATE> |
+ * LOCK (&inode->lock) |
+ * LOCK (&priv->lock) |
+ * <CTX: 2, 1, 1> |
+ * UNLOCK (&priv->lock) |
+ * UNLOCK (&inode->lock) |
+ * | <dispatch-rollover-event>
+ * | LOCK (&priv->lock)
+ * | <SLICE_VERSION_UPDATE>
+ * | <SLICE: 3, 3, 3>
+ * | UNLOCK (&priv->lock)
+ *
+ *
+ * Case 2: When the fop thread wins
+ *
+ * [updater] | [rollover]
+ * |
+ * | <SLICE: 1, 1, 1>
+ * <changelog_update> |
+ * <changelog_inode_ctx_get> |
+ * <CTX: 0, 0, 0> |
+ * |
+ * LOCK (&priv->lock) |
+ * <INODE_VERSION_EQUALS_SLICE> |
+ * I: 0 <-> S: 1 |
+ * update: true |
+ * UNLOCK (&priv->lock) |
+ * | <dispatch-rollover-event>
+ * | LOCK (&priv->lock)
+ * | <SLICE_VERSION_UPDATE>
+ * | <SLICE: 2, 2, 2>
+ * | UNLOCK (&priv->lock)
+ * <if update == true> |
+ * <dispatch-update-event> |
+ * <INODE_VERSION_UPDATE> |
+ * LOCK (&inode->lock) |
+ * LOCK (&priv->lock) |
+ * <CTX: 2, 0, 0> |
+ * UNLOCK (&priv->lock) |
+ * UNLOCK (&inode->lock) |
+ *
+ * Here again, if the inode version was equal to the slice version (I: 1, S: 1)
+ * then there is no need to record an update (as the equality of the two version
+ * signifies an update was recorded in the current time slice).
+ */
+inline void
+changelog_update (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_type type)
+{
+ int ret = 0;
+ unsigned long *iver = NULL;
+ unsigned long version = 0;
+ inode_t *inode = NULL;
+ changelog_time_slice_t *slice = NULL;
+ changelog_inode_ctx_t *ctx = NULL;
+ changelog_log_data_t *cld_0 = NULL;
+ gf_boolean_t need_upd = _gf_true;
+
+ slice = &priv->slice;
+
+ /**
+ * for fops that do not require inode version checking
+ */
+ if (local->update_no_check)
+ goto update;
+
+ inode = local->inode;
+
+ ctx = changelog_inode_ctx_get (this,
+ inode, &iver, &version, type);
+ if (!ctx)
+ goto update;
+
+ INODE_VERSION_EQUALS_SLICE (priv, version, slice, type, need_upd);
+
+ update:
+ if (need_upd) {
+ cld_0 = &local->cld;
+ cld_0->cld_type = type;
+
+ ret = priv->cd.dispatchfn (this, priv,
+ priv->cd.cd_data, local, cld_0);
+
+ /**
+ * update after the dispatcher has successfully done
+ * it's job.
+ */
+ if (!local->update_no_check && iver && !ret)
+ INODE_VERSION_UPDATE (priv, inode, iver, slice, type);
+ }
+
+ return;
+}
diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h
new file mode 100644
index 000000000..e4e2dfc96
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-helpers.h
@@ -0,0 +1,578 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_HELPERS_H
+#define _CHANGELOG_HELPERS_H
+
+#include "locking.h"
+#include "timer.h"
+#include "pthread.h"
+#include "iobuf.h"
+
+#include "changelog-misc.h"
+
+/**
+ * structures representing the changelog entries
+ */
+typedef struct changelog_write_data {
+ /**
+ * since gfid is _always_ a necessity, it's not a part
+ * of the iobuf. by doing this we do not add any overhead
+ * for data and metadata related fops.
+ */
+ uuid_t cwd_gfid;
+
+ /**
+ * iobufs are used for optional records: pargfid, path,
+ * write offsets etc.. It's the fop implementer's job
+ * to allocate (iobuf_get() in the fop) and get unref'ed
+ * in the callback (CHANGELOG_STACK_UNWIND).
+ */
+ struct iobuf *cwd_iobuf;
+
+ /**
+ * after allocation you can point this to the length of
+ * usable data, but make sure it does not exceed
+ * the size of the requested iobuf.
+ */
+ size_t cwd_iobuf_len;
+ /* shorthands: cwd_ptr is the iobuf's data pointer, cwd_ptr_len its usable length */
+ #define cwd_ptr cwd_iobuf->ptr
+ #define cwd_ptr_len cwd_iobuf_len
+
+ /**
+ * number of optional records
+ */
+ int cwd_xtra_records;
+} changelog_write_data_t;
+
+typedef struct changelog_rollover_data {
+ /**
+ * need a changelog reopen?
+ */
+ gf_boolean_t crd_finale;
+
+ /**
+ * changelog file name to be opened after a rollover
+ */
+ char crd_changelog_name[PATH_MAX];
+
+ /**
+ * changelog file name before rollover
+ */
+ char crd_changelog_oname[PATH_MAX];
+
+ /**
+ * use @crd_roll_key as suffix during roll-over
+ */
+ gf_boolean_t crd_use_suffix;
+
+ /**
+ * suffix used when rolling a changelog
+ */
+ unsigned long crd_roll_key;
+
+ /**
+ * preallocation? if yes, how much?
+ */
+ off_t crd_prealloc_size;
+} changelog_rollover_data_t;
+
+/**
+ * the changelog entry: structure representing the type of entry
+ * and a union encapsulating the above declared structures.
+ */
+typedef struct changelog_log_data {
+ /**
+ * type of the log data entry
+ */
+ changelog_log_type cld_type;
+
+ /**
+ * union for the type of changelog operations. @fsync() does
+ * not have a corresponding entry in this union as it just
+ * performs and @fsync() on ->changelog_fd.
+ */
+ union {
+ changelog_write_data_t cld_wdata;
+ changelog_rollover_data_t cld_roll;
+ };
+} changelog_log_data_t;
+
+typedef struct changelog_local changelog_local_t;
+
+/**
+ * holder for dispatch function and private data
+ */
+
+typedef struct changelog_priv changelog_priv_t;
+
+typedef struct changelog_dispatcher {
+ void *cd_data;
+ int (*dispatchfn) (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, changelog_log_data_t *);
+} changelog_dispatcher_t;
+
+struct changelog_bootstrap {
+ changelog_mode_t mode;
+ int (*ctor) (xlator_t *, changelog_dispatcher_t *, gf_boolean_t);
+ int (*dtor) (xlator_t *, changelog_dispatcher_t *);
+};
+
+struct changelog_encoder {
+ changelog_encoder_t encoder;
+ int (*encode) (xlator_t *,
+ changelog_local_t *, changelog_log_data_t *);
+};
+
+struct changelog_ops {
+ /* changelog open */
+ int (*open) (xlator_t *, changelog_priv_t *,
+ void *, char *, gf_boolean_t);
+
+ /* changelog close */
+ int (*close) (xlator_t *, changelog_priv_t *, void *);
+
+ /* changelog rollover */
+ int (*rollover) (xlator_t *,
+ changelog_priv_t *,
+ void *, char *, gf_boolean_t);
+
+ int (*sync) (xlator_t *, changelog_priv_t *, void *);
+
+ /* changelog write */
+ int (*write) (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, changelog_log_type);
+
+ /* changelog read */
+ int (*read) (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+ int (*unlink) (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+ /* {get|set} offset */
+ off_t (*get_offset) (xlator_t *this,
+ changelog_priv_t *, void *, changelog_local_t *);
+
+ void (*set_offset) (xlator_t *this,
+ changelog_priv_t *, void *,
+ changelog_local_t *, off_t);
+
+ void (*reset_offset) (xlator_t *this, changelog_priv_t *,
+ void *, changelog_local_t *);
+};
+
+/**
+ * This structure is _filled_ by the policy init (@init_policy) routine.
+ * Default @fops and @cops are passed to the init routine, which can
+ * choose to override the file operation or changelog operation behaviour.
+ * Just by _replacing_ the function pointers, a policy can change it's
+ * file and changelog operation behaviour. Kind of inheritance...
+ */
+struct changelog_logpolicy {
+ /* current changelog name */
+ char changelog_name[PATH_MAX];
+
+ /* private data */
+ void *cpriv;
+
+ /* file ops for the policy */
+ struct xlator_fops *fops;
+
+ /* changelog operations for the policy */
+ struct changelog_ops *cops;
+
+ /* current active policy */
+ changelog_log_policy_t policy;
+
+ int (*init_policy) (xlator_t *,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *);
+ int (*fini_policy) (xlator_t *, struct changelog_logpolicy *);
+};
+
+#define CHANGELOG_FNAME_FROM_POLICY(c) c->changelog_name
+
+#define CHANGELOG_INVOKE_FOP(priv,fop,...) priv->cp->fops->fop (__VA_ARGS__)
+
+#define CHANGELOG_INVOKE_CFOP(this,priv,fop,...) \
+ priv->cp->cops->fop (this, priv, priv->cp->cpriv, ##__VA_ARGS__)
+
+/* xlator private */
+
+typedef struct changelog_time_slice {
+ /**
+ * just in case we need nanosecond granularity some day.
+ * field is unused as of now (maybe we'd need it later).
+ */
+ struct timeval tv_start;
+
+ /**
+ * version of changelog file, incremented each time changes
+ * rollover.
+ */
+ unsigned long changelog_version[CHANGELOG_MAX_TYPE];
+} changelog_time_slice_t;
+
+typedef struct changelog_rollover {
+ /* rollover thread */
+ pthread_t rollover_th;
+
+ xlator_t *this;
+} changelog_rollover_t;
+
+typedef struct changelog_fsync {
+ /* fsync() thread */
+ pthread_t fsync_th;
+
+ xlator_t *this;
+} changelog_fsync_t;
+
+# define CHANGELOG_MAX_CLIENTS 5
+typedef struct changelog_notify {
+ /* reader end of the pipe */
+ int rfd;
+
+ /* notifier thread */
+ pthread_t notify_th;
+
+ /* unique socket path */
+ char sockpath[UNIX_PATH_MAX];
+
+ int socket_fd;
+
+ /**
+ * simple array of accept()'ed fds. Not scalable at all
+ * for a large number of clients, but it's okay as we have
+ * a hard limit in this version (@CHANGELOG_MAX_CLIENTS).
+ */
+ int client_fd[CHANGELOG_MAX_CLIENTS];
+
+ xlator_t *this;
+} changelog_notify_t;
+
+struct changelog_priv {
+ gf_boolean_t active;
+
+ /**
+ * write the record header?
+ */
+ gf_boolean_t no_gfid_hdr;
+
+ gf_boolean_t lockless_update;
+
+ /* to generate unique socket file per brick */
+ char *changelog_brick;
+
+ /* logging directory */
+ char *changelog_dir;
+
+ /* one file for all changelog types */
+ int changelog_fd;
+
+ gf_lock_t lock;
+
+ /* writen end of the pipe */
+ int wfd;
+
+ /* rollover time */
+ int32_t rollover_time;
+
+ /* fsync() interval */
+ int32_t fsync_interval;
+
+ /* changelog type maps */
+ const char *maps[CHANGELOG_MAX_TYPE];
+
+ /* time slicer */
+ changelog_time_slice_t slice;
+
+ /* context of the updater */
+ changelog_dispatcher_t cd;
+
+ /* context of the rollover thread */
+ changelog_rollover_t cr;
+
+ /* context of fsync thread */
+ changelog_fsync_t cf;
+
+ /* context of the notifier thread */
+ changelog_notify_t cn;
+
+ /* operation mode */
+ changelog_mode_t op_mode;
+
+ /* bootstrap routine for 'current' logger */
+ struct changelog_bootstrap *cb;
+
+ /* encoder mode */
+ changelog_encoder_t encode_mode;
+
+ /* encoder */
+ struct changelog_encoder *ce;
+
+ /* logging policy */
+ changelog_log_policy_t policy;
+
+ /* policy logger */
+ struct changelog_logpolicy *cp;
+
+ /* current NSR term */
+ uint32_t term;
+};
+
+struct changelog_local {
+ inode_t *inode;
+
+ /**
+ * fops that do not need inode version checks
+ */
+ gf_boolean_t update_no_check;
+
+ /**
+ * the log data entry
+ */
+ changelog_log_data_t cld;
+
+ /**
+ * number of bytes written: used for continuation
+ */
+ off_t nr_bytes;
+
+ /**
+ * temporary scratch pads
+ */
+ union {
+ void *ptr;
+ unsigned long val;
+ } lu;
+};
+
+/* inode version is stored in inode ctx */
+typedef struct changelog_inode_ctx {
+ unsigned long iversion[CHANGELOG_MAX_TYPE];
+} changelog_inode_ctx_t;
+
+#define CHANGELOG_INODE_VERSION_TYPE(ctx, type) &(ctx->iversion[type])
+
+/**
+ * Optional Records:
+ * fops that need to save additional information request a array of
+ * @changelog_opt_t struct. The array is allocated via @iobufs.
+ */
+typedef enum {
+ CHANGELOG_OPT_REC_FOP,
+ CHANGELOG_OPT_REC_ULL,
+ CHANGELOG_OPT_REC_UUID,
+ CHANGELOG_OPT_REC_NAME,
+ CHANGELOG_OPT_REC_ENTRY,
+ CHANGELOG_OPT_REC_INT32,
+ CHANGELOG_OPT_REC_UINT32,
+} changelog_optional_rec_type_t;
+
+struct changelog_entry_fields {
+ uuid_t cef_uuid;
+ char *cef_bname;
+};
+
+typedef struct {
+ /**
+ * @co_convert can be used to do post-processing of the record before
+ * it's persisted to the CHANGELOG. If this is NULL, then the record
+ * is persisted as per its in-memory format.
+ */
+ size_t (*co_convert) (void *data, char *buffer, gf_boolean_t encode);
+
+ /* release routines */
+ void (*co_free) (void *data);
+
+ /* type of the field */
+ changelog_optional_rec_type_t co_type;
+
+ /**
+ * sizeof of the 'valid' field in the union. This field is not used if
+ * @co_convert is specified.
+ */
+ size_t co_len;
+
+ /* payload of the record; which member is valid is given by @co_type */
+ union {
+ uuid_t co_uuid;
+ glusterfs_fop_t co_fop;
+ int co_int32;
+ unsigned int co_uint32;
+ unsigned long long co_number;
+ struct changelog_entry_fields co_entry;
+ };
+} changelog_opt_t;
+
+#define CHANGELOG_OPT_RECORD_LEN sizeof (changelog_opt_t)
+
+/**
+ * helpers routines
+ */
+
+void
+changelog_thread_cleanup (xlator_t *this, pthread_t thr_id);
+
+void *
+changelog_get_usable_buffer (changelog_local_t *local);
+
+void
+changelog_set_usable_record_and_length (changelog_local_t *local,
+ size_t len, int xr);
+void
+changelog_local_cleanup (xlator_t *xl, changelog_local_t *local);
+changelog_local_t *
+changelog_local_init (xlator_t *this, inode_t *inode, uuid_t gfid,
+ int xtra_records, gf_boolean_t update_flag);
+int
+changelog_inject_single_event (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_data_t *cld);
+size_t
+changelog_entry_length ();
+int
+changelog_write (int fd, char *buffer, size_t len);
+int
+changelog_write_change (xlator_t *this, changelog_priv_t *priv,
+ changelog_local_t *local, char *buffer, size_t len);
+inline int
+changelog_handle_change (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local, changelog_log_data_t *cld);
+inline void
+changelog_update (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_type type);
+void *
+changelog_rollover (void *data);
+void *
+changelog_fsync_thread (void *data);
+int
+changelog_forget (xlator_t *this, inode_t *inode);
+
+/* macros */
+
+#define CHANGELOG_STACK_UNWIND(fop, frame, params ...) do { \
+ changelog_local_t *__local = NULL; \
+ xlator_t *__xl = NULL; \
+ if (frame) { \
+ __local = frame->local; \
+ __xl = frame->this; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT (fop, frame, params); \
+ changelog_local_cleanup (__xl, __local); \
+ } while (0)
+
+#define CHANGELOG_IOBUF_REF(iobuf) do { \
+ if (iobuf) \
+ iobuf_ref (iobuf); \
+ } while (0)
+
+#define CHANGELOG_IOBUF_UNREF(iobuf) do { \
+ if (iobuf) \
+ iobuf_unref (iobuf); \
+ } while (0)
+
+#define CHANGELOG_FILL_BUFFER(buffer, off, val, len) do { \
+ memcpy (buffer + off, val, len); \
+ off += len; \
+ } while (0)
+
+#define SLICE_VERSION_UPDATE(slice) do { \
+ int i = 0; \
+ for (; i < CHANGELOG_MAX_TYPE; i++) { \
+ slice->changelog_version[i]++; \
+ } \
+ } while (0)
+
+#define CHANGELOG_FILL_INT32(co, number, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_INT32; \
+ co->co_int32 = number; \
+ xlen += sizeof (int); \
+ } while (0)
+
+#define CHANGELOG_FILL_UINT32(co, number, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_UINT32; \
+ co->co_uint32 = number; \
+ xlen += sizeof (unsigned int); \
+ } while (0)
+
+#define CHANGELOG_FILL_FOP_NUMBER(co, fop, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_FOP; \
+ co->co_fop = fop; \
+ xlen += sizeof (fop); \
+ } while (0)
+
+/**
+ * Fill the optional record @co as a NAME record: no converter, @freefn as
+ * the release routine and a gf_strdup()'ed copy of @name as the payload.
+ * Jumps to @label when the copy cannot be allocated. On success co_len is
+ * set to strlen (name) and the same amount is added to @xlen.
+ */
+#define CHANGELOG_FILL_NAME(co, name, freefn, xlen, label) \
+ do { \
+ co->co_convert = NULL; \
+ co->co_free = freefn; \
+ co->co_type = CHANGELOG_OPT_REC_NAME; \
+ co->co_entry.cef_bname = gf_strdup(name); \
+ if (!co->co_entry.cef_bname) \
+ goto label; \
+ co->co_len = strlen (name); \
+ xlen += co->co_len; \
+ } while (0)
+
+#define CHANGELOG_FILL_ENTRY(co, pargfid, bname, \
+ converter, freefn, xlen, label) \
+ do { \
+ co->co_convert = converter; \
+ co->co_free = freefn; \
+ co->co_type = CHANGELOG_OPT_REC_ENTRY; \
+ uuid_copy (co->co_entry.cef_uuid, pargfid); \
+ co->co_entry.cef_bname = gf_strdup(bname); \
+ if (!co->co_entry.cef_bname) \
+ goto label; \
+ xlen += (UUID_CANONICAL_FORM_LEN + strlen (bname)); \
+ } while (0)
+
+#define CHANGELOG_INIT(this, local, inode, gfid, xrec) \
+ local = changelog_local_init (this, inode, gfid, xrec, _gf_false)
+
+#define CHANGELOG_INIT_NOCHECK(this, local, inode, gfid, xrec) \
+ local = changelog_local_init (this, inode, gfid, xrec, _gf_true)
+
+#define CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, label) do { \
+ if (!priv->active) \
+ goto label; \
+ /* ignore rebalance process's activity. */ \
+ if (frame->root->pid == GF_CLIENT_PID_DEFRAG) \
+ goto label; \
+ } while (0)
+
+/* ignore internal fops */
+#define CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(dict, label) do { \
+ if (dict && dict_get (dict, GLUSTERFS_INTERNAL_FOP_KEY)) \
+ goto label; \
+ } while (0)
+
+#define CHANGELOG_COND_GOTO(priv, cond, label) do { \
+ if (!priv->active || cond) \
+ goto label; \
+ } while (0)
+
+int
+changelog_open (xlator_t *this, changelog_priv_t *priv, changelog_local_t *local, changelog_rollover_data_t *crd);
+
+#endif /* _CHANGELOG_HELPERS_H */
diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h
new file mode 100644
index 000000000..a65bbb4f2
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-mem-types.h
@@ -0,0 +1,30 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_MEM_TYPES_H
+#define _CHANGELOG_MEM_TYPES_H
+
+#include "mem-types.h"
+
+enum gf_changelog_mem_types {
+ gf_changelog_mt_priv_t = gf_common_mt_end + 1,
+ gf_changelog_mt_str_t = gf_common_mt_end + 2,
+ gf_changelog_mt_batch_t = gf_common_mt_end + 3,
+ gf_changelog_mt_rt_t = gf_common_mt_end + 4,
+ gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5,
+ gf_changelog_mt_fop_policy_t = gf_common_mt_end + 6,
+ gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 7,
+ gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 8,
+ gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 9,
+ gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 10,
+ gf_changelog_mt_end
+};
+
+#endif
diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h
new file mode 100644
index 000000000..58bd3279d
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-misc.h
@@ -0,0 +1,107 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_MISC_H
+#define _CHANGELOG_MISC_H
+
+#include "glusterfs.h"
+#include "common-utils.h"
+
+#define CHANGELOG_MAX_TYPE 3
+#define CHANGELOG_FILE_NAME "CHANGELOG"
+
+#define CHANGELOG_VERSION_MAJOR 1
+#define CHANGELOG_VERSION_MINOR 0
+
+#define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY"/changelog-%s.sock"
+
+/**
+ * the header starts with the version and the encoding of the changelog.
+ * 'version' is not of much use for now.
+ */
+#define CHANGELOG_HEADER \
+ "GlusterFS Changelog | version: v%d.%d | encoding : %d\n"
+
+#define CHANGELOG_MAKE_SOCKET_PATH(brick_path, sockpath, len) do {      \
+                char md5_sum[MD5_DIGEST_LENGTH*2+1] = {0,}; /* digest string + NUL */ \
+                md5_wrapper((unsigned char *) (brick_path),             \
+                            strlen((brick_path)),                       \
+                            md5_sum);                                   \
+                (void) snprintf ((sockpath), (len), /* fills the %s of the template */ \
+                                 CHANGELOG_UNIX_SOCK, md5_sum);         \
+        } while (0)
+
+/**
+ * ... used by libgfchangelog.
+ */
+#define CHANGELOG_GET_ENCODING(fd, buffer, len, enc, enc_len) do {      \
+                FILE *fp;                                               \
+                int fd_dup, maj, min;                                   \
+                                                                        \
+                enc = -1; /* stays -1 unless the header is parsed */    \
+                fd_dup = dup (fd); /* fclose() below must not close @fd */ \
+                                                                        \
+                if (fd_dup != -1) {                                     \
+                        fp = fdopen (fd_dup, "r");                      \
+                        if (fp) {                                       \
+                                if (fgets (buffer, len, fp)) {          \
+                                        enc_len = strlen (buffer); /* was 'elen': not a parameter */ \
+                                        sscanf (buffer,                 \
+                                                CHANGELOG_HEADER,       \
+                                                &maj, &min, &enc);      \
+                                }                                       \
+                                fclose (fp);                            \
+                        } else {                                        \
+                                close (fd_dup);                         \
+                        }                                               \
+                }                                                       \
+        } while (0)
+
+/**
+ * everything after @CHANGELOG_TYPE_ENTRY are internal types
+ * (ie. none of the fops trigger this type of event), hence
+ * CHANGELOG_MAX_TYPE = 3
+ */
+typedef enum {
+ CHANGELOG_TYPE_DATA = 0,
+ CHANGELOG_TYPE_METADATA,
+ CHANGELOG_TYPE_ENTRY,
+ CHANGELOG_TYPE_ROLLOVER,
+ CHANGELOG_TYPE_FSYNC,
+} changelog_log_type;
+
+/* operation modes - RT for now */
+typedef enum {
+ CHANGELOG_MODE_RT = 0,
+} changelog_mode_t;
+
+/* encoder types */
+
+typedef enum {
+ CHANGELOG_ENCODE_MIN = 0,
+ CHANGELOG_ENCODE_BINARY,
+ CHANGELOG_ENCODE_ASCII,
+ CHANGELOG_ENCODE_MAX,
+} changelog_encoder_t;
+
+/* logging policies */
+typedef enum {
+ CHANGELOG_LOG_POLICY_DEFAULT = 0,
+ CHANGELOG_LOG_POLICY_REPLICATE,
+} changelog_log_policy_t;
+
+#define CHANGELOG_VALID_ENCODING(enc) /* strictly between MIN and MAX */ \
+        ((enc) > CHANGELOG_ENCODE_MIN && (enc) < CHANGELOG_ENCODE_MAX)
+
+#define CHANGELOG_TYPE_IS_ENTRY(type) ((type) == CHANGELOG_TYPE_ENTRY)
+#define CHANGELOG_TYPE_IS_ROLLOVER(type) ((type) == CHANGELOG_TYPE_ROLLOVER)
+#define CHANGELOG_TYPE_IS_FSYNC(type) ((type) == CHANGELOG_TYPE_FSYNC)
+
+#endif /* _CHANGELOG_MISC_H */
diff --git a/xlators/features/changelog/src/changelog-notifier.c b/xlators/features/changelog/src/changelog-notifier.c
new file mode 100644
index 000000000..5f3d063a8
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-notifier.c
@@ -0,0 +1,314 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-notifier.h"
+
+#include <pthread.h>
+
+inline static void
+changelog_notify_clear_fd (changelog_notify_t *cn, int i)
+{
+ cn->client_fd[i] = -1;
+}
+
+inline static void
+changelog_notify_save_fd (changelog_notify_t *cn, int i, int fd)
+{
+ cn->client_fd[i] = fd;
+}
+
+static int
+changelog_notify_insert_fd (xlator_t *this, changelog_notify_t *cn, int fd)
+{
+ int i = 0;
+ int ret = 0;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] == -1)
+ break;
+ }
+
+ if (i == CHANGELOG_MAX_CLIENTS) {
+ /**
+ * this case should not be hit as listen() would limit
+ * the number of completely established connections.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "hit max client limit (%d)", CHANGELOG_MAX_CLIENTS);
+ ret = -1;
+ }
+ else
+ changelog_notify_save_fd (cn, i, fd);
+
+ return ret;
+}
+
+static void
+changelog_notify_fill_rset (changelog_notify_t *cn, fd_set *rset, int *maxfd)
+{
+ int i = 0;
+
+ FD_ZERO (rset);
+
+ FD_SET (cn->socket_fd, rset);
+ *maxfd = cn->socket_fd;
+
+ FD_SET (cn->rfd, rset);
+ *maxfd = max (*maxfd, cn->rfd);
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] != -1) {
+ FD_SET (cn->client_fd[i], rset);
+ *maxfd = max (*maxfd, cn->client_fd[i]);
+ }
+ }
+
+ *maxfd = *maxfd + 1;
+}
+
+static int
+changelog_notify_client (changelog_notify_t *cn, char *path, ssize_t len)
+{
+ int i = 0;
+ int ret = 0;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] == -1)
+ continue;
+
+ if (changelog_write (cn->client_fd[i],
+ path, len)) {
+ ret = -1;
+
+ close (cn->client_fd[i]);
+ changelog_notify_clear_fd (cn, i);
+ }
+ }
+
+ return ret;
+}
+
+static void
+changelog_notifier_init (changelog_notify_t *cn)
+{
+ int i = 0;
+
+ cn->socket_fd = -1;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ changelog_notify_clear_fd (cn, i);
+ }
+}
+
+static void
+changelog_close_client_conn (changelog_notify_t *cn)
+{
+ int i = 0;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] == -1)
+ continue;
+
+ close (cn->client_fd[i]);
+ changelog_notify_clear_fd (cn, i);
+ }
+}
+
+/* cleanup handler pushed by changelog_notifier(); @arg is its changelog_notify_t */
+static void
+changelog_notifier_cleanup (void *arg)
+{
+        changelog_notify_t *cn = (changelog_notify_t *) arg;
+
+        /* close every connected client first, then the listening socket */
+        changelog_close_client_conn (cn);
+
+        if (cn->socket_fd != -1)
+                close (cn->socket_fd);
+
+        if (cn->rfd) /* NOTE(review): truthiness, not "!= -1" - confirm rfd init */
+                close (cn->rfd);
+
+        if (unlink (cn->sockpath))
+                gf_log ("", GF_LOG_WARNING,
+                        "could not unlink changelog socket file"
+                        " %s (reason: %s)", cn->sockpath, strerror (errno));
+}
+
+/* notifier thread: accepts clients on a unix domain socket and forwards
+ * every changelog name read from cn->rfd to all of them. @data is the
+ * translator's changelog_priv_t; the thread's exit value is always NULL. */
+void *
+changelog_notifier (void *data)
+{
+        int                 i                 = 0;
+        int                 fd                = 0;
+        int                 max_fd            = 0;
+        int                 len               = 0;
+        ssize_t             readlen           = 0;
+        xlator_t           *this              = NULL;
+        changelog_priv_t   *priv              = NULL;
+        changelog_notify_t *cn                = NULL;
+        struct sockaddr_un  local             = {0,};
+        char                path[PATH_MAX]    = {0,};
+        char                abspath[PATH_MAX] = {0,};
+        char                buffer;
+        fd_set              rset;
+
+        priv = (changelog_priv_t *) data;
+        cn = &priv->cn;
+        this = cn->this;
+
+        pthread_cleanup_push (changelog_notifier_cleanup, cn);
+        changelog_notifier_init (cn);
+
+        cn->socket_fd = socket (AF_UNIX, SOCK_STREAM, 0);
+        if (cn->socket_fd < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "changelog socket error (reason: %s)",
+                        strerror (errno));
+                goto out; /* NOTE(review): skips pthread_cleanup_pop */
+        }
+
+        CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick,
+                                    cn->sockpath, UNIX_PATH_MAX);
+        if (unlink (cn->sockpath) < 0) {
+                if (errno != ENOENT) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Could not unlink changelog socket file (%s)"
+                                " (reason: %s)",
+                                cn->sockpath, strerror (errno));
+                        goto cleanup;
+                }
+        }
+
+        local.sun_family = AF_UNIX;
+        strcpy (local.sun_path, cn->sockpath);
+
+        len = strlen (local.sun_path) + sizeof (local.sun_family);
+
+        /* bind to the unix domain socket */
+        if (bind (cn->socket_fd, (struct sockaddr *) &local, len) < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not bind to changelog socket (reason: %s)",
+                        strerror (errno));
+                goto cleanup;
+        }
+
+        /* listen for incoming connections */
+        if (listen (cn->socket_fd, CHANGELOG_MAX_CLIENTS) < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "listen() error on changelog socket (reason: %s)",
+                        strerror (errno));
+                goto cleanup;
+        }
+
+        /**
+         * simple select() on all to-be-read file descriptors. This method,
+         * though old school, works pretty well when you have a handful of
+         * fd's to be watched (clients).
+         *
+         * Future TODO: move this to an epoll based notification facility if
+         * the number of clients increases.
+         */
+        for (;;) {
+                changelog_notify_fill_rset (cn, &rset, &max_fd);
+
+                if (select (max_fd, &rset, NULL, NULL, NULL) < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "select() returned -1 (reason: %s)",
+                                strerror (errno));
+                        sleep (2);
+                        continue;
+                }
+
+                if (FD_ISSET (cn->socket_fd, &rset)) {
+                        fd = accept (cn->socket_fd, NULL, NULL);
+                        if (fd < 0) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "accept error on changelog socket"
+                                        " (reason: %s)", strerror (errno));
+                        } else if (changelog_notify_insert_fd (this, cn, fd)) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "hit max client limit");
+                                close (fd); /* no slot holds it: do not leak */
+                        }
+                }
+
+                if (FD_ISSET (cn->rfd, &rset)) {
+                        /**
+                         * read changelog filename and notify all connected
+                         * clients.
+                         */
+                        readlen = 0;
+                        while (readlen < PATH_MAX) {
+                                len = read (cn->rfd, &path[readlen++], 1);
+                                if (len == -1) {
+                                        break;
+                                }
+
+                                if (len == 0) {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "rollover thread sent EOF"
+                                                " on pipe - possibly a crash.");
+                                        /* be blunt and close all connections */
+                                        pthread_exit(NULL);
+                                }
+
+                                if (path[readlen - 1] == '\0')
+                                        break;
+                        }
+
+                        /* should we close all client connections here too? */
+                        if (len < 0 || readlen == PATH_MAX) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Could not get pathname from rollover"
+                                        " thread or pathname too long");
+                                goto process_rest;
+                        }
+
+                        (void) snprintf (abspath, PATH_MAX,
+                                         "%s/%s", priv->changelog_dir, path);
+                        if (changelog_notify_client (cn, abspath,
+                                                     strlen (abspath) + 1))
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "could not notify some clients with new"
+                                        " changelogs");
+                }
+
+        process_rest:
+                for (i = 0; i < CHANGELOG_MAX_CLIENTS; i++) {
+                        if ( (fd = cn->client_fd[i]) == -1 )
+                                continue;
+
+                        if (FD_ISSET (fd, &rset)) {
+                                /**
+                                 * the only data we accept from the client is a
+                                 * disconnect. Anything else is treated as bogus
+                                 * and is silently discarded (also warned!!!).
+                                 */
+                                if ( (readlen = read (fd, &buffer, 1)) <= 0 ) {
+                                        close (fd);
+                                        changelog_notify_clear_fd (cn, i);
+                                } else {
+                                        /* silently discard data and log */
+                                        gf_log (this->name, GF_LOG_WARNING,
+                                                "misbehaving changelog client");
+                                }
+                        }
+                }
+        }
+
+ cleanup:;
+        pthread_cleanup_pop (1);
+
+ out:
+        return NULL;
+}
diff --git a/xlators/features/changelog/src/changelog-notifier.h b/xlators/features/changelog/src/changelog-notifier.h
new file mode 100644
index 000000000..55e728356
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-notifier.h
@@ -0,0 +1,19 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_NOTIFIER_H
+#define _CHANGELOG_NOTIFIER_H
+
+#include "changelog-helpers.h"
+
+void *
+changelog_notifier (void *data);
+
+#endif
diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c
new file mode 100644
index 000000000..91d47e059
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rt.c
@@ -0,0 +1,83 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "changelog-rt.h"
+#include "changelog-mem-types.h"
+
+int
+changelog_rt_init (xlator_t *this,
+ changelog_dispatcher_t *cd, gf_boolean_t lockless_update)
+{
+ changelog_rt_t *crt = NULL;
+
+ crt = GF_CALLOC (1, sizeof (*crt),
+ gf_changelog_mt_rt_t);
+ if (!crt)
+ return -1;
+
+ /* TBD: don't init (and destroy) if lock-less update */
+ LOCK_INIT (&crt->lock);
+
+ cd->cd_data = crt;
+ cd->dispatchfn = lockless_update ?
+ &changelog_rt_enqueue_lockless : &changelog_rt_enqueue;
+
+ return 0;
+}
+
+int
+changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd)
+{
+ changelog_rt_t *crt = NULL;
+
+ crt = cd->cd_data;
+
+ LOCK_DESTROY (&crt->lock);
+ GF_FREE (crt);
+
+ return 0;
+}
+
+int
+changelog_rt_enqueue (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local, changelog_log_data_t *cld_0)
+{
+ int ret = 0;
+ changelog_rt_t *crt = NULL;
+
+ crt = (changelog_rt_t *) cbatch;
+
+ LOCK (&crt->lock);
+ {
+ ret = changelog_handle_change (this, priv, local, cld_0);
+ }
+ UNLOCK (&crt->lock);
+
+ return ret;
+}
+
+int
+changelog_rt_enqueue_lockless (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local,
+ changelog_log_data_t *cld_0)
+{
+ return changelog_handle_change (this, priv, local, cld_0);
+}
diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h
new file mode 100644
index 000000000..634b7473b
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rt.h
@@ -0,0 +1,40 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_RT_H
+#define _CHANGELOG_RT_H
+
+#include "locking.h"
+#include "timer.h"
+#include "pthread.h"
+
+#include "changelog-helpers.h"
+
+/* unused as of now - maybe you would need it later */
+typedef struct changelog_rt {
+ gf_lock_t lock;
+} changelog_rt_t;
+
+int
+changelog_rt_init (xlator_t *this,
+ changelog_dispatcher_t *cd, gf_boolean_t lockless_update);
+int
+changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd);
+int
+changelog_rt_enqueue (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local, changelog_log_data_t *cld_0);
+int
+changelog_rt_enqueue_lockless (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local,
+ changelog_log_data_t *cld_0);
+
+#endif /* _CHANGELOG_RT_H */
diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c
new file mode 100644
index 000000000..6d4b502de
--- /dev/null
+++ b/xlators/features/changelog/src/changelog.c
@@ -0,0 +1,1389 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+#include "iobuf.h"
+#include <pthread.h>
+
+#include "changelog-rt.h"
+#include "changelog-notifier.h"
+#include "changelog-encoders.h"
+#include "changelog-mem-types.h"
+
+#include "changelog-fops.h"
+#include "changelog-policy.h"
+
+static struct changelog_bootstrap
+cb_bootstrap[] = {
+ {
+ .mode = CHANGELOG_MODE_RT,
+ .ctor = changelog_rt_init,
+ .dtor = changelog_rt_fini,
+ },
+};
+
+static struct changelog_encoder
+cb_encoder[] = {
+ [CHANGELOG_ENCODE_BINARY] =
+ {
+ .encoder = CHANGELOG_ENCODE_BINARY,
+ .encode = changelog_encode_binary,
+ },
+ [CHANGELOG_ENCODE_ASCII] =
+ {
+ .encoder = CHANGELOG_ENCODE_ASCII,
+ .encode = changelog_encode_ascii,
+ },
+};
+
+static struct changelog_logpolicy
+cb_policy[] = {
+ [CHANGELOG_LOG_POLICY_DEFAULT] =
+ {
+ .fops = NULL,
+ .cops = NULL,
+ .policy = CHANGELOG_LOG_POLICY_DEFAULT,
+ .init_policy = changelog_default_policy_init,
+ .fini_policy = changelog_default_policy_fini,
+ },
+ [CHANGELOG_LOG_POLICY_REPLICATE] =
+ {
+ .fops = NULL,
+ .cops = NULL,
+ .policy = CHANGELOG_LOG_POLICY_REPLICATE,
+ .init_policy = changelog_replication_policy_init,
+ .fini_policy = changelog_replication_policy_fini,
+ },
+};
+
+/* Entry operations - TYPE III */
+
+/* {{{ */
+
+/* rmdir */
+
+int32_t
+changelog_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (rmdir, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, rmdir, frame, this, loc, xflags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_rmdir_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->rmdir,
+ loc, xflags, xdata);
+ return 0;
+}
+
+/* unlink */
+
+int32_t
+changelog_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (unlink, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+ CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, unlink, frame, this, loc, xflags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_unlink_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink,
+ loc, xflags, xdata);
+ return 0;
+}
+
+/* rename */
+
+int32_t
+changelog_rename_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *buf, struct iatt *preoldparent,
+ struct iatt *postoldparent, struct iatt *prenewparent,
+ struct iatt *postnewparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (rename, frame, op_ret, op_errno,
+ buf, preoldparent, postoldparent,
+ prenewparent, postnewparent, xdata);
+ return 0;
+}
+
+
+int32_t
+changelog_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, rename, frame, this, oldloc, newloc, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_rename_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+/* link */
+
+int32_t
+changelog_link_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (link, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_link (call_frame_t *frame,
+ xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+ CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, link, frame, this, oldloc, newloc, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_link_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+/* mkdir */
+
+int32_t
+changelog_mkdir_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (mkdir, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, mkdir, frame, this,
+ loc, mode, umask, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_mkdir_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->mkdir,
+ loc, mode, umask, xdata);
+ return 0;
+}
+
+/* symlink */
+
+int32_t
+changelog_symlink_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (symlink, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, symlink, frame, this,
+ linkname, loc, umask, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_symlink_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->symlink,
+ linkname, loc, umask, xdata);
+ return 0;
+}
+
+/* mknod */
+
+int32_t
+changelog_mknod_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (mknod, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_mknod (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, mknod, frame, this,
+ loc, mode, dev, umask, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_mknod_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod,
+ loc, mode, dev, umask, xdata);
+ return 0;
+}
+
+/* creat */
+
+int32_t
+changelog_create_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (create, frame,
+ op_ret, op_errno, fd, inode,
+ buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, create, frame, this, loc,
+ flags, mode, umask, fd, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_create_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
+}
+
+/* }}} */
+
+
+/* Metadata modification fops - TYPE II */
+
+/* {{{ */
+
+/* {f}setattr */
+
+int32_t
+changelog_fsetattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preop_stbuf,
+ struct iatt *postop_stbuf, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (fsetattr, frame, op_ret, op_errno,
+ preop_stbuf, postop_stbuf, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, fsetattr,
+ frame, this, fd, stbuf, valid, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_fsetattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr,
+ fd, stbuf, valid, xdata);
+ return 0;
+
+
+}
+
+int32_t
+changelog_setattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preop_stbuf,
+ struct iatt *postop_stbuf, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (setattr, frame, op_ret, op_errno,
+ preop_stbuf, postop_stbuf, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, setattr,
+ frame, this, loc, stbuf, valid, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_setattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+}
+
+/* {f}removexattr */
+
+int32_t
+changelog_fremovexattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, fremovexattr,
+ frame, this, fd, name, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_fremovexattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
+int32_t
+changelog_removexattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, removexattr, frame, this, loc, name, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_removexattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+/* {f}setxattr */
+
+int32_t
+changelog_setxattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, setxattr,
+ frame, this, loc, dict, flags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_setxattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
+
+int32_t
+changelog_fsetxattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, fsetxattr,
+ frame, this, fd, dict, flags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_fsetxattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr,
+ fd, dict, flags, xdata);
+ return 0;
+}
+
+/* }}} */
+
+
+/* Data modification fops - TYPE I */
+
+/* {{{ */
+
+/* {f}truncate() */
+
+int32_t
+changelog_truncate_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (truncate, frame,
+ op_ret, op_errno, prebuf, postbuf, xdata);
+ return 0;
+}
+
+/**
+ * truncate() fop. Runs the policy's truncate handler unless
+ * CHANGELOG_NOT_ACTIVE_THEN_GOTO diverts to the wind label, then winds
+ * the call to the first child. Always returns 0.
+ */
+int32_t
+changelog_truncate (call_frame_t *frame, xlator_t *this,
+                    loc_t *loc, off_t offset, dict_t *xdata)
+{
+        changelog_priv_t *priv = this->private;
+
+        CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+        CHANGELOG_INVOKE_FOP (priv, truncate, frame, this, loc, offset, xdata);
+
+ wind:
+        STACK_WIND (frame, changelog_truncate_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->truncate,
+                    loc, offset, xdata);
+        return 0;
+}
+
+/**
+ * Callback for ftruncate(). Hands the "failed fop or no local" condition
+ * to CHANGELOG_COND_GOTO, invokes the policy write operation with
+ * CHANGELOG_TYPE_DATA and unwinds with the child's results.
+ * Always returns 0.
+ */
+int32_t
+changelog_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno,
+                         struct iatt *prebuf, struct iatt *postbuf,
+                         dict_t *xdata)
+{
+        changelog_priv_t  *priv  = this->private;
+        changelog_local_t *local = frame->local;
+
+        /* failed fop or no local attached to the frame */
+        CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+        CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA);
+
+ unwind:
+        CHANGELOG_STACK_UNWIND (ftruncate, frame, op_ret, op_errno,
+                                prebuf, postbuf, xdata);
+        return 0;
+}
+
+/**
+ * ftruncate() fop. Runs the policy's ftruncate handler unless
+ * CHANGELOG_NOT_ACTIVE_THEN_GOTO diverts to the wind label, then winds
+ * the call to the first child. Always returns 0.
+ */
+int32_t
+changelog_ftruncate (call_frame_t *frame, xlator_t *this,
+                     fd_t *fd, off_t offset, dict_t *xdata)
+{
+        changelog_priv_t *priv = this->private;
+
+        CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+        CHANGELOG_INVOKE_FOP (priv, ftruncate, frame, this, fd, offset, xdata);
+
+ wind:
+        STACK_WIND (frame, changelog_ftruncate_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->ftruncate,
+                    fd, offset, xdata);
+        return 0;
+}
+
+/* writev() */
+
+/**
+ * Callback for writev(). Unlike the truncate callbacks the condition
+ * handed to CHANGELOG_COND_GOTO is op_ret <= 0, so a zero-byte result is
+ * treated like a failure. Otherwise invokes the policy write operation
+ * with CHANGELOG_TYPE_DATA, then unwinds with the child's results.
+ * Always returns 0.
+ */
+int32_t
+changelog_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno,
+                      struct iatt *prebuf, struct iatt *postbuf,
+                      dict_t *xdata)
+{
+        changelog_priv_t  *priv  = this->private;
+        changelog_local_t *local = frame->local;
+
+        /* nothing written, failed fop, or no local attached */
+        CHANGELOG_COND_GOTO (priv, ((op_ret <= 0) || !local), unwind);
+
+        CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA);
+
+ unwind:
+        CHANGELOG_STACK_UNWIND (writev, frame, op_ret, op_errno,
+                                prebuf, postbuf, xdata);
+        return 0;
+}
+
+/**
+ * writev() fop. Runs the policy's writev handler unless
+ * CHANGELOG_NOT_ACTIVE_THEN_GOTO diverts to the wind label, then winds
+ * the call to the first child. Always returns 0.
+ */
+int32_t
+changelog_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  struct iovec *vector, int32_t count, off_t offset,
+                  uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+        changelog_priv_t *priv = this->private;
+
+        CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+        CHANGELOG_INVOKE_FOP (priv, writev, frame, this, fd, vector,
+                              count, offset, flags, iobref, xdata);
+
+ wind:
+        STACK_WIND (frame, changelog_writev_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->writev,
+                    fd, vector, count, offset, flags, iobref, xdata);
+        return 0;
+}
+
+/* }}} */
+
+/**
+ * The
+ * - @init ()
+ * - @fini ()
+ * - @reconfigure ()
+ * ... and helper routines
+ */
+
+/**
+ * Map the "op-mode" option string onto priv->op_mode. Only "realtime"
+ * (matched on its first 8 characters) is recognised; any other string
+ * leaves op_mode untouched. Kept as a helper so more modes can be added.
+ */
+static void
+changelog_assign_opmode (changelog_priv_t *priv, char *mode)
+{
+        if (strncmp (mode, "realtime", 8) != 0)
+                return;
+
+        priv->op_mode = CHANGELOG_MODE_RT;
+}
+
+/**
+ * Map the "encoding" option string onto priv->encode_mode. "binary" and
+ * "ascii" are matched by prefix; any other string leaves encode_mode
+ * untouched.
+ */
+static void
+changelog_assign_encoding (changelog_priv_t *priv, char *enc)
+{
+        if (!strncmp (enc, "binary", 6)) {
+                priv->encode_mode = CHANGELOG_ENCODE_BINARY;
+                return;
+        }
+
+        if (!strncmp (enc, "ascii", 5))
+                priv->encode_mode = CHANGELOG_ENCODE_ASCII;
+}
+
+/**
+ * Map the "policy" option string onto priv->policy. "default" and
+ * "replication" are matched by prefix; any other string leaves the
+ * policy untouched.
+ */
+static void
+changelog_assign_policy (changelog_priv_t *priv, char *pol)
+{
+        if (!strncmp (pol, "default", 7)) {
+                priv->policy = CHANGELOG_LOG_POLICY_DEFAULT;
+                return;
+        }
+
+        if (!strncmp (pol, "replication", 11))
+                priv->policy = CHANGELOG_LOG_POLICY_REPLICATE;
+}
+
+/**
+ * Stop whichever helper threads (rollover, fsync) are recorded as running
+ * and clear their thread handles so a later call is a no-op.
+ */
+static void
+changelog_cleanup_helper_threads (xlator_t *this, changelog_priv_t *priv)
+{
+        /* rollover thread first ... */
+        if (priv->cr.rollover_th != 0) {
+                changelog_thread_cleanup (this, priv->cr.rollover_th);
+                priv->cr.rollover_th = 0;
+        }
+
+        /* ... then the fsync thread */
+        if (priv->cf.fsync_th != 0) {
+                changelog_thread_cleanup (this, priv->cf.fsync_th);
+                priv->cf.fsync_th = 0;
+        }
+}
+
+/**
+ * Start the rollover thread (when rollover_time is non-zero) and the
+ * fsync thread (when fsync_interval is non-zero). If the fsync thread
+ * cannot be created, any helper thread already started is cleaned up.
+ * Returns 0 on success or the pthread_create() error code.
+ */
+static int
+changelog_spawn_helper_threads (xlator_t *this, changelog_priv_t *priv)
+{
+        int ret = 0;
+
+        priv->cr.this = this;
+        if (priv->rollover_time) {
+                ret = pthread_create (&priv->cr.rollover_th, NULL,
+                                      changelog_rollover, priv);
+                if (ret)
+                        return ret; /* nothing spawned yet, nothing to undo */
+        }
+
+        if (!priv->fsync_interval)
+                return ret;
+
+        priv->cf.this = this;
+        ret = pthread_create (&priv->cf.fsync_th, NULL,
+                              changelog_fsync_thread, priv);
+        if (ret)
+                changelog_cleanup_helper_threads (this, priv);
+
+        return ret;
+}
+
+/**
+ * Stop the notifier thread, if one is recorded, and close the writer end
+ * of its pipe. Returns 0 when there was nothing to do or the close
+ * succeeded, otherwise the close() result.
+ */
+static int
+changelog_cleanup_notifier (xlator_t *this, changelog_priv_t *priv)
+{
+        int ret = 0;
+
+        if (!priv->cn.notify_th)
+                return ret;
+
+        changelog_thread_cleanup (this, priv->cn.notify_th);
+        priv->cn.notify_th = 0;
+
+        ret = close (priv->wfd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "error closing writer end of notifier pipe"
+                        " (reason: %s)", strerror (errno));
+        }
+
+        return ret;
+}
+
+/**
+ * Spawn the notifier thread; a no-op when one is already recorded.
+ *
+ * Creates a pipe, makes the writer end non-blocking, stores the writer
+ * end in priv->wfd and the reader end in priv->cn.rfd, and starts
+ * changelog_notifier() with @priv as its argument.
+ *
+ * On any failure after the pipe was created both descriptors are closed
+ * and the stored descriptors/thread handle are reset, so nothing leaks
+ * and a later call can retry.
+ *
+ * Returns 0 on success (or when already running), non-zero on failure.
+ */
+static int
+changelog_spawn_notifier (xlator_t *this, changelog_priv_t *priv)
+{
+        int ret        = 0;
+        int flags      = 0;
+        int pipe_fd[2] = {-1, -1};
+
+        if (priv->cn.notify_th)
+                goto out; /* notifier thread already running */
+
+        ret = pipe (pipe_fd);
+        if (ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Cannot create pipe (reason: %s)", strerror (errno));
+                goto out;
+        }
+
+        /* writer is non-blocking */
+        flags = fcntl (pipe_fd[1], F_GETFL);
+        if (flags == -1) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get pipe flags (reason: %s)",
+                        strerror (errno));
+                goto close_pipe;
+        }
+
+        ret = fcntl (pipe_fd[1], F_SETFL, flags | O_NONBLOCK);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to set O_NONBLOCK flag");
+                goto close_pipe;
+        }
+
+        priv->wfd = pipe_fd[1];
+
+        priv->cn.this = this;
+        priv->cn.rfd = pipe_fd[0];
+
+        ret = pthread_create (&priv->cn.notify_th,
+                              NULL, changelog_notifier, priv);
+        if (!ret)
+                goto out;
+
+        gf_log (this->name, GF_LOG_ERROR,
+                "failed to spawn notifier thread (reason: %s)",
+                strerror (ret));
+
+        /* the handle is indeterminate after a failed pthread_create() */
+        priv->cn.notify_th = 0;
+        priv->cn.rfd = -1;
+        priv->wfd = -1;
+
+ close_pipe:
+        (void) close (pipe_fd[0]);
+        (void) close (pipe_fd[1]);
+
+ out:
+        return ret;
+}
+
+/**
+ * Set up memory accounting for this translator with
+ * gf_changelog_mt_end + 1 types. Returns -1 for a NULL @this, otherwise
+ * the result of xlator_mem_acct_init() (a warning is logged when it is
+ * non-zero).
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int ret = -1;
+
+        if (this == NULL)
+                return ret;
+
+        ret = xlator_mem_acct_init (this, gf_changelog_mt_end + 1);
+        if (ret != 0)
+                gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
+                        " init failed");
+
+        return ret;
+}
+
+/**
+ * Prime the time slice, the type prefix map and the per-type changelog
+ * versions. When change-logging is active, additionally spawn the
+ * notifier thread, open the changelog (under priv->lock) and spawn the
+ * helper threads.
+ *
+ * If anything fails after the notifier thread was spawned, the notifier
+ * is stopped again: it was handed @priv, and the caller releases @priv
+ * when this function fails.
+ *
+ * Returns 0 on success (including the inactive case), non-zero otherwise.
+ */
+static int
+changelog_init (xlator_t *this, changelog_priv_t *priv)
+{
+        int             i     = 0;
+        int             ret   = -1;
+        char           *cname = NULL;
+        struct timeval  tv    = {0,};
+
+        ret = gettimeofday (&tv, NULL);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "gettimeofday() failure");
+                goto out;
+        }
+
+        priv->slice.tv_start = tv;
+
+        priv->maps[CHANGELOG_TYPE_DATA]     = "D ";
+        priv->maps[CHANGELOG_TYPE_METADATA] = "M ";
+        priv->maps[CHANGELOG_TYPE_ENTRY]    = "E ";
+
+        for (i = 0; i < CHANGELOG_MAX_TYPE; i++) {
+                /* start with version 1 */
+                priv->slice.changelog_version[i] = 1;
+        }
+
+        /* ret is 0 here: nothing more to do when logging is disabled */
+        if (!priv->active)
+                return ret;
+
+        /* spawn the notifier thread */
+        ret = changelog_spawn_notifier (this, priv);
+        if (ret)
+                goto out;
+
+        cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp);
+
+        LOCK (&priv->lock);
+        {
+                ret = CHANGELOG_INVOKE_CFOP (this, priv,
+                                             open, cname, _gf_false);
+        }
+        UNLOCK (&priv->lock);
+
+        if (ret)
+                goto stop_notifier;
+
+        /* ... and finally spawn the helpers threads */
+        ret = changelog_spawn_helper_threads (this, priv);
+        if (!ret)
+                goto out;
+
+ stop_notifier:
+        /* keep the original error; the cleanup result is informational */
+        (void) changelog_cleanup_notifier (this, priv);
+
+ out:
+        return ret;
+}
+
+/**
+ * Apply changed volume options to a running changelog translator.
+ *
+ * Stops the helper threads, re-reads "changelog-dir", "changelog",
+ * "op-mode", "encoding", "rollover-time" and "fsync-interval", rolls the
+ * changelog over when logging is or was active, and restarts (or stops)
+ * the notifier and helper threads to match the new state.
+ *
+ * Returns 0 on success and non-zero on failure. A translator without
+ * private data has nothing to reconfigure and returns 0. On failure the
+ * notifier is stopped and the original error is returned (it is not
+ * replaced by the result of the cleanup).
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        int                     ret            = 0;
+        char                   *tmp            = NULL;
+        char                   *dir            = NULL;
+        char                   *cname          = NULL;
+        changelog_priv_t       *priv           = NULL;
+        gf_boolean_t            active_earlier = _gf_true;
+        gf_boolean_t            active_now     = _gf_true;
+        changelog_time_slice_t *slice          = NULL;
+
+        priv = this->private;
+        if (!priv)
+                return 0; /* never touch priv on the exit path below */
+
+        ret = -1;
+        active_earlier = priv->active;
+
+        /* first stop the rollover and the fsync thread */
+        changelog_cleanup_helper_threads (this, priv);
+
+        GF_OPTION_RECONF ("changelog-dir", tmp, options, str, out);
+        if (!tmp) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "\"changelog-dir\" option is not set");
+                goto out;
+        }
+
+        /* duplicate first so the old value survives an allocation failure */
+        dir = gf_strdup (tmp);
+        if (!dir)
+                goto out;
+        GF_FREE (priv->changelog_dir);
+        priv->changelog_dir = dir;
+
+        ret = mkdir_p (priv->changelog_dir, 0600, _gf_true);
+        if (ret)
+                goto out;
+
+        /*
+         * The option macros below jump to "out" on their own; make sure
+         * such a jump is reported as a failure.
+         */
+        ret = -1;
+
+        GF_OPTION_RECONF ("changelog", active_now, options, bool, out);
+
+        /**
+         * changelog_handle_change() handles changes that may have been
+         * submitted before changelog deactivation.
+         */
+        if (!active_now)
+                priv->active = _gf_false;
+
+        GF_OPTION_RECONF ("op-mode", tmp, options, str, out);
+        changelog_assign_opmode (priv, tmp);
+
+        tmp = NULL;
+
+        GF_OPTION_RECONF ("encoding", tmp, options, str, out);
+        changelog_assign_encoding (priv, tmp);
+
+        GF_OPTION_RECONF ("rollover-time",
+                          priv->rollover_time, options, int32, out);
+        GF_OPTION_RECONF ("fsync-interval",
+                          priv->fsync_interval, options, int32, out);
+
+        /* all options parsed */
+        ret = 0;
+
+        if (active_now || active_earlier) {
+                slice = &priv->slice;
+                cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp);
+
+                LOCK (&priv->lock);
+                {
+                        ret = CHANGELOG_INVOKE_CFOP (this, priv, rollover,
+                                                     cname, !active_now);
+                        if (!ret && active_now)
+                                SLICE_VERSION_UPDATE (slice);
+                }
+                UNLOCK (&priv->lock);
+
+                if (ret)
+                        goto out;
+
+                if (active_now) {
+                        ret = changelog_spawn_notifier (this, priv);
+                        if (!ret)
+                                ret = changelog_spawn_helper_threads (this,
+                                                                      priv);
+                } else
+                        ret = changelog_cleanup_notifier (this, priv);
+        }
+
+ out:
+        if (ret) {
+                /* best effort; do not let it mask the real error */
+                (void) changelog_cleanup_notifier (this, priv);
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "changelog reconfigured");
+                if (active_now)
+                        priv->active = _gf_true;
+        }
+
+        return ret;
+}
+
+/**
+ * Translator entry point: validate the graph position, allocate the
+ * private structure and local pool, read the volume options, select the
+ * encoder / bootstrap / policy tables, bootstrap the logger and finally
+ * run changelog_init().
+ *
+ * Returns 0 with this->private set on success. On failure everything
+ * allocated here is released, this->private is NULL and a non-zero value
+ * is returned; the error is never replaced by the result of the cleanup,
+ * and the exit path tolerates a NULL @this or an unallocated priv.
+ */
+int32_t
+init (xlator_t *this)
+{
+        int               ret  = -1;
+        char             *tmp  = NULL;
+        changelog_priv_t *priv = NULL;
+
+        GF_VALIDATE_OR_GOTO ("changelog", this, out);
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "translator needs a single subvolume");
+                goto out;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "dangling volume. please check volfile");
+                goto out;
+        }
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_changelog_mt_priv_t);
+        if (!priv)
+                goto out;
+
+        this->local_pool = mem_pool_new (changelog_local_t, 64);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create local memory pool");
+                goto out;
+        }
+
+        LOCK_INIT (&priv->lock);
+
+        GF_OPTION_INIT ("changelog-brick", tmp, str, out);
+        if (!tmp) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "\"changelog-brick\" option is not set");
+                goto out;
+        }
+
+        priv->changelog_brick = gf_strdup (tmp);
+        if (!priv->changelog_brick)
+                goto out;
+        tmp = NULL;
+
+        GF_OPTION_INIT ("changelog-dir", tmp, str, out);
+        if (!tmp) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "\"changelog-dir\" option is not set");
+                goto out;
+        }
+
+        priv->changelog_dir = gf_strdup (tmp);
+        if (!priv->changelog_dir)
+                goto out;
+        tmp = NULL;
+
+        /**
+         * create the directory even if change-logging would be inactive
+         * so that consumers can _look_ into it (finding nothing...)
+         */
+        ret = mkdir_p (priv->changelog_dir, 0600, _gf_true);
+        if (ret)
+                goto out;
+
+        /*
+         * The option macros below jump to "out" on their own; make sure
+         * such a jump is reported as a failure.
+         */
+        ret = -1;
+
+        GF_OPTION_INIT ("changelog", priv->active, bool, out);
+
+        GF_OPTION_INIT ("op-mode", tmp, str, out);
+        changelog_assign_opmode (priv, tmp);
+
+        tmp = NULL;
+
+        GF_OPTION_INIT ("encoding", tmp, str, out);
+        changelog_assign_encoding (priv, tmp);
+
+        tmp = NULL;
+
+        GF_OPTION_INIT ("policy", tmp, str, out);
+        changelog_assign_policy (priv, tmp);
+
+        GF_OPTION_INIT ("fsync-interval", priv->fsync_interval, int32, out);
+
+        GF_ASSERT (cb_encoder[priv->encode_mode].encoder == priv->encode_mode);
+        priv->ce = &cb_encoder[priv->encode_mode];
+
+        GF_ASSERT (cb_bootstrap[priv->op_mode].mode == priv->op_mode);
+        priv->cb = &cb_bootstrap[priv->op_mode];
+
+        GF_ASSERT (cb_policy[priv->policy].policy == priv->policy);
+        priv->cp = &cb_policy[priv->policy];
+
+        /* ... init logging policy */
+        ret = priv->cp->init_policy (this, priv, priv->cp);
+        if (ret)
+                goto out;
+
+        /* ... now bootstrap the logger */
+        ret = priv->cb->ctor (this, &priv->cd, priv->lockless_update);
+        if (ret)
+                goto out;
+
+        /* override the value if set */
+        if (dict_get (this->options, "rollover-time")) {
+                ret = dict_get_int32 (this->options,
+                                      "rollover-time", &priv->rollover_time);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Cannot get value for \"rollover-time\"");
+                        goto out;
+                }
+        }
+
+        priv->changelog_fd = -1;
+        ret = changelog_init (this, priv);
+        if (ret)
+                goto out;
+
+        gf_log (this->name, GF_LOG_DEBUG, "changelog translator loaded");
+
+ out:
+        if (!ret) {
+                this->private = priv;
+                return ret;
+        }
+
+        /* GF_VALIDATE_OR_GOTO lands here with a NULL xlator */
+        if (!this)
+                return ret;
+
+        if (this->local_pool) {
+                mem_pool_destroy (this->local_pool);
+                this->local_pool = NULL;
+        }
+
+        /* priv is NULL when we bailed out before or during allocation */
+        if (priv) {
+                /* report a dtor failure but keep the original error */
+                if (priv->cb && priv->cb->dtor (this, &priv->cd))
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "error in cleanup during init()");
+                GF_FREE (priv->changelog_brick);
+                GF_FREE (priv->changelog_dir);
+                GF_FREE (priv);
+        }
+
+        this->private = NULL;
+
+        return ret;
+}
+
+/**
+ * Translator teardown: run the bootstrap destructor (logging an error
+ * when it reports failure), destroy the local pool and release the
+ * private structure together with the strings it owns.
+ */
+void
+fini (xlator_t *this)
+{
+        changelog_priv_t *priv = this->private;
+
+        if (priv != NULL) {
+                if (priv->cb->dtor (this, &priv->cd))
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "error in fini");
+
+                mem_pool_destroy (this->local_pool);
+
+                GF_FREE (priv->changelog_brick);
+                GF_FREE (priv->changelog_dir);
+                GF_FREE (priv);
+        }
+
+        this->private = NULL;
+}
+
+/*
+ * File operations intercepted by this translator: the entry, data and
+ * metadata modifying fops listed below are routed through the
+ * changelog_*() handlers.
+ */
+struct xlator_fops fops = {
+ .mknod = changelog_mknod,
+ .mkdir = changelog_mkdir,
+ .create = changelog_create,
+ .symlink = changelog_symlink,
+ .writev = changelog_writev,
+ .truncate = changelog_truncate,
+ .ftruncate = changelog_ftruncate,
+ .link = changelog_link,
+ .rename = changelog_rename,
+ .unlink = changelog_unlink,
+ .rmdir = changelog_rmdir,
+ .setattr = changelog_setattr,
+ .fsetattr = changelog_fsetattr,
+ .setxattr = changelog_setxattr,
+ .fsetxattr = changelog_fsetxattr,
+ .removexattr = changelog_removexattr,
+ .fremovexattr = changelog_fremovexattr,
+};
+
+/* Callbacks table: only the forget callback is provided. */
+struct xlator_cbks cbks = {
+ .forget = changelog_forget,
+};
+
+/*
+ * Volume options understood by the changelog translator. The table is
+ * terminated by an entry whose key is NULL.
+ */
+struct volume_options options[] = {
+ {.key = {"changelog"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "enable/disable change-logging"
+ },
+ {.key = {"changelog-brick"},
+ .type = GF_OPTION_TYPE_PATH,
+ .description = "brick path to generate unique socket file name."
+ " should be the export directory of the volume strictly."
+ },
+ {.key = {"changelog-dir"},
+ .type = GF_OPTION_TYPE_PATH,
+ .description = "directory for the changelog files"
+ },
+ {.key = {"op-mode"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "realtime",
+ .value = {"realtime"},
+ .description = "operation mode - futuristic operation modes"
+ },
+ {.key = {"encoding"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "ascii",
+ .value = {"binary", "ascii"},
+ .description = "encoding type for changelogs"
+ },
+ /*
+ * NOTE(review): unlike its siblings this entry declares neither .type
+ * nor .default_value; init() reads the value straight from the options
+ * dict and reconfigure() reads it as int32 - confirm this is intended.
+ */
+ {.key = {"rollover-time"},
+ .description = "time to switch to a new changelog file (in seconds)"
+ },
+ {.key = {"fsync-interval"},
+ .type = GF_OPTION_TYPE_TIME,
+ .default_value = "5",
+ .description = "do not open CHANGELOG file with O_SYNC mode."
+ " instead perform fsync() at specified intervals"
+ },
+ {.key = {"policy"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "replication",
+ .value = {"default", "replication"},
+ .description = "Logging policies"
+ },
+ /* sentinel */
+ {.key = {NULL}
+ },
+};
diff --git a/xlators/features/changelog/src/policy/changelog-policy-default.c b/xlators/features/changelog/src/policy/changelog-policy-default.c
new file mode 100644
index 000000000..eaa3d107f
--- /dev/null
+++ b/xlators/features/changelog/src/policy/changelog-policy-default.c
@@ -0,0 +1,45 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-policy.h"
+#include "changelog-fops.h"
+
+/**
+ * Initialise the default logging policy: a 15 second rollover time, gfid
+ * header enabled, locked updates, an off_t sized private area, the
+ * CHANGELOG_FILE_NAME changelog name and the default fop/changelog
+ * operation tables.
+ *
+ * Returns 0 on success, -1 when the private area cannot be allocated.
+ */
+int
+changelog_default_policy_init (xlator_t *this,
+                               changelog_priv_t *priv,
+                               struct changelog_logpolicy *cp)
+{
+        priv->rollover_time   = 15;
+        priv->no_gfid_hdr     = _gf_false;
+        priv->lockless_update = _gf_false;
+
+        cp->cpriv = GF_CALLOC (1, sizeof (off_t),
+                               gf_changelog_mt_fop_policy_t);
+        if (cp->cpriv == NULL)
+                return -1;
+
+        /* zero the whole name buffer, then drop the file name in */
+        (void) memset (cp->changelog_name, '\0', PATH_MAX);
+        (void) memcpy (cp->changelog_name,
+                       CHANGELOG_FILE_NAME, strlen (CHANGELOG_FILE_NAME));
+
+        cp->fops = &changelog_default_fops; /* default logging policy */
+        cp->cops = &changelog_default_cops; /* default changelog operations */
+
+        return 0;
+}
+
+/**
+ * Tear down the default logging policy by releasing its private area.
+ * Always returns 0.
+ */
+int
+changelog_default_policy_fini (xlator_t *this,
+                               struct changelog_logpolicy *cp)
+{
+        GF_FREE (cp->cpriv);
+
+        return 0;
+}
diff --git a/xlators/features/changelog/src/policy/changelog-policy-replication.c b/xlators/features/changelog/src/policy/changelog-policy-replication.c
new file mode 100644
index 000000000..29c049716
--- /dev/null
+++ b/xlators/features/changelog/src/policy/changelog-policy-replication.c
@@ -0,0 +1,1374 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-policy.h"
+#include "changelog-encoders.h"
+#include "changelog-fops.h"
+
+#define JOURNAL_NAME "TERM"
+
+#define JOURNAL_SECTOR_SIZE 128
+
+#define PRE_OP_MARK 0x5F4552505FULL /* _PRE_ */
+#define POST_OP_MARK 0x5F54534F505FULL /* _POST_ */
+
+/**
+ * Converter for a signed int record field (similar to fop_fn).
+ * @data points at the int; with @encode set its decimal text is placed
+ * into @buffer, otherwise its raw bytes are. Returns the length
+ * accumulated by CHANGELOG_FILL_BUFFER.
+ */
+size_t
+int32_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+        char   buf[20] = {0,};
+        size_t bufsz   = 0;
+        int    nr      = *(int *) data;
+
+        if (!encode) {
+                CHANGELOG_FILL_BUFFER (buffer, bufsz,
+                                       &nr, sizeof (int));
+                return bufsz;
+        }
+
+        (void) snprintf (buf, sizeof (buf), "%d", nr);
+        CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+
+        return bufsz;
+}
+
+
+/**
+ * Converter for an unsigned int record field.
+ * @data points at the value; with @encode set its decimal text is placed
+ * into @buffer, otherwise its raw bytes are. Returns the length
+ * accumulated by CHANGELOG_FILL_BUFFER.
+ */
+size_t
+uint32_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+        char         buf[20] = {0,};
+        size_t       bufsz   = 0;
+        unsigned int nr      = *(unsigned int *) data;
+
+        if (!encode) {
+                CHANGELOG_FILL_BUFFER (buffer, bufsz,
+                                       &nr, sizeof (unsigned int));
+                return bufsz;
+        }
+
+        (void) snprintf (buf, sizeof (buf), "%u", nr);
+        CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+
+        return bufsz;
+}
+
+/**
+ * Converter for an unsigned long long record field.
+ * @data points at the value; with @encode set its decimal text is placed
+ * into @buffer, otherwise its raw bytes are. Returns the length
+ * accumulated by CHANGELOG_FILL_BUFFER.
+ */
+size_t
+number_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+        char               buf[1024] = {0,};
+        size_t             bufsz     = 0;
+        unsigned long long nr        = *(unsigned long long *) data;
+
+        if (!encode) {
+                CHANGELOG_FILL_BUFFER (buffer, bufsz,
+                                       &nr, sizeof (unsigned long long));
+                return bufsz;
+        }
+
+        (void) snprintf (buf, sizeof (buf), "%llu", nr);
+        CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+
+        return bufsz;
+}
+
+/**
+ * Converter for a uuid record field.
+ * @data points at a uuid_t; with @encode set the uuid_utoa() text form is
+ * placed into @buffer, otherwise the raw uuid bytes are. Returns the
+ * length accumulated by CHANGELOG_FILL_BUFFER.
+ */
+size_t
+uuid_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+        char    buf[1024] = {0,};
+        uuid_t  uuid      = {0,};
+        size_t  bufsz     = 0;
+        char   *tmpbuf    = NULL;
+
+        memcpy (uuid, (uuid_t *) data, sizeof (uuid_t));
+
+        if (!encode) {
+                CHANGELOG_FILL_BUFFER (buffer, bufsz, uuid, sizeof (uuid_t));
+                return bufsz;
+        }
+
+        tmpbuf = uuid_utoa (uuid);
+        (void) snprintf (buf, sizeof (buf), "%s", tmpbuf);
+        CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+
+        return bufsz;
+}
+
+/*
+ * Fill the record slot @co with an unsigned long long @number and add its
+ * size to @xlen. co_len is only set when no @converter is given.
+ * Every argument is parenthesised so expression arguments (for example
+ * "a | b" as @number) are cast and assigned as a whole. @co is evaluated
+ * more than once: pass a plain pointer, not an expression with side
+ * effects.
+ */
+#define CHANGELOG_FILL_USIGNLL(co, number, converter, xlen) do {        \
+                (co)->co_convert = (converter);                         \
+                (co)->co_free = NULL;                                   \
+                (co)->co_type = CHANGELOG_OPT_REC_ULL;                  \
+                (co)->co_number = (unsigned long long) (number);        \
+                (xlen) += sizeof (unsigned long long);                  \
+                if (!(co)->co_convert)                                  \
+                        (co)->co_len = sizeof (unsigned long long);     \
+        } while (0)
+
+/*
+ * Fill the record slot @co with a copy of @uuid and add sizeof (uuid_t)
+ * to @xlen. Arguments are parenthesised so expression arguments expand
+ * safely. @co is evaluated more than once: pass a plain pointer, not an
+ * expression with side effects.
+ */
+#define CHANGELOG_FILL_UUID(co, uuid, converter, xlen) do {             \
+                (co)->co_convert = (converter);                         \
+                (co)->co_free = NULL;                                   \
+                (co)->co_type = CHANGELOG_OPT_REC_UUID;                 \
+                uuid_copy ((co)->co_uuid, (uuid));                      \
+                (xlen) += sizeof (uuid_t);                              \
+        } while (0)
+
+
+/* TBD: move declarations here and nsr.c into a common place */
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define NSR_INDEX_XATTR "trusted.nsr.index"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+
+/**
+ * Resolve the term and index of a journal record from @xdata and switch
+ * journals when the term differs from priv->term.
+ *
+ * - Regular I/O path: @xdata carries NSR_TERM_XATTR and NSR_INDEX_XATTR.
+ *   On a term change the journal is rolled over to "TERM.<term>".
+ * - Reconciliation path: @xdata carries RECON_TERM_XATTR and
+ *   RECON_INDEX_XATTR. On a term change "TERM.<term>" is opened through
+ *   changelog_open().
+ *
+ * On success local->nr_bytes is reset to 0 and local->lu.val holds the
+ * index. Returns _gf_true on success; _gf_false when neither key pair is
+ * present or the journal switch fails.
+ *
+ * NOTE(review): priv->term is compared outside priv->lock and stays
+ * updated even when the rollover/open fails - confirm this is intended.
+ */
+static gf_boolean_t
+changelog_fix_term(xlator_t *this,
+                   changelog_local_t *local,
+                   dict_t *xdata)
+{
+        int32_t                    old_term       = 0;
+        int32_t                    new_term       = 0;
+        uint32_t                   index          = 0;
+        changelog_priv_t          *priv           = this->private;
+        int                        ret            = 0;
+        char                       nfile[PATH_MAX] = {0,};
+        int32_t                    recon_term     = 0;
+        int32_t                    recon_index    = 0;
+        /* zeroed: only two fields are set before changelog_open() */
+        changelog_rollover_data_t  crd            = {0,};
+
+        /*
+         * If coming via the regular IO path, we should get the dict
+         * "nsr-term". If coming via reconciliation, we should get the
+         * dicts "nsr-recon-term" that indicates the term and
+         * "nsr-recon-index" for the index.
+         */
+        if ((dict_get_int32(xdata,NSR_TERM_XATTR,&new_term) == 0) &&
+            (dict_get_uint32(xdata, NSR_INDEX_XATTR, &index) == 0)) {
+                old_term = priv->term;
+
+                if (old_term != new_term) {
+                        GF_ASSERT(new_term > old_term);
+                        LOCK (&priv->lock);
+                        priv->term = new_term;
+                        (void) snprintf (nfile, PATH_MAX, "%s.%d",
+                                         JOURNAL_NAME, priv->term);
+                        ret = CHANGELOG_INVOKE_CFOP(this, priv, rollover,
+                                                    nfile, _gf_false);
+                        UNLOCK (&priv->lock);
+                        if (ret != 0)
+                                return _gf_false;
+                }
+                local->nr_bytes = 0;
+                local->lu.val = index;
+        } else if ((dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) &&
+                   (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) {
+
+                old_term = priv->term;
+
+                if (old_term != recon_term) {
+                        LOCK (&priv->lock);
+                        {
+                                priv->term = recon_term;
+                                (void) snprintf (crd.crd_changelog_name,
+                                                 PATH_MAX, "%s.%d",
+                                                 JOURNAL_NAME, priv->term);
+                                crd.crd_prealloc_size = 1<<29;
+                                ret = changelog_open(this, priv, local, &crd);
+                        }
+                        UNLOCK (&priv->lock);
+                        if (ret != 0)
+                                return _gf_false;
+                }
+
+                local->nr_bytes = 0;
+                local->lu.val = recon_index;
+        } else {
+                return _gf_false;
+        }
+
+        return _gf_true;
+}
+
+/**
+ * Replication policy records journal entries in the FOP path. This is
+ * quite different from the default policy (used by geo-replication),
+ * which journals records in the callback path on a successful posix
+ * operation. Additionally, each record starts with a PRE-OP marker and
+ * the index number generated by the leader.
+ * (c.f. nsr_$NAME$() ~/xlator/cluster/nsr/nsr-server/src/all-templates.c)
+ *
+ * POST-OPs are marked asynchronously and not in the callback path.
+ * Marking them in the callback path is incorrect as the actual FOP may
+ * not have been synchronized to the disk. Therefore, POST-OP marking is
+ * done after a successful file system sync, which is triggered
+ * periodically by the NSR server component. To keep journal updates
+ * strictly sequential, POST-OPs are separate records in the journal.
+ */
+
+/**
+ * Override File Operations
+ *
+ * NOTE: Since journal updates are done in the FOP path, there is no
+ * actual use of @local in cbk. Therefore, @local could have been
+ * declared statically for each FOP (which would remove the overhead
+ * of allocating (here) and deallocating (in cbk)). Let's not do that
+ * and keep it this way for now. Will worry about it later (for code
+ * reasonability and performance).
+ */
+
+/**
+ * Replication-policy rmdir: journal the operation in the FOP path.
+ *
+ * Builds a 7 slot record (PRE marker, index, fop number, gfid, uid, gid,
+ * parent-gfid/name entry), attaches the local to the frame and passes it
+ * to changelog_update() as a CHANGELOG_TYPE_ENTRY change.
+ *
+ * Returns 0 on success; on failure the local is cleaned up and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 7);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ /* resolves the term/index from xdata; index ends up in local->lu.val */
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* fill the slots in record order, advancing co after each one */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 7);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Replication-policy unlink: delegates to changelog_replication_rmdir()
+ * with the same arguments and returns its result.
+ */
+int32_t
+changelog_replication_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ return changelog_replication_rmdir (frame, this, loc, xflags, xdata);
+}
+
+/**
+ * Replication-policy rename: journal the operation in the FOP path.
+ *
+ * Builds an 8 slot record (PRE marker, index, fop number, gfid of the
+ * old location's inode, uid, gid, old entry, new entry), attaches the
+ * local to the frame and passes it to changelog_update() as a
+ * CHANGELOG_TYPE_ENTRY change.
+ *
+ * Returns 0 on success; on failure the local is cleaned up and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + UID + GID + OLDLOC + NEWLOC */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->inode->gfid, 8);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ /* resolves the term/index from xdata; index ends up in local->lu.val */
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* fill the slots in record order, advancing co after each one */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, oldloc->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Replication-policy link: journal the operation in the FOP path.
+ *
+ * Builds a 7 slot record (PRE marker, index, fop number, oldloc->gfid,
+ * uid, gid, new parent-gfid/name entry), attaches the local to the frame
+ * and passes it to changelog_update() as a CHANGELOG_TYPE_ENTRY change.
+ *
+ * Returns 0 on success; on failure the local is cleaned up and -1 is
+ * returned.
+ */
+int32_t
+changelog_replication_link (call_frame_t *frame,
+ xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->gfid, 7);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ /* resolves the term/index from xdata; index ends up in local->lu.val */
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* fill the slots in record order, advancing co after each one */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, oldloc->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 7);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Replication-policy mkdir: journal the operation in the FOP path.
+ *
+ * The gfid of the new directory is taken from the "gfid-req" key of
+ * @xdata. Builds an 8 slot record (PRE marker, index, fop number, gfid,
+ * mode with S_IFDIR set, uid, gid, parent-gfid/name entry), attaches the
+ * local to the frame and passes it to changelog_update() as a
+ * CHANGELOG_TYPE_ENTRY change.
+ *
+ * Returns 0 on success. Returns the dict_get_ptr() error when "gfid-req"
+ * is missing, otherwise -1; the local is cleaned up on any failure.
+ */
+int32_t
+changelog_replication_mkdir (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ /* from here on every early exit is a failure */
+ ret = -1;
+
+ /* <PRE> + IDX + FOP + GFID + MODE + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 8);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ /* resolves the term/index from xdata; index ends up in local->lu.val */
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* fill the slots in record order, advancing co after each one */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, mode | S_IFDIR, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Replication-policy symlink: journal the operation in the FOP path.
+ *
+ * The gfid of the new link is taken from the "gfid-req" key of @xdata.
+ * Builds an 8 slot record (PRE marker, index, fop number, gfid, link
+ * name, uid, gid, parent-gfid/name entry), attaches the local to the
+ * frame and passes it to changelog_update() as a CHANGELOG_TYPE_ENTRY
+ * change.
+ *
+ * Returns 0 on success. Returns the dict_get_ptr() error when "gfid-req"
+ * is missing, otherwise -1; the local is cleaned up on any failure.
+ */
+int32_t
+changelog_replication_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ /* from here on every early exit is a failure */
+ ret = -1;
+
+ /* <PRE> + IDX + FOP + GFID + LINKNAME + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 8);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ /* resolves the term/index from xdata; index ends up in local->lu.val */
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* fill the slots in record order, advancing co after each one */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_NAME (co, linkname, entry_free_fn, xtra_len, out);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Replication-policy mknod: journal the operation in the FOP path.
+ *
+ * The gfid of the new node is taken from the "gfid-req" key of @xdata.
+ * Builds a 9 slot record (PRE marker, index, fop number, gfid, mode, uid,
+ * gid, device number, parent-gfid/name entry), attaches the local to the
+ * frame and passes it to changelog_update() as a CHANGELOG_TYPE_ENTRY
+ * change.
+ *
+ * Returns 0 on success. Returns the dict_get_ptr() error when "gfid-req"
+ * is missing, otherwise -1; the local is cleaned up on any failure.
+ */
+int32_t
+changelog_replication_mknod (call_frame_t *frame,
+                             xlator_t *this, loc_t *loc,
+                             mode_t mode, dev_t dev,
+                             mode_t umask, dict_t *xdata)
+{
+        int                ret      = -1;
+        uuid_t             gfid     = {0,};
+        void              *uuid_req = NULL;
+        size_t             xtra_len = 0;
+        changelog_priv_t  *priv     = NULL;
+        changelog_opt_t   *co       = NULL;
+        changelog_local_t *local    = NULL;
+
+        priv = this->private;
+
+        ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "failed to get gfid from dict");
+                goto out;
+        }
+        uuid_copy (gfid, uuid_req);
+
+        /* from here on every early exit is a failure */
+        ret = -1;
+
+        /* <PRE> + IDX + FOP + GFID + MODE + UID + GID + DEV + Entry */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 9);
+        /* same guard as the sibling fops: never use an unallocated local */
+        if (!local)
+                goto out;
+
+        co = changelog_get_usable_buffer (local);
+        if (!co)
+                goto out;
+
+        /* resolves the term/index from xdata; index ends up in lu.val */
+        if (changelog_fix_term (this, local, xdata) == _gf_false)
+                goto out;
+
+        /* fill the slots in record order, advancing co after each one */
+        CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_UINT32 (co, mode, uint32_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_USIGNLL (co, dev, number_fn, xtra_len);
+        co++;
+
+        CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+                              entry_fn, entry_free_fn, xtra_len, out);
+
+        changelog_set_usable_record_and_length (local, xtra_len, 9);
+
+        frame->local = local;
+        ret = 0;
+
+        changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+        if (ret)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Journal a create() under the replication policy.
+ *
+ * The GFID of the new file is taken from the "gfid-req" key of @xdata.
+ * Eight fields are filled, in order: <PRE> mark, IDX (local->lu.val), FOP
+ * number, GFID, mode, uid, gid and the entry (loc->pargfid + loc->name).
+ * @flags, @umask and @fd are not recorded.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; non-zero otherwise, in which case @local is
+ * handed to changelog_local_cleanup().
+ */
+int32_t
+changelog_replication_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ /* dict_get_ptr() left ret at 0; re-arm the failure default */
+ ret = -1;
+
+ /* <PRE> + IDX + FOP + GFID + MODE + UID + GID + ENTRY */
+ /* NOTE(review): no explicit `if (!local)` check follows, unlike the
+  * setattr/xattr/truncate handlers; presumably
+  * changelog_get_usable_buffer() copes with a NULL local -- confirm. */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 8);
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term (this, local, xdata) == _gf_false)
+ goto out;
+
+ /* one slot per field; `co` advances after every fill but the last */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, mode, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ /* this macro is given the `out` label and may jump to it itself */
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ /* the count must match the number of slots filled above (8) */
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Shared journal builder for setattr() and fsetattr().
+ *
+ * @attr is the "valid" mask of the FOP, @stbuf the attribute values and
+ * @gfid the target inode. Five common fields are always written (<PRE>
+ * mark, IDX, FOP number, GFID, @attr), followed by one attribute group
+ * chosen from @attr.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+static int
+_changelog_setattr_fill_common (call_frame_t *frame, xlator_t *this,
+ int32_t attr, struct iatt *stbuf,
+ uuid_t gfid, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ int used_count = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* sized for the widest variant: 5 common fields + 2 values */
+ used_count = 7;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, used_count);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /**
+ * - <PRE>
+ * - IDX
+ * - FOP
+ * - GFID
+ * - Valid flag
+ * GF_SET_ATTR_MODE [chmod]
+ * ->ia_prot
+ * ->ia_type
+ * GF_SET_ATTR_UID | GF_SET_ATTR_GID [chown]
+ * ->ia_uid
+ * ->ia_gid
+ * GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME [utimes]
+ * ->ia_atime
+ * ->ia_mtime
+ */
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, attr, uint32_fn, xtra_len);
+ co++;
+
+ /* NOTE(review): the groups below are mutually exclusive (`else if`),
+  * so a mask combining several groups only journals the first match.
+  * If no group matches, `co` is left on an unfilled slot while
+  * used_count stays 7 -- confirm callers never pass such masks. */
+ if (attr & GF_SET_ATTR_MODE) {
+ mode_t mode = 0;
+
+ /* ->ia_prot & ->ia_type stored as a consolidated value */
+ /* only one value in this variant, hence one slot fewer */
+ used_count--;
+ mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type);
+
+ CHANGELOG_FILL_UINT32 (co, mode, uint32_fn, xtra_len);
+
+ } else if (attr & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+ /* -1 is journaled for whichever of uid/gid is not being set */
+ uid_t uid = -1;
+ gid_t gid = -1;
+
+ if (attr & GF_SET_ATTR_UID)
+ uid = stbuf->ia_uid;
+ if (attr & GF_SET_ATTR_GID)
+ gid = stbuf->ia_gid;
+
+ /* ->ia_uid & ->ia_gid */
+ CHANGELOG_FILL_INT32 (co, uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, gid, int32_fn, xtra_len);
+
+ } else if (attr & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+
+ /* ->ia_atime & ->ia_mtime, need usecs? */
+ /* both times are written regardless of which bit is set */
+ CHANGELOG_FILL_UINT32 (co,
+ stbuf->ia_atime, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co,
+ stbuf->ia_mtime, uint32_fn, xtra_len);
+ }
+
+ changelog_set_usable_record_and_length (local, xtra_len, used_count);
+
+ ret = 0;
+ frame->local = local;
+
+ changelog_update (this, priv, frame->local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+
+ return ret;
+}
+
+/**
+ * fsetattr() journal hook: forwards to _changelog_setattr_fill_common()
+ * using the GFID of the inode behind @fd. Returns that helper's result.
+ */
+int32_t
+changelog_replication_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid,
+ dict_t *xdata)
+{
+ return _changelog_setattr_fill_common (frame, this, valid,
+ stbuf, fd->inode->gfid, xdata);
+}
+
+/**
+ * setattr() journal hook: forwards to _changelog_setattr_fill_common()
+ * using the GFID of loc->inode. Returns that helper's result.
+ */
+int32_t
+changelog_replication_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ return _changelog_setattr_fill_common (frame, this, valid,
+ stbuf, loc->inode->gfid, xdata);
+}
+
+/**
+ * Journal an fremovexattr() under the replication policy.
+ *
+ * Four fields are written: <PRE> mark, IDX (local->lu.val), FOP number and
+ * the GFID of the inode behind @fd. The xattr @name itself is not
+ * recorded. When @xdata carries an int32 under "recon-xattr-opcode", that
+ * value is journaled as the FOP number instead of frame->root->op.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+int32_t
+changelog_replication_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+ int32_t xattr_op;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 4);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ /* xattr_op is only read when the lookup succeeded */
+ if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+ CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+ fop_fn, xtra_len);
+ else
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+ xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+
+ /* the count must match the number of slots filled above (4) */
+ changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Journal a removexattr() under the replication policy.
+ *
+ * Four fields are written: <PRE> mark, IDX (local->lu.val), FOP number and
+ * the GFID of loc->inode. The xattr @name itself is not recorded. When
+ * @xdata carries an int32 under "recon-xattr-opcode", that value is
+ * journaled as the FOP number instead of frame->root->op.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+int32_t
+changelog_replication_removexattr (call_frame_t *frame, xlator_t *this,
+                                   loc_t *loc, const char *name, dict_t *xdata)
+{
+        int                ret      = -1;
+        size_t             xtra_len = 0;
+        changelog_priv_t  *priv     = NULL;
+        changelog_opt_t   *co       = NULL;
+        changelog_local_t *local    = NULL;
+        int32_t            xattr_op = 0;
+
+        priv = this->private;
+
+        /* <PRE> + IDX + FOP + GFID */
+        CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 4);
+        if (!local)
+                goto out;
+
+        co = changelog_get_usable_buffer (local);
+        if (!co)
+                goto out;
+
+        if (changelog_fix_term(this, local, xdata) == _gf_false)
+                goto out;
+
+        CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+        co++;
+
+        /*
+         * IDX is written exactly once. Writing it twice would fill five
+         * slots although only four are reserved and declared for this
+         * record, and would shift FOP/GFID by one position.
+         */
+        CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+        co++;
+
+        /* xattr_op is only used when the lookup succeeded */
+        if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+                CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+                                           fop_fn, xtra_len);
+        else
+                CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+                                           xtra_len);
+        co++;
+
+        CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+
+        /* the count must match the number of slots filled above (4) */
+        changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+        frame->local = local;
+        ret = 0;
+
+        changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+        if (ret)
+                changelog_local_cleanup (this, local);
+        return ret;
+}
+
+/**
+ * Journal a setxattr() under the replication policy.
+ *
+ * Four fields are written: <PRE> mark, IDX (local->lu.val), FOP number and
+ * the GFID of loc->inode. Neither @dict (the xattrs) nor @flags is
+ * recorded here. When @xdata carries an int32 under "recon-xattr-opcode",
+ * that value is journaled as the FOP number instead of frame->root->op.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+int32_t
+changelog_replication_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ int32_t xattr_op;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 4);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ /* xattr_op is only read when the lookup succeeded */
+ if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+ CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+ fop_fn, xtra_len);
+ else
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+ xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+
+ /* the count must match the number of slots filled above (4) */
+ changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Journal an fsetxattr() under the replication policy.
+ *
+ * Four fields are written: <PRE> mark, IDX (local->lu.val), FOP number and
+ * the GFID of the inode behind @fd. Neither @dict (the xattrs) nor @flags
+ * is recorded here. When @xdata carries an int32 under
+ * "recon-xattr-opcode", that value is journaled as the FOP number instead
+ * of frame->root->op.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+int32_t
+changelog_replication_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ int32_t xattr_op;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 4);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ /* xattr_op is only read when the lookup succeeded */
+ if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+ CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+ fop_fn, xtra_len);
+ else
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+ xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+
+ /* the count must match the number of slots filled above (4) */
+ changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Journal a truncate() under the replication policy.
+ *
+ * Five fields are written: <PRE> mark, IDX (local->lu.val), FOP number,
+ * the GFID of loc->inode and the new @offset. The record is submitted as
+ * CHANGELOG_TYPE_DATA.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+int32_t
+changelog_replication_truncate (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + Offset */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 5);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* one slot per field; `co` advances after every fill but the last */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, offset, number_fn, xtra_len);
+
+ /* the count must match the number of slots filled above (5) */
+ changelog_set_usable_record_and_length (local, xtra_len, 5);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_DATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Journal an ftruncate() under the replication policy.
+ *
+ * Five fields are written: <PRE> mark, IDX (local->lu.val), FOP number,
+ * the GFID of the inode behind @fd and the new @offset. The record is
+ * submitted as CHANGELOG_TYPE_DATA.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+int32_t
+changelog_replication_ftruncate (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + Offset */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 5);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* one slot per field; `co` advances after every fill but the last */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, offset, number_fn, xtra_len);
+
+ /* the count must match the number of slots filled above (5) */
+ changelog_set_usable_record_and_length (local, xtra_len, 5);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_DATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/**
+ * Journal a writev() under the replication policy.
+ *
+ * Six fields are written: <PRE> mark, IDX (local->lu.val), FOP number, the
+ * GFID of the inode behind @fd, the write @offset and the total payload
+ * length (iov_length() over @vector/@count). The payload bytes, @flags and
+ * @iobref are not touched by this function.
+ *
+ * Returns 0 after the record has been passed to changelog_update() and
+ * frame->local has been set; -1 otherwise (local is cleaned up).
+ */
+int32_t
+changelog_replication_writev (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + Offset + Length */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 6);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /* one slot per field; `co` advances after every fill but the last */
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, offset, number_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, iov_length (vector, count),
+ number_fn, xtra_len);
+
+ /* the count must match the number of slots filled above (6) */
+ changelog_set_usable_record_and_length (local, xtra_len, 6);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_DATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* overridden COPS */
+/**
+ * "open" changelog operation for the replication policy.
+ *
+ * Builds a CHANGELOG_TYPE_ROLLOVER event for the journal called @name
+ * (no suffix, 512 MB preallocation, @last copied into crd_finale) and
+ * injects it through changelog_inject_single_event() together with a
+ * zeroed changelog_local_t. @cpriv is unused.
+ *
+ * NOTE(review): @name is copied with an unbounded strcpy(); this assumes
+ * it always fits crd_changelog_name -- confirm against the callers.
+ *
+ * Returns whatever changelog_inject_single_event() returns.
+ */
+int
+changelog_replication_cops_open (xlator_t *this,
+                                 changelog_priv_t *priv, void *cpriv,
+                                 char *name, gf_boolean_t last)
+{
+        changelog_log_data_t  event = {0,};
+        changelog_local_t     stub  = {0,};
+
+        event.cld_type = CHANGELOG_TYPE_ROLLOVER;
+
+        event.cld_roll.crd_finale        = last;
+        event.cld_roll.crd_use_suffix    = _gf_false;
+        /* preallocate 512 MB */
+        event.cld_roll.crd_prealloc_size = 1<<29;
+
+        (void) strcpy (event.cld_roll.crd_changelog_name, name);
+
+        stub.lu.val   = 0;
+        stub.nr_bytes = 0;
+
+        return changelog_inject_single_event (this, priv, &stub, &event);
+}
+
+/**
+ * NO-OP changelog write (from changelog.c). Records are journaled
+ * in the FOP path.
+ *
+ * All arguments are ignored; always returns 0.
+ */
+int
+changelog_replication_cops_write (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local,
+ changelog_log_type type)
+{
+ return 0;
+}
+
+/**
+ * no implicit rollover
+ *
+ * A rollover request is served by the same code as "open": the call is
+ * forwarded unchanged to changelog_replication_cops_open() and its
+ * return value is passed back.
+ */
+int
+changelog_replication_cops_rollover (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ char *name, gf_boolean_t last)
+{
+ return changelog_replication_cops_open (this, priv, cpriv, name, last);
+}
+
+/**
+ * Translate a record's position into a byte offset: the index in
+ * local->lu.val scaled by JOURNAL_SECTOR_SIZE, plus local->nr_bytes.
+ * Yields 0 when @local is NULL. @this, @priv and @cpriv are unused.
+ */
+off_t
+changelog_replication_cops_get_offset (xlator_t *this,
+                                       changelog_priv_t *priv, void *cpriv,
+                                       changelog_local_t *local)
+{
+        return local
+                ? (local->lu.val * JOURNAL_SECTOR_SIZE) + local->nr_bytes
+                : 0;
+}
+
+/**
+ * Account for @bytes more bytes on this record by adding them to
+ * local->nr_bytes. @local is dereferenced unconditionally; @this, @priv
+ * and @cpriv are unused.
+ */
+void
+changelog_replication_cops_set_offset (xlator_t *this,
+                                       changelog_priv_t *priv, void *cpriv,
+                                       changelog_local_t *local, off_t bytes)
+{
+        local->nr_bytes = local->nr_bytes + bytes;
+}
+
+/**
+ * Offset reset hook: deliberately does nothing for the replication
+ * policy. All arguments are ignored.
+ */
+void
+changelog_replication_cops_reset_offset (xlator_t *this, changelog_priv_t *priv,
+                                         void *cpriv, changelog_local_t *local)
+{
+        /* intentionally empty */
+}
+
+/**
+ * Initialise the replication logging policy in @cp.
+ *
+ * Allocates private copies of the FOP and changelog-operation tables,
+ * seeds them from changelog_default_fops / changelog_default_cops and
+ * then overrides the entries implemented in this file. Also allocates
+ * cp->cpriv (one off_t), resets the policy-related knobs in @priv and
+ * stores JOURNAL_NAME in cp->changelog_name.
+ *
+ * Returns 0 on success, -1 if any allocation fails (nothing allocated
+ * here is left behind in that case).
+ */
+int
+changelog_replication_policy_init (xlator_t *this,
+                                   changelog_priv_t *priv,
+                                   struct changelog_logpolicy *cp)
+{
+        struct xlator_fops    *fops = NULL;
+        struct changelog_ops  *cops = NULL;
+
+        fops = GF_CALLOC (1, sizeof (struct xlator_fops),
+                          gf_changelog_mt_fop_policy_t);
+        if (!fops)
+                goto err;
+
+        cops = GF_CALLOC (1, sizeof (struct changelog_ops),
+                          gf_changelog_mt_fop_policy_t);
+        if (!cops)
+                goto free_fops;
+
+        cp->cpriv = GF_CALLOC (1, sizeof (off_t),
+                               gf_changelog_mt_fop_policy_t);
+        if (!cp->cpriv)
+                goto free_cops;
+
+        /* no roll-over, one big fat journal per term */
+        priv->rollover_time = 0;
+        /* fsync() is internally triggered by NSR */
+        priv->fsync_interval = 0;
+        /* no record header: extra data (via iobufs) are always persisted */
+        priv->no_gfid_hdr = _gf_true;
+        priv->lockless_update = _gf_false;
+        priv->term = 0;
+
+        /* journal name: zero the whole buffer, then copy the bare name */
+        (void) memset (cp->changelog_name, '\0', PATH_MAX);
+        memcpy(cp->changelog_name, JOURNAL_NAME, strlen(JOURNAL_NAME));
+#if 0
+        (void) snprintf (cp->changelog_name, PATH_MAX,
+                         JOURNAL_NAME, priv->term);
+#endif
+
+        /* start from the defaults ... */
+        memcpy (fops, &changelog_default_fops, sizeof (struct xlator_fops));
+        memcpy (cops, &changelog_default_cops, sizeof (struct changelog_ops));
+
+        /* ... then overload all journaled fops: data */
+        fops->writev       = changelog_replication_writev;
+        fops->ftruncate    = changelog_replication_ftruncate;
+        fops->truncate     = changelog_replication_truncate;
+
+        /* metadata */
+        fops->fsetxattr    = changelog_replication_fsetxattr;
+        fops->setxattr     = changelog_replication_setxattr;
+        fops->removexattr  = changelog_replication_removexattr;
+        fops->fremovexattr = changelog_replication_fremovexattr;
+        fops->setattr      = changelog_replication_setattr;
+        fops->fsetattr     = changelog_replication_fsetattr;
+
+        /* entry */
+        fops->create       = changelog_replication_create;
+        fops->mknod        = changelog_replication_mknod;
+        fops->symlink      = changelog_replication_symlink;
+        fops->mkdir        = changelog_replication_mkdir;
+        fops->link         = changelog_replication_link;
+        fops->rename       = changelog_replication_rename;
+        fops->unlink       = changelog_replication_unlink;
+        fops->rmdir        = changelog_replication_rmdir;
+
+        /* overload cops */
+        cops->open         = changelog_replication_cops_open;
+        cops->write        = changelog_replication_cops_write;
+        cops->rollover     = changelog_replication_cops_rollover;
+        cops->get_offset   = changelog_replication_cops_get_offset;
+        cops->set_offset   = changelog_replication_cops_set_offset;
+        cops->reset_offset = changelog_replication_cops_reset_offset;
+
+        cp->fops = fops;
+        cp->cops = cops;
+
+        return 0;
+
+ free_cops:
+        GF_FREE (cops);
+ free_fops:
+        GF_FREE (fops);
+ err:
+        return -1;
+}
+
+/**
+ * Tear down the replication logging policy: releases the three
+ * allocations held by @cp (cpriv, cops, fops). @this is unused.
+ * Always returns 0.
+ */
+int
+changelog_replication_policy_fini (xlator_t *this,
+                                   struct changelog_logpolicy *cp)
+{
+        GF_FREE (cp->cpriv);
+        GF_FREE (cp->cops);
+        GF_FREE (cp->fops);
+
+        return 0;
+}
diff --git a/xlators/features/changelog/src/policy/changelog-policy.h b/xlators/features/changelog/src/policy/changelog-policy.h
new file mode 100644
index 000000000..73fdc1a98
--- /dev/null
+++ b/xlators/features/changelog/src/policy/changelog-policy.h
@@ -0,0 +1,41 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_POLICY_H
+#define _CHANGELOG_POLICY_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "changelog-mem-types.h"
+#include "changelog-helpers.h"
+
+/* Default logging policy: init/fini pair (defined outside this header). */
+int
+changelog_default_policy_init (xlator_t *this,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *);
+int
+changelog_default_policy_fini (xlator_t *this,
+ struct changelog_logpolicy *);
+/*
+ * Replication logging policy. init fills @cp with privately allocated
+ * fops/cops tables and cpriv, returning 0 on success and -1 on allocation
+ * failure; fini frees those allocations and returns 0.
+ */
+int
+changelog_replication_policy_init (xlator_t *this,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *cp);
+int
+changelog_replication_policy_fini (xlator_t *this,
+ struct changelog_logpolicy *cp);
+
+#endif /* _CHANGELOG_POLICY_H */
diff --git a/xlators/features/compress/Makefile.am b/xlators/features/compress/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/features/compress/Makefile.am
@@ -0,0 +1,3 @@
+# The translator sources live in src/; nothing is built at this level.
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/compress/src/Makefile.am b/xlators/features/compress/src/Makefile.am
new file mode 100644
index 000000000..263b21b78
--- /dev/null
+++ b/xlators/features/compress/src/Makefile.am
@@ -0,0 +1,17 @@
+# Builds the cdc (compression) translator as a libtool module.
+xlator_LTLIBRARIES = cdc.la
+
+# install location of the module
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+noinst_HEADERS = cdc.h cdc-mem-types.h
+
+# NOTE(review): $(LIBZ_LIBS) is passed through LDFLAGS here; automake
+# convention is to list libraries in LIBADD -- confirm this is intended.
+cdc_la_LDFLAGS = -module -avoid-version $(LIBZ_LIBS)
+
+cdc_la_SOURCES = cdc.c cdc-helper.c
+cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+# NOTE(review): -fPIC and -shared are compiler/linker flags but are set via
+# AM_CPPFLAGS here -- confirm this is intended.
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \
+-shared $(LIBZ_CFLAGS)
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/compress/src/cdc-helper.c b/xlators/features/compress/src/cdc-helper.c
new file mode 100644
index 000000000..54432ff45
--- /dev/null
+++ b/xlators/features/compress/src/cdc-helper.c
@@ -0,0 +1,547 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+
+#include "cdc.h"
+#include "cdc-mem-types.h"
+
+#ifdef HAVE_LIB_Z
+#include "zlib.h"
+#endif
+
+#ifdef HAVE_LIB_Z
+/* gzip header looks something like this
+ * (RFC 1952; RFC 1950 describes the zlib wrapper, not gzip)
+ *
+ * +---+---+---+---+---+---+---+---+---+---+
+ * |ID1|ID2|CM |FLG| MTIME |XFL|OS |
+ * +---+---+---+---+---+---+---+---+---+---+
+ *
+ * Data is usually sent without this header i.e
+ * Data sent = <compressed-data> + trailer(8)
+ * The trailer contains the checksum.
+ *
+ * gzip_header is added only during debugging.
+ * Refer to the function cdc_dump_iovec_to_disk
+ *
+ * Bytes below: ID1/ID2 magic (\037 \213), CM = Z_DEFLATED, FLG = 0,
+ * MTIME = 0 (4 bytes), XFL = 0, OS = GF_CDC_OS_ID.
+ */
+static const char gzip_header[10] =
+ {
+ '\037', '\213', Z_DEFLATED, 0,
+ 0, 0, 0, 0,
+ 0, GF_CDC_OS_ID
+ };
+
+/**
+ * Claim the next output iovec slot by bumping ci->ncount.
+ *
+ * Returns 0 when a slot is available, -1 (after logging) once the count
+ * reaches MAX_IOVEC. The counter is incremented in both cases.
+ */
+static int32_t
+cdc_next_iovec (xlator_t *this, cdc_info_t *ci)
+{
+        ci->ncount++;
+
+        if (ci->ncount != MAX_IOVEC)
+                return 0;
+
+        /* iovec overflow -- should not happen */
+        gf_log (this->name, GF_LOG_ERROR,
+                "Zlib output buffer overflow"
+                " ->ncount (%d) | ->MAX_IOVEC (%d)",
+                ci->ncount, MAX_IOVEC);
+
+        return -1;
+}
+
+/**
+ * Store the low 32 bits of @x into string[0..3], least significant
+ * byte first.
+ */
+static void
+cdc_put_long (unsigned char *string, unsigned long x)
+{
+        int byte = 0;
+
+        for (byte = 0; byte < 4; byte++)
+                string[byte] = (unsigned char) ((x >> (8 * byte)) & 0xff);
+}
+
+/**
+ * Read four bytes from @buf, least significant byte first, and return
+ * them as an unsigned long.
+ */
+static unsigned long
+cdc_get_long (unsigned char *buf)
+{
+        unsigned long value = 0;
+        int           byte  = 0;
+
+        for (byte = 3; byte >= 0; byte--)
+                value = (value << 8) | (unsigned long) buf[byte];
+
+        return value;
+}
+
+/**
+ * Append the 8-byte validation trailer as a new output iovec.
+ *
+ * The trailer is GF_CDC_VALIDATION_SIZE bytes from GF_CALLOC: bytes 0-3
+ * hold ci->crc, bytes 4-7 hold ci->stream.total_in, both least
+ * significant byte first (see cdc_put_long()). @priv is unused.
+ *
+ * Returns 0 on success, non-zero when no iovec slot is left or the
+ * trailer buffer cannot be allocated.
+ */
+static int32_t
+cdc_init_gzip_trailer (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci)
+{
+        int   ret = -1;
+        char *buf = NULL;
+
+        ret = cdc_next_iovec (this, ci);
+        if (ret)
+                goto out;
+
+        buf = (char *) GF_CALLOC (1, GF_CDC_VALIDATION_SIZE,
+                                  gf_cdc_mt_gzip_trailer_t);
+        CURR_VEC(ci).iov_base = buf;
+
+        if (!buf) {
+                /*
+                 * ret is still 0 from cdc_next_iovec(); without this the
+                 * allocation failure would be reported as success and the
+                 * caller would go on with a NULL trailer.
+                 */
+                ret = -1;
+                goto out;
+        }
+
+        CURR_VEC(ci).iov_len = GF_CDC_VALIDATION_SIZE;
+
+        cdc_put_long ((unsigned char *)&buf[0], ci->crc);
+        cdc_put_long ((unsigned char *)&buf[4], ci->stream.total_in);
+
+        ret = 0;
+
+ out:
+        return ret;
+}
+
+/**
+ * Claim the next output iovec and back it with a fresh iobuf.
+ *
+ * The buffer is @size bytes, or ci->buffer_size when @size is 0. The
+ * iobuf is added to ci->iobref and the current iovec is pointed at it.
+ * The reference returned by iobuf_get2() is kept on success (it is not
+ * dropped here). @priv is unused.
+ *
+ * Returns 0 on success, non-zero when no iovec slot is left, the iobuf
+ * cannot be obtained, or it cannot be added to the iobref.
+ */
+static int32_t
+cdc_alloc_iobuf_and_init_vec (xlator_t *this,
+                              cdc_priv_t *priv, cdc_info_t *ci,
+                              int size)
+{
+        int           ret       = -1;
+        int           alloc_len = 0;
+        struct iobuf *iobuf     = NULL;
+
+        ret = cdc_next_iovec (this, ci);
+        if (ret)
+                goto out;
+
+        alloc_len = size ? size : ci->buffer_size;
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, alloc_len);
+        if (!iobuf) {
+                /*
+                 * ret is still 0 from cdc_next_iovec(); report the failure
+                 * instead of returning success with an unset iovec.
+                 */
+                ret = -1;
+                goto out;
+        }
+
+        ret = iobref_add (ci->iobref, iobuf);
+        if (ret) {
+                /* nobody else knows about this iobuf: drop our reference */
+                iobuf_unref (iobuf);
+                goto out;
+        }
+
+        /* Initialize this iovec */
+        CURR_VEC(ci).iov_base = iobuf->ptr;
+        CURR_VEC(ci).iov_len = alloc_len;
+
+        ret = 0;
+
+ out:
+        return ret;
+}
+
+/**
+ * Point the zlib output cursor at the current iovec's buffer. The space
+ * announced to zlib is @size, or ci->buffer_size when @size is 0.
+ * @priv is unused.
+ */
+static void
+cdc_init_zlib_output_stream (cdc_priv_t *priv, cdc_info_t *ci, int size)
+{
+        if (size)
+                ci->stream.avail_out = size;
+        else
+                ci->stream.avail_out = ci->buffer_size;
+
+        ci->stream.next_out = (unsigned char *) CURR_VEC(ci).iov_base;
+}
+
+/* This routine is for testing and debugging only.
+ * Data written = header(10) + <compressed-data> + trailer(8)
+ * So each gzip dump file is at least 18 bytes in size.
+ *
+ * Writes gzip_header followed by the first ci->ncount output iovecs to
+ * @file (created/truncated). The dump stops at the first write() error,
+ * which is logged; the number of bytes actually written is logged at
+ * debug level otherwise.
+ */
+void
+cdc_dump_iovec_to_disk (xlator_t *this, cdc_info_t *ci, const char *file)
+{
+        int     i             = 0;
+        int     fd            = -1;
+        ssize_t written       = 0;
+        size_t  total_written = 0;
+
+        fd = open (file, O_WRONLY|O_CREAT|O_TRUNC, 0777 );
+        if (fd < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Cannot open file: %s", file);
+                return;
+        }
+
+        /* write() returns -1 on error; never fold that into the total */
+        written = write (fd, (char *) gzip_header, sizeof (gzip_header));
+        if (written < 0)
+                goto write_failed;
+        total_written += (size_t) written;
+
+        for (i = 0; i < ci->ncount; i++) {
+                written = write (fd, (char *) ci->vec[i].iov_base,
+                                 ci->vec[i].iov_len);
+                if (written < 0)
+                        goto write_failed;
+                total_written += (size_t) written;
+        }
+
+        /* report the file that was actually written */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "dump'd %zu bytes to %s", total_written, file);
+
+        close (fd);
+        return;
+
+ write_failed:
+        gf_log (this->name, GF_LOG_ERROR,
+                "Cannot write to file: %s", file);
+        close (fd);
+}
+
+/**
+ * Drain whatever zlib still holds by calling @libz_func (deflate or
+ * inflate) with @flush until it has nothing more to emit.
+ *
+ * Each pass: if the current output buffer holds data, its iovec length is
+ * fixed up and a fresh iobuf/iovec is attached for further output. Once
+ * the previous call signalled completion, the trailing iovec is dropped
+ * (ncount--) and the loop ends.
+ *
+ * Returns the last zlib status (Z_OK / Z_STREAM_END on success),
+ * Z_MEM_ERROR when a new output buffer cannot be set up, or the zlib
+ * error that stopped the loop. Z_BUF_ERROR is mapped to Z_OK.
+ */
+static int32_t
+cdc_flush_libz_buffer (cdc_priv_t *priv, xlator_t *this, cdc_info_t *ci,
+ int (*libz_func)(z_streamp, int),
+ int flush)
+{
+ int32_t ret = Z_OK;
+ int done = 0;
+ unsigned int deflate_len = 0;
+
+ for (;;) {
+ /* bytes zlib has produced in the current output buffer;
+  * assumes that buffer was announced with ci->buffer_size */
+ deflate_len = ci->buffer_size - ci->stream.avail_out;
+
+ if (deflate_len != 0) {
+ CURR_VEC(ci).iov_len = deflate_len;
+
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret) {
+ ret = Z_MEM_ERROR;
+ break;
+ }
+
+ /* Re-position Zlib output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+ }
+
+ if (done) {
+ /* the current iovec is empty: give the slot back */
+ ci->ncount--;
+ break;
+ }
+
+ ret = libz_func (&ci->stream, flush);
+
+ /* treated as "nothing left to flush", not as an error */
+ if (ret == Z_BUF_ERROR) {
+ ret = Z_OK;
+ ci->ncount--;
+ break;
+ }
+
+ /* finished when zlib left room in the buffer or hit stream end */
+ done = (ci->stream.avail_out != 0 || ret == Z_STREAM_END);
+
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+ }
+
+ return ret;
+}
+
+/**
+ * Deflate one input iovec (@vec) into the output iovecs of @ci.
+ *
+ * Initialises the zlib stream from @priv (level, window size, memory
+ * level), attaches a first output buffer, folds @vec into ci->crc and
+ * feeds it to deflate() with Z_NO_FLUSH, attaching another output buffer
+ * whenever the current one is full. The stream is not finished here.
+ *
+ * Returns Z_OK (0) when all input was consumed, otherwise the failing
+ * zlib status or the non-zero result of the buffer allocation.
+ *
+ * NOTE(review): deflateInit2() runs on every call, and cdc_compress()
+ * calls this once per input iovec -- confirm that re-initialising the
+ * stream for requests with more than one iovec is intended.
+ */
+static int32_t
+do_cdc_compress (struct iovec *vec, xlator_t *this, cdc_priv_t *priv,
+ cdc_info_t *ci)
+{
+ int ret = -1;
+
+ /* Initialize deflate */
+ ret = deflateInit2 (&ci->stream, priv->cdc_level, Z_DEFLATED,
+ priv->window_size, priv->mem_level,
+ Z_DEFAULT_STRATEGY);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unable to init Zlib (retval: %d)", ret);
+ goto out;
+ }
+
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret)
+ goto out;
+
+ /* setup output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+
+ /* setup input buffer */
+ ci->stream.next_in = (unsigned char *) vec->iov_base;
+ ci->stream.avail_in = vec->iov_len;
+
+ /* running CRC of the uncompressed input; goes into the trailer */
+ ci->crc = crc32 (ci->crc, (const Bytef *) vec->iov_base, vec->iov_len);
+
+ gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%d buffer_size=%d",
+ ci->crc, ci->stream.avail_in, ci->buffer_size);
+
+ /* compress !! */
+ while (ci->stream.avail_in != 0) {
+ if (ci->stream.avail_out == 0) {
+
+ /* current buffer is full: record that, move to a new one */
+ CURR_VEC(ci).iov_len = ci->buffer_size;
+
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret)
+ break;
+
+ /* Re-position Zlib output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+ }
+
+ ret = deflate (&ci->stream, Z_NO_FLUSH);
+ if (ret != Z_OK)
+ break;
+ }
+
+ out:
+ return ret;
+}
+
+/**
+ * Compress the request described by @ci (ci->vector / ci->count) into
+ * ci->vec / ci->ncount and tag *@xdata so the peer can recognise it.
+ *
+ * Steps: create ci->iobref, create *@xdata if the caller passed none,
+ * deflate every input iovec, finish the stream (Z_FINISH), append the
+ * CRC/length trailer, set ci->nbytes and finally set the canary key
+ * GF_CDC_DEFLATE_CANARY_VAL in *@xdata.
+ *
+ * Returns 0 on success and non-zero on any failure, including failure to
+ * set the canary key. deflateEnd() is called on every path that got as
+ * far as the compression loop.
+ */
+int32_t
+cdc_compress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci,
+ dict_t **xdata)
+{
+ int ret = -1;
+ int i = 0;
+
+ ci->iobref = iobref_new ();
+ if (!ci->iobref)
+ goto out;
+
+ if (!*xdata) {
+ *xdata = dict_new ();
+ if (!*xdata) {
+ gf_log (this->name, GF_LOG_ERROR, "Cannot allocate xdata"
+ " dict");
+ goto out;
+ }
+ }
+
+ /* data */
+ for (i = 0; i < ci->count; i++) {
+ ret = do_cdc_compress (&ci->vector[i], this, priv, ci);
+ if (ret != Z_OK)
+ goto deflate_cleanup_out;
+ }
+
+ /* flush zlib buffer */
+ ret = cdc_flush_libz_buffer (priv, this, ci, deflate, Z_FINISH);
+ if (!(ret == Z_OK || ret == Z_STREAM_END)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Compression Error: ret (%d)", ret);
+ ret = -1;
+ goto deflate_cleanup_out;
+ }
+
+ /* trailer */
+ ret = cdc_init_gzip_trailer (this, priv, ci);
+ if (ret)
+ goto deflate_cleanup_out;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Compressed %ld to %ld bytes",
+ ci->stream.total_in, ci->stream.total_out);
+
+ /* payload size on the wire: deflated bytes plus the trailer */
+ ci->nbytes = ci->stream.total_out + GF_CDC_VALIDATION_SIZE;
+
+ /* set deflated canary value for identification */
+ ret = dict_set_int32 (*xdata, GF_CDC_DEFLATE_CANARY_VAL, 1);
+ if (ret) {
+ /* Send uncompressed data if we can't _tell_ the client
+ * that deflated data is on its way. So, we just log
+ * the failure and continue as usual.
+ * (ret stays non-zero, so the caller sees a failure.)
+ */
+ gf_log (this->name, GF_LOG_ERROR,
+ "Data deflated, but could not set canary"
+ " value in dict for identification");
+ }
+
+ /* This is to be used in testing */
+ if ( priv->debug ) {
+ cdc_dump_iovec_to_disk (this, ci, GF_CDC_DEBUG_DUMP_FILE );
+ }
+
+ deflate_cleanup_out:
+ (void) deflateEnd(&ci->stream);
+
+ out:
+ return ret;
+}
+
+
+/* Deflated content is recognised by the presence of the canary key
+ * GF_CDC_DEFLATE_CANARY_VAL in @xdata.
+ *
+ * Returns -1 when the key is present (content is deflated) and 0 when
+ * it is absent.
+ */
+static int32_t
+cdc_check_content_for_deflate (dict_t *xdata)
+{
+        if (dict_get (xdata, GF_CDC_DEFLATE_CANARY_VAL))
+                return -1;
+
+        return 0;
+}
+
+/* CRC stored in the first four bytes of the validation trailer. */
+static unsigned long
+cdc_extract_crc (char *trailer)
+{
+        unsigned char *crc_field = (unsigned char *) trailer;
+
+        return cdc_get_long (crc_field);
+}
+
+/* Length stored in bytes 4-7 of the validation trailer. */
+static unsigned long
+cdc_extract_size (char *trailer)
+{
+        unsigned char *size_field = (unsigned char *) (trailer + 4);
+
+        return cdc_get_long (size_field);
+}
+
+/**
+ * Compare the trailer values (@crc, @len) with what inflation produced:
+ * ci->crc and the inflated length zlib keeps in ci->stream.total_out.
+ *
+ * Returns 0 when both match, 1 when either differs.
+ */
+static int32_t
+cdc_validate_inflate (cdc_info_t *ci, unsigned long crc,
+                      unsigned long len)
+{
+        return (crc != ci->crc) || (len != ci->stream.total_out);
+}
+
+/**
+ * Inflate the single input iovec of @ci into ci->vec / ci->ncount and
+ * validate the result against the trailer.
+ *
+ * The input is <deflated-data> + trailer, where the trailer is the last
+ * GF_CDC_VALIDATION_SIZE bytes: CRC and length of the original data.
+ * After inflation the CRC of the output is recomputed and compared, along
+ * with the inflated length.
+ *
+ * Returns 0 on success; non-zero when zlib cannot be initialised, the
+ * input is too short to contain a trailer, an output buffer cannot be
+ * set up, zlib reports an error, or CRC/length validation fails. The
+ * caller is expected to call inflateEnd().
+ */
+static int32_t
+do_cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci)
+{
+        int            ret          = -1;
+        int            i            = 0;
+        int            len          = 0;
+        char          *inflte       = NULL;
+        char          *trailer      = NULL;
+        struct iovec   vec          = {0,};
+        unsigned long  computed_crc = 0;
+        unsigned long  computed_len = 0;
+
+        ret = inflateInit2 (&ci->stream, priv->window_size);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Zlib: Unable to initialize inflate");
+                goto out;
+        }
+
+        vec = THIS_VEC(ci, 0);
+
+        /*
+         * The payload comes off the wire: make sure it can hold a trailer
+         * before computing a pointer relative to its end.
+         */
+        if (!vec.iov_base || vec.iov_len < GF_CDC_VALIDATION_SIZE) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Deflated data too short to hold the trailer");
+                ret = -1;
+                goto out;
+        }
+
+        trailer = (char *) (((char *) vec.iov_base) + vec.iov_len
+                            - GF_CDC_VALIDATION_SIZE);
+
+        /* CRC of uncompressed data */
+        computed_crc = cdc_extract_crc (trailer);
+
+        /* size of uncompressed data */
+        computed_len = cdc_extract_size (trailer);
+
+        gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%lu buffer_size=%d",
+                computed_crc, computed_len, ci->buffer_size);
+
+        inflte = vec.iov_base ;
+        len = vec.iov_len - GF_CDC_VALIDATION_SIZE;
+
+        /* first output buffer (ci->buffer_size bytes) */
+        ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+        if (ret)
+                goto out;
+
+        /* setup output buffer */
+        cdc_init_zlib_output_stream (priv, ci, 0);
+
+        /* setup input buffer */
+        ci->stream.next_in = (unsigned char *) inflte;
+        ci->stream.avail_in = len;
+
+        while (ci->stream.avail_in != 0) {
+                if (ci->stream.avail_out == 0) {
+                        CURR_VEC(ci).iov_len = ci->buffer_size;
+
+                        ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+                        if (ret)
+                                break;
+
+                        /* Re-position Zlib output buffer */
+                        cdc_init_zlib_output_stream (priv, ci, 0);
+                }
+
+                ret = inflate (&ci->stream, Z_NO_FLUSH);
+                /*
+                 * Stop on anything but Z_OK. Z_STREAM_END, Z_DATA_ERROR,
+                 * Z_NEED_DICT and Z_MEM_ERROR consume no further input,
+                 * so looping on avail_in alone would never terminate.
+                 */
+                if (ret != Z_OK)
+                        break;
+        }
+
+        if (ret != Z_OK && ret != Z_STREAM_END) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Decompression Error: ret (%d)", ret);
+                ret = -1;
+                goto out;
+        }
+
+        /* flush zlib buffer */
+        ret = cdc_flush_libz_buffer (priv, this, ci, inflate, Z_SYNC_FLUSH);
+        if (!(ret == Z_OK || ret == Z_STREAM_END)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Decompression Error: ret (%d)", ret);
+                ret = -1;
+                goto out;
+        }
+
+        /* compute CRC of the uncompressed data to check for
+         * correctness */
+
+        for (i = 0; i < ci->ncount; i++) {
+                ci->crc = crc32 (ci->crc,
+                                 (const Bytef *) ci->vec[i].iov_base,
+                                 ci->vec[i].iov_len);
+        }
+
+        /* validate inflated data */
+        ret = cdc_validate_inflate (ci, computed_crc, computed_len);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Checksum or length mismatched in inflated data");
+        }
+
+ out:
+        return ret;
+}
+
+/**
+ * Inflate the reply described by @ci if @xdata marks it as deflated.
+ *
+ * Returns 0 when the data was inflated and validated (ci->nbytes then
+ * holds the inflated size). Returns non-zero in every other case: the
+ * canary key is absent (content passes through untouched), the iobref
+ * cannot be created, the request has more than one iovec, or inflation /
+ * validation fails.
+ */
+int32_t
+cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci,
+ dict_t *xdata)
+{
+ int32_t ret = -1;
+
+ /* check for deflate content */
+ if (!cdc_check_content_for_deflate (xdata)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Content not deflated, passing through ...");
+ goto passthrough_out;
+ }
+
+ ci->iobref = iobref_new ();
+ if (!ci->iobref)
+ goto passthrough_out;
+
+ /* do we need to do this? can we assume that one iovec
+ * will hold per request data every time?
+ *
+ * server/client protocol seems to deal with a single
+ * iovec even if op_ret > 1M. So, it looks ok to
+ * assume that a single iovec will contain all the
+ * data (This saves us a lot from finding the trailer
+ * and the data since it could have been split-up onto
+ * two adjacent iovec's.
+ *
+ * But, in case this translator is loaded above quick-read
+ * for some reason, then it's entirely possible that we get
+ * multiple iovec's...
+ *
+ * This case (handled below) is not tested. (by loading the
+ * xlator below quick-read)
+ */
+
+ /* @@ I_HOPE_THIS_IS_NEVER_HIT */
+ /* NOTE(review): this path reaches inflateEnd() below although
+  * inflateInit2() was never called on ci->stream -- confirm that
+  * is harmless for the stream state the callers pass in. */
+ if (ci->count > 1) {
+ gf_log (this->name, GF_LOG_WARNING, "unable to handle"
+ " multiple iovecs (%d in number)", ci->count);
+ goto inflate_cleanup_out;
+ /* TODO: collate all iovecs in one */
+ }
+
+ ret = do_cdc_decompress (this, priv, ci);
+ if (ret)
+ goto inflate_cleanup_out;
+
+ ci->nbytes = ci->stream.total_out;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Inflated %ld to %ld bytes",
+ ci->stream.total_in, ci->stream.total_out);
+
+ inflate_cleanup_out:
+ (void) inflateEnd (&ci->stream);
+
+ passthrough_out:
+ return ret;
+}
+
+#endif
diff --git a/xlators/features/compress/src/cdc-mem-types.h b/xlators/features/compress/src/cdc-mem-types.h
new file mode 100644
index 000000000..ead2c70ba
--- /dev/null
+++ b/xlators/features/compress/src/cdc-mem-types.h
@@ -0,0 +1,23 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CDC_MEM_TYPES_H
+#define __CDC_MEM_TYPES_H
+
+#include "mem-types.h"
+
+/* Memory-accounting type ids of the cdc translator; they continue right
+ * after the common ids and gf_cdc_mt_end is the count handed to
+ * xlator_mem_acct_init(). */
+enum gf_cdc_mem_types {
+        gf_cdc_mt_priv_t = gf_common_mt_end + 1,
+        gf_cdc_mt_vec_t,
+        gf_cdc_mt_gzip_trailer_t,
+        gf_cdc_mt_end
+};
+
+#endif
diff --git a/xlators/features/compress/src/cdc.c b/xlators/features/compress/src/cdc.c
new file mode 100644
index 000000000..67fc52505
--- /dev/null
+++ b/xlators/features/compress/src/cdc.c
@@ -0,0 +1,361 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <sys/uio.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "cdc.h"
+#include "cdc-mem-types.h"
+
+/* Clear the iobref attached to @ci; asserts that one is attached. */
+static void
+cdc_cleanup_iobref (cdc_info_t *ci)
+{
+        struct iobref *held = ci->iobref;
+
+        assert(held != NULL);
+        iobref_clear (held);
+}
+
+/*
+ * readv callback: in server mode the read data is compressed, in client mode
+ * it is decompressed. Failed or empty reads, replies smaller than the
+ * configured minimum size, and any (de)compression failure are unwound
+ * unchanged.
+ */
+int32_t
+cdc_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno,
+               struct iovec *vector, int32_t count,
+               struct iatt *stbuf, struct iobref *iobref,
+               dict_t *xdata)
+{
+        cdc_info_t  ci   = {0,};
+        cdc_priv_t *priv = NULL;
+        int         ret  = -1;
+
+        GF_VALIDATE_OR_GOTO ("cdc", this, default_out);
+        GF_VALIDATE_OR_GOTO (this->name, frame, default_out);
+
+        priv = this->private;
+
+        /* nothing to transform on error or on an empty read */
+        if (op_ret <= 0)
+                goto default_out;
+
+        /* honour the minimum-size threshold when one is configured */
+        if ((priv->min_file_size != 0) && (op_ret < priv->min_file_size))
+                goto default_out;
+
+        /* describe the input and reset the output bookkeeping */
+        ci.vector      = vector;
+        ci.count       = count;
+        ci.ibytes      = op_ret;
+        ci.buf         = NULL;
+        ci.iobref      = NULL;
+        ci.ncount      = 0;
+        ci.crc         = 0;
+        ci.buffer_size = GF_CDC_DEF_BUFFERSIZE;
+
+        /* A readv compresses on the server side and decompresses on the
+         * client side */
+        switch (priv->op_mode) {
+        case GF_CDC_MODE_SERVER:
+                ret = cdc_compress (this, priv, &ci, &xdata);
+                break;
+        case GF_CDC_MODE_CLIENT:
+                ret = cdc_decompress (this, priv, &ci, xdata);
+                break;
+        default:
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Invalid operation mode (%d)", priv->op_mode);
+                break;
+        }
+
+        if (ret)
+                goto default_out;
+
+        /* NOTE(review): the transformed vectors are unwound together with the
+         * caller's iobref, not ci.iobref -- confirm that is intended. */
+        STACK_UNWIND_STRICT (readv, frame, ci.nbytes, op_errno,
+                             ci.vec, ci.ncount, stbuf, iobref,
+                             xdata);
+        cdc_cleanup_iobref (&ci);
+        return 0;
+
+ default_out:
+        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
+                             vector, count, stbuf, iobref, xdata);
+        return 0;
+}
+
+/*
+ * Wind readv to the first child. When built with zlib the reply goes through
+ * cdc_readv_cbk; otherwise the default readv callback is used.
+ */
+int32_t
+cdc_readv (call_frame_t *frame, xlator_t *this,
+           fd_t *fd, size_t size, off_t offset, uint32_t flags,
+           dict_t *xdata)
+{
+#ifdef HAVE_LIB_Z
+        fop_readv_cbk_t cbk = cdc_readv_cbk;
+#else
+        fop_readv_cbk_t cbk = default_readv_cbk;
+#endif
+
+        STACK_WIND (frame, cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->readv,
+                    fd, size, offset, flags, xdata);
+        return 0;
+}
+
+/* writev callback: hands the child's reply to the parent unmodified. */
+int32_t
+cdc_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno,
+                struct iatt *prebuf, struct iatt *postbuf,
+                dict_t *xdata)
+{
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+                             prebuf, postbuf, xdata);
+
+        return 0;
+}
+
+/*
+ * writev: in client mode the payload is compressed, in server mode it is
+ * decompressed, and the result is wound to the first child. Empty writes,
+ * writes smaller than the configured minimum size, and any (de)compression
+ * failure are wound unchanged.
+ */
+int32_t
+cdc_writev (call_frame_t *frame,
+            xlator_t *this,
+            fd_t *fd,
+            struct iovec *vector,
+            int32_t count,
+            off_t offset,
+            uint32_t flags,
+            struct iobref *iobref, dict_t *xdata)
+{
+        cdc_info_t  ci    = {0,};
+        cdc_priv_t *priv  = NULL;
+        size_t      isize = 0;
+        int         ret   = -1;
+
+        GF_VALIDATE_OR_GOTO ("cdc", this, default_out);
+        GF_VALIDATE_OR_GOTO (this->name, frame, default_out);
+
+        priv  = this->private;
+        isize = iov_length(vector, count);
+
+        /* isize is unsigned, so "nothing to write" means exactly zero */
+        if (isize == 0)
+                goto default_out;
+
+        if ((priv->min_file_size != 0) && (isize < priv->min_file_size))
+                goto default_out;
+
+        /* describe the input and reset the output bookkeeping */
+        ci.vector      = vector;
+        ci.count       = count;
+        ci.ibytes      = isize;
+        ci.buf         = NULL;
+        ci.iobref      = NULL;
+        ci.ncount      = 0;
+        ci.crc         = 0;
+        ci.buffer_size = GF_CDC_DEF_BUFFERSIZE;
+
+        /* A writev compresses on the client side and decompresses on the
+         * server side */
+        switch (priv->op_mode) {
+        case GF_CDC_MODE_CLIENT:
+                ret = cdc_compress (this, priv, &ci, &xdata);
+                break;
+        case GF_CDC_MODE_SERVER:
+                ret = cdc_decompress (this, priv, &ci, xdata);
+                break;
+        default:
+                gf_log (this->name, GF_LOG_ERROR, "Invalid operation mode (%d) ", priv->op_mode);
+                break;
+        }
+
+        if (ret)
+                goto default_out;
+
+        /* NOTE(review): the transformed vectors are wound together with the
+         * caller's iobref, not ci.iobref -- confirm that is intended. */
+        STACK_WIND (frame,
+                    cdc_writev_cbk,
+                    FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->writev,
+                    fd, ci.vec, ci.ncount, offset, flags,
+                    iobref, xdata);
+
+        cdc_cleanup_iobref (&ci);
+        return 0;
+
+ default_out:
+        STACK_WIND (frame,
+                    cdc_writev_cbk,
+                    FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->writev,
+                    fd, vector, count, offset, flags,
+                    iobref, xdata);
+        return 0;
+}
+
+/*
+ * Register the cdc memory-accounting types (gf_cdc_mt_end of them) for
+ * @this. Returns -1 when @this is NULL, otherwise whatever
+ * xlator_mem_acct_init() returned; a failure is logged.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int ret = -1;
+
+        if (!this)
+                return ret;
+
+        ret = xlator_mem_acct_init (this, gf_cdc_mt_end);
+        if (ret != 0) {
+                /* the two adjacent literals used to read "initfailed" */
+                gf_log(this->name, GF_LOG_ERROR,
+                       "Memory accounting init failed");
+        }
+
+        return ret;
+}
+
+/*
+ * Translator constructor: requires exactly one subvolume, allocates the
+ * private structure, loads the gzip tunables (out-of-range values are
+ * replaced by their defaults with a warning) and records whether the xlator
+ * runs in client or server mode. Returns 0 on success and -1 on failure, in
+ * which case the private structure is freed again.
+ */
+int32_t
+init (xlator_t *this)
+{
+        cdc_priv_t *priv     = NULL;
+        char       *mode_str = NULL;
+        int         ret      = -1;
+
+        GF_VALIDATE_OR_GOTO ("cdc", this, err);
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Need subvolume == 1");
+                goto err;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Dangling volume. Check volfile");
+        }
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_cdc_mt_priv_t);
+        if (!priv)
+                goto err;
+
+        /* Check if debug mode is turned on */
+        GF_OPTION_INIT ("debug", priv->debug, bool, err);
+        if (priv->debug)
+                gf_log (this->name, GF_LOG_DEBUG, "CDC debug option turned on");
+
+        /* Set Gzip Window Size: must lie within [default, max] */
+        GF_OPTION_INIT ("window-size", priv->window_size, int32, err);
+        if (!((priv->window_size >= GF_CDC_DEF_WINDOWSIZE)
+              && (priv->window_size <= GF_CDC_MAX_WINDOWSIZE))) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Invalid gzip window size (%d), using default",
+                        priv->window_size);
+                priv->window_size = GF_CDC_DEF_WINDOWSIZE;
+        }
+
+        /* Set Gzip (De)Compression Level: 1..9 or the default marker */
+        GF_OPTION_INIT ("compression-level", priv->cdc_level, int32, err);
+        if ((priv->cdc_level != GF_CDC_DEF_COMPRESSION)
+            && ((priv->cdc_level < 1) || (priv->cdc_level > 9))) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Invalid gzip (de)compression level (%d),"
+                        " using default", priv->cdc_level);
+                priv->cdc_level = GF_CDC_DEF_COMPRESSION;
+        }
+
+        /* Set Gzip Memory Level: 1..9 */
+        GF_OPTION_INIT ("mem-level", priv->mem_level, int32, err);
+        if (!((priv->mem_level >= 1) && (priv->mem_level <= 9))) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Invalid gzip memory level, using the default");
+                priv->mem_level = GF_CDC_DEF_MEMLEVEL;
+        }
+
+        /* Set min file size to enable compression */
+        GF_OPTION_INIT ("min-size", priv->min_file_size, int32, err);
+
+        /* Mode of operation - Server/Client */
+        ret = dict_get_str (this->options, "mode", &mode_str);
+        if (ret) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Operation mode not specified !!");
+                goto err;
+        }
+
+        if (GF_CDC_MODE_IS_CLIENT (mode_str)) {
+                priv->op_mode = GF_CDC_MODE_CLIENT;
+        } else if (GF_CDC_MODE_IS_SERVER (mode_str)) {
+                priv->op_mode = GF_CDC_MODE_SERVER;
+        } else {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Bogus operation mode (%s) specified", mode_str);
+                goto err;
+        }
+
+        this->private = priv;
+        gf_log (this->name, GF_LOG_DEBUG, "CDC xlator loaded in (%s) mode",mode_str);
+        return 0;
+
+ err:
+        if (priv)
+                GF_FREE (priv);
+
+        return -1;
+}
+
+/* Translator destructor: frees the private structure, if any, and clears
+ * the pointer. */
+void
+fini (xlator_t *this)
+{
+        if (this->private)
+                GF_FREE (this->private);
+
+        this->private = NULL;
+}
+
+/* File operations implemented by this translator: only readv and writev. */
+struct xlator_fops fops = {
+ .readv = cdc_readv,
+ .writev = cdc_writev,
+};
+
+/* No callbacks are registered. */
+struct xlator_cbks cbks = {
+};
+
+/* Options read by init(). Except for "mode", each entry carries a default
+ * value; "mode" is read directly from the option dictionary and init()
+ * fails when it is missing or is neither "client" nor "server". */
+struct volume_options options[] = {
+ /* init() accepts -15..-8 and otherwise falls back to -15 */
+ { .key = {"window-size"},
+ .default_value = "-15",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Size of the zlib history buffer."
+ },
+ /* init() accepts 1..9 and otherwise falls back to 8 */
+ { .key = {"mem-level"},
+ .default_value = "8",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Memory allocated for internal compression state. "
+ "1 uses minimum memory but is slow and reduces "
+ "compression ratio; memLevel=9 uses maximum memory "
+ "for optimal speed. The default value is 8."
+ },
+ /* NOTE(review): the description lists level 0, but init() only accepts
+ * 1..9 or the default marker and replaces anything else with the
+ * default -- confirm which is intended. */
+ { .key = {"compression-level"},
+ .default_value = "-1",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Compression levels \n"
+ "0 : no compression, 1 : best speed, \n"
+ "9 : best compression, -1 : default compression "
+ },
+ /* 0 disables the size threshold in readv/writev */
+ { .key = {"min-size"},
+ .default_value = "0",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Data is compressed only when its size exceeds this."
+ },
+ { .key = {"mode"},
+ .value = {"server", "client"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Set on the basis of where the xlator is loaded. "
+ "This option should NOT be configured by user."
+ },
+ { .key = {"debug"},
+ .default_value = "false",
+ .type = GF_OPTION_TYPE_BOOL,
+ .description = "This is used in testing. Will dump compressed data "
+ "to disk as a gzip file."
+ },
+ /* table terminator */
+ { .key = {NULL}
+ },
+};
diff --git a/xlators/features/compress/src/cdc.h b/xlators/features/compress/src/cdc.h
new file mode 100644
index 000000000..71f4d2317
--- /dev/null
+++ b/xlators/features/compress/src/cdc.h
@@ -0,0 +1,107 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CDC_H
+#define __CDC_H
+
+#ifdef HAVE_LIB_Z
+#include "zlib.h"
+#endif
+
+#include "xlator.h"
+
+#ifndef MAX_IOVEC
+#define MAX_IOVEC 16
+#endif
+
+/* Per-translator configuration, filled in by init() from the volume
+ * options. */
+typedef struct cdc_priv {
+ /* "window-size" option */
+ int window_size;
+ /* "mem-level" option */
+ int mem_level;
+ /* "compression-level" option */
+ int cdc_level;
+ /* "min-size" option; 0 disables the size threshold */
+ int min_file_size;
+ /* GF_CDC_MODE_CLIENT or GF_CDC_MODE_SERVER, from the "mode" option */
+ int op_mode;
+ /* "debug" option */
+ gf_boolean_t debug;
+ /* NOTE(review): not initialised or taken anywhere in cdc.c -- confirm
+ * whether it is used elsewhere */
+ gf_lock_t lock;
+} cdc_priv_t;
+
+/* Working state for one (de)compression request; cdc.c builds it on the
+ * stack in the readv callback and in writev. */
+typedef struct cdc_info {
+ /* input bits */
+ /* number of entries in vector */
+ int count;
+ /* input byte count (op_ret for readv, iov_length() for writev) */
+ int32_t ibytes;
+ /* caller's input vectors */
+ struct iovec *vector;
+ /* set to NULL by cdc.c */
+ struct iatt *buf;
+
+ /* output bits */
+ /* number of entries of vec handed on by the caller */
+ int ncount;
+ /* output byte count; reported as op_ret by the readv callback */
+ int nbytes;
+ /* set to GF_CDC_DEF_BUFFERSIZE by cdc.c */
+ int buffer_size;
+ /* output vectors */
+ struct iovec vec[MAX_IOVEC];
+ /* allocated in cdc_decompress(); cleared via cdc_cleanup_iobref() */
+ struct iobref *iobref;
+
+ /* zlib bits */
+#ifdef HAVE_LIB_Z
+ z_stream stream;
+#endif
+ /* reset to 0 by cdc.c before each request */
+ unsigned long crc;
+} cdc_info_t;
+
+/* Accessors for a cdc_info_t pointer. Arguments are parenthesised so that
+ * expressions such as &info or p + 1 expand correctly. */
+#define NVEC(ci) ((ci)->ncount - 1)
+#define CURR_VEC(ci) ((ci)->vec[(ci)->ncount - 1])
+#define THIS_VEC(ci, i) ((ci)->vector[(i)])
+
+/* Gzip defaults (negative constants are parenthesised so they are safe
+ * inside larger expressions) */
+#define GF_CDC_DEF_WINDOWSIZE (-15) /* default value */
+#define GF_CDC_MAX_WINDOWSIZE (-8) /* max value */
+
+#ifdef HAVE_LIB_Z
+#define GF_CDC_DEF_COMPRESSION Z_DEFAULT_COMPRESSION
+#else
+#define GF_CDC_DEF_COMPRESSION (-1)
+#endif
+
+#define GF_CDC_DEF_MEMLEVEL 8
+#define GF_CDC_DEF_BUFFERSIZE 262144 // 256K - default compression buffer size
+
+/* Operation mode
+ * If xlator is loaded on client, readv decompresses and writev compresses
+ * If xlator is loaded on server, readv compresses and writev decompresses
+ */
+#define GF_CDC_MODE_CLIENT 0
+#define GF_CDC_MODE_SERVER 1
+
+/* min size of data to do compression
+ * 0 == compress even 1 byte
+ */
+#define GF_CDC_MIN_CHUNK_SIZE 0
+
+#define GF_CDC_VALIDATION_SIZE 8
+
+#define GF_CDC_OS_ID 0xFF
+#define GF_CDC_DEFLATE_CANARY_VAL "deflate"
+#define GF_CDC_DEBUG_DUMP_FILE "/tmp/cdcdump.gz"
+
+/* True when the "mode" option string names the client / server side. */
+#define GF_CDC_MODE_IS_CLIENT(m) \
+ (strcmp ((m), "client") == 0)
+
+#define GF_CDC_MODE_IS_SERVER(m) \
+ (strcmp ((m), "server") == 0)
+
+/* Compress the data described by ci. Callers treat a non-zero return as
+ * "pass the original data through". xdata is passed by address. */
+int32_t
+cdc_compress (xlator_t *this,
+ cdc_priv_t *priv,
+ cdc_info_t *ci,
+ dict_t **xdata);
+/* Decompress the data described by ci when xdata marks it as deflated.
+ * Callers treat a non-zero return as "pass the original data through". */
+int32_t
+cdc_decompress (xlator_t *this,
+ cdc_priv_t *priv,
+ cdc_info_t *ci,
+ dict_t *xdata);
+
+#endif
diff --git a/xlators/bindings/python/Makefile.am b/xlators/features/gfid-access/Makefile.am
index af437a64d..af437a64d 100644
--- a/xlators/bindings/python/Makefile.am
+++ b/xlators/features/gfid-access/Makefile.am
diff --git a/xlators/features/gfid-access/src/Makefile.am b/xlators/features/gfid-access/src/Makefile.am
new file mode 100644
index 000000000..db53affaa
--- /dev/null
+++ b/xlators/features/gfid-access/src/Makefile.am
@@ -0,0 +1,15 @@
+# Build gfid-access.la and install it into the versioned "features"
+# translator directory.
+xlator_LTLIBRARIES = gfid-access.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+# Built as a module, without library version suffixes.
+gfid_access_la_LDFLAGS = -module -avoid-version
+
+gfid_access_la_SOURCES = gfid-access.c
+gfid_access_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+# Headers needed by the build but not installed.
+noinst_HEADERS = gfid-access.h gfid-access-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/gfid-access/src/gfid-access-mem-types.h b/xlators/features/gfid-access/src/gfid-access-mem-types.h
new file mode 100644
index 000000000..168d67b43
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access-mem-types.h
@@ -0,0 +1,23 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GFID_ACCESS_MEM_TYPES_H
+#define _GFID_ACCESS_MEM_TYPES_H
+
+#include "mem-types.h"
+
+/* Memory-accounting type ids of the gfid-access translator; they continue
+ * right after the common ids.
+ * NOTE(review): the enum tag still says "changelog" -- presumably copied
+ * from that translator; confirm before renaming. */
+enum gf_changelog_mem_types {
+        gf_gfid_access_mt_priv_t = gf_common_mt_end + 1,
+        gf_gfid_access_mt_gfid_t = gf_common_mt_end + 2,
+        gf_gfid_access_mt_end    = gf_common_mt_end + 3
+};
+
+#endif
+
diff --git a/xlators/features/gfid-access/src/gfid-access.c b/xlators/features/gfid-access/src/gfid-access.c
new file mode 100644
index 000000000..5cb6ecfbd
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access.c
@@ -0,0 +1,1299 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "gfid-access.h"
+#include "inode.h"
+#include "byte-order.h"
+
+
+
+/*
+ * Release a parsed "new file" request: the basename, the symlink target
+ * (only when the request describes a symlink) and finally the pooled
+ * structure itself. A NULL argument is a no-op.
+ */
+void
+ga_newfile_args_free (ga_newfile_args_t *args)
+{
+        if (args == NULL)
+                return;
+
+        GF_FREE (args->bname);
+
+        if (S_ISLNK (args->st_mode) && args->args.symlink.linkpath) {
+                GF_FREE (args->args.symlink.linkpath);
+                args->args.symlink.linkpath = NULL;
+        }
+
+        mem_put (args);
+}
+
+
+/* Release a parsed "heal" request: the basename, then the pooled structure.
+ * A NULL argument is a no-op. */
+void
+ga_heal_args_free (ga_heal_args_t *args)
+{
+        if (args == NULL)
+                return;
+
+        GF_FREE (args->bname);
+        mem_put (args);
+}
+
+
+/*
+ * Read one network-order 32-bit value from *cursor and advance the cursor.
+ * The bytes are copied out with memcpy because, past the variable-length
+ * name, the blob is not guaranteed to be 4-byte aligned.
+ * Returns 0 on success, -1 when fewer than four bytes remain.
+ */
+static int
+ga_blob_read_u32 (char **cursor, int *remaining, uint32_t *value)
+{
+        uint32_t raw = 0;
+
+        if (*remaining < (int) sizeof (raw))
+                return -1;
+
+        memcpy (&raw, *cursor, sizeof (raw));
+        *value      = ntoh32 (raw);
+        *cursor    += sizeof (raw);
+        *remaining -= (int) sizeof (raw);
+
+        return 0;
+}
+
+/*
+ * Parse the blob of a "new file" virtual setxattr into a freshly allocated
+ * ga_newfile_args_t.
+ *
+ * Layout, as consumed below: uid (u32), gid (u32), gfid string, st_mode
+ * (u32), NUL-terminated basename, then a type-specific tail:
+ *   directory -> mode (u32), umask (u32)
+ *   symlink   -> NUL-terminated link target
+ *   otherwise -> mode (u32), rdev (u32), umask (u32)
+ * All u32 values are in network byte order. The blob must be consumed
+ * exactly; missing or trailing bytes are rejected.
+ *
+ * Returns the parsed arguments (release with ga_newfile_args_free()) or NULL
+ * on malformed input or allocation failure.
+ */
+ga_newfile_args_t *
+ga_newfile_parse_args (xlator_t *this, data_t *data)
+{
+        ga_newfile_args_t *args     = NULL;
+        ga_private_t      *priv     = NULL;
+        char              *cursor   = NULL;
+        uint32_t           word     = 0;
+        int                len      = 0;
+        int                blob_len = 0;
+        int                min_len  = 0;
+
+        priv = this->private;
+
+        cursor   = (char *) data->data;
+        blob_len = data->len;
+
+        min_len = sizeof (args->uid) + sizeof (args->gid) + sizeof (args->gfid)
+                  + sizeof (args->st_mode) + 2 + 2;
+        if (blob_len < min_len) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Invalid length: Total length is less "
+                        "than minimum length.");
+                goto err;
+        }
+
+        args = mem_get0 (priv->newfile_args_pool);
+        if (args == NULL)
+                goto err;
+
+        /* fixed-size header */
+        if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                goto invalid_len;
+        args->uid = word;
+
+        if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                goto invalid_len;
+        args->gid = word;
+
+        memcpy (args->gfid, cursor, sizeof (args->gfid));
+        /* the gfid is logged with %s and handed to uuid_parse(); never trust
+         * the sender to have terminated it */
+        args->gfid[sizeof (args->gfid) - 1] = '\0';
+        cursor   += sizeof (args->gfid);
+        blob_len -= (int) sizeof (args->gfid);
+
+        if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                goto invalid_len;
+        args->st_mode = word;
+
+        /* basename: the terminating NUL must lie inside the blob */
+        len = strnlen (cursor, blob_len);
+        if (len == blob_len) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "gfid: %s. No null byte present.",
+                        args->gfid);
+                goto err;
+        }
+
+        args->bname = GF_CALLOC (1, (len + 1), gf_common_mt_char);
+        if (args->bname == NULL)
+                goto err;
+
+        memcpy (args->bname, cursor, (len + 1));
+        cursor   += (len + 1);
+        blob_len -= (len + 1);
+
+        if (S_ISDIR (args->st_mode)) {
+                if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                        goto invalid_len;
+                args->args.mkdir.mode = word;
+
+                if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                        goto invalid_len;
+                args->args.mkdir.umask = word;
+        } else if (S_ISLNK (args->st_mode)) {
+                len = strnlen (cursor, blob_len);
+                if (len == blob_len)
+                        goto invalid_len;
+
+                args->args.symlink.linkpath = GF_CALLOC (1, len + 1,
+                                                         gf_common_mt_char);
+                if (args->args.symlink.linkpath == NULL)
+                        goto err;
+
+                memcpy (args->args.symlink.linkpath, cursor, (len + 1));
+                cursor   += (len + 1);
+                blob_len -= (len + 1);
+        } else {
+                if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                        goto invalid_len;
+                args->args.mknod.mode = word;
+
+                if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                        goto invalid_len;
+                args->args.mknod.rdev = word;
+
+                if (ga_blob_read_u32 (&cursor, &blob_len, &word))
+                        goto invalid_len;
+                args->args.mknod.umask = word;
+        }
+
+        /* trailing bytes mean the blob does not match the layout */
+        if (blob_len)
+                goto invalid_len;
+
+        return args;
+
+invalid_len:
+        gf_log (this->name, GF_LOG_ERROR,
+                "gfid: %s. Invalid length",
+                args->gfid);
+err:
+        if (args)
+                ga_newfile_args_free (args);
+
+        return NULL;
+}
+
+/*
+ * Parse the blob of a "heal" virtual setxattr: a gfid string followed by a
+ * NUL-terminated basename and nothing else. Returns the parsed arguments
+ * (release with ga_heal_args_free()) or NULL on malformed input or
+ * allocation failure.
+ */
+ga_heal_args_t *
+ga_heal_parse_args (xlator_t *this, data_t *data)
+{
+        ga_private_t   *priv     = this->private;
+        ga_heal_args_t *args     = NULL;
+        void           *blob     = data->data;
+        int             blob_len = data->len;
+        int             name_len = 0;
+
+        /* bname should at least contain a character */
+        if (blob_len < (sizeof (args->gfid) + 2))
+                return NULL;
+
+        args = mem_get0 (priv->heal_args_pool);
+        if (!args)
+                return NULL;
+
+        memcpy (args->gfid, blob, sizeof (args->gfid));
+        blob     += sizeof (args->gfid);
+        blob_len -= sizeof (args->gfid);
+
+        /* the terminating NUL must lie inside the blob */
+        name_len = strnlen (blob, blob_len);
+        if (name_len == blob_len)
+                goto err;
+
+        /* zero-filled allocation supplies the terminator */
+        args->bname = GF_CALLOC (1, name_len + 1, gf_common_mt_char);
+        if (!args->bname)
+                goto err;
+
+        memcpy (args->bname, blob, name_len);
+
+        /* the name and its NUL must be all that is left */
+        if (blob_len != (name_len + 1))
+                goto err;
+
+        return args;
+
+err:
+        ga_heal_args_free (args);
+
+        return NULL;
+}
+
+/*
+ * Build @new_loc for the entry @bname underneath the directory behind @loc
+ * and request @gfid for it through the "gfid-req" key of @xdata.
+ *
+ * When the inode of @loc carries a context value for this translator, that
+ * value is taken as the parent inode, unless its gfid is still null.
+ *
+ * Returns 0 on success and a negative value on failure. @new_loc may be
+ * partially filled on failure; the caller is expected to loc_wipe() it
+ * either way.
+ *
+ * NOTE(review): dict_set_static_bin() presumably stores the @gfid pointer
+ * rather than a copy, and the callers in this file pass a stack variable --
+ * confirm that @xdata is never read after the caller returns.
+ */
+static int32_t
+ga_fill_tmp_loc (loc_t *loc, xlator_t *this, uuid_t gfid,
+                 char *bname, dict_t *xdata, loc_t *new_loc)
+{
+        int       ret    = -1;
+        uint64_t  value  = 0;
+        inode_t  *parent = NULL;
+
+        parent = loc->inode;
+        ret = inode_ctx_get (loc->inode, this, &value);
+        if (!ret) {
+                parent = (void *)value;
+                if (uuid_is_null (parent->gfid))
+                        parent = loc->inode;
+        }
+
+        /* parent itself should be looked up */
+        uuid_copy (new_loc->pargfid, parent->gfid);
+        new_loc->parent = inode_ref (parent);
+
+        new_loc->inode = inode_grep (parent->table, parent, bname);
+        if (!new_loc->inode)
+                new_loc->inode = inode_new (parent->table);
+        if (!new_loc->inode) {
+                /* neither cached nor allocatable */
+                ret = -1;
+                goto out;
+        }
+
+        loc_path (new_loc, bname);
+        if (!new_loc->path) {
+                /* no path could be built; do not hand NULL to basename() */
+                ret = -1;
+                goto out;
+        }
+        new_loc->name = basename (new_loc->path);
+
+        /* As GFID would not be set on the entry yet, lets not send entry
+           gfid in the request */
+        /*uuid_copy (new_loc->gfid, (const unsigned char *)gfid); */
+
+        ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16);
+        if (ret < 0)
+                goto out;
+
+        ret = 0;
+
+out:
+        return ret;
+}
+
+
+
+/* True when @gfid equals the gfid of the virtual directory: fifteen zero
+ * bytes followed by GF_AUX_GFID. */
+static gf_boolean_t
+__is_gfid_access_dir (uuid_t gfid)
+{
+        uuid_t aux_gfid = {0,};
+
+        aux_gfid[15] = GF_AUX_GFID;
+
+        return (uuid_compare (gfid, aux_gfid) == 0) ? _gf_true : _gf_false;
+}
+
+/* forget: when this translator stored an inode in @inode's context, remove
+ * the context and drop the reference on that stored inode. Always returns
+ * 0. */
+int32_t
+ga_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t ctx_value = 0;
+
+        if (inode_ctx_del (inode, this, &ctx_value) == 0)
+                inode_unref ((void *)ctx_value);
+
+        return 0;
+}
+
+
+/*
+ * Lookup callback of the heal request: destroys the helper stack and
+ * answers the original setxattr (stored in frame->local) with the lookup's
+ * result.
+ */
+static int
+ga_heal_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int32_t op_ret, int32_t op_errno,
+             inode_t *inode, struct iatt *stat, dict_t *dict,
+             struct iatt *postparent)
+{
+        call_frame_t *setxattr_frame = frame->local;
+
+        frame->local = NULL;
+
+        /* inode linking and the like is left to the next lookup */
+        STACK_DESTROY (frame->root);
+
+        STACK_UNWIND_STRICT (setxattr, setxattr_frame, op_ret, op_errno, dict);
+
+        return 0;
+}
+
+/*
+ * setattr callback of the new-entry request: destroys the helper stack,
+ * answers the original setxattr with the setattr result and releases the
+ * per-request state.
+ */
+static int32_t
+ga_newentry_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+                         struct iatt *statpost,
+                         dict_t *xdata)
+{
+        ga_local_t *local = frame->local;
+
+        frame->local = NULL;
+
+        /* inode linking and the like is left to the next lookup */
+        STACK_DESTROY (frame->root);
+
+        STACK_UNWIND_STRICT (setxattr, local->orig_frame, op_ret,
+                             op_errno, xdata);
+
+        loc_wipe (&local->loc);
+        mem_put (local);
+
+        return 0;
+}
+
+/*
+ * Callback of the mkdir/symlink/mknod wound by ga_new_entry().
+ *
+ * When the entry was created and the request carries a non-zero uid or gid,
+ * ownership is applied with a follow-up setattr, whose callback answers the
+ * original setxattr. Otherwise the original setxattr is answered right here
+ * with the creation result. In particular a failed creation is reported as
+ * is, so ownership is never changed on an entry this request did not create
+ * (e.g. on EEXIST).
+ */
+static int
+ga_newentry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 inode_t *inode, struct iatt *buf,
+                 struct iatt *preparent, struct iatt *postparent,
+                 dict_t *xdata)
+{
+        ga_local_t  *local     = NULL;
+        struct iatt  temp_stat = {0,};
+
+        local = frame->local;
+
+        /* creation failed: there is nothing to chown */
+        if (op_ret < 0)
+                goto done;
+
+        if (!local->uid && !local->gid)
+                goto done;
+
+        temp_stat.ia_uid = local->uid;
+        temp_stat.ia_gid = local->gid;
+
+        STACK_WIND (frame, ga_newentry_setattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->setattr, &local->loc, &temp_stat,
+                    (GF_SET_ATTR_UID | GF_SET_ATTR_GID), xdata);
+
+        return 0;
+
+done:
+        /* don't worry about inode linking and other stuff. They'll happen on
+         * the next lookup.
+         */
+        frame->local = NULL;
+        STACK_DESTROY (frame->root);
+
+        STACK_UNWIND_STRICT (setxattr, local->orig_frame, op_ret,
+                             op_errno, xdata);
+
+        loc_wipe (&local->loc);
+        mem_put (local);
+
+        return 0;
+}
+
+/*
+ * Handle the "new file" virtual setxattr: parse @data and wind a mkdir,
+ * symlink or mknod for the described entry on a copy of @frame. The
+ * original @frame is answered from the callbacks.
+ *
+ * Returns 0 when the request was wound; the caller must then NOT unwind
+ * @frame. Returns non-zero when nothing was wound and the caller has to
+ * unwind @frame itself.
+ *
+ * The function holds its own reference on the xdata dictionary (creating
+ * one when the caller passed none) for the duration of the call, and always
+ * wipes the temporary loc, so neither is leaked on any path.
+ */
+int32_t
+ga_new_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data,
+              dict_t *xdata)
+{
+        int                ret       = -1;
+        ga_newfile_args_t *args      = NULL;
+        loc_t              tmp_loc   = {0,};
+        call_frame_t      *new_frame = NULL;
+        mode_t             mode      = 0;
+        ga_local_t        *local     = NULL;
+        uuid_t             gfid      = {0,};
+
+        /* own reference: either on the caller's dict or on a fresh one */
+        xdata = xdata ? dict_ref (xdata) : dict_new ();
+        if (!xdata)
+                goto out;
+
+        args = ga_newfile_parse_args (this, data);
+        if (!args)
+                goto out;
+
+        ret = uuid_parse (args->gfid, gfid);
+        if (ret)
+                goto out;
+
+        ret = ga_fill_tmp_loc (loc, this, gfid,
+                               args->bname, xdata, &tmp_loc);
+        if (ret)
+                goto out;
+
+        /* ret is 0 here; the failures below must not look like success,
+         * or the caller would never unwind the frame */
+        ret = -1;
+
+        new_frame = copy_frame (frame);
+        if (!new_frame)
+                goto out;
+
+        local = mem_get0 (this->local_pool);
+        if (!local) {
+                STACK_DESTROY (new_frame->root);
+                goto out;
+        }
+        local->orig_frame = frame;
+
+        local->uid = args->uid;
+        local->gid = args->gid;
+
+        loc_copy (&local->loc, &tmp_loc);
+
+        new_frame->local = local;
+
+        if (S_ISDIR (args->st_mode)) {
+                STACK_WIND (new_frame, ga_newentry_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+                            &tmp_loc, args->args.mkdir.mode,
+                            args->args.mkdir.umask, xdata);
+        } else if (S_ISLNK (args->st_mode)) {
+                STACK_WIND (new_frame, ga_newentry_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+                            args->args.symlink.linkpath,
+                            &tmp_loc, 0, xdata);
+        } else {
+                /* use 07777 (4 7s) for considering the Sticky bits etc) */
+                mode = (S_IFMT & args->st_mode) |
+                        (07777 & args->args.mknod.mode);
+
+                STACK_WIND (new_frame, ga_newentry_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+                            &tmp_loc, mode,
+                            args->args.mknod.rdev, args->args.mknod.umask,
+                            xdata);
+        }
+
+        ret = 0;
+out:
+        ga_newfile_args_free (args);
+
+        /* the wound fop works on local->loc; drop the temporary copy */
+        loc_wipe (&tmp_loc);
+
+        if (xdata)
+                dict_unref (xdata);
+
+        return ret;
+}
+
+/*
+ * Handle the "heal" virtual setxattr: parse @data and wind a lookup for the
+ * described entry on a copy of @frame. The original @frame is answered from
+ * ga_heal_cbk().
+ *
+ * Returns 0 when the lookup was wound; the caller must then NOT unwind
+ * @frame. Returns non-zero when nothing was wound and the caller has to
+ * unwind @frame itself.
+ *
+ * The function holds its own reference on the xdata dictionary (creating
+ * one when the caller passed none) for the duration of the call, and always
+ * wipes the temporary loc, so neither is leaked on any path.
+ */
+int32_t
+ga_heal_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data,
+               dict_t *xdata)
+{
+        int             ret       = -1;
+        ga_heal_args_t *args      = NULL;
+        loc_t           tmp_loc   = {0,};
+        call_frame_t   *new_frame = NULL;
+        uuid_t          gfid      = {0,};
+
+        /* own reference: either on the caller's dict or on a fresh one */
+        xdata = xdata ? dict_ref (xdata) : dict_new ();
+        if (!xdata)
+                goto out;
+
+        args = ga_heal_parse_args (this, data);
+        if (!args)
+                goto out;
+
+        ret = uuid_parse (args->gfid, gfid);
+        if (ret)
+                goto out;
+
+        ret = ga_fill_tmp_loc (loc, this, gfid, args->bname,
+                               xdata, &tmp_loc);
+        if (ret)
+                goto out;
+
+        new_frame = copy_frame (frame);
+        if (!new_frame) {
+                /* ret is 0 here; report the failure so the caller unwinds */
+                ret = -1;
+                goto out;
+        }
+        new_frame->local = (void *)frame;
+
+        STACK_WIND (new_frame, ga_heal_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD(this)->fops->lookup,
+                    &tmp_loc, xdata);
+
+        ret = 0;
+out:
+        if (args)
+                ga_heal_args_free (args);
+
+        loc_wipe (&tmp_loc);
+
+        if (xdata)
+                dict_unref (xdata);
+
+        return ret;
+}
+
+/* setxattr callback: hands the child's reply to the parent unmodified. */
+int32_t
+ga_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+
+        return 0;
+}
+
+/*
+ * setxattr entry point.
+ *  - A setxattr on the GF_GFID_DIR entry directly under the root is refused
+ *    with EPERM.
+ *  - The GF_FUSE_AUX_GFID_NEWFILE and GF_FUSE_AUX_GFID_HEAL keys trigger
+ *    entry creation and heal; when their handler fails the call is unwound
+ *    with ENOMEM.
+ *  - Anything else is wound to the child after
+ *    GFID_ACCESS_GET_VALID_DIR_INODE has had a chance to adjust @loc.
+ */
+int32_t
+ga_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int32_t flags, dict_t *xdata)
+{
+        inode_t *unref    = NULL;
+        data_t  *data     = NULL;
+        int      op_errno = ENOMEM;
+
+        if (loc->name && !strcmp (GF_GFID_DIR, loc->name)) {
+                if ((loc->parent && __is_root_gfid (loc->parent->gfid)) ||
+                    __is_root_gfid (loc->pargfid)) {
+                        op_errno = EPERM;
+                        goto err;
+                }
+        }
+
+        data = dict_get (dict, GF_FUSE_AUX_GFID_NEWFILE);
+        if (data) {
+                if (ga_new_entry (frame, this, loc, data, xdata))
+                        goto err;
+                return 0;
+        }
+
+        data = dict_get (dict, GF_FUSE_AUX_GFID_HEAL);
+        if (data) {
+                if (ga_heal_entry (frame, this, loc, data, xdata))
+                        goto err;
+                return 0;
+        }
+
+        //If the inode is a virtual inode change the inode otherwise perform
+        //the operation on same inode
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+        STACK_WIND (frame, ga_setxattr_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+                    xdata);
+        if (unref)
+                inode_unref (unref);
+
+        return 0;
+err:
+        STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata);
+        return 0;
+}
+
+
+/*
+ * Callback for lookups wound by ga_lookup() on entries below the virtual
+ * gfid directory.
+ *
+ * The inode reported to the parent (cbk_inode) is frame->local when
+ * ga_lookup() stored one there, otherwise a new reference on @inode.
+ * Failures and non-directories are unwound as they are. For a directory
+ * whose reported inode is the looked-up inode itself, a separate inode is
+ * reported and the real one is remembered in its context; the reply's gfid
+ * is then replaced (reusing cbk_inode's gfid when it has one, otherwise a
+ * freshly generated one) and ia_ino is derived from the last eight gfid
+ * bytes. ESTALE is reported as ENOENT.
+ */
+int32_t
+ga_virtual_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno, inode_t *inode,
+                       struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+        int       j            = 0;
+        int       i            = 0;
+        int       ret          = 0;
+        uint64_t  temp_ino     = 0;
+        inode_t  *cbk_inode    = NULL;
+        inode_t  *true_inode   = NULL;
+        uuid_t    random_gfid  = {0,};
+        inode_t  *linked_inode = NULL;
+
+        if (frame->local)
+                cbk_inode = frame->local;
+        else
+                cbk_inode = inode_ref (inode);
+
+        frame->local = NULL;
+        if (op_ret)
+                goto unwind;
+
+        if (!IA_ISDIR (buf->ia_type))
+                goto unwind;
+
+        /* need to send back a different inode for linking in itable */
+        if (cbk_inode == inode) {
+                /* check if the inode is in the 'itable' or
+                   if its just previously discover()'d inode */
+                true_inode = inode_find (inode->table, buf->ia_gfid);
+                if (!true_inode) {
+                        /* This unref is for 'inode_ref()' done in beginning.
+                           This is needed as cbk_inode is allocated new inode
+                           whose unref is taken at the end*/
+                        inode_unref (cbk_inode);
+                        cbk_inode = inode_new (inode->table);
+
+                        if (!cbk_inode) {
+                                op_ret = -1;
+                                op_errno = ENOMEM;
+                                goto unwind;
+                        }
+                        /* the inode is not present in itable, ie, the actual
+                           path is not yet looked up. Use the current inode
+                           itself for now */
+
+                        linked_inode = inode_link (inode, NULL, NULL, buf);
+                        inode = linked_inode;
+                } else {
+                        /* 'inode_ref()' has been done in inode_find() */
+                        inode = true_inode;
+                }
+
+                ret = inode_ctx_put (cbk_inode, this, (uint64_t)inode);
+                if (ret) {
+                        /* the two adjacent literals used to read "withthe" */
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "failed to set the inode ctx with "
+                                "the actual inode");
+                        if (inode)
+                                inode_unref (inode);
+                }
+                inode = NULL;
+        }
+
+        if (!uuid_is_null (cbk_inode->gfid)) {
+                /* if the previous linked inode is used, use the
+                   same gfid */
+                uuid_copy (random_gfid, cbk_inode->gfid);
+        } else {
+                /* replace the buf->ia_gfid to a random gfid
+                   for directory, for files, what we received is fine */
+                uuid_generate (random_gfid);
+        }
+
+        uuid_copy (buf->ia_gfid, random_gfid);
+
+        /* ia_ino: the last eight gfid bytes, byte 15 least significant */
+        for (i = 15; i > (15 - 8); i--) {
+                temp_ino += (uint64_t)(buf->ia_gfid[i]) << j;
+                j += 8;
+        }
+        buf->ia_ino = temp_ino;
+
+unwind:
+        /* Lookup on non-existing gfid returns ESTALE.
+           Convert into ENOENT for virtual lookup*/
+        if (op_errno == ESTALE)
+                op_errno = ENOENT;
+
+        STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, cbk_inode, buf,
+                             xdata, postparent);
+
+        /* Also handles inode_unref of frame->local if done in ga_lookup */
+        if (cbk_inode)
+                inode_unref (cbk_inode);
+
+        return 0;
+}
+
+/*
+ * Callback for ordinary lookups. A successful lookup of the root refreshes
+ * the cached root stat and the stat of the virtual gfid directory, which is
+ * a copy of the root stat with byte 15 of the gfid and the inode number set
+ * to GF_AUX_GFID. The reply itself is passed on unchanged.
+ */
+int32_t
+ga_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+        ga_private_t *priv = this->private;
+
+        /* only a successful lookup of 'root' needs extra work; the stat is
+           copied every time so that it always shows the latest values */
+        if (!op_ret && __is_root_gfid(buf->ia_gfid)) {
+                priv->root_stbuf = *buf;
+
+                priv->gfiddir_stbuf             = priv->root_stbuf;
+                priv->gfiddir_stbuf.ia_gfid[15] = GF_AUX_GFID;
+                priv->gfiddir_stbuf.ia_ino      = GF_AUX_GFID;
+        }
+
+        STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
+                             xdata, postparent);
+        return 0;
+}
+
+/*
+ * ga_lookup - lookup fop of the gfid-access translator.
+ *
+ * Paths taken, in the order they are tested:
+ *  - loc->name is NULL: forwarded unchanged to the child (label 'wind').
+ *  - revalidate of a non-directory inode: when the parent (or pargfid) is the
+ *    gfid-access directory, a lookup carrying only the inode and its gfid is
+ *    wound; otherwise the request is forwarded unchanged.
+ *  - GF_GFID_DIR directly under root: answered locally with the stat buffers
+ *    kept in the private structure.
+ *  - parent is not the gfid-access directory: if the parent inode has a
+ *    context value for this translator, that value is used as an inode and
+ *    handed to loc_copy_overload_parent() before winding; otherwise the
+ *    request is forwarded unchanged.
+ *  - parent is the gfid-access directory: the basename must parse as a uuid;
+ *    the lookup is then wound by gfid (label 'discover').
+ *
+ * Failures are unwound from label 'err' with op_errno, which starts out as
+ * ENOENT and is changed to ESTALE in one revalidate case below.
+ * Always returns 0.
+ */
+int32_t
+ga_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ ga_private_t *priv = NULL;
+ int ret = -1;
+ uuid_t tmp_gfid = {0,};
+ loc_t tmp_loc = {0,};
+ uint64_t value = 0;
+ inode_t *inode = NULL;
+ inode_t *true_inode = NULL;
+ int32_t op_errno = ENOENT;
+
+ /* if it's discover() (no basename), no action is needed here */
+ if (!loc->name)
+ goto wind;
+
+ /* if it's a revalidate, and the inode is not of type directory,
+ proceed with 'wind' */
+ if (loc->inode && loc->inode->ia_type &&
+ !IA_ISDIR (loc->inode->ia_type)) {
+
+ /* a revalidate on ".gfid/<dentry>" is possible, check for it */
+ if (((loc->parent &&
+ __is_gfid_access_dir (loc->parent->gfid)) ||
+ __is_gfid_access_dir (loc->pargfid))) {
+
+ /* here, just send 'loc->gfid' and 'loc->inode' */
+ tmp_loc.inode = inode_ref (loc->inode);
+ uuid_copy (tmp_loc.gfid, loc->inode->gfid);
+
+ STACK_WIND (frame, default_lookup_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup,
+ &tmp_loc, xdata);
+
+ /* drop the reference taken for tmp_loc above */
+ inode_unref (tmp_loc.inode);
+
+ return 0;
+ }
+
+ /* nothing special about this entry, continue the normal flow */
+ goto wind;
+ }
+
+ /* every 'goto err' below happens after this assignment, so the
+ 'err' label can safely dereference priv */
+ priv = this->private;
+
+ /* need to check if the lookup is on the virtual dir */
+ if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) &&
+ ((loc->parent && __is_root_gfid (loc->parent->gfid)) ||
+ __is_root_gfid (loc->pargfid))) {
+ /* this means the query is on '/.gfid': return the fake stat,
+ and report success */
+
+ STACK_UNWIND_STRICT (lookup, frame, 0, 0, loc->inode,
+ &priv->gfiddir_stbuf, xdata,
+ &priv->root_stbuf);
+ return 0;
+ }
+
+ /* now, check if the lookup() is on an existing entry,
+ but on gfid-path */
+ if (!((loc->parent && __is_gfid_access_dir (loc->parent->gfid)) ||
+ __is_gfid_access_dir (loc->pargfid))) {
+ if (!loc->parent)
+ goto wind;
+
+ /* a parent without a context value for this translator is
+ an ordinary directory */
+ ret = inode_ctx_get (loc->parent, this, &value);
+ if (ret)
+ goto wind;
+
+ /* the context value is used as an inode pointer */
+ inode = (inode_t *) value;
+
+ ret = loc_copy_overload_parent (&tmp_loc, loc, inode);
+ if (ret)
+ goto err;
+
+ STACK_WIND (frame, ga_lookup_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup, &tmp_loc, xdata);
+
+ loc_wipe (&tmp_loc);
+ return 0;
+ }
+
+ /* make sure the 'basename' is actually a 'canonical-gfid',
+ otherwise, return error */
+ ret = uuid_parse (loc->name, tmp_gfid);
+ if (ret)
+ goto err;
+
+ /* if it's a fresh lookup, go ahead and send it down; if not,
+ for a directory, we need indirection to the actual dir inode */
+ if (!(loc->inode && loc->inode->ia_type))
+ goto discover;
+
+ /* revalidate on directory */
+ ret = inode_ctx_get (loc->inode, this, &value);
+ if (ret)
+ goto err;
+
+ inode = (void *)value;
+
+ /* valid inode, already looked up, work on that */
+ if (inode->ia_type)
+ goto discover;
+
+ /* check if the inode is in the 'itable' or
+ if it's just a previously discover()'d inode */
+ true_inode = inode_find (loc->inode->table, tmp_gfid);
+ if (true_inode) {
+ /* time to do another lookup and update the context
+ with the proper inode */
+ op_errno = ESTALE;
+ /* 'inode_ref()' done in inode_find */
+ inode_unref (true_inode);
+ goto err;
+ }
+
+discover:
+ /* for the virtual entries, we don't need to send the 'gfid-req' key,
+ as for these entries we don't want to 'set' a new gfid */
+ if (xdata)
+ dict_del (xdata, "gfid-req");
+
+ uuid_copy (tmp_loc.gfid, tmp_gfid);
+
+ /* if revalidate, then we need to have the proper reference:
+ wind with the inode taken from the context and keep a reference
+ on loc->inode in frame->local.
+ NOTE(review): presumably ga_virtual_lookup_cbk releases
+ frame->local; confirm against the callback */
+ if (inode) {
+ tmp_loc.inode = inode_ref (inode);
+ frame->local = inode_ref (loc->inode);
+ } else {
+ tmp_loc.inode = inode_ref (loc->inode);
+ }
+
+ STACK_WIND (frame, ga_virtual_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata);
+
+ /* drop the reference taken for tmp_loc above */
+ inode_unref (tmp_loc.inode);
+
+ return 0;
+
+wind:
+ /* used for all the normal lookup paths */
+ STACK_WIND (frame, ga_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+
+ return 0;
+
+err:
+ /* failure: unwind with op_errno (ENOENT unless set to ESTALE above) */
+ STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, loc->inode,
+ &priv->gfiddir_stbuf, xdata,
+ &priv->root_stbuf);
+ return 0;
+}
+
+/*
+ * ga_mkdir - mkdir fop.
+ *
+ * GFID_ACCESS_ENTRY_OP_CHECK rejects the request with EEXIST when the entry
+ * is GF_GFID_DIR under root, and with EPERM when the parent is the
+ * gfid-access directory. Anything else goes to the child unchanged.
+ * Always returns 0.
+ */
+int
+ga_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          mode_t umask, dict_t *xdata)
+{
+        int errcode = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (loc, errcode, reject);
+
+        STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->mkdir, loc, mode, umask, xdata);
+        return 0;
+
+reject:
+        /* errcode was filled in by the check above */
+        STACK_UNWIND_STRICT (mkdir, frame, -1, errcode, loc->inode,
+                             NULL, NULL, NULL, xdata);
+        return 0;
+}
+
+
+/*
+ * ga_create - create fop.
+ *
+ * The request is refused (EEXIST / EPERM, chosen by
+ * GFID_ACCESS_ENTRY_OP_CHECK) when it targets GF_GFID_DIR under root or an
+ * entry whose parent is the gfid-access directory; otherwise it is wound to
+ * the child as-is. Always returns 0.
+ */
+int
+ga_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+           mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+        int errcode = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (loc, errcode, reject);
+
+        STACK_WIND (frame, default_create_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->create, loc, flags, mode,
+                    umask, fd, xdata);
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (create, frame, -1, errcode, NULL, NULL, NULL,
+                             NULL, NULL, xdata);
+        return 0;
+}
+
+/*
+ * ga_symlink - symlink fop.
+ *
+ * Fails with the errno picked by GFID_ACCESS_ENTRY_OP_CHECK when the new
+ * entry is GF_GFID_DIR under root or sits directly under the gfid-access
+ * directory; every other request is passed to the child. Always returns 0.
+ */
+int
+ga_symlink (call_frame_t *frame, xlator_t *this, const char *linkname,
+            loc_t *loc, mode_t umask, dict_t *xdata)
+{
+        int errcode = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (loc, errcode, reject);
+
+        STACK_WIND (frame, default_symlink_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->symlink, linkname, loc, umask,
+                    xdata);
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (symlink, frame, -1, errcode, NULL, NULL, NULL,
+                             NULL, xdata);
+        return 0;
+}
+
+/*
+ * ga_mknod - mknod fop.
+ *
+ * Same gatekeeping as the other entry-creation fops: the request is unwound
+ * with the errno chosen by GFID_ACCESS_ENTRY_OP_CHECK when it targets
+ * GF_GFID_DIR under root or an entry below the gfid-access directory, and
+ * forwarded to the child otherwise. Always returns 0.
+ */
+int
+ga_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+          dev_t rdev, mode_t umask, dict_t *xdata)
+{
+        int errcode = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (loc, errcode, reject);
+
+        STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->mknod, loc, mode, rdev, umask,
+                    xdata);
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (mknod, frame, -1, errcode, NULL, NULL, NULL,
+                             NULL, xdata);
+        return 0;
+}
+
+/*
+ * ga_rmdir - rmdir fop.
+ *
+ * After the entry-op check, GFID_ACCESS_GET_VALID_DIR_INODE replaces
+ * loc->parent (or, when there is no parent, loc->inode) with the inode
+ * stored in that inode's context for this translator, if any. The request
+ * is then wound to the child. Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * 'held' stays NULL and the unref below does not run.
+ */
+int
+ga_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
+          dict_t *xdata)
+{
+        inode_t *held    = NULL;
+        int      errcode = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (loc, errcode, reject);
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, held, forward);
+
+forward:
+        STACK_WIND (frame, default_rmdir_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->rmdir, loc, flag, xdata);
+
+        if (held != NULL)
+                inode_unref (held);
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (rmdir, frame, -1, errcode, NULL, NULL, xdata);
+        return 0;
+}
+
+/*
+ * ga_unlink - unlink fop.
+ *
+ * Runs the entry-op check, lets GFID_ACCESS_GET_VALID_DIR_INODE substitute
+ * the inode kept in the parent's (or, without a parent, the inode's own)
+ * context, and winds the request to the child. Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * 'held' stays NULL and the unref below does not run.
+ */
+int
+ga_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag,
+           dict_t *xdata)
+{
+        inode_t *held    = NULL;
+        int      errcode = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (loc, errcode, reject);
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, held, forward);
+
+forward:
+        STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->unlink, loc, xflag, xdata);
+
+        if (held != NULL)
+                inode_unref (held);
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (unlink, frame, -1, errcode, NULL, NULL, xdata);
+        return 0;
+}
+
+/*
+ * ga_rename - rename fop.
+ *
+ * Both locations must pass the entry-op check. Each is then run through
+ * GFID_ACCESS_GET_VALID_DIR_INODE, which swaps in the inode kept in the
+ * context of its parent (or of the inode itself when there is no parent).
+ * A location without such a context is left untouched. Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * both 'held' pointers stay NULL and the unrefs below do not run.
+ */
+int
+ga_rename (call_frame_t *frame, xlator_t *this,
+           loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+        inode_t *old_held = NULL;
+        inode_t *new_held = NULL;
+        int      errcode  = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (oldloc, errcode, reject);
+        GFID_ACCESS_ENTRY_OP_CHECK (newloc, errcode, reject);
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, old_held, resolve_new);
+
+resolve_new:
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, new_held, forward);
+
+forward:
+        STACK_WIND (frame, default_rename_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->rename, oldloc, newloc, xdata);
+
+        if (old_held != NULL)
+                inode_unref (old_held);
+
+        if (new_held != NULL)
+                inode_unref (new_held);
+
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (rename, frame, -1, errcode, NULL, NULL, NULL,
+                             NULL, NULL, xdata);
+        return 0;
+}
+
+
+/*
+ * ga_link - link fop.
+ *
+ * Mirrors ga_rename: both locations go through the entry-op check and
+ * through GFID_ACCESS_GET_VALID_DIR_INODE before the request is wound to
+ * the child. Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * both 'held' pointers stay NULL and the unrefs below do not run.
+ */
+int
+ga_link (call_frame_t *frame, xlator_t *this,
+         loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+        inode_t *old_held = NULL;
+        inode_t *new_held = NULL;
+        int      errcode  = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (oldloc, errcode, reject);
+        GFID_ACCESS_ENTRY_OP_CHECK (newloc, errcode, reject);
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, old_held, resolve_new);
+
+resolve_new:
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, new_held, forward);
+
+forward:
+        STACK_WIND (frame, default_link_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->link, oldloc, newloc, xdata);
+
+        if (old_held != NULL)
+                inode_unref (old_held);
+
+        if (new_held != NULL)
+                inode_unref (new_held);
+
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (link, frame, -1, errcode, NULL, NULL, NULL,
+                             NULL, xdata);
+        return 0;
+}
+
+/*
+ * ga_opendir - opendir fop.
+ *
+ * Besides the usual entry-op check, an inode that carries a context value
+ * for this translator (i.e. a virtual inode) is refused with ENOTSUP,
+ * because readdirp and friends cannot be handled on it. Everything else is
+ * forwarded to the child. Always returns 0.
+ */
+int32_t
+ga_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            fd_t *fd, dict_t *xdata)
+{
+        int errcode = 0;
+
+        GFID_ACCESS_ENTRY_OP_CHECK (loc, errcode, reject);
+
+        /* a successful ctx lookup means loc->inode is a virtual inode */
+        if (!inode_ctx_get (loc->inode, this, NULL)) {
+                errcode = ENOTSUP;
+                goto reject;
+        }
+
+        STACK_WIND (frame, default_opendir_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->opendir, loc, fd, xdata);
+        return 0;
+
+reject:
+        STACK_UNWIND_STRICT (opendir, frame, -1, errcode, NULL, xdata);
+        return 0;
+}
+
+/*
+ * ga_getxattr - getxattr fop.
+ *
+ * GFID_ACCESS_GET_VALID_DIR_INODE substitutes the inode kept in the context
+ * of loc->parent (or of loc->inode when there is no parent); the request is
+ * then wound to the child. Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * 'held' stays NULL and the unref below does not run.
+ */
+int32_t
+ga_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+             const char *name, dict_t *xdata)
+{
+        inode_t *held = NULL;
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, held, forward);
+
+forward:
+        STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->getxattr, loc, name, xdata);
+
+        if (held != NULL)
+                inode_unref (held);
+
+        return 0;
+}
+
+/*
+ * ga_stat - stat fop.
+ *
+ * Lets GFID_ACCESS_GET_VALID_DIR_INODE swap in the inode stored in the
+ * context of loc->parent (or loc->inode when no parent is set) and winds
+ * the request to the child. Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * 'held' stays NULL and the unref below does not run.
+ */
+int32_t
+ga_stat (call_frame_t *frame, xlator_t *this, loc_t *loc,
+         dict_t *xdata)
+{
+        inode_t *held = NULL;
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, held, forward);
+
+forward:
+        STACK_WIND (frame, default_stat_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->stat, loc, xdata);
+
+        if (held != NULL)
+                inode_unref (held);
+
+        return 0;
+}
+
+/*
+ * ga_setattr - setattr fop.
+ *
+ * The location is first passed through GFID_ACCESS_GET_VALID_DIR_INODE,
+ * which replaces loc->parent (or loc->inode when no parent is set) with the
+ * inode kept in its context, and is then wound to the child.
+ * Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * 'held' stays NULL and the unref below does not run.
+ */
+int32_t
+ga_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            struct iatt *stbuf, int32_t valid,
+            dict_t *xdata)
+{
+        inode_t *held = NULL;
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, held, forward);
+
+forward:
+        STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid,
+                    xdata);
+
+        if (held != NULL)
+                inode_unref (held);
+
+        return 0;
+}
+
+/*
+ * ga_removexattr - removexattr fop.
+ *
+ * Resolves the location through GFID_ACCESS_GET_VALID_DIR_INODE (inode from
+ * the context of loc->parent, or of loc->inode when no parent is set) and
+ * winds the request to the child. Always returns 0.
+ *
+ * NOTE(review): the macro as written never assigns its third argument, so
+ * 'held' stays NULL and the unref below does not run.
+ */
+int32_t
+ga_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                const char *name, dict_t *xdata)
+{
+        inode_t *held = NULL;
+
+        GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, held, forward);
+
+forward:
+        STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->removexattr, loc, name, xdata);
+
+        if (held != NULL)
+                inode_unref (held);
+
+        return 0;
+}
+
+
+/*
+ * mem_acct_init - set up memory accounting for this translator.
+ *
+ * Returns -1 when 'this' is NULL, otherwise whatever
+ * xlator_mem_acct_init() returns; a non-zero result is logged as a warning.
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int status;
+
+        if (this == NULL)
+                return -1;
+
+        status = xlator_mem_acct_init (this, gf_gfid_access_mt_end + 1);
+        if (status != 0)
+                gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
+                        " init failed");
+
+        return status;
+}
+
+/*
+ * init - translator initialisation.
+ *
+ * Requires exactly one child. Allocates the private structure, the two
+ * argument pools and the frame-local pool, and publishes the private
+ * structure in this->private only once everything has succeeded.
+ *
+ * Returns 0 on success, -1 on failure. On failure everything allocated
+ * here is released again, including heal_args_pool, which the previous
+ * cleanup code leaked when the local pool could not be created.
+ */
+int32_t
+init (xlator_t *this)
+{
+        ga_private_t *priv = NULL;
+        int           ret  = -1;
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "not configured with exactly one child. exiting");
+                goto out;
+        }
+
+        /* This can be the top of graph in certain cases */
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "dangling volume. check volfile ");
+        }
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_gfid_access_mt_priv_t);
+        if (!priv)
+                goto out;
+
+        priv->newfile_args_pool = mem_pool_new (ga_newfile_args_t, 512);
+        if (!priv->newfile_args_pool)
+                goto out;
+
+        priv->heal_args_pool = mem_pool_new (ga_heal_args_t, 512);
+        if (!priv->heal_args_pool)
+                goto out;
+
+        this->local_pool = mem_pool_new (ga_local_t, 16);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to create local_t's memory pool");
+                goto out;
+        }
+
+        this->private = priv;
+
+        ret = 0;
+out:
+        /* undo partial setup; priv is zero-filled, so a pool that was
+           never created is still NULL here */
+        if (ret && priv) {
+                if (priv->newfile_args_pool)
+                        mem_pool_destroy (priv->newfile_args_pool);
+                if (priv->heal_args_pool)
+                        mem_pool_destroy (priv->heal_args_pool);
+                GF_FREE (priv);
+        }
+
+        return ret;
+}
+
+/*
+ * fini - translator teardown.
+ *
+ * Detaches the private structure from the translator and releases it
+ * together with the two argument pools it owns. Does nothing further when
+ * no private structure is attached.
+ */
+void
+fini (xlator_t *this)
+{
+        ga_private_t *conf = this->private;
+
+        this->private = NULL;
+
+        if (conf == NULL)
+                return;
+
+        if (conf->newfile_args_pool != NULL)
+                mem_pool_destroy (conf->newfile_args_pool);
+
+        if (conf->heal_args_pool != NULL)
+                mem_pool_destroy (conf->heal_args_pool);
+
+        GF_FREE (conf);
+}
+
+
+/* fop table: every operation this translator intercepts */
+struct xlator_fops fops = {
+        /* name resolution */
+        .lookup      = ga_lookup,
+
+        /* entry creation */
+        .create      = ga_create,
+        .mkdir       = ga_mkdir,
+        .mknod       = ga_mknod,
+        .symlink     = ga_symlink,
+        .link        = ga_link,
+
+        /* entry removal and renaming */
+        .unlink      = ga_unlink,
+        .rmdir       = ga_rmdir,
+        .rename      = ga_rename,
+
+        /* other operations that may be issued on a directory */
+        .opendir     = ga_opendir,
+        .stat        = ga_stat,
+        .setattr     = ga_setattr,
+        .getxattr    = ga_getxattr,
+        .removexattr = ga_removexattr,
+
+        /* special fop to handle more entry creations */
+        .setxattr    = ga_setxattr,
+};
+
+/* callback table: only the forget callback is provided */
+struct xlator_cbks cbks = {
+ .forget = ga_forget,
+};
+
+/* option table: holds nothing but the terminating entry */
+struct volume_options options[] = {
+ /* This translator neither takes nor provides any options */
+ { .key = {NULL} },
+};
diff --git a/xlators/features/gfid-access/src/gfid-access.h b/xlators/features/gfid-access/src/gfid-access.h
new file mode 100644
index 000000000..e883eca69
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access.h
@@ -0,0 +1,134 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __GFID_ACCESS_H__
+#define __GFID_ACCESS_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "defaults.h"
+#include "gfid-access-mem-types.h"
+
+/* length of a uuid in canonical text form, without the terminating NUL */
+#define UUID_CANONICAL_FORM_LEN 36
+
+/* NOTE(review): presumably the xattr keys that trigger the special
+ setxattr handling for entry creation and heal; confirm in ga_setxattr */
+#define GF_FUSE_AUX_GFID_NEWFILE "glusterfs.gfid.newfile"
+#define GF_FUSE_AUX_GFID_HEAL "glusterfs.gfid.heal"
+
+/* NOTE(review): users of GF_GFID_KEY and GF_AUX_GFID are not visible from
+ this header; confirm their meaning at the call sites */
+#define GF_GFID_KEY "GLUSTERFS_GFID"
+/* basename of the virtual directory, matched against loc->name under root */
+#define GF_GFID_DIR ".gfid"
+#define GF_AUX_GFID 0xd
+
+/*
+ * GFID_ACCESS_GET_VALID_DIR_INODE (x, l, unref, lbl)
+ *
+ * x     - the translator whose inode context is consulted
+ * l     - loc_t pointer that is modified in place
+ * unref - inode_t pointer variable of the caller (see the note below)
+ * lbl   - label jumped to when no context value is found
+ *
+ * If l->parent is set, its context value for 'x' is fetched; on success the
+ * value is used as an inode, referenced, and stored back into l->parent,
+ * and l->inode is not examined. If l->parent is not set but l->inode is,
+ * the same is done for l->inode. A failing inode_ctx_get() jumps to 'lbl'
+ * without touching 'l'.
+ *
+ * The body declares its own 'ret', 'value' and 'tmp_inode', so callers must
+ * not pass variables with those names.
+ *
+ * NOTE(review): 'unref' is never assigned in this body, so the callers'
+ * "if (unref) inode_unref (unref)" never runs, and the inode pointer that
+ * was previously in 'l' is overwritten without being released here.
+ * Confirm the intended reference ownership with the callers.
+ */
+#define GFID_ACCESS_GET_VALID_DIR_INODE(x,l,unref,lbl) do { \
+ int ret = 0; \
+ uint64_t value = 0; \
+ inode_t *tmp_inode = NULL; \
+ \
+ /* if its an entry operation, on the virtual */ \
+ /* directory inode as parent, we need to handle */ \
+ /* it properly */ \
+ if (l->parent) { \
+ ret = inode_ctx_get (l->parent, x, &value); \
+ if (ret) \
+ goto lbl; \
+ tmp_inode = (inode_t *)value; \
+ l->parent = inode_ref (tmp_inode); \
+ /* if parent is virtual, no need to handle */ \
+ /* loc->inode */ \
+ break; \
+ } \
+ \
+ /* if its an inode operation, on the virtual */ \
+ /* directory inode itself, we need to handle */ \
+ /* it properly */ \
+ if (l->inode) { \
+ ret = inode_ctx_get (l->inode, x, &value); \
+ if (ret) \
+ goto lbl; \
+ tmp_inode = (inode_t *)value; \
+ l->inode = inode_ref (tmp_inode); \
+ } \
+ \
+ } while (0)
+
+/*
+ * GFID_ACCESS_ENTRY_OP_CHECK (loc, err, lbl)
+ *
+ * loc - loc_t pointer of the entry being operated on
+ * err - integer variable that receives the errno
+ * lbl - label jumped to when the operation must be refused
+ *
+ * Refuses an entry operation in two cases:
+ *  - the entry is named GF_GFID_DIR and its parent (or pargfid) is the
+ *    root: err = EEXIST;
+ *  - the parent (or pargfid) is the gfid-access directory: err = EPERM.
+ * Otherwise control simply falls through.
+ *
+ * The arguments are expanded without parentheses, so pass plain
+ * identifiers only.
+ */
+#define GFID_ACCESS_ENTRY_OP_CHECK(loc,err,lbl) do { \
+ /* need to check if the lookup is on virtual dir */ \
+ if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && \
+ ((loc->parent && \
+ __is_root_gfid (loc->parent->gfid)) || \
+ __is_root_gfid (loc->pargfid))) { \
+ err = EEXIST; \
+ goto lbl; \
+ } \
+ \
+ /* now, check if the lookup() is on an existing */ \
+ /* entry, but on gfid-path */ \
+ if ((loc->parent && \
+ __is_gfid_access_dir (loc->parent->gfid)) || \
+ __is_gfid_access_dir (loc->pargfid)) { \
+ err = EPERM; \
+ goto lbl; \
+ } \
+ } while (0)
+
+
+/*
+ * Arguments for creating a new entry; objects of this type come from
+ * newfile_args_pool (see init). The structure and every member of the
+ * union are packed.
+ * NOTE(review): presumably filled from the GF_FUSE_AUX_GFID_NEWFILE xattr
+ * payload; confirm in ga_setxattr.
+ */
+typedef struct {
+ unsigned int uid;
+ unsigned int gid;
+ char gfid[UUID_CANONICAL_FORM_LEN + 1]; /* canonical uuid text + NUL */
+ unsigned int st_mode;
+ char *bname;
+
+ /* type-specific arguments; NOTE(review): presumably selected by
+ st_mode, confirm at the point of use */
+ union {
+ struct _symlink_in {
+ char *linkpath;
+ } __attribute__ ((__packed__)) symlink;
+
+ struct _mknod_in {
+ unsigned int mode;
+ unsigned int rdev;
+ unsigned int umask;
+ } __attribute__ ((__packed__)) mknod;
+
+ struct _mkdir_in {
+ unsigned int mode;
+ unsigned int umask;
+ } __attribute__ ((__packed__)) mkdir;
+ } __attribute__ ((__packed__)) args;
+} __attribute__((__packed__)) ga_newfile_args_t;
+
+/*
+ * Arguments for a heal request; objects of this type come from
+ * heal_args_pool (see init). Packed.
+ */
+typedef struct {
+ char gfid[UUID_CANONICAL_FORM_LEN + 1]; /* canonical uuid text + NUL */
+ char *bname; /* a null terminated basename */
+} __attribute__((__packed__)) ga_heal_args_t;
+
+/* per-translator private data, allocated in init() and freed in fini() */
+struct ga_private {
+ /* root inode's stbuf; returned as postparent for '/.gfid' lookups */
+ struct iatt root_stbuf;
+ /* stat buffer returned for the virtual '/.gfid' directory itself */
+ struct iatt gfiddir_stbuf;
+ /* pool of ga_newfile_args_t objects */
+ struct mem_pool *newfile_args_pool;
+ /* pool of ga_heal_args_t objects */
+ struct mem_pool *heal_args_pool;
+};
+typedef struct ga_private ga_private_t;
+
+/*
+ * Frame-local state; this->local_pool is created for this type in init().
+ * NOTE(review): the users of these fields are not visible here; the field
+ * names suggest the original frame and credentials are saved while an
+ * internal request is in flight. Confirm at the call sites.
+ */
+struct __ga_local {
+ call_frame_t *orig_frame;
+ unsigned int uid;
+ unsigned int gid;
+ loc_t loc;
+};
+typedef struct __ga_local ga_local_t;
+
+#endif /* __GFID_ACCESS_H__ */
diff --git a/xlators/features/glupy/Makefile.am b/xlators/features/glupy/Makefile.am
index a985f42a8..060429ecf 100644
--- a/xlators/features/glupy/Makefile.am
+++ b/xlators/features/glupy/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = src
+SUBDIRS = src examples
CLEANFILES =
diff --git a/xlators/features/glupy/examples/Makefile.am b/xlators/features/glupy/examples/Makefile.am
new file mode 100644
index 000000000..c26abeaaf
--- /dev/null
+++ b/xlators/features/glupy/examples/Makefile.am
@@ -0,0 +1,5 @@
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+glupyexamplesdir = $(xlatordir)/glupy
+
+glupyexamples_PYTHON = negative.py helloworld.py debug-trace.py
diff --git a/xlators/features/glupy/src/debug-trace.py b/xlators/features/glupy/examples/debug-trace.py
index 53e76546b..6eef1b58b 100644
--- a/xlators/features/glupy/src/debug-trace.py
+++ b/xlators/features/glupy/examples/debug-trace.py
@@ -2,12 +2,13 @@ import sys
import stat
from uuid import UUID
from time import strftime, localtime
-from gluster import *
+from gluster.glupy import *
+
# This translator was written primarily to test the fop entry point definitions
-# and structure definitions in 'gluster.py'.
-# It is similar to the debug-trace translator, one of the already available
-# translator types written in C, that logs the arguments passed to the fops and
-# their corresponding cbk functions.
+# and structure definitions in 'glupy.py'.
+
+# It is similar to the C language debug-trace translator, which logs the
+# arguments passed to the fops and their corresponding cbk functions.
dl.get_id.restype = c_long
dl.get_id.argtypes = [ POINTER(call_frame_t) ]
diff --git a/xlators/features/glupy/src/helloworld.py b/xlators/features/glupy/examples/helloworld.py
index 8fe403711..b565a4e5b 100644
--- a/xlators/features/glupy/src/helloworld.py
+++ b/xlators/features/glupy/examples/helloworld.py
@@ -1,5 +1,5 @@
import sys
-from gluster import *
+from gluster.glupy import *
class xlator (Translator):
diff --git a/xlators/features/glupy/src/negative.py b/xlators/features/glupy/examples/negative.py
index 1023602b9..e7a4fc07c 100644
--- a/xlators/features/glupy/src/negative.py
+++ b/xlators/features/glupy/examples/negative.py
@@ -1,6 +1,6 @@
import sys
from uuid import UUID
-from gluster import *
+from gluster.glupy import *
# Negative-lookup-caching example. If a file wasn't there the last time we
# looked, it's probably still not there. This translator keeps track of
@@ -89,4 +89,3 @@ class xlator (Translator):
dl.unwind_create(frame,cookie,this,op_ret,op_errno,fd,inode,buf,
preparent,postparent,xdata)
return 0
-
diff --git a/xlators/features/glupy/src/Makefile.am b/xlators/features/glupy/src/Makefile.am
index 960862839..ae7b6d14d 100644
--- a/xlators/features/glupy/src/Makefile.am
+++ b/xlators/features/glupy/src/Makefile.am
@@ -1,20 +1,21 @@
xlator_LTLIBRARIES = glupy.la
+# Ensure GLUSTER_PYTHON_PATH is passed to glupy.so
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-
glupydir = $(xlatordir)/glupy
+AM_CPPFLAGS = $(PYTHONDEV_CPPFLAGS) $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -isystem $(BUILD_PYTHON_INC)
+AM_CFLAGS = $(PYTHONDEV_CFLAGS) -Wall -fno-strict-aliasing -DGLUSTER_PYTHON_PATH=\"$(glupydir)\" $(GF_CFLAGS)
-glupy_PYTHON = gluster.py negative.py helloworld.py debug-trace.py
-
-glupy_la_LDFLAGS = -module -avoid-version -shared -nostartfiles
+# Flags to build glupy.so with
+glupy_la_LDFLAGS = $(PYTHONDEV_LDFLAGS) -module -avoid-version -shared -nostartfiles
glupy_la_SOURCES = glupy.c
glupy_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
-lpthread -l$(BUILD_PYTHON_LIB)
noinst_HEADERS = glupy.h
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -isystem $(BUILD_PYTHON_INC)
-
-AM_CFLAGS = -Wall -fno-strict-aliasing -DGLUSTER_PYTHON_PATH=\"$(glupydir)\" $(GF_CFLAGS)
+# Install glupy.py into the Python site-packages area
+pyglupydir = $(pythondir)/gluster
+pyglupy_PYTHON = glupy.py
CLEANFILES =
diff --git a/xlators/features/glupy/src/glupy.c b/xlators/features/glupy/src/glupy.c
index dc86c0071..7492124dd 100644
--- a/xlators/features/glupy/src/glupy.c
+++ b/xlators/features/glupy/src/glupy.c
@@ -2314,6 +2314,25 @@ get_rootunique (call_frame_t *frame)
}
int32_t
+mem_acct_init (xlator_t *this)
+{
+        int status;
+
+        /* nothing to account for without a translator */
+        if (this == NULL)
+                return -1;
+
+        status = xlator_mem_acct_init (this, gf_glupy_mt_end);
+        if (status != 0)
+                gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+                       " failed");
+
+        /* hand back whatever xlator_mem_acct_init() reported */
+        return status;
+}
+
+int32_t
init (xlator_t *this)
{
glupy_private_t *priv = NULL;
@@ -2365,7 +2384,7 @@ init (xlator_t *this)
goto *err_cleanup;
}
- gf_log (this->name, GF_LOG_ERROR, "py_mod_name = %s", module_name);
+ gf_log (this->name, GF_LOG_DEBUG, "py_mod_name = %s", module_name);
priv->py_module = PyImport_Import(py_mod_name);
Py_DECREF(py_mod_name);
if (!priv->py_module) {
@@ -2375,6 +2394,7 @@ init (xlator_t *this)
}
goto *err_cleanup;
}
+ gf_log (this->name, GF_LOG_INFO, "Import of %s succeeded", module_name);
err_cleanup = &&err_deref_module;
py_init_func = PyObject_GetAttrString(priv->py_module, "xlator");
@@ -2407,7 +2427,7 @@ init (xlator_t *this)
}
goto *err_cleanup;
}
- gf_log (this->name, GF_LOG_INFO, "init returned %p", priv->py_xlator);
+ gf_log (this->name, GF_LOG_DEBUG, "init returned %p", priv->py_xlator);
return 0;
diff --git a/xlators/features/glupy/src/gluster.py b/xlators/features/glupy/src/glupy.py
index a5daa77d3..a5daa77d3 100644
--- a/xlators/features/glupy/src/gluster.py
+++ b/xlators/features/glupy/src/glupy.py
diff --git a/xlators/features/glupy/src/setup.py.in b/xlators/features/glupy/src/setup.py.in
new file mode 100644
index 000000000..b9ee02c2b
--- /dev/null
+++ b/xlators/features/glupy/src/setup.py.in
@@ -0,0 +1,24 @@
+from distutils.core import setup
+
+DESC = """GlusterFS is a clustered file-system capable of scaling to
+several petabytes. It aggregates various storage bricks over Infiniband
+RDMA or TCP/IP interconnect into one large parallel network file system.
+GlusterFS is one of the most sophisticated file systems in terms of
+features and extensibility. It borrows a powerful concept called
+Translators from GNU Hurd kernel. Much of the code in GlusterFS is in
+user space and easily manageable.
+
+This package contains Glupy, the Python translator interface for GlusterFS."""
+
+setup(
+ name='glusterfs-glupy',
+ version='@PACKAGE_VERSION@',
+ description='Glupy is the Python translator interface for GlusterFS',
+ long_description=DESC,
+ author='Gluster Community',
+ author_email='gluster-devel@gluster.org',
+ license='LGPLv3',
+ url='http://gluster.org/',
+ package_dir={'gluster':''},
+ packages=['gluster']
+)
diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c
index 3245076ec..4b2eb2e29 100644
--- a/xlators/features/index/src/index.c
+++ b/xlators/features/index/src/index.c
@@ -15,6 +15,7 @@
#include "index.h"
#include "options.h"
#include "glusterfs3-xdr.h"
+#include "syscall.h"
#define XATTROP_SUBDIR "xattrop"
@@ -361,7 +362,7 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir)
index_get_index (priv, index);
make_index_path (priv->index_basepath, subdir,
index, index_path, sizeof (index_path));
- ret = link (index_path, gfid_path);
+ ret = sys_link (index_path, gfid_path);
if (!ret || (errno == EEXIST)) {
ret = 0;
goto out;
@@ -392,7 +393,7 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir)
if (fd >= 0)
close (fd);
- ret = link (index_path, gfid_path);
+ ret = sys_link (index_path, gfid_path);
if (ret && (errno != EEXIST)) {
gf_log (this->name, GF_LOG_ERROR, "%s: Not able to "
"add to index (%s)", uuid_utoa (gfid),
@@ -442,22 +443,16 @@ _check_key_is_zero_filled (dict_t *d, char *k, data_t *v,
return 0;
}
-
void
-_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
+_index_action (xlator_t *this, inode_t *inode, gf_boolean_t zero_xattr)
{
- gf_boolean_t zero_xattr = _gf_true;
+ int ret = 0;
index_inode_ctx_t *ctx = NULL;
- int ret = 0;
-
- ret = dict_foreach (xattr, _check_key_is_zero_filled, NULL);
- if (ret == -1)
- zero_xattr = _gf_false;
ret = index_inode_ctx_get (inode, this, &ctx);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Not able to %s %s -> index",
- zero_xattr?"add":"del", uuid_utoa (inode->gfid));
+ zero_xattr?"del":"add", uuid_utoa (inode->gfid));
goto out;
}
if (zero_xattr) {
@@ -478,6 +473,19 @@ out:
}
void
+_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
+{
+        gf_boolean_t all_zero = _gf_true;
+
+        /* a -1 result from the walk means the xattrs are not all
+           zero-filled */
+        if (dict_foreach (xattr, _check_key_is_zero_filled, NULL) == -1)
+                all_zero = _gf_false;
+
+        _index_action (this, inode, all_zero);
+}
+
+void
fop_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
{
_xattrop_index_action (this, inode, xattr);
@@ -489,13 +497,13 @@ fop_fxattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
_xattrop_index_action (this, inode, xattr);
}
-inline gf_boolean_t
+static inline gf_boolean_t
index_xattrop_track (loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict)
{
return (flags == GF_XATTROP_ADD_ARRAY);
}
-inline gf_boolean_t
+static inline gf_boolean_t
index_fxattrop_track (fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict)
{
return (flags == GF_XATTROP_ADD_ARRAY);
@@ -655,6 +663,11 @@ int
index_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
+ //In wind phase bring the gfid into index. This way if the brick crashes
+ //just after posix performs xattrop before _cbk reaches index xlator
+ //we will still have the gfid in index.
+ _index_action (this, frame->local, _gf_false);
+
STACK_WIND (frame, index_xattrop_cbk, FIRST_CHILD (this),
FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr,
xdata);
@@ -665,6 +678,10 @@ int
index_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
+ //In wind phase bring the gfid into index. This way if the brick crashes
+ //just after posix performs xattrop before _cbk reaches index xlator
+ //we will still have the gfid in index.
+ _index_action (this, frame->local, _gf_false);
STACK_WIND (frame, index_fxattrop_cbk, FIRST_CHILD (this),
FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr,
xdata);
@@ -721,6 +738,41 @@ out:
return 0;
}
+/*
+ * index_entry_count - count the entries of the index directory built from
+ * the base path and 'subdir'.
+ *
+ * ".", ".." and every name that starts with 'subdir' are not counted.
+ * Returns 0 when the directory cannot be opened.
+ */
+uint64_t
+index_entry_count (xlator_t *this, char *subdir)
+{
+        index_priv_t  *conf       = this->private;
+        char           dirpath[PATH_MAX];
+        DIR           *dir        = NULL;
+        struct dirent  scratch;
+        struct dirent *ent        = NULL;
+        const char    *name       = NULL;
+        size_t         prefix_len = 0;
+        uint64_t       total      = 0;
+
+        make_index_dir_path (conf->index_basepath, subdir,
+                             dirpath, sizeof (dirpath));
+
+        dir = opendir (dirpath);
+        if (dir == NULL)
+                return 0;
+
+        prefix_len = strlen (subdir);
+
+        for (;;) {
+                /* stop on a read error as well as at end of directory */
+                if (readdir_r (dir, &scratch, &ent) != 0 || ent == NULL)
+                        break;
+
+                name = ent->d_name;
+
+                if (strcmp (name, ".") == 0 || strcmp (name, "..") == 0)
+                        continue;
+
+                if (strncmp (name, subdir, prefix_len) == 0)
+                        continue;
+
+                total++;
+        }
+
+        closedir (dir);
+
+        return total;
+}
+
+
int32_t
index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
@@ -728,6 +780,7 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,
index_priv_t *priv = NULL;
dict_t *xattr = NULL;
int ret = 0;
+ uint64_t count = 0;
priv = this->private;
@@ -737,14 +790,26 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,
goto done;
}
- ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid,
- sizeof (priv->xattrop_vgfid));
- if (ret) {
- ret = -ENOMEM;
- gf_log (THIS->name, GF_LOG_ERROR, "xattrop index "
- "gfid set failed");
- goto done;
- }
+ if (strcmp (name, GF_XATTROP_INDEX_GFID) == 0) {
+ ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid,
+ sizeof (priv->xattrop_vgfid));
+ if (ret) {
+ ret = -ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR, "xattrop index "
+ "gfid set failed");
+ goto done;
+ }
+ } else if (strcmp (name, GF_XATTROP_INDEX_COUNT) == 0) {
+ count = index_entry_count (this, XATTROP_SUBDIR);
+
+ ret = dict_set_uint64 (xattr, (char *)name, count);
+ if (ret) {
+ ret = -ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR, "xattrop index "
+ "count set failed");
+ goto done;
+ }
+ }
done:
if (ret)
STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, xattr, xdata);
@@ -914,8 +979,12 @@ index_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
call_stub_t *stub = NULL;
+ index_priv_t *priv = NULL;
+
+ priv = this->private;
- if (!name || strcmp (GF_XATTROP_INDEX_GFID, name))
+ if (!name || (strcmp (GF_XATTROP_INDEX_GFID, name) &&
+ strcmp (GF_XATTROP_INDEX_COUNT, name)))
goto out;
stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name,
@@ -1078,12 +1147,18 @@ init (xlator_t *this)
INIT_LIST_HEAD (&priv->callstubs);
this->private = priv;
- ret = pthread_create (&thread, &w_attr, index_worker, this);
+
+ ret = index_dir_create (this, XATTROP_SUBDIR);
+ if (ret < 0)
+ goto out;
+
+ ret = gf_thread_create (&thread, &w_attr, index_worker, this);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "Failed to create "
"worker thread, aborting");
goto out;
}
+
ret = 0;
out:
if (ret) {
diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am
index 8908c1f52..0f79731b4 100644
--- a/xlators/features/locks/src/Makefile.am
+++ b/xlators/features/locks/src/Makefile.am
@@ -11,6 +11,7 @@ noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c
index 5790a99ce..75593b898 100644
--- a/xlators/features/locks/src/clear.c
+++ b/xlators/features/locks/src/clear.c
@@ -338,8 +338,8 @@ blkd:
elock->basename, ENTRYLK_LOCK, elock->type,
-1, EAGAIN);
STACK_UNWIND_STRICT (entrylk, elock->frame, -1, EAGAIN, NULL);
- GF_FREE ((char *) elock->basename);
- GF_FREE (elock);
+
+ __pl_entrylk_unref (elock);
}
if (!(args->kind & CLRLK_GRANTED)) {
@@ -362,13 +362,13 @@ granted:
gcount++;
list_del_init (&elock->domain_list);
list_add_tail (&elock->domain_list, &removed);
+
+ __pl_entrylk_unref (elock);
}
}
pthread_mutex_unlock (&pl_inode->mutex);
- list_for_each_entry_safe (elock, tmp, &removed, domain_list) {
- grant_blocked_entry_locks (this, pl_inode, elock, dom);
- }
+ grant_blocked_entry_locks (this, pl_inode, dom);
ret = 0;
out:
@@ -379,8 +379,8 @@ out:
int
clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode,
- clrlk_args *args, int *blkd, int *granted,
- int *op_errno)
+ clrlk_args *args, int *blkd, int *granted,
+ int *op_errno)
{
pl_dom_list_t *dom = NULL;
int ret = -1;
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
index 9c21bddb9..f6c71c1cf 100644
--- a/xlators/features/locks/src/common.c
+++ b/xlators/features/locks/src/common.c
@@ -35,6 +35,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock);
static int
pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode,
posix_lock_t *old_lock);
+
static pl_dom_list_t *
__allocate_domain (const char *volume)
{
@@ -75,8 +76,8 @@ get_domain (pl_inode_t *pl_inode, const char *volume)
{
pl_dom_list_t *dom = NULL;
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, pl_inode, out);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, volume, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", pl_inode, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", volume, out);
pthread_mutex_lock (&pl_inode->mutex);
{
@@ -92,9 +93,9 @@ get_domain (pl_inode_t *pl_inode, const char *volume)
unlock:
pthread_mutex_unlock (&pl_inode->mutex);
if (dom) {
- gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s found", volume);
+ gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s found", volume);
} else {
- gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s not found", volume);
+ gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume);
}
out:
return dom;
@@ -135,10 +136,10 @@ __pl_inode_is_empty (pl_inode_t *pl_inode)
void
pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame)
{
- snprintf (str, size, "Pid=%llu, lk-owner=%s, Transport=%p, Frame=%llu",
+ snprintf (str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu",
(unsigned long long) frame->root->pid,
lkowner_utoa (&frame->root->lk_owner),
- (void *)frame->root->trans,
+ frame->root->client,
(unsigned long long) frame->root->unique);
}
@@ -462,14 +463,14 @@ unlock:
/* Create a new posix_lock_t */
posix_lock_t *
-new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
+new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
gf_lkowner_t *owner, fd_t *fd)
{
posix_lock_t *lock = NULL;
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, flock, out);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, transport, out);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fd, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", flock, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", client, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", fd, out);
lock = GF_CALLOC (1, sizeof (posix_lock_t),
gf_locks_mt_posix_lock_t);
@@ -485,7 +486,7 @@ new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
else
lock->fl_end = flock->l_start + flock->l_len - 1;
- lock->transport = transport;
+ lock->client = client;
lock->fd_num = fd_to_fdnum (fd);
lock->fd = fd;
lock->client_pid = client_pid;
@@ -565,7 +566,7 @@ same_owner (posix_lock_t *l1, posix_lock_t *l2)
{
return (is_same_lkowner (&l1->owner, &l2->owner) &&
- (l1->transport == l2->transport));
+ (l1->client == l2->client));
}
@@ -694,7 +695,7 @@ subtract_locks (posix_lock_t *big, posix_lock_t *small)
}
GF_ASSERT (0);
- gf_log (POSIX_LOCKS, GF_LOG_ERROR, "Unexpected case in subtract_locks");
+ gf_log ("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks");
out:
if (v.locks[0]) {
@@ -812,7 +813,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock)
sum = add_locks (lock, conf);
sum->fl_type = lock->fl_type;
- sum->transport = lock->transport;
+ sum->client = lock->client;
sum->fd_num = lock->fd_num;
sum->client_pid = lock->client_pid;
sum->owner = lock->owner;
@@ -830,7 +831,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock)
sum = add_locks (lock, conf);
sum->fl_type = conf->fl_type;
- sum->transport = conf->transport;
+ sum->client = conf->client;
sum->fd_num = conf->fd_num;
sum->client_pid = conf->client_pid;
sum->owner = conf->owner;
@@ -988,7 +989,7 @@ pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode,
flock.l_len = old_lock->user_flock.l_len;
- unlock_lock = new_posix_lock (&flock, old_lock->transport,
+ unlock_lock = new_posix_lock (&flock, old_lock->client,
old_lock->client_pid, &old_lock->owner,
old_lock->fd);
GF_VALIDATE_OR_GOTO (this->name, unlock_lock, out);
@@ -1097,3 +1098,4 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock)
return conf;
}
+
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
index 94ef4b2b4..5ec630ee8 100644
--- a/xlators/features/locks/src/common.h
+++ b/xlators/features/locks/src/common.h
@@ -14,12 +14,14 @@
/*dump locks format strings */
#define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu"
#define ENTRY_FMT "type=%s on basename=%s"
-#define DUMP_GEN_FMT "pid = %llu, owner=%s, transport=%p, "
+#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p"
#define GRNTD_AT "granted at %s"
#define BLKD_AT "blocked at %s"
-#define DUMP_BLKD_FMT DUMP_GEN_FMT", "BLKD_AT
-#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "GRNTD_AT
-#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "BLKD_AT", "GRNTD_AT
+#define CONN_ID "connection-id=%s"
+#define DUMP_BLKD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT
+#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "GRNTD_AT
+#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT", "GRNTD_AT
+
#define ENTRY_BLKD_FMT ENTRY_FMT", "DUMP_BLKD_FMT
#define ENTRY_GRNTD_FMT ENTRY_FMT", "DUMP_GRNTD_FMT
#define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT", "DUMP_BLKD_GRNTD_FMT
@@ -29,8 +31,10 @@
#define RANGE_BLKD_GRNTD_FMT RANGE_FMT", "DUMP_BLKD_GRNTD_FMT
#define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = lock->client_pid)
+
+
posix_lock_t *
-new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
+new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
gf_lkowner_t *owner, fd_t *fd);
pl_inode_t *
@@ -63,7 +67,8 @@ pl_dom_list_t *
get_domain (pl_inode_t *pl_inode, const char *volume);
void
-grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom);
+grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
+ pl_dom_list_t *dom);
void
__delete_inode_lock (pl_inode_lock_t *lock);
@@ -73,14 +78,14 @@ __pl_inodelk_unref (pl_inode_lock_t *lock);
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_entry_lock_t *unlocked, pl_dom_list_t *dom);
+ pl_dom_list_t *dom);
void pl_update_refkeeper (xlator_t *this, inode_t *inode);
int32_t
-__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode);
+__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname);
int32_t
-get_inodelk_count (xlator_t *this, inode_t *inode);
+get_inodelk_count (xlator_t *this, inode_t *inode, char *domname);
int32_t
__get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode);
@@ -143,6 +148,11 @@ pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode,
posix_lock_t *lock, int can_block);
int
pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock);
+
uint32_t
check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename);
+
+void __pl_inodelk_unref (pl_inode_lock_t *lock);
+void __pl_entrylk_unref (pl_entry_lock_t *lock);
+
#endif /* __COMMON_H__ */
diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
index d934a8b94..8496d9d8d 100644
--- a/xlators/features/locks/src/entrylk.c
+++ b/xlators/features/locks/src/entrylk.c
@@ -23,11 +23,29 @@
#include "locks.h"
#include "common.h"
+
+void
+__pl_entrylk_unref (pl_entry_lock_t *lock)
+{
+ lock->ref--;
+ if (!lock->ref) {
+ GF_FREE ((char *)lock->basename);
+ GF_FREE (lock->connection_id);
+ GF_FREE (lock);
+ }
+}
+
+
+static void
+__pl_entrylk_ref (pl_entry_lock_t *lock)
+{
+ lock->ref++;
+}
+
+
static pl_entry_lock_t *
new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
- void *trans, pid_t client_pid, gf_lkowner_t *owner,
- const char *volume)
-
+ const char *domain, call_frame_t *frame, char *conn_id)
{
pl_entry_lock_t *newlock = NULL;
@@ -39,14 +57,22 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
newlock->basename = basename ? gf_strdup (basename) : NULL;
newlock->type = type;
- newlock->trans = trans;
- newlock->volume = volume;
- newlock->client_pid = client_pid;
- newlock->owner = *owner;
+ newlock->client = frame->root->client;
+ newlock->client_pid = frame->root->pid;
+ newlock->volume = domain;
+ newlock->owner = frame->root->lk_owner;
+ newlock->frame = frame;
+ newlock->this = frame->this;
+
+ if (conn_id) {
+ newlock->connection_id = gf_strdup (conn_id);
+ }
INIT_LIST_HEAD (&newlock->domain_list);
INIT_LIST_HEAD (&newlock->blocked_locks);
+ INIT_LIST_HEAD (&newlock->client_list);
+ __pl_entrylk_ref (newlock);
out:
return newlock;
}
@@ -77,42 +103,42 @@ __same_entrylk_owner (pl_entry_lock_t *l1, pl_entry_lock_t *l2)
{
return (is_same_lkowner (&l1->owner, &l2->owner) &&
- (l1->trans == l2->trans));
+ (l1->client == l2->client));
}
/**
- * lock_grantable - is this lock grantable?
+ * entrylk_grantable - is this lock grantable?
* @inode: inode in which to look
* @basename: name we're trying to lock
* @type: type of lock
*/
static pl_entry_lock_t *
-__lock_grantable (pl_dom_list_t *dom, const char *basename, entrylk_type type)
+__entrylk_grantable (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
- pl_entry_lock_t *lock = NULL;
+ pl_entry_lock_t *tmp = NULL;
if (list_empty (&dom->entrylk_list))
return NULL;
- list_for_each_entry (lock, &dom->entrylk_list, domain_list) {
- if (names_conflict (lock->basename, basename))
- return lock;
+ list_for_each_entry (tmp, &dom->entrylk_list, domain_list) {
+ if (names_conflict (tmp->basename, lock->basename))
+ return tmp;
}
return NULL;
}
static pl_entry_lock_t *
-__blocked_lock_conflict (pl_dom_list_t *dom, const char *basename, entrylk_type type)
+__blocked_entrylk_conflict (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
- pl_entry_lock_t *lock = NULL;
+ pl_entry_lock_t *tmp = NULL;
if (list_empty (&dom->blocked_entrylks))
return NULL;
- list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) {
- if (names_conflict (lock->basename, basename))
+ list_for_each_entry (tmp, &dom->blocked_entrylks, blocked_locks) {
+ if (names_conflict (tmp->basename, lock->basename))
return lock;
}
@@ -293,7 +319,7 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename)
}
/**
- * __lock_name - lock a name in a directory
+ * __lock_entrylk - lock a name in a directory
* @inode: inode for the directory in which to lock
* @basename: name of the entry to lock
* if null, lock the entire directory
@@ -304,86 +330,48 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename)
*/
int
-__lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type,
- call_frame_t *frame, pl_dom_list_t *dom, xlator_t *this, int nonblock)
+__lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,
+ int nonblock, pl_dom_list_t *dom)
{
- pl_entry_lock_t *lock = NULL;
- pl_entry_lock_t *conf = NULL;
- void *trans = NULL;
- pid_t client_pid = 0;
- int ret = -EINVAL;
+ pl_entry_lock_t *conf = NULL;
+ int ret = -EAGAIN;
- trans = frame->root->trans;
- client_pid = frame->root->pid;
-
- lock = new_entrylk_lock (pinode, basename, type, trans, client_pid,
- &frame->root->lk_owner, dom->domain);
- if (!lock) {
- ret = -ENOMEM;
- goto out;
- }
-
- lock->frame = frame;
- lock->this = this;
- lock->trans = trans;
-
- conf = __lock_grantable (dom, basename, type);
+ conf = __entrylk_grantable (dom, lock);
if (conf) {
ret = -EAGAIN;
- if (nonblock){
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
+ if (nonblock)
goto out;
- }
-
gettimeofday (&lock->blkd_time, NULL);
list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
gf_log (this->name, GF_LOG_TRACE,
"Blocking lock: {pinode=%p, basename=%s}",
- pinode, basename);
+ pinode, lock->basename);
goto out;
}
- if ( __blocked_lock_conflict (dom, basename, type) && !(__owner_has_lock (dom, lock))) {
+ if (__blocked_entrylk_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) {
ret = -EAGAIN;
- if (nonblock) {
- GF_FREE ((char *) lock->basename);
- GF_FREE (lock);
+ if (nonblock)
goto out;
- }
- lock->frame = frame;
- lock->this = this;
-
gettimeofday (&lock->blkd_time, NULL);
list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
- gf_log (this->name, GF_LOG_TRACE,
+ gf_log (this->name, GF_LOG_DEBUG,
"Lock is grantable, but blocking to prevent starvation");
gf_log (this->name, GF_LOG_TRACE,
"Blocking lock: {pinode=%p, basename=%s}",
- pinode, basename);
+ pinode, lock->basename);
- ret = -EAGAIN;
goto out;
}
- switch (type) {
-
- case ENTRYLK_WRLCK:
- gettimeofday (&lock->granted_time, NULL);
- list_add_tail (&lock->domain_list, &dom->entrylk_list);
- break;
- default:
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Invalid type for entrylk specified: %d", type);
- ret = -EINVAL;
- goto out;
- }
+ __pl_entrylk_ref (lock);
+ gettimeofday (&lock->granted_time, NULL);
+ list_add (&lock->domain_list, &dom->entrylk_list);
ret = 0;
out:
@@ -391,37 +379,36 @@ out:
}
/**
- * __unlock_name - unlock a name in a directory
+ * __unlock_entrylk - unlock a name in a directory
* @inode: inode for the directory to unlock in
* @basename: name of the entry to unlock
* if null, unlock the entire directory
*/
pl_entry_lock_t *
-__unlock_name (pl_dom_list_t *dom, const char *basename, entrylk_type type)
+__unlock_entrylk (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
- pl_entry_lock_t *lock = NULL;
+ pl_entry_lock_t *tmp = NULL;
pl_entry_lock_t *ret_lock = NULL;
- lock = __find_most_matching_lock (dom, basename);
+ tmp = __find_most_matching_lock (dom, lock->basename);
- if (!lock) {
- gf_log ("locks", GF_LOG_DEBUG,
+ if (!tmp) {
+ gf_log ("locks", GF_LOG_ERROR,
"unlock on %s (type=ENTRYLK_WRLCK) attempted but no matching lock found",
- basename);
+ lock->basename);
goto out;
}
- if (names_equal (lock->basename, basename)
- && lock->type == type) {
+ if (names_equal (tmp->basename, lock->basename)
+ && tmp->type == lock->type) {
+
+ list_del_init (&tmp->domain_list);
+ ret_lock = tmp;
- if (type == ENTRYLK_WRLCK) {
- list_del_init (&lock->domain_list);
- ret_lock = lock;
- }
} else {
- gf_log ("locks", GF_LOG_DEBUG,
- "Unlock for a non-existing lock!");
+ gf_log ("locks", GF_LOG_ERROR,
+ "Unlock on %s for a non-existing lock!", lock->basename);
goto out;
}
@@ -443,7 +430,7 @@ check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename)
pthread_mutex_lock (&pinode->mutex);
{
list_for_each_entry (dom, &pinode->dom_list, inode_list) {
- conf = __lock_grantable (dom, basename, ENTRYLK_WRLCK);
+ conf = __find_most_matching_lock (dom, basename);
if (conf && conf->basename) {
entrylk = 1;
break;
@@ -469,26 +456,14 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
INIT_LIST_HEAD (&blocked_list);
list_splice_init (&dom->blocked_entrylks, &blocked_list);
- list_for_each_entry_safe (bl, tmp, &blocked_list,
- blocked_locks) {
+ list_for_each_entry_safe (bl, tmp, &blocked_list, blocked_locks) {
list_del_init (&bl->blocked_locks);
-
- gf_log ("locks", GF_LOG_TRACE,
- "Trying to unblock: {pinode=%p, basename=%s}",
- pl_inode, bl->basename);
-
- bl_ret = __lock_name (pl_inode, bl->basename, bl->type,
- bl->frame, dom, bl->this, 0);
+ bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom);
if (bl_ret == 0) {
list_add (&bl->blocked_locks, granted);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "should never happen");
- GF_FREE ((char *)bl->basename);
- GF_FREE (bl);
}
}
return;
@@ -497,7 +472,7 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grants locks if possible which are blocked on a lock */
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_entry_lock_t *unlocked, pl_dom_list_t *dom)
+ pl_dom_list_t *dom)
{
struct list_head granted_list;
pl_entry_lock_t *tmp = NULL;
@@ -507,123 +482,56 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
pthread_mutex_lock (&pl_inode->mutex);
{
- __grant_blocked_entry_locks (this, pl_inode, dom, &granted_list);
+ __grant_blocked_entry_locks (this, pl_inode, dom,
+ &granted_list);
}
pthread_mutex_unlock (&pl_inode->mutex);
list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) {
- list_del_init (&lock->blocked_locks);
-
entrylk_trace_out (this, lock->frame, NULL, NULL, NULL,
lock->basename, ENTRYLK_LOCK, lock->type,
0, 0);
STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL);
+ lock->frame = NULL;
+ }
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
- }
-
- GF_FREE ((char *)unlocked->basename);
- GF_FREE (unlocked);
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) {
+ list_del_init (&lock->blocked_locks);
+ __pl_entrylk_unref (lock);
+ }
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
return;
}
-/**
- * release_entry_locks_for_transport: release all entry locks from this
- * transport for this loc_t
- */
-
-static int
-release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode,
- pl_dom_list_t *dom, void *trans)
-{
- pl_entry_lock_t *lock = NULL;
- pl_entry_lock_t *tmp = NULL;
- struct list_head granted;
- struct list_head released;
-
- INIT_LIST_HEAD (&granted);
- INIT_LIST_HEAD (&released);
-
- pthread_mutex_lock (&pinode->mutex);
- {
- list_for_each_entry_safe (lock, tmp, &dom->blocked_entrylks,
- blocked_locks) {
- if (lock->trans != trans)
- continue;
-
- list_del_init (&lock->blocked_locks);
-
- gf_log (this->name, GF_LOG_TRACE,
- "releasing lock on held by "
- "{transport=%p}",trans);
-
- list_add (&lock->blocked_locks, &released);
-
- }
-
- list_for_each_entry_safe (lock, tmp, &dom->entrylk_list,
- domain_list) {
- if (lock->trans != trans)
- continue;
-
- list_del_init (&lock->domain_list);
-
- gf_log (this->name, GF_LOG_TRACE,
- "releasing lock on held by "
- "{transport=%p}",trans);
-
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
- }
-
- __grant_blocked_entry_locks (this, pinode, dom, &granted);
-
- }
-
- pthread_mutex_unlock (&pinode->mutex);
-
- list_for_each_entry_safe (lock, tmp, &released, blocked_locks) {
- list_del_init (&lock->blocked_locks);
-
- STACK_UNWIND_STRICT (entrylk, lock->frame, -1, EAGAIN, NULL);
-
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
-
- }
-
- list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) {
- list_del_init (&lock->blocked_locks);
-
- STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL);
-
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
- }
-
- return 0;
-}
/* Common entrylk code called by pl_entrylk and pl_fentrylk */
int
pl_common_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, inode_t *inode, const char *basename,
- entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
-
- void * transport = NULL;
+ entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
- pl_inode_t * pinode = NULL;
+{
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
int ret = -1;
- pl_entry_lock_t *unlocked = NULL;
char unwind = 1;
+ GF_UNUSED int dict_ret = -1;
+ pl_inode_t *pinode = NULL;
+ pl_entry_lock_t *reqlock = NULL;
+ pl_entry_lock_t *unlocked = NULL;
+ pl_dom_list_t *dom = NULL;
+ char *conn_id = NULL;
+ pl_ctx_t *ctx = NULL;
+ int nonblock = 0;
- pl_dom_list_t *dom = NULL;
+ if (xdata)
+ dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
pinode = pl_inode_get (this, inode);
if (!pinode) {
@@ -631,6 +539,15 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
goto out;
}
+ if (frame->root->client) {
+ ctx = pl_ctx_get (frame->root->client, this);
+ if (!ctx) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed");
+ goto unwind;
+ }
+ }
+
dom = get_domain (pinode, volume);
if (!dom){
op_errno = ENOMEM;
@@ -639,73 +556,69 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
entrylk_trace_in (this, frame, volume, fd, loc, basename, cmd, type);
- transport = frame->root->trans;
-
- if (frame->root->lk_owner.len == 0) {
- /*
- this is a special case that means release
- all locks from this transport
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "Releasing locks for transport %p", transport);
-
- release_entry_locks_for_transport (this, pinode, dom, transport);
- op_ret = 0;
-
- goto out;
+ reqlock = new_entrylk_lock (pinode, basename, type, dom->domain, frame,
+ conn_id);
+ if (!reqlock) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
}
switch (cmd) {
- case ENTRYLK_LOCK:
- pthread_mutex_lock (&pinode->mutex);
- {
- ret = __lock_name (pinode, basename, type,
- frame, dom, this, 0);
- }
- pthread_mutex_unlock (&pinode->mutex);
-
- op_errno = -ret;
- if (ret < 0) {
- if (ret == -EAGAIN)
- unwind = 0;
- else
- unwind = 1;
- goto out;
- } else {
- op_ret = 0;
- op_errno = 0;
- unwind = 1;
- goto out;
- }
-
- break;
-
case ENTRYLK_LOCK_NB:
- unwind = 1;
+ nonblock = 1;
+ /* fall through */
+ case ENTRYLK_LOCK:
+ if (ctx)
+ pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pinode->mutex);
{
- ret = __lock_name (pinode, basename, type,
- frame, dom, this, 1);
+ reqlock->pinode = pinode;
+
+ ret = __lock_entrylk (this, pinode, reqlock, nonblock, dom);
+ if (ret == 0) {
+ reqlock->frame = NULL;
+ op_ret = 0;
+ } else {
+ op_errno = -ret;
+ }
+
+ if (ctx && (!ret || !nonblock))
+ list_add (&reqlock->client_list,
+ &ctx->entrylk_lockers);
+
+ if (ret == -EAGAIN && !nonblock) {
+ /* blocked */
+ unwind = 0;
+ } else {
+ __pl_entrylk_unref (reqlock);
+ }
}
pthread_mutex_unlock (&pinode->mutex);
-
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- break;
+ if (ctx)
+ pthread_mutex_unlock (&ctx->lock);
+ break;
case ENTRYLK_UNLOCK:
+ if (ctx)
+ pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pinode->mutex);
{
- unlocked = __unlock_name (dom, basename, type);
+ unlocked = __unlock_entrylk (dom, reqlock);
+ if (unlocked) {
+ list_del_init (&unlocked->client_list);
+ __pl_entrylk_unref (unlocked);
+ op_ret = 0;
+ } else {
+ op_errno = EINVAL;
+ }
+ __pl_entrylk_unref (reqlock);
}
pthread_mutex_unlock (&pinode->mutex);
+ if (ctx)
+ pthread_mutex_unlock (&ctx->lock);
- if (unlocked)
- grant_blocked_entry_locks (this, pinode, unlocked, dom);
+ grant_blocked_entry_locks (this, pinode, dom);
break;
@@ -715,21 +628,19 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
"a bug report at http://bugs.gluster.com", cmd);
goto out;
}
-
- op_ret = 0;
out:
pl_update_refkeeper (this, inode);
+
if (unwind) {
entrylk_trace_out (this, frame, volume, fd, loc, basename,
cmd, type, op_ret, op_errno);
-
+unwind:
STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, NULL);
} else {
entrylk_trace_block (this, frame, volume, fd, loc, basename,
cmd, type);
}
-
return 0;
}
@@ -742,10 +653,10 @@ out:
int
pl_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
-
- pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, type, loc, NULL);
+ pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd,
+ type, loc, NULL, xdata);
return 0;
}
@@ -760,10 +671,136 @@ pl_entrylk (call_frame_t *frame, xlator_t *this,
int
pl_fentrylk (call_frame_t *frame, xlator_t *this,
const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd,
+ type, NULL, fd, xdata);
+
+ return 0;
+}
+
+
+static void
+pl_entrylk_log_cleanup (pl_entry_lock_t *lock)
+{
+ pl_inode_t *pinode = NULL;
+ char *path = NULL;
+ char *file = NULL;
+
+ pinode = lock->pinode;
+
+ inode_path (pinode->refkeeper, NULL, &path);
+
+ if (path)
+ file = path;
+ else
+ file = uuid_utoa (pinode->refkeeper->gfid);
+
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "releasing lock on %s held by "
+ "{client=%p, pid=%"PRId64" lk-owner=%s}",
+ file, lock->client, (uint64_t) lock->client_pid,
+ lkowner_utoa (&lock->owner));
+ GF_FREE (path);
+}
+
+
+/* Release all entrylks from this client */
+int
+pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
{
+ pl_entry_lock_t *tmp = NULL;
+ pl_entry_lock_t *l = NULL;
+ pl_dom_list_t *dom = NULL;
+ pl_inode_t *pinode = NULL;
+
+ struct list_head released;
+ struct list_head unwind;
+
+ INIT_LIST_HEAD (&released);
+ INIT_LIST_HEAD (&unwind);
+
+ pthread_mutex_lock (&ctx->lock);
+ {
+ list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers,
+ client_list) {
+ list_del_init (&l->client_list);
+
+ pl_entrylk_log_cleanup (l);
+
+ pinode = l->pinode;
+
+ pthread_mutex_lock (&pinode->mutex);
+ {
+ /* If the entrylk object is part of granted list but not
+ * blocked list, then perform the following actions:
+ * i. delete the object from granted list;
+ * ii. grant other locks (from other clients) that may
+ * have been blocked on this entrylk; and
+ * iii. unref the object.
+ *
+ * If the entrylk object (L1) is part of both granted
+ * and blocked lists, then this means that a parallel
+ * unlock on another entrylk (L2 say) may have 'granted'
+ * L1 and added it to 'granted' list in
+ * __grant_blocked_entry_locks() (although using the
+ * 'blocked_locks' member). In that case, the cleanup
+ * codepath must try and grant other overlapping
+ * blocked entrylks from other clients, now that L1 is
+ * out of their way and then unref L1 in the end, and
+ * leave it to the other thread (the one executing
+ * unlock codepath) to unwind L1's frame, delete it from
+ * blocked_locks list, and perform the last unref on L1.
+ *
+ * If the entrylk object (L1) is part of blocked list
+ * only, the cleanup code path must:
+ * i. delete it from the blocked_locks list inside
+ * this critical section,
+ * ii. unwind its frame with EAGAIN,
+ * iii. try and grant blocked entry locks from other
+ * clients that were otherwise grantable, but were
+ * blocked to avoid leaving L1 to starve forever.
+ * iv. unref the object.
+ */
+ if (!list_empty (&l->domain_list)) {
+ list_del_init (&l->domain_list);
+ list_add_tail (&l->client_list,
+ &released);
+ } else {
+ list_del_init (&l->blocked_locks);
+ list_add_tail (&l->client_list,
+ &unwind);
+ }
+ }
+ pthread_mutex_unlock (&pinode->mutex);
+ }
+ }
+ pthread_mutex_unlock (&ctx->lock);
+
+ list_for_each_entry_safe (l, tmp, &unwind, client_list) {
+ list_del_init (&l->client_list);
+
+ if (l->frame)
+ STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN,
+ NULL);
+ list_add_tail (&l->client_list, &released);
+ }
- pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, type, NULL, fd);
+ list_for_each_entry_safe (l, tmp, &released, client_list) {
+ list_del_init (&l->client_list);
+
+ pinode = l->pinode;
+
+ dom = get_domain (pinode, l->volume);
+
+ grant_blocked_entry_locks (this, pinode, dom);
+
+ pthread_mutex_lock (&pinode->mutex);
+ {
+ __pl_entrylk_unref (l);
+ }
+ pthread_mutex_unlock (&pinode->mutex);
+ }
return 0;
}
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
index 42350e59a..c76cb7f91 100644
--- a/xlators/features/locks/src/inodelk.c
+++ b/xlators/features/locks/src/inodelk.c
@@ -26,7 +26,7 @@
inline void
__delete_inode_lock (pl_inode_lock_t *lock)
{
- list_del (&lock->list);
+ list_del_init (&lock->list);
}
static inline void
@@ -39,8 +39,10 @@ inline void
__pl_inodelk_unref (pl_inode_lock_t *lock)
{
lock->ref--;
- if (!lock->ref)
+ if (!lock->ref) {
+ GF_FREE (lock->connection_id);
GF_FREE (lock);
+ }
}
/* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't conflict */
@@ -122,7 +124,7 @@ static inline int
same_inodelk_owner (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
{
return (is_same_lkowner (&l1->owner, &l2->owner) &&
- (l1->transport == l2->transport));
+ (l1->client == l2->client));
}
/* Returns true if the 2 inodelks conflict with each other */
@@ -202,7 +204,7 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
int ret = -EINVAL;
conf = __inodelk_grantable (dom, lock);
- if (conf){
+ if (conf) {
ret = -EAGAIN;
if (can_block == 0)
goto out;
@@ -230,7 +232,7 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
gettimeofday (&lock->blkd_time, NULL);
list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
- gf_log (this->name, GF_LOG_TRACE,
+ gf_log (this->name, GF_LOG_DEBUG,
"Lock is grantable, but blocking to prevent starvation");
gf_log (this->name, GF_LOG_TRACE,
"%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked",
@@ -292,7 +294,7 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom)
" Matching lock not found for unlock %llu-%llu, by %s "
"on %p", (unsigned long long)lock->fl_start,
(unsigned long long)lock->fl_end,
- lkowner_utoa (&lock->owner), lock->transport);
+ lkowner_utoa (&lock->owner), lock->client);
goto out;
}
__delete_inode_lock (conf);
@@ -300,11 +302,13 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom)
" Matching lock found for unlock %llu-%llu, by %s on %p",
(unsigned long long)lock->fl_start,
(unsigned long long)lock->fl_end, lkowner_utoa (&lock->owner),
- lock->transport);
+ lock->client);
out:
return conf;
}
+
+
static void
__grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
struct list_head *granted, pl_dom_list_t *dom)
@@ -333,7 +337,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grant all inodelks blocked on a lock */
void
-grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom)
+grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
+ pl_dom_list_t *dom)
{
struct list_head granted;
pl_inode_lock_t *lock;
@@ -360,6 +365,7 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *
&lock->user_flock, 0, 0, lock->volume);
STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL);
+ lock->frame = NULL;
}
pthread_mutex_lock (&pl_inode->mutex);
@@ -372,108 +378,153 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *
pthread_mutex_unlock (&pl_inode->mutex);
}
-/* Release all inodelks from this transport */
-static int
-release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom,
- inode_t *inode, void *trans)
-{
- pl_inode_lock_t *tmp = NULL;
- pl_inode_lock_t *l = NULL;
-
- pl_inode_t * pinode = NULL;
-
- struct list_head released;
+static void
+pl_inodelk_log_cleanup (pl_inode_lock_t *lock)
+{
+ pl_inode_t *pl_inode = NULL;
char *path = NULL;
char *file = NULL;
- INIT_LIST_HEAD (&released);
+ pl_inode = lock->pl_inode;
- pinode = pl_inode_get (this, inode);
+ inode_path (pl_inode->refkeeper, NULL, &path);
- pthread_mutex_lock (&pinode->mutex);
- {
+ if (path)
+ file = path;
+ else
+ file = uuid_utoa (pl_inode->refkeeper->gfid);
- list_for_each_entry_safe (l, tmp, &dom->blocked_inodelks, blocked_locks) {
- if (l->transport != trans)
- continue;
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "releasing lock on %s held by "
+ "{client=%p, pid=%"PRId64" lk-owner=%s}",
+ file, lock->client, (uint64_t) lock->client_pid,
+ lkowner_utoa (&lock->owner));
+ GF_FREE (path);
+}
- list_del_init (&l->blocked_locks);
- inode_path (inode, NULL, &path);
- if (path)
- file = path;
- else
- file = uuid_utoa (inode->gfid);
+/* Release all inodelks from this client */
+int
+pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
+{
+ pl_inode_lock_t *tmp = NULL;
+ pl_inode_lock_t *l = NULL;
+ pl_dom_list_t *dom = NULL;
+ pl_inode_t *pl_inode = NULL;
- gf_log (this->name, GF_LOG_DEBUG,
- "releasing blocking lock on %s held by "
- "{transport=%p, pid=%"PRId64" lk-owner=%s}",
- file, trans, (uint64_t) l->client_pid,
- lkowner_utoa (&l->owner));
+ struct list_head released;
+ struct list_head unwind;
- list_add (&l->blocked_locks, &released);
- if (path) {
- GF_FREE (path);
- path = NULL;
+ INIT_LIST_HEAD (&released);
+ INIT_LIST_HEAD (&unwind);
+
+ pthread_mutex_lock (&ctx->lock);
+ {
+ list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,
+ client_list) {
+ list_del_init (&l->client_list);
+
+ pl_inodelk_log_cleanup (l);
+
+ pl_inode = l->pl_inode;
+
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ /* If the inodelk object is part of granted list but not
+ * blocked list, then perform the following actions:
+ * i. delete the object from granted list;
+ * ii. grant other locks (from other clients) that may
+ * have been blocked on this inodelk; and
+ * iii. unref the object.
+ *
+ * If the inodelk object (L1) is part of both granted
+ * and blocked lists, then this means that a parallel
+ * unlock on another inodelk (L2 say) may have 'granted'
+ * L1 and added it to 'granted' list in
+ * __grant_blocked_inode_locks() (although using the
+ * 'blocked_locks' member). In that case, the cleanup
+ * codepath must try and grant other overlapping
+ * blocked inodelks from other clients, now that L1 is
+ * out of their way and then unref L1 in the end, and
+ * leave it to the other thread (the one executing
+ * unlock codepath) to unwind L1's frame, delete it from
+ * blocked_locks list, and perform the last unref on L1.
+ *
+ * If the inodelk object (L1) is part of blocked list
+ * only, the cleanup code path must:
+ * i. delete it from the blocked_locks list inside
+ * this critical section,
+ * ii. unwind its frame with EAGAIN,
+ * iii. try and grant blocked inode locks from other
+ * clients that were otherwise grantable, but just
+ * got blocked to avoid leaving L1 to starve
+ * forever.
+ * iv. unref the object.
+ */
+ if (!list_empty (&l->list)) {
+ __delete_inode_lock (l);
+ list_add_tail (&l->client_list,
+ &released);
+ } else {
+ list_del_init(&l->blocked_locks);
+ list_add_tail (&l->client_list,
+ &unwind);
+ }
}
+ pthread_mutex_unlock (&pl_inode->mutex);
}
+ }
+ pthread_mutex_unlock (&ctx->lock);
- list_for_each_entry_safe (l, tmp, &dom->inodelk_list, list) {
- if (l->transport != trans)
- continue;
-
- inode_path (inode, NULL, &path);
- if (path)
- file = path;
- else
- file = uuid_utoa (inode->gfid);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "releasing granted lock on %s held by "
- "{transport=%p, pid=%"PRId64" lk-owner=%s}",
- file, trans, (uint64_t) l->client_pid,
- lkowner_utoa (&l->owner));
-
- if (path) {
- GF_FREE (path);
- path = NULL;
- }
+ list_for_each_entry_safe (l, tmp, &unwind, client_list) {
+ list_del_init (&l->client_list);
+
+ if (l->frame)
+ STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN,
+ NULL);
+ list_add_tail (&l->client_list, &released);
- __delete_inode_lock (l);
- __pl_inodelk_unref (l);
- }
}
- GF_FREE (path);
- pthread_mutex_unlock (&pinode->mutex);
+ list_for_each_entry_safe (l, tmp, &released, client_list) {
+ list_del_init (&l->client_list);
+
+ pl_inode = l->pl_inode;
+
+ dom = get_domain (pl_inode, l->volume);
- list_for_each_entry_safe (l, tmp, &released, blocked_locks) {
- list_del_init (&l->blocked_locks);
+ grant_blocked_inode_locks (this, pl_inode, dom);
- STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, NULL);
- //No need to take lock as the locks are only in one list
- __pl_inodelk_unref (l);
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ __pl_inodelk_unref (l);
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
}
- grant_blocked_inode_locks (this, pinode, dom);
return 0;
}
static int
-pl_inode_setlk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
- int can_block, pl_dom_list_t *dom)
+pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom)
{
int ret = -EINVAL;
pl_inode_lock_t *retlock = NULL;
gf_boolean_t unref = _gf_true;
+ lock->pl_inode = pl_inode;
+
+ if (ctx)
+ pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pl_inode->mutex);
{
if (lock->fl_type != F_UNLCK) {
ret = __lock_inodelk (this, pl_inode, lock, can_block, dom);
if (ret == 0) {
+ lock->frame = NULL;
gf_log (this->name, GF_LOG_TRACE,
"%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => OK",
lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
@@ -492,6 +543,10 @@ pl_inode_setlk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
if (can_block)
unref = _gf_false;
}
+
+ if (ctx && (!ret || can_block))
+ list_add_tail (&lock->client_list,
+ &ctx->inodelk_lockers);
} else {
retlock = __inode_unlock_lock (this, lock, dom);
if (!retlock) {
@@ -500,23 +555,29 @@ pl_inode_setlk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
ret = -EINVAL;
goto out;
}
- __pl_inodelk_unref (retlock);
+ list_del_init (&retlock->client_list);
+ __pl_inodelk_unref (retlock);
ret = 0;
}
- }
out:
- if (unref)
- __pl_inodelk_unref (lock);
+ if (unref)
+ __pl_inodelk_unref (lock);
+ }
pthread_mutex_unlock (&pl_inode->mutex);
+ if (ctx)
+ pthread_mutex_unlock (&ctx->lock);
+
grant_blocked_inode_locks (this, pl_inode, dom);
+
return ret;
}
/* Create a new inode_lock_t */
pl_inode_lock_t *
-new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
- gf_lkowner_t *owner, const char *volume)
+new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
+ call_frame_t *frame, xlator_t *this, const char *volume,
+ char *conn_id)
{
pl_inode_lock_t *lock = NULL;
@@ -535,33 +596,77 @@ new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
else
lock->fl_end = flock->l_start + flock->l_len - 1;
- lock->transport = transport;
+ lock->client = client;
lock->client_pid = client_pid;
lock->volume = volume;
- lock->owner = *owner;
+ lock->owner = frame->root->lk_owner;
+ lock->frame = frame;
+ lock->this = this;
+
+ if (conn_id) {
+ lock->connection_id = gf_strdup (conn_id);
+ }
INIT_LIST_HEAD (&lock->list);
INIT_LIST_HEAD (&lock->blocked_locks);
+ INIT_LIST_HEAD (&lock->client_list);
__pl_inodelk_ref (lock);
return lock;
}
+/*
+ * Produce the ":metadata" flavour of a lock-domain name.
+ *
+ * @volume: domain name as received with the request.
+ * @res:    on success receives a string built with gf_asprintf(); it is
+ *          left untouched when @volume already ends in ":metadata".
+ *
+ * Returns 0 on success (including the "nothing to do" case) and ENOMEM
+ * when gf_asprintf() reports a non-positive length.
+ */
+int32_t
+_pl_convert_volume (const char *volume, char **res)
+{
+        const char *suffix = strrchr (volume, ':');
+
+        /* the name already carries the suffix: no new string is needed */
+        if (suffix != NULL && !strcmp (suffix, ":metadata"))
+                return 0;
+
+        if (gf_asprintf (res, "%s:metadata", volume) <= 0)
+                return ENOMEM;
+
+        return 0;
+}
+
+/*
+ * Remap the domain name only for the special range: a lock that starts at
+ * LLONG_MAX - 1 and has length 0.  Any other range leaves @res untouched
+ * and yields 0; otherwise the result of _pl_convert_volume() is returned.
+ */
+int32_t
+_pl_convert_volume_for_special_range (struct gf_flock *flock,
+                                      const char *volume, char **res)
+{
+        if (flock->l_start != LLONG_MAX - 1 || flock->l_len != 0)
+                return 0;
+
+        return _pl_convert_volume (volume, res);
+}
+
/* Common inodelk code called from pl_inodelk and pl_finodelk */
int
pl_common_inodelk (call_frame_t *frame, xlator_t *this,
const char *volume, inode_t *inode, int32_t cmd,
- struct gf_flock *flock, loc_t *loc, fd_t *fd)
+ struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata)
{
int32_t op_ret = -1;
int32_t op_errno = 0;
int ret = -1;
+ GF_UNUSED int dict_ret = -1;
int can_block = 0;
- pid_t client_pid = -1;
- void * transport = NULL;
pl_inode_t * pinode = NULL;
pl_inode_lock_t * reqlock = NULL;
pl_dom_list_t * dom = NULL;
+ char *res = NULL;
+ char *res1 = NULL;
+ char *conn_id = NULL;
+ pl_ctx_t *ctx = NULL;
+
+ if (xdata)
+ dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (inode, unwind);
@@ -572,10 +677,22 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
+ op_errno = _pl_convert_volume_for_special_range (flock, volume, &res);
+ if (op_errno)
+ goto unwind;
+ if (res)
+ volume = res;
+
pl_trace_in (this, frame, fd, loc, cmd, flock, volume);
- transport = frame->root->trans;
- client_pid = frame->root->pid;
+ if (frame->root->client) {
+ ctx = pl_ctx_get (frame->root->client, this);
+ if (!ctx) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed");
+ goto unwind;
+ }
+ }
pinode = pl_inode_get (this, inode);
if (!pinode) {
@@ -589,22 +706,8 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
- if (frame->root->lk_owner.len == 0) {
- /*
- special case: this means release all locks
- from this transport
- */
- gf_log (this->name, GF_LOG_TRACE,
- "Releasing all locks from transport %p", transport);
-
- release_inode_locks_of_transport (this, dom, inode, transport);
-
- op_ret = 0;
- goto unwind;
- }
-
- reqlock = new_inode_lock (flock, transport, client_pid,
- &frame->root->lk_owner, volume);
+ reqlock = new_inode_lock (flock, frame->root->client, frame->root->pid,
+ frame, this, volume, conn_id);
if (!reqlock) {
op_ret = -1;
@@ -612,21 +715,17 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
- reqlock->frame = frame;
- reqlock->this = this;
switch (cmd) {
case F_SETLKW:
can_block = 1;
- reqlock->frame = frame;
- reqlock->this = this;
/* fall through */
case F_SETLK:
memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock));
- ret = pl_inode_setlk (this, pinode, reqlock,
- can_block, dom);
+ ret = pl_inode_setlk (this, ctx, pinode, reqlock, can_block,
+ dom);
if (ret < 0) {
if ((can_block) && (F_UNLCK != flock->l_type)) {
@@ -659,53 +758,78 @@ unwind:
STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, NULL);
out:
+ GF_FREE (res);
+ GF_FREE (res1);
return 0;
}
+/* Entry point for inodelk requests that name the file through @loc: passes
+ * loc->inode, @loc and a NULL fd to pl_common_inodelk and forwards @xdata
+ * unchanged.  Always returns 0; the result of pl_common_inodelk is not
+ * inspected here. */
int
pl_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock)
+ const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock,
+ dict_t *xdata)
{
-
- pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, loc, NULL);
+ pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock,
+ loc, NULL, xdata);
return 0;
}
+/* Entry point for inodelk requests that name the file through @fd: passes
+ * fd->inode, a NULL loc and @fd to pl_common_inodelk and forwards @xdata
+ * unchanged.  Always returns 0; the result of pl_common_inodelk is not
+ * inspected here. */
int
pl_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock)
+ const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock,
+ dict_t *xdata)
{
-
- pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, NULL, fd);
+ pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock,
+ NULL, fd, xdata);
return 0;
}
+/* Number of inodelks queued on one domain: the entries of its inodelk_list
+ * plus the entries of its blocked_inodelks list. */
+static inline int32_t
+__get_inodelk_dom_count (pl_dom_list_t *dom)
+{
+        pl_inode_lock_t *cursor  = NULL;
+        int32_t          active  = 0;
+        int32_t          waiting = 0;
+
+        list_for_each_entry (cursor, &dom->inodelk_list, list)
+                active++;
+
+        list_for_each_entry (cursor, &dom->blocked_inodelks, blocked_locks)
+                waiting++;
+
+        return active + waiting;
+}
+/* Returns the number of inodelks (blocked and granted) present on @pl_inode.
+ * If @domname is NULL, the locks of every domain on the inode are summed.
+ * If @domname is non-NULL, only the first domain whose name compares equal
+ * is counted and the walk stops there; when no domain matches, 0 is
+ * returned.
+ * get_inodelk_count() calls this with pl_inode->mutex held; presumably every
+ * caller is expected to do the same - TODO confirm. */
int32_t
-__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode)
+__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname)
{
int32_t count = 0;
- pl_inode_lock_t *lock = NULL;
pl_dom_list_t *dom = NULL;
list_for_each_entry (dom, &pl_inode->dom_list, inode_list) {
- list_for_each_entry (lock, &dom->inodelk_list, list) {
- count++;
- }
- list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) {
- count++;
- }
+ if (domname) {
+ if (strcmp (domname, dom->domain) == 0) {
+ count = __get_inodelk_dom_count (dom);
+ goto out;
+ }
+
+ } else {
+ /* Counting locks from all domains */
+ count += __get_inodelk_dom_count (dom);
+ }
}
+out:
return count;
}
int32_t
-get_inodelk_count (xlator_t *this, inode_t *inode)
+get_inodelk_count (xlator_t *this, inode_t *inode, char *domname)
{
pl_inode_t *pl_inode = NULL;
uint64_t tmp_pl_inode = 0;
@@ -721,7 +845,7 @@ get_inodelk_count (xlator_t *this, inode_t *inode)
pthread_mutex_lock (&pl_inode->mutex);
{
- count = __get_inodelk_count (this, pl_inode);
+ count = __get_inodelk_count (this, pl_inode, domname);
}
pthread_mutex_unlock (&pl_inode->mutex);
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index 7ffc67e1b..8c2a6f867 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -19,10 +19,10 @@
#include "stack.h"
#include "call-stub.h"
#include "locks-mem-types.h"
+#include "client_t.h"
#include "lkowner.h"
-#define POSIX_LOCKS "posix-locks"
struct __pl_fd;
struct __posix_lock {
@@ -33,7 +33,7 @@ struct __posix_lock {
off_t fl_end;
short blocked; /* waiting to acquire */
- struct gf_flock user_flock; /* the flock supplied by the user */
+ struct gf_flock user_flock; /* the flock supplied by the user */
xlator_t *this; /* required for blocked locks */
unsigned long fd_num;
@@ -46,7 +46,7 @@ struct __posix_lock {
/* These two together serve to uniquely identify each process
across nodes */
- void *transport; /* to identify client node */
+ void *client; /* to identify client node */
gf_lkowner_t owner;
pid_t client_pid; /* pid of client process */
};
@@ -63,9 +63,9 @@ struct __pl_inode_lock {
const char *volume;
- struct gf_flock user_flock; /* the flock supplied by the user */
+ struct gf_flock user_flock; /* the flock supplied by the user */
xlator_t *this; /* required for blocked locks */
- fd_t *fd;
+ struct __pl_inode *pl_inode;
call_frame_t *frame;
@@ -75,9 +75,13 @@ struct __pl_inode_lock {
/* These two together serve to uniquely identify each process
across nodes */
- void *transport; /* to identify client node */
+ void *client; /* to identify client node */
gf_lkowner_t owner;
pid_t client_pid; /* pid of client process */
+
+ char *connection_id; /* stores the client connection id */
+
+ struct list_head client_list; /* list of all locks from a client */
};
typedef struct __pl_inode_lock pl_inode_lock_t;
@@ -101,9 +105,11 @@ typedef struct __pl_dom_list_t pl_dom_list_t;
struct __entry_lock {
struct list_head domain_list; /* list_head back to pl_dom_list_t */
struct list_head blocked_locks; /* list_head back to blocked_entrylks */
+ int ref;
call_frame_t *frame;
xlator_t *this;
+ struct __pl_inode *pinode;
const char *volume;
@@ -113,9 +119,13 @@ struct __entry_lock {
struct timeval blkd_time; /*time at which lock was queued into blkd list*/
struct timeval granted_time; /*time at which lock was queued into active list*/
- void *trans;
+ void *client;
gf_lkowner_t owner;
- pid_t client_pid; /* pid of client process */
+ pid_t client_pid; /* pid of client process */
+
+ char *connection_id; /* stores the client connection id */
+
+ struct list_head client_list; /* list of all locks from a client */
};
typedef struct __entry_lock pl_entry_lock_t;
@@ -140,21 +150,17 @@ struct __pl_inode {
typedef struct __pl_inode pl_inode_t;
-struct __pl_fd {
- gf_boolean_t nonblocking; /* whether O_NONBLOCK has been set */
-};
-typedef struct __pl_fd pl_fd_t;
-
-
typedef struct {
gf_boolean_t mandatory; /* if mandatory locking is enabled */
gf_boolean_t trace; /* trace lock requests in and out */
char *brickname;
} posix_locks_private_t;
+
typedef struct {
gf_boolean_t entrylk_count_req;
gf_boolean_t inodelk_count_req;
+ gf_boolean_t inodelk_dom_count_req;
gf_boolean_t posixlk_count_req;
gf_boolean_t parent_entrylk_req;
@@ -166,8 +172,33 @@ typedef struct {
enum {TRUNCATE, FTRUNCATE} op;
} pl_local_t;
+
typedef struct {
struct list_head locks_list;
} pl_fdctx_t;
+
+/* List node pairing a domain name and an inode with a lock owner.
+ * NOTE(review): nothing in the visible part of this patch references this
+ * struct - confirm whether it is still needed. */
+struct _locker {
+ struct list_head lockers;
+ char *volume;
+ inode_t *inode;
+ gf_lkowner_t owner;
+};
+
+/* Per-client context of the locks translator, created on demand by
+ * pl_ctx_get() and stored on the client_t. */
+typedef struct _locks_ctx {
+ pthread_mutex_t lock; /* taken by pl_inode_setlk around updates of inodelk_lockers */
+ struct list_head inodelk_lockers; /* pl_inode_lock_t entries, linked via client_list */
+ struct list_head entrylk_lockers; /* presumably pl_entry_lock_t entries via client_list - TODO confirm */
+} pl_ctx_t;
+
+
+pl_ctx_t *
+pl_ctx_get (client_t *client, xlator_t *xlator);
+
+int
+pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx);
+
+int
+pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx);
+
#endif /* __POSIX_LOCKS_H__ */
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index 2bc5f8581..337623d65 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -51,7 +51,7 @@ pl_new_fdctx ()
fdctx = GF_CALLOC (1, sizeof (*fdctx),
gf_locks_mt_pl_fdctx_t);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fdctx, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", fdctx, out);
INIT_LIST_HEAD (&fdctx->locks_list);
@@ -66,7 +66,7 @@ pl_check_n_create_fdctx (xlator_t *this, fd_t *fd)
uint64_t tmp = 0;
pl_fdctx_t *fdctx = NULL;
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, this, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", this, out);
GF_VALIDATE_OR_GOTO (this->name, fd, out);
LOCK (&fd->lock);
@@ -119,7 +119,7 @@ pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
static int
truncate_allowed (pl_inode_t *pl_inode,
- void *transport, pid_t client_pid,
+ client_t *client, pid_t client_pid,
gf_lkowner_t *owner, off_t offset)
{
posix_lock_t *l = NULL;
@@ -128,7 +128,7 @@ truncate_allowed (pl_inode_t *pl_inode,
region.fl_start = offset;
region.fl_end = LLONG_MAX;
- region.transport = transport;
+ region.client = client;
region.client_pid = client_pid;
region.owner = *owner;
@@ -139,7 +139,7 @@ truncate_allowed (pl_inode_t *pl_inode,
&& locks_overlap (&region, l)
&& !same_owner (&region, l)) {
ret = 0;
- gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Truncate "
+ gf_log ("posix-locks", GF_LOG_TRACE, "Truncate "
"allowed");
break;
}
@@ -186,7 +186,7 @@ truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (priv->mandatory
&& pl_inode->mandatory
- && !truncate_allowed (pl_inode, frame->root->trans,
+ && !truncate_allowed (pl_inode, frame->root->client,
frame->root->pid, &frame->root->lk_owner,
local->offset)) {
op_ret = -1;
@@ -294,7 +294,7 @@ pl_locks_by_fd (pl_inode_t *pl_inode, fd_t *fd)
{
list_for_each_entry (l, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
found = 1;
break;
}
@@ -319,7 +319,7 @@ delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd)
{
list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
if (l->blocked) {
list_move_tail (&l->list, &blocked_list);
continue;
@@ -347,7 +347,7 @@ delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd)
static void
__delete_locks_of_owner (pl_inode_t *pl_inode,
- void *transport, gf_lkowner_t *owner)
+ client_t *client, gf_lkowner_t *owner)
{
posix_lock_t *tmp = NULL;
posix_lock_t *l = NULL;
@@ -357,7 +357,7 @@ __delete_locks_of_owner (pl_inode_t *pl_inode,
list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) {
if (l->blocked)
continue;
- if ((l->transport == transport) &&
+ if ((l->client == client) &&
is_same_lkowner (&l->owner, owner)) {
gf_log ("posix-locks", GF_LOG_TRACE,
" Flushing lock"
@@ -552,7 +552,8 @@ fetch_pathinfo (xlator_t *this, inode_t *inode, int32_t *op_errno,
ret = syncop_getxattr (FIRST_CHILD(this), &loc, &dict,
GF_XATTR_PATHINFO_KEY);
if (ret < 0) {
- *op_errno = errno;
+ *op_errno = -ret;
+ ret = -1;
goto out;
}
@@ -643,7 +644,8 @@ pl_fgetxattr_handle_lockinfo (xlator_t *this, fd_t *fd,
pl_inode_t *pl_inode = NULL;
char *key = NULL, *buf = NULL;
int32_t op_ret = 0;
- unsigned long fdnum = 0, len = 0;
+ unsigned long fdnum = 0;
+ int32_t len = 0;
dict_t *tmp = NULL;
pl_inode = pl_inode_get (this, fd->inode);
@@ -810,7 +812,7 @@ pl_migrate_locks (call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num,
list_for_each_entry (l, &pl_inode->ext_list, list) {
if (l->fd_num == oldfd_num) {
l->fd_num = newfd_num;
- l->transport = frame->root->trans;
+ l->client = frame->root->client;
}
}
}
@@ -983,7 +985,7 @@ pl_flush (call_frame_t *frame, xlator_t *this,
}
pthread_mutex_lock (&pl_inode->mutex);
{
- __delete_locks_of_owner (pl_inode, frame->root->trans,
+ __delete_locks_of_owner (pl_inode, frame->root->client,
&frame->root->lk_owner);
}
pthread_mutex_unlock (&pl_inode->mutex);
@@ -1178,7 +1180,7 @@ pl_readv (call_frame_t *frame, xlator_t *this,
if (priv->mandatory && pl_inode->mandatory) {
region.fl_start = offset;
region.fl_end = offset + size - 1;
- region.transport = frame->root->trans;
+ region.client = frame->root->client;
region.fd_num = fd_to_fdnum(fd);
region.client_pid = frame->root->pid;
region.owner = frame->root->lk_owner;
@@ -1272,7 +1274,7 @@ pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (priv->mandatory && pl_inode->mandatory) {
region.fl_start = offset;
region.fl_end = offset + iov_length (vector, count) - 1;
- region.transport = frame->root->trans;
+ region.client = frame->root->client;
region.fd_num = fd_to_fdnum(fd);
region.client_pid = frame->root->pid;
region.owner = frame->root->lk_owner;
@@ -1339,7 +1341,7 @@ __fd_has_locks (pl_inode_t *pl_inode, fd_t *fd)
posix_lock_t *l = NULL;
list_for_each_entry (l, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
found = 1;
break;
}
@@ -1353,7 +1355,7 @@ lock_dup (posix_lock_t *lock)
{
posix_lock_t *new_lock = NULL;
- new_lock = new_posix_lock (&lock->user_flock, lock->transport,
+ new_lock = new_posix_lock (&lock->user_flock, lock->client,
lock->client_pid, &lock->owner,
(fd_t *)lock->fd_num);
return new_lock;
@@ -1368,7 +1370,7 @@ __dup_locks_to_fdctx (pl_inode_t *pl_inode, fd_t *fd,
int ret = 0;
list_for_each_entry (l, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
duplock = lock_dup (l);
if (!duplock) {
ret = -1;
@@ -1513,8 +1515,6 @@ int
pl_lk (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- void *transport = NULL;
- pid_t client_pid = 0;
pl_inode_t *pl_inode = NULL;
int op_ret = 0;
int op_errno = 0;
@@ -1523,9 +1523,6 @@ pl_lk (call_frame_t *frame, xlator_t *this,
posix_lock_t *conf = NULL;
int ret = 0;
- transport = frame->root->trans;
- client_pid = frame->root->pid;
-
if ((flock->l_start < 0) || (flock->l_len < 0)) {
op_ret = -1;
op_errno = EINVAL;
@@ -1539,7 +1536,7 @@ pl_lk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
- reqlock = new_posix_lock (flock, transport, client_pid,
+ reqlock = new_posix_lock (flock, frame->root->client, frame->root->pid,
&frame->root->lk_owner, fd);
if (!reqlock) {
@@ -1764,6 +1761,7 @@ pl_forget (xlator_t *this,
list_del_init (&entry_l->domain_list);
GF_FREE ((char *)entry_l->basename);
+ GF_FREE (entry_l->connection_id);
GF_FREE (entry_l);
}
@@ -1797,6 +1795,7 @@ pl_forget (xlator_t *this,
STACK_UNWIND_STRICT (entrylk, entry_l->frame, -1, 0, NULL);
GF_FREE ((char *)entry_l->basename);
+ GF_FREE (entry_l->connection_id);
GF_FREE (entry_l);
}
@@ -1946,19 +1945,34 @@ pl_entrylk_xattr_fill (xlator_t *this, inode_t *inode,
}
+/* Store the inodelk count of @inode in @dict under GLUSTERFS_INODELK_COUNT.
+ * With @per_dom set, the domain name is first read from @dict under
+ * GLUSTERFS_INODELK_DOM_COUNT and only that domain is counted; if the name
+ * cannot be read, an error is logged and @dict is left unchanged.
+ * NOTE(review): the per-domain result is written under the same
+ * GLUSTERFS_INODELK_COUNT key as the all-domain result, so when a caller
+ * issues both requests the later call replaces the earlier value - confirm
+ * this is intended. */
void
-pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode,
- dict_t *dict)
+pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, dict_t *dict,
+ gf_boolean_t per_dom)
{
int32_t count = 0;
int ret = -1;
+ char *domname = NULL;
+
+
+ if (per_dom){
+ ret = dict_get_str (dict, GLUSTERFS_INODELK_DOM_COUNT,
+ &domname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "value for key %s",GLUSTERFS_INODELK_DOM_COUNT);
+ goto out;
+ }
+ }
+
+ /* domname stays NULL unless per_dom was requested: count all domains */
+ count = get_inodelk_count (this, inode, domname);
- count = get_inodelk_count (this, inode);
ret = dict_set_int32 (dict, GLUSTERFS_INODELK_COUNT, count);
if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- " dict_set failed on key %s", GLUSTERFS_INODELK_COUNT);
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to set count for "
+ "key %s", GLUSTERFS_INODELK_COUNT);
}
+out:
+ return;
}
void
@@ -2003,7 +2017,9 @@ pl_lookup_cbk (call_frame_t *frame,
if (local->entrylk_count_req)
pl_entrylk_xattr_fill (this, inode, xdata);
if (local->inodelk_count_req)
- pl_inodelk_xattr_fill (this, inode, xdata);
+ pl_inodelk_xattr_fill (this, inode, xdata, _gf_false);
+ if (local->inodelk_dom_count_req)
+ pl_inodelk_xattr_fill (this, inode, xdata, _gf_true);
if (local->posixlk_count_req)
pl_posixlk_xattr_fill (this, inode, xdata);
@@ -2050,6 +2066,8 @@ pl_lookup (call_frame_t *frame,
local->entrylk_count_req = 1;
if (dict_get (xdata, GLUSTERFS_INODELK_COUNT))
local->inodelk_count_req = 1;
+ if (dict_get (xdata, GLUSTERFS_INODELK_DOM_COUNT))
+ local->inodelk_dom_count_req = 1;
if (dict_get (xdata, GLUSTERFS_POSIXLK_COUNT))
local->posixlk_count_req = 1;
if (dict_get (xdata, GLUSTERFS_PARENT_ENTRYLK))
@@ -2088,7 +2106,11 @@ pl_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->entrylk_count_req)
pl_entrylk_xattr_fill (this, entry->inode, entry->dict);
if (local->inodelk_count_req)
- pl_inodelk_xattr_fill (this, entry->inode, entry->dict);
+ pl_inodelk_xattr_fill (this, entry->inode, entry->dict,
+ _gf_false);
+ if (local->inodelk_dom_count_req)
+ pl_inodelk_xattr_fill (this, entry->inode, entry->dict,
+ _gf_true);
if (local->posixlk_count_req)
pl_posixlk_xattr_fill (this, entry->inode, entry->dict);
}
@@ -2117,6 +2139,8 @@ pl_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->entrylk_count_req = 1;
if (dict_get (dict, GLUSTERFS_INODELK_COUNT))
local->inodelk_count_req = 1;
+ if (dict_get (dict, GLUSTERFS_INODELK_DOM_COUNT))
+ local->inodelk_dom_count_req = 1;
if (dict_get (dict, GLUSTERFS_POSIXLK_COUNT))
local->posixlk_count_req = 1;
}
@@ -2136,8 +2160,8 @@ out:
void
pl_dump_lock (char *str, int size, struct gf_flock *flock,
- gf_lkowner_t *owner, void *trans, time_t *granted_time,
- time_t *blkd_time, gf_boolean_t active)
+ gf_lkowner_t *owner, void *trans, char *conn_id,
+ time_t *granted_time, time_t *blkd_time, gf_boolean_t active)
{
char *type_str = NULL;
char granted[32] = {0,};
@@ -2165,7 +2189,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock,
(unsigned long long) flock->l_start,
(unsigned long long) flock->l_len,
(unsigned long long) flock->l_pid,
- lkowner_utoa (owner), trans,
+ lkowner_utoa (owner), trans, conn_id,
ctime_r (granted_time, granted));
} else {
snprintf (str, size, RANGE_BLKD_GRNTD_FMT,
@@ -2173,7 +2197,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock,
(unsigned long long) flock->l_start,
(unsigned long long) flock->l_len,
(unsigned long long) flock->l_pid,
- lkowner_utoa (owner), trans,
+ lkowner_utoa (owner), trans, conn_id,
ctime_r (blkd_time, blocked),
ctime_r (granted_time, granted));
}
@@ -2184,7 +2208,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock,
(unsigned long long) flock->l_start,
(unsigned long long) flock->l_len,
(unsigned long long) flock->l_pid,
- lkowner_utoa (owner), trans,
+ lkowner_utoa (owner), trans, conn_id,
ctime_r (blkd_time, blocked));
}
@@ -2221,14 +2245,16 @@ __dump_entrylks (pl_inode_t *pl_inode)
lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" :
"ENTRYLK_WRLCK", lock->basename,
(unsigned long long) lock->client_pid,
- lkowner_utoa (&lock->owner), lock->trans,
+ lkowner_utoa (&lock->owner), lock->client,
+ lock->connection_id,
ctime_r (&lock->granted_time.tv_sec, granted));
} else {
snprintf (tmp, 256, ENTRY_BLKD_GRNTD_FMT,
lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" :
"ENTRYLK_WRLCK", lock->basename,
(unsigned long long) lock->client_pid,
- lkowner_utoa (&lock->owner), lock->trans,
+ lkowner_utoa (&lock->owner), lock->client,
+ lock->connection_id,
ctime_r (&lock->blkd_time.tv_sec, blocked),
ctime_r (&lock->granted_time.tv_sec, granted));
}
@@ -2247,7 +2273,8 @@ __dump_entrylks (pl_inode_t *pl_inode)
lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" :
"ENTRYLK_WRLCK", lock->basename,
(unsigned long long) lock->client_pid,
- lkowner_utoa (&lock->owner), lock->trans,
+ lkowner_utoa (&lock->owner), lock->client,
+ lock->connection_id,
ctime_r (&lock->blkd_time.tv_sec, blocked));
gf_proc_dump_write(key, tmp);
@@ -2298,7 +2325,7 @@ __dump_inodelks (pl_inode_t *pl_inode)
SET_FLOCK_PID (&lock->user_flock, lock);
pl_dump_lock (tmp, 256, &lock->user_flock,
&lock->owner,
- lock->transport,
+ lock->client, lock->connection_id,
&lock->granted_time.tv_sec,
&lock->blkd_time.tv_sec,
_gf_true);
@@ -2315,7 +2342,7 @@ __dump_inodelks (pl_inode_t *pl_inode)
SET_FLOCK_PID (&lock->user_flock, lock);
pl_dump_lock (tmp, 256, &lock->user_flock,
&lock->owner,
- lock->transport,
+ lock->client, lock->connection_id,
0, &lock->blkd_time.tv_sec,
_gf_false);
gf_proc_dump_write(key, tmp);
@@ -2356,7 +2383,7 @@ __dump_posixlks (pl_inode_t *pl_inode)
count,
lock->blocked ? "BLOCKED" : "ACTIVE");
pl_dump_lock (tmp, 256, &lock->user_flock,
- &lock->owner, lock->transport,
+ &lock->owner, lock->client, NULL,
&lock->granted_time.tv_sec, &lock->blkd_time.tv_sec,
(lock->blocked)? _gf_false: _gf_true);
gf_proc_dump_write(key, tmp);
@@ -2433,7 +2460,7 @@ unlock:
__dump_entrylks (pl_inode);
}
- count = __get_inodelk_count (this, pl_inode);
+ count = __get_inodelk_count (this, pl_inode, NULL);
if (count) {
gf_proc_dump_write("inodelk-count", "%d", count);
__dump_inodelks (pl_inode);
@@ -2480,6 +2507,79 @@ mem_acct_init (xlator_t *this)
return ret;
}
+
+/*
+ * Return the locks context attached to @client for @xlator, creating and
+ * registering an empty one when none is stored yet.
+ *
+ * Returns NULL when the allocation fails or when client_ctx_set() rejects
+ * the new context (which is then destroyed again).
+ */
+pl_ctx_t*
+pl_ctx_get (client_t *client, xlator_t *xlator)
+{
+        void     *stored = NULL;
+        pl_ctx_t *fresh  = NULL;
+
+        client_ctx_get (client, xlator, &stored);
+        if (stored != NULL)
+                return stored;
+
+        fresh = GF_CALLOC (1, sizeof (pl_ctx_t), gf_locks_mt_posix_lock_t);
+        if (fresh == NULL)
+                return NULL;
+
+        pthread_mutex_init (&fresh->lock, NULL);
+        INIT_LIST_HEAD (&fresh->inodelk_lockers);
+        INIT_LIST_HEAD (&fresh->entrylk_lockers);
+
+        if (client_ctx_set (client, xlator, fresh) == 0)
+                return fresh;
+
+        /* registration failed: undo the initialisation done above */
+        pthread_mutex_destroy (&fresh->lock);
+        GF_FREE (fresh);
+        return NULL;
+}
+
+
+/*
+ * client_disconnect callback (also invoked from pl_client_destroy_cbk):
+ * runs the inodelk and entrylk cleanup for the locks context of @client.
+ *
+ * Always returns 0.
+ */
+static int
+pl_client_disconnect_cbk (xlator_t *this, client_t *client)
+{
+        pl_ctx_t *pl_ctx = pl_ctx_get (client, this);
+
+        /*
+         * pl_ctx_get() returns NULL when it can neither find nor create a
+         * context.  The cleanup helpers expect a valid context, and without
+         * one there is nothing registered to release, so skip them.
+         */
+        if (pl_ctx == NULL)
+                return 0;
+
+        pl_inodelk_client_cleanup (this, pl_ctx);
+        pl_entrylk_client_cleanup (this, pl_ctx);
+
+        return 0;
+}
+
+
+/*
+ * client_destroy callback: runs the disconnect cleanup, then detaches the
+ * locks context from @client and frees it.  Both locker lists are asserted
+ * to be empty before the context is destroyed.  Always returns 0.
+ */
+static int
+pl_client_destroy_cbk (xlator_t *this, client_t *client)
+{
+        void     *detached = NULL;
+        pl_ctx_t *pl_ctx   = NULL;
+
+        pl_client_disconnect_cbk (this, client);
+
+        client_ctx_del (client, this, &detached);
+        pl_ctx = detached;
+
+        if (pl_ctx != NULL) {
+                GF_ASSERT (list_empty(&pl_ctx->inodelk_lockers));
+                GF_ASSERT (list_empty(&pl_ctx->entrylk_lockers));
+
+                pthread_mutex_destroy (&pl_ctx->lock);
+                GF_FREE (pl_ctx);
+        }
+
+        return 0;
+}
+
+
int
init (xlator_t *this)
{
@@ -2508,7 +2608,7 @@ init (xlator_t *this)
gf_log (this->name, GF_LOG_CRITICAL,
"'locks' translator is not loaded over a storage "
"translator");
- goto out;;
+ goto out;
}
priv = GF_CALLOC (1, sizeof (*priv),
@@ -2610,9 +2710,11 @@ struct xlator_dumpops dumpops = {
};
struct xlator_cbks cbks = {
- .forget = pl_forget,
- .release = pl_release,
- .releasedir = pl_releasedir,
+ .forget = pl_forget,
+ .release = pl_release,
+ .releasedir = pl_releasedir,
+ .client_destroy = pl_client_destroy_cbk,
+ .client_disconnect = pl_client_disconnect_cbk,
};
diff --git a/xlators/features/mac-compat/src/Makefile.am b/xlators/features/mac-compat/src/Makefile.am
index f8567edce..42ed350e9 100644
--- a/xlators/features/mac-compat/src/Makefile.am
+++ b/xlators/features/mac-compat/src/Makefile.am
@@ -6,9 +6,10 @@ mac_compat_la_LDFLAGS = -module -avoid-version
mac_compat_la_SOURCES = mac-compat.c
mac_compat_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = mac-compat.h
+
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
-CLEANFILES =
-
+CLEANFILES =
diff --git a/xlators/features/mac-compat/src/mac-compat.c b/xlators/features/mac-compat/src/mac-compat.c
index 7cb550ad5..0eaf563e8 100644
--- a/xlators/features/mac-compat/src/mac-compat.c
+++ b/xlators/features/mac-compat/src/mac-compat.c
@@ -15,35 +15,28 @@
#include "xlator.h"
#include "defaults.h"
#include "compat-errno.h"
+#include "syscall.h"
+#include "mem-pool.h"
+#include "mac-compat.h"
-
-enum apple_xattr {
- GF_FINDER_INFO_XATTR,
- GF_RESOURCE_FORK_XATTR,
- GF_XATTR_ALL,
- GF_XATTR_NONE
-};
-
-static char *apple_xattr_name[] = {
- [GF_FINDER_INFO_XATTR] = "com.apple.FinderInfo",
- [GF_RESOURCE_FORK_XATTR] = "com.apple.ResourceFork"
-};
-
-static const char *apple_xattr_value[] = {
- [GF_FINDER_INFO_XATTR] =
- /* 1 2 3 4 5 6 7 8 */
- "\0\0\0\0\0\0\0\0"
- "\0\0\0\0\0\0\0\0"
- "\0\0\0\0\0\0\0\0"
- "\0\0\0\0\0\0\0\0",
- [GF_RESOURCE_FORK_XATTR] = ""
-};
-
-static int32_t apple_xattr_len[] = {
- [GF_FINDER_INFO_XATTR] = 32,
- [GF_RESOURCE_FORK_XATTR] = 1
-};
-
+/*
+ * dict_foreach() callback: for a key that starts with "user.", store the
+ * same value again under the key without that prefix and delete the
+ * prefixed entry.  Other keys are left alone.  @data is the xlator, used
+ * only for logging.  Always returns 0.
+ *
+ * NOTE(review): this adds to and deletes from the dict that is being
+ * iterated - confirm dict_foreach() tolerates that.
+ */
+static int
+dict_key_remove_namespace(dict_t *dict, char *key, data_t *value, void *data)
+{
+        xlator_t *this = data;
+
+        if (strncmp(key, "user.", 5) != 0)
+                return 0;
+
+        dict_set (dict, key + 5, value);
+        gf_log (this->name, GF_LOG_DEBUG,
+                "remove_namespace_dict: %s -> %s ", key, key + 5);
+        /* deleted last: the log call above still reads key */
+        dict_del (dict, key);
+        return 0;
+}
int32_t
maccomp_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -53,54 +46,91 @@ maccomp_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
intptr_t ax = (intptr_t)this->private;
int i = 0;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr_cbk: dict %p private: %p xdata %p ", dict,
+ this->private, xdata);
+
+ if (dict) {
+ dict_foreach(dict, dict_key_remove_namespace, this);
+ }
+ else {
+ // TODO: we expect dict to exist here, don't know why this
+ // this is needed
+ dict = dict_new();
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr_cbk: dict %p ax: %ld op_ret %d op_err %d ", dict, ax,
+ op_ret, op_errno);
if ((ax == GF_XATTR_ALL && op_ret >= 0) || ax != GF_XATTR_NONE) {
op_ret = op_errno = 0;
-
for (i = 0; i < GF_XATTR_ALL; i++) {
if (dict_get (dict, apple_xattr_name[i]))
continue;
-
+ /* set dummy data */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr_cbk: setting dummy data %p, %s", dict,
+ apple_xattr_name[i]);
if (dict_set (dict, apple_xattr_name[i],
bin_to_data ((void *)apple_xattr_value[i],
apple_xattr_len[i])) == -1) {
op_ret = -1;
- op_errno = ENOMEM;
+ op_errno = ENOATTR;
break;
}
}
}
-
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
-
return 0;
}
-int32_t
-maccomp_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
+/*
+ * dict_foreach() callback: copies @value into the dict found in
+ * this->private (where @obj is the xlator) under @key prefixed with
+ * XATTR_USER_PREFIX.  Always returns 0.
+ *
+ * If gf_add_prefix() leaves newkey NULL the entry is skipped with a warning
+ * rather than passing a NULL key on to dict_set().
+ */
+static
+int prepend_xattr_user_namespace(dict_t *dict, char *key, data_t *value, void *obj)
{
- intptr_t ax = GF_XATTR_NONE;
- int i = 0;
+ xlator_t *this = (xlator_t *) obj;
+ dict_t *newdict = (dict_t *) this->private;
+ char *newkey = NULL;
+
+ gf_add_prefix(XATTR_USER_PREFIX, key, &newkey);
+ if (!newkey) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not build prefixed name for key %s, skipping", key);
+ return 0;
+ }
+
+ dict_set(newdict, newkey, value);
+ GF_FREE(newkey);
+ return 0;
+}
+/*
+ * Classify an xattr name and build its "user."-prefixed form.
+ *
+ * Returns GF_XATTR_ALL when @name is NULL (and leaves *newkey untouched),
+ * the index into apple_xattr_name[] when @name matches one of its entries,
+ * and GF_XATTR_NONE for any other name.  For every non-NULL @name,
+ * gf_add_prefix() is asked to store the prefixed name in *newkey; its
+ * return value is not checked, and callers release *newkey with GF_FREE().
+ */
+intptr_t
+check_name(const char *name, char **newkey)
+{
+ intptr_t ax = GF_XATTR_NONE;
if (name) {
+ int i = 0;
for (i = 0; i < GF_XATTR_ALL; i++) {
if (strcmp (apple_xattr_name[i], name) == 0) {
ax = i;
-
break;
}
}
+ gf_add_prefix("user.", name, newkey);
} else
ax = GF_XATTR_ALL;
+ return ax;
+}
- this->private = (void *)ax;
+/*
+ * getxattr fop: records the classification of @name in this->private, winds
+ * the request to the first child with the "user."-prefixed name built by
+ * check_name(), then releases that name.  Always returns 0.
+ */
+int32_t
+maccomp_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ char *newkey = NULL;
+ this->private = (void *) check_name(name, &newkey);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr: name %s private: %p xdata %p ", name,
+ this->private, xdata);
STACK_WIND (frame, maccomp_getxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->getxattr,
- loc, name, xdata);
+ loc, newkey, xdata);
+ /* newkey was allocated by check_name(); free it after the wind, as
+ * maccomp_fgetxattr and maccomp_fremovexattr do, so it does not leak */
+ GF_FREE(newkey);
return 0;
}
@@ -109,30 +139,17 @@ int32_t
maccomp_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
- intptr_t ax = GF_XATTR_NONE;
- int i = 0;
-
- if (name) {
- for (i = 0; i < GF_XATTR_ALL; i++) {
- if (strcmp (apple_xattr_name[i], name) == 0) {
- ax = i;
-
- break;
- }
- }
- } else
- ax = GF_XATTR_ALL;
-
- this->private = (void *)ax;
+ char *newkey = NULL;
+ this->private = (void *) check_name(name, &newkey);
STACK_WIND (frame, maccomp_getxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fgetxattr,
- fd, name, xdata);
+ fd, newkey, xdata);
+ GF_FREE(newkey);
return 0;
}
-
int32_t
maccomp_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -141,12 +158,56 @@ maccomp_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1 && ax != GF_XATTR_NONE)
op_ret = op_errno = 0;
-
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setxattr_cbk op_ret %d op_errno %d private: %p xdata %p ",
+ op_ret, op_errno, this->private, xdata);
STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+int32_t
+maccomp_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *iatt1,
+ struct iatt *iattr2, dict_t *xdata)
+{
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setattr_cbk op_ret %d op_errno %d private: %p xdata %p ",
+ op_ret, op_errno, this->private, xdata);
+ STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno,
+ iatt1, iattr2, xdata);
return 0;
}
+int map_flags(int flags)
+{
+ /* Translate the host's XATTR_* setxattr flags into GF_XATTR_* flags.
+ DARWIN has different defines on XATTR_ flags and there does not
+ seem to be a POSIX standard for them.
+ The GF_XATTR_CREATE, GF_XATTR_REPLACE and XATTR_REPLACE bits are
+ cleared first; every other bit is passed through unchanged. The
+ CREATE/REPLACE bits are then re-added as GF_XATTR_* values when
+ the corresponding host XATTR_* bit is set in flags.
+ NOFOLLOW is always true on Linux and Darwin
+ NOTE(review): XATTR_CREATE is tested below but is not part of the
+ cleared mask -- confirm this is intended.
+ */
+ int linux_flags = flags & ~(GF_XATTR_CREATE | GF_XATTR_REPLACE | XATTR_REPLACE);
+ if (XATTR_CREATE & flags)
+ linux_flags |= GF_XATTR_CREATE;
+ if (XATTR_REPLACE & flags)
+ linux_flags |= GF_XATTR_REPLACE;
+ return linux_flags;
+}
+
+int32_t
+maccomp_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ char *newkey = NULL;
+
+ this->private = (void *) check_name(name, &newkey);
+
+ STACK_WIND (frame, default_fremovexattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr,
+ fd, newkey, xdata);
+ GF_FREE(newkey);
+ return 0;
+}
int32_t
maccomp_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
@@ -162,16 +223,56 @@ maccomp_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
break;
}
}
+ dict_t *newdict = dict_new();
+ this->private = (void *) newdict;
+ dict_foreach(dict, prepend_xattr_user_namespace, this);
this->private = (void *)ax;
-
+ int linux_flags = map_flags(flags);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setxattr flags: %d -> %d dict %p private: %p xdata %p ",
+ flags, linux_flags, dict, this->private, xdata);
STACK_WIND (frame, maccomp_setxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->setxattr,
- loc, dict, flags, xdata);
+ loc, newdict, linux_flags, xdata);
+ dict_unref(newdict);
return 0;
}
+int32_t
+maccomp_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *iattr,
+ int32_t flags, dict_t *xdata)
+{
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setattr iattr %p private: %p xdata %p ",
+ iattr, this->private, xdata);
+ STACK_WIND (frame, maccomp_setattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr,
+ loc, iattr, flags, xdata);
+ return 0;
+}
+
+int32_t
+maccomp_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ char *newkey = NULL;
+ this->private = (void *) check_name(name, &newkey);
+
+ STACK_WIND (frame, default_removexattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ loc, newkey, xdata);
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "removeattr name %p private: %p xdata %p ",
+ name, this->private, xdata);
+ GF_FREE(newkey);
+ return 0;
+
+}
int32_t
maccomp_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
@@ -188,12 +289,20 @@ maccomp_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
}
}
- this->private = (void *)ax;
+ dict_t *newdict = dict_new();
+ this->private = (void *) newdict;
+ dict_foreach(dict, prepend_xattr_user_namespace, this);
+ this->private = (void *)ax;
+ int linux_flags = map_flags(flags);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "fsetxattr flags: %d -> %d dict %p private: %p xdata %p ",
+ flags, linux_flags, dict, this->private, xdata);
STACK_WIND (frame, maccomp_setxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsetxattr,
- fd, dict, flags, xdata);
+ fd, newdict, linux_flags, xdata);
+ dict_unref(newdict);
return 0;
}
@@ -224,10 +333,13 @@ fini (xlator_t *this)
struct xlator_fops fops = {
- .getxattr = maccomp_getxattr,
- .fgetxattr = maccomp_fgetxattr,
- .setxattr = maccomp_setxattr,
- .fsetxattr = maccomp_fsetxattr,
+ .getxattr = maccomp_getxattr,
+ .fgetxattr = maccomp_fgetxattr,
+ .setxattr = maccomp_setxattr,
+ .setattr = maccomp_setattr,
+ .fsetxattr = maccomp_fsetxattr,
+ .removexattr = maccomp_removexattr,
+ .fremovexattr = maccomp_fremovexattr,
};
struct xlator_cbks cbks;
diff --git a/xlators/features/mac-compat/src/mac-compat.h b/xlators/features/mac-compat/src/mac-compat.h
new file mode 100644
index 000000000..b033ca0e4
--- /dev/null
+++ b/xlators/features/mac-compat/src/mac-compat.h
@@ -0,0 +1,41 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __MAC_COMPAT_H__
+#define __MAC_COMPAT_H__
+
+enum apple_xattr {
+ GF_FINDER_INFO_XATTR, /* table index of "com.apple.FinderInfo" */
+ GF_RESOURCE_FORK_XATTR, /* table index of "com.apple.ResourceFork" */
+ GF_XATTR_ALL, /* one past the last table index; not a table entry itself */
+ GF_XATTR_NONE /* not a table entry */
+};
+
+static char *apple_xattr_name[] = { /* xattr names, indexed by enum apple_xattr */
+ [GF_FINDER_INFO_XATTR] = "com.apple.FinderInfo",
+ [GF_RESOURCE_FORK_XATTR] = "com.apple.ResourceFork"
+};
+
+static const char *apple_xattr_value[] = { /* per-xattr byte strings, same indexing as apple_xattr_name */
+ [GF_FINDER_INFO_XATTR] =
+ /* 1 2 3 4 5 6 7 8 */
+ "\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0",
+ [GF_RESOURCE_FORK_XATTR] = ""
+};
+
+static int32_t apple_xattr_len[] = { /* presumably the byte length of each apple_xattr_value entry -- confirm against users */
+ [GF_FINDER_INFO_XATTR] = 32, /* equals the 4 x 8 "\0" bytes spelled out above */
+ [GF_RESOURCE_FORK_XATTR] = 1 /* NOTE(review): looks like it counts the terminating NUL of "" -- confirm */
+};
+
+#endif /* __MAC_COMPAT_H__ */
diff --git a/xlators/features/marker/Makefile.am b/xlators/features/marker/Makefile.am
index a6ba2de16..a985f42a8 100644
--- a/xlators/features/marker/Makefile.am
+++ b/xlators/features/marker/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = src @SYNCDAEMON_SUBDIR@
+SUBDIRS = src
CLEANFILES =
diff --git a/xlators/features/marker/src/marker-quota-helper.c b/xlators/features/marker/src/marker-quota-helper.c
index af5fed132..ec0d83316 100644
--- a/xlators/features/marker/src/marker-quota-helper.c
+++ b/xlators/features/marker/src/marker-quota-helper.c
@@ -154,15 +154,17 @@ out:
inode_contribution_t *
-__mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc)
+__mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx,
+ loc_t *loc)
{
- int32_t ret = 0;
+ int32_t ret = 0;
inode_contribution_t *contribution = NULL;
if (!loc->parent) {
if (!uuid_is_null (loc->pargfid))
loc->parent = inode_find (loc->inode->table,
loc->pargfid);
+
if (!loc->parent)
loc->parent = inode_parent (loc->inode, loc->pargfid,
loc->name);
@@ -170,9 +172,10 @@ __mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, loc_t *l
goto out;
}
- list_for_each_entry (contribution, &ctx->contribution_head, contri_list) {
+ list_for_each_entry (contribution, &ctx->contribution_head,
+ contri_list) {
if (loc->parent &&
- uuid_compare (contribution->gfid, loc->parent->gfid) == 0) {
+ uuid_compare (contribution->gfid, loc->parent->gfid) == 0) {
goto out;
}
}
@@ -196,14 +199,16 @@ out:
inode_contribution_t *
-mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc)
+mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx,
+ loc_t *loc)
{
inode_contribution_t *contribution = NULL;
if ((ctx == NULL) || (loc == NULL))
return NULL;
- if (strcmp (loc->path, "/") == 0)
+ if (((loc->path) && (strcmp (loc->path, "/") == 0))
+ || (!loc->path && uuid_is_null (loc->pargfid)))
return NULL;
LOCK (&ctx->lock);
@@ -226,12 +231,16 @@ mq_dict_set_contribution (xlator_t *this, dict_t *dict,
GF_VALIDATE_OR_GOTO ("marker", this, out);
GF_VALIDATE_OR_GOTO ("marker", dict, out);
GF_VALIDATE_OR_GOTO ("marker", loc, out);
- GF_VALIDATE_OR_GOTO ("marker", loc->parent, out);
- GET_CONTRI_KEY (contri_key, loc->parent->gfid, ret);
- if (ret < 0) {
- ret = -1;
- goto out;
+ if (loc->parent) {
+ GET_CONTRI_KEY (contri_key, loc->parent->gfid, ret);
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ }
+ } else {
+ /* nameless lookup, fetch contributions to all parents */
+ GET_CONTRI_KEY (contri_key, NULL, ret);
}
ret = dict_set_int64 (dict, contri_key, 0);
diff --git a/xlators/features/marker/src/marker-quota-helper.h b/xlators/features/marker/src/marker-quota-helper.h
index 6cdd14881..b200413b0 100644
--- a/xlators/features/marker/src/marker-quota-helper.h
+++ b/xlators/features/marker/src/marker-quota-helper.h
@@ -9,7 +9,7 @@
*/
#ifndef _MARKER_QUOTA_HELPER_H
-#define _MARKER_QUOTA_HELPER
+#define _MARKER_QUOTA_HELPER_H
#ifndef _CONFIG_H
#define _CONFIG_H
diff --git a/xlators/features/marker/src/marker-quota.c b/xlators/features/marker/src/marker-quota.c
index 6f9af6e13..1903fdc40 100644
--- a/xlators/features/marker/src/marker-quota.c
+++ b/xlators/features/marker/src/marker-quota.c
@@ -30,7 +30,8 @@ mq_loc_copy (loc_t *dst, loc_t *src)
GF_VALIDATE_OR_GOTO ("marker", src, out);
if (src->inode == NULL ||
- src->path == NULL) {
+ ((src->parent == NULL) && (uuid_is_null (src->pargfid))
+ && !__is_root_gfid (src->inode->gfid))) {
gf_log ("marker", GF_LOG_WARNING,
"src loc is not valid");
goto out;
@@ -364,7 +365,10 @@ mq_update_size_xattr (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path, ntoh64 (*delta));
new_dict = dict_new ();
- if (!new_dict);
+ if (!new_dict) {
+ errno = ENOMEM;
+ goto err;
+ }
ret = dict_set_bin (new_dict, QUOTA_SIZE_KEY, delta, 8);
if (ret)
@@ -384,7 +388,6 @@ mq_update_size_xattr (call_frame_t *frame, void *cookie, xlator_t *this,
err:
if (op_ret == -1 || ret == -1) {
local->err = -1;
-
mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL);
}
@@ -1038,7 +1041,11 @@ mq_create_xattr (xlator_t *this, call_frame_t *frame)
goto free_size;
}
- if (strcmp (local->loc.path, "/") != 0) {
+ if ((local->loc.path && strcmp (local->loc.path, "/") != 0)
+ || (local->loc.inode && !uuid_is_null (local->loc.inode->gfid) &&
+ !__is_root_gfid (local->loc.inode->gfid))
+ || (!uuid_is_null (local->loc.gfid)
+ && !__is_root_gfid (local->loc.gfid))) {
contri = mq_add_new_contribution_node (this, ctx, &local->loc);
if (contri == NULL)
goto err;
@@ -1107,7 +1114,12 @@ mq_check_n_set_inode_xattr (call_frame_t *frame, void *cookie,
goto create_xattr;
//check contribution xattr if not root
- if (strcmp (local->loc.path, "/") != 0) {
+ if ((local->loc.path && strcmp (local->loc.path, "/") != 0)
+ || (!uuid_is_null (local->loc.gfid)
+ && !__is_root_gfid (local->loc.gfid))
+ || (local->loc.inode
+ && !uuid_is_null (local->loc.inode->gfid)
+ && !__is_root_gfid (local->loc.inode->gfid))) {
GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret);
if (ret < 0)
goto out;
@@ -1234,6 +1246,7 @@ mq_get_parent_inode_local (xlator_t *this, quota_local_t *local)
{
int32_t ret = -1;
quota_inode_ctx_t *ctx = NULL;
+ inode_contribution_t *contribution = NULL;
GF_VALIDATE_OR_GOTO ("marker", this, out);
GF_VALIDATE_OR_GOTO ("marker", local, out);
@@ -1263,7 +1276,7 @@ mq_get_parent_inode_local (xlator_t *this, quota_local_t *local)
ret = mq_inode_ctx_get (local->loc.inode, this, &ctx);
if (ret < 0) {
gf_log_callingfn (this->name, GF_LOG_WARNING,
- "inode ctx get failed");
+ "inode ctx get failed");
goto out;
}
@@ -1277,7 +1290,31 @@ mq_get_parent_inode_local (xlator_t *this, quota_local_t *local)
goto out;
}
- local->contri = (inode_contribution_t *) ctx->contribution_head.next;
+ /* Earlier we used to get the next entry in the list maintained
+ by the context. In a good situation it works. i.e the next
+ parent in the directory hierarchy for this path is obtained.
+
+ But consider the below situation:
+ mount-point: /mnt/point
+ quota enabled directories within mount point: /a, /b, /c
+
+ Now when some file (file1) in the directory /c is written some data,
+ then to update the directories, marker has to get the contribution
+ object for the parent inode, i.e /c.
+ Before, it was being done by
+ local->contri = (inode_contribution_t *) ctx->contribution_head.next;
+ It works in the normal situations. But suppose /c is moved to /b.
+ Now /b's contribution object is added to the end of the list of
+ parents that the file file1 within /b/c is maintaining. Now if
+ the file /b/c/file1 is copied to /b/c/new, to update the parent in
+ the order c, b and / we cannot go to the next element in the list,
+ as in this case the next contribution object would be / and /b's
+ contribution will be at the end of the list. So get the proper
+ parent's contribution, by searching the entire list.
+ */
+ contribution = mq_get_contribution_node (local->loc.parent, ctx);
+ GF_ASSERT (contribution != NULL);
+ local->contri = contribution;
ret = 0;
out:
@@ -1521,9 +1558,10 @@ mq_update_parent_size (call_frame_t *frame,
}
UNLOCK (&local->contri->lock);
- gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64 "%"PRId64,
- local->loc.path, local->ctx->size,
- local->contri->contribution);
+ gf_log_callingfn (this->name, GF_LOG_DEBUG, "path: %s size: %"PRId64
+ " contribution:%"PRId64,
+ local->loc.path, local->ctx->size,
+ local->contri->contribution);
if (dict == NULL) {
op_errno = EINVAL;
@@ -1651,7 +1689,7 @@ unlock:
}
UNLOCK (&contribution->lock);
- gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64 "%"PRId64,
+ gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64 " %"PRId64,
local->loc.path, size_int, contri_int);
local->delta = size_int - contri_int;
@@ -1730,7 +1768,8 @@ mq_fetch_child_size_and_contri (call_frame_t *frame, void *cookie,
VALIDATE_OR_GOTO (local->ctx, err);
VALIDATE_OR_GOTO (local->contri, err);
- gf_log (this->name, GF_LOG_DEBUG, "%s marked dirty", local->parent_loc.path);
+ gf_log (this->name, GF_LOG_DEBUG, "%s marked dirty",
+ local->parent_loc.path);
//update parent ctx
ret = mq_inode_ctx_get (local->parent_loc.inode, this, &ctx);
@@ -1901,15 +1940,18 @@ fr_destroy:
return -1;
}
-
int
-mq_start_quota_txn (xlator_t *this, loc_t *loc,
- quota_inode_ctx_t *ctx,
- inode_contribution_t *contri)
+mq_prepare_txn_frame (xlator_t *this, loc_t *loc,
+ quota_inode_ctx_t *ctx,
+ inode_contribution_t *contri,
+ call_frame_t **new_frame)
{
- int32_t ret = -1;
- call_frame_t *frame = NULL;
- quota_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ quota_local_t *local = NULL;
+
+ if (!this || !loc || !new_frame)
+ goto err;
frame = create_frame (this, this->ctx->pool);
if (frame == NULL)
@@ -1935,14 +1977,36 @@ mq_start_quota_txn (xlator_t *this, loc_t *loc,
local->ctx = ctx;
local->contri = contri;
+ ret = 0;
+ *new_frame = frame;
+
+ return ret;
+
+fr_destroy:
+ QUOTA_STACK_DESTROY (frame, this);
+err:
+ return ret;
+}
+
+int
+mq_start_quota_txn (xlator_t *this, loc_t *loc,
+ quota_inode_ctx_t *ctx,
+ inode_contribution_t *contri)
+{
+ int32_t ret = -1;
+ call_frame_t *frame = NULL;
+
+ ret = mq_prepare_txn_frame (this, loc, ctx,
+ contri, &frame);
+ if (ret)
+ goto err;
+
ret = mq_get_lock_on_parent (frame, this);
if (ret == -1)
goto err;
return 0;
-fr_destroy:
- QUOTA_STACK_DESTROY (frame, this);
err:
mq_set_ctx_updation_status (ctx, _gf_false);
@@ -1970,11 +2034,46 @@ mq_initiate_quota_txn (xlator_t *this, loc_t *loc)
goto out;
}
+ /* Create the contribution node if it is absent. Is it right to
+ assume that if the contribution node is not there, then
+ create one and proceed instead of returning?
+ Reason for this assumption is for hard links. Suppose
+ hard link for a file f1 present in a directory d1 is
+ created in the directory d2 (as f2). Now, since d2's
+ contribution is not there in f1's inode ctx, d2's
+ contribution xattr won't be created and will create problems
+ for quota operations.
+ */
contribution = mq_get_contribution_node (loc->parent, ctx);
- if (contribution == NULL)
- goto out;
+ if (!contribution) {
+ if ((loc->path && strcmp (loc->path, "/"))
+ || (!uuid_is_null (loc->gfid)
+ && !__is_root_gfid (loc->gfid))
+ || (loc->inode && !uuid_is_null (loc->inode->gfid)
+ && !__is_root_gfid (loc->inode->gfid)))
+ gf_log_callingfn (this->name, GF_LOG_TRACE,
+ "contribution node for the "
+ "path (%s) with parent (%s) "
+ "not found", loc->path,
+ loc->parent?
+ uuid_utoa (loc->parent->gfid):
+ NULL);
+
+ contribution = mq_add_new_contribution_node (this, ctx, loc);
+ if (!contribution) {
+ if(loc->path && strcmp (loc->path, "/"))
+ gf_log_callingfn (this->name, GF_LOG_WARNING,
+ "could not allocate "
+ " contribution node for (%s) "
+ "parent: (%s)", loc->path,
+ loc->parent?
+ uuid_utoa (loc->parent->gfid):
+ NULL);
+ goto out;
+ }
+ }
- /* To improve performance, donot start another transaction
+ /* To improve performance, do not start another transaction
* if one is already in progress for same inode
*/
status = _gf_true;
@@ -1993,16 +2092,7 @@ out:
}
-/* int32_t */
-/* validate_inode_size_contribution (xlator_t *this, loc_t *loc, int64_t size, */
-/* int64_t contribution) */
-/* { */
-/* if (size != contribution) { */
-/* mq_initiate_quota_txn (this, loc); */
-/* } */
-/* return 0; */
-/* } */
int32_t
@@ -2031,12 +2121,13 @@ mq_inspect_directory_xattr (xlator_t *this,
}
}
- if (strcmp (loc->path, "/") != 0) {
+ if (!loc->path || (loc->path && strcmp (loc->path, "/") != 0)) {
contribution = mq_add_new_contribution_node (this, ctx, loc);
if (contribution == NULL) {
if (!uuid_is_null (loc->inode->gfid))
- gf_log (this->name, GF_LOG_WARNING,
- "cannot add a new contribution node");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "cannot add a new contribution node "
+ "(%s)", uuid_utoa (loc->inode->gfid));
ret = -1;
goto err;
}
@@ -2050,7 +2141,10 @@ mq_inspect_directory_xattr (xlator_t *this,
if (ret < 0)
goto out;
- if (strcmp (loc->path, "/") != 0) {
+ if ((loc->path && strcmp (loc->path, "/") != 0)
+ || (!uuid_is_null (loc->gfid) && !__is_root_gfid (loc->gfid))
+ || (loc->inode && !uuid_is_null (loc->inode->gfid) &&
+ !__is_root_gfid (loc->inode->gfid))) {
not_root = _gf_true;
GET_CONTRI_KEY (contri_key, contribution->gfid, ret);
@@ -2122,8 +2216,11 @@ mq_inspect_file_xattr (xlator_t *this,
}
contribution = mq_add_new_contribution_node (this, ctx, loc);
- if (contribution == NULL)
+ if (contribution == NULL) {
+ gf_log_callingfn (this->name, GF_LOG_DEBUG, "cannot allocate "
+ "contribution node (path:%s)", loc->path);
goto out;
+ }
LOCK (&ctx->lock);
{
@@ -2155,8 +2252,12 @@ mq_inspect_file_xattr (xlator_t *this,
if (size != contri_int) {
mq_initiate_quota_txn (this, loc);
}
- } else
- mq_initiate_quota_txn (this, loc);
+ } else {
+ if (size)
+ mq_initiate_quota_txn (this, loc);
+ else
+ mq_set_inode_xattr (this, loc);
+ }
}
out:
@@ -2169,8 +2270,8 @@ mq_xattr_state (xlator_t *this,
dict_t *dict,
struct iatt buf)
{
- if (buf.ia_type == IA_IFREG ||
- buf.ia_type == IA_IFLNK) {
+ if (((buf.ia_type == IA_IFREG) && !dht_is_linkfile (&buf, dict))
+ || (buf.ia_type == IA_IFLNK)) {
mq_inspect_file_xattr (this, loc, dict, buf);
} else if (buf.ia_type == IA_IFDIR)
mq_inspect_directory_xattr (this, loc, dict, buf);
@@ -2192,7 +2293,7 @@ mq_req_xattr (xlator_t *this,
goto set_size;
//if not "/" then request contribution
- if (strcmp (loc->path, "/") == 0)
+ if (loc->path && strcmp (loc->path, "/") == 0)
goto set_size;
ret = mq_dict_set_contribution (this, dict, loc);
@@ -2235,6 +2336,12 @@ _mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t ret = 0;
char contri_key [512] = {0, };
quota_local_t *local = NULL;
+ inode_t *inode = NULL;
+ dentry_t *tmp = NULL;
+ gf_boolean_t last_dentry = _gf_true;
+ loc_t loc = {0, };
+ dentry_t *other_dentry = NULL;
+ gf_boolean_t remove = _gf_false;
local = (quota_local_t *) frame->local;
@@ -2245,17 +2352,84 @@ _mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this,
frame->local = NULL;
- if (local->hl_count > 1) {
- GET_CONTRI_KEY (contri_key, local->contri->gfid, ret);
+ GET_CONTRI_KEY (contri_key, local->contri->gfid, ret);
- STACK_WIND (frame, mq_removexattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- &local->loc, contri_key, NULL);
- ret = 0;
- } else {
- mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL);
+ if (!local->loc.inode)
+ inode = inode_grep (local->loc.parent->table, local->loc.parent,
+ local->loc.name);
+ else
+ inode = inode_ref (local->loc.inode);
+
+ /* Suppose there are 2 directories dir1 and dir2. Quota limit is set on
+ both the directories. There is a file (f1) in dir1. A hard link is
+ created for that file inside the directory dir2 (say f2). Now one
+ more xattr is set in the inode as a new hard link is created in a
+ separate directory.
+ i.e trusted.glusterfs.quota.<gfid of dir2>.contri=<contribution>
+
+ Now when the hardlink f2 is removed, then the new xattr added (i.e
+ the xattr indicating its contribution to ITS parent directory) should
+ be removed (IFF there is not another hardlink for that file in the
+ same directory).
+
+ To do that upon getting unlink first check whether any other hard
+ links for the same inode exists in the same directory. If so do not
+ do anything and proceed for quota transaction.
+ Otherwise, if the removed entry was the only link for that inode
+ within that directory, then get another dentry for the inode
+ (by traversing the list of dentries for the inode) and using the
+ the dentry's parent and name, send removexattr so that the xattr
+ is removed.
+
+ If it is not done, then if the volume is restarted or the brick
+ process is restarted, then wrong quota usage will be shown for the
+ directory dir2.
+ */
+ if (inode) {
+ tmp = NULL;
+ list_for_each_entry (tmp, &inode->dentry_list, inode_list) {
+ if (local->loc.parent == tmp->parent) {
+ /* Compare the removed name with this dentry's name.
+ A dentry under the same parent with a different
+ name is another hard link in that directory, so
+ the removed entry was not the last one there.
+ (Comparing loc.name with itself is always 0 and
+ could never clear last_dentry.) */
+ if (strcmp (local->loc.name, tmp->name)) {
+ last_dentry = _gf_false;
+ break;
+ }
+ }
+ }
+ remove = last_dentry;
}
+ if (remove) {
+ if (!other_dentry) {
+ list_for_each_entry (tmp, &inode->dentry_list,
+ inode_list) {
+ if (local->loc.parent != tmp->parent) {
+ other_dentry = tmp;
+ break;
+ }
+ }
+ }
+
+ if (!other_dentry)
+ mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL);
+ else {
+ loc.parent = inode_ref (other_dentry->parent);
+ loc.name = gf_strdup (other_dentry->name);
+ uuid_copy (loc.pargfid , other_dentry->parent->gfid);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+ inode_path (other_dentry->parent, other_dentry->name,
+ (char **)&loc.path);
+
+ STACK_WIND (frame, mq_removexattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ &loc, contri_key, NULL);
+ }
+ } else
+ mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL);
+
+ ret = 0;
+
if (strcmp (local->parent_loc.path, "/") != 0) {
ret = mq_get_parent_inode_local (this, local);
if (ret < 0)
@@ -2266,6 +2440,8 @@ _mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this,
out:
mq_local_unref (this, local);
+ loc_wipe (&loc);
+ inode_unref (inode);
return 0;
}
@@ -2392,8 +2568,11 @@ mq_reduce_parent_size (xlator_t *this, loc_t *loc, int64_t contri)
goto out;
contribution = mq_get_contribution_node (loc->parent, ctx);
- if (contribution == NULL)
+ if (contribution == NULL) {
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "contribution for"
+ " the node %s is NULL", loc->path);
goto out;
+ }
local = mq_local_new ();
if (local == NULL) {
@@ -2412,6 +2591,8 @@ mq_reduce_parent_size (xlator_t *this, loc_t *loc, int64_t contri)
}
if (local->size == 0) {
+ gf_log_callingfn (this->name, GF_LOG_TRACE,
+ "local->size is 0 " "path: (%s)", loc->path);
ret = 0;
goto out;
}
@@ -2424,8 +2605,12 @@ mq_reduce_parent_size (xlator_t *this, loc_t *loc, int64_t contri)
local->contri = contribution;
ret = mq_inode_loc_fill (NULL, loc->parent, &local->parent_loc);
- if (ret < 0)
+ if (ret < 0) {
+ gf_log_callingfn (this->name, GF_LOG_INFO, "building parent loc"
+ " failed. (gfid: %s)",
+ uuid_utoa (loc->parent->gfid));
goto out;
+ }
frame = create_frame (this, this->ctx->pool);
if (!frame) {
diff --git a/xlators/features/marker/src/marker-quota.h b/xlators/features/marker/src/marker-quota.h
index 385760ac4..42def9d22 100644
--- a/xlators/features/marker/src/marker-quota.h
+++ b/xlators/features/marker/src/marker-quota.h
@@ -42,8 +42,6 @@
var = GF_CALLOC (sizeof (type), 1, \
gf_marker_mt_##type); \
if (!var) { \
- gf_log ("", GF_LOG_ERROR, \
- "out of memory"); \
ret = -1; \
} \
} while (0);
@@ -61,13 +59,20 @@
ret = 0; \
} while (0);
-#define GET_CONTRI_KEY(var, _gfid, _ret) \
- do { \
- char _gfid_unparsed[40]; \
- uuid_unparse (_gfid, _gfid_unparsed); \
- _ret = snprintf (var, CONTRI_KEY_MAX, QUOTA_XATTR_PREFIX \
- ".%s.%s." CONTRIBUTION, "quota", \
- _gfid_unparsed); \
+#define GET_CONTRI_KEY(var, _gfid, _ret) \
+ do { \
+ if (_gfid != NULL) { \
+ char _gfid_unparsed[40]; \
+ uuid_unparse (_gfid, _gfid_unparsed); \
+ _ret = snprintf (var, CONTRI_KEY_MAX, \
+ QUOTA_XATTR_PREFIX \
+ ".%s.%s." CONTRIBUTION, "quota", \
+ _gfid_unparsed); \
+ } else { \
+ _ret = snprintf (var, CONTRI_KEY_MAX, \
+ QUOTA_XATTR_PREFIX \
+ ".%s.." CONTRIBUTION, "quota"); \
+ } \
} while (0);
#define QUOTA_SAFE_INCREMENT(lock, var) \
diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c
index 82d9066d5..adcf3d8e7 100644
--- a/xlators/features/marker/src/marker.c
+++ b/xlators/features/marker/src/marker.c
@@ -21,6 +21,7 @@
#include "marker-quota-helper.h"
#include "marker-common.h"
#include "byte-order.h"
+#include "syncop.h"
#define _GF_UID_GID_CHANGED 1
@@ -185,6 +186,8 @@ marker_local_unref (marker_local_t *local)
loc_wipe (&local->loc);
loc_wipe (&local->parent_loc);
+ if (local->xdata)
+ dict_unref (local->xdata);
if (local->oplocal) {
marker_local_unref (local->oplocal);
@@ -252,18 +255,18 @@ out:
return 0;
}
-int32_t
+gf_boolean_t
call_from_special_client (call_frame_t *frame, xlator_t *this, const char *name)
{
struct volume_mark *vol_mark = NULL;
marker_conf_t *priv = NULL;
- gf_boolean_t ret = _gf_true;
+ gf_boolean_t is_true = _gf_true;
priv = (marker_conf_t *)this->private;
if (frame->root->pid != GF_CLIENT_PID_GSYNCD || name == NULL ||
strcmp (name, MARKER_XATTR_PREFIX "." VOLUME_MARK) != 0) {
- ret = _gf_false;
+ is_true = _gf_false;
goto out;
}
@@ -271,7 +274,7 @@ call_from_special_client (call_frame_t *frame, xlator_t *this, const char *name)
marker_getxattr_stampfile_cbk (frame, this, name, vol_mark, NULL);
out:
- return ret;
+ return is_true;
}
int32_t
@@ -279,15 +282,65 @@ marker_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
+ int ret = 0;
+ char *src = NULL;
+ char *dst = NULL;
+ int len = 0;
+ marker_local_t *local = NULL;
+
+ local = frame->local;
+
+
if (cookie) {
gf_log (this->name, GF_LOG_DEBUG,
"Filtering the quota extended attributes");
- dict_foreach_fnmatch (dict, "trusted.glusterfs.quota*",
- marker_filter_quota_xattr, NULL);
+ /* If the getxattr is from a non special client, then do not
+ copy the quota related xattrs (except the quota limit key
+ i.e trusted.glusterfs.quota.limit-set which has been set by
+ glusterd on the directory on which quota limit is set.) for
+ directories. Let the healing of xattrs happen upon lookup.
+ NOTE: setting of trusted.glusterfs.quota.limit-set as of now
+ happens from glusterd. It should be moved to quotad. Also
+ trusted.glusterfs.quota.limit-set is set on directory which
+ is permanent till quota is removed on that directory or limit
+ is changed. So let that xattr be healed by other xlators
+ properly whenever directory healing is done.
+ */
+ ret = dict_get_ptr_and_len (dict, QUOTA_LIMIT_KEY,
+ (void **)&src, &len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "dict_get on %s "
+ "failed", QUOTA_LIMIT_KEY);
+ } else {
+ dst = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+ if (dst)
+ memcpy (dst, src, len);
+ }
+
+ /*
+ * Except limit-set xattr, rest of the xattrs are maintained
+ * by quota xlator. Don't expose them to other xlators.
+ * This filter makes sure quota xattrs are not healed as part of
+ * metadata self-heal
+ */
+ GF_REMOVE_INTERNAL_XATTR ("trusted.glusterfs.quota*", dict);
+ if (!ret && IA_ISDIR (local->loc.inode->ia_type) && dst) {
+ ret = dict_set_dynptr (dict, QUOTA_LIMIT_KEY,
+ dst, len);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "setting "
+ "key %s failed", QUOTA_LIMIT_KEY);
+ else
+ dst = NULL;
+ }
}
+ GF_FREE (dst);
+
+ frame->local = NULL;
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
+ marker_local_unref (local);
return 0;
}
@@ -295,20 +348,29 @@ int32_t
marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
- gf_boolean_t ret = _gf_false;
- marker_conf_t *priv = NULL;
- unsigned long cookie = 0;
+ gf_boolean_t is_true = _gf_false;
+ marker_conf_t *priv = NULL;
+ unsigned long cookie = 0;
+ marker_local_t *local = NULL;
priv = this->private;
- if (priv == NULL || (priv->feature_enabled & GF_XTIME) == 0)
- goto wind;
+ frame->local = mem_get0 (this->local_pool);
+ local = frame->local;
+ if (local == NULL)
+ goto out;
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ if ((loc_copy (&local->loc, loc)) < 0)
+ goto out;
gf_log (this->name, GF_LOG_DEBUG, "USER:PID = %d", frame->root->pid);
- ret = call_from_special_client (frame, this, name);
-wind:
- if (ret == _gf_false) {
+ if (priv && priv->feature_enabled & GF_XTIME)
+ is_true = call_from_special_client (frame, this, name);
+
+ if (is_true == _gf_false) {
if (name == NULL) {
/* Signifies that marker translator
* has to filter the quota's xattr's,
@@ -317,13 +379,19 @@ wind:
*/
cookie = 1;
}
- STACK_WIND_COOKIE (frame, marker_getxattr_cbk, (void *)cookie,
+ STACK_WIND_COOKIE (frame, marker_getxattr_cbk,
+ (void *)cookie,
FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, loc,
- name, xdata);
+ FIRST_CHILD(this)->fops->getxattr,
+ loc, name, xdata);
}
return 0;
+out:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL);
+ marker_local_unref (local);
+ return 0;
}
@@ -466,11 +534,16 @@ marker_create_frame (xlator_t *this, marker_local_t *local)
int32_t
marker_xtime_update_marks (xlator_t *this, marker_local_t *local)
{
+ marker_conf_t *priv = NULL;
+
GF_VALIDATE_OR_GOTO ("marker", this, out);
GF_VALIDATE_OR_GOTO (this->name, local, out);
- if ((local->pid == GF_CLIENT_PID_GSYNCD) ||
- (local->pid == GF_CLIENT_PID_DEFRAG))
+ priv = this->private;
+
+ if ((local->pid == GF_CLIENT_PID_GSYNCD
+ && !(priv->feature_enabled & GF_XTIME_GSYNC_FORCE))
+ || (local->pid == GF_CLIENT_PID_DEFRAG))
goto out;
marker_gettimeofday (local);
@@ -800,8 +873,10 @@ marker_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv = this->private;
- if ((priv->feature_enabled & GF_QUOTA) && (local->ia_nlink == 1))
- mq_reduce_parent_size (this, &local->loc, -1);
+ if (priv->feature_enabled & GF_QUOTA) {
+ if (!local->skip_txn)
+ mq_reduce_parent_size (this, &local->loc, -1);
+ }
if (priv->feature_enabled & GF_XTIME)
marker_xtime_update_marks (this, local);
@@ -813,37 +888,6 @@ out:
int32_t
-marker_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
-{
- marker_local_t *local = NULL;
-
- local = frame->local;
- if (op_ret < 0) {
- goto err;
- }
-
- if (local == NULL) {
- op_errno = EINVAL;
- goto err;
- }
-
- local->ia_nlink = buf->ia_nlink;
-
- STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag,
- NULL);
- return 0;
-err:
- frame->local = NULL;
- STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, NULL, NULL);
- marker_local_unref (local);
- return 0;
-}
-
-
-int32_t
marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
dict_t *xdata)
{
@@ -858,6 +902,8 @@ marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
local = mem_get0 (this->local_pool);
local->xflag = xflag;
+ if (xdata)
+ local->xdata = dict_ref (xdata);
MARKER_INIT_LOCAL (frame, local);
ret = loc_copy (&local->loc, loc);
@@ -865,12 +911,10 @@ marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
if (ret == -1)
goto err;
- if (uuid_is_null (loc->gfid) && loc->inode)
- uuid_copy (loc->gfid, loc->inode->gfid);
-
- STACK_WIND (frame, marker_unlink_stat_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat, loc, xdata);
- return 0;
+ if (xdata && dict_get (xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY)) {
+ local->skip_txn = 1;
+ goto unlink_wind;
+ }
unlink_wind:
STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this),
@@ -910,8 +954,11 @@ marker_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv = this->private;
- if (priv->feature_enabled & GF_QUOTA)
- mq_initiate_quota_txn (this, &local->loc);
+ if (priv->feature_enabled & GF_QUOTA) {
+ if (!local->skip_txn)
+ mq_set_inode_xattr (this, &local->loc);
+ }
+
if (priv->feature_enabled & GF_XTIME)
marker_xtime_update_marks (this, local);
@@ -942,6 +989,9 @@ marker_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
if (ret == -1)
goto err;
+
+ if (xdata && dict_get (xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY))
+ local->skip_txn = 1;
wind:
STACK_WIND (frame, marker_link_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
@@ -971,7 +1021,7 @@ marker_rename_done (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret < 0) {
if (local->err == 0) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
}
gf_log (this->name, GF_LOG_WARNING,
@@ -987,6 +1037,11 @@ marker_rename_done (call_frame_t *frame, void *cookie, xlator_t *this,
} else if (local->err != 0) {
STACK_UNWIND_STRICT (rename, frame, -1, local->err, NULL, NULL,
NULL, NULL, NULL, NULL);
+ } else {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "continuation stub to unwind the call is absent, hence "
+ "call will be hung (call-stack id = %"PRIu64")",
+ frame->root->unique);
}
mq_reduce_parent_size (this, &oplocal->loc, oplocal->contribution);
@@ -1002,7 +1057,7 @@ marker_rename_done (call_frame_t *frame, void *cookie, xlator_t *this,
newloc.name++;
newloc.parent = inode_ref (local->loc.parent);
- mq_rename_update_newpath (this, &newloc);
+ mq_set_inode_xattr (this, &newloc);
loc_wipe (&newloc);
@@ -1032,7 +1087,7 @@ marker_rename_release_newp_lock (call_frame_t *frame, void *cookie,
if (op_ret < 0) {
if (local->err == 0) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
}
gf_log (this->name, GF_LOG_WARNING,
@@ -1221,7 +1276,7 @@ marker_do_rename (call_frame_t *frame, void *cookie, xlator_t *this,
MARKER_RESET_UID_GID (frame, frame->root, local);
if ((op_ret < 0) && (op_errno != ENOATTR)) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"fetching contribution values from %s (gfid:%s) "
"failed (%s)", local->loc.path,
@@ -1233,7 +1288,7 @@ marker_do_rename (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->loc.inode != NULL) {
GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto err;
}
@@ -1273,7 +1328,7 @@ marker_get_newpath_contribution (call_frame_t *frame, void *cookie,
MARKER_RESET_UID_GID (frame, frame->root, local);
if ((op_ret < 0) && (op_errno != ENOATTR)) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"fetching contribution values from %s (gfid:%s) "
"failed (%s)", oplocal->loc.path,
@@ -1284,7 +1339,7 @@ marker_get_newpath_contribution (call_frame_t *frame, void *cookie,
GET_CONTRI_KEY (contri_key, oplocal->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto err;
}
@@ -1294,7 +1349,7 @@ marker_get_newpath_contribution (call_frame_t *frame, void *cookie,
if (local->loc.inode != NULL) {
GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto err;
}
@@ -1335,7 +1390,7 @@ marker_get_oldpath_contribution (call_frame_t *frame, void *cookie,
oplocal = local->oplocal;
if (op_ret < 0) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"cannot hold inodelk on %s (gfid:%s) (%s)",
local->next_lock_on->path,
@@ -1346,7 +1401,7 @@ marker_get_oldpath_contribution (call_frame_t *frame, void *cookie,
GET_CONTRI_KEY (contri_key, oplocal->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto quota_err;
}
@@ -1402,7 +1457,7 @@ marker_rename_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
loc = &local->parent_loc;
}
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"cannot hold inodelk on %s (gfid:%s) (%s)",
loc->path, uuid_utoa (loc->inode->gfid),
@@ -1808,6 +1863,210 @@ err:
}
+int32_t
+marker_fallocate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_TRACE, "%s occurred while "
+ "fallocating a file ", strerror (op_errno));
+ }
+
+ local = (marker_local_t *) frame->local;
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+
+ if (op_ret == -1 || local == NULL)
+ goto out;
+
+ priv = this->private;
+
+ if (priv->feature_enabled & GF_QUOTA)
+ mq_initiate_quota_txn (this, &local->loc);
+
+ if (priv->feature_enabled & GF_XTIME)
+ marker_xtime_update_marks (this, local);
+out:
+ marker_local_unref (local);
+
+ return 0;
+}
+
+int32_t
+marker_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ priv = this->private;
+
+ if (priv->feature_enabled == 0)
+ goto wind;
+
+ local = mem_get0 (this->local_pool);
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ ret = marker_inode_loc_fill (fd->inode, &local->loc);
+
+ if (ret == -1)
+ goto err;
+wind:
+ STACK_WIND (frame, marker_fallocate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int32_t
+marker_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_TRACE, "%s occurred during discard",
+ strerror (op_errno));
+ }
+
+ local = (marker_local_t *) frame->local;
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+
+ if (op_ret == -1 || local == NULL)
+ goto out;
+
+ priv = this->private;
+
+ if (priv->feature_enabled & GF_QUOTA)
+ mq_initiate_quota_txn (this, &local->loc);
+
+ if (priv->feature_enabled & GF_XTIME)
+ marker_xtime_update_marks (this, local);
+out:
+ marker_local_unref (local);
+
+ return 0;
+}
+
+int32_t
+marker_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ priv = this->private;
+
+ if (priv->feature_enabled == 0)
+ goto wind;
+
+ local = mem_get0 (this->local_pool);
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ ret = marker_inode_loc_fill (fd->inode, &local->loc);
+
+ if (ret == -1)
+ goto err;
+wind:
+ STACK_WIND (frame, marker_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int32_t
+marker_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_TRACE, "%s occurred during zerofill",
+ strerror (op_errno));
+ }
+
+ local = (marker_local_t *) frame->local;
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+
+ if (op_ret == -1 || local == NULL)
+ goto out;
+
+ priv = this->private;
+
+ if (priv->feature_enabled & GF_QUOTA)
+ mq_initiate_quota_txn (this, &local->loc);
+
+ if (priv->feature_enabled & GF_XTIME)
+ marker_xtime_update_marks (this, local);
+out:
+ marker_local_unref (local);
+
+ return 0;
+}
+
+int32_t
+marker_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ priv = this->private;
+
+ if (priv->feature_enabled == 0)
+ goto wind;
+
+ local = mem_get0 (this->local_pool);
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ ret = marker_inode_loc_fill (fd->inode, &local->loc);
+
+ if (ret == -1)
+ goto err;
+wind:
+ STACK_WIND (frame, marker_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
/* when a call from the special client is received on
* key trusted.glusterfs.volume-mark with value "RESET"
* or if the value is 0length, update the change the
@@ -1900,16 +2159,150 @@ out:
return 0;
}
+int
+remove_quota_keys (dict_t *dict, char *k, data_t *v, void *data)
+{
+ call_frame_t *frame = data;
+ marker_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ int ret = -1;
+
+ ret = syncop_removexattr (FIRST_CHILD (this), &local->loc, k, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Failed to remove "
+ "extended attribute: %s", local->loc.path, k);
+ return -1;
+ }
+ return 0;
+}
+
+int
+quota_xattr_cleaner_cbk (int ret, call_frame_t *frame, void *args)
+{
+ dict_t *xdata = args;
+ int op_ret = -1;
+ int op_errno = 0;
+ marker_local_t *local = NULL;
+
+ local = frame->local;
+ frame->local = NULL;
+
+ op_ret = (ret < 0)? -1: 0;
+ op_errno = -ret;
+
+ STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+ marker_local_unref (local);
+ return ret;
+}
+
+int
+quota_xattr_cleaner (void *args)
+{
+ struct synctask *task = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ marker_local_t *local = NULL;
+ dict_t *xdata = NULL;
+ int ret = -1;
+
+ task = synctask_get ();
+ if (!task)
+ goto out;
+
+ frame = task->frame;
+ this = frame->this;
+ local = frame->local;
+
+ ret = syncop_listxattr (FIRST_CHILD(this), &local->loc, &xdata);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ ret = dict_foreach_fnmatch (xdata, "trusted.glusterfs.quota.*",
+ remove_quota_keys, frame);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+ ret = dict_foreach_fnmatch (xdata, PGFID_XATTR_KEY_PREFIX"*",
+ remove_quota_keys, frame);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (xdata)
+ dict_unref (xdata);
+
+ return ret;
+}
+
+int
+marker_do_xattr_cleanup (call_frame_t *frame, xlator_t *this, dict_t *xdata,
+ loc_t *loc)
+{
+ int ret = -1;
+ marker_local_t *local = NULL;
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto out;
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ loc_copy (&local->loc, loc);
+ ret = synctask_new (this->ctx->env, quota_xattr_cleaner,
+ quota_xattr_cleaner_cbk, frame, xdata);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create synctask "
+ "for cleaning up quota extended attributes");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (setxattr, frame, -1, ENOMEM, xdata);
+ marker_local_unref (local);
+ }
+ return ret;
+}
+
+static inline gf_boolean_t
+marker_xattr_cleanup_cmd (dict_t *dict)
+{
+ return (dict_get (dict, VIRTUAL_QUOTA_XATTR_CLEANUP_KEY) != NULL);
+}
+
int32_t
marker_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
int32_t flags, dict_t *xdata)
{
- int32_t ret = 0;
- marker_local_t *local = NULL;
- marker_conf_t *priv = NULL;
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+ int op_errno = ENOMEM;
priv = this->private;
+ if (marker_xattr_cleanup_cmd (dict)) {
+ if (frame->root->uid != 0 || frame->root->gid != 0) {
+ op_errno = EPERM;
+ ret = -1;
+ goto err;
+ }
+
+ /* The following function does the cleanup and then unwinds the
+ * corresponding call*/
+ loc_path (loc, NULL);
+ marker_do_xattr_cleanup (frame, this, xdata, loc);
+ return 0;
+ }
+
if (priv->feature_enabled == 0)
goto wind;
@@ -1930,7 +2323,7 @@ wind:
FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
return 0;
err:
- STACK_UNWIND_STRICT (setxattr, frame, -1, ENOMEM, NULL);
+ STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, NULL);
return 0;
}
@@ -2277,22 +2670,94 @@ err:
return 0;
}
+
+int
+marker_build_ancestry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ gf_dirent_t *entry = NULL;
+ loc_t loc = {0, };
+ inode_t *parent = NULL;
+
+ if ((op_ret <= 0) || (entries == NULL)) {
+ goto out;
+ }
+
+
+ list_for_each_entry (entry, &entries->list, list) {
+ if (entry->inode == entry->inode->table->root) {
+ loc.path = gf_strdup ("/");
+ inode_unref (parent);
+ parent = NULL;
+ }
+
+ loc.inode = inode_ref (entry->inode);
+
+ if (parent != NULL) {
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, parent->gfid);
+ }
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+
+ mq_xattr_state (this, &loc, entry->dict, entry->d_stat);
+
+ inode_unref (parent);
+ parent = inode_ref (entry->inode);
+ loc_wipe (&loc);
+ }
+
+ if (parent)
+ inode_unref (parent);
+
+out:
+ STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
int
marker_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- gf_dirent_t *entry = NULL;
+ gf_dirent_t *entry = NULL;
+ marker_conf_t *priv = NULL;
+ marker_local_t *local = NULL;
+ loc_t loc = {0, };
if (op_ret <= 0)
goto unwind;
+ priv = this->private;
+ local = frame->local;
+
+ if (!(priv->feature_enabled & GF_QUOTA) || (local == NULL)) {
+ goto unwind;
+ }
+
list_for_each_entry (entry, &entries->list, list) {
- /* TODO: fill things */
+ if ((strcmp (entry->d_name, ".") == 0) ||
+ (strcmp (entry->d_name, "..") == 0))
+ continue;
+
+ loc.inode = inode_ref (entry->inode);
+ loc.parent = inode_ref (local->loc.inode);
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+ uuid_copy (loc.pargfid, loc.parent->gfid);
+
+ mq_xattr_state (this, &loc, entry->dict, entry->d_stat);
+
+ loc_wipe (&loc);
}
unwind:
+ local = frame->local;
+ frame->local = NULL;
+
STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
+ marker_local_unref (local);
return 0;
}
@@ -2301,20 +2766,36 @@ int
marker_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
- marker_conf_t *priv = NULL;
+ marker_conf_t *priv = NULL;
+ loc_t loc = {0, };
+ marker_local_t *local = NULL;
priv = this->private;
- if (priv->feature_enabled == 0)
- goto wind;
+ if ((dict != NULL) && dict_get (dict, GET_ANCESTRY_DENTRY_KEY)) {
+ STACK_WIND (frame, marker_build_ancestry_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp,
+ fd, size, offset, dict);
+ } else {
+ if (priv->feature_enabled & GF_QUOTA) {
+ local = mem_get0 (this->local_pool);
- if ((priv->feature_enabled & GF_QUOTA) && dict)
- mq_req_xattr (this, NULL, dict);
+ MARKER_INIT_LOCAL (frame, local);
-wind:
- STACK_WIND (frame, marker_readdirp_cbk,
- FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
- fd, size, offset, dict);
+ loc.parent = local->loc.inode = inode_ref (fd->inode);
+
+ if (dict == NULL)
+ dict = dict_new ();
+
+ mq_req_xattr (this, &loc, dict);
+ }
+
+ STACK_WIND (frame, marker_readdirp_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp,
+ fd, size, offset, dict);
+ }
return 0;
}
@@ -2452,7 +2933,7 @@ out:
int32_t
reconfigure (xlator_t *this, dict_t *options)
{
- int32_t ret = -1;
+ int32_t ret = 0;
data_t *data = NULL;
gf_boolean_t flag = _gf_false;
marker_conf_t *priv = NULL;
@@ -2493,11 +2974,17 @@ reconfigure (xlator_t *this, dict_t *options)
"xtime updation will fail");
} else {
priv->feature_enabled |= GF_XTIME;
+ data = dict_get (options, "gsync-force-xtime");
+ if (!data)
+ goto out;
+ ret = gf_string2boolean (data->data, &flag);
+ if (ret == 0 && flag)
+ priv->feature_enabled |= GF_XTIME_GSYNC_FORCE;
}
}
}
out:
- return 0;
+ return ret;
}
@@ -2553,9 +3040,16 @@ init (xlator_t *this)
goto err;
priv->feature_enabled |= GF_XTIME;
+ data = dict_get (options, "gsync-force-xtime");
+ if (!data)
+ goto cont;
+ ret = gf_string2boolean (data->data, &flag);
+ if (ret == 0 && flag)
+ priv->feature_enabled |= GF_XTIME_GSYNC_FORCE;
}
}
+ cont:
this->local_pool = mem_pool_new (marker_local_t, 128);
if (!this->local_pool) {
gf_log (this->name, GF_LOG_ERROR,
@@ -2617,6 +3111,9 @@ struct xlator_fops fops = {
.removexattr = marker_removexattr,
.getxattr = marker_getxattr,
.readdirp = marker_readdirp,
+ .fallocate = marker_fallocate,
+ .discard = marker_discard,
+ .zerofill = marker_zerofill,
};
struct xlator_cbks cbks = {
@@ -2628,5 +3125,6 @@ struct volume_options options[] = {
{.key = {"timestamp-file"}},
{.key = {"quota"}},
{.key = {"xtime"}},
+ {.key = {"gsync-force-xtime"}},
{.key = {NULL}}
};
diff --git a/xlators/features/marker/src/marker.h b/xlators/features/marker/src/marker.h
index 63491ab37..23d1580f0 100644
--- a/xlators/features/marker/src/marker.h
+++ b/xlators/features/marker/src/marker.h
@@ -28,8 +28,9 @@
#define TIMESTAMP_FILE "timestamp-file"
enum {
- GF_QUOTA=1,
- GF_XTIME=2
+ GF_QUOTA = 1,
+ GF_XTIME = 2,
+ GF_XTIME_GSYNC_FORCE = 4,
};
/*initialize the local variable*/
@@ -110,6 +111,8 @@ struct marker_local{
inode_contribution_t *contri;
int xflag;
+ dict_t *xdata;
+ gf_boolean_t skip_txn;
};
typedef struct marker_local marker_local_t;
diff --git a/xlators/features/marker/utils/Makefile.am b/xlators/features/marker/utils/Makefile.am
deleted file mode 100644
index 556951d9f..000000000
--- a/xlators/features/marker/utils/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = syncdaemon src
-
-CLEANFILES =
diff --git a/xlators/features/marker/utils/src/Makefile.am b/xlators/features/marker/utils/src/Makefile.am
deleted file mode 100644
index 9e410cda6..000000000
--- a/xlators/features/marker/utils/src/Makefile.am
+++ /dev/null
@@ -1,26 +0,0 @@
-gsyncddir = $(libexecdir)/glusterfs
-
-gsyncd_PROGRAMS = gsyncd
-
-gsyncd_SOURCES = gsyncd.c procdiggy.c
-
-gsyncd_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
- $(GF_GLUSTERFS_LIBS)
-
-gsyncd_LDFLAGS = $(GF_LDFLAGS)
-
-noinst_HEADERS = procdiggy.h
-
-AM_CPPFLAGS = $(GF_CPPFLAGS) \
- -I$(top_srcdir)/libglusterfs/src\
- -DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\
- -DUSE_LIBGLUSTERFS\
- -DSBIN_DIR=\"$(sbindir)\" -DPYTHON=\"$(PYTHON)\"
-
-AM_CFLAGS = -Wall $(GF_CFLAGS)
-
-
-CLEANFILES =
-
-$(top_builddir)/libglusterfs/src/libglusterfs.la:
- $(MAKE) -C $(top_builddir)/libglusterfs/src/ all
diff --git a/xlators/features/marker/utils/src/gsyncd.c b/xlators/features/marker/utils/src/gsyncd.c
deleted file mode 100644
index 9c4a5bdff..000000000
--- a/xlators/features/marker/utils/src/gsyncd.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/param.h> /* for PATH_MAX */
-
-/* NOTE (USE_LIBGLUSTERFS):
- * ------------------------
- * When USE_LIBGLUSTERFS debugging sumbol is passed; perform
- * glusterfs translator like initialization so that glusterfs
- * globals, contexts are valid when glustefs api's are invoked.
- * We unconditionally pass then while building gsyncd binary.
- */
-#ifdef USE_LIBGLUSTERFS
-#include "glusterfs.h"
-#include "globals.h"
-#endif
-
-#include "common-utils.h"
-#include "run.h"
-#include "procdiggy.h"
-
-#define _GLUSTERD_CALLED_ "_GLUSTERD_CALLED_"
-#define _GSYNCD_DISPATCHED_ "_GSYNCD_DISPATCHED_"
-#define GSYNCD_CONF "geo-replication/gsyncd.conf"
-#define GSYNCD_PY "gsyncd.py"
-#define RSYNC "rsync"
-
-int restricted = 0;
-
-static int
-duplexpand (void **buf, size_t tsiz, size_t *len)
-{
- size_t osiz = tsiz * *len;
- char *p = realloc (*buf, osiz << 1);
- if (!p) {
- free(*buf);
- return -1;
- }
-
- memset (p + osiz, 0, osiz);
- *buf = p;
- *len <<= 1;
-
- return 0;
-}
-
-static int
-str2argv (char *str, char ***argv)
-{
- char *p = NULL;
- char *savetok = NULL;
- int argc = 0;
- size_t argv_len = 32;
- int ret = 0;
-
- assert (str);
- str = strdup (str);
- if (!str)
- return -1;
-
- *argv = calloc (argv_len, sizeof (**argv));
- if (!*argv)
- goto error;
-
- while ((p = strtok_r (str, " ", &savetok))) {
- str = NULL;
-
- argc++;
- if (argc == argv_len) {
- ret = duplexpand ((void *)argv,
- sizeof (**argv),
- &argv_len);
- if (ret == -1)
- goto error;
- }
- (*argv)[argc - 1] = p;
- }
-
- return argc;
-
- error:
- fprintf (stderr, "out of memory\n");
- return -1;
-}
-
-static int
-invoke_gsyncd (int argc, char **argv)
-{
- char config_file[PATH_MAX] = {0,};
- size_t gluster_workdir_len = 0;
- runner_t runner = {0,};
- int i = 0;
- int j = 0;
- char *nargv[argc + 4];
- char *python = NULL;
-
- if (restricted) {
- size_t len;
- /* in restricted mode we forcibly use the system-wide config */
- runinit (&runner);
- runner_add_args (&runner, SBIN_DIR"/gluster",
- "--log-file=-", "system::", "getwd",
- NULL);
- runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
- if (runner_start (&runner) == 0 &&
- fgets (config_file, PATH_MAX,
- runner_chio (&runner, STDOUT_FILENO)) != NULL &&
- (len = strlen (config_file)) &&
- config_file[len - 1] == '\n' &&
- runner_end (&runner) == 0)
- gluster_workdir_len = len - 1;
-
- if (gluster_workdir_len) {
- if (gluster_workdir_len + 1 + strlen (GSYNCD_CONF) + 1 >
- PATH_MAX)
- goto error;
- config_file[gluster_workdir_len] = '/';
- strcat (config_file, GSYNCD_CONF);
- } else
- goto error;
-
- if (setenv ("_GSYNCD_RESTRICTED_", "1", 1) == -1)
- goto error;
- }
-
- if (chdir ("/") == -1)
- goto error;
-
- j = 0;
- python = getenv("PYTHON");
- if(!python)
- python = PYTHON;
- nargv[j++] = python;
- nargv[j++] = GSYNCD_PREFIX"/python/syncdaemon/"GSYNCD_PY;
- for (i = 1; i < argc; i++)
- nargv[j++] = argv[i];
- if (config_file[0]) {
- nargv[j++] = "-c";
- nargv[j++] = config_file;
- }
- nargv[j++] = NULL;
-
- execvp (python, nargv);
-
- fprintf (stderr, "exec of '%s' failed\n", python);
- return 127;
-
- error:
- fprintf (stderr, "gsyncd initializaion failed\n");
- return 1;
-}
-
-
-static int
-find_gsyncd (pid_t pid, pid_t ppid, char *name, void *data)
-{
- char buf[NAME_MAX * 2] = {0,};
- char path[PATH_MAX] = {0,};
- char *p = NULL;
- int zeros = 0;
- int ret = 0;
- int fd = -1;
- pid_t *pida = (pid_t *)data;
-
- if (ppid != pida[0])
- return 0;
-
- sprintf (path, PROC"/%d/cmdline", pid);
- fd = open (path, O_RDONLY);
- if (fd == -1)
- return 0;
- ret = read (fd, buf, sizeof (buf));
- close (fd);
- if (ret == -1)
- return 0;
- for (zeros = 0, p = buf; zeros < 2 && p < buf + ret; p++)
- zeros += !*p;
-
- ret = 0;
- switch (zeros) {
- case 2:
- if ((strcmp (basename (buf), basename (PYTHON)) ||
- strcmp (basename (buf + strlen (buf) + 1), GSYNCD_PY)) == 0) {
- ret = 1;
- break;
- }
- /* fallthrough */
- case 1:
- if (strcmp (basename (buf), GSYNCD_PY) == 0)
- ret = 1;
- }
-
- if (ret == 1) {
- if (pida[1] != -1) {
- fprintf (stderr, GSYNCD_PY" sibling is not unique");
- return -1;
- }
- pida[1] = pid;
- }
-
- return 0;
-}
-
-static int
-invoke_rsync (int argc, char **argv)
-{
- int i = 0;
- char path[PATH_MAX] = {0,};
- pid_t pid = -1;
- pid_t ppid = -1;
- pid_t pida[] = {-1, -1};
- char *name = NULL;
- char buf[PATH_MAX + 1] = {0,};
- int ret = 0;
-
- assert (argv[argc] == NULL);
-
- if (argc < 2 || strcmp (argv[1], "--server") != 0)
- goto error;
-
- for (i = 2; i < argc && argv[i][0] == '-'; i++);
-
- if (!(i == argc - 2 && strcmp (argv[i], ".") == 0 && argv[i + 1][0] == '/')) {
- fprintf (stderr, "need an rsync invocation without protected args\n");
- goto error;
- }
-
- /* look up sshd we are spawned from */
- for (pid = getpid () ;; pid = ppid) {
- ppid = pidinfo (pid, &name);
- if (ppid < 0) {
- fprintf (stderr, "sshd ancestor not found\n");
- goto error;
- }
- if (strcmp (name, "sshd") == 0) {
- GF_FREE (name);
- break;
- }
- GF_FREE (name);
- }
- /* look up "ssh-sibling" gsyncd */
- pida[0] = pid;
- ret = prociter (find_gsyncd, pida);
- if (ret == -1 || pida[1] == -1) {
- fprintf (stderr, "gsyncd sibling not found\n");
- goto error;
- }
- /* check if rsync target matches gsyncd target */
- sprintf (path, PROC"/%d/cwd", pida[1]);
- ret = readlink (path, buf, sizeof (buf));
- if (ret == -1 || ret == sizeof (buf))
- goto error;
- if (strcmp (argv[argc - 1], "/") == 0 /* root dir cannot be a target */ ||
- (strcmp (argv[argc - 1], path) /* match against gluster target */ &&
- strcmp (argv[argc - 1], buf) /* match against file target */) != 0) {
- fprintf (stderr, "rsync target does not match "GEOREP" session\n");
- goto error;
- }
-
- argv[0] = RSYNC;
-
- execvp (RSYNC, argv);
-
- fprintf (stderr, "exec of "RSYNC" failed\n");
- return 127;
-
- error:
- fprintf (stderr, "disallowed "RSYNC" invocation\n");
- return 1;
-}
-
-
-struct invocable {
- char *name;
- int (*invoker) (int argc, char **argv);
-};
-
-struct invocable invocables[] = {
- { "rsync", invoke_rsync },
- { "gsyncd", invoke_gsyncd },
- { NULL, NULL}
-};
-
-int
-main (int argc, char **argv)
-{
- char *evas = NULL;
- struct invocable *i = NULL;
- char *b = NULL;
- char *sargv = NULL;
-
-#ifdef USE_LIBGLUSTERFS
- glusterfs_ctx_t *ctx = NULL;
-
- ctx = glusterfs_ctx_new ();
- if (!ctx)
- return ENOMEM;
-
- if (glusterfs_globals_init (ctx))
- return 1;
-
- THIS->ctx = ctx;
-#endif
-
- evas = getenv (_GLUSTERD_CALLED_);
- if (evas && strcmp (evas, "1") == 0)
- /* OK, we know glusterd called us, no need to look for further config
- * ... altough this conclusion should not inherit to our children
- */
- unsetenv (_GLUSTERD_CALLED_);
- else {
- /* we regard all gsyncd invocations unsafe
- * that do not come from glusterd and
- * therefore restrict it
- */
- restricted = 1;
-
- if (!getenv (_GSYNCD_DISPATCHED_)) {
- evas = getenv ("SSH_ORIGINAL_COMMAND");
- if (evas)
- sargv = evas;
- else {
- evas = getenv ("SHELL");
- if (evas && strcmp (basename (evas), "gsyncd") == 0 &&
- argc == 3 && strcmp (argv[1], "-c") == 0)
- sargv = argv[2];
- }
- }
-
- }
-
- if (!(sargv && restricted))
- return invoke_gsyncd (argc, argv);
-
- argc = str2argv (sargv, &argv);
- if (argc == -1 || setenv (_GSYNCD_DISPATCHED_, "1", 1) == -1) {
- fprintf (stderr, "internal error\n");
- return 1;
- }
-
- b = basename (argv[0]);
- for (i = invocables; i->name; i++) {
- if (strcmp (b, i->name) == 0)
- return i->invoker (argc, argv);
- }
-
- fprintf (stderr, "invoking %s in restricted SSH session is not allowed\n",
- b);
-
- return 1;
-}
diff --git a/xlators/features/marker/utils/src/procdiggy.c b/xlators/features/marker/utils/src/procdiggy.c
deleted file mode 100644
index 1eba414c1..000000000
--- a/xlators/features/marker/utils/src/procdiggy.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <ctype.h>
-#include <sys/param.h> /* for PATH_MAX */
-
-#include "common-utils.h"
-#include "procdiggy.h"
-
-pid_t
-pidinfo (pid_t pid, char **name)
-{
- char buf[NAME_MAX * 2] = {0,};
- FILE *f = NULL;
- char path[PATH_MAX] = {0,};
- char *p = NULL;
- int ret = 0;
-
- sprintf (path, PROC"/%d/status", pid);
-
- f = fopen (path, "r");
- if (!f)
- return -1;
-
- if (name)
- *name = NULL;
- for (;;) {
- size_t len;
- memset (buf, 0, sizeof (buf));
- if (fgets (buf, sizeof (buf), f) == NULL ||
- (len = strlen (buf)) == 0 ||
- buf[len - 1] != '\n') {
- pid = -1;
- goto out;
- }
- buf[len - 1] = '\0';
-
- if (name && !*name) {
- p = strtail (buf, "Name:");
- if (p) {
- while (isspace (*++p));
- *name = gf_strdup (p);
- if (!*name) {
- pid = -2;
- goto out;
- }
- continue;
- }
- }
-
- p = strtail (buf, "PPid:");
- if (p)
- break;
- }
-
- while (isspace (*++p));
- ret = gf_string2int (p, &pid);
- if (ret == -1)
- pid = -1;
-
- out:
- fclose (f);
- if (pid == -1 && name && *name)
- GF_FREE (name);
- if (pid == -2)
- fprintf (stderr, "out of memory\n");
- return pid;
-}
-
-int
-prociter (int (*proch) (pid_t pid, pid_t ppid, char *tmpname, void *data),
- void *data)
-{
- char *name = NULL;
- DIR *d = NULL;
- struct dirent *de = NULL;
- pid_t pid = -1;
- pid_t ppid = -1;
- int ret = 0;
-
- d = opendir (PROC);
- if (!d)
- return -1;
- while (errno = 0, de = readdir (d)) {
- if (gf_string2int (de->d_name, &pid) != -1 && pid >= 0) {
- ppid = pidinfo (pid, &name);
- switch (ppid) {
- case -1: continue;
- case -2: ret = -1; break;
- }
- ret = proch (pid, ppid, name, data);
- GF_FREE (name);
- if (ret)
- break;
- }
- }
- closedir (d);
- if (!de && errno) {
- fprintf (stderr, "failed to traverse "PROC" (%s)\n",
- strerror (errno));
- ret = -1;
- }
-
- return ret;
-}
diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am
deleted file mode 100644
index cc7cee102..000000000
--- a/xlators/features/marker/utils/syncdaemon/Makefile.am
+++ /dev/null
@@ -1,6 +0,0 @@
-syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon
-
-syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py libcxattr.py \
- $(top_builddir)/contrib/ipaddr-py/ipaddr.py
-
-CLEANFILES =
diff --git a/xlators/features/marker/utils/syncdaemon/README.md b/xlators/features/marker/utils/syncdaemon/README.md
deleted file mode 100644
index d45006932..000000000
--- a/xlators/features/marker/utils/syncdaemon/README.md
+++ /dev/null
@@ -1,81 +0,0 @@
-gsycnd, the Gluster Syncdaemon
-==============================
-
-REQUIREMENTS
-------------
-
-_gsyncd_ is a program which can operate either in _master_ or in _slave_ mode.
-Requirements are categorized according to this.
-
-* supported OS is GNU/Linux
-* Python >= 2.5, or 2.4 with Ctypes (see below) (both)
-* OpenSSH >= 4.0 (master) / SSH2 compliant sshd (eg. openssh) (slave)
-* rsync (both)
-* glusterfs with marker support (master); glusterfs (optional on slave)
-* FUSE; for supported versions consult glusterfs
-
-INSTALLATION
-------------
-
-As of now, the supported way of operation is running from the source directory.
-
-If you use Python 2.4.x, you need to install the [Ctypes module](http://python.net/crew/theller/ctypes/).
-
-CONFIGURATION
--------------
-
-gsyncd tunables are a subset of the long command-line options; for listing them,
-type
-
- gsyncd.py --help
-
-and see the long options up to "--config-file". (The leading double dash should be omitted;
-interim underscores and dashes are interchangeable.) The set of options bear some resemblance
-to those of glusterfs and rsync.
-
-The config file format matches the following syntax:
-
- <option1>: <value1>
- <option2>: <value2>
- # comment
-
-By default (unless specified by the option `-c`), gsyncd looks for config file at _conf/gsyncd.conf_
-in the source tree.
-
-USAGE
------
-
-gsyncd is a utilitly for continous mirroring, ie. it mirrors master to slave incrementally.
-Assume we have a gluster volume _pop_ at localhost. We try to set up the following mirrors
-for it with gysncd:
-
-1. _/data/mirror_
-2. local gluster volume _yow_
-3. _/data/far_mirror_ at example.com
-4. gluster volume _moz_ at example.com
-
-The respective gsyncd invocations are (demoing some syntax sugaring):
-
-1.
-
- gsyncd.py gluster://localhost:pop file:///data/mirror
-
- or short form
-
- gsyncd.py :pop /data/mirror
-
-2. `gsyncd :pop :yow`
-3.
-
- gsyncd.py :pop ssh://example.com:/data/far_mirror
-
- or short form
-
- gsyncd.py :pop example.com:/data/far_mirror
-
-4. `gsyncd.py :pop example.com::moz`
-
-gsyncd has to be available on both sides; it's location on the remote side has to be specified
-via the "--remote-gsyncd" option (or "remote-gsyncd" config file parameter). (This option can also be
-used for setting options on the remote side, although the suggested mode of operation is to
-set parameters like log file / pid file in the configuration file.)
diff --git a/xlators/features/marker/utils/syncdaemon/__codecheck.py b/xlators/features/marker/utils/syncdaemon/__codecheck.py
deleted file mode 100644
index e3386afba..000000000
--- a/xlators/features/marker/utils/syncdaemon/__codecheck.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-import os.path
-import sys
-import tempfile
-import shutil
-
-ipd = tempfile.mkdtemp(prefix = 'codecheck-aux')
-
-try:
- # add a fake ipaddr module, we don't want to
- # deal with the real one (just test our code)
- f = open(os.path.join(ipd, 'ipaddr.py'), 'w')
- f.write("""
-class IPAddress(object):
- pass
-class IPNetwork(list):
- pass
-""")
- f.close()
- sys.path.append(ipd)
-
- fl = os.listdir(os.path.dirname(sys.argv[0]) or '.')
- fl.sort()
- for f in fl:
- if f[-3:] != '.py' or f[0] == '_':
- continue
- m = f[:-3]
- sys.stdout.write('importing %s ...' % m)
- __import__(m)
- print(' OK.')
-
- def sys_argv_set(a):
- sys.argv = sys.argv[:1] + a
-
- gsyncd = sys.modules['gsyncd']
- for a in [['--help'], ['--version'], ['--canonicalize-escape-url', '/foo']]:
- print('>>> invoking program with args: %s' % ' '.join(a))
- pid = os.fork()
- if not pid:
- sys_argv_set(a)
- gsyncd.main()
- _, r = os.waitpid(pid, 0)
- if r:
- raise RuntimeError('invocation failed')
-finally:
- shutil.rmtree(ipd)
diff --git a/xlators/features/marker/utils/syncdaemon/__init__.py b/xlators/features/marker/utils/syncdaemon/__init__.py
deleted file mode 100644
index e69de29bb..000000000
--- a/xlators/features/marker/utils/syncdaemon/__init__.py
+++ /dev/null
diff --git a/xlators/features/marker/utils/syncdaemon/configinterface.py b/xlators/features/marker/utils/syncdaemon/configinterface.py
deleted file mode 100644
index e55bec519..000000000
--- a/xlators/features/marker/utils/syncdaemon/configinterface.py
+++ /dev/null
@@ -1,224 +0,0 @@
-try:
- import ConfigParser
-except ImportError:
- # py 3
- import configparser as ConfigParser
-import re
-from string import Template
-
-from syncdutils import escape, unescape, norm, update_file, GsyncdError
-
-SECT_ORD = '__section_order__'
-SECT_META = '__meta__'
-config_version = 2.0
-
-re_type = type(re.compile(''))
-
-
-class MultiDict(object):
- """a virtual dict-like class which functions as the union of underlying dicts"""
-
- def __init__(self, *dd):
- self.dicts = dd
-
- def __getitem__(self, key):
- val = None
- for d in self.dicts:
- if d.get(key):
- val = d[key]
- if not val:
- raise KeyError(key)
- return val
-
-
-class GConffile(object):
- """A high-level interface to ConfigParser which flattens the two-tiered
- config layout by implenting automatic section dispatch based on initial
- parameters.
-
- Also ensure section ordering in terms of their time of addition -- a compat
- hack for Python < 2.7.
- """
-
- def _normconfig(self):
- """normalize config keys by s/-/_/g"""
- for n, s in self.config._sections.items():
- if n.find('__') == 0:
- continue
- s2 = type(s)()
- for k, v in s.items():
- if k.find('__') != 0:
- k = norm(k)
- s2[k] = v
- self.config._sections[n] = s2
-
- def __init__(self, path, peers, *dd):
- """
- - .path: location of config file
- - .config: underlying ConfigParser instance
- - .peers: on behalf of whom we flatten .config
- (master, or master-slave url pair)
- - .auxdicts: template subtituents
- """
- self.peers = peers
- self.path = path
- self.auxdicts = dd
- self.config = ConfigParser.RawConfigParser()
- self.config.read(path)
- self._normconfig()
-
- def section(self, rx=False):
- """get the section name of the section representing .peers in .config"""
- peers = self.peers
- if not peers:
- peers = ['.', '.']
- rx = True
- if rx:
- st = 'peersrx'
- else:
- st = 'peers'
- return ' '.join([st] + [escape(u) for u in peers])
-
- @staticmethod
- def parse_section(section):
- """retrieve peers sequence encoded by section name
- (as urls or regexen, depending on section type)
- """
- sl = section.split()
- st = sl.pop(0)
- sl = [unescape(u) for u in sl]
- if st == 'peersrx':
- sl = [re.compile(u) for u in sl]
- return sl
-
- def ord_sections(self):
- """Return an ordered list of sections.
-
- Ordering happens based on the auxiliary
- SECT_ORD section storing indices for each
- section added through the config API.
-
- To not to go corrupt in case of manually
- written config files, we take care to append
- also those sections which are not registered
- in SECT_ORD.
-
- Needed for python 2.{4,5,6} where ConfigParser
- cannot yet order sections/options internally.
- """
- so = {}
- if self.config.has_section(SECT_ORD):
- so = self.config._sections[SECT_ORD]
- so2 = {}
- for k, v in so.items():
- if k != '__name__':
- so2[k] = int(v)
- tv = 0
- if so2:
- tv = max(so2.values()) + 1
- ss = [s for s in self.config.sections() if s.find('__') != 0]
- for s in ss:
- if s in so.keys():
- continue
- so2[s] = tv
- tv += 1
- def scmp(x, y):
- return cmp(*(so2[s] for s in (x, y)))
- ss.sort(scmp)
- return ss
-
- def update_to(self, dct, allow_unresolved=False):
- """update @dct from key/values of ours.
-
- key/values are collected from .config by filtering the regexp sections
- according to match, and from .section. The values are treated as templates,
- which are substituted from .auxdicts and (in case of regexp sections)
- match groups.
- """
- if not self.peers:
- raise GsyncdError('no peers given, cannot select matching options')
- def update_from_sect(sect, mud):
- for k, v in self.config._sections[sect].items():
- if k == '__name__':
- continue
- if allow_unresolved:
- dct[k] = Template(v).safe_substitute(mud)
- else:
- dct[k] = Template(v).substitute(mud)
- for sect in self.ord_sections():
- sp = self.parse_section(sect)
- if isinstance(sp[0], re_type) and len(sp) == len(self.peers):
- match = True
- mad = {}
- for i in range(len(sp)):
- m = sp[i].search(self.peers[i])
- if not m:
- match = False
- break
- for j in range(len(m.groups())):
- mad['match%d_%d' % (i+1, j+1)] = m.groups()[j]
- if match:
- update_from_sect(sect, MultiDict(dct, mad, *self.auxdicts))
- if self.config.has_section(self.section()):
- update_from_sect(self.section(), MultiDict(dct, *self.auxdicts))
-
- def get(self, opt=None):
- """print the matching key/value pairs from .config,
- or if @opt given, the value for @opt (according to the
- logic described in .update_to)
- """
- d = {}
- self.update_to(d, allow_unresolved = True)
- if opt:
- opt = norm(opt)
- v = d.get(opt)
- if v:
- print(v)
- else:
- for k, v in d.iteritems():
- if k == '__name__':
- continue
- print("%s: %s" % (k, v))
-
- def write(self, trfn, opt, *a, **kw):
- """update on-disk config transactionally
-
- @trfn is the transaction function
- """
- def mergeconf(f):
- self.config = ConfigParser.RawConfigParser()
- self.config.readfp(f)
- self._normconfig()
- if not self.config.has_section(SECT_META):
- self.config.add_section(SECT_META)
- self.config.set(SECT_META, 'version', config_version)
- return trfn(norm(opt), *a, **kw)
- def updateconf(f):
- self.config.write(f)
- update_file(self.path, updateconf, mergeconf)
-
- def _set(self, opt, val, rx=False):
- """set @opt to @val in .section"""
- sect = self.section(rx)
- if not self.config.has_section(sect):
- self.config.add_section(sect)
- # regarding SECT_ORD, cf. ord_sections
- if not self.config.has_section(SECT_ORD):
- self.config.add_section(SECT_ORD)
- self.config.set(SECT_ORD, sect, len(self.config._sections[SECT_ORD]))
- self.config.set(sect, opt, val)
- return True
-
- def set(self, opt, *a, **kw):
- """perform ._set transactionally"""
- self.write(self._set, opt, *a, **kw)
-
- def _delete(self, opt, rx=False):
- """delete @opt from .section"""
- sect = self.section(rx)
- if self.config.has_section(sect):
- return self.config.remove_option(sect, opt)
-
- def delete(self, opt, *a, **kw):
- """perform ._delete transactionally"""
- self.write(self._delete, opt, *a, **kw)
diff --git a/xlators/features/marker/utils/syncdaemon/gconf.py b/xlators/features/marker/utils/syncdaemon/gconf.py
deleted file mode 100644
index 146c72a18..000000000
--- a/xlators/features/marker/utils/syncdaemon/gconf.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import os
-
-class GConf(object):
- """singleton class to store globals
- shared between gsyncd modules"""
-
- ssh_ctl_dir = None
- ssh_ctl_args = None
- cpid = None
- pid_file_owned = False
- log_exit = False
- permanent_handles = []
- log_metadata = {}
-
- @classmethod
- def setup_ssh_ctl(cls, ctld):
- cls.ssh_ctl_dir = ctld
- cls.ssh_ctl_args = ["-oControlMaster=auto", "-S", os.path.join(ctld, "gsycnd-ssh-%r@%h:%p")]
-
-gconf = GConf()
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
deleted file mode 100644
index 387900e6c..000000000
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ /dev/null
@@ -1,419 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import os.path
-import sys
-import time
-import logging
-import signal
-import optparse
-import fcntl
-import fnmatch
-from optparse import OptionParser, SUPPRESS_HELP
-from logging import Logger
-from errno import EEXIST, ENOENT
-
-from ipaddr import IPAddress, IPNetwork
-
-from gconf import gconf
-from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception
-from syncdutils import GsyncdError, select, set_term_handler, privileged
-from configinterface import GConffile
-import resource
-from monitor import monitor
-
-class GLogger(Logger):
- """Logger customizations for gsyncd.
-
- It implements a log format similar to that of glusterfs.
- """
-
- def makeRecord(self, name, level, *a):
- rv = Logger.makeRecord(self, name, level, *a)
- rv.nsecs = (rv.created - int(rv.created)) * 1000000
- fr = sys._getframe(4)
- callee = fr.f_locals.get('self')
- if callee:
- ctx = str(type(callee)).split("'")[1].split('.')[-1]
- else:
- ctx = '<top>'
- if not hasattr(rv, 'funcName'):
- rv.funcName = fr.f_code.co_name
- rv.lvlnam = logging.getLevelName(level)[0]
- rv.ctx = ctx
- return rv
-
- @classmethod
- def setup(cls, **kw):
- lbl = kw.get('label', "")
- if lbl:
- lbl = '(' + lbl + ')'
- lprm = {'datefmt': "%Y-%m-%d %H:%M:%S",
- 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"}
- lprm.update(kw)
- lvl = kw.get('level', logging.INFO)
- lprm['level'] = lvl
- logging.root = cls("root", lvl)
- logging.setLoggerClass(cls)
- logging.getLogger().handlers = []
- logging.basicConfig(**lprm)
-
- @classmethod
- def _gsyncd_loginit(cls, **kw):
- lkw = {}
- if gconf.log_level:
- lkw['level'] = gconf.log_level
- if kw.get('log_file'):
- if kw['log_file'] in ('-', '/dev/stderr'):
- lkw['stream'] = sys.stderr
- elif kw['log_file'] == '/dev/stdout':
- lkw['stream'] = sys.stdout
- else:
- lkw['filename'] = kw['log_file']
-
- cls.setup(label=kw.get('label'), **lkw)
-
- lkw.update({'saved_label': kw.get('label')})
- gconf.log_metadata = lkw
- gconf.log_exit = True
-
-def startup(**kw):
- """set up logging, pidfile grabbing, daemonization"""
- if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn':
- if not grabpidfile():
- sys.stderr.write("pidfile is taken, exiting.\n")
- sys.exit(2)
- gconf.pid_file_owned = True
-
- if kw.get('go_daemon') == 'should':
- x, y = os.pipe()
- gconf.cpid = os.fork()
- if gconf.cpid:
- os.close(x)
- sys.exit()
- os.close(y)
- os.setsid()
- dn = os.open(os.devnull, os.O_RDWR)
- for f in (sys.stdin, sys.stdout, sys.stderr):
- os.dup2(dn, f.fileno())
- if getattr(gconf, 'pid_file', None):
- if not grabpidfile(gconf.pid_file + '.tmp'):
- raise GsyncdError("cannot grab temporary pidfile")
- os.rename(gconf.pid_file + '.tmp', gconf.pid_file)
- # wait for parent to terminate
- # so we can start up with
- # no messing from the dirty
- # ol' bustard
- select((x,), (), ())
- os.close(x)
-
- GLogger._gsyncd_loginit(**kw)
-
-def main():
- """main routine, signal/exception handling boilerplates"""
- gconf.starttime = time.time()
- set_term_handler()
- GLogger.setup()
- excont = FreeObject(exval = 0)
- try:
- try:
- main_i()
- except:
- log_raise_exception(excont)
- finally:
- finalize(exval = excont.exval)
-
-def main_i():
- """internal main routine
-
- parse command line, decide what action will be taken;
- we can either:
- - query/manipulate configuration
- - format gsyncd urls using gsyncd's url parsing engine
- - start service in following modes, in given stages:
- - monitor: startup(), monitor()
- - master: startup(), connect_remote(), connect(), service_loop()
- - slave: startup(), connect(), service_loop()
- """
- rconf = {'go_daemon': 'should'}
-
- def store_abs(opt, optstr, val, parser):
- if val and val != '-':
- val = os.path.abspath(val)
- setattr(parser.values, opt.dest, val)
- def store_local(opt, optstr, val, parser):
- rconf[opt.dest] = val
- def store_local_curry(val):
- return lambda o, oo, vx, p: store_local(o, oo, val, p)
- def store_local_obj(op, dmake):
- return lambda o, oo, vx, p: store_local(o, oo, FreeObject(op=op, **dmake(vx)), p)
-
- op = OptionParser(usage="%prog [options...] <master> <slave>", version="%prog 0.0.1")
- op.add_option('--gluster-command-dir', metavar='DIR', default='')
- op.add_option('--gluster-log-file', metavar='LOGF', default=os.devnull, type=str, action='callback', callback=store_abs)
- op.add_option('--gluster-log-level', metavar='LVL')
- op.add_option('--gluster-params', metavar='PRMS', default='')
- op.add_option('--gluster-cli-options', metavar='OPTS', default='--log-file=-')
- op.add_option('--mountbroker', metavar='LABEL')
- op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs)
- op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs)
- op.add_option('--log-file-mbr', metavar='LOGF', type=str, action='callback', callback=store_abs)
- op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs)
- op.add_option('--ignore-deletes', default=False, action='store_true')
- op.add_option('--use-rsync-xattrs', default=False, action='store_true')
- op.add_option('-L', '--log-level', metavar='LVL')
- op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0]))
- op.add_option('--volume-id', metavar='UUID')
- op.add_option('--session-owner', metavar='ID')
- op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh')
- op.add_option('--rsync-command', metavar='CMD', default='rsync')
- op.add_option('--rsync-options', metavar='OPTS', default='--sparse')
- op.add_option('--rsync-ssh-options', metavar='OPTS', default='--compress')
- op.add_option('--timeout', metavar='SEC', type=int, default=120)
- op.add_option('--connection-timeout', metavar='SEC', type=int, default=60, help=SUPPRESS_HELP)
- op.add_option('--sync-jobs', metavar='N', type=int, default=3)
- op.add_option('--turns', metavar='N', type=int, default=0, help=SUPPRESS_HELP)
- op.add_option('--allow-network', metavar='IPS', default='')
- op.add_option('--socketdir', metavar='DIR')
- op.add_option('--state-socket-unencoded', metavar='SOCKF', type=str, action='callback', callback=store_abs)
- op.add_option('--checkpoint', metavar='LABEL', default='')
- # tunables for failover/failback mechanism:
- # None - gsyncd behaves as normal
- # blind - gsyncd works with xtime pairs to identify
- # candidates for synchronization
- # wrapup - same as normal mode but does not assign
- # xtimes to orphaned files
- # see crawl() for usage of the above tunables
- op.add_option('--special-sync-mode', type=str, help=SUPPRESS_HELP)
-
- op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local)
- # duh. need to specify dest or value will be mapped to None :S
- op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True))
- op.add_option('--feedback-fd', dest='feedback_fd', type=int, help=SUPPRESS_HELP, action='callback', callback=store_local)
- op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True))
- op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont'))
- op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a),
- setattr(a[-1].values, 'log_file', '-'),
- setattr(a[-1].values, 'log_level', 'DEBUG'))),
-
- for a in ('check', 'get'):
- op.add_option('--config-' + a, metavar='OPT', type=str, dest='config', action='callback',
- callback=store_local_obj(a, lambda vx: {'opt': vx}))
- op.add_option('--config-get-all', dest='config', action='callback', callback=store_local_obj('get', lambda vx: {'opt': None}))
- for m in ('', '-rx', '-glob'):
- # call this code 'Pythonic' eh?
- # have to define a one-shot local function to be able to inject (a value depending on the)
- # iteration variable into the inner lambda
- def conf_mod_opt_regex_variant(rx):
- op.add_option('--config-set' + m, metavar='OPT VAL', type=str, nargs=2, dest='config', action='callback',
- callback=store_local_obj('set', lambda vx: {'opt': vx[0], 'val': vx[1], 'rx': rx}))
- op.add_option('--config-del' + m, metavar='OPT', type=str, dest='config', action='callback',
- callback=store_local_obj('del', lambda vx: {'opt': vx, 'rx': rx}))
- conf_mod_opt_regex_variant(m and m[1:] or False)
-
- op.add_option('--normalize-url', dest='url_print', action='callback', callback=store_local_curry('normal'))
- op.add_option('--canonicalize-url', dest='url_print', action='callback', callback=store_local_curry('canon'))
- op.add_option('--canonicalize-escape-url', dest='url_print', action='callback', callback=store_local_curry('canon_esc'))
-
- tunables = [ norm(o.get_opt_string()[2:]) for o in op.option_list if o.callback in (store_abs, 'store_true', None) and o.get_opt_string() not in ('--version', '--help') ]
- remote_tunables = [ 'listen', 'go_daemon', 'timeout', 'session_owner', 'config_file', 'use_rsync_xattrs' ]
- rq_remote_tunables = { 'listen': True }
-
- # precedence for sources of values: 1) commandline, 2) cfg file, 3) defaults
- # -- for this to work out we need to tell apart defaults from explicitly set
- # options... so churn out the defaults here and call the parser with virgin
- # values container.
- defaults = op.get_default_values()
- opts, args = op.parse_args(values=optparse.Values())
- confdata = rconf.get('config')
- if not (len(args) == 2 or \
- (len(args) == 1 and rconf.get('listen')) or \
- (len(args) <= 2 and confdata) or \
- rconf.get('url_print')):
- sys.stderr.write("error: incorrect number of arguments\n\n")
- sys.stderr.write(op.get_usage() + "\n")
- sys.exit(1)
-
- restricted = os.getenv('_GSYNCD_RESTRICTED_')
-
- if restricted:
- allopts = {}
- allopts.update(opts.__dict__)
- allopts.update(rconf)
- bannedtuns = set(allopts.keys()) - set(remote_tunables)
- if bannedtuns:
- raise GsyncdError('following tunables cannot be set with restricted SSH invocaton: ' + \
- ', '.join(bannedtuns))
- for k, v in rq_remote_tunables.items():
- if not k in allopts or allopts[k] != v:
- raise GsyncdError('tunable %s is not set to value %s required for restricted SSH invocaton' % \
- (k, v))
-
- confrx = getattr(confdata, 'rx', None)
- if confrx:
- # peers are regexen, don't try to parse them
- if confrx == 'glob':
- args = [ '\A' + fnmatch.translate(a) for a in args ]
- canon_peers = args
- namedict = {}
- else:
- rscs = [resource.parse_url(u) for u in args]
- dc = rconf.get('url_print')
- if dc:
- for r in rscs:
- print(r.get_url(**{'normal': {},
- 'canon': {'canonical': True},
- 'canon_esc': {'canonical': True, 'escaped': True}}[dc]))
- return
- local = remote = None
- if rscs:
- local = rscs[0]
- if len(rscs) > 1:
- remote = rscs[1]
- if not local.can_connect_to(remote):
- raise GsyncdError("%s cannot work with %s" % (local.path, remote and remote.path))
- pa = ([], [], [])
- urlprms = ({}, {'canonical': True}, {'canonical': True, 'escaped': True})
- for x in rscs:
- for i in range(len(pa)):
- pa[i].append(x.get_url(**urlprms[i]))
- peers, canon_peers, canon_esc_peers = pa
- # creating the namedict, a dict representing various ways of referring to / repreenting
- # peers to be fillable in config templates
- mods = (lambda x: x, lambda x: x[0].upper() + x[1:], lambda x: 'e' + x[0].upper() + x[1:])
- if remote:
- rmap = { local: ('local', 'master'), remote: ('remote', 'slave') }
- else:
- rmap = { local: ('local', 'slave') }
- namedict = {}
- for i in range(len(rscs)):
- x = rscs[i]
- for name in rmap[x]:
- for j in range(3):
- namedict[mods[j](name)] = pa[j][i]
- if x.scheme == 'gluster':
- namedict[name + 'vol'] = x.volume
- if not 'config_file' in rconf:
- rconf['config_file'] = os.path.join(os.path.dirname(sys.argv[0]), "conf/gsyncd.conf")
- gcnf = GConffile(rconf['config_file'], canon_peers, defaults.__dict__, opts.__dict__, namedict)
-
- checkpoint_change = False
- if confdata:
- opt_ok = norm(confdata.opt) in tunables + [None]
- if confdata.op == 'check':
- if opt_ok:
- sys.exit(0)
- else:
- sys.exit(1)
- elif not opt_ok:
- raise GsyncdError("not a valid option: " + confdata.opt)
- if confdata.op == 'get':
- gcnf.get(confdata.opt)
- elif confdata.op == 'set':
- gcnf.set(confdata.opt, confdata.val, confdata.rx)
- elif confdata.op == 'del':
- gcnf.delete(confdata.opt, confdata.rx)
- # when modifying checkpoint, it's important to make a log
- # of that, so in that case we go on to set up logging even
- # if its just config invocation
- if confdata.opt == 'checkpoint' and confdata.op in ('set', 'del') and \
- not confdata.rx:
- checkpoint_change = True
- if not checkpoint_change:
- return
-
- gconf.__dict__.update(defaults.__dict__)
- gcnf.update_to(gconf.__dict__)
- gconf.__dict__.update(opts.__dict__)
- gconf.configinterface = gcnf
-
- if restricted and gconf.allow_network:
- ssh_conn = os.getenv('SSH_CONNECTION')
- if not ssh_conn:
- #legacy env var
- ssh_conn = os.getenv('SSH_CLIENT')
- if ssh_conn:
- allowed_networks = [ IPNetwork(a) for a in gconf.allow_network.split(',') ]
- client_ip = IPAddress(ssh_conn.split()[0])
- allowed = False
- for nw in allowed_networks:
- if client_ip in nw:
- allowed = True
- break
- if not allowed:
- raise GsyncdError("client IP address is not allowed")
-
- ffd = rconf.get('feedback_fd')
- if ffd:
- fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
-
- #normalize loglevel
- lvl0 = gconf.log_level
- if isinstance(lvl0, str):
- lvl1 = lvl0.upper()
- lvl2 = logging.getLevelName(lvl1)
- # I have _never_ _ever_ seen such an utterly braindead
- # error condition
- if lvl2 == "Level " + lvl1:
- raise GsyncdError('cannot recognize log level "%s"' % lvl0)
- gconf.log_level = lvl2
-
- if not privileged() and gconf.log_file_mbr:
- gconf.log_file = gconf.log_file_mbr
-
- if checkpoint_change:
- try:
- GLogger._gsyncd_loginit(log_file=gconf.log_file, label='conf')
- if confdata.op == 'set':
- logging.info('checkpoint %s set' % confdata.val)
- elif confdata.op == 'del':
- logging.info('checkpoint info was reset')
- except IOError:
- if sys.exc_info()[1].errno == ENOENT:
- # directory of log path is not present,
- # which happens if we get here from
- # a peer-multiplexed "config-set checkpoint"
- # (as that directory is created only on the
- # original node)
- pass
- else:
- raise
- return
-
- go_daemon = rconf['go_daemon']
- be_monitor = rconf.get('monitor')
-
- if not be_monitor and isinstance(remote, resource.SSH) and \
- go_daemon == 'should':
- go_daemon = 'postconn'
- log_file = None
- else:
- log_file = gconf.log_file
- if be_monitor:
- label = 'monitor'
- elif remote:
- #master
- label = ''
- else:
- label = 'slave'
- startup(go_daemon=go_daemon, log_file=log_file, label=label)
-
- if be_monitor:
- return monitor()
-
- logging.info("syncing: %s" % " -> ".join(peers))
- resource.Popen.init_errhandler()
- if remote:
- go_daemon = remote.connect_remote(go_daemon=go_daemon)
- if go_daemon:
- startup(go_daemon=go_daemon, log_file=gconf.log_file)
- # complete remote connection in child
- remote.connect_remote(go_daemon='done')
- local.connect()
- if ffd:
- os.close(ffd)
- local.service_loop(*[r for r in [remote] if r])
-
-
-if __name__ == "__main__":
- main()
diff --git a/xlators/features/marker/utils/syncdaemon/libcxattr.py b/xlators/features/marker/utils/syncdaemon/libcxattr.py
deleted file mode 100644
index f0a9d2292..000000000
--- a/xlators/features/marker/utils/syncdaemon/libcxattr.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-from ctypes import *
-from ctypes.util import find_library
-
-class Xattr(object):
- """singleton that wraps the extended attribues system
- interface for python using ctypes
-
- Just implement it to the degree we need it, in particular
- - we need just the l*xattr variants, ie. we never want symlinks to be
- followed
- - don't need size discovery for getxattr, as we always know the exact
- sizes we expect
- """
-
- libc = CDLL(find_library("libc"))
-
- @classmethod
- def geterrno(cls):
- return c_int.in_dll(cls.libc, 'errno').value
-
- @classmethod
- def raise_oserr(cls):
- errn = cls.geterrno()
- raise OSError(errn, os.strerror(errn))
-
- @classmethod
- def _query_xattr(cls, path, siz, syscall, *a):
- if siz:
- buf = create_string_buffer('\0' * siz)
- else:
- buf = None
- ret = getattr(cls.libc, syscall)(*((path,) + a + (buf, siz)))
- if ret == -1:
- cls.raise_oserr()
- if siz:
- return buf.raw[:ret]
- else:
- return ret
-
- @classmethod
- def lgetxattr(cls, path, attr, siz=0):
- return cls._query_xattr( path, siz, 'lgetxattr', attr)
-
- @classmethod
- def llistxattr(cls, path, siz=0):
- ret = cls._query_xattr(path, siz, 'llistxattr')
- if isinstance(ret, str):
- ret = ret.split('\0')
- return ret
-
- @classmethod
- def lsetxattr(cls, path, attr, val):
- ret = cls.libc.lsetxattr(path, attr, val, len(val), 0)
- if ret == -1:
- cls.raise_oserr()
-
- @classmethod
- def lremovexattr(cls, path, attr):
- ret = cls.libc.lremovexattr(path, attr)
- if ret == -1:
- cls.raise_oserr()
-
- @classmethod
- def llistxattr_buf(cls, path):
- """listxattr variant with size discovery"""
- size = cls.llistxattr(path)
- if size == -1:
- cls.raise_oserr()
- if size == 0:
- return []
- return cls.llistxattr(path, size)
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
deleted file mode 100644
index f903f3059..000000000
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ /dev/null
@@ -1,961 +0,0 @@
-import os
-import sys
-import time
-import stat
-import random
-import signal
-import logging
-import socket
-import errno
-import re
-from errno import ENOENT, ENODATA, EPIPE
-from threading import currentThread, Condition, Lock
-from datetime import datetime
-try:
- from hashlib import md5 as md5
-except ImportError:
- # py 2.4
- from md5 import new as md5
-
-from gconf import gconf
-from syncdutils import FreeObject, Thread, GsyncdError, boolify, \
- escape, unescape, select
-
-URXTIME = (-1, 0)
-
-# Utility functions to help us to get to closer proximity
-# of the DRY principle (no, don't look for elevated or
-# perspectivistic things here)
-
-def _xtime_now():
- t = time.time()
- sec = int(t)
- nsec = int((t - sec) * 1000000)
- return (sec, nsec)
-
-def _volinfo_hook_relax_foreign(self):
- volinfo_sys = self.get_sys_volinfo()
- fgn_vi = volinfo_sys[self.KFGN]
- if fgn_vi:
- expiry = fgn_vi['timeout'] - int(time.time()) + 1
- logging.info('foreign volume info found, waiting %d sec for expiry' % \
- expiry)
- time.sleep(expiry)
- volinfo_sys = self.get_sys_volinfo()
- self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state,
- volinfo_sys)
- if self.inter_master:
- raise GsyncdError("cannot be intermediate master in special mode")
- return (volinfo_sys, state_change)
-
-
-# The API!
-
-def gmaster_builder():
- """produce the GMaster class variant corresponding
- to sync mode"""
- this = sys.modules[__name__]
- modemixin = gconf.special_sync_mode
- if not modemixin:
- modemixin = 'normal'
- logging.info('setting up master for %s sync mode' % modemixin)
- modemixin = getattr(this, modemixin.capitalize() + 'Mixin')
- sendmarkmixin = boolify(gconf.use_rsync_xattrs) and SendmarkRsyncMixin or SendmarkNormalMixin
- purgemixin = boolify(gconf.ignore_deletes) and PurgeNoopMixin or PurgeNormalMixin
- class _GMaster(GMasterBase, modemixin, sendmarkmixin, purgemixin):
- pass
- return _GMaster
-
-
-# Mixin classes that implement the data format
-# and logic particularities of the certain
-# sync modes
-
-class NormalMixin(object):
- """normal geo-rep behavior"""
-
- minus_infinity = URXTIME
-
- # following staticmethods ideally would be
- # methods of an xtime object (in particular,
- # implementing the hooks needed for comparison
- # operators), but at this point we don't yet
- # have a dedicated xtime class
-
- @staticmethod
- def serialize_xtime(xt):
- return "%d.%d" % tuple(xt)
-
- @staticmethod
- def deserialize_xtime(xt):
- return tuple(int(x) for x in xt.split("."))
-
- @staticmethod
- def native_xtime(xt):
- return xt
-
- @staticmethod
- def xtime_geq(xt0, xt1):
- return xt0 >= xt1
-
- def make_xtime_opts(self, is_master, opts):
- if not 'create' in opts:
- opts['create'] = is_master and not self.inter_master
- if not 'default_xtime' in opts:
- if is_master and self.inter_master:
- opts['default_xtime'] = ENODATA
- else:
- opts['default_xtime'] = URXTIME
-
- def xtime_low(self, server, path, **opts):
- xt = server.xtime(path, self.uuid)
- if isinstance(xt, int) and xt != ENODATA:
- return xt
- if xt == ENODATA or xt < self.volmark:
- if opts['create']:
- xt = _xtime_now()
- server.set_xtime(path, self.uuid, xt)
- else:
- xt = opts['default_xtime']
- return xt
-
- def keepalive_payload_hook(self, timo, gap):
- # first grab a reference as self.volinfo
- # can be changed in main thread
- vi = self.volinfo
- if vi:
- # then have a private copy which we can mod
- vi = vi.copy()
- vi['timeout'] = int(time.time()) + timo
- else:
- # send keep-alives more frequently to
- # avoid a delay in announcing our volume info
- # to slave if it becomes established in the
- # meantime
- gap = min(10, gap)
- return (vi, gap)
-
- def volinfo_hook(self):
- volinfo_sys = self.get_sys_volinfo()
- self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state,
- volinfo_sys)
- return (volinfo_sys, state_change)
-
- def xtime_reversion_hook(self, path, xtl, xtr):
- if xtr > xtl:
- raise GsyncdError("timestamp corruption for " + path)
-
- def need_sync(self, e, xte, xtrd):
- return xte > xtrd
-
- def set_slave_xtime(self, path, mark):
- self.slave.server.set_xtime(path, self.uuid, mark)
-
-class WrapupMixin(NormalMixin):
- """a variant that differs from normal in terms
- of ignoring non-indexed files"""
-
- @staticmethod
- def make_xtime_opts(is_master, opts):
- if not 'create' in opts:
- opts['create'] = False
- if not 'default_xtime' in opts:
- opts['default_xtime'] = URXTIME
-
- @staticmethod
- def keepalive_payload_hook(timo, gap):
- return (None, gap)
-
- def volinfo_hook(self):
- return _volinfo_hook_relax_foreign(self)
-
-class BlindMixin(object):
- """Geo-rep flavor using vectored xtime.
-
- Coordinates are the master, slave uuid pair;
- in master coordinate behavior is normal,
- in slave coordinate we force synchronization
- on any value difference (these are in disjunctive
- relation, ie. if either orders the entry to be
- synced, it shall be synced.
- """
-
- minus_infinity = (URXTIME, None)
-
- @staticmethod
- def serialize_xtime(xt):
- a = []
- for x in xt:
- if not x:
- x = ('None', '')
- a.extend(x)
- return '.'.join(str(n) for n in a)
-
- @staticmethod
- def deserialize_xtime(xt):
- a = xt.split(".")
- a = (tuple(a[0:2]), tuple(a[3:4]))
- b = []
- for p in a:
- if p[0] == 'None':
- p = None
- else:
- p = tuple(int(x) for x in p)
- b.append(p)
- return tuple(b)
-
- @staticmethod
- def native_xtime(xt):
- return xt[0]
-
- @staticmethod
- def xtime_geq(xt0, xt1):
- return (not xt1[0] or xt0[0] >= xt1[0]) and \
- (not xt1[1] or xt0[1] >= xt1[1])
-
- @property
- def ruuid(self):
- if self.volinfo_r:
- return self.volinfo_r['uuid']
-
- @staticmethod
- def make_xtime_opts(is_master, opts):
- if not 'create' in opts:
- opts['create'] = is_master
- if not 'default_xtime' in opts:
- opts['default_xtime'] = URXTIME
-
- def xtime_low(self, server, path, **opts):
- xtd = server.xtime_vec(path, self.uuid, self.ruuid)
- if isinstance(xtd, int):
- return xtd
- xt = (xtd[self.uuid], xtd[self.ruuid])
- if not xt[1] and (not xt[0] or xt[0] < self.volmark):
- if opts['create']:
- # not expected, but can happen if file originates
- # from interrupted gsyncd transfer
- logging.warn('have to fix up missing xtime on ' + path)
- xt0 = _xtime_now()
- server.set_xtime(path, self.uuid, xt0)
- else:
- xt0 = opts['default_xtime']
- xt = (xt0, xt[1])
- return xt
-
- @staticmethod
- def keepalive_payload_hook(timo, gap):
- return (None, gap)
-
- def volinfo_hook(self):
- res = _volinfo_hook_relax_foreign(self)
- volinfo_r_new = self.slave.server.native_volume_info()
- if volinfo_r_new['retval']:
- raise GsyncdError("slave is corrupt")
- if getattr(self, 'volinfo_r', None):
- if self.volinfo_r['uuid'] != volinfo_r_new['uuid']:
- raise GsyncdError("uuid mismatch on slave")
- self.volinfo_r = volinfo_r_new
- return res
-
- def xtime_reversion_hook(self, path, xtl, xtr):
- if not isinstance(xtr[0], int) and \
- (isinstance(xtl[0], int) or xtr[0] > xtl[0]):
- raise GsyncdError("timestamp corruption for " + path)
-
- def need_sync(self, e, xte, xtrd):
- if xte[0]:
- if not xtrd[0] or xte[0] > xtrd[0]:
- # there is outstanding diff at 0th pos,
- # we can short-cut to true
- return True
- # we arrived to this point by either of these
- # two possiblilites:
- # - no outstanding difference at 0th pos,
- # wanna see 1st pos if he raises veto
- # against "no need to sync" proposal
- # - no data at 0th pos, 1st pos will have
- # to decide (due to xtime assignment,
- # in this case 1st pos does carry data
- # -- iow, if 1st pos did not have data,
- # and 0th neither, 0th would have been
- # force-feeded)
- if not xte[1]:
- # no data, no veto
- return False
- # the hard work: for 1st pos,
- # the conduct is fetch corresponding
- # slave data and do a "blind" comparison
- # (ie. do not care who is newer, we trigger
- # sync on non-identical xitmes)
- xtr = self.xtime(e, self.slave)
- return isinstance(xtr, int) or xte[1] != xtr[1]
-
- def set_slave_xtime(self, path, mark):
- xtd = {}
- for (u, t) in zip((self.uuid, self.ruuid), mark):
- if t:
- xtd[u] = t
- self.slave.server.set_xtime_vec(path, xtd)
-
-
-# Further mixins for certain tunable behaviors
-
-class SendmarkNormalMixin(object):
-
- def sendmark_regular(self, *a, **kw):
- return self.sendmark(*a, **kw)
-
-class SendmarkRsyncMixin(object):
-
- def sendmark_regular(self, *a, **kw):
- pass
-
-
-class PurgeNormalMixin(object):
-
- def purge_missing(self, path, names):
- self.slave.server.purge(path, names)
-
-class PurgeNoopMixin(object):
-
- def purge_missing(self, path, names):
- pass
-
-
-
-class GMasterBase(object):
- """abstract class impementling master role"""
-
- KFGN = 0
- KNAT = 1
-
- def get_sys_volinfo(self):
- """query volume marks on fs root
-
- err out on multiple foreign masters
- """
- fgn_vis, nat_vi = self.master.server.foreign_volume_infos(), \
- self.master.server.native_volume_info()
- fgn_vi = None
- if fgn_vis:
- if len(fgn_vis) > 1:
- raise GsyncdError("cannot work with multiple foreign masters")
- fgn_vi = fgn_vis[0]
- return fgn_vi, nat_vi
-
- @property
- def uuid(self):
- if self.volinfo:
- return self.volinfo['uuid']
-
- @property
- def volmark(self):
- if self.volinfo:
- return self.volinfo['volume_mark']
-
- @property
- def inter_master(self):
- """decide if we are an intermediate master
- in a cascading setup
- """
- return self.volinfo_state[self.KFGN] and True or False
-
- def xtime(self, path, *a, **opts):
- """get amended xtime
-
- as of amending, we can create missing xtime, or
- determine a valid value if what we get is expired
- (as of the volume mark expiry); way of amendig
- depends on @opts and on subject of query (master
- or slave).
- """
- if a:
- rsc = a[0]
- else:
- rsc = self.master
- self.make_xtime_opts(rsc == self.master, opts)
- return self.xtime_low(rsc.server, path, **opts)
-
- def __init__(self, master, slave):
- self.master = master
- self.slave = slave
- self.jobtab = {}
- self.syncer = Syncer(slave)
- # crawls vs. turns:
- # - self.crawls is simply the number of crawl() invocations on root
- # - one turn is a maximal consecutive sequence of crawls so that each
- # crawl in it detects a change to be synced
- # - self.turns is the number of turns since start
- # - self.total_turns is a limit so that if self.turns reaches it, then
- # we exit (for diagnostic purposes)
- # so, eg., if the master fs changes unceasingly, self.turns will remain 0.
- self.crawls = 0
- self.turns = 0
- self.total_turns = int(gconf.turns)
- self.lastreport = {'crawls': 0, 'turns': 0}
- self.start = None
- self.change_seen = None
- self.syncTime=0
- self.lastSyncTime=0
- self.crawlStartTime=0
- self.crawlTime=0
- self.filesSynced=0
- self.bytesSynced=0
- # the authoritative (foreign, native) volinfo pair
- # which lets us deduce what to do when we refetch
- # the volinfos from system
- uuid_preset = getattr(gconf, 'volume_id', None)
- self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None)
- # the actual volinfo we make use of
- self.volinfo = None
- self.terminate = False
- self.checkpoint_thread = None
-
- @classmethod
- def _checkpt_param(cls, chkpt, prm, xtimish=True):
- """use config backend to lookup a parameter belonging to
- checkpoint @chkpt"""
- cprm = getattr(gconf, 'checkpoint_' + prm, None)
- if not cprm:
- return
- chkpt_mapped, val = cprm.split(':', 1)
- if unescape(chkpt_mapped) != chkpt:
- return
- if xtimish:
- val = cls.deserialize_xtime(val)
- return val
-
- @classmethod
- def _set_checkpt_param(cls, chkpt, prm, val, xtimish=True):
- """use config backend to store a parameter associated
- with checkpoint @chkpt"""
- if xtimish:
- val = cls.serialize_xtime(val)
- gconf.configinterface.set('checkpoint_' + prm, "%s:%s" % (escape(chkpt), val))
-
- @staticmethod
- def humantime(*tpair):
- """format xtime-like (sec, nsec) pair to human readable format"""
- ts = datetime.fromtimestamp(float('.'.join(str(n) for n in tpair))).\
- strftime("%Y-%m-%d %H:%M:%S")
- if len(tpair) > 1:
- ts += '.' + str(tpair[1])
- return ts
-
- def get_extra_info(self):
- str_info="\nFile synced : %d" %(self.filesSynced)
- str_info+="\nBytes Synced : %d KB" %(self.syncer.bytesSynced)
- str_info+="\nSync Time : %f seconds" %(self.syncTime)
- self.crawlTime=datetime.now()-self.crawlStartTime
- years , days =divmod(self.crawlTime.days,365.25)
- years=int(years)
- days=int(days)
-
- date=""
- m, s = divmod(self.crawlTime.seconds, 60)
- h, m = divmod(m, 60)
-
- if years!=0 :
- date+=str(years)+" year "
- if days!=0 :
- date+=str(days)+" day "
- if h!=0 :
- date+=str(h)+" H : "
- if m!=0 or h!=0 :
- date+=str(m)+" M : "
-
- date+=str(s)+" S"
- self.crawlTime=date
- str_info+="\nCrawl Time : %s" %(str(self.crawlTime))
- str_info+="\n\0"
- return str_info
-
- def checkpt_service(self, chan, chkpt, tgt):
- """checkpoint service loop
-
- monitor and verify checkpoint status for @chkpt, and listen
- for incoming requests for whom we serve a pretty-formatted
- status report"""
- if not chkpt:
- # dummy loop for the case when there is no checkpt set
- while True:
- select([chan], [], [])
- conn, _ = chan.accept()
- conn.send(self.get_extra_info())
- conn.close()
- completed = self._checkpt_param(chkpt, 'completed', xtimish=False)
- if completed:
- completed = tuple(int(x) for x in completed.split('.'))
- while True:
- s,_,_ = select([chan], [], [], (not completed) and 5 or None)
- # either request made and we re-check to not
- # give back stale data, or we still hunting for completion
- if self.native_xtime(tgt) and self.native_xtime(tgt) < self.volmark:
- # indexing has been reset since setting the checkpoint
- status = "is invalid"
- else:
- xtr = self.xtime('.', self.slave)
- if isinstance(xtr, int):
- raise GsyncdError("slave root directory is unaccessible (%s)",
- os.strerror(xtr))
- ncompleted = self.xtime_geq(xtr, tgt)
- if completed and not ncompleted: # stale data
- logging.warn("completion time %s for checkpoint %s became stale" % \
- (self.humantime(*completed), chkpt))
- completed = None
- gconf.confdata.delete('checkpoint-completed')
- if ncompleted and not completed: # just reaching completion
- completed = "%.6f" % time.time()
- self._set_checkpt_param(chkpt, 'completed', completed, xtimish=False)
- completed = tuple(int(x) for x in completed.split('.'))
- logging.info("checkpoint %s completed" % chkpt)
- status = completed and \
- "completed at " + self.humantime(completed[0]) or \
- "not reached yet"
- if s:
- conn = None
- try:
- conn, _ = chan.accept()
- try:
- conn.send(" | checkpoint %s %s %s" % (chkpt, status,self.get_extra_info()))
- except:
- exc = sys.exc_info()[1]
- if (isinstance(exc, OSError) or isinstance(exc, IOError)) and \
- exc.errno == EPIPE:
- logging.debug('checkpoint client disconnected')
- else:
- raise
- finally:
- if conn:
- conn.close()
-
- def start_checkpoint_thread(self):
- """prepare and start checkpoint service"""
- if self.checkpoint_thread or not (
- getattr(gconf, 'state_socket_unencoded', None) and getattr(gconf, 'socketdir', None)
- ):
- return
- chan = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
- state_socket = os.path.join(gconf.socketdir, md5(gconf.state_socket_unencoded).hexdigest() + ".socket")
- try:
- os.unlink(state_socket)
- except:
- if sys.exc_info()[0] == OSError:
- pass
- chan.bind(state_socket)
- chan.listen(1)
- checkpt_tgt = None
- if gconf.checkpoint:
- checkpt_tgt = self._checkpt_param(gconf.checkpoint, 'target')
- if not checkpt_tgt:
- checkpt_tgt = self.xtime('.')
- if isinstance(checkpt_tgt, int):
- raise GsyncdError("master root directory is unaccessible (%s)",
- os.strerror(checkpt_tgt))
- self._set_checkpt_param(gconf.checkpoint, 'target', checkpt_tgt)
- logging.debug("checkpoint target %s has been determined for checkpoint %s" % \
- (repr(checkpt_tgt), gconf.checkpoint))
- t = Thread(target=self.checkpt_service, args=(chan, gconf.checkpoint, checkpt_tgt))
- t.start()
- self.checkpoint_thread = t
-
- def crawl_loop(self):
- """start the keep-alive thread and iterate .crawl"""
- timo = int(gconf.timeout or 0)
- if timo > 0:
- def keep_alive():
- while True:
- vi, gap = self.keepalive_payload_hook(timo, timo * 0.5)
- self.slave.server.keep_alive(vi)
- time.sleep(gap)
- t = Thread(target=keep_alive)
- t.start()
- self.lastreport['time'] = time.time()
- self.crawlStartTime=datetime.now()
- while not self.terminate:
- self.crawl()
-
- def add_job(self, path, label, job, *a, **kw):
- """insert @job function to job table at @path with @label"""
- if self.jobtab.get(path) == None:
- self.jobtab[path] = []
- self.jobtab[path].append((label, a, lambda : job(*a, **kw)))
-
- def add_failjob(self, path, label):
- """invoke .add_job with a job that does nothing just fails"""
- logging.debug('salvaged: ' + label)
- self.add_job(path, label, lambda: False)
-
- def wait(self, path, *args):
- """perform jobs registered for @path
-
- Reset jobtab entry for @path,
- determine success as the conjuction of
- success of all the jobs. In case of
- success, call .sendmark on @path
- """
- jobs = self.jobtab.pop(path, [])
- succeed = True
- for j in jobs:
- ret = j[-1]()
- if not ret:
- succeed = False
- if succeed:
- self.sendmark(path, *args)
- return succeed
-
- def sendmark(self, path, mark, adct=None):
- """update slave side xtime for @path to master side xtime
-
- also can send a setattr payload (see Server.setattr).
- """
- if adct:
- self.slave.server.setattr(path, adct)
- self.set_slave_xtime(path, mark)
-
- @staticmethod
- def volinfo_state_machine(volinfo_state, volinfo_sys):
- """compute new volinfo_state from old one and incoming
- as of current system state, also indicating if there was a
- change regarding which volume mark is the authoritative one
-
- @volinfo_state, @volinfo_sys are pairs of volume mark dicts
- (foreign, native).
-
- Note this method is marked as static, ie. the computation is
- pure, without reliance on any excess implicit state. State
- transitions which are deemed as ambiguous or banned will raise
- an exception.
-
- """
- # store the value below "boxed" to emulate proper closures
- # (variables of the enclosing scope are available inner functions
- # provided they are no reassigned; mutation is OK).
- param = FreeObject(relax_mismatch = False, state_change = None, index=-1)
- def select_vi(vi0, vi):
- param.index += 1
- if vi and (not vi0 or vi0['uuid'] == vi['uuid']):
- if not vi0 and not param.relax_mismatch:
- param.state_change = param.index
- # valid new value found; for the rest, we are graceful about
- # uuid mismatch
- param.relax_mismatch = True
- return vi
- if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch:
- # uuid mismatch for master candidate, bail out
- raise GsyncdError("aborting on uuid change from %s to %s" % \
- (vi0['uuid'], vi['uuid']))
- # fall back to old
- return vi0
- newstate = tuple(select_vi(*vip) for vip in zip(volinfo_state, volinfo_sys))
- srep = lambda vi: vi and vi['uuid'][0:8]
- logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \
- tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate))
- return newstate, param.state_change
-
- def crawl(self, path='.', xtl=None):
- """crawling...
-
- Standing around
- All the right people
- Crawling
- Tennis on Tuesday
- The ladder is long
- It is your nature
- You've gotta suntan
- Football on Sunday
- Society boy
-
- Recursively walk the master side tree and check if updates are
- needed due to xtime differences. One invocation of crawl checks
- children of @path and do a recursive enter only on
- those directory children where there is an update needed.
-
- Way of updates depend on file type:
- - for symlinks, sync them directy and synchronously
- - for regular children, register jobs for @path (cf. .add_job) to start
- and wait on their rsync
- - for directory children, register a job for @path which waits (.wait)
- on jobs for the given child
- (other kind of filesystem nodes are not considered)
-
- Those slave side children which do not exist on master are simply
- purged (see Server.purge).
-
- Behavior is fault tolerant, synchronization is adaptive: if some action fails,
- just go on relentlessly, adding a fail job (see .add_failjob) which will prevent
- the .sendmark on @path, so when the next crawl will arrive to @path it will not
- see it as up-to-date and will try to sync it again. While this semantics can be
- supported by funky design principles (http://c2.com/cgi/wiki?LazinessImpatienceHubris),
- the ultimate reason which excludes other possibilities is simply transience: we cannot
- assert that the file systems (master / slave) underneath do not change and actions
- taken upon some condition will not lose their context by the time they are performed.
- """
- if path == '.':
- if self.start:
- self.crawls += 1
- logging.debug("... crawl #%d done, took %.6f seconds" % \
- (self.crawls, time.time() - self.start))
- time.sleep(1)
- self.start = time.time()
- should_display_info = self.start - self.lastreport['time'] >= 60
- if should_display_info:
- logging.info("completed %d crawls, %d turns",
- self.crawls - self.lastreport['crawls'],
- self.turns - self.lastreport['turns'])
- self.lastreport.update(crawls = self.crawls,
- turns = self.turns,
- time = self.start)
- volinfo_sys, state_change = self.volinfo_hook()
- if self.inter_master:
- self.volinfo = volinfo_sys[self.KFGN]
- else:
- self.volinfo = volinfo_sys[self.KNAT]
- if state_change == self.KFGN or (state_change == self.KNAT and not self.inter_master):
- logging.info('new master is %s', self.uuid)
- if self.volinfo:
- logging.info("%s master with volume id %s ..." % \
- (self.inter_master and "intermediate" or "primary",
- self.uuid))
- if state_change == self.KFGN:
- gconf.configinterface.set('volume_id', self.uuid)
- if self.volinfo:
- if self.volinfo['retval']:
- raise GsyncdError ("master is corrupt")
- self.start_checkpoint_thread()
- else:
- if should_display_info or self.crawls == 0:
- if self.inter_master:
- logging.info("waiting for being synced from %s ..." % \
- self.volinfo_state[self.KFGN]['uuid'])
- else:
- logging.info("waiting for volume info ...")
- return
- logging.debug("entering " + path)
- if not xtl:
- xtl = self.xtime(path)
- if isinstance(xtl, int):
- self.add_failjob(path, 'no-local-node')
- return
- xtr = self.xtime(path, self.slave)
- if isinstance(xtr, int):
- if xtr != ENOENT:
- self.slave.server.purge(path)
- try:
- self.slave.server.mkdir(path)
- except OSError:
- self.add_failjob(path, 'no-remote-node')
- return
- xtr = self.minus_infinity
- else:
- self.xtime_reversion_hook(path, xtl, xtr)
- if xtl == xtr:
- if path == '.' and self.change_seen:
- self.turns += 1
- self.change_seen = False
- if self.total_turns:
- logging.info("finished turn #%s/%s" % \
- (self.turns, self.total_turns))
- if self.turns == self.total_turns:
- logging.info("reached turn limit")
- self.terminate = True
- return
- if path == '.':
- self.change_seen = True
- try:
- dem = self.master.server.entries(path)
- except OSError:
- self.add_failjob(path, 'local-entries-fail')
- return
- random.shuffle(dem)
- try:
- des = self.slave.server.entries(path)
- except OSError:
- self.slave.server.purge(path)
- try:
- self.slave.server.mkdir(path)
- des = self.slave.server.entries(path)
- except OSError:
- self.add_failjob(path, 'remote-entries-fail')
- return
- dd = set(des) - set(dem)
- if dd:
- self.purge_missing(path, dd)
- chld = []
- for e in dem:
- e = os.path.join(path, e)
- xte = self.xtime(e)
- if isinstance(xte, int):
- logging.warn("irregular xtime for %s: %s" % (e, errno.errorcode[xte]))
- elif self.need_sync(e, xte, xtr):
- chld.append((e, xte))
- def indulgently(e, fnc, blame=None):
- if not blame:
- blame = path
- try:
- return fnc(e)
- except (IOError, OSError):
- ex = sys.exc_info()[1]
- if ex.errno == ENOENT:
- logging.warn("salvaged ENOENT for " + e)
- self.add_failjob(blame, 'by-indulgently')
- return False
- else:
- raise
- for e, xte in chld:
- st = indulgently(e, lambda e: os.lstat(e))
- if st == False:
- continue
- mo = st.st_mode
- adct = {'own': (st.st_uid, st.st_gid)}
- if stat.S_ISLNK(mo):
- if indulgently(e, lambda e: self.slave.server.symlink(os.readlink(e), e)) == False:
- continue
- self.sendmark(e, xte, adct)
- elif stat.S_ISREG(mo):
- logging.debug("syncing %s ..." % e)
- pb = self.syncer.add(e)
- timeA=datetime.now()
- def regjob(e, xte, pb):
- if pb.wait():
- logging.debug("synced " + e)
- self.sendmark_regular(e, xte)
-
- timeB=datetime.now()
- self.lastSyncTime=timeB-timeA
- self.syncTime=(self.syncTime+self.lastSyncTime.microseconds)/(10.0**6)
- self.filesSynced=self.filesSynced+1
- return True
- else:
- logging.warn("failed to sync " + e)
- self.add_job(path, 'reg', regjob, e, xte, pb)
- elif stat.S_ISDIR(mo):
- adct['mode'] = mo
- if indulgently(e, lambda e: (self.add_job(path, 'cwait', self.wait, e, xte, adct),
- self.crawl(e, xte),
- True)[-1], blame=e) == False:
- continue
- else:
- # ignore fifos, sockets and special files
- pass
- if path == '.':
- self.wait(path, xtl)
-
-class BoxClosedErr(Exception):
- pass
-
-class PostBox(list):
- """synchronized collection for storing things thought of as "requests" """
-
- def __init__(self, *a):
- list.__init__(self, *a)
- # too bad Python stdlib does not have read/write locks...
- # it would suffivce to grab the lock in .append as reader, in .close as writer
- self.lever = Condition()
- self.open = True
- self.done = False
-
- def wait(self):
- """wait on requests to be processed"""
- self.lever.acquire()
- if not self.done:
- self.lever.wait()
- self.lever.release()
- return self.result
-
- def wakeup(self, data):
- """wake up requestors with the result"""
- self.result = data
- self.lever.acquire()
- self.done = True
- self.lever.notifyAll()
- self.lever.release()
-
- def append(self, e):
- """post a request"""
- self.lever.acquire()
- if not self.open:
- raise BoxClosedErr
- list.append(self, e)
- self.lever.release()
-
- def close(self):
- """prohibit the posting of further requests"""
- self.lever.acquire()
- self.open = False
- self.lever.release()
-
-class Syncer(object):
- """a staged queue to relay rsync requests to rsync workers
-
- By "staged queue" its meant that when a consumer comes to the
- queue, it takes _all_ entries, leaving the queue empty.
- (I don't know if there is an official term for this pattern.)
-
- The queue uses a PostBox to accumulate incoming items.
- When a consumer (rsync worker) comes, a new PostBox is
- set up and the old one is passed on to the consumer.
-
- Instead of the simplistic scheme of having one big lock
- which synchronizes both the addition of new items and
- PostBox exchanges, use a separate lock to arbitrate consumers,
- and rely on PostBox's synchronization mechanisms take
- care about additions.
-
- There is a corner case racy situation, producers vs. consumers,
- which is not handled by this scheme: namely, when the PostBox
- exchange occurs in between being passed to the producer for posting
- and the post placement. But that's what Postbox.close is for:
- such a posting will find the PostBox closed, in which case
- the producer can re-try posting against the actual PostBox of
- the queue.
-
- To aid accumlation of items in the PostBoxen before grabbed
- by an rsync worker, the worker goes to sleep a bit after
- each completed syncjob.
- """
-
- def __init__(self, slave):
- """spawn worker threads"""
- self.slave = slave
- self.lock = Lock()
- self.pb = PostBox()
- self.bytesSynced=0
- for i in range(int(gconf.sync_jobs)):
- t = Thread(target=self.syncjob)
- t.start()
-
- def syncjob(self):
- """the life of a worker"""
- while True:
- pb = None
- while True:
- self.lock.acquire()
- if self.pb:
- pb, self.pb = self.pb, PostBox()
- self.lock.release()
- if pb:
- break
- time.sleep(0.5)
- pb.close()
- po = self.slave.rsync(pb)
- if po.returncode == 0:
- regEx=re.search('\ *total\ *transferred\ *file\ *size:\ *(\d+)\ *bytes\ *',po.stdout.read(),re.IGNORECASE)
- if regEx:
- self.bytesSynced+=(int(regEx.group(1)))/1024
- ret = True
- elif po.returncode in (23, 24):
- # partial transfer (cf. rsync(1)), that's normal
- ret = False
- else:
- po.errfail()
- pb.wakeup(ret)
-
- def add(self, e):
- while True:
- pb = self.pb
- try:
- pb.append(e)
- return pb
- except BoxClosedErr:
- pass
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
deleted file mode 100644
index b8956dcc2..000000000
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import os
-import sys
-import time
-import signal
-import logging
-from gconf import gconf
-from syncdutils import update_file, select, waitpid, set_term_handler
-
-class Monitor(object):
- """class which spawns and manages gsyncd workers"""
-
- def __init__(self):
- self.state = None
-
- def set_state(self, state):
- """set the state that can be used by external agents
- like glusterd for status reporting"""
- if state == self.state:
- return
- self.state = state
- logging.info('new state: %s' % state)
- if getattr(gconf, 'state_file', None):
- update_file(gconf.state_file, lambda f: f.write(state + '\n'))
-
- def monitor(self):
- """the monitor loop
-
- Basic logic is a blantantly simple blunt heuristics:
- if spawned client survives 60 secs, it's considered OK.
- This servers us pretty well as it's not vulneralbe to
- any kind of irregular behavior of the child...
-
- ... well, except for one: if children is hung up on
- waiting for some event, it can survive aeons, still
- will be defunct. So we tweak the above logic to
- expect the worker to send us a signal within 60 secs
- (in the form of closing its end of a pipe). The worker
- does this when it's done with the setup stage
- ready to enter the service loop (note it's the setup
- stage which is vulnerable to hangs -- the full
- blown worker blows up on EPIPE if the net goes down,
- due to the keep-alive thread)
- """
- def sigcont_handler(*a):
- """
- Re-init logging and send group kill signal
- """
- md = gconf.log_metadata
- logging.shutdown()
- lcls = logging.getLoggerClass()
- lcls.setup(label=md.get('saved_label'), **md)
- pid = os.getpid()
- os.kill(-pid, signal.SIGUSR1)
- signal.signal(signal.SIGUSR1, lambda *a: ())
- signal.signal(signal.SIGCONT, sigcont_handler)
-
- argv = sys.argv[:]
- for o in ('-N', '--no-daemon', '--monitor'):
- while o in argv:
- argv.remove(o)
- argv.extend(('-N', '-p', ''))
- argv.insert(0, os.path.basename(sys.executable))
-
- self.set_state('starting...')
- ret = 0
- def nwait(p, o=0):
- p2, r = waitpid(p, o)
- if not p2:
- return
- return r
- def exit_signalled(s):
- """ child teminated due to receipt of SIGUSR1 """
- return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))
- def exit_status(s):
- if os.WIFEXITED(s):
- return os.WEXITSTATUS(s)
- return 1
- conn_timeout = int(gconf.connection_timeout)
- while ret in (0, 1):
- logging.info('-' * conn_timeout)
- logging.info('starting gsyncd worker')
- pr, pw = os.pipe()
- cpid = os.fork()
- if cpid == 0:
- os.close(pr)
- os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
- os.close(pw)
- t0 = time.time()
- so = select((pr,), (), (), conn_timeout)[0]
- os.close(pr)
- if so:
- ret = nwait(cpid, os.WNOHANG)
- if ret != None:
- logging.debug("worker died before establishing connection")
- else:
- logging.debug("worker seems to be connected (?? racy check)")
- while time.time() < t0 + conn_timeout:
- ret = nwait(cpid, os.WNOHANG)
- if ret != None:
- logging.debug("worker died in startup phase")
- break
- time.sleep(1)
- else:
- logging.debug("worker not confirmed in %d sec, aborting it" % \
- conn_timeout)
- # relax one SIGTERM by setting a handler that sets back
- # standard handler
- set_term_handler(lambda *a: set_term_handler())
- # give a chance to graceful exit
- os.kill(-os.getpid(), signal.SIGTERM)
- time.sleep(1)
- os.kill(cpid, signal.SIGKILL)
- ret = nwait(cpid)
- if ret == None:
- self.set_state('OK')
- ret = nwait(cpid)
- if exit_signalled(ret):
- ret = 0
- else:
- ret = exit_status(ret)
- if ret in (0,1):
- self.set_state('faulty')
- time.sleep(10)
- self.set_state('inconsistent')
- return ret
-
-def monitor():
- """oh yeah, actually Monitor is used as singleton, too"""
- return Monitor().monitor()
diff --git a/xlators/features/marker/utils/syncdaemon/repce.py b/xlators/features/marker/utils/syncdaemon/repce.py
deleted file mode 100644
index 755fb61df..000000000
--- a/xlators/features/marker/utils/syncdaemon/repce.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import os
-import sys
-import time
-import logging
-from threading import Condition
-try:
- import thread
-except ImportError:
- # py 3
- import _thread as thread
-try:
- from Queue import Queue
-except ImportError:
- # py 3
- from queue import Queue
-try:
- import cPickle as pickle
-except ImportError:
- # py 3
- import pickle
-
-from syncdutils import Thread, select
-
-pickle_proto = -1
-repce_version = 1.0
-
-def ioparse(i, o):
- if isinstance(i, int):
- i = os.fdopen(i)
- # rely on duck typing for recognizing
- # streams as that works uniformly
- # in py2 and py3
- if hasattr(o, 'fileno'):
- o = o.fileno()
- return (i, o)
-
-def send(out, *args):
- """pickle args and write out wholly in one syscall
-
- ie. not use the ability of pickle to dump directly to
- a stream, as that would potentially mess up messages
- by interleaving them
- """
- os.write(out, pickle.dumps(args, pickle_proto))
-
-def recv(inf):
- """load an object from input stream"""
- return pickle.load(inf)
-
-
-class RepceServer(object):
- """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce
-
- ... also our homebrewed RPC backend where the transport layer is
- reduced to a pair of filehandles.
-
- This is the server component.
- """
-
- def __init__(self, obj, i, o, wnum=6):
- """register a backend object .obj to which incoming messages
- are dispatched, also incoming/outcoming streams
- """
- self.obj = obj
- self.inf, self.out = ioparse(i, o)
- self.wnum = wnum
- self.q = Queue()
-
- def service_loop(self):
- """fire up worker threads, get messages and dispatch among them"""
- for i in range(self.wnum):
- t = Thread(target=self.worker)
- t.start()
- try:
- while True:
- self.q.put(recv(self.inf))
- except EOFError:
- logging.info("terminating on reaching EOF.")
-
- def worker(self):
- """life of a worker
-
- Get message, extract its id, method name and arguments
- (kwargs not supported), call method on .obj.
- Send back message id + return value.
- If method call throws an exception, rescue it, and send
- back the exception as result (with flag marking it as
- exception).
- """
- while True:
- in_data = self.q.get(True)
- rid = in_data[0]
- rmeth = in_data[1]
- exc = False
- if rmeth == '__repce_version__':
- res = repce_version
- else:
- try:
- res = getattr(self.obj, rmeth)(*in_data[2:])
- except:
- res = sys.exc_info()[1]
- exc = True
- logging.exception("call failed: ")
- send(self.out, rid, exc, res)
-
-
-class RepceJob(object):
- """class representing message status we can use
- for waiting on reply"""
-
- def __init__(self, cbk):
- """
- - .rid: (process-wise) unique id
- - .cbk: what we do upon receiving reply
- """
- self.rid = (os.getpid(), thread.get_ident(), time.time())
- self.cbk = cbk
- self.lever = Condition()
- self.done = False
-
- def __repr__(self):
- return ':'.join([str(x) for x in self.rid])
-
- def wait(self):
- self.lever.acquire()
- if not self.done:
- self.lever.wait()
- self.lever.release()
- return self.result
-
- def wakeup(self, data):
- self.result = data
- self.lever.acquire()
- self.done = True
- self.lever.notify()
- self.lever.release()
-
-
-class RepceClient(object):
- """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce
-
- ... also our homebrewed RPC backend where the transport layer is
- reduced to a pair of filehandles.
-
- This is the client component.
- """
-
- def __init__(self, i, o):
- self.inf, self.out = ioparse(i, o)
- self.jtab = {}
- t = Thread(target = self.listen)
- t.start()
-
- def listen(self):
- while True:
- select((self.inf,), (), ())
- rid, exc, res = recv(self.inf)
- rjob = self.jtab.pop(rid)
- if rjob.cbk:
- rjob.cbk(rjob, [exc, res])
-
- def push(self, meth, *args, **kw):
- """wrap arguments in a RepceJob, send them to server
- and return the RepceJob
-
- @cbk to pass on RepceJob can be given as kwarg.
- """
- cbk = kw.get('cbk')
- if not cbk:
- def cbk(rj, res):
- if res[0]:
- raise res[1]
- rjob = RepceJob(cbk)
- self.jtab[rjob.rid] = rjob
- logging.debug("call %s %s%s ..." % (repr(rjob), meth, repr(args)))
- send(self.out, rjob.rid, meth, *args)
- return rjob
-
- def __call__(self, meth, *args):
- """RePCe client is callabe, calling it implements a synchronous remote call
-
- We do a .push with a cbk which does a wakeup upon receiving anwser, then wait
- on the RepceJob.
- """
- rjob = self.push(meth, *args, **{'cbk': lambda rj, res: rj.wakeup(res)})
- exc, res = rjob.wait()
- if exc:
- logging.error('call %s (%s) failed on peer with %s' % (repr(rjob), meth, str(type(res).__name__)))
- raise res
- logging.debug("call %s %s -> %s" % (repr(rjob), meth, repr(res)))
- return res
-
- class mprx(object):
- """method proxy, standard trick to implement rubyesque method_missing
- in Python
-
- A class is a closure factory, you know what I mean, or go read some SICP.
- """
- def __init__(self, ins, meth):
- self.ins = ins
- self.meth = meth
-
- def __call__(self, *a):
- return self.ins(self.meth, *a)
-
- def __getattr__(self, meth):
- """this implements transparent method dispatch to remote object,
- so that you don't need to call the RepceClient instance like
-
- rclient('how_old_are_you_if_born_in', 1979)
-
- but you can make it into an ordinary method call like
-
- rclient.how_old_are_you_if_born_in(1979)
- """
- return self.mprx(self, meth)
-
- def __version__(self):
- """used in handshake to verify compatibility"""
- d = {'proto': self('__repce_version__')}
- try:
- d['object'] = self('version')
- except AttributeError:
- pass
- return d
diff --git a/xlators/features/marker/utils/syncdaemon/resource.py b/xlators/features/marker/utils/syncdaemon/resource.py
deleted file mode 100644
index 73102fbcb..000000000
--- a/xlators/features/marker/utils/syncdaemon/resource.py
+++ /dev/null
@@ -1,972 +0,0 @@
-import re
-import os
-import sys
-import stat
-import time
-import fcntl
-import errno
-import struct
-import socket
-import logging
-import tempfile
-import threading
-import subprocess
-from errno import EEXIST, ENOENT, ENODATA, ENOTDIR, ELOOP, EISDIR
-from select import error as SelectError
-
-from gconf import gconf
-import repce
-from repce import RepceServer, RepceClient
-from master import gmaster_builder
-import syncdutils
-from syncdutils import GsyncdError, select, privileged, boolify
-
-UrlRX = re.compile('\A(\w+)://([^ *?[]*)\Z')
-HostRX = re.compile('[a-z\d](?:[a-z\d.-]*[a-z\d])?', re.I)
-UserRX = re.compile("[\w!\#$%&'*+-\/=?^_`{|}~]+")
-
-def sup(x, *a, **kw):
- """a rubyesque "super" for python ;)
-
- invoke caller method in parent class with given args.
- """
- return getattr(super(type(x), x), sys._getframe(1).f_code.co_name)(*a, **kw)
-
-def desugar(ustr):
- """transform sugared url strings to standard <scheme>://<urlbody> form
-
- parsing logic enforces the constraint that sugared forms should contatin
- a ':' or a '/', which ensures that sugared urls do not conflict with
- gluster volume names.
- """
- m = re.match('([^:]*):(.*)', ustr)
- if m:
- if not m.groups()[0]:
- return "gluster://localhost" + ustr
- elif '@' in m.groups()[0] or re.search('[:/]', m.groups()[1]):
- return "ssh://" + ustr
- else:
- return "gluster://" + ustr
- else:
- if ustr[0] != '/':
- raise GsyncdError("cannot resolve sugared url '%s'" % ustr)
- ap = os.path.normpath(ustr)
- if ap.startswith('//'):
- ap = ap[1:]
- return "file://" + ap
-
-def gethostbyname(hnam):
- """gethostbyname wrapper"""
- try:
- return socket.gethostbyname(hnam)
- except socket.gaierror:
- ex = sys.exc_info()[1]
- raise GsyncdError("failed to resolve %s: %s" % \
- (hnam, ex.strerror))
-
-def parse_url(ustr):
- """instantiate an url object by scheme-to-class dispatch
-
- The url classes taken into consideration are the ones in
- this module whose names are full-caps.
- """
- m = UrlRX.match(ustr)
- if not m:
- ustr = desugar(ustr)
- m = UrlRX.match(ustr)
- if not m:
- raise GsyncdError("malformed url")
- sch, path = m.groups()
- this = sys.modules[__name__]
- if not hasattr(this, sch.upper()):
- raise GsyncdError("unknown url scheme " + sch)
- return getattr(this, sch.upper())(path)
-
-
-class _MetaXattr(object):
- """singleton class, a lazy wrapper around the
- libcxattr module
-
- libcxattr (a heavy import due to ctypes) is
- loaded only when when the single
- instance is tried to be used.
-
- This reduces runtime for those invocations
- which do not need filesystem manipulation
- (eg. for config, url parsing)
- """
-
- def __getattr__(self, meth):
- from libcxattr import Xattr as LXattr
- xmeth = [ m for m in dir(LXattr) if m[0] != '_' ]
- if not meth in xmeth:
- return
- for m in xmeth:
- setattr(self, m, getattr(LXattr, m))
- return getattr(self, meth)
-
-Xattr = _MetaXattr()
-
-
-class Popen(subprocess.Popen):
- """customized subclass of subprocess.Popen with a ring
- buffer for children error output"""
-
- @classmethod
- def init_errhandler(cls):
- """start the thread which handles children's error output"""
- cls.errstore = {}
- def tailer():
- while True:
- errstore = cls.errstore.copy()
- try:
- poe, _ ,_ = select([po.stderr for po in errstore], [], [], 1)
- except (ValueError, SelectError):
- continue
- for po in errstore:
- if po.stderr not in poe:
- continue
- po.lock.acquire()
- try:
- if po.on_death_row:
- continue
- la = errstore[po]
- try:
- fd = po.stderr.fileno()
- except ValueError: # file is already closed
- continue
- l = os.read(fd, 1024)
- if not l:
- continue
- tots = len(l)
- for lx in la:
- tots += len(lx)
- while tots > 1<<20 and la:
- tots -= len(la.pop(0))
- la.append(l)
- finally:
- po.lock.release()
- t = syncdutils.Thread(target = tailer)
- t.start()
- cls.errhandler = t
-
- @classmethod
- def fork(cls):
- """fork wrapper that restarts errhandler thread in child"""
- pid = os.fork()
- if not pid:
- cls.init_errhandler()
- return pid
-
- def __init__(self, args, *a, **kw):
- """customizations for subprocess.Popen instantiation
-
- - 'close_fds' is taken to be the default
- - if child's stderr is chosen to be managed,
- register it with the error handler thread
- """
- self.args = args
- if 'close_fds' not in kw:
- kw['close_fds'] = True
- self.lock = threading.Lock()
- self.on_death_row = False
- try:
- sup(self, args, *a, **kw)
- except:
- ex = sys.exc_info()[1]
- if not isinstance(ex, OSError):
- raise
- raise GsyncdError("""execution of "%s" failed with %s (%s)""" % \
- (args[0], errno.errorcode[ex.errno], os.strerror(ex.errno)))
- if kw.get('stderr') == subprocess.PIPE:
- assert(getattr(self, 'errhandler', None))
- self.errstore[self] = []
-
- def errlog(self):
- """make a log about child's failure event"""
- filling = ""
- if self.elines:
- filling = ", saying:"
- logging.error("""command "%s" returned with %s%s""" % \
- (" ".join(self.args), repr(self.returncode), filling))
- lp = ''
- def logerr(l):
- logging.error(self.args[0] + "> " + l)
- for l in self.elines:
- ls = l.split('\n')
- ls[0] = lp + ls[0]
- lp = ls.pop()
- for ll in ls:
- logerr(ll)
- if lp:
- logerr(lp)
-
- def errfail(self):
- """fail nicely if child did not terminate with success"""
- self.errlog()
- syncdutils.finalize(exval = 1)
-
- def terminate_geterr(self, fail_on_err = True):
- """kill child, finalize stderr harvesting (unregister
- from errhandler, set up .elines), fail on error if
- asked for
- """
- self.lock.acquire()
- try:
- self.on_death_row = True
- finally:
- self.lock.release()
- elines = self.errstore.pop(self)
- if self.poll() == None:
- self.terminate()
- if self.poll() == None:
- time.sleep(0.1)
- self.kill()
- self.wait()
- while True:
- if not select([self.stderr],[],[],0.1)[0]:
- break
- b = os.read(self.stderr.fileno(), 1024)
- if b:
- elines.append(b)
- else:
- break
- self.stderr.close()
- self.elines = elines
- if fail_on_err and self.returncode != 0:
- self.errfail()
-
-
-class Server(object):
- """singleton implemening those filesystem access primitives
- which are needed for geo-replication functionality
-
- (Singleton in the sense it's a class which has only static
- and classmethods and is used directly, without instantiation.)
- """
-
- GX_NSPACE = (privileged() and "trusted" or "system") + ".glusterfs"
- NTV_FMTSTR = "!" + "B"*19 + "II"
- FRGN_XTRA_FMT = "I"
- FRGN_FMTSTR = NTV_FMTSTR + FRGN_XTRA_FMT
-
- def _pathguard(f):
- """decorator method that checks
- the path argument of the decorated
- functions to make sure it does not
- point out of the managed tree
- """
-
- fc = getattr(f, 'func_code', None)
- if not fc:
- # python 3
- fc = f.__code__
- pi = list(fc.co_varnames).index('path')
- def ff(*a):
- path = a[pi]
- ps = path.split('/')
- if path[0] == '/' or '..' in ps:
- raise ValueError('unsafe path')
- return f(*a)
- return ff
-
- @staticmethod
- @_pathguard
- def entries(path):
- """directory entries in an array"""
- # prevent symlinks being followed
- if not stat.S_ISDIR(os.lstat(path).st_mode):
- raise OSError(ENOTDIR, os.strerror(ENOTDIR))
- return os.listdir(path)
-
- @classmethod
- @_pathguard
- def purge(cls, path, entries=None):
- """force-delete subtrees
-
- If @entries is not specified, delete
- the whole subtree under @path (including
- @path).
-
- Otherwise, @entries should be a
- a sequence of children of @path, and
- the effect is identical with a joint
- @entries-less purge on them, ie.
-
- for e in entries:
- cls.purge(os.path.join(path, e))
- """
- me_also = entries == None
- if not entries:
- try:
- # if it's a symlink, prevent
- # following it
- try:
- os.unlink(path)
- return
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno == EISDIR:
- entries = os.listdir(path)
- else:
- raise
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno in (ENOTDIR, ENOENT, ELOOP):
- try:
- os.unlink(path)
- return
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno == ENOENT:
- return
- raise
- else:
- raise
- for e in entries:
- cls.purge(os.path.join(path, e))
- if me_also:
- os.rmdir(path)
-
- @classmethod
- @_pathguard
- def _create(cls, path, ctor):
- """path creation backend routine"""
- try:
- ctor(path)
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno == EEXIST:
- cls.purge(path)
- return ctor(path)
- raise
-
- @classmethod
- @_pathguard
- def mkdir(cls, path):
- cls._create(path, os.mkdir)
-
- @classmethod
- @_pathguard
- def symlink(cls, lnk, path):
- cls._create(path, lambda p: os.symlink(lnk, p))
-
- @classmethod
- @_pathguard
- def xtime(cls, path, uuid):
- """query xtime extended attribute
-
- Return xtime of @path for @uuid as a pair of integers.
- "Normal" errors due to non-existent @path or extended attribute
- are tolerated and errno is returned in such a case.
- """
-
- try:
- return struct.unpack('!II', Xattr.lgetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), 8))
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno in (ENOENT, ENODATA, ENOTDIR):
- return ex.errno
- else:
- raise
-
- @classmethod
- def xtime_vec(cls, path, *uuids):
- """vectored version of @xtime
-
- accepts a list of uuids and returns a dictionary
- with uuid as key(s) and xtime as value(s)
- """
- xt = {}
- for uuid in uuids:
- xtu = cls.xtime(path, uuid)
- if xtu == ENODATA:
- xtu = None
- if isinstance(xtu, int):
- return xtu
- xt[uuid] = xtu
- return xt
-
- @classmethod
- @_pathguard
- def set_xtime(cls, path, uuid, mark):
- """set @mark as xtime for @uuid on @path"""
- Xattr.lsetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), struct.pack('!II', *mark))
-
- @classmethod
- def set_xtime_vec(cls, path, mark_dct):
- """vectored (or dictered) version of set_xtime
-
- ignore values that match @ignore
- """
- for u,t in mark_dct.items():
- cls.set_xtime(path, u, t)
-
- @staticmethod
- @_pathguard
- def setattr(path, adct):
- """set file attributes
-
- @adct is a dict, where 'own', 'mode' and 'times'
- keys are looked for and values used to perform
- chown, chmod or utimes on @path.
- """
- own = adct.get('own')
- if own:
- os.lchown(path, *own)
- mode = adct.get('mode')
- if mode:
- os.chmod(path, stat.S_IMODE(mode))
- times = adct.get('times')
- if times:
- os.utime(path, times)
-
- @staticmethod
- def pid():
- return os.getpid()
-
- last_keep_alive = 0
- @classmethod
- def keep_alive(cls, dct):
- """process keepalive messages.
-
- Return keep-alive counter (number of received keep-alive
- messages).
-
- Now the "keep-alive" message can also have a payload which is
- used to set a foreign volume-mark on the underlying file system.
- """
- if dct:
- key = '.'.join([cls.GX_NSPACE, 'volume-mark', dct['uuid']])
- val = struct.pack(cls.FRGN_FMTSTR,
- *(dct['version'] +
- tuple(int(x,16) for x in re.findall('(?:[\da-f]){2}', dct['uuid'])) +
- (dct['retval'],) + dct['volume_mark'][0:2] + (dct['timeout'],)))
- Xattr.lsetxattr('.', key, val)
- cls.last_keep_alive += 1
- return cls.last_keep_alive
-
- @staticmethod
- def version():
- """version used in handshake"""
- return 1.0
-
-
-class SlaveLocal(object):
- """mix-in class to implement some factes of a slave server
-
- ("mix-in" is sort of like "abstract class", ie. it's not
- instantiated just included in the ancesty DAG. I use "mix-in"
- to indicate that it's not used as an abstract base class,
- rather just taken in to implement additional functionality
- on the basis of the assumed availability of certain interfaces.)
- """
-
- def can_connect_to(self, remote):
- """determine our position in the connectibility matrix"""
- return not remote
-
- def service_loop(self):
- """start a RePCe server serving self's server
-
- stop servicing if a timeout is configured and got no
- keep-alime in that inteval
- """
-
- if boolify(gconf.use_rsync_xattrs) and not privileged():
- raise GsyncdError("using rsync for extended attributes is not supported")
-
- repce = RepceServer(self.server, sys.stdin, sys.stdout, int(gconf.sync_jobs))
- t = syncdutils.Thread(target=lambda: (repce.service_loop(),
- syncdutils.finalize()))
- t.start()
- logging.info("slave listening")
- if gconf.timeout and int(gconf.timeout) > 0:
- while True:
- lp = self.server.last_keep_alive
- time.sleep(int(gconf.timeout))
- if lp == self.server.last_keep_alive:
- logging.info("connection inactive for %d seconds, stopping" % int(gconf.timeout))
- break
- else:
- select((), (), ())
-
-class SlaveRemote(object):
- """mix-in class to implement an interface to a remote slave"""
-
- def connect_remote(self, rargs=[], **opts):
- """connects to a remote slave
-
- Invoke an auxiliary utility (slave gsyncd, possibly wrapped)
- which sets up the connection and set up a RePCe client to
- communicate throuh its stdio.
- """
- slave = opts.get('slave', self.url)
- extra_opts = []
- so = getattr(gconf, 'session_owner', None)
- if so:
- extra_opts += ['--session-owner', so]
- if boolify(gconf.use_rsync_xattrs):
- extra_opts.append('--use-rsync-xattrs')
- po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + \
- ['-N', '--listen', '--timeout', str(gconf.timeout), slave],
- stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- gconf.transport = po
- return self.start_fd_client(po.stdout, po.stdin, **opts)
-
- def start_fd_client(self, i, o, **opts):
- """set up RePCe client, handshake with server
-
- It's cut out as a separate method to let
- subclasses hook into client startup
- """
- self.server = RepceClient(i, o)
- rv = self.server.__version__()
- exrv = {'proto': repce.repce_version, 'object': Server.version()}
- da0 = (rv, exrv)
- da1 = ({}, {})
- for i in range(2):
- for k, v in da0[i].iteritems():
- da1[i][k] = int(v)
- if da1[0] != da1[1]:
- raise GsyncdError("RePCe major version mismatch: local %s, remote %s" % (exrv, rv))
-
- def rsync(self, files, *args):
- """invoke rsync"""
- if not files:
- raise GsyncdError("no files to sync")
- logging.debug("files: " + ", ".join(files))
- argv = gconf.rsync_command.split() + \
- ['-aR0', '--files-from=-', '--super','--stats', '--numeric-ids', '--no-implied-dirs'] + \
- gconf.rsync_options.split() + (boolify(gconf.use_rsync_xattrs) and ['--xattrs'] or []) + \
- ['.'] + list(args)
- po = Popen(argv, stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- for f in files:
- po.stdin.write(f)
- po.stdin.write('\0')
-
- po.stdin.close()
- po.wait()
- po.terminate_geterr(fail_on_err = False)
-
- return po
-
-
-class AbstractUrl(object):
- """abstract base class for url scheme classes"""
-
- def __init__(self, path, pattern):
- m = re.search(pattern, path)
- if not m:
- raise GsyncdError("malformed path")
- self.path = path
- return m.groups()
-
- @property
- def scheme(self):
- return type(self).__name__.lower()
-
- def canonical_path(self):
- return self.path
-
- def get_url(self, canonical=False, escaped=False):
- """format self's url in various styles"""
- if canonical:
- pa = self.canonical_path()
- else:
- pa = self.path
- u = "://".join((self.scheme, pa))
- if escaped:
- u = syncdutils.escape(u)
- return u
-
- @property
- def url(self):
- return self.get_url()
-
-
- ### Concrete resource classes ###
-
-
-class FILE(AbstractUrl, SlaveLocal, SlaveRemote):
- """scheme class for file:// urls
-
- can be used to represent a file slave server
- on slave side, or interface to a remote file
- file server on master side
- """
-
- class FILEServer(Server):
- """included server flavor"""
- pass
-
- server = FILEServer
-
- def __init__(self, path):
- sup(self, path, '^/')
-
- def connect(self):
- """inhibit the resource beyond"""
- os.chdir(self.path)
-
- def rsync(self, files):
- return sup(self, files, self.path)
-
-
-class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
- """scheme class for gluster:// urls
-
- can be used to represent a gluster slave server
- on slave side, or interface to a remote gluster
- slave on master side, or to represent master
- (slave-ish features come from the mixins, master
- functionality is outsourced to GMaster from master)
- """
-
- class GLUSTERServer(Server):
- "server enhancements for a glusterfs backend"""
-
- @classmethod
- def _attr_unpack_dict(cls, xattr, extra_fields = ''):
- """generic volume mark fetching/parsing backed"""
- fmt_string = cls.NTV_FMTSTR + extra_fields
- buf = Xattr.lgetxattr('.', xattr, struct.calcsize(fmt_string))
- vm = struct.unpack(fmt_string, buf)
- m = re.match('(.{8})(.{4})(.{4})(.{4})(.{12})', "".join(['%02x' % x for x in vm[2:18]]))
- uuid = '-'.join(m.groups())
- volinfo = { 'version': vm[0:2],
- 'uuid' : uuid,
- 'retval' : vm[18],
- 'volume_mark': vm[19:21],
- }
- if extra_fields:
- return volinfo, vm[-len(extra_fields):]
- else:
- return volinfo
-
- @classmethod
- def foreign_volume_infos(cls):
- """return list of valid (not expired) foreign volume marks"""
- dict_list = []
- xattr_list = Xattr.llistxattr_buf('.')
- for ele in xattr_list:
- if ele.find('.'.join([cls.GX_NSPACE, 'volume-mark', ''])) == 0:
- d, x = cls._attr_unpack_dict(ele, cls.FRGN_XTRA_FMT)
- now = int(time.time())
- if x[0] > now:
- logging.debug("volinfo[%s] expires: %d (%d sec later)" % \
- (d['uuid'], x[0], x[0] - now))
- d['timeout'] = x[0]
- dict_list.append(d)
- else:
- try:
- Xattr.lremovexattr('.', ele)
- except OSError:
- pass
- return dict_list
-
- @classmethod
- def native_volume_info(cls):
- """get the native volume mark of the underlying gluster volume"""
- try:
- return cls._attr_unpack_dict('.'.join([cls.GX_NSPACE, 'volume-mark']))
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno != ENODATA:
- raise
-
- server = GLUSTERServer
-
- def __init__(self, path):
- self.host, self.volume = sup(self, path, '^(%s):(.+)' % HostRX.pattern)
-
- def canonical_path(self):
- return ':'.join([gethostbyname(self.host), self.volume])
-
- def can_connect_to(self, remote):
- """determine our position in the connectibility matrix"""
- return True
-
- class Mounter(object):
- """Abstract base class for mounter backends"""
-
- def __init__(self, params):
- self.params = params
- self.mntpt = None
-
- @classmethod
- def get_glusterprog(cls):
- return os.path.join(gconf.gluster_command_dir, cls.glusterprog)
-
- def umount_l(self, d):
- """perform lazy umount"""
- po = Popen(self.make_umount_argv(d), stderr=subprocess.PIPE)
- po.wait()
- return po
-
- @classmethod
- def make_umount_argv(cls, d):
- raise NotImplementedError
-
- def make_mount_argv(self, *a):
- raise NotImplementedError
-
- def cleanup_mntpt(self, *a):
- pass
-
- def handle_mounter(self, po):
- po.wait()
-
- def inhibit(self, *a):
- """inhibit a gluster filesystem
-
- Mount glusterfs over a temporary mountpoint,
- change into the mount, and lazy unmount the
- filesystem.
- """
-
- mpi, mpo = os.pipe()
- mh = Popen.fork()
- if mh:
- os.close(mpi)
- fcntl.fcntl(mpo, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
- d = None
- margv = self.make_mount_argv(*a)
- if self.mntpt:
- # mntpt is determined pre-mount
- d = self.mntpt
- os.write(mpo, d + '\0')
- po = Popen(margv, **self.mountkw)
- self.handle_mounter(po)
- po.terminate_geterr()
- logging.debug('auxiliary glusterfs mount in place')
- if not d:
- # mntpt is determined during mount
- d = self.mntpt
- os.write(mpo, d + '\0')
- os.write(mpo, 'M')
- t = syncdutils.Thread(target=lambda: os.chdir(d))
- t.start()
- tlim = gconf.starttime + int(gconf.connection_timeout)
- while True:
- if not t.isAlive():
- break
- if time.time() >= tlim:
- syncdutils.finalize(exval = 1)
- time.sleep(1)
- os.close(mpo)
- _, rv = syncdutils.waitpid(mh, 0)
- if rv:
- rv = (os.WIFEXITED(rv) and os.WEXITSTATUS(rv) or 0) - \
- (os.WIFSIGNALED(rv) and os.WTERMSIG(rv) or 0)
- logging.warn('stale mount possibly left behind on ' + d)
- raise GsyncdError("cleaning up temp mountpoint %s failed with status %d" % \
- (d, rv))
- else:
- rv = 0
- try:
- os.setsid()
- os.close(mpo)
- mntdata = ''
- while True:
- c = os.read(mpi, 1)
- if not c:
- break
- mntdata += c
- if mntdata:
- mounted = False
- if mntdata[-1] == 'M':
- mntdata = mntdata[:-1]
- assert(mntdata)
- mounted = True
- assert(mntdata[-1] == '\0')
- mntpt = mntdata[:-1]
- assert(mntpt)
- if mounted:
- po = self.umount_l(mntpt)
- po.terminate_geterr(fail_on_err = False)
- if po.returncode != 0:
- po.errlog()
- rv = po.returncode
- self.cleanup_mntpt(mntpt)
- except:
- logging.exception('mount cleanup failure:')
- rv = 200
- os._exit(rv)
- logging.debug('auxiliary glusterfs mount prepared')
-
- class DirectMounter(Mounter):
- """mounter backend which calls mount(8), umount(8) directly"""
-
- mountkw = {'stderr': subprocess.PIPE}
- glusterprog = 'glusterfs'
-
- @staticmethod
- def make_umount_argv(d):
- return ['umount', '-l', d]
-
- def make_mount_argv(self):
- self.mntpt = tempfile.mkdtemp(prefix = 'gsyncd-aux-mount-')
- return [self.get_glusterprog()] + ['--' + p for p in self.params] + [self.mntpt]
-
- def cleanup_mntpt(self, mntpt = None):
- if not mntpt:
- mntpt = self.mntpt
- os.rmdir(mntpt)
-
- class MountbrokerMounter(Mounter):
- """mounter backend using the mountbroker gluster service"""
-
- mountkw = {'stderr': subprocess.PIPE, 'stdout': subprocess.PIPE}
- glusterprog = 'gluster'
-
- @classmethod
- def make_cli_argv(cls):
- return [cls.get_glusterprog()] + gconf.gluster_cli_options.split() + ['system::']
-
- @classmethod
- def make_umount_argv(cls, d):
- return cls.make_cli_argv() + ['umount', d, 'lazy']
-
- def make_mount_argv(self, label):
- return self.make_cli_argv() + \
- ['mount', label, 'user-map-root=' + syncdutils.getusername()] + self.params
-
- def handle_mounter(self, po):
- self.mntpt = po.stdout.readline()[:-1]
- po.stdout.close()
- sup(self, po)
- if po.returncode != 0:
- # if cli terminated with error due to being
- # refused by glusterd, what it put
- # out on stdout is a diagnostic message
- logging.error('glusterd answered: %s' % self.mntpt)
-
- def connect(self):
- """inhibit the resource beyond
-
- Choose mounting backend (direct or mountbroker),
- set up glusterfs parameters and perform the mount
- with given backend
- """
-
- label = getattr(gconf, 'mountbroker', None)
- if not label and not privileged():
- label = syncdutils.getusername()
- mounter = label and self.MountbrokerMounter or self.DirectMounter
- params = gconf.gluster_params.split() + \
- (gconf.gluster_log_level and ['log-level=' + gconf.gluster_log_level] or []) + \
- ['log-file=' + gconf.gluster_log_file, 'volfile-server=' + self.host,
- 'volfile-id=' + self.volume, 'client-pid=-1']
- mounter(params).inhibit(*[l for l in [label] if l])
-
- def connect_remote(self, *a, **kw):
- sup(self, *a, **kw)
- self.slavedir = "/proc/%d/cwd" % self.server.pid()
-
- def service_loop(self, *args):
- """enter service loop
-
- - if slave given, instantiate GMaster and
- pass control to that instance, which implements
- master behavior
- - else do that's what's inherited
- """
- if args:
- gmaster_builder()(self, args[0]).crawl_loop()
- else:
- sup(self, *args)
-
- def rsync(self, files):
- return sup(self, files, self.slavedir)
-
-
-class SSH(AbstractUrl, SlaveRemote):
- """scheme class for ssh:// urls
-
- interface to remote slave on master side
- implementing an ssh based proxy
- """
-
- def __init__(self, path):
- self.remote_addr, inner_url = sup(self, path,
- '^((?:%s@)?%s):(.+)' % tuple([ r.pattern for r in (UserRX, HostRX) ]))
- self.inner_rsc = parse_url(inner_url)
-
- def canonical_path(self):
- m = re.match('([^@]+)@(.+)', self.remote_addr)
- if m:
- u, h = m.groups()
- else:
- u, h = syncdutils.getusername(), self.remote_addr
- remote_addr = '@'.join([u, gethostbyname(h)])
- return ':'.join([remote_addr, self.inner_rsc.get_url(canonical=True)])
-
- def can_connect_to(self, remote):
- """determine our position in the connectibility matrix"""
- return False
-
- def start_fd_client(self, *a, **opts):
- """customizations for client startup
-
- - be a no-op if we are to daemonize (client startup is deferred
- to post-daemon stage)
- - determine target url for rsync after consulting server
- """
- if opts.get('deferred'):
- return a
- sup(self, *a)
- ityp = type(self.inner_rsc)
- if ityp == FILE:
- slavepath = self.inner_rsc.path
- elif ityp == GLUSTER:
- slavepath = "/proc/%d/cwd" % self.server.pid()
- else:
- raise NotImplementedError
- self.slaveurl = ':'.join([self.remote_addr, slavepath])
-
- def connect_remote(self, go_daemon=None):
- """connect to inner slave url through outer ssh url
-
- Wrap the connecting utility in ssh.
-
- Much care is put into daemonizing: in that case
- ssh is started before daemonization, but
- RePCe client is to be created after that (as ssh
- interactive password auth would be defeated by
- a daemonized ssh, while client should be present
- only in the final process). In that case the action
- is taken apart to two parts, this method is ivoked
- once pre-daemon, once post-daemon. Use @go_daemon
- to deiced what part to perform.
-
- [NB. ATM gluster product does not makes use of interactive
- authentication.]
- """
- if go_daemon == 'done':
- return self.start_fd_client(*self.fd_pair)
- gconf.setup_ssh_ctl(tempfile.mkdtemp(prefix='gsyncd-aux-ssh-'))
- deferred = go_daemon == 'postconn'
- ret = sup(self, gconf.ssh_command.split() + gconf.ssh_ctl_args + [self.remote_addr], slave=self.inner_rsc.url, deferred=deferred)
- if deferred:
- # send a message to peer so that we can wait for
- # the answer from which we know connection is
- # established and we can proceed with daemonization
- # (doing that too early robs the ssh passwd prompt...)
- # However, we'd better not start the RepceClient
- # before daemonization (that's not preserved properly
- # in daemon), we just do a an ad-hoc linear put/get.
- i, o = ret
- inf = os.fdopen(i)
- repce.send(o, None, '__repce_version__')
- select((inf,), (), ())
- repce.recv(inf)
- # hack hack hack: store a global reference to the file
- # to save it from getting GC'd which implies closing it
- gconf.permanent_handles.append(inf)
- self.fd_pair = (i, o)
- return 'should'
-
- def rsync(self, files):
- return sup(self, files, '-e', " ".join(gconf.ssh_command.split() + gconf.ssh_ctl_args),
- *(gconf.rsync_ssh_options.split() + [self.slaveurl]))
diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py
deleted file mode 100644
index 0764c0790..000000000
--- a/xlators/features/marker/utils/syncdaemon/syncdutils.py
+++ /dev/null
@@ -1,288 +0,0 @@
-import os
-import sys
-import pwd
-import time
-import fcntl
-import shutil
-import logging
-from threading import Lock, Thread as baseThread
-from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED, EINTR, errorcode
-from signal import signal, SIGTERM, SIGKILL
-from time import sleep
-import select as oselect
-from os import waitpid as owaitpid
-try:
- from cPickle import PickleError
-except ImportError:
- # py 3
- from pickle import PickleError
-
-from gconf import gconf
-
-try:
- # py 3
- from urllib import parse as urllib
-except ImportError:
- import urllib
-
-def escape(s):
- """the chosen flavor of string escaping, used all over
- to turn whatever data to creatable representation"""
- return urllib.quote_plus(s)
-
-def unescape(s):
- """inverse of .escape"""
- return urllib.unquote_plus(s)
-
-def norm(s):
- if s:
- return s.replace('-', '_')
-
-def update_file(path, updater, merger = lambda f: True):
- """update a file in a transaction-like manner"""
-
- fr = fw = None
- try:
- fd = os.open(path, os.O_CREAT|os.O_RDWR)
- try:
- fr = os.fdopen(fd, 'r+b')
- except:
- os.close(fd)
- raise
- fcntl.lockf(fr, fcntl.LOCK_EX)
- if not merger(fr):
- return
-
- tmpp = path + '.tmp.' + str(os.getpid())
- fd = os.open(tmpp, os.O_CREAT|os.O_EXCL|os.O_WRONLY)
- try:
- fw = os.fdopen(fd, 'wb', 0)
- except:
- os.close(fd)
- raise
- updater(fw)
- os.fsync(fd)
- os.rename(tmpp, path)
- finally:
- for fx in (fr, fw):
- if fx:
- fx.close()
-
-def grabfile(fname, content=None):
- """open @fname + contest for its fcntl lock
-
- @content: if given, set the file content to it
- """
- # damn those messy open() mode codes
- fd = os.open(fname, os.O_CREAT|os.O_RDWR)
- f = os.fdopen(fd, 'r+b', 0)
- try:
- fcntl.lockf(f, fcntl.LOCK_EX|fcntl.LOCK_NB)
- except:
- ex = sys.exc_info()[1]
- f.close()
- if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN):
- # cannot grab, it's taken
- return
- raise
- if content:
- try:
- f.truncate()
- f.write(content)
- except:
- f.close()
- raise
- gconf.permanent_handles.append(f)
- return f
-
-def grabpidfile(fname=None, setpid=True):
- """.grabfile customization for pid files"""
- if not fname:
- fname = gconf.pid_file
- content = None
- if setpid:
- content = str(os.getpid()) + '\n'
- return grabfile(fname, content=content)
-
-final_lock = Lock()
-
-def finalize(*a, **kw):
- """all those messy final steps we go trough upon termination
-
- Do away with pidfile, ssh control dir and logging.
- """
- final_lock.acquire()
- if getattr(gconf, 'pid_file', None):
- rm_pidf = gconf.pid_file_owned
- if gconf.cpid:
- # exit path from parent branch of daemonization
- rm_pidf = False
- while True:
- f = grabpidfile(setpid=False)
- if not f:
- # child has already taken over pidfile
- break
- if os.waitpid(gconf.cpid, os.WNOHANG)[0] == gconf.cpid:
- # child has terminated
- rm_pidf = True
- break;
- time.sleep(0.1)
- if rm_pidf:
- try:
- os.unlink(gconf.pid_file)
- except:
- ex = sys.exc_info()[1]
- if ex.errno == ENOENT:
- pass
- else:
- raise
- if gconf.ssh_ctl_dir and not gconf.cpid:
- shutil.rmtree(gconf.ssh_ctl_dir)
- if getattr(gconf, 'state_socket', None):
- try:
- os.unlink(gconf.state_socket)
- except:
- if sys.exc_info()[0] == OSError:
- pass
- if gconf.log_exit:
- logging.info("exiting.")
- sys.stdout.flush()
- sys.stderr.flush()
- os._exit(kw.get('exval', 0))
-
-def log_raise_exception(excont):
- """top-level exception handler
-
- Try to some fancy things to cover up we face with an error.
- Translate some weird sounding but well understood exceptions
- into human-friendly lingo
- """
- is_filelog = False
- for h in logging.getLogger().handlers:
- fno = getattr(getattr(h, 'stream', None), 'fileno', None)
- if fno and not os.isatty(fno()):
- is_filelog = True
-
- exc = sys.exc_info()[1]
- if isinstance(exc, SystemExit):
- excont.exval = exc.code or 0
- raise
- else:
- logtag = None
- if isinstance(exc, GsyncdError):
- if is_filelog:
- logging.error(exc.args[0])
- sys.stderr.write('failure: ' + exc.args[0] + '\n')
- elif isinstance(exc, PickleError) or isinstance(exc, EOFError) or \
- ((isinstance(exc, OSError) or isinstance(exc, IOError)) and \
- exc.errno == EPIPE):
- logging.error('connection to peer is broken')
- if hasattr(gconf, 'transport'):
- gconf.transport.wait()
- if gconf.transport.returncode == 127:
- logging.warn("!!!!!!!!!!!!!")
- logging.warn('!!! getting "No such file or directory" errors '
- "is most likely due to MISCONFIGURATION, please consult "
- "http://access.redhat.com/knowledge/docs/en-US/Red_Hat_Storage/2.0/html/Administration_Guide/chap-User_Guide-Geo_Rep-Preparation-Settingup_Environment.html")
- logging.warn("!!!!!!!!!!!!!")
- gconf.transport.terminate_geterr()
- elif isinstance(exc, OSError) and exc.errno in (ENOTCONN, ECONNABORTED):
- logging.error('glusterfs session went down [%s]', errorcode[exc.errno])
- else:
- logtag = "FAIL"
- if not logtag and logging.getLogger().isEnabledFor(logging.DEBUG):
- logtag = "FULL EXCEPTION TRACE"
- if logtag:
- logging.exception(logtag + ": ")
- sys.stderr.write("failed with %s.\n" % type(exc).__name__)
- excont.exval = 1
- sys.exit(excont.exval)
-
-
-class FreeObject(object):
- """wildcard class for which any attribute can be set"""
-
- def __init__(self, **kw):
- for k,v in kw.items():
- setattr(self, k, v)
-
-class Thread(baseThread):
- """thread class flavor for gsyncd
-
- - always a daemon thread
- - force exit for whole program if thread
- function coughs up an exception
- """
- def __init__(self, *a, **kw):
- tf = kw.get('target')
- if tf:
- def twrap(*aa):
- excont = FreeObject(exval = 0)
- try:
- tf(*aa)
- except:
- try:
- log_raise_exception(excont)
- finally:
- finalize(exval = excont.exval)
- kw['target'] = twrap
- baseThread.__init__(self, *a, **kw)
- self.setDaemon(True)
-
-class GsyncdError(Exception):
- pass
-
-def getusername(uid = None):
- if uid == None:
- uid = os.geteuid()
- return pwd.getpwuid(uid).pw_name
-
-def privileged():
- return os.geteuid() == 0
-
-def boolify(s):
- """
- Generic string to boolean converter
-
- return
- - Quick return if string 's' is of type bool
- - True if it's in true_list
- - False if it's in false_list
- - Warn if it's not present in either and return False
- """
- true_list = ['true', 'yes', '1', 'on']
- false_list = ['false', 'no', '0', 'off']
-
- if isinstance(s, bool):
- return s
-
- rv = False
- lstr = s.lower()
- if lstr in true_list:
- rv = True
- elif not lstr in false_list:
- logging.warn("Unknown string (%s) in string to boolean conversion defaulting to False\n" % (s))
-
- return rv
-
-def eintr_wrap(func, exc, *a):
- """
- wrapper around syscalls resilient to interrupt caused
- by signals
- """
- while True:
- try:
- return func(*a)
- except exc:
- ex = sys.exc_info()[1]
- if not ex.args[0] == EINTR:
- raise
-
-def select(*a):
- return eintr_wrap(oselect.select, oselect.error, *a)
-
-def waitpid (*a):
- return eintr_wrap(owaitpid, OSError, *a)
-
-def set_term_handler(hook=lambda *a: finalize(*a, **{'exval': 1})):
- signal(SIGTERM, hook)
diff --git a/xlators/features/protect/src/Makefile.am b/xlators/features/protect/src/Makefile.am
index 7eb93f32e..968e88c45 100644
--- a/xlators/features/protect/src/Makefile.am
+++ b/xlators/features/protect/src/Makefile.am
@@ -2,15 +2,15 @@ xlator_LTLIBRARIES = prot_dht.la prot_client.la prot_server.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-prot_dht_la_LDFLAGS = -module -avoidversion
+prot_dht_la_LDFLAGS = -module -avoid-version
prot_dht_la_SOURCES = prot_dht.c
prot_dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-prot_client_la_LDFLAGS = -module -avoidversion
+prot_client_la_LDFLAGS = -module -avoid-version
prot_client_la_SOURCES = prot_client.c
prot_client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-prot_server_la_LDFLAGS = -module -avoidversion
+prot_server_la_LDFLAGS = -module -avoid-version
prot_server_la_SOURCES = prot_server.c
prot_server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/features/protect/src/prot_client.c b/xlators/features/protect/src/prot_client.c
index a27216d0a..d09715067 100644
--- a/xlators/features/protect/src/prot_client.c
+++ b/xlators/features/protect/src/prot_client.c
@@ -15,7 +15,9 @@
#include "xlator.h"
#include "defaults.h"
+#ifndef __NetBSD__
#include <execinfo.h>
+#endif
#define NUM_FRAMES 20
diff --git a/xlators/features/qemu-block/Makefile.am b/xlators/features/qemu-block/Makefile.am
new file mode 100644
index 000000000..af437a64d
--- /dev/null
+++ b/xlators/features/qemu-block/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/features/qemu-block/src/Makefile.am b/xlators/features/qemu-block/src/Makefile.am
new file mode 100644
index 000000000..08a7b62a0
--- /dev/null
+++ b/xlators/features/qemu-block/src/Makefile.am
@@ -0,0 +1,155 @@
+if ENABLE_QEMU_BLOCK
+xlator_LTLIBRARIES = qemu-block.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+qemu_block_la_LDFLAGS = -module -avoid-version
+qemu_block_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GLIB_LIBS) -lz -lrt
+
+qemu_block_la_SOURCES_qemu = \
+ $(CONTRIBDIR)/qemu/qemu-coroutine.c \
+ $(CONTRIBDIR)/qemu/qemu-coroutine-lock.c \
+ $(CONTRIBDIR)/qemu/qemu-coroutine-sleep.c \
+ $(CONTRIBDIR)/qemu/coroutine-ucontext.c \
+ $(CONTRIBDIR)/qemu/block.c \
+ $(CONTRIBDIR)/qemu/nop-symbols.c
+
+qemu_block_la_SOURCES_qemu_util = \
+ $(CONTRIBDIR)/qemu/util/aes.c \
+ $(CONTRIBDIR)/qemu/util/bitmap.c \
+ $(CONTRIBDIR)/qemu/util/bitops.c \
+ $(CONTRIBDIR)/qemu/util/cutils.c \
+ $(CONTRIBDIR)/qemu/util/error.c \
+ $(CONTRIBDIR)/qemu/util/hbitmap.c \
+ $(CONTRIBDIR)/qemu/util/iov.c \
+ $(CONTRIBDIR)/qemu/util/module.c \
+ $(CONTRIBDIR)/qemu/util/oslib-posix.c \
+ $(CONTRIBDIR)/qemu/util/qemu-option.c \
+ $(CONTRIBDIR)/qemu/util/qemu-error.c \
+ $(CONTRIBDIR)/qemu/util/qemu-thread-posix.c \
+ $(CONTRIBDIR)/qemu/util/unicode.c \
+ $(CONTRIBDIR)/qemu/util/hexdump.c
+
+qemu_block_la_SOURCES_qemu_block = \
+ $(CONTRIBDIR)/qemu/block/snapshot.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-cache.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-cluster.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-refcount.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-snapshot.c \
+ $(CONTRIBDIR)/qemu/block/qcow2.c \
+ $(CONTRIBDIR)/qemu/block/qed-check.c \
+ $(CONTRIBDIR)/qemu/block/qed-cluster.c \
+ $(CONTRIBDIR)/qemu/block/qed-gencb.c \
+ $(CONTRIBDIR)/qemu/block/qed-l2-cache.c \
+ $(CONTRIBDIR)/qemu/block/qed-table.c \
+ $(CONTRIBDIR)/qemu/block/qed.c
+
+qemu_block_la_SOURCES_qemu_qobject = \
+ $(CONTRIBDIR)/qemu/qobject/json-lexer.c \
+ $(CONTRIBDIR)/qemu/qobject/json-parser.c \
+ $(CONTRIBDIR)/qemu/qobject/json-streamer.c \
+ $(CONTRIBDIR)/qemu/qobject/qbool.c \
+ $(CONTRIBDIR)/qemu/qobject/qdict.c \
+ $(CONTRIBDIR)/qemu/qobject/qerror.c \
+ $(CONTRIBDIR)/qemu/qobject/qfloat.c \
+ $(CONTRIBDIR)/qemu/qobject/qint.c \
+ $(CONTRIBDIR)/qemu/qobject/qjson.c \
+ $(CONTRIBDIR)/qemu/qobject/qlist.c \
+ $(CONTRIBDIR)/qemu/qobject/qstring.c
+
+qemu_block_la_SOURCES = \
+ $(qemu_block_la_SOURCES_qemu) \
+ $(qemu_block_la_SOURCES_qemu_util) \
+ $(qemu_block_la_SOURCES_qemu_block) \
+ $(qemu_block_la_SOURCES_qemu_qobject) \
+ bdrv-xlator.c \
+ coroutine-synctask.c \
+ bh-syncop.c \
+ monitor-logging.c \
+ clock-timer.c \
+ qemu-block.c \
+ qb-coroutines.c
+
+noinst_HEADERS_qemu = \
+ $(CONTRIBDIR)/qemu/config-host.h \
+ $(CONTRIBDIR)/qemu/qapi-types.h \
+ $(CONTRIBDIR)/qemu/qmp-commands.h \
+ $(CONTRIBDIR)/qemu/trace/generated-tracers.h \
+ $(CONTRIBDIR)/qemu/include/config.h \
+ $(CONTRIBDIR)/qemu/include/glib-compat.h \
+ $(CONTRIBDIR)/qemu/include/qemu-common.h \
+ $(CONTRIBDIR)/qemu/include/trace.h \
+ $(CONTRIBDIR)/qemu/include/block/coroutine.h \
+ $(CONTRIBDIR)/qemu/include/block/aio.h \
+ $(CONTRIBDIR)/qemu/include/block/block.h \
+ $(CONTRIBDIR)/qemu/include/block/block_int.h \
+ $(CONTRIBDIR)/qemu/include/block/blockjob.h \
+ $(CONTRIBDIR)/qemu/include/block/coroutine.h \
+ $(CONTRIBDIR)/qemu/include/block/coroutine_int.h \
+ $(CONTRIBDIR)/qemu/include/block/snapshot.h \
+ $(CONTRIBDIR)/qemu/include/exec/cpu-common.h \
+ $(CONTRIBDIR)/qemu/include/exec/hwaddr.h \
+ $(CONTRIBDIR)/qemu/include/exec/poison.h \
+ $(CONTRIBDIR)/qemu/include/fpu/softfloat.h \
+ $(CONTRIBDIR)/qemu/include/migration/migration.h \
+ $(CONTRIBDIR)/qemu/include/migration/qemu-file.h \
+ $(CONTRIBDIR)/qemu/include/migration/vmstate.h \
+ $(CONTRIBDIR)/qemu/include/monitor/monitor.h \
+ $(CONTRIBDIR)/qemu/include/monitor/readline.h \
+ $(CONTRIBDIR)/qemu/include/qapi/error.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/json-lexer.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/json-parser.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/json-streamer.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qbool.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qdict.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qerror.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qfloat.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qint.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qjson.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qlist.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qobject.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qstring.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/types.h \
+ $(CONTRIBDIR)/qemu/include/qemu/aes.h \
+ $(CONTRIBDIR)/qemu/include/qemu/atomic.h \
+ $(CONTRIBDIR)/qemu/include/qemu/bitmap.h \
+ $(CONTRIBDIR)/qemu/include/qemu/bitops.h \
+ $(CONTRIBDIR)/qemu/include/qemu/bswap.h \
+ $(CONTRIBDIR)/qemu/include/qemu/compiler.h \
+ $(CONTRIBDIR)/qemu/include/qemu/error-report.h \
+ $(CONTRIBDIR)/qemu/include/qemu/event_notifier.h \
+ $(CONTRIBDIR)/qemu/include/qemu/hbitmap.h \
+ $(CONTRIBDIR)/qemu/include/qemu/host-utils.h \
+ $(CONTRIBDIR)/qemu/include/qemu/iov.h \
+ $(CONTRIBDIR)/qemu/include/qemu/main-loop.h \
+ $(CONTRIBDIR)/qemu/include/qemu/module.h \
+ $(CONTRIBDIR)/qemu/include/qemu/notify.h \
+ $(CONTRIBDIR)/qemu/include/qemu/option.h \
+ $(CONTRIBDIR)/qemu/include/qemu/option_int.h \
+ $(CONTRIBDIR)/qemu/include/qemu/osdep.h \
+ $(CONTRIBDIR)/qemu/include/qemu/queue.h \
+ $(CONTRIBDIR)/qemu/include/qemu/sockets.h \
+ $(CONTRIBDIR)/qemu/include/qemu/thread-posix.h \
+ $(CONTRIBDIR)/qemu/include/qemu/thread.h \
+ $(CONTRIBDIR)/qemu/include/qemu/timer.h \
+ $(CONTRIBDIR)/qemu/include/qemu/typedefs.h \
+ $(CONTRIBDIR)/qemu/include/sysemu/sysemu.h \
+ $(CONTRIBDIR)/qemu/include/sysemu/os-posix.h \
+ $(CONTRIBDIR)/qemu/block/qcow2.h \
+ $(CONTRIBDIR)/qemu/block/qed.h
+
+noinst_HEADERS = \
+ $(noinst_HEADERS_qemu) \
+ qemu-block.h \
+ qemu-block-memory-types.h \
+ qb-coroutines.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(CONTRIBDIR)/qemu \
+ -I$(CONTRIBDIR)/qemu/include \
+ -DGLUSTER_XLATOR
+
+AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) $(GLIB_CFLAGS)
+
+CLEANFILES =
+
+endif
diff --git a/xlators/features/qemu-block/src/bdrv-xlator.c b/xlators/features/qemu-block/src/bdrv-xlator.c
new file mode 100644
index 000000000..1e55b5fb7
--- /dev/null
+++ b/xlators/features/qemu-block/src/bdrv-xlator.c
@@ -0,0 +1,389 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "inode.h"
+#include "syncop.h"
+#include "qemu-block.h"
+#include "block/block_int.h"
+
+typedef struct BDRVGlusterState {
+ inode_t *inode;
+} BDRVGlusterState;
+
+static QemuOptsList runtime_opts = {
+ .name = "gluster",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "GFID of file",
+ },
+ { /* end of list */ }
+ },
+};
+
+/* Inverse of qb_inode_to_filename(); NULL if the prefix does not match. */
+inode_t *
+qb_inode_from_filename (const char *filename)
+{
+        static const char prefix[] = "gluster://inodep:";
+        void             *ptr      = NULL;  /* "%p" stores through a void ** */
+
+        if (filename && !strncmp (filename, prefix, sizeof (prefix) - 1))
+                sscanf (filename + sizeof (prefix) - 1, "%p", &ptr);
+        return ptr;
+}
+
+
+int
+qb_inode_to_filename (inode_t *inode, char *filename, int size)
+{
+ return snprintf (filename, size, "gluster://inodep:%p", inode);
+}
+
+
+static fd_t *
+fd_from_bs (BlockDriverState *bs)
+{
+ BDRVGlusterState *s = bs->opaque;
+
+ return fd_anonymous (s->inode);
+}
+
+
+/* Resolve the "filename" option to an inode and pin it in bs->opaque.
+ * Returns 0, or -EINVAL/-ENOMEM/the syncop_lookup() result on failure. */
+static int
+qemu_gluster_open (BlockDriverState *bs, QDict *options, int bdrv_flags)
+{
+        BDRVGlusterState *s         = bs->opaque;
+        qb_conf_t        *conf      = THIS->private;
+        QemuOpts         *opts      = NULL;
+        Error            *local_err = NULL;
+        const char       *filename  = NULL;
+        inode_t          *inode     = NULL;
+        char              gfid_str[128];
+        int               ret       = -EINVAL;
+
+        opts = qemu_opts_create_nofail(&runtime_opts);
+        qemu_opts_absorb_qdict(opts, options, &local_err);
+        if (error_is_set(&local_err)) {
+                qerror_report_err(local_err);
+                error_free(local_err);
+                goto out;
+        }
+
+        filename = qemu_opt_get(opts, "filename");
+        if (!filename)
+                goto out;
+
+        /* gfid:<gfid> format means we're opening a backing image.  The width
+         * bounds the copy into gfid_str; an EOF (-1) result is not a match. */
+        if (sscanf(filename, "gluster://gfid:%127s", gfid_str) == 1) {
+                loc_t       loc = {0,};
+                struct iatt buf = {0,};
+                uuid_t      gfid;
+
+                if (uuid_parse(gfid_str, gfid))
+                        goto out;
+                loc.inode = inode_find(conf->root_inode->table, gfid);
+                if (!loc.inode) {
+                        loc.inode = inode_new(conf->root_inode->table);
+                        if (!loc.inode) {
+                                ret = -ENOMEM;
+                                goto out;
+                        }
+                        uuid_copy(loc.inode->gfid, gfid);
+                }
+
+                uuid_copy(loc.gfid, loc.inode->gfid);
+                ret = syncop_lookup(FIRST_CHILD(THIS), &loc, NULL, &buf, NULL,
+                                    NULL);
+                if (!ret)
+                        s->inode = inode_ref(loc.inode);
+                loc_wipe(&loc);
+        } else if ((inode = qb_inode_from_filename (filename)) != NULL) {
+                s->inode = inode_ref(inode);
+                ret = 0;
+        }
+out:
+        qemu_opts_del(opts);  /* released last: filename came from opts */
+        return ret;
+}
+
+
+static int
+qemu_gluster_create (const char *filename, QEMUOptionParameter *options)
+{
+ uint64_t total_size = 0;
+ inode_t *inode = NULL;
+ fd_t *fd = NULL;
+ struct iatt stat = {0, };
+ int ret = 0;
+
+ inode = qb_inode_from_filename (filename);
+ if (!inode)
+ return -EINVAL;
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / BDRV_SECTOR_SIZE;
+ }
+ options++;
+ }
+
+ fd = fd_anonymous (inode);
+ if (!fd)
+ return -ENOMEM;
+
+ ret = syncop_fstat (FIRST_CHILD(THIS), fd, &stat);
+ if (ret) {
+ fd_unref (fd);
+ return ret;
+ }
+
+ if (stat.ia_size) {
+ /* format ONLY if the filesize is 0 bytes */
+ fd_unref (fd);
+ return -EFBIG;
+ }
+
+ if (total_size) {
+ ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, total_size);
+ if (ret) {
+ fd_unref (fd);
+ return ret;
+ }
+ }
+
+ fd_unref (fd);
+ return 0;
+}
+
+
+static int
+qemu_gluster_co_readv (BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ QEMUIOVector *qiov)
+{
+ fd_t *fd = NULL;
+ off_t offset = 0;
+ size_t size = 0;
+ struct iovec *iov = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+
+ fd = fd_from_bs (bs);
+ if (!fd)
+ return -EIO;
+
+ offset = sector_num * BDRV_SECTOR_SIZE;
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+
+ ret = syncop_readv (FIRST_CHILD(THIS), fd, size, offset, 0,
+ &iov, &count, &iobref);
+ if (ret < 0)
+ goto out;
+
+ iov_copy (qiov->iov, qiov->niov, iov, count); /* *choke!* */
+
+out:
+ GF_FREE (iov);
+ if (iobref)
+ iobref_unref (iobref);
+ fd_unref (fd);
+ return ret;
+}
+
+
+static int
+qemu_gluster_co_writev (BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ QEMUIOVector *qiov)
+{
+ fd_t *fd = NULL;
+ off_t offset = 0;
+ size_t size = 0;
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iovec iov = {0, };
+ int ret = -ENOMEM;
+
+ fd = fd_from_bs (bs);
+ if (!fd)
+ return -EIO;
+
+ offset = sector_num * BDRV_SECTOR_SIZE;
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+
+ iobuf = iobuf_get2 (THIS->ctx->iobuf_pool, size);
+ if (!iobuf)
+ goto out;
+
+ iobref = iobref_new ();
+ if (!iobref) {
+ goto out;
+ }
+
+ iobref_add (iobref, iobuf);
+
+ iov_unload (iobuf_ptr (iobuf), qiov->iov, qiov->niov); /* *choke!* */
+
+ iov.iov_base = iobuf_ptr (iobuf);
+ iov.iov_len = size;
+
+ ret = syncop_writev (FIRST_CHILD(THIS), fd, &iov, 1, offset, iobref, 0);
+
+out:
+ if (iobuf)
+ iobuf_unref (iobuf);
+ if (iobref)
+ iobref_unref (iobref);
+ fd_unref (fd);
+ return ret;
+}
+
+
+/* Flush via syncop_flush on an anonymous fd; -EIO if no fd is available. */
+static int
+qemu_gluster_co_flush (BlockDriverState *bs)
+{
+        fd_t *fd  = fd_from_bs (bs);
+        int   ret = 0;
+
+        if (!fd)        /* same guard as the readv/writev paths */
+                return -EIO;
+
+        ret = syncop_flush (FIRST_CHILD(THIS), fd);
+        fd_unref (fd);
+        return ret;
+}
+
+
+/* Sync via syncop_fsync on an anonymous fd; -EIO if no fd is available. */
+static int
+qemu_gluster_co_fsync (BlockDriverState *bs)
+{
+        fd_t *fd  = fd_from_bs (bs);
+        int   ret = 0;
+
+        if (!fd)        /* same guard as the readv/writev paths */
+                return -EIO;
+
+        ret = syncop_fsync (FIRST_CHILD(THIS), fd, 0);
+        fd_unref (fd);
+        return ret;
+}
+
+
+/* Resize the image via syncop_ftruncate; -EIO if no fd is available. */
+static int
+qemu_gluster_truncate (BlockDriverState *bs, int64_t offset)
+{
+        fd_t *fd  = fd_from_bs (bs);
+        int   ret = 0;
+
+        if (!fd)        /* same guard as the readv/writev paths */
+                return -EIO;
+
+        ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, offset);
+        fd_unref (fd);
+        return ret;
+}
+
+
+/* Image size in bytes as reported by syncop_fstat; -1 on any failure. */
+static int64_t
+qemu_gluster_getlength (BlockDriverState *bs)
+{
+        struct iatt  iatt = {0, };
+        int          ret  = 0;
+        fd_t        *fd   = fd_from_bs (bs);
+
+        if (!fd)
+                return -1;
+        ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt);
+        fd_unref (fd);  /* balance fd_from_bs(), as the I/O paths do */
+
+        return (ret < 0) ? -1 : (int64_t) iatt.ia_size;
+}
+
+
+/* Allocated bytes (ia_blocks * 512) from syncop_fstat; -1 on any failure. */
+static int64_t
+qemu_gluster_allocated_file_size (BlockDriverState *bs)
+{
+        struct iatt  iatt = {0, };
+        int          ret  = 0;
+        fd_t        *fd   = fd_from_bs (bs);
+
+        if (!fd)
+                return -1;
+        ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt);
+        fd_unref (fd);  /* balance fd_from_bs(), as the I/O paths do */
+
+        return (ret < 0) ? -1 : (int64_t) (iatt.ia_blocks * 512);
+}
+
+
+static void
+qemu_gluster_close (BlockDriverState *bs)
+{
+ BDRVGlusterState *s = NULL;
+
+ s = bs->opaque;
+
+ inode_unref (s->inode);
+
+ return;
+}
+
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
+
+static BlockDriver bdrv_gluster = {
+ .format_name = "gluster",
+ .protocol_name = "gluster",
+ .instance_size = sizeof(BDRVGlusterState),
+ .bdrv_file_open = qemu_gluster_open,
+ .bdrv_close = qemu_gluster_close,
+ .bdrv_create = qemu_gluster_create,
+ .bdrv_getlength = qemu_gluster_getlength,
+ .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_co_readv = qemu_gluster_co_readv,
+ .bdrv_co_writev = qemu_gluster_co_writev,
+ .bdrv_co_flush_to_os = qemu_gluster_co_flush,
+ .bdrv_co_flush_to_disk = qemu_gluster_co_fsync,
+ .bdrv_truncate = qemu_gluster_truncate,
+ .create_options = qemu_gluster_create_options,
+};
+
+
+static void bdrv_gluster_init(void)
+{
+ bdrv_register(&bdrv_gluster);
+}
+
+
+block_init(bdrv_gluster_init);
diff --git a/xlators/features/qemu-block/src/bh-syncop.c b/xlators/features/qemu-block/src/bh-syncop.c
new file mode 100644
index 000000000..e8686f6d4
--- /dev/null
+++ b/xlators/features/qemu-block/src/bh-syncop.c
@@ -0,0 +1,48 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "syncop.h"
+#include "qemu-block-memory-types.h"
+
+#include "block/aio.h"
+
+void
+qemu_bh_schedule (QEMUBH *bh)
+{
+ return;
+}
+
+void
+qemu_bh_cancel (QEMUBH *bh)
+{
+ return;
+}
+
+void
+qemu_bh_delete (QEMUBH *bh)
+{
+
+}
+
+QEMUBH *
+qemu_bh_new (QEMUBHFunc *cb, void *opaque)
+{
+ return NULL;
+}
diff --git a/xlators/features/qemu-block/src/clock-timer.c b/xlators/features/qemu-block/src/clock-timer.c
new file mode 100644
index 000000000..fcbec6ad1
--- /dev/null
+++ b/xlators/features/qemu-block/src/clock-timer.c
@@ -0,0 +1,60 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "syncop.h"
+#include "qemu-block-memory-types.h"
+
+#include "qemu/timer.h"
+
+QEMUClock *vm_clock;
+int use_rt_clock = 0;
+
+QEMUTimer *qemu_new_timer (QEMUClock *clock, int scale,
+ QEMUTimerCB *cb, void *opaque)
+{
+ return NULL;
+}
+
+int64_t qemu_get_clock_ns (QEMUClock *clock)
+{
+ return 0;
+}
+
+void qemu_mod_timer (QEMUTimer *ts, int64_t expire_time)
+{
+ return;
+}
+
+void qemu_free_timer (QEMUTimer *ts)
+{
+
+}
+
+void qemu_del_timer (QEMUTimer *ts)
+{
+
+}
+
+bool qemu_aio_wait()
+{
+ synctask_wake (synctask_get());
+ synctask_yield (synctask_get());
+ return 0;
+}
diff --git a/xlators/features/qemu-block/src/coroutine-synctask.c b/xlators/features/qemu-block/src/coroutine-synctask.c
new file mode 100644
index 000000000..e43988a95
--- /dev/null
+++ b/xlators/features/qemu-block/src/coroutine-synctask.c
@@ -0,0 +1,116 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "syncop.h"
+#include "qemu-block-memory-types.h"
+
+#include "qemu-block.h"
+
+/*
+ * This code serves as the bridge from the main glusterfs context to the qemu
+ * coroutine context via synctask. We create a single threaded syncenv with a
+ * single synctask responsible for processing a queue of coroutines. The qemu
+ * code invoked from within the synctask function handlers uses the ucontext
+ * coroutine implementation and scheduling logic internal to qemu. This
+ * effectively donates a thread of execution to qemu and its internal coroutine
+ * management.
+ *
+ * NOTE: The existence of concurrent synctasks has proven quite racy with regard
+ * to qemu coroutine management, particularly related to the lifecycle
+ * differences between top-level synctasks and internally created coroutines and
+ * interactions with qemu-internal queues (and locks, in turn). We explicitly
+ * disallow this scenario, via the queue, until it is better supported.
+ */
+
+static struct {
+ struct list_head queue;
+ gf_lock_t lock;
+ struct synctask *task;
+} qb_co;
+
+static void
+init_qbco()
+{
+ INIT_LIST_HEAD(&qb_co.queue);
+ LOCK_INIT(&qb_co.lock);
+}
+
+static int
+synctask_nop_cbk (int ret, call_frame_t *frame, void *opaque)
+{
+ return 0;
+}
+
+static int
+qb_synctask_wrap (void *opaque)
+{
+ qb_local_t *qb_local, *tmp;
+
+ LOCK(&qb_co.lock);
+
+ while (!list_empty(&qb_co.queue)) {
+ list_for_each_entry_safe(qb_local, tmp, &qb_co.queue, list) {
+ list_del_init(&qb_local->list);
+ break;
+ }
+
+ UNLOCK(&qb_co.lock);
+
+ qb_local->synctask_fn(qb_local);
+ /* qb_local is now unwound and gone! */
+
+ LOCK(&qb_co.lock);
+ }
+
+ qb_co.task = NULL;
+
+ UNLOCK(&qb_co.lock);
+
+ return 0;
+}
+
+int
+qb_coroutine (call_frame_t *frame, synctask_fn_t fn)
+{
+ qb_local_t *qb_local = NULL;
+ qb_conf_t *qb_conf = NULL;
+ static int init = 0;
+
+ qb_local = frame->local;
+ qb_local->synctask_fn = fn;
+ qb_conf = frame->this->private;
+
+ if (!init) {
+ init = 1;
+ init_qbco();
+ }
+
+ LOCK(&qb_co.lock);
+
+ if (!qb_co.task)
+ qb_co.task = synctask_create(qb_conf->env, qb_synctask_wrap,
+ synctask_nop_cbk, frame, NULL);
+
+ list_add_tail(&qb_local->list, &qb_co.queue);
+
+ UNLOCK(&qb_co.lock);
+
+ return 0;
+}
diff --git a/xlators/features/qemu-block/src/monitor-logging.c b/xlators/features/qemu-block/src/monitor-logging.c
new file mode 100644
index 000000000..d37c37f0f
--- /dev/null
+++ b/xlators/features/qemu-block/src/monitor-logging.c
@@ -0,0 +1,50 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "qemu-block-memory-types.h"
+
+#include "block/block_int.h"
+
+Monitor *cur_mon;
+
+int
+monitor_cur_is_qmp()
+{
+ /* No QMP support here */
+ return 0;
+}
+
+void
+monitor_set_error (Monitor *mon, QError *qerror)
+{
+ /* NOP here */
+ return;
+}
+
+
+void
+monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)
+{
+ char buf[4096];
+
+ vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ gf_log (THIS->name, GF_LOG_ERROR, "%s", buf);
+}
diff --git a/xlators/features/qemu-block/src/qb-coroutines.c b/xlators/features/qemu-block/src/qb-coroutines.c
new file mode 100644
index 000000000..974312f12
--- /dev/null
+++ b/xlators/features/qemu-block/src/qb-coroutines.c
@@ -0,0 +1,667 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "inode.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "qemu-block-memory-types.h"
+#include "qemu-block.h"
+#include "qb-coroutines.h"
+
+
+/*
+ * Synctask body for a "trusted.glusterfs.block-format" setxattr.
+ *
+ * @opaque is the request's qb_local_t. Steps:
+ *   1. If the inode context names a backing image (by gfid or by file
+ *      name), look it up on the first child to verify it exists and to
+ *      learn its gfid.
+ *   2. Create the image with bdrv_img_create(), passing
+ *      "gluster://gfid:<gfid>" as the base image when one was given.
+ *   3. Store local->fmt under qb_conf->qb_xattr_key through an anonymous
+ *      fd on the inode.
+ *
+ * The stub is unwound on every failure and on success; the function
+ * itself always returns 0.
+ */
+int
+qb_format_and_resume (void *opaque)
+{
+        qb_local_t   *local = NULL;
+        call_frame_t *frame = NULL;
+        call_stub_t  *stub = NULL;
+        inode_t      *inode = NULL;
+        char          filename[64];
+        char          base_filename[128];
+        int           use_base = 0;
+        qb_inode_t   *qb_inode = NULL;
+        Error        *local_err = NULL;
+        fd_t         *fd = NULL;
+        dict_t       *xattr = NULL;
+        qb_conf_t    *qb_conf = NULL;
+        int           ret = -1;
+
+        local = opaque;
+        frame = local->frame;
+        stub = local->stub;
+        inode = local->inode;
+        qb_conf = frame->this->private;
+
+        qb_inode_to_filename (inode, filename, sizeof (filename));
+
+        qb_inode = qb_inode_ctx_get (frame->this, inode);
+
+        /*
+         * See if the caller specified a backing image.
+         */
+        if (!uuid_is_null(qb_inode->backing_gfid) || qb_inode->backing_fname) {
+                loc_t loc = {0,};
+                char gfid_str[64];
+                struct iatt buf;
+
+                if (!uuid_is_null(qb_inode->backing_gfid)) {
+                        loc.inode = inode_find(qb_conf->root_inode->table,
+                                               qb_inode->backing_gfid);
+                        if (!loc.inode) {
+                                loc.inode = inode_new(qb_conf->root_inode->table);
+                                if (loc.inode)
+                                        uuid_copy(loc.inode->gfid,
+                                                  qb_inode->backing_gfid);
+                        }
+                        if (loc.inode)
+                                uuid_copy(loc.gfid, loc.inode->gfid);
+                } else if (qb_inode->backing_fname) {
+                        loc.inode = inode_new(qb_conf->root_inode->table);
+                        if (loc.inode) {
+                                loc.name = qb_inode->backing_fname;
+                                loc.parent = inode_parent(inode, NULL, NULL);
+                                loc_path(&loc, loc.name);
+                        }
+                }
+
+                /*
+                 * Lookup the backing image. Verify existence and/or get the
+                 * gfid if we don't already have it. A failed inode
+                 * allocation is reported as ENOMEM instead of being
+                 * dereferenced.
+                 */
+                if (loc.inode)
+                        ret = syncop_lookup(FIRST_CHILD(frame->this), &loc,
+                                            NULL, &buf, NULL, NULL);
+                else
+                        ret = -ENOMEM;
+
+                /*
+                 * The name is consumed by this attempt. Clear the pointer
+                 * after freeing it so a later format request on the same
+                 * inode context cannot use or free the stale buffer again.
+                 */
+                GF_FREE(qb_inode->backing_fname);
+                qb_inode->backing_fname = NULL;
+
+                if (ret) {
+                        loc_wipe(&loc);
+                        ret = -ret;
+                        goto err;
+                }
+
+                uuid_copy(qb_inode->backing_gfid, buf.ia_gfid);
+                loc_wipe(&loc);
+
+                /*
+                 * We pass the filename of the backing image into the qemu block
+                 * subsystem as the associated gfid. This is embedded into the
+                 * clone image and passed along to the gluster bdrv backend when
+                 * the block subsystem needs to operate on the backing image on
+                 * behalf of the clone.
+                 */
+                uuid_unparse(qb_inode->backing_gfid, gfid_str);
+                snprintf(base_filename, sizeof(base_filename),
+                         "gluster://gfid:%s", gfid_str);
+                use_base = 1;
+        }
+
+        bdrv_img_create (filename, qb_inode->fmt,
+                         use_base ? base_filename : NULL, 0, 0, qb_inode->size,
+                         0, &local_err, true);
+
+        if (error_is_set (&local_err)) {
+                gf_log (frame->this->name, GF_LOG_ERROR, "%s",
+                        error_get_pretty (local_err));
+                error_free (local_err);
+                QB_STUB_UNWIND (stub, -1, EIO);
+                return 0;
+        }
+
+        fd = fd_anonymous (inode);
+        if (!fd) {
+                gf_log (frame->this->name, GF_LOG_ERROR,
+                        "could not create anonymous fd for %s",
+                        uuid_utoa (inode->gfid));
+                QB_STUB_UNWIND (stub, -1, ENOMEM);
+                return 0;
+        }
+
+        xattr = dict_new ();
+        if (!xattr) {
+                gf_log (frame->this->name, GF_LOG_ERROR,
+                        "could not allocate xattr dict for %s",
+                        uuid_utoa (inode->gfid));
+                QB_STUB_UNWIND (stub, -1, ENOMEM);
+                fd_unref (fd);
+                return 0;
+        }
+
+        ret = dict_set_str (xattr, qb_conf->qb_xattr_key, local->fmt);
+        if (ret) {
+                gf_log (frame->this->name, GF_LOG_ERROR,
+                        "could not dict_set for %s",
+                        uuid_utoa (inode->gfid));
+                QB_STUB_UNWIND (stub, -1, ENOMEM);
+                fd_unref (fd);
+                dict_unref (xattr);
+                return 0;
+        }
+
+        ret = syncop_fsetxattr (FIRST_CHILD(THIS), fd, xattr, 0);
+        if (ret) {
+                gf_log (frame->this->name, GF_LOG_ERROR,
+                        "failed to setxattr for %s",
+                        uuid_utoa (inode->gfid));
+                QB_STUB_UNWIND (stub, -1, -ret);
+                fd_unref (fd);
+                dict_unref (xattr);
+                return 0;
+        }
+
+        fd_unref (fd);
+        dict_unref (xattr);
+
+        QB_STUB_UNWIND (stub, 0, 0);
+
+        return 0;
+
+err:
+        /* ret holds a positive errno value here */
+        QB_STUB_UNWIND(stub, -1, ret);
+        return 0;
+}
+
+
+/*
+ * Allocate a BlockDriverState named after the inode's gfid and open the
+ * inode's image on it with the block driver registered for @fmt.
+ *
+ * Returns the opened state, or NULL with errno set to ENOMEM (allocation
+ * failed), EINVAL (unknown format) or the error reported by bdrv_open().
+ * A state that was allocated but could not be opened is released with
+ * bdrv_delete() so that failed attempts do not leak it.
+ */
+static BlockDriverState *
+qb_bs_create (inode_t *inode, const char *fmt)
+{
+        char              filename[64];
+        BlockDriverState *bs = NULL;
+        BlockDriver      *drv = NULL;
+        int               op_errno = 0;
+        int               ret = 0;
+
+        bs = bdrv_new (uuid_utoa (inode->gfid));
+        if (!bs) {
+                op_errno = ENOMEM;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "could not allocate @bdrv for gfid:%s",
+                        uuid_utoa (inode->gfid));
+                goto err;
+        }
+
+        drv = bdrv_find_format (fmt);
+        if (!drv) {
+                op_errno = EINVAL;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "Unknown file format: %s for gfid:%s",
+                        fmt, uuid_utoa (inode->gfid));
+                goto err;
+        }
+
+        qb_inode_to_filename (inode, filename, sizeof (filename));
+
+        ret = bdrv_open (bs, filename, NULL, BDRV_O_RDWR, drv);
+        if (ret < 0) {
+                op_errno = -ret;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "Unable to bdrv_open() gfid:%s (%s)",
+                        uuid_utoa (inode->gfid), strerror (op_errno));
+                goto err;
+        }
+
+        return bs;
+err:
+        if (bs)
+                bdrv_delete (bs);
+        /* set errno last so the cleanup above cannot overwrite it */
+        errno = op_errno;
+        return NULL;
+}
+
+/*
+ * Synctask body for open on a block-formatted inode.
+ *
+ * @opaque is the request's qb_local_t. Makes sure the inode context has a
+ * BlockDriverState (creating it with qb_bs_create() when missing), bumps
+ * the context's refcnt and resumes the stub. If the state cannot be
+ * created the stub is unwound with -1 and the errno left by
+ * qb_bs_create(). Always returns 0.
+ */
+int
+qb_co_open (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+ qb_inode->refcnt++;
+
+ QB_STUB_RESUME (stub);
+
+ return 0;
+}
+
+/*
+ * Synctask body for writev on a block-formatted inode.
+ *
+ * @opaque is the request's qb_local_t. Ensures the inode context has a
+ * BlockDriverState, wraps stub->args.vector/count in a QEMUIOVector and
+ * writes it at stub->args.offset with bdrv_pwritev(). The stub is unwound
+ * with the value bdrv_pwritev() returned, or with -1 and the negated
+ * return value when that is negative. Always returns 0.
+ */
+int
+qb_co_writev (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ QEMUIOVector qiov = {0, };
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ qemu_iovec_init_external (&qiov, stub->args.vector, stub->args.count);
+
+ ret = bdrv_pwritev (qb_inode->bs, stub->args.offset, &qiov);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Synctask body for readv on a block-formatted inode.
+ *
+ * @opaque is the request's qb_local_t. Ensures the inode context has a
+ * BlockDriverState, then reads stub->args.size bytes at stub->args.offset
+ * with bdrv_pread() into a freshly allocated iobuf and unwinds the stub
+ * with a one-element vector describing the bytes read. A read starting at
+ * or beyond the cached image size unwinds with 0 bytes. Failures unwind
+ * with -1 and ENOMEM or the negated bdrv_pread() result. Always returns 0.
+ *
+ * Reference handling: once iobref_add() succeeds the iobref holds its own
+ * reference on the iobuf, so the reference obtained from iobuf_get2() is
+ * dropped immediately; the buffer then lives exactly as long as the
+ * iobref does.
+ */
+int
+qb_co_readv (void *opaque)
+{
+        qb_local_t    *local = NULL;
+        call_frame_t  *frame = NULL;
+        call_stub_t   *stub = NULL;
+        inode_t       *inode = NULL;
+        qb_inode_t    *qb_inode = NULL;
+        struct iobuf  *iobuf = NULL;
+        struct iobref *iobref = NULL;
+        struct iovec   iov = {0, };
+        int            ret = 0;
+
+        local = opaque;
+        frame = local->frame;
+        stub = local->stub;
+        inode = local->inode;
+
+        qb_inode = qb_inode_ctx_get (frame->this, inode);
+        if (!qb_inode->bs) {
+                /* FIXME: we need locks around this when
+                   enabling multithreaded syncop/coroutine
+                   for qemu-block
+                */
+
+                qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+                if (!qb_inode->bs) {
+                        QB_STUB_UNWIND (stub, -1, errno);
+                        return 0;
+                }
+        }
+
+        if (stub->args.offset >= qb_inode->size) {
+                QB_STUB_UNWIND (stub, 0, 0);
+                return 0;
+        }
+
+        iobuf = iobuf_get2 (frame->this->ctx->iobuf_pool, stub->args.size);
+        if (!iobuf) {
+                QB_STUB_UNWIND (stub, -1, ENOMEM);
+                return 0;
+        }
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                QB_STUB_UNWIND (stub, -1, ENOMEM);
+                iobuf_unref (iobuf);
+                return 0;
+        }
+
+        if (iobref_add (iobref, iobuf) < 0) {
+                iobuf_unref (iobuf);
+                iobref_unref (iobref);
+                QB_STUB_UNWIND (stub, -1, ENOMEM);
+                return 0;
+        }
+
+        /* iobref now owns a reference; release ours so the buffer is not
+           leaked on either the error or the success path below */
+        iobuf_unref (iobuf);
+
+        ret = bdrv_pread (qb_inode->bs, stub->args.offset, iobuf_ptr (iobuf),
+                          stub->args.size);
+
+        if (ret < 0) {
+                QB_STUB_UNWIND (stub, -1, -ret);
+                iobref_unref (iobref);
+                return 0;
+        }
+
+        iov.iov_base = iobuf_ptr (iobuf);
+        iov.iov_len = ret;
+
+        stub->args_cbk.vector = iov_dup (&iov, 1);
+        if (!stub->args_cbk.vector) {
+                /* do not report data through a NULL vector */
+                QB_STUB_UNWIND (stub, -1, ENOMEM);
+                iobref_unref (iobref);
+                return 0;
+        }
+        stub->args_cbk.count = 1;
+        stub->args_cbk.iobref = iobref;
+
+        QB_STUB_UNWIND (stub, ret, 0);
+
+        return 0;
+}
+
+/*
+ * Synctask body that flushes a block-formatted inode; scheduled by both
+ * qb_fsync and qb_flush.
+ *
+ * @opaque is the request's qb_local_t. Ensures the inode context has a
+ * BlockDriverState and calls bdrv_flush() on it. The stub is unwound with
+ * the value returned, or with -1 and the negated value when negative.
+ * Always returns 0.
+ */
+int
+qb_co_fsync (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = bdrv_flush (qb_inode->bs);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Best-effort rewrite of the block-format xattr as "<fmt>:<offset>" on
+ * @fd via the first child. Nothing is reported to the caller: allocation
+ * failures return silently and the fsetxattr result is ignored.
+ */
+static void
+qb_update_size_xattr (xlator_t *this, fd_t *fd, const char *fmt, off_t offset)
+{
+        qb_conf_t *conf = this->private;
+        dict_t    *dict = NULL;
+        char       value[QB_XATTR_VAL_MAX];
+
+        snprintf (value, sizeof (value), "%s:%llu",
+                  fmt, (long long unsigned) offset);
+
+        dict = dict_new ();
+        if (dict == NULL)
+                return;
+
+        /* value lives on this stack frame; the dict is dropped before
+           returning, so it never outlives the buffer */
+        if (dict_set_str (dict, conf->qb_xattr_key, value) == 0)
+                syncop_fsetxattr (FIRST_CHILD(this), fd, dict, 0);
+
+        dict_unref (dict);
+}
+
+/*
+ * Synctask body for truncate/ftruncate on a block-formatted inode.
+ *
+ * @opaque is the request's qb_local_t. Ensures the inode context has a
+ * BlockDriverState, then:
+ *   - fstat()s local->fd on the first child into args_cbk.prestat and
+ *     overrides its size with the cached image size,
+ *   - resizes the image to stub->args.offset with bdrv_truncate(),
+ *   - refreshes the cached size from bdrv_getlength(),
+ *   - fills args_cbk.poststat the same way as prestat,
+ *   - rewrites the size xattr via qb_update_size_xattr().
+ * Any negative result unwinds the stub with -1 and the negated value;
+ * otherwise the stub is unwound with the last result. Always returns 0.
+ *
+ * NOTE(review): the value of bdrv_getlength() is stored without a check
+ * for a negative (error) result -- confirm whether that can occur here.
+ */
+int
+qb_co_truncate (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+ off_t offset = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = syncop_fstat (FIRST_CHILD(this), local->fd,
+ &stub->args_cbk.prestat);
+ if (ret < 0)
+ goto out;
+ stub->args_cbk.prestat.ia_size = qb_inode->size;
+
+ ret = bdrv_truncate (qb_inode->bs, stub->args.offset);
+ if (ret < 0)
+ goto out;
+
+ offset = bdrv_getlength (qb_inode->bs);
+
+ qb_inode->size = offset;
+
+ ret = syncop_fstat (FIRST_CHILD(this), local->fd,
+ &stub->args_cbk.poststat);
+ if (ret < 0)
+ goto out;
+ stub->args_cbk.poststat.ia_size = qb_inode->size;
+
+ qb_update_size_xattr (this, local->fd, qb_inode->fmt, qb_inode->size);
+
+out:
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+/*
+ * Synctask body that drops one reference on an inode's block state.
+ *
+ * @opaque is the request's qb_local_t. Decrements the inode context's
+ * refcnt; when it reaches zero the BlockDriverState is detached from the
+ * context and deleted with bdrv_delete(). Afterwards the local is freed
+ * and the frame's call stack destroyed; there is no stub to unwind.
+ * Always returns 0.
+ *
+ * NOTE(review): the inode context is dereferenced without a NULL check,
+ * and only qb_co_open (of the functions in this file) increments refcnt
+ * -- confirm callers pair this with a prior qb_co_open.
+ */
+int
+qb_co_close (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ BlockDriverState *bs = NULL;
+
+ local = opaque;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (THIS, inode);
+
+ if (!--qb_inode->refcnt) {
+ bs = qb_inode->bs;
+ qb_inode->bs = NULL;
+ bdrv_delete (bs);
+ }
+
+ frame = local->frame;
+ frame->local = NULL;
+ qb_local_free (THIS, local);
+ STACK_DESTROY (frame->root);
+
+ return 0;
+}
+
+/*
+ * Synctask body that creates an internal snapshot named local->name.
+ *
+ * @opaque is the request's qb_local_t. Ensures the inode context has a
+ * BlockDriverState, fills a zeroed QEMUSnapshotInfo with the name and the
+ * current time of day, and calls bdrv_snapshot_create(). The stub is
+ * unwound with the value returned, or with -1 and the negated value when
+ * negative. Always returns 0.
+ */
+int
+qb_snapshot_create (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ QEMUSnapshotInfo sn;
+ struct timeval tv = {0, };
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ memset (&sn, 0, sizeof (sn));
+ pstrcpy (sn.name, sizeof(sn.name), local->name);
+ gettimeofday (&tv, NULL);
+ sn.date_sec = tv.tv_sec;
+ sn.date_nsec = tv.tv_usec * 1000; /* microseconds -> nanoseconds */
+
+ ret = bdrv_snapshot_create (qb_inode->bs, &sn);
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+/*
+ * Synctask body that deletes the snapshot named local->name.
+ *
+ * @opaque is the request's qb_local_t. Ensures the inode context has a
+ * BlockDriverState and calls bdrv_snapshot_delete(). The stub is unwound
+ * with the value returned, or with -1 and the negated value when
+ * negative. Always returns 0.
+ */
+int
+qb_snapshot_delete (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = bdrv_snapshot_delete (qb_inode->bs, local->name);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+/*
+ * Synctask body that switches the image to the snapshot named
+ * local->name.
+ *
+ * @opaque is the request's qb_local_t. Ensures the inode context has a
+ * BlockDriverState and calls bdrv_snapshot_goto(). The stub is unwound
+ * with the value returned, or with -1 and the negated value when
+ * negative. Always returns 0.
+ */
+int
+qb_snapshot_goto (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = bdrv_snapshot_goto (qb_inode->bs, local->name);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
diff --git a/xlators/features/qemu-block/src/qb-coroutines.h b/xlators/features/qemu-block/src/qb-coroutines.h
new file mode 100644
index 000000000..583319f3b
--- /dev/null
+++ b/xlators/features/qemu-block/src/qb-coroutines.h
@@ -0,0 +1,30 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+/* Entry points implemented in qb-coroutines.c; each receives the request's qb_local_t as @opaque and returns 0. */
+#ifndef __QB_COROUTINES_H
+#define __QB_COROUTINES_H
+
+#include "syncop.h"
+#include "call-stub.h"
+#include "block/block_int.h"
+#include "monitor/monitor.h"
+
+int qb_format_and_resume (void *opaque); /* create the image and store the format xattr */
+int qb_snapshot_create (void *opaque);
+int qb_snapshot_delete (void *opaque);
+int qb_snapshot_goto (void *opaque);
+int qb_co_open (void *opaque);
+int qb_co_close (void *opaque);
+int qb_co_writev (void *opaque);
+int qb_co_readv (void *opaque);
+int qb_co_fsync (void *opaque); /* also scheduled for flush */
+int qb_co_truncate (void *opaque);
+
+#endif /* __QB_COROUTINES_H */
diff --git a/xlators/features/qemu-block/src/qemu-block-memory-types.h b/xlators/features/qemu-block/src/qemu-block-memory-types.h
new file mode 100644
index 000000000..267b3893f
--- /dev/null
+++ b/xlators/features/qemu-block/src/qemu-block-memory-types.h
@@ -0,0 +1,25 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __QB_MEM_TYPES_H__
+#define __QB_MEM_TYPES_H__
+
+#include "mem-types.h"
+/* Allocation type tags of the qemu-block translator (the last argument given to GF_CALLOC); numbering continues after gf_common_mt_end. */
+enum gf_qb_mem_types_ {
+ gf_qb_mt_qb_conf_t = gf_common_mt_end + 1,
+ gf_qb_mt_qb_inode_t, /* per-inode context (qb_inode_t) */
+ gf_qb_mt_qb_local_t, /* per-request local (qb_local_t) */
+ gf_qb_mt_coroutinesynctask_t,
+ gf_qb_mt_end
+};
+#endif
+
diff --git a/xlators/features/qemu-block/src/qemu-block.c b/xlators/features/qemu-block/src/qemu-block.c
new file mode 100644
index 000000000..48bbf3140
--- /dev/null
+++ b/xlators/features/qemu-block/src/qemu-block.c
@@ -0,0 +1,1140 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "inode.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "qemu-block-memory-types.h"
+#include "qemu-block.h"
+#include "qb-coroutines.h"
+
+
+/*
+ * Fetch this translator's context for @inode through __inode_ctx_get()
+ * and return it as a qb_inode_t pointer. Yields NULL when the stored
+ * value is 0.
+ */
+qb_inode_t *
+__qb_inode_ctx_get (xlator_t *this, inode_t *inode)
+{
+        uint64_t ctx = 0;
+
+        __inode_ctx_get (inode, this, &ctx);
+
+        return (qb_inode_t *)(unsigned long) ctx;
+}
+
+
+/*
+ * Same lookup as __qb_inode_ctx_get(), performed while holding
+ * inode->lock. Returns the qb_inode_t of @inode or NULL.
+ */
+qb_inode_t *
+qb_inode_ctx_get (xlator_t *this, inode_t *inode)
+{
+        qb_inode_t *ctx = NULL;
+
+        LOCK (&inode->lock);
+        ctx = __qb_inode_ctx_get (this, inode);
+        UNLOCK (&inode->lock);
+
+        return ctx;
+}
+
+
+/*
+ * Remove this translator's context from @inode with inode_ctx_del() and
+ * hand the detached qb_inode_t (or NULL) to the caller.
+ */
+qb_inode_t *
+qb_inode_ctx_del (xlator_t *this, inode_t *inode)
+{
+        uint64_t ctx = 0;
+
+        inode_ctx_del (inode, this, &ctx);
+
+        return (qb_inode_t *)(unsigned long) ctx;
+}
+
+
+/*
+ * Detach and free the block-format context of @inode, if it has one.
+ * When @warn is non-zero a warning is logged that the inode is no longer
+ * block formatted. Always returns 0.
+ */
+int
+qb_inode_cleanup (xlator_t *this, inode_t *inode, int warn)
+{
+        qb_inode_t *ctx = qb_inode_ctx_del (this, inode);
+
+        if (ctx != NULL) {
+                if (warn)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "inode %s no longer block formatted",
+                                uuid_utoa (inode->gfid));
+
+                /* only the context structure itself is released here;
+                   ctx->bs is left untouched */
+                GF_FREE (ctx);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Overwrite iatt->ia_size with the size cached in the inode's
+ * block-format context. Inodes without such a context are left
+ * untouched. Always returns 0.
+ */
+int
+qb_iatt_fixup (xlator_t *this, inode_t *inode, struct iatt *iatt)
+{
+        qb_inode_t *ctx = qb_inode_ctx_get (this, inode);
+
+        if (ctx != NULL)
+                iatt->ia_size = ctx->size;
+
+        return 0;
+}
+
+
+/*
+ * Parse a block-format xattr value and record it in the inode context.
+ *
+ * @format has the shape "<fmt>:<size>[:<backing>]" where <size> is parsed
+ * by gf_string2bytesize() and must be non-zero. <backing> is either
+ * "<gfid:UUID>" or, failing that, taken as a file name for the backing
+ * image. The inode's qb_inode_t is reused when present, otherwise
+ * allocated, and is stored with inode_ctx_set().
+ *
+ * Returns 0 on success, EINVAL for a malformed value (a warning is
+ * logged) and ENOMEM when an allocation fails.
+ */
+int
+qb_format_extract (xlator_t *this, char *format, inode_t *inode)
+{
+        char       *s = NULL;
+        char       *save = NULL;
+        uint64_t    size = 0;
+        uint64_t    ctx_value = 0;
+        char        fmt[QB_XATTR_VAL_MAX+1] = {0, };
+        qb_inode_t *qb_inode = NULL;
+        char       *formatstr = NULL;
+        uuid_t      gfid = {0,};
+        char        gfid_str[64] = {0,};
+        int         ret;
+
+        /* fmt is one byte larger than the copy limit and zero-filled, so
+           it stays NUL-terminated */
+        strncpy(fmt, format, QB_XATTR_VAL_MAX);
+
+        s = strtok_r(fmt, ":", &save);
+        if (!s)
+                goto invalid;
+        formatstr = gf_strdup(s);
+        if (!formatstr)
+                return ENOMEM;
+
+        s = strtok_r(NULL, ":", &save);
+        if (!s)
+                goto invalid;
+        if (gf_string2bytesize (s, &size))
+                goto invalid;
+        if (!size)
+                goto invalid;
+
+        /* empty delimiter set: take whatever is left as one token */
+        s = strtok_r(NULL, "\0", &save);
+        if (s && !strncmp(s, "<gfid:", strlen("<gfid:"))) {
+                /*
+                 * Check for valid gfid backing image specifier.
+                 */
+                if (strlen(s) + 1 > sizeof(gfid_str))
+                        goto invalid;
+                ret = sscanf(s, "<gfid:%[^>]s", gfid_str);
+                if (ret == 1) {
+                        ret = uuid_parse(gfid_str, gfid);
+                        if (ret < 0)
+                                goto invalid;
+                }
+        }
+
+        qb_inode = qb_inode_ctx_get (this, inode);
+        if (!qb_inode)
+                qb_inode = GF_CALLOC (1, sizeof (*qb_inode),
+                                      gf_qb_mt_qb_inode_t);
+        if (!qb_inode) {
+                GF_FREE(formatstr);
+                return ENOMEM;
+        }
+
+        strncpy(qb_inode->fmt, formatstr, QB_XATTR_VAL_MAX);
+        qb_inode->size = size;
+
+        /*
+         * If a backing gfid was not specified, interpret any remaining bytes
+         * associated with a backing image as a filename local to the parent
+         * directory. The format processing will validate further.
+         */
+        if (!uuid_is_null(gfid))
+                uuid_copy(qb_inode->backing_gfid, gfid);
+        else if (s)
+                qb_inode->backing_fname = gf_strdup(s);
+
+        /*
+         * The inode context API stores a 64-bit integer and reads it through
+         * a uint64_t pointer. Hand it a real uint64_t holding the pointer
+         * value rather than the address of the (possibly narrower) pointer
+         * variable; this mirrors the conversion done when the value is read
+         * back.
+         */
+        ctx_value = (uint64_t)(unsigned long) qb_inode;
+        inode_ctx_set (inode, this, &ctx_value);
+
+        GF_FREE(formatstr);
+
+        return 0;
+
+invalid:
+        GF_FREE(formatstr);
+
+        gf_log (this->name, GF_LOG_WARNING,
+                "invalid format '%s' in inode %s", format,
+                uuid_utoa (inode->gfid));
+        return EINVAL;
+}
+
+
+/*
+ * Release a request local: drop its inode and fd references when set,
+ * then free the structure. @this is not used.
+ */
+void
+qb_local_free (xlator_t *this, qb_local_t *local)
+{
+        inode_t *held_inode = local->inode;
+        fd_t    *held_fd = local->fd;
+
+        if (held_inode != NULL)
+                inode_unref (held_inode);
+
+        if (held_fd != NULL)
+                fd_unref (held_fd);
+
+        GF_FREE (local);
+}
+
+
+/*
+ * Allocate a zeroed qb_local_t, initialise its list head and link it
+ * with @frame in both directions (local->frame and frame->local).
+ * Returns 0 on success, -1 when the allocation fails.
+ */
+int
+qb_local_init (call_frame_t *frame)
+{
+        qb_local_t *local = NULL;
+
+        local = GF_CALLOC (1, sizeof (*local), gf_qb_mt_qb_local_t);
+        if (local == NULL)
+                return -1;
+
+        INIT_LIST_HEAD(&local->list);
+        local->frame = frame;
+        frame->local = local;
+
+        return 0;
+}
+
+/*
+ * Lookup callback.
+ *
+ * On a successful reply: takes and caches a reference to the root inode
+ * in conf->root_inode the first time a root gfid is seen, and, when
+ * @xdata carries conf->qb_xattr_key, parses the value into the inode
+ * context with qb_format_extract() and overrides the size in @buf via
+ * qb_iatt_fixup(). A parse failure turns the reply into -1 with the
+ * returned error. A key that is present with a NULL string drops the
+ * inode's context. The lookup is always unwound; returns 0.
+ */
+int
+qb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
+{
+ char *format = NULL;
+ qb_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (op_ret == -1)
+ goto out;
+
+ /*
+ * Cache the root inode for dealing with backing images. The format
+ * coroutine and the gluster qemu backend driver both use the root inode
+ * table to verify and/or redirect I/O to the backing image via
+ * anonymous fd's.
+ */
+ if (!conf->root_inode && __is_root_gfid(inode->gfid))
+ conf->root_inode = inode_ref(inode);
+
+ if (!xdata)
+ goto out;
+
+ if (dict_get_str (xdata, conf->qb_xattr_key, &format))
+ goto out;
+
+ if (!format) {
+ qb_inode_cleanup (this, inode, 1);
+ goto out;
+ }
+
+ op_errno = qb_format_extract (this, format, inode);
+ if (op_errno)
+ op_ret = -1;
+
+ qb_iatt_fixup (this, inode, buf);
+out:
+ QB_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
+/*
+ * lookup fop: add conf->qb_xattr_key to the request's xdata (creating the
+ * dict when the caller passed none) and wind to the first child with
+ * qb_lookup_cbk. If the dict cannot be created or updated the lookup is
+ * unwound with ENOMEM. Returns 0.
+ */
+int
+qb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ qb_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ xdata = xdata ? dict_ref (xdata) : dict_new (); /* either way we own one reference from here on */
+
+ if (!xdata)
+ goto enomem;
+
+ if (dict_set_int32 (xdata, conf->qb_xattr_key, 0))
+ goto enomem;
+
+ STACK_WIND (frame, qb_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ dict_unref (xdata);
+ return 0;
+enomem:
+ QB_STACK_UNWIND (lookup, frame, -1, ENOMEM, 0, 0, 0, 0);
+ if (xdata)
+ dict_unref (xdata);
+ return 0;
+}
+
+/*
+ * Handle a setxattr that carries "trusted.glusterfs.block-format".
+ *
+ * Copies the value into a NUL-terminated stack buffer, parses it into the
+ * inode context with qb_format_extract(), records the stub and an inode
+ * reference in frame->local, renders "<fmt>:<size>" into qb_local->fmt
+ * and schedules qb_format_and_resume. The stub is resumed unchanged when
+ * the key is absent and unwound with the parse error when parsing fails.
+ * Returns 0.
+ *
+ * NOTE(review): the alloca() size comes straight from data->len, and the
+ * "%lu" conversion assumes qb_inode->size is an unsigned long -- confirm
+ * both against the callers and the qb_inode_t declaration.
+ */
+int
+qb_setxattr_format (call_frame_t *frame, xlator_t *this, call_stub_t *stub,
+ dict_t *xattr, inode_t *inode)
+{
+ char *format = NULL;
+ int op_errno = 0;
+ qb_local_t *qb_local = NULL;
+ data_t *data = NULL;
+ qb_inode_t *qb_inode;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-format"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ format = alloca (data->len + 1);
+ memcpy (format, data->data, data->len);
+ format[data->len] = 0;
+
+ op_errno = qb_format_extract (this, format, inode);
+ if (op_errno) {
+ QB_STUB_UNWIND (stub, -1, op_errno);
+ return 0;
+ }
+ qb_inode = qb_inode_ctx_get(this, inode);
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+
+ snprintf(qb_local->fmt, QB_XATTR_VAL_MAX, "%s:%lu", qb_inode->fmt,
+ qb_inode->size);
+
+ qb_coroutine (frame, qb_format_and_resume);
+
+ return 0;
+}
+
+/*
+ * Handle a setxattr that carries "trusted.glusterfs.block-snapshot-create".
+ *
+ * Copies the value (the snapshot name) into a NUL-terminated stack
+ * buffer, records the stub, an inode reference and the name in
+ * frame->local and schedules qb_snapshot_create. The stub is resumed
+ * unchanged when the key is absent. Returns 0.
+ *
+ * NOTE(review): strncpy() copies at most 128 bytes and does not terminate
+ * on truncation -- confirm qb_local->name is larger than 128 bytes or
+ * that longer names cannot reach here.
+ */
+int
+qb_setxattr_snapshot_create (call_frame_t *frame, xlator_t *this,
+ call_stub_t *stub, dict_t *xattr, inode_t *inode)
+{
+ qb_local_t *qb_local = NULL;
+ char *name = NULL;
+ data_t *data = NULL;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ name = alloca (data->len + 1);
+ memcpy (name, data->data, data->len);
+ name[data->len] = 0;
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+ strncpy (qb_local->name, name, 128);
+
+ qb_coroutine (frame, qb_snapshot_create);
+
+ return 0;
+}
+
+/*
+ * Handle a setxattr that carries "trusted.glusterfs.block-snapshot-delete".
+ *
+ * Copies the value (the snapshot name) into a NUL-terminated stack
+ * buffer, records the stub, an inode reference and the name in
+ * frame->local and schedules qb_snapshot_delete. The stub is resumed
+ * unchanged when the key is absent. Returns 0.
+ *
+ * NOTE(review): strncpy() copies at most 128 bytes and does not terminate
+ * on truncation -- confirm qb_local->name is larger than 128 bytes or
+ * that longer names cannot reach here.
+ */
+int
+qb_setxattr_snapshot_delete (call_frame_t *frame, xlator_t *this,
+ call_stub_t *stub, dict_t *xattr, inode_t *inode)
+{
+ qb_local_t *qb_local = NULL;
+ char *name = NULL;
+ data_t *data = NULL;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ name = alloca (data->len + 1);
+ memcpy (name, data->data, data->len);
+ name[data->len] = 0;
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+ strncpy (qb_local->name, name, 128);
+
+ qb_coroutine (frame, qb_snapshot_delete);
+
+ return 0;
+}
+/*
+ * Handle a setxattr that carries "trusted.glusterfs.block-snapshot-goto".
+ *
+ * Copies the value (the snapshot name) into a NUL-terminated stack
+ * buffer, records the stub, an inode reference and the name in
+ * frame->local and schedules qb_snapshot_goto. The stub is resumed
+ * unchanged when the key is absent. Returns 0.
+ *
+ * NOTE(review): strncpy() copies at most 128 bytes and does not terminate
+ * on truncation -- confirm qb_local->name is larger than 128 bytes or
+ * that longer names cannot reach here.
+ */
+int
+qb_setxattr_snapshot_goto (call_frame_t *frame, xlator_t *this,
+ call_stub_t *stub, dict_t *xattr, inode_t *inode)
+{
+ qb_local_t *qb_local = NULL;
+ char *name = NULL;
+ data_t *data = NULL;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ name = alloca (data->len + 1);
+ memcpy (name, data->data, data->len);
+ name[data->len] = 0;
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+ strncpy (qb_local->name, name, 128);
+
+ qb_coroutine (frame, qb_snapshot_goto);
+
+ return 0;
+}
+
+/*
+ * Shared tail of setxattr/fsetxattr: dispatch on the qemu-block control
+ * keys found in @xattr. The keys are tested in the order format,
+ * snapshot-create, snapshot-delete, snapshot-goto and only the first one
+ * present is handled. When none is present the stub is resumed so the
+ * request continues unchanged. Returns 0.
+ */
+int
+qb_setxattr_common (call_frame_t *frame, xlator_t *this, call_stub_t *stub,
+ dict_t *xattr, inode_t *inode)
+{
+ data_t *data = NULL;
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-format"))) {
+ qb_setxattr_format (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) {
+ qb_setxattr_snapshot_create (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) {
+ qb_setxattr_snapshot_delete (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) {
+ qb_setxattr_snapshot_goto (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ QB_STUB_RESUME (stub);
+
+ return 0;
+}
+
+/*
+ * setxattr fop: set up the frame local, wrap the request in a stub that
+ * resumes through default_setxattr_resume and pass it to
+ * qb_setxattr_common() with loc->inode. Unwinds with ENOMEM when the
+ * local or the stub cannot be allocated. Returns 0.
+ */
+int
+qb_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ stub = fop_setxattr_stub (frame, default_setxattr_resume, loc, xattr,
+ flags, xdata);
+ if (!stub)
+ goto enomem;
+
+ qb_setxattr_common (frame, this, stub, xattr, loc->inode);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (setxattr, frame, -1, ENOMEM, 0);
+ return 0;
+}
+
+/*
+ * fsetxattr fop: set up the frame local, wrap the request in a stub that
+ * resumes through default_fsetxattr_resume and pass it to
+ * qb_setxattr_common() with fd->inode. Unwinds with ENOMEM when the
+ * local or the stub cannot be allocated. Returns 0.
+ */
+int
+qb_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr,
+ flags, xdata);
+ if (!stub)
+ goto enomem;
+
+ qb_setxattr_common (frame, this, stub, xattr, fd->inode);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (fsetxattr, frame, -1, ENOMEM, 0);
+ return 0;
+}
+
+/*
+ * Open callback used for block-formatted inodes.
+ *
+ * When the child's open succeeded and the inode still has a block-format
+ * context, the reply is captured in an open_cbk stub and qb_co_open is
+ * scheduled to set up the block state before the reply is delivered.
+ * Otherwise (child failure, context gone, or stub allocation failure,
+ * which is reported as ENOMEM) the open is unwound directly. Returns 0.
+ */
+int
+qb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ qb_local_t *qb_local = NULL;
+
+ qb_local = frame->local;
+
+ if (op_ret < 0)
+ goto unwind;
+
+ if (!qb_inode_ctx_get (this, qb_local->inode))
+ goto unwind;
+
+ stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata);
+ if (!stub) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ qb_local->stub = stub;
+
+ qb_coroutine (frame, qb_co_open);
+
+ return 0;
+unwind:
+ QB_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+/*
+ * open fop. Inodes without a block-format context are passed straight to
+ * the first child with the default callback. For block-formatted inodes
+ * a frame local holding references to the inode and the fd is set up and
+ * the open is wound with qb_open_cbk. Unwinds with ENOMEM when the local
+ * cannot be allocated. Returns 0.
+ */
+int
+qb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ fd_t *fd, dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, loc->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd,
+ xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (loc->inode);
+ qb_local->fd = fd_ref (fd);
+
+ STACK_WIND (frame, qb_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+enomem:
+ QB_STACK_UNWIND (open, frame, -1, ENOMEM, 0, 0);
+ return 0;
+}
+
+/*
+ * writev fop. Inodes without a block-format context are passed straight
+ * to the first child with the default callback. For block-formatted
+ * inodes the request is captured in a writev stub stored in the frame
+ * local (together with inode and fd references) and qb_co_writev is
+ * scheduled. Unwinds with ENOMEM when the local or the stub cannot be
+ * allocated. Returns 0.
+ */
+int
+qb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_writev_stub (frame, NULL, fd, vector, count,
+ offset, flags, iobref, xdata);
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_writev);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (writev, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+}
+
+/*
+ * readv fop. Inodes without a block-format context are passed straight
+ * to the first child with the default callback. For block-formatted
+ * inodes the request is captured in a readv stub stored in the frame
+ * local (together with inode and fd references) and qb_co_readv is
+ * scheduled. Unwinds with ENOMEM when the local or the stub cannot be
+ * allocated. Returns 0.
+ */
+int
+qb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset,
+ flags, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_readv_stub (frame, NULL, fd, size, offset,
+ flags, xdata);
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_readv);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0);
+ return 0;
+}
+
+/*
+ * fsync fop. Inodes without a block-format context are passed straight
+ * to the first child with the default callback. For block-formatted
+ * inodes the request is captured in an fsync stub stored in the frame
+ * local (together with inode and fd references) and qb_co_fsync is
+ * scheduled. Unwinds with ENOMEM when the local or the stub cannot be
+ * allocated. Returns 0.
+ */
+int
+qb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int dsync,
+ dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd, dsync, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_fsync_stub (frame, NULL, fd, dsync, xdata);
+
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_fsync);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+}
+
+/*
+ * flush fop. Inodes without a block-format context are passed straight
+ * to the first child with the default callback. For block-formatted
+ * inodes the request is captured in a flush stub stored in the frame
+ * local (together with inode and fd references) and qb_co_fsync -- the
+ * same coroutine used for fsync -- is scheduled. Unwinds with ENOMEM
+ * when the local or the stub cannot be allocated. Returns 0.
+ */
+int
+qb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_flush_stub (frame, NULL, fd, xdata);
+
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_fsync);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (flush, frame, -1, ENOMEM, 0);
+ return 0;
+}
+
+/*
+ * readdirp callback: for every returned entry that carries the
+ * block-format xattr, refresh the entry inode's block-format context and
+ * report the image size in the entry's stat.
+ *
+ * Entries without an inode or xattr dict, or without the format key, are
+ * left untouched; a key present with a NULL string drops the inode's
+ * context. The entry list is walked only when the child reported success
+ * and supplied a list: error unwinds may pass a NULL @entries (the
+ * ENOMEM path of qb_readdirp does exactly that), which must not be
+ * dereferenced. The reply is always passed on unchanged otherwise.
+ * Returns 0.
+ */
+static int32_t
+qb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+                dict_t *xdata)
+{
+        qb_conf_t   *conf = this->private;
+        gf_dirent_t *entry = NULL;
+        char        *format = NULL;
+
+        if (op_ret < 0 || !entries)
+                goto unwind;
+
+        list_for_each_entry(entry, &entries->list, list) {
+                if (!entry->inode || !entry->dict)
+                        continue;
+
+                format = NULL;
+                if (dict_get_str(entry->dict, conf->qb_xattr_key, &format))
+                        continue;
+
+                if (!format) {
+                        qb_inode_cleanup(this, entry->inode, 1);
+                        continue;
+                }
+
+                if (qb_format_extract(this, format, entry->inode))
+                        continue;
+
+                qb_iatt_fixup(this, entry->inode, &entry->d_stat);
+        }
+
+unwind:
+        STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+        return 0;
+}
+
+/*
+ * readdirp fop: adds conf->qb_xattr_key to the request dict (creating the
+ * dict when the caller passed none) and winds to the child with
+ * qb_readdirp_cbk as the callback. Unwinds with ENOMEM if the dict cannot
+ * be prepared.
+ */
+static int32_t
+qb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+            off_t off, dict_t *xdata)
+{
+        qb_conf_t *conf = this->private;
+
+        /* work on our own reference (or a fresh dict) from here on */
+        if (xdata)
+                xdata = dict_ref(xdata);
+        else
+                xdata = dict_new();
+
+        if (xdata == NULL)
+                goto enomem;
+
+        if (dict_set_int32 (xdata, conf->qb_xattr_key, 0) != 0)
+                goto enomem;
+
+        STACK_WIND(frame, qb_readdirp_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+
+        dict_unref(xdata);
+        return 0;
+
+enomem:
+        QB_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL);
+        if (xdata != NULL)
+                dict_unref(xdata);
+        return 0;
+}
+
+/*
+ * truncate fop. Inodes without a qemu-block context are wound straight to
+ * the child; otherwise the call is captured in a stub, an anonymous fd is
+ * attached to the local, and the work is handed to the qb_co_truncate
+ * coroutine.
+ *
+ * Always returns 0; failures are reported by unwinding with ENOMEM.
+ */
+int
+qb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
+{
+        qb_local_t *qb_local = NULL;
+        qb_inode_t *qb_inode = NULL;
+
+        qb_inode = qb_inode_ctx_get (this, loc->inode);
+        if (!qb_inode) {
+                STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->truncate, loc, offset,
+                            xdata);
+                return 0;
+        }
+
+        if (qb_local_init (frame) != 0)
+                goto enomem;
+
+        qb_local = frame->local;
+
+        qb_local->inode = inode_ref (loc->inode);
+        qb_local->fd = fd_anonymous (loc->inode);
+
+        qb_local->stub = fop_truncate_stub (frame, NULL, loc, offset, xdata);
+
+        if (!qb_local->stub)
+                goto enomem;
+
+        /* qb_coroutine can fail (qb_release checks its return value);
+           unwind with an error instead of ignoring the failure. */
+        if (qb_coroutine (frame, qb_co_truncate) != 0)
+                goto enomem;
+
+        return 0;
+enomem:
+        QB_STACK_UNWIND (truncate, frame, -1, ENOMEM, 0, 0, 0);
+        return 0;
+}
+
+
+/*
+ * ftruncate fop. Inodes without a qemu-block context are wound straight to
+ * the child; otherwise the call is captured in a stub and handed to the
+ * qb_co_truncate coroutine.
+ *
+ * Always returns 0; failures are reported by unwinding with ENOMEM.
+ */
+int
+qb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+        qb_local_t *qb_local = NULL;
+        qb_inode_t *qb_inode = NULL;
+
+        qb_inode = qb_inode_ctx_get (this, fd->inode);
+        if (!qb_inode) {
+                STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->ftruncate, fd, offset,
+                            xdata);
+                return 0;
+        }
+
+        if (qb_local_init (frame) != 0)
+                goto enomem;
+
+        qb_local = frame->local;
+
+        qb_local->inode = inode_ref (fd->inode);
+        qb_local->fd = fd_ref (fd);
+
+        qb_local->stub = fop_ftruncate_stub (frame, NULL, fd, offset, xdata);
+
+        if (!qb_local->stub)
+                goto enomem;
+
+        /* qb_coroutine can fail (qb_release checks its return value);
+           unwind with an error instead of ignoring the failure. */
+        if (qb_coroutine (frame, qb_co_truncate) != 0)
+                goto enomem;
+
+        return 0;
+enomem:
+        QB_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, 0, 0, 0);
+        return 0;
+}
+
+
+/*
+ * stat callback. When qb_stat stored a referenced inode in frame->local,
+ * the returned iatt is passed through qb_iatt_fixup and the reference is
+ * dropped before unwinding.
+ */
+int
+qb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata)
+{
+        inode_t *held = frame->local;
+
+        /* frame->local holds an inode here, not a qb_local_t: detach it
+           before QB_STACK_UNWIND looks at frame->local */
+        frame->local = NULL;
+
+        if (held != NULL) {
+                qb_iatt_fixup (this, held, iatt);
+                inode_unref (held);
+        }
+
+        QB_STACK_UNWIND (stat, frame, op_ret, op_errno, iatt, xdata);
+
+        return 0;
+}
+
+/*
+ * stat fop: remembers the inode (with a reference) in frame->local when it
+ * has a qemu-block context, then winds to the child.
+ */
+int
+qb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+        qb_inode_t *qb_inode = qb_inode_ctx_get (this, loc->inode);
+
+        if (qb_inode != NULL)
+                frame->local = inode_ref (loc->inode);
+
+        STACK_WIND (frame, qb_stat_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->stat, loc, xdata);
+        return 0;
+}
+
+
+/*
+ * fstat callback. When qb_fstat stored a referenced inode in frame->local,
+ * the returned iatt is passed through qb_iatt_fixup and the reference is
+ * dropped before unwinding.
+ */
+int
+qb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata)
+{
+        inode_t *held = frame->local;
+
+        /* frame->local holds an inode here, not a qb_local_t: detach it
+           before QB_STACK_UNWIND looks at frame->local */
+        frame->local = NULL;
+
+        if (held != NULL) {
+                qb_iatt_fixup (this, held, iatt);
+                inode_unref (held);
+        }
+
+        QB_STACK_UNWIND (fstat, frame, op_ret, op_errno, iatt, xdata);
+
+        return 0;
+}
+
+
+/*
+ * fstat fop: remembers the fd's inode (with a reference) in frame->local
+ * when it has a qemu-block context, then winds to the child.
+ */
+int
+qb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+        qb_inode_t *qb_inode = qb_inode_ctx_get (this, fd->inode);
+
+        if (qb_inode != NULL)
+                frame->local = inode_ref (fd->inode);
+
+        STACK_WIND (frame, qb_fstat_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fstat, fd, xdata);
+        return 0;
+}
+
+
+/*
+ * setattr callback. When an inode was stored in frame->local by the fop,
+ * both the pre- and post-operation iatts are passed through qb_iatt_fixup
+ * and the inode reference is dropped before unwinding.
+ */
+int
+qb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+                dict_t *xdata)
+{
+        inode_t *held = frame->local;
+
+        /* frame->local holds an inode here, not a qb_local_t: detach it
+           before QB_STACK_UNWIND looks at frame->local */
+        frame->local = NULL;
+
+        if (held != NULL) {
+                qb_iatt_fixup (this, held, pre);
+                qb_iatt_fixup (this, held, post);
+                inode_unref (held);
+        }
+
+        QB_STACK_UNWIND (setattr, frame, op_ret, op_errno, pre, post, xdata);
+
+        return 0;
+}
+
+
+/*
+ * setattr fop: remembers the inode (with a reference) in frame->local when
+ * it has a qemu-block context, then winds to the child.
+ */
+int
+qb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+            int valid, dict_t *xdata)
+{
+        qb_inode_t *qb_inode = qb_inode_ctx_get (this, loc->inode);
+
+        if (qb_inode != NULL)
+                frame->local = inode_ref (loc->inode);
+
+        STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->setattr, loc, buf, valid, xdata);
+        return 0;
+}
+
+
+/*
+ * fsetattr callback. When an inode was stored in frame->local by the fop,
+ * both the pre- and post-operation iatts are passed through qb_iatt_fixup
+ * and the inode reference is dropped before unwinding.
+ */
+int
+qb_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+                 dict_t *xdata)
+{
+        inode_t *held = frame->local;
+
+        /* frame->local holds an inode here, not a qb_local_t: detach it
+           before QB_STACK_UNWIND looks at frame->local */
+        frame->local = NULL;
+
+        if (held != NULL) {
+                qb_iatt_fixup (this, held, pre);
+                qb_iatt_fixup (this, held, post);
+                inode_unref (held);
+        }
+
+        QB_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, pre, post, xdata);
+
+        return 0;
+}
+
+
+/*
+ * fsetattr fop: remembers the fd's inode (with a reference) in frame->local
+ * when it has a qemu-block context, then winds to the child. The reply is
+ * handled by qb_fsetattr_cbk so that it is unwound as an fsetattr.
+ */
+int
+qb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+             int valid, dict_t *xdata)
+{
+        if (qb_inode_ctx_get (this, fd->inode))
+                frame->local = inode_ref (fd->inode);
+
+        STACK_WIND (frame, qb_fsetattr_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fsetattr, fd, buf, valid, xdata);
+        return 0;
+}
+
+
+/* forget callback: hands the inode to qb_inode_cleanup and returns its result. */
+int
+qb_forget (xlator_t *this, inode_t *inode)
+{
+        int ret = qb_inode_cleanup (this, inode, 0);
+
+        return ret;
+}
+
+
+/*
+ * release callback: creates a fresh frame and local and starts the
+ * qb_co_close coroutine on it.
+ *
+ * Returns 0 when the coroutine was started, -1 when the frame, the local or
+ * the coroutine could not be set up (each case is logged).
+ */
+int
+qb_release (xlator_t *this, fd_t *fd)
+{
+        call_frame_t *frame = NULL;
+
+        frame = create_frame (this, this->ctx->pool);
+        if (!frame) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not allocate frame. "
+                        "Leaking QEMU BlockDriverState");
+                return -1;
+        }
+
+        if (qb_local_init (frame) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not allocate local. "
+                        "Leaking QEMU BlockDriverState");
+                STACK_DESTROY (frame->root);
+                return -1;
+        }
+
+        if (qb_coroutine (frame, qb_co_close) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not allocate coroutine. "
+                        "Leaking QEMU BlockDriverState");
+                qb_local_free (this, frame->local);
+                frame->local = NULL;
+                STACK_DESTROY (frame->root);
+                /* report this failure like the two above */
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * Set up memory accounting for this translator with gf_qb_mt_end + 1 types.
+ * Returns whatever xlator_mem_acct_init returns; a non-zero result is logged.
+ */
+int
+mem_acct_init (xlator_t *this)
+{
+        int ret = xlator_mem_acct_init (this, gf_qb_mt_end + 1);
+
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Memory accounting init "
+                        "failed");
+        }
+
+        return ret;
+}
+
+
+/* Reconfigure hook: no option is re-read here; always returns 0. */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ return 0;
+}
+
+
+/*
+ * Translator init: requires exactly one child, allocates the private
+ * qb_conf_t, reads the "default-password" option, creates the syncenv the
+ * qemu coroutines run in, builds the xattr key from the translator name and
+ * calls bdrv_init () on the first invocation only.
+ *
+ * Returns 0 on success and -1 on failure (conf is freed in that case).
+ */
+int
+init (xlator_t *this)
+{
+        qb_conf_t *conf = NULL;
+        int32_t ret = -1;
+        static int bdrv_inited = 0;
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "FATAL: qemu-block (%s) not configured with exactly "
+                        "one child", this->name);
+                goto out;
+        }
+
+        conf = GF_CALLOC (1, sizeof (*conf), gf_qb_mt_qb_conf_t);
+        if (!conf)
+                goto out;
+
+        /* configure 'option default-password <password>' */
+        GF_OPTION_INIT ("default-password", conf->default_password, str, out);
+
+        /* qemu coroutines use "co_mutex" for synchronizing among themselves.
+           However "co_mutex" itself is not threadsafe if the coroutine
+           framework is multithreaded (which usually is not). However
+           synctasks are fundamentally multithreaded, so for now create a
+           syncenv which has scaling limits set to max 1 thread so that the
+           qemu coroutines can execute "safely".
+
+           Future work: provide an implementation of "co_mutex" which is
+           threadsafe and use the global multithreaded ctx->env syncenv.
+        */
+        conf->env = syncenv_new (0, 1, 1);
+        if (!conf->env) {
+                /* without a syncenv no coroutine can ever be scheduled */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not create syncenv for qemu coroutines");
+                goto out;
+        }
+
+        this->private = conf;
+
+        ret = 0;
+
+        snprintf (conf->qb_xattr_key, QB_XATTR_KEY_MAX, QB_XATTR_KEY_FMT,
+                  this->name);
+
+        cur_mon = (void *) 1;
+
+        if (!bdrv_inited) {
+                bdrv_init ();
+                bdrv_inited = 1;
+        }
+
+out:
+        if (ret)
+                GF_FREE (conf);
+
+        return ret;
+}
+
+
+/*
+ * Translator teardown: detaches the private qb_conf_t, drops the reference
+ * on conf->root_inode if one is held, and frees the conf. Does nothing when
+ * no conf is attached.
+ */
+void
+fini (xlator_t *this)
+{
+        qb_conf_t *conf = NULL;
+
+        conf = this->private;
+        if (!conf)
+                return;
+
+        this->private = NULL;
+
+        if (conf->root_inode)
+                inode_unref(conf->root_inode);
+        GF_FREE (conf);
+
+        return;
+}
+
+
+/* File operations this translator provides its own handler for. */
+struct xlator_fops fops = {
+ .lookup = qb_lookup,
+ .fsetxattr = qb_fsetxattr,
+ .setxattr = qb_setxattr,
+ .open = qb_open,
+ .writev = qb_writev,
+ .readv = qb_readv,
+ .fsync = qb_fsync,
+ .truncate = qb_truncate,
+ .ftruncate = qb_ftruncate,
+ .stat = qb_stat,
+ .fstat = qb_fstat,
+ .setattr = qb_setattr,
+ .fsetattr = qb_fsetattr,
+ .flush = qb_flush,
+ /* getxattr/fgetxattr handlers are commented out, i.e. not registered */
+/*
+ .getxattr = qb_getxattr,
+ .fgetxattr = qb_fgetxattr
+*/
+ .readdirp = qb_readdirp,
+};
+
+
+/* Inode/fd lifecycle callbacks: forget -> qb_forget, release -> qb_release. */
+struct xlator_cbks cbks = {
+ .forget = qb_forget,
+ .release = qb_release,
+};
+
+
+/* No statedump hooks are registered by this translator. */
+struct xlator_dumpops dumpops = {
+};
+
+
+/* Volume options accepted by this translator. init () reads
+   "default-password" into conf->default_password. The table ends with a
+   NULL-key entry. */
+struct volume_options options[] = {
+ { .key = {"default-password"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "",
+ .description = "Default password for the AES encrypted block images."
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/features/qemu-block/src/qemu-block.h b/xlators/features/qemu-block/src/qemu-block.h
new file mode 100644
index 000000000..c95f2799a
--- /dev/null
+++ b/xlators/features/qemu-block/src/qemu-block.h
@@ -0,0 +1,109 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __QEMU_BLOCK_H
+#define __QEMU_BLOCK_H
+
+#include "syncop.h"
+#include "call-stub.h"
+#include "block/block_int.h"
+#include "monitor/monitor.h"
+
+/* QB_XATTR_KEY_FMT is the on-disk xattr stored in the inode which
+ indicates that the file must be "interpreted" by the block format
+ logic. The value of the key is of the pattern:
+
+ "format:virtual_size"
+
+ e.g.
+
+ "qcow2:20GB" or "qed:100GB"
+
+ The format and virtual size are colon separated. The format is
+ a case sensitive string which qemu recognizes. virtual_size is
+ specified as a size which glusterfs recognizes as size (i.e.,
+ value accepted by gf_string2bytesize())
+*/
+#define QB_XATTR_KEY_FMT "trusted.glusterfs.%s.format"
+
+#define QB_XATTR_KEY_MAX 64
+
+#define QB_XATTR_VAL_MAX 64
+
+
+/* Per-inode state for a file that carries the block-format xattr.
+   NOTE(review): exact lifetime/ownership of bs, refcnt and the backing_*
+   fields is not visible from this header - confirm against qemu-block.c. */
+typedef struct qb_inode {
+ char fmt[QB_XATTR_VAL_MAX]; /* this is only the format, not "format:size" */
+ size_t size; /* virtual size in bytes */
+ BlockDriverState *bs; /* qemu block driver state for this image */
+ int refcnt; /* presumably counts users of bs - TODO confirm */
+ uuid_t backing_gfid; /* presumably gfid of the backing image - TODO confirm */
+ char *backing_fname; /* presumably name of the backing image - TODO confirm */
+} qb_inode_t;
+
+
+/* Translator-private configuration, allocated in init () and freed in
+   fini (). */
+typedef struct qb_conf {
+ Monitor *mon;
+ struct syncenv *env; /* syncenv the qemu coroutines are run in */
+ char qb_xattr_key[QB_XATTR_KEY_MAX]; /* QB_XATTR_KEY_FMT expanded with the xlator name */
+ char *default_password; /* value of the "default-password" option */
+ inode_t *root_inode; /* unref'd in fini () when set */
+} qb_conf_t;
+
+
+/* Per-call state hung off frame->local; released with qb_local_free (). */
+typedef struct qb_local {
+ call_frame_t *frame; /* backpointer */
+ call_stub_t *stub; /* stubbed fop handed to the coroutine */
+ inode_t *inode; /* referenced inode the fop operates on */
+ fd_t *fd; /* referenced (or anonymous) fd for the fop */
+ char fmt[QB_XATTR_VAL_MAX+1];
+ char name[256];
+ synctask_fn_t synctask_fn; /* coroutine body passed to qb_coroutine () */
+ struct list_head list;
+} qb_local_t;
+
+void qb_local_free (xlator_t *this, qb_local_t *local);
+int qb_coroutine (call_frame_t *frame, synctask_fn_t fn);
+inode_t *qb_inode_from_filename (const char *filename);
+int qb_inode_to_filename (inode_t *inode, char *filename, int size);
+int qb_format_extract (xlator_t *this, char *format, inode_t *inode);
+
+qb_inode_t *qb_inode_ctx_get (xlator_t *this, inode_t *inode);
+
+/* Unwind FRAME for fop TYP with ARGS, then free the qb_local_t that was
+   attached to it (if any). frame->local is cleared before the unwind. */
+#define QB_STACK_UNWIND(typ, frame, args ...) do { \
+ qb_local_t *__local = frame->local; \
+ xlator_t *__this = frame->this; \
+ \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (typ, frame, args); \
+ if (__local) \
+ qb_local_free (__this, __local); \
+ } while (0)
+
+/* Fail STUB with OP_RET/OP_ERRNO via call_unwind_error (), then free the
+   qb_local_t attached to the stub's frame (if any). The frame's local is
+   cleared before the unwind. */
+#define QB_STUB_UNWIND(stub, op_ret, op_errno) do { \
+ qb_local_t *__local = stub->frame->local; \
+ xlator_t *__this = stub->frame->this; \
+ \
+ stub->frame->local = NULL; \
+ call_unwind_error (stub, op_ret, op_errno); \
+ if (__local) \
+ qb_local_free (__this, __local); \
+ } while (0)
+
+/* Resume STUB via call_resume (), then free the qb_local_t attached to the
+   stub's frame (if any). The frame's local is cleared before resuming.
+   The macro now operates on its argument rather than on whatever variable
+   named `stub' happens to be in scope at the call site. */
+#define QB_STUB_RESUME(stub) do { \
+        qb_local_t *__local = (stub)->frame->local; \
+        xlator_t *__this = (stub)->frame->this; \
+        \
+        (stub)->frame->local = NULL; \
+        call_resume (stub); \
+        if (__local) \
+                qb_local_free (__this, __local); \
+        } while (0)
+
+#endif /* !__QEMU_BLOCK_H */
diff --git a/xlators/features/quiesce/src/quiesce.c b/xlators/features/quiesce/src/quiesce.c
index 73eb91947..24c7dc6ed 100644
--- a/xlators/features/quiesce/src/quiesce.c
+++ b/xlators/features/quiesce/src/quiesce.c
@@ -111,7 +111,7 @@ void
gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub)
{
quiesce_priv_t *priv = NULL;
- struct timeval timeout = {0,};
+ struct timespec timeout = {0,};
priv = this->private;
if (!priv) {
@@ -129,7 +129,7 @@ gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub)
if (!priv->timer) {
timeout.tv_sec = 20;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
priv->timer = gf_timer_call_after (this->ctx,
timeout,
@@ -2492,7 +2492,7 @@ notify (xlator_t *this, int event, void *data, ...)
{
int ret = 0;
quiesce_priv_t *priv = NULL;
- struct timeval timeout = {0,};
+ struct timespec timeout = {0,};
priv = this->private;
if (!priv)
@@ -2525,7 +2525,7 @@ notify (xlator_t *this, int event, void *data, ...)
if (priv->timer)
break;
timeout.tv_sec = 20;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
priv->timer = gf_timer_call_after (this->ctx,
timeout,
diff --git a/xlators/features/quota/src/Makefile.am b/xlators/features/quota/src/Makefile.am
index 9546f4276..7165adc59 100644
--- a/xlators/features/quota/src/Makefile.am
+++ b/xlators/features/quota/src/Makefile.am
@@ -1,17 +1,22 @@
-xlator_LTLIBRARIES = quota.la
+xlator_LTLIBRARIES = quota.la quotad.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
quota_la_LDFLAGS = -module -avoid-version
+quotad_la_LDFLAGS = -module -avoid-version
-quota_la_SOURCES = quota.c
+quota_la_SOURCES = quota.c quota-enforcer-client.c
quota_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = quota-mem-types.h quota.h
+quotad_la_SOURCES = quotad.c quotad-helpers.c quotad-aggregator.c
+quotad_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = quota-mem-types.h quota.h quotad-aggregator.h quotad-helpers.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/xlators/cluster/dht/src
+ -I$(top_srcdir)/xlators/cluster/dht/src -I$(top_srcdir)/rpc/xdr/src/ \
+ -I$(top_srcdir)/rpc/rpc-lib/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
-CLEANFILES =
+CLEANFILES =
diff --git a/xlators/features/quota/src/quota-enforcer-client.c b/xlators/features/quota/src/quota-enforcer-client.c
new file mode 100644
index 000000000..7d8ab937d
--- /dev/null
+++ b/xlators/features/quota/src/quota-enforcer-client.c
@@ -0,0 +1,403 @@
+/*
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <stdio.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sys/file.h>
+#include <netdb.h>
+#include <signal.h>
+#include <libgen.h>
+
+#include <sys/utsname.h>
+
+#include <stdint.h>
+#include <pthread.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <semaphore.h>
+#include <errno.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#ifdef HAVE_MALLOC_STATS
+#ifdef DEBUG
+#include <mcheck.h>
+#endif
+#endif
+
+#include "quota.h"
+
+extern struct rpc_clnt_program quota_enforcer_clnt;
+
+int32_t
+quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent);
+
+/*
+ * Serialize REQ with XDRPROC into a freshly obtained iobuf and submit it on
+ * priv->rpc_clnt as procedure PROCNUM of PROG, with CBKFN as the reply
+ * callback and FRAME as its cookie. When IOBREF is NULL a temporary one is
+ * created and released again before returning.
+ *
+ * Returns -1 when the request could not be prepared (no iobuf/iobref, or
+ * serialization failed); in that case nothing was submitted. Returns 0 once
+ * rpc_clnt_submit () has been called, whatever its result.
+ * NOTE(review): presumably rpc_clnt_submit () notifies CBKFN itself on
+ * failure, which is why its result is not propagated - confirm.
+ */
+int
+quota_enforcer_submit_request (void *req, call_frame_t *frame,
+                               rpc_clnt_prog_t *prog,
+                               int procnum, struct iobref *iobref,
+                               xlator_t *this, fop_cbk_fn_t cbkfn,
+                               xdrproc_t xdrproc)
+{
+        int           ret        = -1;
+        int           count      = 0;
+        struct iovec  iov        = {0, };
+        struct iobuf *iobuf      = NULL;
+        char          new_iobref = 0;
+        ssize_t       xdr_size   = 0;
+        quota_priv_t *priv       = NULL;
+
+        GF_ASSERT (this);
+
+        priv = this->private;
+
+        if (req) {
+                xdr_size = xdr_sizeof (xdrproc, req);
+                iobuf = iobuf_get2 (this->ctx->iobuf_pool, xdr_size);
+                if (!iobuf) {
+                        goto out;
+                }
+
+                if (!iobref) {
+                        iobref = iobref_new ();
+                        if (!iobref) {
+                                goto out;
+                        }
+
+                        new_iobref = 1;
+                }
+
+                iobref_add (iobref, iobuf);
+
+                iov.iov_base = iobuf->ptr;
+                iov.iov_len = iobuf_size (iobuf);
+
+                /* Create the xdr payload */
+                ret = xdr_serialize_generic (iov, req, xdrproc);
+                if (ret == -1) {
+                        goto out;
+                }
+                iov.iov_len = ret;
+                count = 1;
+        }
+
+        /* Send the msg */
+        ret = rpc_clnt_submit (priv->rpc_clnt, prog, procnum, cbkfn,
+                               &iov, count,
+                               NULL, 0, iobref, frame, NULL, 0, NULL, 0, NULL);
+        if (ret < 0) {
+                /* keep a trace of the failure instead of dropping it */
+                gf_log (this->name, GF_LOG_DEBUG, "rpc_clnt_submit failed");
+        }
+        ret = 0;
+
+out:
+        if (new_iobref)
+                iobref_unref (iobref);
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        return ret;
+}
+
+/*
+ * RPC reply callback for the lookup sent by quota_enforcer_lookup ().
+ * Decodes the gfs3_lookup_rsp, converts the stat buffers, unserializes the
+ * reply dict and checks that the returned gfid still matches the inode in
+ * local->validate_loc. The outcome is handed to local->validate_cbk.
+ *
+ * MYFRAME is the call frame passed at submit time. Always returns 0.
+ */
+int
+quota_enforcer_lookup_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ quota_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int ret = 0;
+ gfs3_lookup_rsp rsp = {0,};
+ struct iatt stbuf = {0,};
+ struct iatt postparent = {0,};
+ int op_errno = EINVAL;
+ dict_t *xdata = NULL;
+ inode_t *inode = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ frame = myframe;
+ local = frame->local;
+ inode = local->validate_loc.inode;
+
+ /* the rpc layer reported a failure: no reply payload to decode */
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gfs3_lookup_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ op_errno = gf_error_to_errno (rsp.op_errno);
+ gf_stat_to_iatt (&rsp.postparent, &postparent);
+
+ if (rsp.op_ret == -1)
+ goto out;
+
+ /* assume failure until every remaining check has passed */
+ rsp.op_ret = -1;
+ gf_stat_to_iatt (&rsp.stat, &stbuf);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), rsp.op_ret,
+ op_errno, out);
+
+ /* a different gfid in the reply means the path now names another file */
+ if ((!uuid_is_null (inode->gfid))
+ && (uuid_compare (stbuf.ia_gfid, inode->gfid) != 0)) {
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "gfid changed for %s", local->validate_loc.path);
+ rsp.op_ret = -1;
+ op_errno = ESTALE;
+ goto out;
+ }
+
+ rsp.op_ret = 0;
+
+out:
+ rsp.op_errno = op_errno;
+ if (rsp.op_ret == -1) {
+ /* any error other than ENOENT */
+ if (rsp.op_errno != ENOENT)
+ gf_log (this->name, GF_LOG_WARNING,
+ "remote operation failed: %s. Path: %s (%s)",
+ strerror (rsp.op_errno),
+ local->validate_loc.path,
+ loc_gfid_utoa (&local->validate_loc));
+ else
+ gf_log (this->name, GF_LOG_TRACE,
+ "not found on remote node");
+
+ }
+
+ local->validate_cbk (frame, NULL, this, rsp.op_ret, rsp.op_errno, inode,
+ &stbuf, xdata, &postparent);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ /* released with plain free (), not GF_FREE */
+ free (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+/*
+ * Send a lookup for LOC to the quota enforcer program. VALIDATE_CBK is
+ * stored in the frame's local and is invoked by quota_enforcer_lookup_cbk
+ * when the reply is processed. If the arguments are invalid, XDATA cannot
+ * be serialized, or the request cannot be prepared for submission,
+ * VALIDATE_CBK is invoked here with op_ret -1.
+ *
+ * Always returns 0.
+ */
+int
+quota_enforcer_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                       dict_t *xdata, fop_lookup_cbk_t validate_cbk)
+{
+        quota_local_t   *local    = NULL;
+        gfs3_lookup_req  req      = {{0,},};
+        int              ret      = 0;
+        int              op_errno = ESTALE;
+        quota_priv_t    *priv     = NULL;
+
+        if (!frame || !this || !loc)
+                goto unwind;
+
+        local = frame->local;
+        local->validate_cbk = validate_cbk;
+
+        priv = this->private;
+
+        if (!(loc && loc->inode))
+                goto unwind;
+
+        /* prefer the gfid already known on the inode */
+        if (!uuid_is_null (loc->inode->gfid))
+                memcpy (req.gfid, loc->inode->gfid, 16);
+        else
+                memcpy (req.gfid, loc->gfid, 16);
+
+        if (xdata) {
+                GF_PROTOCOL_DICT_SERIALIZE (this, xdata,
+                                            (&req.xdata.xdata_val),
+                                            req.xdata.xdata_len,
+                                            op_errno, unwind);
+        }
+
+        if (loc->name)
+                req.bname = (char *)loc->name;
+        else
+                req.bname = "";
+
+        ret = quota_enforcer_submit_request (&req, frame,
+                                             priv->quota_enforcer,
+                                             GF_AGGREGATOR_LOOKUP,
+                                             NULL, this,
+                                             quota_enforcer_lookup_cbk,
+                                             (xdrproc_t)xdr_gfs3_lookup_req);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+                /* A non-zero result means the request was never handed to
+                   the rpc layer, so no reply callback will run: report the
+                   failure to the caller here. */
+                op_errno = ENOMEM;
+                goto unwind;
+        }
+
+        GF_FREE (req.xdata.xdata_val);
+
+        return 0;
+
+unwind:
+        validate_cbk (frame, NULL, this, -1, op_errno, NULL, NULL, NULL, NULL);
+
+        GF_FREE (req.xdata.xdata_val);
+
+        return 0;
+}
+
+/*
+ * RPC client event hook registered by quota_enforcer_init (). MYDATA is the
+ * xlator. Every event is only traced; the function always returns 0.
+ */
+int
+quota_enforcer_notify (struct rpc_clnt *rpc, void *mydata,
+                       rpc_clnt_event_t event, void *data)
+{
+        xlator_t *this = mydata;
+
+        switch (event) {
+        case RPC_CLNT_CONNECT:
+                gf_log (this->name, GF_LOG_TRACE, "got RPC_CLNT_CONNECT");
+                break;
+
+        case RPC_CLNT_DISCONNECT:
+                gf_log (this->name, GF_LOG_TRACE, "got RPC_CLNT_DISCONNECT");
+                break;
+
+        default:
+                gf_log (this->name, GF_LOG_TRACE,
+                        "got some other RPC event %d", event);
+                break;
+        }
+
+        return 0;
+}
+
+/*
+ * Start RPC with its transport reconfigured to "non-blocking-io" = "no",
+ * then reconfigure the transport back to "non-blocking-io" = "yes".
+ *
+ * Returns 0 on success, non-zero if the option dict could not be created
+ * or filled.
+ */
+int
+quota_enforcer_blocking_connect (rpc_clnt_t *rpc)
+{
+        dict_t *options = NULL;
+        int     ret     = -1;
+
+        options = dict_new ();
+        if (options == NULL)
+                goto out;
+
+        ret = dict_set_str (options, "non-blocking-io", "no");
+        if (ret)
+                goto out;
+
+        rpc->conn.trans->reconfigure (rpc->conn.trans, options);
+
+        rpc_clnt_start (rpc);
+
+        ret = dict_set_str (options, "non-blocking-io", "yes");
+        if (ret)
+                goto out;
+
+        rpc->conn.trans->reconfigure (rpc->conn.trans, options);
+
+        ret = 0;
+out:
+        /* dict_new () may have failed; only drop a dict we actually hold */
+        if (options)
+                dict_unref (options);
+
+        return ret;
+}
+
+/*
+ * Returns a started rpc_clnt, or NULL on failure. If quota_priv already
+ * holds an rpc_clnt it is (re)connected and returned; otherwise a new one
+ * is created from OPTIONS, which is extended with the unix-socket transport
+ * settings, registered with quota_enforcer_notify and connected.
+ */
+struct rpc_clnt *
+quota_enforcer_init (xlator_t *this, dict_t *options)
+{
+        struct rpc_clnt *rpc  = NULL;
+        quota_priv_t    *priv = NULL;
+        int              ret  = -1;
+
+        priv = this->private;
+        if (priv->rpc_clnt) {
+                gf_log (this->name, GF_LOG_TRACE, "quota enforcer clnt already "
+                        "inited");
+                //Turns out to be a NOP if the clnt is already connected.
+                ret = quota_enforcer_blocking_connect (priv->rpc_clnt);
+                if (ret)
+                        goto out;
+
+                return priv->rpc_clnt;
+        }
+
+        priv->quota_enforcer = &quota_enforcer_clnt;
+
+        ret = dict_set_str (options, "transport.address-family", "unix");
+        if (ret)
+                goto out;
+
+        ret = dict_set_str (options, "transport-type", "socket");
+        if (ret)
+                goto out;
+
+        ret = dict_set_str (options, "transport.socket.connect-path",
+                            "/tmp/quotad.socket");
+        if (ret)
+                goto out;
+
+        rpc = rpc_clnt_new (options, this->ctx, this->name, 16);
+        if (!rpc) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = rpc_clnt_register_notify (rpc, quota_enforcer_notify, this);
+        if (ret) {
+                /* log under this translator's name, like the rest of the
+                   function, rather than a foreign "cli" domain */
+                gf_log (this->name, GF_LOG_ERROR, "failed to register notify");
+                goto out;
+        }
+
+        ret = quota_enforcer_blocking_connect (rpc);
+        if (ret)
+                goto out;
+
+        ret = 0;
+out:
+        if (ret) {
+                if (rpc)
+                        rpc_clnt_unref (rpc);
+                rpc = NULL;
+        }
+
+        return rpc;
+}
+
+/* Procedure table of the quota enforcer client program: only names are
+   filled in, the second member of each entry is NULL. */
+struct rpc_clnt_procedure quota_enforcer_actors[GF_AGGREGATOR_MAXVALUE] = {
+ [GF_AGGREGATOR_NULL] = {"NULL", NULL},
+ [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", NULL},
+};
+
+/* RPC client program descriptor used by quota_enforcer_init () and passed to
+   quota_enforcer_submit_request () via priv->quota_enforcer. */
+struct rpc_clnt_program quota_enforcer_clnt = {
+ .progname = "Quota enforcer",
+ .prognum = GLUSTER_AGGREGATOR_PROGRAM,
+ .progver = GLUSTER_AGGREGATOR_VERSION,
+ .numproc = GF_AGGREGATOR_MAXVALUE,
+ .proctable = quota_enforcer_actors,
+};
diff --git a/xlators/features/quota/src/quota-mem-types.h b/xlators/features/quota/src/quota-mem-types.h
index 3082865da..97d916568 100644
--- a/xlators/features/quota/src/quota-mem-types.h
+++ b/xlators/features/quota/src/quota-mem-types.h
@@ -21,6 +21,9 @@ enum gf_quota_mem_types_ {
gf_quota_mt_int32_t,
gf_quota_mt_limits_t,
gf_quota_mt_quota_dentry_t,
+ gf_quota_mt_quota_limits_level_t,
+ gf_quota_mt_qd_vols_conf_t,
+ gf_quota_mt_aggregator_state_t,
gf_quota_mt_end
};
#endif
diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c
index 4ea54cca8..2ca4da0c1 100644
--- a/xlators/features/quota/src/quota.c
+++ b/xlators/features/quota/src/quota.c
@@ -12,44 +12,100 @@
#include "quota.h"
#include "common-utils.h"
#include "defaults.h"
+#include "statedump.h"
int32_t
quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
char *name, uuid_t par);
+
+int
+quota_fill_inodectx (xlator_t *this, inode_t *inode, dict_t *dict,
+ loc_t *loc, struct iatt *buf, int32_t *op_errno);
+
struct volume_options options[];
+/*
+ * Allocate a quota_inode_ctx_t, initialise its lock and parents list and
+ * store it in INODE's context for THIS using __inode_ctx_put () (the
+ * double-underscore variant; quota_inode_ctx_get () calls this with
+ * inode->lock held). On success the new ctx is also returned through
+ * CONTEXT when that is non-NULL.
+ *
+ * Returns the result of __inode_ctx_put (), or -1 if INODE is NULL. When
+ * storing fails the ctx is released and CONTEXT is left untouched.
+ * NOTE(review): assumes QUOTA_ALLOC_OR_GOTO allocates with the GF_CALLOC
+ * family so that GF_FREE is the matching release - confirm in quota.h.
+ */
+static int32_t
+__quota_init_inode_ctx (inode_t *inode, xlator_t *this,
+                        quota_inode_ctx_t **context)
+{
+        int32_t            ret = -1;
+        quota_inode_ctx_t *ctx = NULL;
+
+        if (inode == NULL) {
+                goto out;
+        }
+
+        QUOTA_ALLOC_OR_GOTO (ctx, quota_inode_ctx_t, out);
+
+        LOCK_INIT(&ctx->lock);
+
+        INIT_LIST_HEAD (&ctx->parents);
+
+        ret = __inode_ctx_put (inode, this, (uint64_t )(long)ctx);
+        if (ret == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "cannot set quota context in inode (gfid:%s)",
+                        uuid_utoa (inode->gfid));
+                /* nobody else can reach ctx: release it instead of leaking */
+                LOCK_DESTROY (&ctx->lock);
+                GF_FREE (ctx);
+                goto out;
+        }
+
+        if (context != NULL) {
+                *context = ctx;
+        }
+out:
+        return ret;
+}
+
+
+/*
+ * Fetch the quota context stored on INODE for THIS, under inode->lock.
+ * When one exists it is returned through CTX (if CTX is non-NULL). When
+ * none exists and CREATE_IF_ABSENT is set, a new one is created with
+ * __quota_init_inode_ctx ().
+ *
+ * Returns the result of the lookup, or of the creation when one was
+ * attempted.
+ */
+static int32_t
+quota_inode_ctx_get (inode_t *inode, xlator_t *this,
+                     quota_inode_ctx_t **ctx, char create_if_absent)
+{
+        int32_t  ret     = 0;
+        uint64_t ctx_int = 0;
+
+        LOCK (&inode->lock);
+        {
+                ret = __inode_ctx_get (inode, this, &ctx_int);
+
+                if (ret == 0) {
+                        /* a context already exists: never create a second
+                           one, even when the caller does not want it back */
+                        if (ctx != NULL)
+                                *ctx = (quota_inode_ctx_t *) (unsigned long)ctx_int;
+                } else if (create_if_absent) {
+                        ret = __quota_init_inode_ctx (inode, this, ctx);
+                }
+        }
+        UNLOCK (&inode->lock);
+
+        return ret;
+}
+
int
quota_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path)
{
int ret = -1;
- if (!loc) {
+ if (!loc || (inode == NULL))
return ret;
- }
if (inode) {
loc->inode = inode_ref (inode);
+ uuid_copy (loc->gfid, inode->gfid);
}
if (parent) {
loc->parent = inode_ref (parent);
}
- loc->path = gf_strdup (path);
- if (!loc->path) {
- goto loc_wipe;
- }
+ if (path != NULL) {
+ loc->path = gf_strdup (path);
- loc->name = strrchr (loc->path, '/');
- if (loc->name) {
- loc->name++;
- } else {
- goto loc_wipe;
+ loc->name = strrchr (loc->path, '/');
+ if (loc->name) {
+ loc->name++;
+ }
}
ret = 0;
-loc_wipe:
if (ret < 0) {
loc_wipe (loc);
}
@@ -82,7 +138,6 @@ quota_inode_loc_fill (inode_t *inode, loc_t *loc)
gf_log (this->name, GF_LOG_DEBUG,
"cannot find parent for inode (gfid:%s)",
uuid_utoa (inode->gfid));
- goto err;
}
ignore_parent:
@@ -91,7 +146,6 @@ ignore_parent:
gf_log (this->name, GF_LOG_DEBUG,
"cannot construct path for inode (gfid:%s)",
uuid_utoa (inode->gfid));
- goto err;
}
ret = quota_loc_fill (loc, inode, parent, resolvedpath);
@@ -137,8 +191,13 @@ quota_local_new ()
{
quota_local_t *local = NULL;
local = mem_get0 (THIS->local_pool);
- if (local)
- LOCK_INIT (&local->lock);
+ if (local == NULL)
+ goto out;
+
+ LOCK_INIT (&local->lock);
+ local->space_available = -1;
+
+out:
return local;
}
@@ -156,12 +215,15 @@ __quota_dentry_new (quota_inode_ctx_t *ctx, char *name, uuid_t par)
dentry->name = gf_strdup (name);
if (dentry->name == NULL) {
GF_FREE (dentry);
+ dentry = NULL;
goto err;
}
uuid_copy (dentry->par, par);
- list_add_tail (&dentry->next, &ctx->parents);
+ if (ctx != NULL)
+ list_add_tail (&dentry->next, &ctx->parents);
+
err:
return dentry;
}
@@ -182,19 +244,64 @@ out:
return;
}
+/*
+ * Decrement local->link_count under local->lock. When it reaches zero the
+ * stub is detached from the local and resumed after the lock is released.
+ * A NULL local is ignored.
+ */
+static inline void
+quota_link_count_decrement (quota_local_t *local)
+{
+        call_stub_t *resume = NULL;
+        int          remaining = -1;
+
+        if (local == NULL)
+                return;
+
+        LOCK (&local->lock);
+        {
+                remaining = --local->link_count;
+                if (remaining == 0) {
+                        resume = local->stub;
+                        local->stub = NULL;
+                }
+        }
+        UNLOCK (&local->lock);
+
+        if (resume != NULL)
+                call_resume (resume);
+}
+
+/*
+ * Record a validation failure (OP_RET < 0) in LOCAL under its lock, then
+ * drop one link count via quota_link_count_decrement (). A NULL local is
+ * ignored.
+ */
+static inline void
+quota_handle_validate_error (quota_local_t *local, int32_t op_ret,
+                             int32_t op_errno)
+{
+        if (local == NULL)
+                return;
+
+        LOCK (&local->lock);
+        {
+                if (op_ret < 0) {
+                        local->op_ret = op_ret;
+                        local->op_errno = op_errno;
+                }
+        }
+        UNLOCK (&local->lock);
+
+        /* we abort checking limits on this path to root */
+        quota_link_count_decrement (local);
+}
int32_t
quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
{
- quota_local_t *local = NULL;
- uint32_t validate_count = 0, link_count = 0;
- int32_t ret = 0;
- quota_inode_ctx_t *ctx = NULL;
- int64_t *size = 0;
- uint64_t value = 0;
- call_stub_t *stub = NULL;
+ quota_local_t *local = NULL;
+ int32_t ret = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ int64_t *size = 0;
+ uint64_t value = 0;
local = frame->local;
@@ -206,7 +313,7 @@ quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
GF_ASSERT (frame);
GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, unwind, op_errno,
EINVAL);
- GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, dict, unwind, op_errno,
+ GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, unwind, op_errno,
EINVAL);
ret = inode_ctx_get (local->validate_loc.inode, this, &value);
@@ -220,7 +327,7 @@ quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unwind;
}
- ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size);
+ ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size);
if (ret < 0) {
gf_log (this->name, GF_LOG_WARNING,
"size key not present in dict");
@@ -243,25 +350,7 @@ quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
unwind:
- LOCK (&local->lock);
- {
- local->op_ret = -1;
- local->op_errno = op_errno;
-
- validate_count = --local->validate_count;
- link_count = local->link_count;
-
- if ((validate_count == 0) && (link_count == 0)) {
- stub = local->stub;
- local->stub = NULL;
- }
- }
- UNLOCK (&local->lock);
-
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_handle_validate_error (local, op_ret, op_errno);
return 0;
}
@@ -288,34 +377,425 @@ quota_timeout (struct timeval *tv, int32_t timeout)
return timed_out;
}
+/*
+ * Append @dentry to @list unless @list already holds an entry whose parent
+ * gfid (->par) equals dentry->par.
+ *
+ * When a duplicate is found, or when either argument is NULL, @dentry is
+ * left unlinked and nothing else is done with it; the caller keeps
+ * ownership and has to release it.
+ */
+static inline void
+quota_add_parent (quota_dentry_t *dentry, struct list_head *list)
+{
+        quota_dentry_t *entry = NULL;
+
+        if ((dentry == NULL) || (list == NULL)) {
+                goto out;
+        }
+
+        list_for_each_entry (entry, list, next) {
+                if (uuid_compare (dentry->par, entry->par) == 0) {
+                        /* this parent is already recorded in the list */
+                        goto out;
+                }
+        }
+
+        list_add_tail (&dentry->next, list);
+
+out:
+        return;
+}
int32_t
-quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
- char *name, uuid_t par)
+quota_build_ancestry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ gf_dirent_t *entries, dict_t *xdata)
{
- int32_t ret = -1;
- inode_t *_inode = NULL, *parent = NULL;
- quota_inode_ctx_t *ctx = NULL;
- quota_priv_t *priv = NULL;
- quota_local_t *local = NULL;
- char need_validate = 0, need_unwind = 0;
- int64_t delta = 0;
- call_stub_t *stub = NULL;
- int32_t validate_count = 0, link_count = 0;
- uint64_t value = 0;
- char just_validated = 0;
- uuid_t trav_uuid = {0,};
-
- GF_VALIDATE_OR_GOTO ("quota", this, out);
- GF_VALIDATE_OR_GOTO (this->name, frame, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ inode_t *parent = NULL, *tmp_parent = NULL;
+ gf_dirent_t *entry = NULL;
+ loc_t loc = {0, };
+ quota_dentry_t *dentry = NULL, *tmp = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ struct list_head parents = {0, };
+ quota_local_t *local = NULL;
+
+ INIT_LIST_HEAD (&parents);
local = frame->local;
- GF_VALIDATE_OR_GOTO (this->name, local, out);
+ frame->local = NULL;
+
+ if (op_ret < 0)
+ goto err;
+
+ parent = inode_parent (local->loc.inode, 0, NULL);
+ if (parent == NULL) {
+ gf_log (this->name, GF_LOG_WARNING, "parent is NULL");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if ((op_ret > 0) && (entries != NULL)) {
+ list_for_each_entry (entry, &entries->list, list) {
+ if (__is_root_gfid (entry->inode->gfid)) {
+ /* The list contains a sub-list for each
+ * possible path to the target inode. Each
+ * sub-list starts with the root entry of the
+ * tree and is followed by the child entries
+ * for a particular path to the target entry.
+ * The root entry is an implied sub-list
+ * delimiter, as it denotes we have started
+ * processing a new path. Reset the parent
+ * pointer and continue
+ */
+
+ tmp_parent = NULL;
+ }
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+
+ loc.inode = inode_ref (entry->inode);
+ loc.parent = inode_ref (tmp_parent);
+ loc.name = entry->d_name;
+
+ quota_fill_inodectx (this, entry->inode, entry->dict,
+ &loc, &entry->d_stat, &op_errno);
+
+ tmp_parent = entry->inode;
+
+ loc_wipe (&loc);
+ }
+ }
+
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
+
+ if (ctx != NULL) {
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ /* we built ancestry for a non-directory */
+ tmp = __quota_dentry_new (NULL, dentry->name,
+ dentry->par);
+ quota_add_parent (tmp, &parents);
+
+ if (list_empty (&tmp->next)) {
+ __quota_dentry_free (tmp);
+ tmp = NULL;
+ }
+ }
+ }
+ UNLOCK (&ctx->lock);
+ }
+
+ if (list_empty (&parents)) {
+ /* we built ancestry for a directory */
+ list_for_each_entry (entry, &entries->list, list) {
+ if (entry->inode == local->loc.inode)
+ break;
+ }
+
+ GF_ASSERT (&entry->list != &entries->list);
+
+ tmp = __quota_dentry_new (NULL, entry->d_name, parent->gfid);
+ quota_add_parent (tmp, &parents);
+ }
+
+ local->ancestry_cbk (&parents, local->loc.inode, 0, 0,
+ local->ancestry_data);
+ goto cleanup;
+
+err:
+ local->ancestry_cbk (NULL, NULL, -1, op_errno, local->ancestry_data);
+
+cleanup:
+ STACK_DESTROY (frame->root);
+ quota_local_cleanup (this, local);
+
+ if (parent != NULL) {
+ inode_unref (parent);
+ parent = NULL;
+ }
+
+ list_for_each_entry_safe (dentry, tmp, &parents, next) {
+ __quota_dentry_free (dentry);
+ }
+
+ return 0;
+}
+
+/*
+ * Callback for the open/opendir wound by quota_build_ancestry ().
+ *
+ * On success a readdirp is wound on @fd with QUOTA_LIMIT_KEY and
+ * GET_ANCESTRY_DENTRY_KEY set in xdata; quota_build_ancestry_cbk receives
+ * the result. On any failure the ancestry callback stored in frame->local
+ * is invoked with op_ret -1 and the current op_errno, after which the local
+ * and the frame are destroyed here.
+ *
+ * The reference on @fd is dropped before returning on every path.
+ * Always returns 0.
+ */
+int32_t
+quota_build_ancestry_open_cbk (call_frame_t *frame, void *cookie,
+                               xlator_t *this, int32_t op_ret, int32_t op_errno,
+                               fd_t *fd, dict_t *xdata)
+{
+        dict_t        *xdata_req = NULL;
+        quota_local_t *local     = NULL;
+
+        if (op_ret < 0) {
+                goto err;
+        }
+
+        xdata_req = dict_new ();
+        if (xdata_req == NULL) {
+                /* op_errno is what reaches the ancestry callback, so the
+                 * failure reason has to be stored there, not in op_ret.
+                 */
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        op_ret = dict_set_int8 (xdata_req, QUOTA_LIMIT_KEY, 1);
+        if (op_ret < 0) {
+                op_errno = -op_ret;
+                goto err;
+        }
+
+        op_ret = dict_set_int8 (xdata_req, GET_ANCESTRY_DENTRY_KEY, 1);
+        if (op_ret < 0) {
+                op_errno = -op_ret;
+                goto err;
+        }
+
+        /* This would ask posix layer to construct dentry chain till root */
+        STACK_WIND (frame, quota_build_ancestry_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->readdirp, fd, 0, 0, xdata_req);
+
+        op_ret = 0;
+
+err:
+        if (fd != NULL) {
+                fd_unref (fd);
+        }
+
+        /* xdata_req is still NULL when we bailed out before dict_new () */
+        if (xdata_req != NULL) {
+                dict_unref (xdata_req);
+        }
+
+        if (op_ret < 0) {
+                local = frame->local;
+                frame->local = NULL;
+
+                local->ancestry_cbk (NULL, NULL, -1, op_errno,
+                                     local->ancestry_data);
+                quota_local_cleanup (this, local);
+                STACK_DESTROY (frame->root);
+        }
+
+        return 0;
+}
+
+/*
+ * Start building the ancestry of @inode.
+ *
+ * A new frame (uid/gid 0) is created and an opendir (for directories) or
+ * open (otherwise) is wound to the first child with
+ * quota_build_ancestry_open_cbk as callback. @ancestry_cbk and @data are
+ * stored in the frame's local so the callbacks can report the result.
+ *
+ * If setup fails, @ancestry_cbk is invoked right here with op_ret -1 and
+ * the errno describing the failure. The function itself returns 0 on every
+ * path, so the outcome is only visible through @ancestry_cbk.
+ */
+int
+quota_build_ancestry (inode_t *inode, quota_ancestry_built_t ancestry_cbk,
+                      void *data)
+{
+        loc_t          loc       = {0, };
+        fd_t          *fd        = NULL;
+        quota_local_t *local     = NULL;
+        call_frame_t  *new_frame = NULL;
+        int            op_errno  = EINVAL;
+        xlator_t      *this      = NULL;
+
+        this = THIS;
+
+        loc.inode = inode_ref (inode);
+        uuid_copy (loc.gfid, inode->gfid);
+
+        fd = fd_create (inode, 0);
+        if (fd == NULL) {
+                /* never wind an open with a NULL fd */
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        new_frame = create_frame (this, this->ctx->pool);
+        if (new_frame == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        new_frame->root->uid = new_frame->root->gid = 0;
+
+        local = quota_local_new ();
+        if (local == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        new_frame->local = local;
+        local->ancestry_cbk = ancestry_cbk;
+        local->ancestry_data = data;
+        local->loc.inode = inode_ref (inode);
+
+        if (IA_ISDIR (inode->ia_type)) {
+                STACK_WIND (new_frame, quota_build_ancestry_open_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->opendir, &loc, fd,
+                            NULL);
+        } else {
+                STACK_WIND (new_frame, quota_build_ancestry_open_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->open, &loc, 0, fd,
+                            NULL);
+        }
+
+        loc_wipe (&loc);
+        return 0;
+
+err:
+        ancestry_cbk (NULL, NULL, -1, op_errno, data);
+
+        if (fd != NULL) {
+                fd_unref (fd);
+        }
+
+        /* new_frame is NULL when fd_create () or create_frame () failed, so
+         * it must not be dereferenced unconditionally.
+         */
+        if (new_frame != NULL) {
+                local = new_frame->local;
+                new_frame->local = NULL;
+
+                if (local != NULL) {
+                        quota_local_cleanup (this, local);
+                }
+
+                STACK_DESTROY (new_frame->root);
+        }
+
+        loc_wipe (&loc);
+        return 0;
+}
+
+/*
+ * Ask the quota enforcer for fresh data on @inode.
+ *
+ * local->validate_loc is refilled for @inode under local->lock, then a
+ * lookup carrying QUOTA_SIZE_KEY and "volume-uuid" in xdata is issued via
+ * quota_enforcer_lookup (); @cbk_fn is passed on as its callback.
+ *
+ * Returns 0 when the lookup was issued, -ENOMEM when the loc or the xdata
+ * dict could not be set up, and -ENOTCONN when quota_enforcer_lookup ()
+ * failed.
+ */
+int
+quota_validate (call_frame_t *frame, inode_t *inode, xlator_t *this,
+                fop_lookup_cbk_t cbk_fn)
+{
+        quota_local_t *local = NULL;
+        int            ret   = 0;
+        dict_t        *xdata = NULL;
+        quota_priv_t  *priv  = NULL;
+
+        local = frame->local;
+        priv = this->private;
+
+        LOCK (&local->lock);
+        {
+                loc_wipe (&local->validate_loc);
+
+                ret = quota_inode_loc_fill (inode, &local->validate_loc);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "cannot fill loc for inode (gfid:%s), hence "
+                                "aborting quota-checks and continuing with fop",
+                                uuid_utoa (inode->gfid));
+                }
+        }
+        UNLOCK (&local->lock);
+
+        if (ret < 0) {
+                ret = -ENOMEM;
+                goto err;
+        }
+
+        xdata = dict_new ();
+        if (xdata == NULL) {
+                ret = -ENOMEM;
+                goto err;
+        }
+
+        ret = dict_set_int8 (xdata, QUOTA_SIZE_KEY, 1);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_WARNING, "dict set failed");
+                ret = -ENOMEM;
+                goto err;
+        }
+
+        ret = dict_set_str (xdata, "volume-uuid", priv->volume_uuid);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_WARNING, "dict set failed");
+                ret = -ENOMEM;
+                goto err;
+        }
+
+        ret = quota_enforcer_lookup (frame, this, &local->validate_loc, xdata,
+                                     cbk_fn);
+        if (ret < 0) {
+                ret = -ENOTCONN;
+                goto err;
+        }
+
+        ret = 0;
+err:
+        /* Drop the reference taken by dict_new (); without this the dict
+         * leaks on every call.
+         * NOTE(review): assumes quota_enforcer_lookup () consumes xdata
+         * synchronously or takes its own reference - confirm.
+         */
+        if (xdata != NULL) {
+                dict_unref (xdata);
+        }
+
+        return ret;
+}
+
+/*
+ * Ancestry callback used to resume limit checking.
+ *
+ * @parents is a list of quota_dentry_t describing the parents of @inode and
+ * @data is the call frame of the fop being checked. If ancestry building
+ * failed (op_ret < 0), or produced no parents (reported as EIO), the error
+ * is handed to quota_handle_validate_error (). Otherwise
+ * quota_check_limit () is run once for every parent.
+ */
+void
+quota_check_limit_continuation (struct list_head *parents, inode_t *inode,
+                                int32_t op_ret, int32_t op_errno, void *data)
+{
+        call_frame_t   *frame        = NULL;
+        xlator_t       *this         = NULL;
+        quota_local_t  *local        = NULL;
+        quota_dentry_t *entry        = NULL;
+        inode_t        *parent       = NULL;
+        int             parent_count = 0;
+
+        frame = data;
+        local = frame->local;
+        this = THIS;
+
+        /* parents is only inspected when op_ret >= 0 */
+        if ((op_ret < 0) || list_empty (parents)) {
+                if (op_ret >= 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Couldn't build ancestry for inode (gfid:%s). "
+                                "Without knowing ancestors till root, quota "
+                                "cannot be enforced. "
+                                "Hence, failing fop with EIO",
+                                uuid_utoa (inode->gfid));
+                        op_errno = EIO;
+                }
+
+                quota_handle_validate_error (local, -1, op_errno);
+                goto out;
+        }
+
+        list_for_each_entry (entry, parents, next) {
+                parent_count++;
+        }
+
+        /* presumably the path being continued already accounts for one
+         * pending check, hence only the additional parents are added -
+         * verify against the callers that set link_count.
+         */
+        LOCK (&local->lock);
+        {
+                local->link_count += (parent_count - 1);
+        }
+        UNLOCK (&local->lock);
+
+        list_for_each_entry (entry, parents, next) {
+                parent = inode_find (inode->table, entry->par);
+
+                quota_check_limit (frame, parent, this, NULL, NULL);
+
+                /* release the reference obtained from inode_find (); it
+                 * was previously leaked once per parent.
+                 * NOTE(review): relies on quota_check_limit () holding its
+                 * own reference while it walks up - confirm.
+                 */
+                if (parent != NULL) {
+                        inode_unref (parent);
+                        parent = NULL;
+                }
+        }
+
+out:
+        return;
+}
+
+int32_t
+quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
+ char *name, uuid_t par)
+{
+ int32_t ret = -1, op_errno = EINVAL;
+ inode_t *_inode = NULL, *parent = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_priv_t *priv = NULL;
+ quota_local_t *local = NULL;
+ char need_validate = 0;
+ gf_boolean_t hard_limit_exceeded = 0;
+ int64_t delta = 0, wouldbe_size = 0;
+ int64_t space_available = 0;
+ uint64_t value = 0;
+ char just_validated = 0;
+ uuid_t trav_uuid = {0,};
+ uint32_t timeout = 0;
+
+ GF_VALIDATE_OR_GOTO ("quota", this, err);
+ GF_VALIDATE_OR_GOTO (this->name, frame, err);
+ GF_VALIDATE_OR_GOTO (this->name, inode, err);
+
+ local = frame->local;
+ GF_VALIDATE_OR_GOTO (this->name, local, err);
delta = local->delta;
- GF_VALIDATE_OR_GOTO (this->name, local->stub, out);
+ GF_VALIDATE_OR_GOTO (this->name, local->stub, err);
+ /* Allow all the trusted clients
+ * Don't block the gluster internal processes like rebalance, gsyncd,
+ * self heal etc from the disk quotas.
+ *
+ * Method: Allow all the clients with PID negative. This is by the
+ * assumption that any kernel assigned pid doesn't have the negative
+ * number.
+ */
+ if (0 > frame->root->pid) {
+ ret = 0;
+ quota_link_count_decrement (local);
+ goto done;
+ }
priv = this->private;
@@ -328,10 +808,6 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
{
just_validated = local->just_validated;
local->just_validated = 0;
-
- if (just_validated) {
- local->validate_count--;
- }
}
UNLOCK (&local->lock);
@@ -340,34 +816,68 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
}
do {
- if (ctx != NULL) {
+ if (ctx != NULL && (ctx->hard_lim > 0 || ctx->soft_lim > 0)) {
+ wouldbe_size = ctx->size + delta;
+
LOCK (&ctx->lock);
{
- if (ctx->limit >= 0) {
- if (!just_validated
- && quota_timeout (&ctx->tv,
- priv->timeout)) {
- need_validate = 1;
- } else if ((ctx->size + delta)
- >= ctx->limit) {
- local->op_ret = -1;
- local->op_errno = EDQUOT;
- need_unwind = 1;
- }
+ timeout = priv->soft_timeout;
+
+ if ((ctx->soft_lim >= 0)
+ && (wouldbe_size > ctx->soft_lim)) {
+ timeout = priv->hard_timeout;
+ }
+
+ if (!just_validated
+ && quota_timeout (&ctx->tv, timeout)) {
+ need_validate = 1;
+ } else if (wouldbe_size >= ctx->hard_lim) {
+ hard_limit_exceeded = 1;
}
}
UNLOCK (&ctx->lock);
if (need_validate) {
- goto validate;
- }
+ ret = quota_validate (frame, _inode, this,
+ quota_validate_cbk);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto err;
+ }
- if (need_unwind) {
break;
}
+
+ if (hard_limit_exceeded) {
+ local->op_ret = -1;
+ local->op_errno = EDQUOT;
+
+ space_available = ctx->hard_lim - ctx->size;
+
+ if (space_available < 0)
+ space_available = 0;
+
+ if ((local->space_available < 0)
+ || (local->space_available
+ > space_available)){
+ local->space_available
+ = space_available;
+
+ }
+
+ if (space_available == 0) {
+ op_errno = EDQUOT;
+ goto err;
+ }
+ }
+
+ /* We log usage only if quota limit is configured on
+ that inode. */
+ quota_log_usage (this, ctx, _inode, delta);
}
if (__is_root_gfid (_inode->gfid)) {
+ quota_link_count_decrement (local);
break;
}
@@ -379,10 +889,15 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
}
if (parent == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "cannot find parent for inode (gfid:%s), hence "
- "aborting enforcing quota-limits and continuing"
- " with the fop", uuid_utoa (_inode->gfid));
+ ret = quota_build_ancestry (_inode,
+ quota_check_limit_continuation,
+ frame);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ break;
}
inode_unref (_inode);
@@ -398,253 +913,105 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
ctx = (quota_inode_ctx_t *)(unsigned long)value;
} while (1);
- ret = 0;
-
if (_inode != NULL) {
inode_unref (_inode);
+ _inode = NULL;
}
- LOCK (&local->lock);
- {
- validate_count = local->validate_count;
- link_count = local->link_count;
- if ((validate_count == 0) && (link_count == 0)) {
- stub = local->stub;
- local->stub = NULL;
- }
- }
- UNLOCK (&local->lock);
-
- if (stub != NULL) {
- call_resume (stub);
- }
-
-out:
- return ret;
-
-validate:
- LOCK (&local->lock);
- {
- loc_wipe (&local->validate_loc);
-
- if (just_validated) {
- local->validate_count--;
- }
-
- local->validate_count++;
- ret = quota_inode_loc_fill (_inode, &local->validate_loc);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "cannot fill loc for inode (gfid:%s), hence "
- "aborting quota-checks and continuing with fop",
- uuid_utoa (_inode->gfid));
- local->validate_count--;
- }
- }
- UNLOCK (&local->lock);
-
- if (ret < 0) {
- goto loc_fill_failed;
- }
+done:
+ return 0;
- STACK_WIND (frame, quota_validate_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, &local->validate_loc,
- QUOTA_SIZE_KEY, NULL);
+err:
+ quota_handle_validate_error (local, -1, op_errno);
-loc_fill_failed:
inode_unref (_inode);
return 0;
}
-
-int32_t
-quota_get_limit_value (inode_t *inode, xlator_t *this, int64_t *n)
+static inline int
+quota_get_limits (xlator_t *this, dict_t *dict, int64_t *hard_lim,
+ int64_t *soft_lim)
{
- int32_t ret = 0;
- char *path = NULL;
- limits_t *limit_node = NULL;
- quota_priv_t *priv = NULL;
+ quota_limit_t *limit = NULL;
+ quota_priv_t *priv = NULL;
+ int64_t soft_lim_percent = 0, *ptr = NULL;
+ int ret = 0;
- if (inode == NULL || n == NULL) {
- ret = -1;
+ if ((this == NULL) || (dict == NULL) || (hard_lim == NULL)
+ || (soft_lim == NULL))
goto out;
- }
-
- *n = 0;
-
- ret = inode_path (inode, NULL, &path);
- if (ret < 0) {
- ret = -1;
- goto out;
- }
priv = this->private;
- list_for_each_entry (limit_node, &priv->limit_head, limit_list) {
- if (strcmp (limit_node->path, path) == 0) {
- *n = limit_node->value;
- break;
- }
- }
-
-out:
- GF_FREE (path);
-
- return ret;
-}
-
-
-static int32_t
-__quota_init_inode_ctx (inode_t *inode, int64_t limit, xlator_t *this,
- dict_t *dict, struct iatt *buf,
- quota_inode_ctx_t **context)
-{
- int32_t ret = -1;
- int64_t *size = 0;
- quota_inode_ctx_t *ctx = NULL;
+ ret = dict_get_bin (dict, QUOTA_LIMIT_KEY, (void **) &ptr);
+ limit = (quota_limit_t *)ptr;
- if (inode == NULL) {
- goto out;
+ if (limit) {
+ *hard_lim = ntoh64 (limit->hard_lim);
+ soft_lim_percent = ntoh64 (limit->soft_lim_percent);
}
- QUOTA_ALLOC_OR_GOTO (ctx, quota_inode_ctx_t, out);
-
- ctx->limit = limit;
- if (buf)
- ctx->buf = *buf;
-
- LOCK_INIT(&ctx->lock);
-
- if (context != NULL) {
- *context = ctx;
+ if (soft_lim_percent < 0) {
+ soft_lim_percent = priv->default_soft_lim;
}
- INIT_LIST_HEAD (&ctx->parents);
-
- if (dict != NULL) {
- ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size);
- if (ret == 0) {
- ctx->size = ntoh64 (*size);
- gettimeofday (&ctx->tv, NULL);
- }
+ if ((*hard_lim > 0) && (soft_lim_percent > 0)) {
+ *soft_lim = (soft_lim_percent * (*hard_lim))/100;
}
- ret = __inode_ctx_put (inode, this, (uint64_t )(long)ctx);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_WARNING,
- "cannot set quota context in inode (gfid:%s)",
- uuid_utoa (inode->gfid));
- }
out:
- return ret;
-}
-
-
-static int32_t
-quota_inode_ctx_get (inode_t *inode, int64_t limit, xlator_t *this,
- dict_t *dict, struct iatt *buf, quota_inode_ctx_t **ctx,
- char create_if_absent)
-{
- int32_t ret = 0;
- uint64_t ctx_int;
-
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx_int);
-
- if ((ret == 0) && (ctx != NULL)) {
- *ctx = (quota_inode_ctx_t *) (unsigned long)ctx_int;
- } else if (create_if_absent) {
- ret = __quota_init_inode_ctx (inode, limit, this, dict,
- buf, ctx);
- }
- }
- UNLOCK (&inode->lock);
-
- return ret;
+ return 0;
}
-
-int32_t
-quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *dict, struct iatt *postparent)
+int
+quota_fill_inodectx (xlator_t *this, inode_t *inode, dict_t *dict,
+ loc_t *loc, struct iatt *buf, int32_t *op_errno)
{
- int32_t ret = -1;
- char found = 0;
- quota_local_t *local = NULL;
- quota_inode_ctx_t *ctx = NULL;
- quota_dentry_t *dentry = NULL;
- int64_t *size = 0;
- uint64_t value = 0;
- limits_t *limit_node = NULL;
- quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ char found = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_dentry_t *dentry = NULL;
+ uint64_t value = 0;
+ int64_t hard_lim = -1, soft_lim = -1;
- local = frame->local;
-
- priv = this->private;
+ quota_get_limits (this, dict, &hard_lim, &soft_lim);
inode_ctx_get (inode, this, &value);
ctx = (quota_inode_ctx_t *)(unsigned long)value;
- if ((op_ret < 0) || (local == NULL)
- || (((ctx == NULL) || (ctx->limit == local->limit))
- && (local->limit < 0) && !((IA_ISREG (buf->ia_type))
- || (IA_ISLNK (buf->ia_type))))) {
- goto unwind;
- }
-
- LOCK (&priv->lock);
- {
- list_for_each_entry (limit_node, &priv->limit_head,
- limit_list) {
- if (strcmp (local->loc.path, limit_node->path) == 0) {
- uuid_copy (limit_node->gfid, buf->ia_gfid);
- break;
- }
- }
+ if ((((ctx == NULL) || (ctx->hard_lim == hard_lim))
+ && (hard_lim < 0) && !QUOTA_REG_OR_LNK_FILE (buf->ia_type))) {
+ ret = 0;
+ goto out;
}
- UNLOCK (&priv->lock);
- ret = quota_inode_ctx_get (local->loc.inode, local->limit, this, dict,
- buf, &ctx, 1);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 1);
if ((ret == -1) || (ctx == NULL)) {
gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
"context in inode(gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
+ uuid_utoa (inode->gfid));
+ ret = -1;
+ *op_errno = ENOMEM;
+ goto out;
}
LOCK (&ctx->lock);
{
-
- if (dict != NULL) {
- ret = dict_get_bin (dict, QUOTA_SIZE_KEY,
- (void **) &size);
- if (ret == 0) {
- ctx->size = ntoh64 (*size);
- gettimeofday (&ctx->tv, NULL);
- }
- }
-
- if (local->limit != ctx->limit) {
- ctx->limit = local->limit;
- }
+ ctx->hard_lim = hard_lim;
+ ctx->soft_lim = soft_lim;
ctx->buf = *buf;
- if (!(IA_ISREG (buf->ia_type) || IA_ISLNK (buf->ia_type))) {
+ if (!QUOTA_REG_OR_LNK_FILE (buf->ia_type)) {
goto unlock;
}
- if (local->loc.name == NULL)
+ if (loc->name == NULL)
goto unlock;
list_for_each_entry (dentry, &ctx->parents, next) {
- if ((strcmp (dentry->name, local->loc.name) == 0) &&
- (uuid_compare (local->loc.parent->gfid,
+ if ((strcmp (dentry->name, loc->name) == 0) &&
+ (uuid_compare (loc->parent->gfid,
dentry->par) == 0)) {
found = 1;
break;
@@ -653,18 +1020,18 @@ quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!found) {
dentry = __quota_dentry_new (ctx,
- (char *)local->loc.name,
- local->loc.parent->gfid);
+ (char *)loc->name,
+ loc->parent->gfid);
if (dentry == NULL) {
/*
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"cannot create a new dentry (par:%"
PRId64", name:%s) for inode(ino:%"
PRId64", gfid:%s)",
uuid_utoa (local->loc.inode->gfid));
*/
- op_ret = -1;
- op_errno = ENOMEM;
+ ret = -1;
+ *op_errno = ENOMEM;
goto unlock;
}
}
@@ -672,6 +1039,25 @@ quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
unlock:
UNLOCK (&ctx->lock);
+out:
+ return ret;
+}
+
+int32_t
+quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *dict, struct iatt *postparent)
+{
+ quota_local_t *local = NULL;
+
+ if (op_ret < 0)
+ goto unwind;
+
+ local = frame->local;
+
+ op_ret = quota_fill_inodectx (this, inode, dict, &local->loc, buf,
+ &op_errno);
+
unwind:
QUOTA_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
dict, postparent);
@@ -683,133 +1069,55 @@ int32_t
quota_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
dict_t *xattr_req)
{
- int32_t ret = -1;
- int64_t limit = -1;
- limits_t *limit_node = NULL;
- gf_boolean_t dict_newed = _gf_false;
- quota_priv_t *priv = NULL;
- quota_local_t *local = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ quota_local_t *local = NULL;
priv = this->private;
- list_for_each_entry (limit_node, &priv->limit_head, limit_list) {
- if (strcmp (limit_node->path, loc->path) == 0) {
- limit = limit_node->value;
- }
- }
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
- local = quota_local_new ();
- if (local == NULL) {
+ xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new();
+ if (!xattr_req)
goto err;
- }
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
+ local = quota_local_new ();
+ if (local == NULL) {
goto err;
}
frame->local = local;
+ loc_copy (&local->loc, loc);
- local->limit = limit;
-
- if (limit < 0) {
- goto wind;
- }
-
- if (xattr_req == NULL) {
- xattr_req = dict_new ();
- dict_newed = _gf_true;
- }
-
- ret = dict_set_uint64 (xattr_req, QUOTA_SIZE_KEY, 0);
+ ret = dict_set_int8 (xattr_req, QUOTA_LIMIT_KEY, 1);
if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict set of key for hard-limit failed");
goto err;
}
-wind:
STACK_WIND (frame, quota_lookup_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
ret = 0;
err:
+ if (xattr_req)
+ dict_unref (xattr_req);
+
if (ret < 0) {
QUOTA_STACK_UNWIND (lookup, frame, -1, ENOMEM,
NULL, NULL, NULL, NULL);
}
- if (dict_newed == _gf_true) {
- dict_unref (xattr_req);
- }
-
return 0;
-}
-
-
-void
-quota_update_size (xlator_t *this, inode_t *inode, char *name, uuid_t par,
- int64_t delta)
-{
- inode_t *_inode = NULL;
- inode_t *parent = NULL;
- uint64_t value = 0;
- quota_inode_ctx_t *ctx = NULL;
- uuid_t trav_uuid = {0,};
-
- GF_VALIDATE_OR_GOTO ("quota", this, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
-
- inode_ctx_get (inode, this, &value);
- ctx = (quota_inode_ctx_t *)(unsigned long)value;
-
- _inode = inode_ref (inode);
-
- if ( par != NULL ) {
- uuid_copy (trav_uuid, par);
- }
-
- do {
- if ((ctx != NULL) && (ctx->limit >= 0)) {
- LOCK (&ctx->lock);
- {
- ctx->size += delta;
- }
- UNLOCK (&ctx->lock);
- }
-
- if (__is_root_gfid (_inode->gfid)) {
- break;
- }
-
- parent = inode_parent (_inode, trav_uuid, name);
- if (parent == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "cannot find parent for inode (gfid:%s), hence "
- "aborting size updation of parents",
- uuid_utoa (_inode->gfid));
- }
- if (name != NULL) {
- name = NULL;
- uuid_clear (trav_uuid);
- }
-
- inode_unref (_inode);
- _inode = parent;
-
- if (_inode == NULL) {
- break;
- }
-
- inode_ctx_get (_inode, this, &value);
- ctx = (quota_inode_ctx_t *)(unsigned long)value;
- } while (1);
-
-out:
- return;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+ return 0;
}
-
int32_t
quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
@@ -819,8 +1127,6 @@ quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uint64_t ctx_int = 0;
quota_inode_ctx_t *ctx = NULL;
quota_local_t *local = NULL;
- quota_dentry_t *dentry = NULL;
- int64_t delta = 0;
local = frame->local;
@@ -850,12 +1156,6 @@ quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&ctx->lock);
- list_for_each_entry (dentry, &ctx->parents, next) {
- delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512;
- quota_update_size (this, local->loc.inode,
- dentry->name, dentry->par, delta);
- }
-
out:
QUOTA_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
@@ -869,8 +1169,13 @@ quota_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t off,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- quota_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ quota_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+ struct iovec *new_vector = NULL;
+ int32_t new_count = 0;
+
+ priv = this->private;
local = frame->local;
if (local == NULL) {
@@ -880,12 +1185,38 @@ quota_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (local->op_ret == -1) {
op_errno = local->op_errno;
- goto unwind;
+
+ if ((op_errno == EDQUOT) && (local->space_available > 0)) {
+ new_count = iov_subset (vector, count, 0,
+ local->space_available, NULL);
+
+ new_vector = GF_CALLOC (new_count,
+ sizeof (struct iovec),
+ gf_common_mt_iovec);
+ if (new_vector == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ new_count = iov_subset (vector, count, 0,
+ local->space_available,
+ new_vector);
+
+ vector = new_vector;
+ count = new_count;
+ } else {
+ goto unwind;
+ }
}
- STACK_WIND (frame, quota_writev_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev, fd, vector, count, off,
- flags, iobref, xdata);
+ STACK_WIND (frame, quota_writev_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, fd,
+ vector, count, off, flags, iobref, xdata);
+
+ if (new_vector != NULL)
+ GF_FREE (new_vector);
+
return 0;
unwind:
@@ -899,14 +1230,21 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t off,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t ret = -1, op_errno = EINVAL;
int32_t parents = 0;
uint64_t size = 0;
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
- quota_priv_t *priv = NULL;
+ quota_dentry_t *dentry = NULL, *tmp = NULL;
call_stub_t *stub = NULL;
- quota_dentry_t *dentry = NULL;
+ struct list_head head = {0, };
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ INIT_LIST_HEAD (&head);
GF_ASSERT (frame);
GF_VALIDATE_OR_GOTO ("quota", this, unwind);
@@ -920,12 +1258,13 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
frame->local = local;
local->loc.inode = inode_ref (fd->inode);
- ret = quota_inode_ctx_get (fd->inode, -1, this, NULL, NULL, &ctx, 0);
+ ret = quota_inode_ctx_get (fd->inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (fd->inode->gfid));
- goto unwind;
}
stub = fop_writev_stub (frame, quota_writev_helper, fd, vector, count,
@@ -939,40 +1278,38 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
GF_VALIDATE_OR_GOTO (this->name, priv, unwind);
size = iov_length (vector, count);
- LOCK (&ctx->lock);
- {
- list_for_each_entry (dentry, &ctx->parents, next) {
- parents++;
- }
- }
- UNLOCK (&ctx->lock);
-
- local->delta = size;
- local->stub = stub;
- local->link_count = parents;
-
- list_for_each_entry (dentry, &ctx->parents, next) {
- ret = quota_check_limit (frame, fd->inode, this, dentry->name,
- dentry->par);
- if (ret == -1) {
- break;
+ if (ctx != NULL) {
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ tmp = __quota_dentry_new (NULL, dentry->name,
+ dentry->par);
+ list_add_tail (&tmp->next, &head);
+ parents++;
+ }
}
+ UNLOCK (&ctx->lock);
}
- stub = NULL;
-
LOCK (&local->lock);
{
- local->link_count = 0;
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
+ local->delta = size;
+ local->link_count = (parents != 0) ? parents : 1;
+ local->stub = stub;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
+ if (parents == 0) {
+ /* nameless lookup on this inode, allow quota to reconstruct
+ * ancestry as part of check_limit.
+ */
+ quota_check_limit (frame, fd->inode, this, NULL, NULL);
+ } else {
+ list_for_each_entry_safe (dentry, tmp, &head, next) {
+ quota_check_limit (frame, fd->inode, this, dentry->name,
+ dentry->par);
+ __quota_dentry_free (dentry);
+ }
}
return 0;
@@ -980,6 +1317,12 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
unwind:
QUOTA_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd,
+ vector, count, off, flags, iobref, xdata);
+ return 0;
}
@@ -1014,8 +1357,10 @@ quota_mkdir_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto unwind;
}
- STACK_WIND (frame, quota_mkdir_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+ STACK_WIND (frame, quota_mkdir_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc,
+ mode, umask, xdata);
+
return 0;
unwind:
@@ -1029,9 +1374,14 @@ int32_t
quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
mode_t umask, dict_t *xdata)
{
- int32_t ret = 0, op_errno = 0;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = 0, op_errno = 0;
+ quota_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -1041,8 +1391,6 @@ quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
frame->local = local;
- local->link_count = 1;
-
ret = loc_copy (&local->loc, loc);
if (ret) {
op_errno = ENOMEM;
@@ -1057,34 +1405,29 @@ quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
goto err;
}
- local->stub = stub;
- local->delta = 0;
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
-
- local->link_count = 0;
+ local->stub = stub;
+ local->delta = 0;
+ local->link_count = 1;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
+
err:
QUOTA_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL,
NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir,
+ loc, mode, umask, xdata);
+
+ return 0;
}
@@ -1104,7 +1447,7 @@ quota_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unwind;
}
- ret = quota_inode_ctx_get (inode, -1, this, NULL, buf, &ctx, 1);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 1);
if ((ret == -1) || (ctx == NULL)) {
gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
"context in inode(gfid:%s)",
@@ -1147,8 +1490,12 @@ quota_create_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
local = frame->local;
+
+ priv = this->private;
+
if (local == NULL) {
gf_log (this->name, GF_LOG_WARNING, "local is NULL");
goto unwind;
@@ -1159,9 +1506,10 @@ quota_create_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto unwind;
}
- STACK_WIND (frame, quota_create_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create, loc, flags, mode, umask,
- fd, xdata);
+
+ STACK_WIND (frame, quota_create_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc,
+ flags, mode, umask, fd, xdata);
return 0;
unwind:
@@ -1175,12 +1523,19 @@ int32_t
quota_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- int32_t ret = -1;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ quota_local_t *local = NULL;
+ int32_t op_errno = 0;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
+ op_errno = ENOMEM;
goto err;
}
@@ -1189,6 +1544,7 @@ quota_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
ret = loc_copy (&local->loc, loc);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "loc_copy failed");
+ op_errno = ENOMEM;
goto err;
}
@@ -1198,33 +1554,26 @@ quota_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
- local->delta = 0;
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- local->link_count = 0;
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
+ local->link_count = 1;
+ local->stub = stub;
+ local->delta = 0;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
err:
- QUOTA_STACK_UNWIND (create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
- NULL, NULL);
+ QUOTA_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
+
+ return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->create, loc,
+ flags, mode, umask, fd, xdata);
return 0;
}
@@ -1237,6 +1586,8 @@ quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
uint64_t value = 0;
+ quota_dentry_t *dentry = NULL;
+ quota_dentry_t *old_dentry = NULL;
if (op_ret < 0) {
goto out;
@@ -1254,9 +1605,20 @@ quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_update_size (this, local->loc.inode, (char *)local->loc.name,
- local->loc.parent->gfid,
- (-(ctx->buf.ia_blocks * 512)));
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ if ((strcmp (dentry->name, local->loc.name) == 0) &&
+ (uuid_compare (local->loc.parent->gfid,
+ dentry->par) == 0)) {
+ old_dentry = dentry;
+ break;
+ }
+ }
+ if (old_dentry)
+ __quota_dentry_free (old_dentry);
+ }
+ UNLOCK (&ctx->lock);
out:
QUOTA_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent,
@@ -1269,9 +1631,14 @@ int32_t
quota_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
dict_t *xdata)
{
- int32_t ret = 0;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto err;
@@ -1279,6 +1646,10 @@ quota_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
frame->local = local;
+ if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
+ local->skip_check = _gf_true;
+ }
+
ret = loc_copy (&local->loc, loc);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "loc_copy failed");
@@ -1296,6 +1667,11 @@ err:
}
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+ return 0;
}
@@ -1317,16 +1693,16 @@ quota_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = (quota_local_t *) frame->local;
- quota_update_size (this, local->loc.parent, NULL, NULL,
- (buf->ia_blocks * 512));
+ if (local->skip_check)
+ goto out;
- ret = quota_inode_ctx_get (inode, -1, this, NULL, NULL, &ctx, 0);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 0);
if ((ret == -1) || (ctx == NULL)) {
- gf_log (this->name, GF_LOG_WARNING, "cannot find quota "
- "context in %s (gfid:%s)", local->loc.path,
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (inode->gfid));
- op_ret = -1;
- op_errno = EINVAL;
goto out;
}
@@ -1380,6 +1756,9 @@ quota_link_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+
+ priv = this->private;
local = frame->local;
if (local == NULL) {
@@ -1393,8 +1772,9 @@ quota_link_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
goto unwind;
}
- STACK_WIND (frame, quota_link_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ STACK_WIND (frame, quota_link_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, oldloc,
+ newloc, xdata);
return 0;
unwind:
@@ -1408,10 +1788,28 @@ int32_t
quota_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t ret = -1, op_errno = ENOMEM;
quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
call_stub_t *stub = NULL;
- quota_inode_ctx_t *ctx = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
+ goto off;
+ }
+
+ quota_inode_ctx_get (oldloc->inode, this, &ctx, 0);
+ if (ctx == NULL) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
+ uuid_utoa (oldloc->inode->gfid));
+ }
local = quota_local_new ();
if (local == NULL) {
@@ -1420,6 +1818,7 @@ quota_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
frame->local = (void *) local;
+
ret = loc_copy (&local->loc, newloc);
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING, "loc_copy failed");
@@ -1431,47 +1830,26 @@ quota_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
-
- ret = quota_inode_ctx_get (oldloc->inode, -1, this, NULL, NULL, &ctx,
- 0);
- if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
- oldloc->inode ? uuid_utoa (oldloc->inode->gfid) : "0");
- op_errno = EINVAL;
- goto err;
- }
-
- local->delta = ctx->buf.ia_blocks * 512;
-
- quota_check_limit (frame, newloc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
-
- local->link_count = 0;
+ local->link_count = 1;
+ local->stub = stub;
+ local->delta = (ctx != NULL) ? ctx->buf.ia_blocks * 512 : 0;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
+ quota_check_limit (frame, newloc->parent, this, NULL, NULL);
+ return 0;
- ret = 0;
err:
- if (ret < 0) {
- QUOTA_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL,
- NULL, NULL, NULL);
- }
+ QUOTA_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc,
+ newloc, xdata);
return 0;
}
@@ -1484,11 +1862,11 @@ quota_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata)
{
int32_t ret = -1;
+ int64_t size = 0;
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
quota_dentry_t *old_dentry = NULL, *dentry = NULL;
char new_dentry_found = 0;
- int64_t size = 0;
if (op_ret < 0) {
goto out;
@@ -1502,29 +1880,22 @@ quota_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (IA_ISREG (local->oldloc.inode->ia_type)
- || IA_ISLNK (local->oldloc.inode->ia_type)) {
+ if (QUOTA_REG_OR_LNK_FILE (local->oldloc.inode->ia_type)) {
size = buf->ia_blocks * 512;
}
- if (local->oldloc.parent != local->newloc.parent) {
- quota_update_size (this, local->oldloc.parent, NULL, NULL, (-size));
- quota_update_size (this, local->newloc.parent, NULL, NULL, size);
- }
-
- if (!(IA_ISREG (local->oldloc.inode->ia_type)
- || IA_ISLNK (local->oldloc.inode->ia_type))) {
+ if (!QUOTA_REG_OR_LNK_FILE (local->oldloc.inode->ia_type)) {
goto out;
}
- ret = quota_inode_ctx_get (local->oldloc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ ret = quota_inode_ctx_get (local->oldloc.inode, this, &ctx, 0);
if ((ret == -1) || (ctx == NULL)) {
- gf_log (this->name, GF_LOG_WARNING, "quota context not"
- "set in inode(gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->oldloc.inode->gfid));
- op_ret = -1;
- op_errno = EINVAL;
+
goto out;
}
@@ -1570,7 +1941,8 @@ quota_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (dentry == NULL) {
gf_log (this->name, GF_LOG_WARNING,
"cannot create a new dentry (name:%s) "
- "for inode(gfid:%s)", local->newloc.name,
+ "for inode(gfid:%s)",
+ local->newloc.name,
uuid_utoa (local->newloc.inode->gfid));
op_ret = -1;
op_errno = ENOMEM;
@@ -1597,6 +1969,9 @@ quota_rename_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+
+ priv = this->private;
local = frame->local;
if (local == NULL) {
@@ -1610,8 +1985,10 @@ quota_rename_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
goto unwind;
}
- STACK_WIND (frame, quota_rename_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ STACK_WIND (frame, quota_rename_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, oldloc,
+ newloc, xdata);
+
return 0;
unwind:
@@ -1621,14 +1998,59 @@ unwind:
}
+/* Callback handed to quota_validate() by quota_rename() for directories: reads
+ * the size stored in xdata under QUOTA_SIZE_KEY into local->delta and then runs
+ * quota_check_limit() on the new parent. Always returns 0; failures are passed
+ * to quota_handle_validate_error() together with op_errno. */
+static int32_t
+quota_rename_get_size_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                           int32_t op_ret, int32_t op_errno, inode_t *inode,
+                           struct iatt *buf, dict_t *xdata,
+                           struct iatt *postparent)
+{
+        quota_local_t *local = NULL;
+        int32_t        ret   = 0;
+        int64_t       *size  = NULL;
+
+        GF_ASSERT (frame);
+        /* Fetch local before any "goto out" so the error path never sees NULL. */
+        local = frame->local;
+        GF_ASSERT (local);
+        local->link_count = 1;
+        GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, out, op_errno, EINVAL);
+        /* A failed call may carry no xdata; report its own op_errno instead. */
+        if (op_ret < 0)
+                goto out;
+        GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, out, op_errno,
+                                        EINVAL);
+        ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size);
+        if ((ret < 0) || (size == NULL)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "size key not present in dict");
+                op_errno = EINVAL;
+                goto out;
+        }
+        local->delta = ntoh64 (*size);
+        quota_check_limit (frame, local->newloc.parent, this, NULL, NULL);
+        return 0;
+
+out:
+        quota_handle_validate_error (local, -1, op_errno);
+        return 0;
+}
+
int32_t
quota_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
loc_t *newloc, dict_t *xdata)
{
- int32_t ret = -1, op_errno = ENOMEM;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
- quota_inode_ctx_t *ctx = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1, op_errno = ENOMEM;
+ quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -1655,51 +2077,63 @@ quota_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
+ LOCK (&local->lock);
+ {
+ local->link_count = 1;
+ local->stub = stub;
+ }
+ UNLOCK (&local->lock);
- if (IA_ISREG (oldloc->inode->ia_type)
- || IA_ISLNK (oldloc->inode->ia_type)) {
- ret = quota_inode_ctx_get (oldloc->inode, -1, this, NULL, NULL,
- &ctx, 0);
+ if (QUOTA_REG_OR_LNK_FILE (oldloc->inode->ia_type)) {
+ ret = quota_inode_ctx_get (oldloc->inode, this, &ctx, 0);
if (ctx == NULL) {
gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ "quota context not set in inode (gfid:%s), "
+ "considering file size as zero while enforcing "
+ "quota on new ancestry",
oldloc->inode ? uuid_utoa (oldloc->inode->gfid)
: "0");
- op_errno = EINVAL;
- goto err;
- }
- local->delta = ctx->buf.ia_blocks * 512;
- } else {
- local->delta = 0;
- }
+ local->delta = 0;
- quota_check_limit (frame, newloc->parent, this, NULL, NULL);
+ } else {
- stub = NULL;
+ /* FIXME: We need to account for the size occupied by this
+ * inode on the target directory. To avoid double
+ * accounting, we need to modify the enforcer to perform
+ * quota_check_limit only up to the least common ancestor
+ * directory inode. */
- LOCK (&local->lock);
- {
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
+ /* FIXME: The following code assumes that regular files and
+ * linkfiles are present, in their entirety, in a single
+ * brick. This assumption is invalid in the case of
+ * stripe. */
+
+ local->delta = ctx->buf.ia_blocks * 512;
}
- local->link_count = 0;
- }
- UNLOCK (&local->lock);
+ } else if (IA_ISDIR (oldloc->inode->ia_type)) {
+ ret = quota_validate (frame, oldloc->inode, this,
+ quota_rename_get_size_cbk);
+ if (ret){
+ op_errno = -ret;
+ goto err;
+ }
- if (stub != NULL) {
- call_resume (stub);
+ return 0;
}
- ret = 0;
+ quota_check_limit (frame, newloc->parent, this, NULL, NULL);
+ return 0;
+
err:
- if (ret == -1) {
- QUOTA_STACK_UNWIND (rename, frame, -1, op_errno, NULL,
- NULL, NULL, NULL, NULL, NULL);
- }
+ QUOTA_STACK_UNWIND (rename, frame, -1, op_errno, NULL,
+ NULL, NULL, NULL, NULL, NULL);
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename, oldloc,
+ newloc, xdata);
return 0;
}
@@ -1711,7 +2145,6 @@ quota_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int64_t size = 0;
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
quota_dentry_t *dentry = NULL;
@@ -1721,16 +2154,15 @@ quota_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
local = frame->local;
- size = buf->ia_blocks * 512;
-
- quota_update_size (this, local->loc.parent, NULL, NULL, size);
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 1);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 1);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
+
goto out;
}
@@ -1765,6 +2197,7 @@ quota_symlink_helper (call_frame_t *frame, xlator_t *this, const char *linkpath,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
local = frame->local;
if (local == NULL) {
@@ -1772,14 +2205,16 @@ quota_symlink_helper (call_frame_t *frame, xlator_t *this, const char *linkpath,
goto unwind;
}
+ priv = this->private;
+
if (local->op_ret == -1) {
op_errno = local->op_errno;
goto unwind;
}
- STACK_WIND (frame, quota_symlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask,
- xdata);
+ STACK_WIND (frame, quota_symlink_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+ linkpath, loc, umask, xdata);
return 0;
unwind:
@@ -1793,10 +2228,15 @@ int
quota_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
loc_t *loc, mode_t umask, dict_t *xdata)
{
- int32_t ret = -1;
- int32_t op_errno = ENOMEM;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ int32_t op_errno = ENOMEM;
+ quota_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -1811,36 +2251,21 @@ quota_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
goto err;
}
- local->link_count = 1;
-
stub = fop_symlink_stub (frame, quota_symlink_helper, linkpath, loc,
umask, xdata);
if (stub == NULL) {
goto err;
}
- local->stub = stub;
- local->delta = strlen (linkpath);
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
-
- local->link_count = 0;
+ local->stub = stub;
+ local->delta = strlen (linkpath);
+ local->link_count = 1;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
err:
@@ -1848,6 +2273,12 @@ err:
NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->symlink,
+ linkpath, loc, umask, xdata);
+ return 0;
}
@@ -1857,7 +2288,6 @@ quota_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
quota_local_t *local = NULL;
- int64_t delta = 0;
quota_inode_ctx_t *ctx = NULL;
if (op_ret < 0) {
@@ -1870,15 +2300,12 @@ quota_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512;
-
- quota_update_size (this, local->loc.inode, NULL, NULL, delta);
-
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -1900,9 +2327,14 @@ int32_t
quota_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t ret = -1;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto err;
@@ -1920,10 +2352,15 @@ quota_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
return 0;
+
err:
QUOTA_STACK_UNWIND (truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
}
@@ -1933,7 +2370,6 @@ quota_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
quota_local_t *local = NULL;
- int64_t delta = 0;
quota_inode_ctx_t *ctx = NULL;
if (op_ret < 0) {
@@ -1946,15 +2382,12 @@ quota_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512;
-
- quota_update_size (this, local->loc.inode, NULL, NULL, delta);
-
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -1976,8 +2409,13 @@ int32_t
quota_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL)
goto err;
@@ -1986,14 +2424,21 @@ quota_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_ftruncate_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ STACK_WIND (frame, quota_ftruncate_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, fd,
+ offset, xdata);
return 0;
err:
QUOTA_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd,
+ offset, xdata);
+ return 0;
}
@@ -2006,14 +2451,23 @@ quota_send_dir_limit_to_cli (call_frame_t *frame, xlator_t *this,
dict_t *dict = NULL;
quota_inode_ctx_t *ctx = NULL;
uint64_t value = 0;
+ quota_priv_t *priv = NULL;
+
+ priv = this->private;
+ if (!priv->is_quota_on) {
+ snprintf (dir_limit, 1024, "Quota is disabled please turn on");
+ goto dict_set;
+ }
ret = inode_ctx_get (inode, this, &value);
if (ret < 0)
goto out;
ctx = (quota_inode_ctx_t *)(unsigned long)value;
- snprintf (dir_limit, 1024, "%"PRId64",%"PRId64, ctx->size, ctx->limit);
+ snprintf (dir_limit, 1024, "%"PRId64",%"PRId64, ctx->size,
+ ctx->hard_lim);
+dict_set:
dict = dict_new ();
if (dict == NULL) {
ret = -1;
@@ -2024,7 +2478,7 @@ quota_send_dir_limit_to_cli (call_frame_t *frame, xlator_t *this,
if (ret < 0)
goto out;
- gf_log (this->name, GF_LOG_INFO, "str = %s", dir_limit);
+ gf_log (this->name, GF_LOG_DEBUG, "str = %s", dir_limit);
QUOTA_STACK_UNWIND (getxattr, frame, 0, 0, dict, NULL);
@@ -2076,7 +2530,8 @@ quota_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
int32_t
quota_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
{
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
@@ -2091,12 +2546,17 @@ quota_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (buf->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2116,9 +2576,14 @@ out:
int32_t
quota_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
int32_t ret = -1;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2132,12 +2597,19 @@ quota_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
}
STACK_WIND (frame, quota_stat_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat, loc, xdata);
+ FIRST_CHILD(this)->fops->stat, loc,
+ xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (stat, frame, -1, ENOMEM, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc,
+ xdata);
+ return 0;
}
@@ -2159,12 +2631,17 @@ quota_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (buf->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2184,8 +2661,13 @@ out:
int32_t
quota_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2195,13 +2677,20 @@ quota_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_fstat_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ STACK_WIND (frame, quota_fstat_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, fd,
+ xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (fstat, frame, -1, ENOMEM, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd,
+ xdata);
+ return 0;
}
@@ -2223,11 +2712,12 @@ quota_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -2239,7 +2729,8 @@ quota_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
UNLOCK (&ctx->lock);
out:
- QUOTA_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, buf, xdata);
+ QUOTA_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, buf,
+ xdata);
return 0;
}
@@ -2248,9 +2739,14 @@ int32_t
quota_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
int32_t ret = -1;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2264,13 +2760,20 @@ quota_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
goto unwind;
}
- STACK_WIND (frame, quota_readlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+ STACK_WIND (frame, quota_readlink_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink, loc,
+ size, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (readlink, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readlink, loc,
+ size, xdata);
+ return 0;
}
@@ -2293,11 +2796,12 @@ quota_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -2319,8 +2823,13 @@ int32_t
quota_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, uint32_t flags, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2330,13 +2839,20 @@ quota_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_readv_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
- xdata);
+ STACK_WIND (frame, quota_readv_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, fd,
+ size, offset, flags, xdata);
return 0;
unwind:
- QUOTA_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, -1, NULL, NULL, NULL);
+ QUOTA_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, -1, NULL, NULL,
+ NULL);
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd,
+ size, offset, flags, xdata);
return 0;
}
@@ -2359,11 +2875,12 @@ quota_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -2385,8 +2902,13 @@ int32_t
quota_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2397,13 +2919,19 @@ quota_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
frame->local = local;
STACK_WIND (frame, quota_fsync_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+ FIRST_CHILD(this)->fops->fsync, fd,
+ flags, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd,
+ flags, xdata);
+ return 0;
}
@@ -2425,12 +2953,16 @@ quota_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (statpost->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is "
+ "NULL on inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2452,9 +2984,14 @@ int32_t
quota_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
int32_t ret = -1;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2469,12 +3006,19 @@ quota_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
}
STACK_WIND (frame, quota_setattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, xdata);
+ FIRST_CHILD (this)->fops->setattr, loc,
+ stbuf, valid, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr, loc,
+ stbuf, valid, xdata);
+ return 0;
}
@@ -2496,12 +3040,16 @@ quota_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (statpost->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is "
+ "NULL on inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2522,8 +3070,13 @@ int32_t
quota_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2533,13 +3086,20 @@ quota_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_fsetattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid, xdata);
+ STACK_WIND (frame, quota_fsetattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, fd,
+ stbuf, valid, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetattr, fd,
+ stbuf, valid, xdata);
+ return 0;
}
@@ -2559,7 +3119,7 @@ quota_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unwind;
}
- ret = quota_inode_ctx_get (inode, -1, this, NULL, buf, &ctx, 1);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 1);
if ((ret == -1) || (ctx == NULL)) {
gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
"context in inode (gfid:%s)", uuid_utoa (inode->gfid));
@@ -2600,6 +3160,7 @@ quota_mknod_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
local = frame->local;
if (local == NULL) {
@@ -2607,14 +3168,16 @@ quota_mknod_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto unwind;
}
+ priv = this->private;
+
if (local->op_ret == -1) {
op_errno = local->op_errno;
goto unwind;
}
- STACK_WIND (frame, quota_mknod_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask,
- xdata);
+ STACK_WIND (frame, quota_mknod_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, loc,
+ mode, rdev, umask, xdata);
return 0;
@@ -2629,9 +3192,14 @@ int
quota_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
dev_t rdev, mode_t umask, dict_t *xdata)
{
- int32_t ret = -1;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ quota_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -2652,33 +3220,26 @@ quota_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
- local->delta = 0;
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- local->link_count = 0;
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
+ local->link_count = 1;
+ local->stub = stub;
+ local->delta = 0;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
+
err:
QUOTA_STACK_UNWIND (mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
NULL);
+ return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mknod, loc,
+ mode, rdev, umask, xdata);
return 0;
}
@@ -2686,6 +3247,28 @@ int
quota_setxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno, dict_t *xdata)
{
+ quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ ret = quota_inode_ctx_get (local->loc.inode, this, &ctx, 1);
+ if ((ret < 0) || (ctx == NULL)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->hard_lim = local->limit.hard_lim;
+ ctx->soft_lim = local->limit.soft_lim_percent;
+ }
+ UNLOCK (&ctx->lock);
+
+out:
QUOTA_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2694,30 +3277,83 @@ int
quota_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
{
- int op_errno = EINVAL;
- int op_ret = -1;
+ quota_priv_t *priv = NULL;
+ int op_errno = EINVAL;
+ int op_ret = -1;
+ int64_t hard_lim = -1, soft_lim = -1;
+ quota_local_t *local = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*", dict,
- op_errno, err);
+ if (frame->root->pid >= 0) {
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*", dict,
+ op_errno, err);
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.pgfid*", dict, op_errno,
+ err);
+ }
+
+ quota_get_limits (this, dict, &hard_lim, &soft_lim);
+
+ if (hard_lim > 0) {
+ local = quota_local_new ();
+ if (local == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ frame->local = local;
+ loc_copy (&local->loc, loc);
+
+ local->limit.hard_lim = hard_lim;
+ local->limit.soft_lim_percent = soft_lim;
+ }
STACK_WIND (frame, quota_setxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr,
- loc, dict, flags, xdata);
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, loc,
+ dict, flags, xdata);
return 0;
err:
QUOTA_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr, loc,
+ dict, flags, xdata);
+ return 0;
}
int
quota_fsetxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno, dict_t *xdata)
{
+ quota_inode_ctx_t *ctx = NULL;
+ quota_local_t *local = NULL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ op_ret = quota_inode_ctx_get (local->loc.inode, this, &ctx, 1);
+ if ((op_ret < 0) || (ctx == NULL)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->hard_lim = local->limit.hard_lim;
+ ctx->soft_lim = local->limit.soft_lim_percent;
+ }
+ UNLOCK (&ctx->lock);
+
+out:
QUOTA_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2726,24 +3362,55 @@ int
quota_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
dict_t *dict, int flags, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ quota_local_t *local = NULL;
+ int64_t hard_lim = -1, soft_lim = -1;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*", dict,
- op_errno, err);
+ if (0 <= frame->root->pid) {
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*",
+ dict, op_errno, err);
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.pgfid*", dict,
+ op_errno, err);
+ }
+
+ quota_get_limits (this, dict, &hard_lim, &soft_lim);
+
+ if (hard_lim > 0) {
+ local = quota_local_new ();
+ if (local == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->loc.inode = inode_ref (fd->inode);
+
+ local->limit.hard_lim = hard_lim;
+ local->limit.soft_lim_percent = soft_lim;
+ }
STACK_WIND (frame, quota_fsetxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetxattr,
- fd, dict, flags, xdata);
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr, fd,
+ dict, flags, xdata);
return 0;
- err:
+err:
QUOTA_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr, fd,
+ dict, flags, xdata);
+ return 0;
}
@@ -2759,24 +3426,42 @@ int
quota_removexattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t op_errno = EINVAL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
VALIDATE_OR_GOTO (this, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.quota*",
- name, op_errno, err);
+ /* All quota xattrs can be cleaned up by doing a setxattr on a special key.
+ * Hence it's OK that we don't allow removexattr on quota keys here.
+ */
+ if (frame->root->pid >= 0) {
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.quota*",
+ name, op_errno, err);
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.pgfid*", name,
+ op_errno, err);
+ }
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (loc, err);
STACK_WIND (frame, quota_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr,
loc, name, xdata);
return 0;
+
err:
QUOTA_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
}
@@ -2792,24 +3477,37 @@ int
quota_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.quota*",
- name, op_errno, err);
-
+ if (frame->root->pid >= 0) {
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.quota*",
+ name, op_errno, err);
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.pgfid*", name,
+ op_errno, err);
+ }
STACK_WIND (frame, quota_fremovexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fremovexattr,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr,
fd, name, xdata);
return 0;
- err:
+err:
QUOTA_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
}
@@ -2818,16 +3516,16 @@ quota_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct statvfs *buf,
dict_t *xdata)
{
- inode_t *root_inode = NULL;
- quota_priv_t *priv = NULL;
- uint64_t value = 0;
- quota_inode_ctx_t *ctx = NULL;
- limits_t *limit_node = NULL;
- int64_t usage = -1;
- int64_t avail = -1;
- int64_t blocks = 0;
+ inode_t *inode = NULL;
+ uint64_t value = 0;
+ int64_t usage = -1;
+ int64_t avail = -1;
+ int64_t blocks = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ int ret = 0;
+ gf_boolean_t dict_created = _gf_false;
- root_inode = cookie;
+ inode = cookie;
/* This fop will fail mostly in case of client disconnect's,
* which is already logged. Hence, not logging here */
@@ -2838,137 +3536,540 @@ quota_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* cookie, and it would only do so if the value was non-NULL. This
* check is therefore just routine defensive coding.
*/
- if (!root_inode) {
+ if (!inode) {
gf_log(this->name,GF_LOG_WARNING,
"null inode, cannot adjust for quota");
goto unwind;
}
- if (!root_inode->table || (root_inode != root_inode->table->root)) {
- gf_log(this->name,GF_LOG_WARNING,
- "non-root inode, cannot adjust for quota");
- goto unwind;
- }
- inode_ctx_get (root_inode, this, &value);
+ inode_ctx_get (inode, this, &value);
if (!value) {
goto unwind;
}
- ctx = (quota_inode_ctx_t *)(unsigned long)value;
- usage = (ctx->size) / buf->f_bsize;
- priv = this->private;
- list_for_each_entry (limit_node, &priv->limit_head, limit_list) {
- /* Notice that this only works for volume-level quota. */
- if (strcmp (limit_node->path, "/") == 0) {
- blocks = limit_node->value / buf->f_bsize;
- if (usage > blocks) {
- break;
- }
+ /* if limit is set on this inode, report statfs based on this inode
+ * else report based on root.
+ */
+ ctx = (quota_inode_ctx_t *)(unsigned long)value;
+ if (ctx->hard_lim <= 0) {
+ inode_ctx_get (inode->table->root, this, &value);
+ ctx = (quota_inode_ctx_t *)(unsigned long) value;
+ if (!ctx || ctx->hard_lim < 0)
+ goto unwind;
+ }
+
+ { /* statfs is adjusted in this code block */
+ usage = (ctx->size) / buf->f_bsize;
+
+ blocks = ctx->hard_lim / buf->f_bsize;
+ buf->f_blocks = blocks;
+
+ avail = buf->f_blocks - usage;
+ avail = max (avail, 0);
+
+ buf->f_bfree = avail;
+ /*
+ * We have to assume that the total assigned quota
+ * won't cause us to dip into the reserved space,
+ * because dealing with the overcommitted cases is
+ * just too hairy (especially when different bricks
+ * might be using different reserved percentages and
+ * such).
+ */
+ buf->f_bavail = buf->f_bfree;
+ }
- buf->f_blocks = blocks;
- avail = buf->f_blocks - usage;
- if (buf->f_bfree > avail) {
- buf->f_bfree = avail;
- }
- /*
- * We have to assume that the total assigned quota
- * won't cause us to dip into the reserved space,
- * because dealing with the overcommitted cases is
- * just too hairy (especially when different bricks
- * might be using different reserved percentages and
- * such).
- */
- buf->f_bavail = buf->f_bfree;
- break;
- }
+ if (!xdata) {
+ xdata = dict_new ();
+ if (!xdata)
+ goto unwind;
+ dict_created = _gf_true;
}
+ ret = dict_set_int8 (xdata, "quota-deem-statfs", 1);
+ if (-1 == ret)
+ gf_log (this->name, GF_LOG_ERROR, "Dict set failed, "
+ "deem-statfs option may have no effect");
+
unwind:
- if (root_inode) {
- inode_unref(root_inode);
- }
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
+ QUOTA_STACK_UNWIND (statfs, frame, op_ret, op_errno, buf, xdata);
+
+ if (dict_created)
+ dict_unref (xdata);
return 0;
}
int32_t
+quota_statfs_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ quota_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO ("quota", (local = frame->local), err);
+
+ if (-1 == local->op_ret) {
+ op_errno = local->op_errno;
+ goto err;
+ }
+
+ STACK_WIND_COOKIE (frame, quota_statfs_cbk, loc->inode,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ return 0;
+err:
+ QUOTA_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+int32_t
+quota_statfs_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ quota_local_t *local = NULL;
+ int32_t ret = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ int64_t *size = 0;
+ uint64_t value = 0;
+
+ local = frame->local;
+
+ if (op_ret < 0)
+ goto resume;
+
+ GF_ASSERT (local);
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, resume, op_errno,
+ EINVAL);
+ GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, resume, op_errno,
+ EINVAL);
+
+ ret = inode_ctx_get (local->validate_loc.inode, this, &value);
+
+ ctx = (quota_inode_ctx_t *)(unsigned long)value;
+ if ((ret == -1) || (ctx == NULL)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota context is not present in inode (gfid:%s)",
+ uuid_utoa (local->validate_loc.inode->gfid));
+ op_errno = EINVAL;
+ goto resume;
+ }
+
+ ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "size key not present in dict");
+ op_errno = EINVAL;
+ goto resume;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->size = ntoh64 (*size);
+ gettimeofday (&ctx->tv, NULL);
+ }
+ UNLOCK (&ctx->lock);
+
+resume:
+ quota_link_count_decrement (local);
+ return 0;
+}
+
+int32_t
quota_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- inode_t *root_inode = NULL;
- quota_priv_t *priv = NULL;
+ quota_local_t *local = NULL;
+ int op_errno = 0;
+ call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int ret = 0;
priv = this->private;
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
if (priv->consider_statfs && loc->inode) {
- root_inode = loc->inode->table->root;
- inode_ref(root_inode);
- STACK_WIND_COOKIE (frame, quota_statfs_cbk, root_inode,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->statfs, loc, xdata);
- }
- else {
- /*
- * We have to make sure that we never get to quota_statfs_cbk
- * with a cookie that points to something other than an inode,
- * which is exactly what would happen with STACK_UNWIND using
- * that as a callback. Therefore, use default_statfs_cbk in
- * this case instead.
- *
- * Also if the option deem-statfs is not set to "on" don't
- * bother calculating quota limit on / in statfs_cbk.
- */
- if (priv->consider_statfs)
- gf_log(this->name,GF_LOG_WARNING,
- "missing inode, cannot adjust for quota");
- STACK_WIND (frame, default_statfs_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ local = quota_local_new ();
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+
+ stub = fop_statfs_stub (frame, quota_statfs_helper, loc, xdata);
+ if (!stub) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ LOCK (&local->lock);
+ {
+ local->inode = inode_ref (loc->inode);
+ local->link_count = 1;
+ local->stub = stub;
+ }
+ UNLOCK (&local->lock);
+
+ ret = quota_validate (frame, local->inode, this,
+ quota_statfs_validate_cbk);
+ if (0 > ret) {
+ quota_handle_validate_error (local, -1, -ret);
+ }
+
+ return 0;
}
+
+ /*
+ * We have to make sure that we never get to quota_statfs_cbk
+ * with a cookie that points to something other than an inode,
+ * which is exactly what would happen if we wound with that as
+ * the callback. Therefore, use STACK_WIND_TAIL (which takes no
+ * callback from this translator) in this case instead.
+ *
+ * Also if the option deem-statfs is not set to "on" don't
+ * bother calculating quota limit on / in statfs_cbk.
+ */
+ if (priv->consider_statfs)
+ gf_log (this->name,GF_LOG_WARNING,
+ "missing inode, cannot adjust for quota");
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
return 0;
-}
+err:
+ STACK_UNWIND_STRICT (statfs, frame, -1, op_errno, NULL, NULL);
+
+ if (local)
+ quota_local_cleanup (this, local);
+ return 0;
+}
int
quota_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- gf_dirent_t *entry = NULL;
+ gf_dirent_t *entry = NULL;
+ quota_local_t *local = NULL;
+ loc_t loc = {0, };
if (op_ret <= 0)
goto unwind;
+ local = frame->local;
+
list_for_each_entry (entry, &entries->list, list) {
- /* TODO: fill things */
+ if ((strcmp (entry->d_name, ".") == 0)
+ || (strcmp (entry->d_name, "..") == 0))
+ continue;
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+ loc.inode = inode_ref (entry->inode);
+ loc.parent = inode_ref (local->loc.inode);
+ uuid_copy (loc.pargfid, loc.parent->gfid);
+ loc.name = entry->d_name;
+
+ quota_fill_inodectx (this, entry->inode, entry->dict,
+ &loc, &entry->d_stat, &op_errno);
+
+ loc_wipe (&loc);
}
unwind:
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
+ QUOTA_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata);
return 0;
}
+
int
quota_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
- int ret = 0;
+ quota_priv_t *priv = NULL;
+ int ret = 0;
+ gf_boolean_t new_dict = _gf_false;
+ quota_local_t *local = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ local = quota_local_new ();
+
+ if (local == NULL) {
+ goto err;
+ }
+
+ frame->local = local;
+
+ local->loc.inode = inode_ref (fd->inode);
+
+ if (dict == NULL) {
+ dict = dict_new ();
+ new_dict = _gf_true;
+ }
if (dict) {
- ret = dict_set_uint64 (dict, QUOTA_SIZE_KEY, 0);
+ ret = dict_set_int8 (dict, QUOTA_LIMIT_KEY, 1);
if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict set of key for hard-limit failed");
goto err;
}
}
STACK_WIND (frame, quota_readdirp_cbk,
- FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
- fd, size, offset, dict);
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, fd,
+ size, offset, dict);
+
+ if (new_dict) {
+ dict_unref (dict);
+ }
+
return 0;
err:
STACK_UNWIND_STRICT (readdirp, frame, -1, EINVAL, NULL, NULL);
+
+ if (new_dict) {
+ dict_unref (dict);
+ }
+
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd,
+ size, offset, dict);
+ return 0;
+}
+
+int32_t
+quota_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_int = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_local_t *local = NULL;
+
+ local = frame->local;
+
+ if ((op_ret < 0) || (local == NULL)) {
+ goto out;
+ }
+
+ ret = inode_ctx_get (local->loc.inode, this, &ctx_int);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to get the context", local->loc.path);
+ goto out;
+ }
+
+ ctx = (quota_inode_ctx_t *)(unsigned long) ctx_int;
+
+ if (ctx == NULL) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota context not set in %s (gfid:%s)",
+ local->loc.path, uuid_utoa (local->loc.inode->gfid));
+ goto out;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->buf = *postbuf;
+ }
+ UNLOCK (&ctx->lock);
+
+out:
+ QUOTA_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+
+int32_t
+quota_fallocate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata)
+{
+ quota_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+
+ local = frame->local;
+ if (local == NULL) {
+ gf_log (this->name, GF_LOG_WARNING, "local is NULL");
+ goto unwind;
+ }
+
+ priv = this->private;
+
+ if (local->op_ret == -1) {
+ op_errno = local->op_errno;
+ goto unwind;
+ }
+
+ STACK_WIND (frame, quota_fallocate_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+ return 0;
+
+unwind:
+ QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+quota_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ int32_t ret = -1, op_errno = EINVAL;
+ int32_t parents = 0;
+ quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_priv_t *priv = NULL;
+ quota_dentry_t *dentry = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, priv, unwind);
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO ("quota", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = quota_local_new ();
+ if (local == NULL) {
+ goto unwind;
+ }
+
+ frame->local = local;
+ local->loc.inode = inode_ref (fd->inode);
+
+ ret = quota_inode_ctx_get (fd->inode, this, &ctx, 0);
+ if (ctx == NULL) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is "
+ "NULL on inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
+ stub = fop_fallocate_stub(frame, quota_fallocate_helper, fd, mode,
+ offset, len, xdata);
+ if (stub == NULL) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, priv, unwind);
+
+ if (ctx != NULL) {
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ parents++;
+ }
+ }
+ UNLOCK (&ctx->lock);
+ }
+
+ /*
+ * Note that by using len as the delta we're assuming the range from
+ * offset to offset+len has not already been allocated. This can result
+ * in ENOSPC errors attempting to allocate an already allocated range.
+ */
+ local->delta = len;
+ local->stub = stub;
+ local->link_count = parents;
+
+ if (parents == 0) {
+ local->link_count = 1;
+ quota_check_limit (frame, fd->inode, this, NULL, NULL);
+ } else {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ quota_check_limit (frame, fd->inode, this, dentry->name,
+ dentry->par);
+ }
+ }
+
+ return 0;
+
+unwind:
+ QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset,
+ len, xdata);
return 0;
}
+/* Logs if
+* i. Usage crossed soft limit
+* ii. Usage above soft limit and alert-time elapsed
+*/
+void
+quota_log_usage (xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode,
+ int64_t delta)
+{
+ struct timeval cur_time = {0,};
+ char *usage_str = NULL;
+ char size_str[32] = {0};
+ char *path = NULL;
+ int64_t cur_size = 0;
+ quota_priv_t *priv = NULL;
+ gf_boolean_t dyn_mem = _gf_true;
+
+ priv = this->private;
+ if ((ctx->soft_lim <= 0) || (timerisset (&ctx->prev_log) &&
+ !quota_timeout (&ctx->prev_log,
+ priv->log_timeout))) {
+ return;
+ }
+
+
+ cur_size = ctx->size + delta;
+ usage_str = gf_uint64_2human_readable (cur_size);
+ if (!usage_str) {
+ snprintf (size_str, sizeof (size_str), "%"PRId64, cur_size);
+ usage_str = (char*) size_str;
+ dyn_mem = _gf_false;
+ }
+ inode_path (inode, NULL, &path);
+ if (!path)
+ path = uuid_utoa (inode->gfid);
+
+ gettimeofday (&cur_time, NULL);
+ /* Usage crossed/reached soft limit */
+ if (DID_REACH_LIMIT (ctx->soft_lim, ctx->size, cur_size)) {
+
+ gf_log (this->name, GF_LOG_ALERT, "Usage crossed "
+ "soft limit: %s used by %s", usage_str, path);
+ ctx->prev_log = cur_time;
+ }
+ /* Usage is above soft limit */
+ else if (cur_size > ctx->soft_lim){
+ gf_log (this->name, GF_LOG_ALERT, "Usage is above "
+ "soft limit: %s used by %s", usage_str, path);
+ ctx->prev_log = cur_time;
+ }
+
+ if (dyn_mem)
+ GF_FREE (usage_str);
+}
int32_t
mem_acct_init (xlator_t *this)
@@ -3021,83 +4122,6 @@ quota_forget (xlator_t *this, inode_t *inode)
return 0;
}
-
-int
-quota_parse_limits (quota_priv_t *priv, xlator_t *this, dict_t *xl_options,
- struct list_head *old_list)
-{
- int32_t ret = -1;
- char *str = NULL;
- char *str_val = NULL;
- char *path = NULL, *saveptr = NULL;
- uint64_t value = 0;
- limits_t *quota_lim = NULL, *old = NULL;
- char *last_colon= NULL;
-
- ret = dict_get_str (xl_options, "limit-set", &str);
-
- if (str) {
- path = strtok_r (str, ",", &saveptr);
-
- while (path) {
- last_colon = strrchr (path, ':');
- *last_colon = '\0';
- str_val = last_colon + 1;
-
- ret = gf_string2bytesize (str_val, &value);
- if (ret != 0)
- goto err;
-
- QUOTA_ALLOC_OR_GOTO (quota_lim, limits_t, err);
-
- quota_lim->path = path;
-
- quota_lim->value = value;
-
- gf_log (this->name, GF_LOG_INFO, "%s:%"PRId64,
- quota_lim->path, quota_lim->value);
-
- if (old_list != NULL) {
- list_for_each_entry (old, old_list,
- limit_list) {
- if (strcmp (old->path, quota_lim->path)
- == 0) {
- uuid_copy (quota_lim->gfid,
- old->gfid);
- break;
- }
- }
- }
-
- LOCK (&priv->lock);
- {
- list_add_tail (&quota_lim->limit_list,
- &priv->limit_head);
- }
- UNLOCK (&priv->lock);
-
- path = strtok_r (NULL, ",", &saveptr);
- }
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "no \"limit-set\" option provided");
- }
-
- LOCK (&priv->lock);
- {
- list_for_each_entry (quota_lim, &priv->limit_head, limit_list) {
- gf_log (this->name, GF_LOG_INFO, "%s:%"PRId64,
- quota_lim->path, quota_lim->value);
- }
- }
- UNLOCK (&priv->lock);
-
- ret = 0;
-err:
- return ret;
-}
-
-
int32_t
init (xlator_t *this)
{
@@ -3119,20 +4143,18 @@ init (xlator_t *this)
QUOTA_ALLOC_OR_GOTO (priv, quota_priv_t, err);
- INIT_LIST_HEAD (&priv->limit_head);
-
LOCK_INIT (&priv->lock);
this->private = priv;
- ret = quota_parse_limits (priv, this, this->options, NULL);
-
- if (ret) {
- goto err;
- }
-
- GF_OPTION_INIT ("timeout", priv->timeout, int64, err);
GF_OPTION_INIT ("deem-statfs", priv->consider_statfs, bool, err);
+ GF_OPTION_INIT ("server-quota", priv->is_quota_on, bool, err);
+ GF_OPTION_INIT ("default-soft-limit", priv->default_soft_lim, percent,
+ err);
+ GF_OPTION_INIT ("soft-timeout", priv->soft_timeout, time, err);
+ GF_OPTION_INIT ("hard-timeout", priv->hard_timeout, time, err);
+ GF_OPTION_INIT ("alert-time", priv->log_timeout, time, err);
+ GF_OPTION_INIT ("volume-uuid", priv->volume_uuid, str, err);
this->local_pool = mem_pool_new (quota_local_t, 64);
if (!this->local_pool) {
@@ -3142,134 +4164,103 @@ init (xlator_t *this)
goto err;
}
+ if (priv->is_quota_on) {
+ priv->rpc_clnt = quota_enforcer_init (this, this->options);
+ if (priv->rpc_clnt == NULL) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota enforcer rpc init failed");
+ goto err;
+ }
+ }
+
ret = 0;
err:
return ret;
}
-
-void
-__quota_reconfigure_inode_ctx (xlator_t *this, inode_t *inode, limits_t *limit)
+int
+reconfigure (xlator_t *this, dict_t *options)
{
- int ret = -1;
- quota_inode_ctx_t *ctx = NULL;
-
- GF_VALIDATE_OR_GOTO ("quota", this, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
- GF_VALIDATE_OR_GOTO (this->name, limit, out);
-
- ret = quota_inode_ctx_get (inode, limit->value, this, NULL, NULL, &ctx,
- 1);
- if ((ret == -1) || (ctx == NULL)) {
- gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
- "context in inode(gfid:%s)",
- uuid_utoa (inode->gfid));
- goto out;
- }
-
- LOCK (&ctx->lock);
- {
- ctx->limit = limit->value;
- }
- UNLOCK (&ctx->lock);
-
-out:
- return;
-}
-
+ int32_t ret = -1;
+ quota_priv_t *priv = NULL;
+ gf_boolean_t quota_on = _gf_false;
-void
-__quota_reconfigure (xlator_t *this, inode_table_t *itable, limits_t *limit)
-{
- inode_t *inode = NULL;
+ priv = this->private;
- if ((this == NULL) || (itable == NULL) || (limit == NULL)) {
- goto out;
- }
+ GF_OPTION_RECONF ("deem-statfs", priv->consider_statfs, options, bool,
+ out);
+ GF_OPTION_RECONF ("server-quota", quota_on, options, bool,
+ out);
+ GF_OPTION_RECONF ("default-soft-limit", priv->default_soft_lim,
+ options, percent, out);
+ GF_OPTION_RECONF ("alert-time", priv->log_timeout, options,
+ time, out);
+ GF_OPTION_RECONF ("soft-timeout", priv->soft_timeout, options,
+ time, out);
+ GF_OPTION_RECONF ("hard-timeout", priv->hard_timeout, options,
+ time, out);
+
+ if (quota_on) {
+ priv->rpc_clnt = quota_enforcer_init (this,
+ this->options);
+ if (priv->rpc_clnt == NULL) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota enforcer rpc init failed");
+ goto out;
+ }
- if (!uuid_is_null (limit->gfid)) {
- inode = inode_find (itable, limit->gfid);
} else {
- inode = inode_resolve (itable, limit->path);
+ if (priv->rpc_clnt) {
+ // Quotad is shut down when there is no started volume
+ // which has quota enabled. So, we should disable the
+ // enforcer client when quota is disabled on a volume,
+ // to avoid spurious reconnect attempts to a service
+ // (quotad) that is known to be down.
+ rpc_clnt_disable (priv->rpc_clnt);
+ }
}
- if (inode != NULL) {
- __quota_reconfigure_inode_ctx (this, inode, limit);
- }
+ priv->is_quota_on = quota_on;
+ ret = 0;
out:
- return;
+ return ret;
}
-
-int
-reconfigure (xlator_t *this, dict_t *options)
+int32_t
+quota_priv_dump (xlator_t *this)
{
- int32_t ret = -1;
- quota_priv_t *priv = NULL;
- limits_t *limit = NULL, *next = NULL, *new = NULL;
- struct list_head head = {0, };
- xlator_t *top = NULL;
- char found = 0;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
- priv = this->private;
- INIT_LIST_HEAD (&head);
+ GF_ASSERT (this);
- LOCK (&priv->lock);
- {
- list_splice_init (&priv->limit_head, &head);
- }
- UNLOCK (&priv->lock);
-
- ret = quota_parse_limits (priv, this, options, &head);
- if (ret == -1) {
- gf_log ("quota", GF_LOG_WARNING,
- "quota reconfigure failed, "
- "new changes will not take effect");
- goto out;
- }
-
- LOCK (&priv->lock);
- {
- top = ((glusterfs_ctx_t *)this->ctx)->active->top;
- GF_ASSERT (top);
-
- list_for_each_entry (limit, &priv->limit_head, limit_list) {
- __quota_reconfigure (this, top->itable, limit);
- }
-
- list_for_each_entry_safe (limit, next, &head, limit_list) {
- found = 0;
- list_for_each_entry (new, &priv->limit_head,
- limit_list) {
- if (strcmp (new->path, limit->path) == 0) {
- found = 1;
- break;
- }
- }
-
- if (!found) {
- limit->value = -1;
- __quota_reconfigure (this, top->itable, limit);
- }
+ priv = this->private;
- list_del_init (&limit->limit_list);
- GF_FREE (limit);
- }
+ gf_proc_dump_add_section ("xlators.features.quota.priv", this->name);
+
+ ret = TRY_LOCK (&priv->lock);
+ if (ret)
+ goto out;
+ else {
+ gf_proc_dump_write("soft-timeout", "%d", priv->soft_timeout);
+ gf_proc_dump_write("hard-timeout", "%d", priv->hard_timeout);
+ gf_proc_dump_write("alert-time", "%d", priv->log_timeout);
+ gf_proc_dump_write("quota-on", "%d", priv->is_quota_on);
+ gf_proc_dump_write("statfs", "%d", priv->consider_statfs);
+ gf_proc_dump_write("volume-uuid", "%s", priv->volume_uuid);
+ gf_proc_dump_write("validation-count", "%ld",
+ priv->validation_count);
}
UNLOCK (&priv->lock);
- GF_OPTION_RECONF ("timeout", priv->timeout, options, int64, out);
- GF_OPTION_RECONF ("deem-statfs", priv->consider_statfs, options, bool,
- out);
-
- ret = 0;
out:
- return ret;
+ return 0;
}
-
void
fini (xlator_t *this)
{
@@ -3304,22 +4295,18 @@ struct xlator_fops fops = {
.removexattr = quota_removexattr,
.fremovexattr = quota_fremovexattr,
.readdirp = quota_readdirp,
+ .fallocate = quota_fallocate,
};
struct xlator_cbks cbks = {
.forget = quota_forget
};
+struct xlator_dumpops dumpops = {
+ .priv = quota_priv_dump,
+};
struct volume_options options[] = {
{.key = {"limit-set"}},
- {.key = {"timeout"},
- .type = GF_OPTION_TYPE_SIZET,
- .min = 0,
- .max = 60,
- .default_value = "0",
- .description = "quota caches the directory sizes on client. Timeout "
- "indicates the timeout for the cache to be revalidated."
- },
{.key = {"deem-statfs"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
@@ -3327,5 +4314,60 @@ struct volume_options options[] = {
"consideration while estimating fs size. (df command)"
" (Default is off)."
},
+ {.key = {"server-quota"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Skip the quota enforcement if the feature is"
+ " not turned on. This is not a user exposed option."
+ },
+ {.key = {"default-soft-limit"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "80%",
+ },
+ {.key = {"soft-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .min = 0,
+ .max = 1800,
+ .default_value = "60",
+ .description = "quota caches the directory sizes on client. "
+ "soft-timeout indicates the timeout for the validity of"
+ " cache before soft-limit has been crossed."
+ },
+ {.key = {"hard-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .min = 0,
+ .max = 60,
+ .default_value = "5",
+ .description = "quota caches the directory sizes on client. "
+ "hard-timeout indicates the timeout for the validity of"
+ " cache after soft-limit has been crossed."
+ },
+ { .key = {"username"},
+ .type = GF_OPTION_TYPE_ANY,
+ },
+ { .key = {"password"},
+ .type = GF_OPTION_TYPE_ANY,
+ },
+ { .key = {"transport-type"},
+ .value = {"tcp", "socket", "ib-verbs", "unix", "ib-sdp",
+ "tcp/client", "ib-verbs/client", "rdma"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+ { .key = {"remote-host"},
+ .type = GF_OPTION_TYPE_INTERNET_ADDRESS,
+ },
+ { .key = {"remote-port"},
+ .type = GF_OPTION_TYPE_INT,
+ },
+ { .key = {"volume-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "uuid of the volume this brick is part of."
+ },
+ { .key = {"alert-time"},
+ .type = GF_OPTION_TYPE_TIME,
+ .min = 0,
+ .max = 7*86400,
+ .default_value = "86400",
+ },
{.key = {NULL}}
};
diff --git a/xlators/features/quota/src/quota.h b/xlators/features/quota/src/quota.h
index 84ecbb308..84c3257fe 100644
--- a/xlators/features/quota/src/quota.h
+++ b/xlators/features/quota/src/quota.h
@@ -12,20 +12,48 @@
#include "config.h"
#endif
+#ifndef _QUOTA_H
+#define _QUOTA_H
+
#include "xlator.h"
#include "call-stub.h"
#include "defaults.h"
-#include "byte-order.h"
#include "common-utils.h"
#include "quota-mem-types.h"
+#include "glusterfs.h"
+#include "compat.h"
+#include "logging.h"
+#include "dict.h"
+#include "stack.h"
+#include "common-utils.h"
+#include "event.h"
+#include "globals.h"
+#include "rpcsvc.h"
+#include "rpc-clnt.h"
+#include "byte-order.h"
+#include "glusterfs3-xdr.h"
+#include "glusterfs3.h"
+#include "xdr-generic.h"
+#include "compat-errno.h"
+#include "protocol-common.h"
-#define QUOTA_XATTR_PREFIX "trusted."
#define DIRTY "dirty"
#define SIZE "size"
#define CONTRIBUTION "contri"
#define VAL_LENGTH 8
#define READDIR_BUF 4096
+#ifndef UUID_CANONICAL_FORM_LEN
+#define UUID_CANONICAL_FORM_LEN 36
+#endif
+
+/*
+ * Jump to `label` when `is_quota_on` evaluates to false.
+ * NOTE(review): this expands to a bare `if` statement (no
+ * do { } while (0) wrapper), so it must not be used directly in front
+ * of an `else` - confirm the call sites.
+ */
+#define WIND_IF_QUOTAOFF(is_quota_on, label) \
+ if (!is_quota_on) \
+ goto label;
+
+/*
+ * True when the size went from below `lim` (prev_size) to at or above
+ * `lim` (cur_size), i.e. the limit was reached by this very change.
+ */
+#define DID_REACH_LIMIT(lim, prev_size, cur_size) \
+ ((cur_size) >= (lim) && (prev_size) < (lim))
+
#define QUOTA_SAFE_INCREMENT(lock, var) \
do { \
LOCK (lock); \
@@ -46,7 +74,7 @@
gf_quota_mt_##type); \
if (!var) { \
gf_log ("", GF_LOG_ERROR, \
- "out of memory :("); \
+ "out of memory"); \
ret = -1; \
goto label; \
} \
@@ -74,11 +102,17 @@
#define GET_CONTRI_KEY(var, _vol_name, _gfid, _ret) \
do { \
char _gfid_unparsed[40]; \
- uuid_unparse (_gfid, _gfid_unparsed); \
- _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \
- "%s.%s." CONTRIBUTION, \
- _vol_name, _gfid_unparsed); \
- } while (0)
+ if (_gfid != NULL) { \
+ uuid_unparse (_gfid, _gfid_unparsed); \
+ _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \
+ "%s.%s." CONTRIBUTION, \
+ _vol_name, _gfid_unparsed); \
+ } else { \
+ _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \
+ "%s.." CONTRIBUTION, \
+ _vol_name); \
+ } \
+ } while (0)
#define GET_CONTRI_KEY_OR_GOTO(var, _vol_name, _gfid, label) \
@@ -96,6 +130,11 @@
goto label; \
} while (0)
+#define QUOTA_REG_OR_LNK_FILE(ia_type) \
+ (IA_ISREG (ia_type) || IA_ISLNK (ia_type))
+
+
+
struct quota_dentry {
char *name;
uuid_t par;
@@ -105,47 +144,77 @@ typedef struct quota_dentry quota_dentry_t;
struct quota_inode_ctx {
int64_t size;
- int64_t limit;
+ int64_t hard_lim;
+ int64_t soft_lim;
struct iatt buf;
struct list_head parents;
struct timeval tv;
+ struct timeval prev_log;
gf_lock_t lock;
};
typedef struct quota_inode_ctx quota_inode_ctx_t;
+struct quota_limit {
+ int64_t hard_lim;
+ int64_t soft_lim_percent;
+} __attribute__ ((packed));
+typedef struct quota_limit quota_limit_t;
+
+typedef void
+(*quota_ancestry_built_t) (struct list_head *parents, inode_t *inode,
+ int32_t op_ret, int32_t op_errno, void *data);
+
struct quota_local {
- gf_lock_t lock;
- uint32_t validate_count;
- uint32_t link_count;
- loc_t loc;
- loc_t oldloc;
- loc_t newloc;
- loc_t validate_loc;
- int64_t delta;
- int32_t op_ret;
- int32_t op_errno;
- int64_t size;
- int64_t limit;
- char just_validated;
- inode_t *inode;
- call_stub_t *stub;
+ gf_lock_t lock;
+ uint32_t validate_count;
+ uint32_t link_count;
+ loc_t loc;
+ loc_t oldloc;
+ loc_t newloc;
+ loc_t validate_loc;
+ int64_t delta;
+ int32_t op_ret;
+ int32_t op_errno;
+ int64_t size;
+ gf_boolean_t skip_check;
+ char just_validated;
+ fop_lookup_cbk_t validate_cbk;
+ inode_t *inode;
+ call_stub_t *stub;
+ struct iobref *iobref;
+ quota_limit_t limit;
+ int64_t space_available;
+ quota_ancestry_built_t ancestry_cbk;
+ void *ancestry_data;
};
-typedef struct quota_local quota_local_t;
+typedef struct quota_local quota_local_t;
struct quota_priv {
- int64_t timeout;
- gf_boolean_t consider_statfs;
- struct list_head limit_head;
- gf_lock_t lock;
+ uint32_t soft_timeout;
+ uint32_t hard_timeout;
+ uint32_t log_timeout;
+ double default_soft_lim;
+ gf_boolean_t is_quota_on;
+ gf_boolean_t consider_statfs;
+ gf_lock_t lock;
+ rpc_clnt_prog_t *quota_enforcer;
+ struct rpcsvc_program *quotad_aggregator;
+ struct rpc_clnt *rpc_clnt;
+ rpcsvc_t *rpcsvc;
+ inode_table_t *itable;
+ char *volume_uuid;
+ uint64_t validation_count;
};
-typedef struct quota_priv quota_priv_t;
+typedef struct quota_priv quota_priv_t;
-struct limits {
- struct list_head limit_list;
- char *path;
- int64_t value;
- uuid_t gfid;
-};
-typedef struct limits limits_t;
+int
+quota_enforcer_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata, fop_lookup_cbk_t cbk);
+struct rpc_clnt *
+quota_enforcer_init (xlator_t *this, dict_t *options);
-uint64_t cn = 1;
+void
+quota_log_usage (xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode,
+ int64_t delta);
+
+#endif
diff --git a/xlators/features/quota/src/quotad-aggregator.c b/xlators/features/quota/src/quotad-aggregator.c
new file mode 100644
index 000000000..5f13fd251
--- /dev/null
+++ b/xlators/features/quota/src/quotad-aggregator.c
@@ -0,0 +1,423 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "cli1-xdr.h"
+#include "quota.h"
+#include "quotad-helpers.h"
+#include "quotad-aggregator.h"
+
+struct rpcsvc_program quotad_aggregator_prog;
+
+/*
+ * Encode the reply structure `arg` with `xdrproc` into a newly obtained
+ * iobuf and describe the encoded bytes through `outmsg`.
+ *
+ * Returns the iobuf that holds the encoded reply. NULL is returned when
+ * `req` is NULL, when there is nothing to encode (`arg` or `xdrproc`
+ * missing; `outmsg` is then given a zero length) or when no iobuf could
+ * be obtained.
+ *
+ * A failure to encode is not treated as a failure of this function: the
+ * iobuf is still returned, with req->rpc_err set to GARBAGE_ARGS and a
+ * zero-length `outmsg`, so that a reply can still be sent to the client.
+ */
+struct iobuf *
+quotad_serialize_reply (rpcsvc_request_t *req, void *arg, struct iovec *outmsg,
+                        xdrproc_t xdrproc)
+{
+        struct iobuf *reply_buf = NULL;
+        ssize_t       encoded   = 0;
+        ssize_t       needed    = 0;
+
+        GF_VALIDATE_OR_GOTO ("server", req, done);
+
+        if (!arg || !xdrproc) {
+                /* nothing to serialize: empty message and no buffer */
+                outmsg->iov_len = encoded;
+                goto done;
+        }
+
+        /* get the io buffer into which the reply will be serialized */
+        needed = xdr_sizeof (xdrproc, arg);
+        reply_buf = iobuf_get2 (req->svc->ctx->iobuf_pool, needed);
+        if (reply_buf == NULL) {
+                gf_log_callingfn (THIS->name, GF_LOG_ERROR,
+                                  "Failed to get iobuf");
+                goto done;
+        }
+
+        iobuf_to_iovec (reply_buf, outmsg);
+
+        /* the length is kept in a signed variable because -1 is used to
+         * report an encoding error */
+        encoded = xdr_serialize_generic (*outmsg, arg, xdrproc);
+        if (encoded == -1) {
+                gf_log_callingfn ("", GF_LOG_ERROR, "Failed to encode message");
+                req->rpc_err = GARBAGE_ARGS;
+                encoded = 0;
+        }
+
+        outmsg->iov_len = encoded;
+done:
+        return reply_buf;
+}
+
+/*
+ * Serialize `arg` with `xdrproc` and submit it as the reply to `req`,
+ * then release what belongs to the request: the aggregator state, the
+ * client reference and the call stack of `frame` (when a frame is
+ * given) and the iobref when it had to be created here.
+ *
+ * `frame` may be NULL, in which case only the reply is sent. When
+ * `iobref` is NULL a temporary one is allocated for this call.
+ *
+ * Returns 0 when the reply was submitted, -1 when `req` is NULL, when a
+ * buffer could not be set up or when the submission itself failed.
+ */
+int
+quotad_aggregator_submit_reply (call_frame_t *frame, rpcsvc_request_t *req,
+                                void *arg, struct iovec *payload,
+                                int payloadcount, struct iobref *iobref,
+                                xdrproc_t xdrproc)
+{
+        struct iobuf              *iob        = NULL;
+        int                        ret        = -1;
+        struct iovec               rsp        = {0,};
+        quotad_aggregator_state_t *state      = NULL;
+        char                       new_iobref = 0;
+
+        GF_VALIDATE_OR_GOTO ("server", req, out);
+
+        if (frame) {
+                state = frame->root->state;
+                frame->local = NULL;
+        }
+
+        if (!iobref) {
+                iobref = iobref_new ();
+                if (!iobref) {
+                        goto out;
+                }
+
+                new_iobref = 1;
+        }
+
+        iob = quotad_serialize_reply (req, arg, &rsp, xdrproc);
+        if (!iob) {
+                gf_log ("", GF_LOG_ERROR, "Failed to serialize reply");
+                goto out;
+        }
+
+        iobref_add (iobref, iob);
+
+        ret = rpcsvc_submit_generic (req, &rsp, 1, payload, payloadcount,
+                                     iobref);
+
+        /* the iobref holds its own reference on the buffer */
+        iobuf_unref (iob);
+
+        if (ret) {
+                /* do not hide a failed submission from the caller */
+                gf_log_callingfn ("", GF_LOG_ERROR,
+                                  "Reply submission failed");
+                ret = -1;
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (state) {
+                quotad_aggregator_free_state (state);
+        }
+
+        if (frame) {
+                if (frame->root->client)
+                        gf_client_unref (frame->root->client);
+
+                STACK_DESTROY (frame->root);
+        }
+
+        if (new_iobref) {
+                iobref_unref (iobref);
+        }
+
+        return ret;
+}
+
+/*
+ * Completion callback of the nameless lookup started by
+ * quotad_aggregator_getlimit (). `lookup_rsp` is a gfs3_lookup_rsp;
+ * its serialized xdata is unpacked and packed again into the dict of a
+ * gf_cli_rsp, which is then submitted as the reply to the request kept
+ * in frame->local.
+ *
+ * Always returns 0. Without a frame or a response nothing can be sent,
+ * so the call is only logged.
+ *
+ * NOTE(review): rsp->op_ret is replaced by the result of the
+ * unserialization, so the op_ret of the lookup itself does not seem to
+ * reach the client (only op_errno does) - confirm this is intended.
+ */
+int
+quotad_aggregator_getlimit_cbk (xlator_t *this, call_frame_t *frame,
+                                void *lookup_rsp)
+{
+        gfs3_lookup_rsp *rsp     = lookup_rsp;
+        gf_cli_rsp       cli_rsp = {0,};
+        dict_t          *xdata   = NULL;
+        int              ret     = -1;
+
+        if (!frame || !rsp) {
+                gf_log ("quotad-aggregator", GF_LOG_ERROR,
+                        "cannot reply without a frame and a lookup rsp");
+                return 0;
+        }
+
+        GF_PROTOCOL_DICT_UNSERIALIZE (frame->this, xdata,
+                                      (rsp->xdata.xdata_val),
+                                      (rsp->xdata.xdata_len), rsp->op_ret,
+                                      rsp->op_errno, out);
+
+        ret = 0;
+out:
+        rsp->op_ret = ret;
+
+        /* fill in the reply before branching, so that the failure path
+         * does not send op_ret 0 and a NULL op_errstr */
+        cli_rsp.op_ret = rsp->op_ret;
+        cli_rsp.op_errno = rsp->op_errno;
+        cli_rsp.op_errstr = "";
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to unserialize "
+                        "nameless lookup rsp");
+                goto reply;
+        }
+
+        if (xdata) {
+                GF_PROTOCOL_DICT_SERIALIZE (frame->this, xdata,
+                                            (&cli_rsp.dict.dict_val),
+                                            (cli_rsp.dict.dict_len),
+                                            cli_rsp.op_errno, reply);
+        }
+
+reply:
+        quotad_aggregator_submit_reply (frame, frame->local, (void*)&cli_rsp,
+                                        NULL, 0, NULL,
+                                        (xdrproc_t)xdr_gf_cli_rsp);
+
+        if (xdata)
+                dict_unref (xdata);
+        GF_FREE (cli_rsp.dict.dict_val);
+        return 0;
+}
+
+/*
+ * Actor of GF_AGGREGATOR_GETLIMIT: decode a gf_cli_req carrying a
+ * serialized dictionary, read the "gfid" string from it and start a
+ * nameless lookup on that gfid. The request dictionary, extended with
+ * QUOTA_LIMIT_KEY, QUOTA_SIZE_KEY and GET_ANCESTRY_PATH_KEY, travels
+ * along as xdata; the answer is sent from
+ * quotad_aggregator_getlimit_cbk ().
+ *
+ * When something fails before the lookup is started, a gf_cli_rsp with
+ * op_ret -1 and the collected errno is submitted directly. The frame
+ * may not exist yet at that point, which
+ * quotad_aggregator_submit_reply () tolerates.
+ *
+ * NOTE(review): as before, the error path both replies and returns a
+ * possibly negative value - confirm how the rpcsvc layer treats a
+ * failing actor that has already replied.
+ * NOTE(review): cli_req.dict.dict_val does not appear to be released
+ * here - confirm who owns the decoded buffer.
+ */
+int
+quotad_aggregator_getlimit (rpcsvc_request_t *req)
+{
+        call_frame_t              *frame    = NULL;
+        gf_cli_req                 cli_req  = {{0}, };
+        gf_cli_rsp                 cli_rsp  = {0};
+        gfs3_lookup_req            args     = {{0,},};
+        quotad_aggregator_state_t *state    = NULL;
+        xlator_t                  *this     = NULL;
+        dict_t                    *dict     = NULL;
+        int                        ret      = -1, op_errno = 0;
+        char                      *gfid_str = NULL;
+        uuid_t                     gfid     = {0};
+
+        GF_VALIDATE_OR_GOTO ("quotad-aggregator", req, err);
+
+        this = THIS;
+
+        ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+        if (ret < 0) {
+                //failed to decode msg;
+                gf_log ("", GF_LOG_ERROR, "xdr decoding error");
+                req->rpc_err = GARBAGE_ARGS;
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        if (cli_req.dict.dict_len) {
+                dict = dict_new ();
+                if (dict == NULL) {
+                        ret = -1;
+                        op_errno = ENOMEM;
+                        goto err;
+                }
+
+                ret = dict_unserialize (cli_req.dict.dict_val,
+                                        cli_req.dict.dict_len, &dict);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "unserialize req-buffer to dictionary");
+                        op_errno = EINVAL;
+                        goto err;
+                }
+        }
+
+        if (dict == NULL) {
+                /* a request without a dictionary cannot name a gfid */
+                ret = -1;
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        ret = dict_get_str (dict, "gfid", &gfid_str);
+        if (ret) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        if (uuid_parse ((const char*)gfid_str, gfid) != 0) {
+                ret = -1;
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        frame = quotad_aggregator_get_frame_from_req (req);
+        if (frame == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        state = frame->root->state;
+        if (state == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        /* The state takes over the reference on the dictionary; it is
+         * dropped when the state is freed while replying, so it must not
+         * be unref'ed a second time below. */
+        state->xdata = dict;
+        dict = NULL;
+
+        ret = dict_set_int32 (state->xdata, QUOTA_LIMIT_KEY, 42);
+        if (ret) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        ret = dict_set_int32 (state->xdata, QUOTA_SIZE_KEY, 42);
+        if (ret) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        ret = dict_set_int32 (state->xdata, GET_ANCESTRY_PATH_KEY, 42);
+        if (ret) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        /* only the gfid of the lookup request is consumed by
+         * qd_nameless_lookup (); the request is not XDR-decoded here, so
+         * no scratch buffers are needed for bname/xdata */
+        memcpy (&args.gfid, &gfid, 16);
+
+        ret = qd_nameless_lookup (this, frame, &args, state->xdata,
+                                  quotad_aggregator_getlimit_cbk);
+        if (ret) {
+                op_errno = ret;
+                goto err;
+        }
+
+        return ret;
+
+err:
+        cli_rsp.op_ret = -1;
+        cli_rsp.op_errno = op_errno;
+        cli_rsp.op_errstr = "";
+
+        /* reply with the CLI response type directly; the lookup callback
+         * expects a gfs3_lookup_rsp and a valid frame */
+        quotad_aggregator_submit_reply (frame, req, (void *)&cli_rsp, NULL, 0,
+                                        NULL, (xdrproc_t)xdr_gf_cli_rsp);
+
+        if (dict)
+                dict_unref (dict);
+
+        return ret;
+}
+
+/*
+ * Completion callback of the lookup actor: submits `rsp` (encoded as a
+ * gfs3_lookup_rsp) as the reply to the request stored in frame->local.
+ * Always returns 0.
+ */
+int
+quotad_aggregator_lookup_cbk (xlator_t *this, call_frame_t *frame,
+                              void *rsp)
+{
+        rpcsvc_request_t *request = frame->local;
+
+        quotad_aggregator_submit_reply (frame, request, rsp, NULL, 0, NULL,
+                                        (xdrproc_t)xdr_gfs3_lookup_rsp);
+        return 0;
+}
+
+
+/*
+ * Actor of GF_AGGREGATOR_LOOKUP: decode a gfs3_lookup_req, unpack its
+ * xdata into the aggregator state and start a nameless lookup whose
+ * answer is sent from quotad_aggregator_lookup_cbk ().
+ *
+ * When something fails before the lookup is started, a gfs3_lookup_rsp
+ * with op_ret -1 and the collected errno is submitted directly. The
+ * frame may not exist yet at that point, which
+ * quotad_aggregator_submit_reply () tolerates.
+ *
+ * NOTE(review): as before, the error path both replies and returns a
+ * possibly negative value - confirm how the rpcsvc layer treats a
+ * failing actor that has already replied.
+ */
+int
+quotad_aggregator_lookup (rpcsvc_request_t *req)
+{
+        call_frame_t              *frame = NULL;
+        gfs3_lookup_req            args  = {{0,},};
+        int                        ret   = -1, op_errno = 0;
+        gfs3_lookup_rsp            rsp   = {0,};
+        quotad_aggregator_state_t *state = NULL;
+        xlator_t                  *this  = NULL;
+
+        GF_VALIDATE_OR_GOTO ("quotad-aggregator", req, err);
+
+        this = THIS;
+
+        /* scratch space the decoder fills in; neither field can be
+         * larger than the message itself */
+        args.bname = alloca (req->msg[0].iov_len);
+        args.xdata.xdata_val = alloca (req->msg[0].iov_len);
+
+        ret = xdr_to_generic (req->msg[0], &args,
+                              (xdrproc_t)xdr_gfs3_lookup_req);
+        if (ret < 0) {
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        frame = quotad_aggregator_get_frame_from_req (req);
+        if (frame == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        state = frame->root->state;
+        if (state == NULL) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        GF_PROTOCOL_DICT_UNSERIALIZE (this, state->xdata,
+                                      (args.xdata.xdata_val),
+                                      (args.xdata.xdata_len), ret,
+                                      op_errno, err);
+
+        ret = qd_nameless_lookup (this, frame, &args, state->xdata,
+                                  quotad_aggregator_lookup_cbk);
+        if (ret) {
+                op_errno = ret;
+                goto err;
+        }
+
+        return ret;
+
+err:
+        rsp.op_ret = -1;
+        rsp.op_errno = op_errno;
+
+        /* submit directly: the callback needs a frame, which may not
+         * have been created when we get here */
+        quotad_aggregator_submit_reply (frame, req, &rsp, NULL, 0, NULL,
+                                        (xdrproc_t)xdr_gfs3_lookup_rsp);
+        return ret;
+}
+
+/*
+ * Notification hook for events of the aggregator's rpcsvc. No event is
+ * acted upon at the moment; a call without `xl` or `data` is logged as
+ * a warning. Always returns 0.
+ */
+int
+quotad_aggregator_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
+                              void *data)
+{
+        if (xl == NULL || data == NULL) {
+                gf_log_callingfn ("server", GF_LOG_WARNING,
+                                  "Calling rpc_notify without initializing");
+                return 0;
+        }
+
+        switch (event) {
+        case RPCSVC_EVENT_ACCEPT:
+        case RPCSVC_EVENT_DISCONNECT:
+        default:
+                /* nothing to do for any event yet */
+                break;
+        }
+
+        return 0;
+}
+
+/*
+ * Set up the RPC service of the aggregator: force a unix-domain socket
+ * transport listening on /tmp/quotad.socket into this->options, create
+ * the rpcsvc and its listeners and register quotad_aggregator_prog.
+ *
+ * The rpcsvc and the program are recorded in this->private (a
+ * quota_priv_t). Returns 0 on success and a non-zero value otherwise.
+ */
+int
+quotad_aggregator_init (xlator_t *this)
+{
+        quota_priv_t *conf   = this->private;
+        int           status = -1;
+
+        status = dict_set_str (this->options, "transport.address-family",
+                               "unix");
+        if (status)
+                return status;
+
+        status = dict_set_str (this->options, "transport-type", "socket");
+        if (status)
+                return status;
+
+        status = dict_set_str (this->options, "transport.socket.listen-path",
+                               "/tmp/quotad.socket");
+        if (status)
+                return status;
+
+        /* RPC related */
+        conf->rpcsvc = rpcsvc_init (this, this->ctx, this->options, 0);
+        if (!conf->rpcsvc) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "creation of rpcsvc failed");
+                return -1;
+        }
+
+        /* anything below one listener counts as a failure */
+        status = rpcsvc_create_listeners (conf->rpcsvc, this->options,
+                                          this->name);
+        if (status < 1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "creation of listener failed");
+                return -1;
+        }
+
+        conf->quotad_aggregator = &quotad_aggregator_prog;
+        quotad_aggregator_prog.options = this->options;
+
+        status = rpcsvc_program_register (conf->rpcsvc,
+                                          &quotad_aggregator_prog);
+        if (status) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "registration of program (name:%s, prognum:%d, "
+                        "progver:%d) failed", quotad_aggregator_prog.progname,
+                        quotad_aggregator_prog.prognum,
+                        quotad_aggregator_prog.progver);
+                return status;
+        }
+
+        return 0;
+}
+
+/*
+ * Procedure table of the aggregator program, indexed by procedure
+ * number. Each entry carries the number of its own slot; the LOOKUP
+ * entry used to carry GF_AGGREGATOR_NULL.
+ */
+rpcsvc_actor_t quotad_aggregator_actors[] = {
+        [GF_AGGREGATOR_NULL]     = {"NULL", GF_AGGREGATOR_NULL, NULL, NULL, 0,
+                                    DRC_NA},
+        [GF_AGGREGATOR_LOOKUP]   = {"LOOKUP", GF_AGGREGATOR_LOOKUP,
+                                    quotad_aggregator_lookup, NULL, 0, DRC_NA},
+        [GF_AGGREGATOR_GETLIMIT] = {"GETLIMIT", GF_AGGREGATOR_GETLIMIT,
+                                    quotad_aggregator_getlimit, NULL, 0,
+                                    DRC_NA},
+};
+
+
+/*
+ * The RPC program served by quotad. It is registered with the rpcsvc in
+ * quotad_aggregator_init (), which also fills in the `options` member.
+ */
+struct rpcsvc_program quotad_aggregator_prog = {
+ .progname = "GlusterFS 3.3",
+ .prognum = GLUSTER_AGGREGATOR_PROGRAM,
+ .progver = GLUSTER_AGGREGATOR_VERSION,
+ .numactors = GF_AGGREGATOR_MAXVALUE,
+ .actors = quotad_aggregator_actors
+};
diff --git a/xlators/features/quota/src/quotad-aggregator.h b/xlators/features/quota/src/quotad-aggregator.h
new file mode 100644
index 000000000..5ddea5b3c
--- /dev/null
+++ b/xlators/features/quota/src/quotad-aggregator.h
@@ -0,0 +1,37 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _QUOTAD_AGGREGATOR_H
+#define _QUOTAD_AGGREGATOR_H
+
+#include "quota.h"
+#include "stack.h"
+#include "glusterfs3-xdr.h"
+#include "inode.h"
+
+/*
+ * Per-request state of the aggregator. It is attached to the root of
+ * the call stack created for a request and released together with it
+ * by quotad_aggregator_free_state ().
+ */
+typedef struct {
+ /* call pool of the glusterfs context */
+ void *pool;
+ /* xlator that was current (THIS) when the state was created */
+ xlator_t *this;
+ /* first child of the quotad xlator */
+ xlator_t *active_subvol;
+ /* inode table of active_subvol */
+ inode_table_t *itable;
+ loc_t loc;
+ /* request dictionary; unref'ed when the state is freed */
+ dict_t *xdata;
+} quotad_aggregator_state_t;
+
+/*
+ * Callback through which the result of a nameless lookup is handed
+ * back; `rsp` points to a gfs3_lookup_rsp.
+ */
+typedef int (*quotad_aggregator_lookup_cbk_t) (xlator_t *this,
+ call_frame_t *frame,
+ void *rsp);
+/*
+ * Look up the gfid of `req` on the child volume named by the
+ * "volume-uuid" entry of `xdata`; the outcome is reported through
+ * `lookup_cbk`.
+ */
+int
+qd_nameless_lookup (xlator_t *this, call_frame_t *frame, gfs3_lookup_req *req,
+ dict_t *xdata, quotad_aggregator_lookup_cbk_t lookup_cbk);
+/* Create the RPC service of the aggregator; 0 on success. */
+int
+quotad_aggregator_init (xlator_t *this);
+
+#endif
diff --git a/xlators/features/quota/src/quotad-helpers.c b/xlators/features/quota/src/quotad-helpers.c
new file mode 100644
index 000000000..fd3099114
--- /dev/null
+++ b/xlators/features/quota/src/quotad-helpers.c
@@ -0,0 +1,113 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "quotad-helpers.h"
+
+/*
+ * Allocate the per-request state and fill in the current xlator, the
+ * first child of `this`, that child's inode table (created with 4096
+ * as its first argument when the child has none yet) and the call pool
+ * of the context. Returns NULL when the allocation fails.
+ */
+quotad_aggregator_state_t *
+get_quotad_aggregator_state (xlator_t *this, rpcsvc_request_t *req)
+{
+        quotad_aggregator_state_t *st    = NULL;
+        xlator_t                  *child = NULL;
+        quota_priv_t              *conf  = NULL;
+
+        st = (void *)GF_CALLOC (1, sizeof (*st),
+                                gf_quota_mt_aggregator_state_t);
+        if (st == NULL)
+                return NULL;
+
+        st->this = THIS;
+        conf = this->private;
+
+        LOCK (&conf->lock);
+        {
+                child = FIRST_CHILD (this);
+                st->active_subvol = child;
+        }
+        UNLOCK (&conf->lock);
+
+        if (!child->itable)
+                child->itable = inode_table_new (4096, child);
+
+        st->itable = child->itable;
+        st->pool = this->ctx->pool;
+
+        return st;
+}
+
+/*
+ * Release a per-request state: drop the reference on its request
+ * dictionary, if any, and free the structure. `state` must not be NULL.
+ */
+void
+quotad_aggregator_free_state (quotad_aggregator_state_t *state)
+{
+        dict_t *req_dict = state->xdata;
+
+        if (req_dict != NULL)
+                dict_unref (req_dict);
+
+        GF_FREE (state);
+}
+
+/*
+ * Create a call frame for `req` on the xlator that owns the rpcsvc
+ * (req->svc->mydata) and attach a fresh aggregator state to its root.
+ *
+ * Returns NULL when the request is incomplete or when either the frame
+ * or the state cannot be allocated; a frame is never returned without
+ * its state.
+ */
+call_frame_t *
+quotad_aggregator_alloc_frame (rpcsvc_request_t *req)
+{
+        call_frame_t              *frame = NULL;
+        quotad_aggregator_state_t *state = NULL;
+        xlator_t                  *this  = NULL;
+
+        GF_VALIDATE_OR_GOTO ("server", req, out);
+        GF_VALIDATE_OR_GOTO ("server", req->trans, out);
+        GF_VALIDATE_OR_GOTO ("server", req->svc, out);
+        GF_VALIDATE_OR_GOTO ("server", req->svc->ctx, out);
+
+        this = req->svc->mydata;
+
+        frame = create_frame (this, req->svc->ctx->pool);
+        if (!frame)
+                goto out;
+
+        state = get_quotad_aggregator_state (this, req);
+        if (!state) {
+                /* callers dereference the state right away, so give the
+                 * frame back instead of returning it half set up */
+                STACK_DESTROY (frame->root);
+                frame = NULL;
+                goto out;
+        }
+
+        frame->root->state = state;
+        frame->root->unique = 0;
+
+        frame->this = this;
+out:
+        return frame;
+}
+
+/*
+ * Build the call frame for an incoming request: allocate frame and
+ * state, copy procedure number, xid, credentials and lock owner from
+ * the request into the stack root, take a reference on the client found
+ * in req->trans->xl_private and remember the request in frame->local.
+ *
+ * Returns NULL when `req` is NULL or the frame cannot be allocated.
+ */
+call_frame_t *
+quotad_aggregator_get_frame_from_req (rpcsvc_request_t *req)
+{
+        call_frame_t *new_frame = NULL;
+        client_t     *peer      = NULL;
+
+        GF_VALIDATE_OR_GOTO ("server", req, done);
+
+        new_frame = quotad_aggregator_alloc_frame (req);
+        if (new_frame == NULL)
+                goto done;
+
+        /* what is being called, and on whose behalf */
+        new_frame->root->op       = req->procnum;
+        new_frame->root->unique   = req->xid;
+        new_frame->root->uid      = req->uid;
+        new_frame->root->gid      = req->gid;
+        new_frame->root->pid      = req->pid;
+        new_frame->root->lk_owner = req->lk_owner;
+
+        /* the reference is dropped again when the reply is submitted */
+        peer = req->trans->xl_private;
+        gf_client_ref (peer);
+        new_frame->root->client = peer;
+
+        new_frame->local = req;
+done:
+        return new_frame;
+}
diff --git a/xlators/features/quota/src/quotad-helpers.h b/xlators/features/quota/src/quotad-helpers.h
new file mode 100644
index 000000000..a10fb7fa8
--- /dev/null
+++ b/xlators/features/quota/src/quotad-helpers.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef QUOTAD_HELPERS_H
+#define QUOTAD_HELPERS_H
+
+#include "rpcsvc.h"
+#include "quota.h"
+#include "quotad-aggregator.h"
+
+void
+quotad_aggregator_free_state (quotad_aggregator_state_t *state);
+
+call_frame_t *
+quotad_aggregator_get_frame_from_req (rpcsvc_request_t *req);
+
+#endif
diff --git a/xlators/features/quota/src/quotad.c b/xlators/features/quota/src/quotad.c
new file mode 100644
index 000000000..243b943e9
--- /dev/null
+++ b/xlators/features/quota/src/quotad.c
@@ -0,0 +1,210 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include "quota.h"
+#include "quotad-aggregator.h"
+#include "common-utils.h"
+
+/*
+ * Set up memory accounting for this xlator with gf_quota_mt_end + 1
+ * types. Returns -1 when `this` is NULL, otherwise the result of
+ * xlator_mem_acct_init () (a failure is also logged).
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int status = -1;
+
+        if (this == NULL)
+                return status;
+
+        status = xlator_mem_acct_init (this, gf_quota_mt_end + 1);
+        if (status != 0) {
+                gf_log (this->name, GF_LOG_WARNING, "Memory accounting "
+                        "init failed");
+        }
+
+        return status;
+}
+
+/*
+ * Lookup callback of the wind done in qd_nameless_lookup (). The result
+ * is converted into a gfs3_lookup_rsp and handed to the aggregator
+ * callback that was passed along as `cookie`; afterwards the serialized
+ * xdata is freed and the inode is unref'ed. Always returns 0.
+ *
+ * When serializing `xdata` fails the response is handed over without
+ * the stat of the looked-up entry.
+ */
+int32_t
+qd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, inode_t *inode,
+               struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+        quotad_aggregator_lookup_cbk_t reply_fn = cookie;
+        gfs3_lookup_rsp                reply    = {0, };
+
+        reply.op_ret = op_ret;
+        reply.op_errno = op_errno;
+
+        gf_stat_from_iatt (&reply.postparent, postparent);
+
+        GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&reply.xdata.xdata_val),
+                                    reply.xdata.xdata_len, reply.op_errno,
+                                    out);
+
+        gf_stat_from_iatt (&reply.stat, buf);
+
+out:
+        reply_fn (this, frame, &reply);
+
+        GF_FREE (reply.xdata.xdata_val);
+
+        inode_unref (inode);
+
+        return 0;
+}
+
+/*
+ * Find the child of `this` whose "<child-name>.volume-id" option equals
+ * `volume_uuid`. Children without such an option are skipped. Returns
+ * NULL when there is no match or when an argument is NULL.
+ */
+xlator_t *
+qd_find_subvol (xlator_t *this, char *volume_uuid)
+{
+        xlator_list_t *entry      = NULL;
+        char           key[1024];
+        char          *configured = NULL;
+
+        if (this == NULL || volume_uuid == NULL)
+                return NULL;
+
+        for (entry = this->children; entry != NULL; entry = entry->next) {
+                snprintf (key, sizeof (key), "%s.volume-id",
+                          entry->xlator->name);
+
+                if (dict_get_str (this->options, key, &configured) < 0)
+                        continue;
+
+                if (strcmp (configured, volume_uuid) == 0)
+                        return entry->xlator;
+        }
+
+        return NULL;
+}
+
+/*
+ * Wind a lookup by gfid (no name, no parent) to the child volume whose
+ * id matches the "volume-uuid" entry of `xdata`. The result reaches
+ * `lookup_cbk` through qd_lookup_cbk ().
+ *
+ * When the inode cannot be allocated (ENOMEM) or the volume cannot be
+ * determined (EINVAL), `lookup_cbk` is called right away with a
+ * response carrying op_ret -1. The function itself always returns 0.
+ */
+int
+qd_nameless_lookup (xlator_t *this, call_frame_t *frame, gfs3_lookup_req *req,
+                    dict_t *xdata, quotad_aggregator_lookup_cbk_t lookup_cbk)
+{
+        gfs3_lookup_rsp            failure  = {0, };
+        int                        err      = 0;
+        loc_t                      loc      = {0, };
+        quotad_aggregator_state_t *state    = frame->root->state;
+        xlator_t                  *subvol   = NULL;
+        char                      *uuid_str = NULL;
+
+        frame->root->op = GF_FOP_LOOKUP;
+
+        loc.inode = inode_new (state->itable);
+        if (!loc.inode) {
+                err = ENOMEM;
+                goto fail;
+        }
+
+        memcpy (loc.gfid, req->gfid, 16);
+
+        if (dict_get_str (xdata, "volume-uuid", &uuid_str) < 0) {
+                err = EINVAL;
+                goto fail;
+        }
+
+        subvol = qd_find_subvol (this, uuid_str);
+        if (!subvol) {
+                err = EINVAL;
+                goto fail;
+        }
+
+        STACK_WIND_COOKIE (frame, qd_lookup_cbk, lookup_cbk, subvol,
+                           subvol->fops->lookup, &loc, xdata);
+        return 0;
+
+fail:
+        failure.op_ret = -1;
+        failure.op_errno = err;
+
+        lookup_cbk (this, frame, &failure);
+
+        inode_unref (loc.inode);
+        return 0;
+}
+
+/*
+ * Reconfigure hook of the quotad xlator. Nothing is applied here and 0
+ * is returned for any `options`.
+ */
+int
+qd_reconfigure (xlator_t *this, dict_t *options)
+{
+ /* As of now quotad is restarted upon alteration of volfile */
+ return 0;
+}
+
+/* Cleanup hook of the quotad xlator; nothing is released here. */
+void
+qd_fini (xlator_t *this)
+{
+}
+
+/*
+ * Init hook of the quotad xlator: require at least one child, allocate
+ * the private structure, initialise its lock and bring up the
+ * aggregator RPC service.
+ *
+ * Returns 0 on success and a non-zero value on failure. On failure the
+ * private structure is freed and this->private no longer refers to it.
+ */
+int32_t
+qd_init (xlator_t *this)
+{
+        int32_t       ret  = -1;
+        quota_priv_t *priv = NULL;
+
+        if (NULL == this->children) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "FATAL: quota (%s) not configured for min of 1 child",
+                        this->name);
+                ret = -1;
+                goto err;
+        }
+
+        QUOTA_ALLOC_OR_GOTO (priv, quota_priv_t, err);
+        LOCK_INIT (&priv->lock);
+
+        /* quotad_aggregator_init () reads this->private */
+        this->private = priv;
+
+        ret = quotad_aggregator_init (this);
+        if (ret < 0)
+                goto err;
+
+        ret = 0;
+err:
+        if (ret) {
+                /* do not leave a pointer to freed memory in the xlator */
+                if (priv != NULL && this->private == priv)
+                        this->private = NULL;
+
+                GF_FREE (priv);
+        }
+        return ret;
+}
+
+/* Entry points of the quotad xlator. */
+class_methods_t class_methods = {
+ .init = qd_init,
+ .fini = qd_fini,
+ .reconfigure = qd_reconfigure,
+};
+
+/* No file operation is set in this table. */
+struct xlator_fops fops = {
+};
+
+/* No callback is set in this table. */
+struct xlator_cbks cbks = {
+};
+
+/*
+ * Options of the quotad xlator: the transport type (restricted to the
+ * listed values and patterns) and any key matching "transport.*".
+ */
+struct volume_options options[] = {
+ { .key = {"transport-type"},
+ .value = {"rpc", "rpc-over-rdma", "tcp", "socket", "ib-verbs",
+ "unix", "ib-sdp", "tcp/server", "ib-verbs/server", "rdma",
+ "rdma*([ \t]),*([ \t])socket",
+ "rdma*([ \t]),*([ \t])tcp",
+ "tcp*([ \t]),*([ \t])rdma",
+ "socket*([ \t]),*([ \t])rdma"},
+ .type = GF_OPTION_TYPE_STR
+ },
+ { .key = {"transport.*"},
+ .type = GF_OPTION_TYPE_ANY,
+ },
+ /* terminating entry */
+ {.key = {NULL}}
+};
diff --git a/xlators/lib/src/libxlator.c b/xlators/lib/src/libxlator.c
index 624a929d0..4e680c510 100644
--- a/xlators/lib/src/libxlator.c
+++ b/xlators/lib/src/libxlator.c
@@ -11,6 +11,34 @@
#include "libxlator.h"
+/*
+ * A gauge holds one policy value per result counter, as interpreted by
+ * evaluate_marker_results (): a positive value g requires the counter
+ * to reach at least g, a negative value -g requires the counter to
+ * stay below g, and 0 places no constraint on the counter.
+ *
+ * Default for xtime: at least one subvolume has the attribute and no
+ * subvolume reports any of the other outcomes.
+ */
+int marker_xtime_default_gauge[] = {
+ [MCNT_FOUND] = 1,
+ [MCNT_NOTFOUND] = -1,
+ [MCNT_ENODATA] = -1,
+ [MCNT_ENOTCONN] = -1,
+ [MCNT_ENOENT] = -1,
+ [MCNT_EOTHER] = -1,
+};
+
+/*
+ * Default for the volume uuid: at least one subvolume has it; the other
+ * outcomes are not constrained.
+ */
+int marker_uuid_default_gauge[] = {
+ [MCNT_FOUND] = 1,
+ [MCNT_NOTFOUND] = 0,
+ [MCNT_ENODATA] = 0,
+ [MCNT_ENOTCONN] = 0,
+ [MCNT_ENOENT] = 0,
+ [MCNT_EOTHER] = 0,
+};
+
+/* errno reported by evaluate_marker_results () for each counter */
+static int marker_idx_errno_map[] = {
+ [MCNT_FOUND] = EINVAL,
+ [MCNT_NOTFOUND] = EINVAL,
+ [MCNT_ENOENT] = ENOENT,
+ [MCNT_ENOTCONN] = ENOTCONN,
+ [MCNT_ENODATA] = ENODATA,
+ [MCNT_EOTHER] = EINVAL,
+ [MCNT_MAX] = 0,
+};
+
/*Copy the contents of oldtimebuf to newtimbuf*/
static void
update_timebuf (uint32_t *oldtimbuf, uint32_t *newtimebuf)
@@ -47,22 +75,61 @@ match_uuid_local (const char *name, char *uuid)
static void
marker_local_incr_errcount (xl_marker_local_t *local, int op_errno)
{
+ marker_result_idx_t i = -1;
+
if (!local)
return;
switch (op_errno) {
case ENODATA:
- local->enodata_count++;
+ i = MCNT_ENODATA;
break;
case ENOENT:
- local->enoent_count++;
+ i = MCNT_ENOENT;
break;
case ENOTCONN:
- local->enotconn_count++;
+ i = MCNT_ENOTCONN;
break;
default:
+ i = MCNT_EOTHER;
+ break;
+ }
+
+ local->count[i]++;
+}
+
+/*
+ * Check the result counters in `count` against the policy in `gauge`
+ * (both indexed by the MCNT_* values below MCNT_MAX).
+ *
+ * Returns 0 when no gauge entry is violated. Otherwise returns the
+ * errno mapped to the first violated entry, unless that is the generic
+ * EINVAL and a later counter that is non-zero maps to a more specific
+ * errno, in which case the more specific one is returned.
+ */
+static int
+evaluate_marker_results (int *gauge, int *count)
+{
+ int i = 0;
+ int op_errno = 0;
+ gf_boolean_t sane = _gf_true;
+
+ /* check if the policy of the gauge is violated;
+ * if yes, try to get the best errno, ie. look
+ * for the first position where there is a more
+ * specific kind of violation than the generic EINVAL
+ */
+ for (i = 0; i < MCNT_MAX; i++) {
+ if (sane) {
+ if ((gauge[i] > 0 && count[i] < gauge[i]) ||
+ (gauge[i] < 0 && count[i] >= -gauge[i])) {
+ sane = _gf_false;
+ /* generic action: adopt corresponding errno */
+ op_errno = marker_idx_errno_map[i];
+ }
+ } else {
+ /* already insane; trying to get a more informative
+ * errno by checking subsequent counters
+ */
+ if (count[i] > 0)
+ op_errno = marker_idx_errno_map[i];
+ }
+ /* stop as soon as something better than EINVAL is known */
+ if (op_errno && op_errno != EINVAL)
break;
}
+
+ return op_errno;
}
/* Aggregate all the <volid>.xtime attrs of the cluster and send the max*/
@@ -98,14 +165,10 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
callcnt = --local->call_count;
- if (local->esomerr)
- goto unlock;
-
vol_uuid = local->vol_uuid;
if (op_ret) {
marker_local_incr_errcount (local, op_errno);
- local->esomerr = op_errno;
goto unlock;
}
@@ -119,11 +182,11 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (dict_get_ptr (dict, marker_xattr, (void **)&net_timebuf)) {
gf_log (this->name, GF_LOG_WARNING,
"Unable to get <uuid>.xtime attr");
- local->noxtime_count++;
+ local->count[MCNT_NOTFOUND]++;
goto unlock;
}
- if (local->has_xtime) {
+ if (local->count[MCNT_FOUND]) {
get_hosttime (net_timebuf, host_timebuf);
if ( (host_timebuf[0]>local->host_timebuf[0]) ||
(host_timebuf[0] == local->host_timebuf[0] &&
@@ -135,7 +198,7 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
} else {
get_hosttime (net_timebuf, local->host_timebuf);
update_timebuf (net_timebuf, local->net_timebuf);
- local->has_xtime = _gf_true;
+ local->count[MCNT_FOUND]++;
}
}
@@ -147,7 +210,7 @@ unlock:
op_errno = 0;
need_unwind = 1;
- if (local->has_xtime) {
+ if (local->count[MCNT_FOUND]) {
if (!dict)
dict = dict_new();
@@ -160,17 +223,9 @@ unlock:
}
}
- if (local->noxtime_count)
- goto out;
-
- if (local->enodata_count || local->enotconn_count ||
- local->enoent_count) {
+ op_errno = evaluate_marker_results (local->gauge, local->count);
+ if (op_errno)
op_ret = -1;
- op_errno = local->enodata_count? ENODATA:
- local->enotconn_count? ENOTCONN:
- local->enoent_count? ENOENT:
- local->esomerr;
- }
}
out:
@@ -178,6 +233,7 @@ out:
frame->local = local->xl_local;
local->xl_specf_unwind (frame, op_ret,
op_errno, dict, xdata);
+ GF_FREE (local);
} else if (need_unwind) {
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno,
dict, xdata);
@@ -228,7 +284,7 @@ cluster_markeruuid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (ret)
goto unlock;
- if (marker_has_volinfo (local)) {
+ if (local->count[MCNT_FOUND]) {
if ((local->volmark->major != volmark->major) ||
(local->volmark->minor != volmark->minor)) {
op_ret = -1;
@@ -257,6 +313,7 @@ cluster_markeruuid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uuid_unparse (volmark->uuid, vol_uuid);
if (volmark->retval)
local->retval = volmark->retval;
+ local->count[MCNT_FOUND]++;
}
}
unlock:
@@ -267,7 +324,7 @@ unlock:
op_errno = 0;
need_unwind = 1;
- if (marker_has_volinfo (local)) {
+ if (local->count[MCNT_FOUND]) {
if (!dict)
dict = dict_new();
@@ -277,11 +334,10 @@ unlock:
op_ret = -1;
op_errno = ENOMEM;
}
- } else {
- op_ret = -1;
- op_errno = local->enotconn_count? ENOTCONN:
- local->enoent_count? ENOENT:EINVAL;
}
+ op_errno = evaluate_marker_results (local->gauge, local->count);
+ if (op_errno)
+ op_ret = -1;
}
out:
@@ -289,6 +345,7 @@ unlock:
frame->local = local->xl_local;
local->xl_specf_unwind (frame, op_ret,
op_errno, dict, xdata);
+ GF_FREE (local);
return 0;
} else if (need_unwind){
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno,
@@ -303,7 +360,7 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc,
const char *name, void *xl_local,
xlator_specf_unwind_t xl_specf_getxattr_unwind,
xlator_t **sub_volumes, int count, int type,
- char *vol_uuid)
+ int *gauge, char *vol_uuid)
{
int i = 0;
xl_marker_local_t *local = NULL;
@@ -326,6 +383,7 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc,
local->call_count = count;
local->xl_specf_unwind = xl_specf_getxattr_unwind;
local->vol_uuid = vol_uuid;
+ memcpy (local->gauge, gauge, sizeof (local->gauge));
frame->local = local;
@@ -357,3 +415,113 @@ err:
return -1;
}
+
+/*
+ * Fold one stime sample into the running minimum kept under @key in @dst.
+ *
+ * this  - xlator whose name tags the warning logs
+ * dst   - aggregate dict; when it holds no binary entry for @key, a buffer
+ *         of sizeof (int64_t) bytes is allocated and stored there first
+ * key   - dict key of the aggregate
+ * value - the sample, read through data_to_bin ()
+ *
+ * Both buffers go through get_hosttime () and are compared as a pair:
+ * field [0] first, field [1] as the tie-breaker.  Only a strictly smaller
+ * sample overwrites the aggregate (via update_timebuf ()).
+ *
+ * Returns 0 on success, a negative value when the aggregate buffer cannot
+ * be allocated or stored, or when @value carries no data.
+ *
+ * NOTE(review): a freshly created aggregate comes from GF_CALLOC and is
+ * presumably all zeroes, in which case no sample can compare strictly
+ * smaller on that first call -- confirm this seeding is intended.
+ */
+int
+gf_get_min_stime (xlator_t *this, dict_t *dst, char *key, data_t *value)
+{
+        uint32_t  agg_host[2]    = {0,};
+        uint32_t  sample_host[2] = {0,};
+        uint32_t *agg_net        = NULL;
+        uint32_t *sample_net     = NULL;
+        int       ret            = -1;
+
+        ret = dict_get_bin (dst, key, (void **)&agg_net);
+        if (ret < 0) {
+                /* nothing aggregated under this key yet: seed the dict */
+                agg_net = GF_CALLOC (1, sizeof (int64_t),
+                                     gf_common_mt_char);
+                if (!agg_net)
+                        return ret;
+
+                ret = dict_set_bin (dst, key, agg_net, sizeof (int64_t));
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "key=%s: dict set failed", key);
+                        /* buffer was not set in the dict, so free it here */
+                        GF_FREE (agg_net);
+                        return ret;
+                }
+        }
+
+        sample_net = data_to_bin (value);
+        if (!sample_net) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "key=%s: getting value of stime failed", key);
+                return -1;
+        }
+
+        get_hosttime (sample_net, sample_host);
+        get_hosttime (agg_net, agg_host);
+
+        /* two-field comparison, so a plain min () cannot be used */
+        if (sample_host[0] < agg_host[0]) {
+                update_timebuf (sample_net, agg_net);
+        } else if ((sample_host[0] == agg_host[0]) &&
+                   (sample_host[1] < agg_host[1])) {
+                update_timebuf (sample_net, agg_net);
+        }
+
+        return 0;
+}
+
+/*
+ * Fold one stime sample into the running maximum kept under @key in @dst.
+ *
+ * this  - xlator whose name tags the warning logs
+ * dst   - aggregate dict; when it holds no binary entry for @key, a buffer
+ *         of sizeof (int64_t) bytes is allocated and stored there first
+ * key   - dict key of the aggregate
+ * value - the sample, read through data_to_bin ()
+ *
+ * Both buffers go through get_hosttime () and are compared as a pair:
+ * field [0] first, field [1] as the tie-breaker.  Only a strictly larger
+ * sample overwrites the aggregate (via update_timebuf ()).
+ *
+ * Returns 0 on success, a negative value when the aggregate buffer cannot
+ * be allocated or stored, or when @value carries no data.
+ */
+int
+gf_get_max_stime (xlator_t *this, dict_t *dst, char *key, data_t *value)
+{
+        uint32_t  agg_host[2]    = {0,};
+        uint32_t  sample_host[2] = {0,};
+        uint32_t *agg_net        = NULL;
+        uint32_t *sample_net     = NULL;
+        int       ret            = -1;
+
+        ret = dict_get_bin (dst, key, (void **)&agg_net);
+        if (ret < 0) {
+                /* nothing aggregated under this key yet: seed the dict */
+                agg_net = GF_CALLOC (1, sizeof (int64_t),
+                                     gf_common_mt_char);
+                if (!agg_net)
+                        return ret;
+
+                ret = dict_set_bin (dst, key, agg_net, sizeof (int64_t));
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "key=%s: dict set failed", key);
+                        /* buffer was not set in the dict, so free it here */
+                        GF_FREE (agg_net);
+                        return ret;
+                }
+        }
+
+        sample_net = data_to_bin (value);
+        if (!sample_net) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "key=%s: getting value of stime failed", key);
+                return -1;
+        }
+
+        get_hosttime (sample_net, sample_host);
+        get_hosttime (agg_net, agg_host);
+
+        /* two-field comparison, so a plain max () cannot be used */
+        if (sample_host[0] > agg_host[0]) {
+                update_timebuf (sample_net, agg_net);
+        } else if ((sample_host[0] == agg_host[0]) &&
+                   (sample_host[1] > agg_host[1])) {
+                update_timebuf (sample_net, agg_net);
+        }
+
+        return 0;
+}
diff --git a/xlators/lib/src/libxlator.h b/xlators/lib/src/libxlator.h
index 7e75c9482..175d3141d 100644
--- a/xlators/lib/src/libxlator.h
+++ b/xlators/lib/src/libxlator.h
@@ -32,6 +32,7 @@
#define MARKER_UUID_TYPE 1
#define MARKER_XTIME_TYPE 2
#define GF_XATTR_QUOTA_SIZE_KEY "trusted.glusterfs.quota.size"
+#define GF_XATTR_QUOTA_LIMIT_LIST "trusted.limit.list"
typedef int32_t (*xlator_specf_unwind_t) (call_frame_t *frame,
@@ -48,6 +49,69 @@ struct volume_mark {
uint32_t usec;
}__attribute__ ((__packed__));
+
+/*
+ * The enumerated type here
+ * is used to index two kinds
+ * of integer arrays:
+ * - gauges
+ * - counters
+
+ * A counter is used internally,
+ * in getxattr callbacks, to count
+ * the results, categorized as
+ * the enum names suggest. So values
+ * in the counter are always non-negative.
+
+ * Gauges are part of the API.
+ * The caller passes one to the
+ * top-level aggregator function,
+ * cluster_getmarkerattr(). The gauge
+ * defines an evaluation policy for the
+ * counter. That is, at the
+ * end of the aggregation process
+ * the gauge is matched against the
+ * counter, and the policy
+ * represented by the gauge decides
+ * whether to return with success or failure,
+ * and in the latter case, what particular failure
+ * case (errno).
+
+ * The rules are the following: for some index i,
+ * - if gauge[i] == 0, no requirement is set
+ * against counter[i];
+ * - if gauge[i] > 0, counter[i] >= gauge[i]
+ * is required;
+ * - if gauge[i] < 0, counter[i] < |gauge[i]|
+ * is required.
+
+ * If the requirement is not met, then i is mapped
+ * to the respective errno (MCNT_ENOENT -> ENOENT),
+ * or in lack of that, EINVAL.
+
+ * Cf. evaluate_marker_results() and marker_idx_errno_map[]
+ * in libxlator.c
+
+ * We provide two default gauges, one intended for xtime
+ * aggregation, the other for volume mark aggregation. The
+ * policies they represent agree with the hard-coded
+ * one prior to gauges. Cf. marker_xtime_default_gauge
+ * and marker_uuid_default_gauge in libxlator.c
+ */
+
+/*
+ * Index type shared by the gauge[] and count[] arrays of struct marker_str
+ * (both are declared with MCNT_MAX elements).
+ */
+typedef enum {
+ /* incremented by cluster_markeruuid_cbk () when a volume mark is accepted */
+ MCNT_FOUND,
+ /* presumably: reply arrived but carried no marker -- confirm in libxlator.c */
+ MCNT_NOTFOUND,
+ /* the next four presumably count replies by errno; confirm in libxlator.c */
+ MCNT_ENODATA,
+ MCNT_ENOTCONN,
+ MCNT_ENOENT,
+ MCNT_EOTHER,
+ /* number of categories; used only as the array length */
+ MCNT_MAX
+} marker_result_idx_t;
+
+extern int marker_xtime_default_gauge[];
+extern int marker_uuid_default_gauge[];
+
struct marker_str {
struct volume_mark *volmark;
data_t *data;
@@ -55,13 +119,8 @@ struct marker_str {
uint32_t host_timebuf[2];
uint32_t net_timebuf[2];
int32_t call_count;
- unsigned has_xtime:1;
- int32_t enoent_count;
- int32_t enotconn_count;
- int32_t enodata_count;
- int32_t noxtime_count;
-
- int esomerr;
+ int gauge[MCNT_MAX];
+ int count[MCNT_MAX];
xlator_specf_unwind_t xl_specf_unwind;
void *xl_local;
@@ -71,15 +130,6 @@ struct marker_str {
typedef struct marker_str xl_marker_local_t;
-static inline gf_boolean_t
-marker_has_volinfo (xl_marker_local_t *marker)
-{
- if (marker->volmark)
- return _gf_true;
- else
- return _gf_false;
-}
-
int32_t
cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *dict, dict_t *xdata);
@@ -93,12 +143,15 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc,
const char *name, void *xl_local,
xlator_specf_unwind_t xl_specf_getxattr_unwind,
xlator_t **sub_volumes, int count, int type,
- char *vol_uuid);
+ int *gauge, char *vol_uuid);
int
match_uuid_local (const char *name, char *uuid);
+int
+gf_get_min_stime (xlator_t *this, dict_t *dst, char *key, data_t *value);
-
+int
+gf_get_max_stime (xlator_t *this, dict_t *dst, char *key, data_t *value);
#endif /* !_LIBXLATOR_H */
diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am
index a6f49ae01..9b33edf4d 100644
--- a/xlators/mgmt/glusterd/src/Makefile.am
+++ b/xlators/mgmt/glusterd/src/Makefile.am
@@ -11,7 +11,9 @@ glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \
glusterd-volgen.c glusterd-rebalance.c glusterd-quota.c \
glusterd-geo-rep.c glusterd-replace-brick.c glusterd-log-ops.c \
glusterd-volume-ops.c glusterd-brick-ops.c glusterd-mountbroker.c \
- glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c
+ glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c \
+ glusterd-locks.c glusterd-snapshot.c glusterd-mgmt-handler.c \
+ glusterd-mgmt.c glusterd-etcd.c
glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/rpc/xdr/src/libgfxdr.la \
@@ -21,12 +23,13 @@ glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \
glusterd-sm.h glusterd-store.h glusterd-mem-types.h \
glusterd-pmap.h glusterd-volgen.h glusterd-mountbroker.h \
- glusterd-syncop.h glusterd-hooks.h
+ glusterd-syncop.h glusterd-hooks.h glusterd-locks.h \
+ glusterd-mgmt.h glusterd-etcd.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(rpclibdir) -I$(CONTRIBDIR)/rbtree \
-I$(top_srcdir)/rpc/xdr/src -I$(top_srcdir)/rpc/rpc-lib/src \
- -I$(CONTRIBDIR)/uuid \
+ -I$(CONTRIBDIR)/uuid -I$(CONTRIBDIR)/mount \
-DSBIN_DIR=\"$(sbindir)\" -DDATADIR=\"$(localstatedir)\" \
-DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\
-DSYNCDAEMON_COMPILE=$(SYNCDAEMON_COMPILE) $(XML_CPPFLAGS)
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index 912a6a798..1804dd02e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -546,6 +546,84 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
return glusterd_big_locked_handler (req, __glusterd_handle_add_brick);
}
+/*
+ * Allocate the per-subvolume counter array used by the subvol_matcher_*
+ * helpers.
+ *
+ * subvols - out: receives the GF_CALLOC'ed array of @count ints
+ *           (NULL when the allocation fails)
+ * count   - number of counters to allocate
+ *
+ * Returns 0 on success, -1 if the allocation failed.
+ */
+static int
+subvol_matcher_init (int **subvols, int count)
+{
+        int *counters = NULL;
+
+        counters = GF_CALLOC (count, sizeof(int), gf_gld_mt_int);
+        *subvols = counters;
+
+        return counters ? 0 : -1;
+}
+
+/*
+ * Record that @brickinfo is one of the bricks being removed.
+ *
+ * Walks volinfo->bricks until an entry with the same hostname and path is
+ * found, derives the subvolume number from that entry's position
+ * (position / volinfo->dist_leaf_count) and increments that slot of
+ * @subvols.  Nothing is changed when no entry matches.
+ *
+ * NOTE(review): @subvols is dereferenced unconditionally and the slot is
+ * not range-checked; callers must only get here with an array obtained
+ * from subvol_matcher_init () that covers every subvolume.
+ */
+static void
+subvol_matcher_update (int *subvols, glusterd_volinfo_t *volinfo,
+                       glusterd_brickinfo_t *brickinfo)
+{
+        glusterd_brickinfo_t *cursor = NULL;
+        int                   idx    = 0;
+
+        list_for_each_entry (cursor, &volinfo->bricks, brick_list) {
+
+                if (!strcmp (cursor->hostname, brickinfo->hostname) &&
+                    !strcmp (cursor->path, brickinfo->path)) {
+                        gf_log (THIS->name, GF_LOG_DEBUG, LOGSTR_FOUND_BRICK,
+                                brickinfo->hostname, brickinfo->path,
+                                volinfo->volname);
+                        subvols[idx / volinfo->dist_leaf_count]++;
+                        return;
+                }
+
+                idx++;
+        }
+}
+
+/*
+ * Check the per-subvolume removal counters gathered by
+ * subvol_matcher_update ().
+ *
+ * subvols       - counter array, one slot per subvolume
+ * volinfo       - supplies replica_count, subvol_count and dist_leaf_count
+ * err_str       - buffer that receives the failure message
+ * err_len       - size of @err_str
+ * vol_type      - volume type name, used only in the failure message
+ * replica_count - requested new replica count, or 0 if it is not changing
+ *
+ * With a non-zero @replica_count every slot must equal
+ * volinfo->replica_count - replica_count.  Otherwise every slot must be a
+ * multiple of volinfo->dist_leaf_count; slot 0 is always examined in that
+ * mode, whatever subvol_count is.
+ *
+ * Returns 0 if the counters are acceptable, -1 (with @err_str filled in)
+ * at the first slot that is not.
+ */
+static int
+subvol_matcher_verify (int *subvols, glusterd_volinfo_t *volinfo, char *err_str,
+                       size_t err_len, char *vol_type, int replica_count)
+{
+        int expected = volinfo->replica_count-replica_count;
+        int idx      = 0;
+
+        if (replica_count != 0) {
+                while (idx < volinfo->subvol_count) {
+                        if (subvols[idx] != expected) {
+                                snprintf (err_str, err_len, "Remove exactly %d"
+                                          " brick(s) from each subvolume.",
+                                          expected);
+                                return -1;
+                        }
+                        idx++;
+                }
+                return 0;
+        }
+
+        do {
+                if (subvols[idx] % volinfo->dist_leaf_count != 0) {
+                        snprintf (err_str, err_len,
+                                  "Bricks not from same subvol for %s",
+                                  vol_type);
+                        return -1;
+                }
+                idx++;
+        } while (idx < volinfo->subvol_count);
+
+        return 0;
+}
+
+/*
+ * Release the counter array obtained from subvol_matcher_init ().
+ * __glusterd_handle_remove_brick () also calls this when no array was ever
+ * allocated (subvols still NULL); presumably GF_FREE tolerates NULL --
+ * confirm.
+ */
+static void
+subvol_matcher_destroy (int *subvols)
+{
+ GF_FREE (subvols);
+}
+
int
__glusterd_handle_remove_brick (rpcsvc_request_t *req)
{
@@ -559,20 +637,12 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req)
int i = 1;
glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
- int32_t pos = 0;
- int32_t sub_volume = 0;
- int32_t sub_volume_start = 0;
- int32_t sub_volume_end = 0;
- glusterd_brickinfo_t *tmp = NULL;
+ int *subvols = NULL;
char err_str[2048] = {0};
gf_cli_rsp rsp = {0,};
void *cli_rsp = NULL;
char vol_type[256] = {0,};
int32_t replica_count = 0;
- int32_t brick_index = 0;
- int32_t tmp_brick_idx = 0;
- int found = 0;
- int diff_count = 0;
char *volname = 0;
xlator_t *this = NULL;
@@ -681,17 +751,6 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req)
goto out;
}
- /*Do not allow remove-brick if the volume is a replicate volume*/
- if ((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) &&
- (volinfo->brick_count == volinfo->replica_count)) {
- snprintf (err_str, sizeof(err_str),
- "Removing brick from a replicate volume "
- "is not allowed");
- gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
- ret = -1;
- goto out;
- }
-
if (!replica_count &&
(volinfo->type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) &&
(volinfo->brick_count == volinfo->dist_leaf_count)) {
@@ -738,6 +797,14 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req)
}
strcpy (brick_list, " ");
+
+ if ((volinfo->type != GF_CLUSTER_TYPE_NONE) &&
+ (volinfo->subvol_count > 1)) {
+ ret = subvol_matcher_init (&subvols, volinfo->subvol_count);
+ if (ret)
+ goto out;
+ }
+
while ( i <= count) {
snprintf (key, sizeof (key), "brick%d", i);
ret = dict_get_str (dict, key, &brick);
@@ -766,77 +833,18 @@ __glusterd_handle_remove_brick (rpcsvc_request_t *req)
(volinfo->brick_count <= volinfo->dist_leaf_count))
continue;
- if (replica_count) {
- /* do the validation of bricks here */
- /* -2 because i++ is already done, and i starts with 1,
- instead of 0 */
- diff_count = (volinfo->replica_count - replica_count);
- brick_index = (((i -2) / diff_count) * volinfo->replica_count);
- tmp_brick_idx = 0;
- found = 0;
- list_for_each_entry (tmp, &volinfo->bricks, brick_list) {
- tmp_brick_idx++;
- gf_log (this->name, GF_LOG_TRACE,
- "validate brick %s:%s (%d %d %d)",
- tmp->hostname, tmp->path, tmp_brick_idx,
- brick_index, volinfo->replica_count);
- if (tmp_brick_idx <= brick_index)
- continue;
- if (tmp_brick_idx >
- (brick_index + volinfo->replica_count))
- break;
- if ((!strcmp (tmp->hostname,brickinfo->hostname)) &&
- !strcmp (tmp->path, brickinfo->path)) {
- found = 1;
- break;
- }
- }
- if (found)
- continue;
+ /* Find which subvolume the brick belongs to */
+ subvol_matcher_update (subvols, volinfo, brickinfo);
+ }
- snprintf (err_str, sizeof (err_str), "Bricks are from "
- "same subvol");
- gf_log (this->name, GF_LOG_INFO,
- "failed to validate brick %s:%s (%d %d %d)",
- tmp->hostname, tmp->path, tmp_brick_idx,
- brick_index, volinfo->replica_count);
- ret = -1;
- /* brick order is not valid */
+ /* Check if the bricks belong to the same subvolumes.*/
+ if ((volinfo->type != GF_CLUSTER_TYPE_NONE) &&
+ (volinfo->subvol_count > 1)) {
+ ret = subvol_matcher_verify (subvols, volinfo,
+ err_str, sizeof(err_str),
+ vol_type, replica_count);
+ if (ret)
goto out;
- }
-
- pos = 0;
- list_for_each_entry (tmp, &volinfo->bricks, brick_list) {
-
- if (strcmp (tmp->hostname,brickinfo->hostname) ||
- strcmp (tmp->path, brickinfo->path)) {
- pos++;
- continue;
- }
-
- gf_log (this->name, GF_LOG_DEBUG, LOGSTR_FOUND_BRICK,
- brickinfo->hostname, brickinfo->path,
- volinfo->volname);
- if (!sub_volume && (volinfo->dist_leaf_count > 1)) {
- sub_volume = (pos / volinfo->dist_leaf_count) + 1;
- sub_volume_start = (volinfo->dist_leaf_count *
- (sub_volume - 1));
- sub_volume_end = (volinfo->dist_leaf_count *
- sub_volume) - 1;
- } else {
- if (pos < sub_volume_start ||
- pos >sub_volume_end) {
- ret = -1;
- snprintf (err_str, sizeof (err_str),
- "Bricks not from same subvol "
- "for %s", vol_type);
- gf_log (this->name, GF_LOG_ERROR,
- "%s", err_str);
- goto out;
- }
- }
- break;
- }
}
ret = glusterd_op_begin_synctask (req, GD_OP_REMOVE_BRICK, dict);
@@ -859,6 +867,7 @@ out:
}
GF_FREE (brick_list);
+ subvol_matcher_destroy (subvols);
free (cli_req.dict.dict_val); //its malloced by xdr
return ret;
@@ -871,23 +880,122 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
__glusterd_handle_remove_brick);
}
+/*
+ * dict_foreach () callback run over volinfo->gsync_slaves: restart the
+ * gsync session described by one dict entry if it is running on this node.
+ *
+ * this  - dict being iterated (unused)
+ * key   - key of the current entry (unused)
+ * value - entry data; the text after its first ':' is taken as the slave.
+ *         Entries without a ':' are skipped.
+ * data  - glusterd_gsync_status_temp_t carrying the volinfo and the
+ *         rsp_dict; a copy of the slave string is stored in rsp_dict
+ *         under "slave".
+ *
+ * Returns 0 when the entry is skipped, when gsyncd is not running for it,
+ * when this node has no local brick paths, or when the restart succeeds;
+ * otherwise the non-zero result of the step that failed.
+ */
+static int
+_glusterd_restart_gsync_session (dict_t *this, char *key,
+                                 data_t *value, void *data)
+{
+        char                         *slave      = NULL;
+        char                         *slave_buf  = NULL;
+        char                         *path_list  = NULL;
+        char                         *slave_vol  = NULL;
+        char                         *slave_ip   = NULL;
+        char                         *conf_path  = NULL;
+        /* Must be real storage: it was a NULL 'char **' before, and the
+         * failure path below dereferenced it. */
+        char                         *errmsg     = NULL;
+        int                           ret        = -1;
+        glusterd_gsync_status_temp_t *param      = NULL;
+        gf_boolean_t                  is_running = _gf_false;
+
+        param = (glusterd_gsync_status_temp_t *)data;
+
+        GF_ASSERT (param);
+        GF_ASSERT (param->volinfo);
+
+        slave = strchr(value->data, ':');
+        if (!slave)
+                return 0;
+
+        slave++;
+        slave_buf = gf_strdup (slave);
+        if (!slave_buf) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Failed to gf_strdup");
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_dynstr (param->rsp_dict, "slave", slave_buf);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to store slave");
+                GF_FREE(slave_buf);
+                goto out;
+        }
+
+        ret = glusterd_get_slave_details_confpath (param->volinfo,
+                                                   param->rsp_dict,
+                                                   &slave_ip, &slave_vol,
+                                                   &conf_path, &errmsg);
+        if (ret) {
+                /* NOTE(review): if the callee heap-allocates the message it
+                 * is not released here -- confirm ownership and free it. */
+                if (errmsg)
+                        gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                else
+                        gf_log ("", GF_LOG_ERROR,
+                                "Unable to fetch slave or confpath details.");
+                goto out;
+        }
+
+        /* In cases that gsyncd is not running, we will not invoke it
+         * because of add-brick. */
+        ret = glusterd_check_gsync_running_local (param->volinfo->volname,
+                                                  slave, conf_path,
+                                                  &is_running);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "gsync running validation failed.");
+                goto out;
+        }
+        if (_gf_false == is_running) {
+                gf_log ("", GF_LOG_DEBUG, "gsync session for %s and %s is"
+                        " not running on this node. Hence not restarting.",
+                        param->volinfo->volname, slave);
+                ret = 0;
+                goto out;
+        }
+
+        /* Only the resulting path_list is inspected; the return code is
+         * overwritten below.
+         * NOTE(review): path_list is never released in this function --
+         * confirm who owns it. */
+        ret = glusterd_get_local_brickpaths (param->volinfo, &path_list);
+        if (!path_list) {
+                gf_log ("", GF_LOG_DEBUG, "This node not being part of"
+                        " volume should not be running gsyncd. Hence"
+                        " no gsyncd process to restart.");
+                ret = 0;
+                goto out;
+        }
+
+        ret = glusterd_check_restart_gsync_session (param->volinfo, slave,
+                                                    param->rsp_dict, path_list,
+                                                    conf_path, 0);
+        if (ret)
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to restart gsync session.");
+
+out:
+        gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret);
+        return ret;
+}
+
/* op-sm */
int
glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
char *bricks, dict_t *dict)
{
- glusterd_brickinfo_t *brickinfo = NULL;
- char *brick = NULL;
- int32_t i = 1;
- char *brick_list = NULL;
- char *free_ptr1 = NULL;
- char *free_ptr2 = NULL;
- char *saveptr = NULL;
- int32_t ret = -1;
- int32_t stripe_count = 0;
- int32_t replica_count = 0;
- int32_t type = 0;
+ char *brick = NULL;
+ int32_t i = 1;
+ char *brick_list = NULL;
+ char *free_ptr1 = NULL;
+ char *free_ptr2 = NULL;
+ char *saveptr = NULL;
+ int32_t ret = -1;
+ int32_t stripe_count = 0;
+ int32_t replica_count = 0;
+ int32_t type = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_gsync_status_temp_t param = {0, };
+ gf_boolean_t restart_needed = 0;
+ char msg[1024] __attribute__((unused)) = {0, };
+ int caps = 0;
+ int brickid = 0;
GF_ASSERT (volinfo);
@@ -915,11 +1023,17 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
"type is set %d, need to change it", type);
}
+ brickid = glusterd_get_next_available_brickid (volinfo);
+ if (brickid < 0)
+ goto out;
while ( i <= count) {
ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
if (ret)
goto out;
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo,
+ brickid++);
+
ret = glusterd_resolve_brick (brickinfo);
if (ret)
goto out;
@@ -968,13 +1082,40 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
if (count)
brick = strtok_r (brick_list+1, " \n", &saveptr);
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0])
+ caps = CAPS_BD | CAPS_THIN |
+ CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+#endif
while (i <= count) {
-
ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo,
&brickinfo);
if (ret)
goto out;
+#ifdef HAVE_BD_XLATOR
+ /* Check for VG/thin pool if its BD volume */
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_CRITICAL, "%s", msg);
+ goto out;
+ }
+ /* if anyone of the brick does not have thin support,
+ disable it for entire volume */
+ caps &= brickinfo->caps;
+ } else
+ caps = 0;
+#endif
+
+ if (uuid_is_null (brickinfo->uuid)) {
+ ret = glusterd_resolve_brick (brickinfo);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
+ brickinfo->hostname, brickinfo->path);
+ goto out;
+ }
+ }
ret = glusterd_brick_start (volinfo, brickinfo,
_gf_true);
@@ -982,8 +1123,27 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
goto out;
i++;
brick = strtok_r (NULL, " \n", &saveptr);
+
+ /* Check if the brick is added in this node, and set
+ * the restart_needed flag. */
+ if ((!uuid_compare (brickinfo->uuid, MY_UUID)) &&
+ !restart_needed) {
+ restart_needed = 1;
+ gf_log ("", GF_LOG_DEBUG,
+ "Restart gsyncd session, if it's already "
+ "running.");
+ }
}
+ /* If the restart_needed flag is set, restart gsyncd sessions for that
+ * particular master with all the slaves. */
+ if (restart_needed) {
+ param.rsp_dict = dict;
+ param.volinfo = volinfo;
+ dict_foreach (volinfo->gsync_slaves,
+ _glusterd_restart_gsync_session, &param);
+ }
+ volinfo->caps = caps;
out:
GF_FREE (free_ptr1);
GF_FREE (free_ptr2);
@@ -1047,6 +1207,7 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
int ret = 0;
char *volname = NULL;
int count = 0;
+ int replica_count = 0;
int i = 0;
char *bricks = NULL;
char *brick_list = NULL;
@@ -1055,17 +1216,31 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
char *brick = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
glusterd_volinfo_t *volinfo = NULL;
- glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
char msg[2048] = {0,};
gf_boolean_t brick_alloc = _gf_false;
char *all_bricks = NULL;
char *str_ret = NULL;
gf_boolean_t is_force = _gf_false;
- priv = THIS->private;
- if (!priv)
- goto out;
+ this = THIS;
+ GF_ASSERT (this);
+ ret = dict_get_int32 (dict, "replica-count", &replica_count);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Unable to get replica count");
+ }
+
+ if (replica_count > 0) {
+ ret = op_version_check (this, GD_OP_VER_PERSISTENT_AFR_XATTRS,
+ msg, sizeof(msg));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+ }
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
gf_log (THIS->name, GF_LOG_ERROR,
@@ -1080,15 +1255,6 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
goto out;
}
- if (volinfo->backend == GD_VOL_BK_BD) {
- snprintf (msg, sizeof (msg), "Add brick is not supported for "
- "Block backend volume %s.", volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- ret = -1;
- goto out;
- }
-
ret = glusterd_validate_volume_id (dict, volinfo);
if (ret)
goto out;
@@ -1166,6 +1332,18 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
}
if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 1, msg);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "%s",
+ msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+ }
+#endif
+
ret = glusterd_validate_and_create_brickpath (brickinfo,
volinfo->volume_id,
op_errstr, is_force);
@@ -1195,20 +1373,32 @@ out:
int
glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
{
- int ret = -1;
- char *volname = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char *errstr = NULL;
- int32_t brick_count = 0;
- char msg[2048] = {0,};
- int32_t flag = 0;
- gf1_op_commands cmd = GF_OP_CMD_NONE;
- char *task_id_str = NULL;
- xlator_t *this = NULL;
+ int ret = -1;
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *errstr = NULL;
+ int32_t brick_count = 0;
+ char msg[2048] = {0,};
+ int32_t flag = 0;
+ gf1_op_commands cmd = GF_OP_CMD_NONE;
+ char *task_id_str = NULL;
+ xlator_t *this = NULL;
+ int i = 1;
+ char key[256] = {0,};
+ char *brick = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
this = THIS;
GF_ASSERT (this);
+ ret = op_version_check (this, GD_OP_VER_PERSISTENT_AFR_XATTRS,
+ msg, sizeof(msg));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
@@ -1238,11 +1428,26 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
ret = dict_get_int32 (dict, "command", &flag);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get brick count");
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get brick command");
goto out;
}
cmd = flag;
+ ret = dict_get_int32 (dict, "count", &brick_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get brick count");
+ goto out;
+ }
+
+ ret = 0;
+ if (volinfo->brick_count == brick_count) {
+ errstr = gf_strdup ("Deleting all the bricks of the "
+ "volume is not allowed");
+ ret = -1;
+ goto out;
+ }
+
ret = -1;
switch (cmd) {
case GF_OP_CMD_NONE:
@@ -1255,6 +1460,16 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
case GF_OP_CMD_START:
{
+ if ((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) &&
+ dict_get (dict, "replica-count")) {
+ snprintf (msg, sizeof(msg), "Migration of data is not "
+ "needed when reducing replica count. Use the"
+ " 'force' option");
+ errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ goto out;
+ }
+
if (GLUSTERD_STATUS_STARTED != volinfo->status) {
snprintf (msg, sizeof (msg), "Volume %s needs to be "
"started before remove-brick (you can use "
@@ -1264,6 +1479,17 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
goto out;
}
+ if (!gd_is_remove_brick_committed (volinfo)) {
+ snprintf (msg, sizeof (msg), "An earlier remove-brick "
+ "task exists for volume %s. Either commit it"
+ " or stop it before starting a new task.",
+ volinfo->volname);
+ errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_ERROR, "Earlier remove-brick"
+ " task exists for volume %s.",
+ volinfo->volname);
+ goto out;
+ }
if (glusterd_is_defrag_on(volinfo)) {
errstr = gf_strdup("Rebalance is in progress. Please "
"retry after completion");
@@ -1271,7 +1497,7 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
goto out;
}
- if (is_origin_glusterd ()) {
+ if (is_origin_glusterd (dict)) {
ret = glusterd_generate_and_set_task_id
(dict, GF_REMOVE_BRICK_TID_KEY);
if (ret) {
@@ -1301,25 +1527,44 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
"is in progress");
goto out;
}
+
+ /* Do not allow commit if the bricks are not decommissioned */
+ for ( i = 1; i <= brick_count; i++ ) {
+ snprintf (key, sizeof (key), "brick%d", i);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret) {
+ snprintf (msg, sizeof (msg),
+ "Unable to get %s", key);
+ errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ ret =
+ glusterd_volume_brickinfo_get_by_brick(brick, volinfo,
+ &brickinfo);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Incorrect brick "
+ "%s for volume %s", brick, volname);
+ errstr = gf_strdup (msg);
+ goto out;
+ }
+ if ( !brickinfo->decommissioned ) {
+ snprintf (msg, sizeof (msg), "Brick %s "
+ "is not decommissioned. "
+ "Use start or force option",
+ brick);
+ errstr = gf_strdup (msg);
+ ret = -1;
+ goto out;
+ }
+ }
+
break;
case GF_OP_CMD_COMMIT_FORCE:
break;
}
-
- ret = dict_get_int32 (dict, "count", &brick_count);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get brick count");
- goto out;
- }
-
ret = 0;
- if (volinfo->brick_count == brick_count) {
- errstr = gf_strdup ("Deleting all the bricks of the "
- "volume is not allowed");
- ret = -1;
- goto out;
- }
out:
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -1458,15 +1703,6 @@ glusterd_op_add_brick (dict_t *dict, char **op_errstr)
goto out;
}
- /* Need to reset the defrag/rebalance status accordingly */
- switch (volinfo->rebal.defrag_status) {
- case GF_DEFRAG_STATUS_FAILED:
- case GF_DEFRAG_STATUS_COMPLETE:
- volinfo->rebal.defrag_status = 0;
- default:
- break;
- }
-
ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret)
goto out;
@@ -1498,6 +1734,8 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
glusterd_brickinfo_t *tmp = NULL;
char *task_id_str = NULL;
xlator_t *this = NULL;
+ dict_t *bricks_dict = NULL;
+ char *brick_tmpstr = NULL;
this = THIS;
GF_ASSERT (this);
@@ -1525,7 +1763,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
/* Set task-id, if available, in ctx dict for operations other than
* start
*/
- if (is_origin_glusterd () && (cmd != GF_OP_CMD_START)) {
+ if (is_origin_glusterd (dict) && (cmd != GF_OP_CMD_START)) {
if (!uuid_is_null (volinfo->rebal.rebalance_id)) {
ret = glusterd_copy_uuid_to_dict
(volinfo->rebal.rebalance_id, dict,
@@ -1538,9 +1776,14 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
}
}
- /* Clear task-id on commmitting/stopping of remove-brick operation */
- if ((cmd != GF_OP_CMD_START) || (cmd != GF_OP_CMD_STATUS))
+ /* Clear task-id, rebal.op and stored bricks on committing/stopping
+ * remove-brick */
+ if ((cmd != GF_OP_CMD_START) || (cmd != GF_OP_CMD_STATUS)) {
uuid_clear (volinfo->rebal.rebalance_id);
+ volinfo->rebal.op = GD_OP_NONE;
+ dict_unref (volinfo->rebal.dict);
+ volinfo->rebal.dict = NULL;
+ }
ret = -1;
switch (cmd) {
@@ -1580,6 +1823,11 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
}
case GF_OP_CMD_START:
+ /* Reset defrag status to 'NOT STARTED' whenever a
+ * remove-brick/rebalance command is issued to remove
+ * stale information from previous run.
+ */
+ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
ret = dict_get_str (dict, GF_REMOVE_BRICK_TID_KEY, &task_id_str);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -1587,6 +1835,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
ret = 0;
} else {
uuid_parse (task_id_str, volinfo->rebal.rebalance_id) ;
+ volinfo->rebal.op = GD_OP_REMOVE_BRICK;
}
force = 0;
break;
@@ -1623,7 +1872,23 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
goto out;
}
-
+ /* Save the list of bricks for later usage only on starting a
+ * remove-brick. Right now this is required for displaying the task
+ * parameters with task status in volume status.
+ */
+ if (GF_OP_CMD_START == cmd) {
+ bricks_dict = dict_new ();
+ if (!bricks_dict) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (bricks_dict, "count", count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to save remove-brick count");
+ goto out;
+ }
+ }
while ( i <= count) {
snprintf (key, 256, "brick%d", i);
ret = dict_get_str (dict, key, &brick);
@@ -1633,12 +1898,32 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
goto out;
}
+ if (GF_OP_CMD_START == cmd) {
+ brick_tmpstr = gf_strdup (brick);
+ if (!brick_tmpstr) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to duplicate brick name");
+ goto out;
+ }
+ ret = dict_set_dynstr (bricks_dict, key, brick_tmpstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add brick to dict");
+ goto out;
+ }
+ brick_tmpstr = NULL;
+ }
+
ret = glusterd_op_perform_remove_brick (volinfo, brick, force,
&need_rebalance);
if (ret)
goto out;
i++;
}
+ if (GF_OP_CMD_START == cmd)
+ volinfo->rebal.dict = dict_ref (bricks_dict);
+
ret = dict_get_int32 (dict, "replica-count", &replica_count);
if (!ret) {
gf_log (this->name, GF_LOG_INFO,
@@ -1646,6 +1931,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
volinfo->replica_count, replica_count,
volinfo->volname);
volinfo->replica_count = replica_count;
+ volinfo->sub_count = replica_count;
volinfo->dist_leaf_count = glusterd_get_dist_leaf_count (volinfo);
volinfo->subvol_count = (volinfo->brick_count /
volinfo->dist_leaf_count);
@@ -1675,6 +1961,16 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
goto out;
}
+ if (GF_OP_CMD_START == cmd &&
+ volinfo->status == GLUSTERD_STATUS_STARTED) {
+ ret = glusterd_nodesvcs_handle_reconfigure (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to reconfigure NFS-Server");
+ goto out;
+ }
+ }
+
/* Need to reset the defrag/rebalance status accordingly */
switch (volinfo->rebal.defrag_status) {
case GF_DEFRAG_STATUS_FAILED:
@@ -1706,5 +2002,9 @@ out:
if (ret && err_str[0] && op_errstr)
*op_errstr = gf_strdup (err_str);
+ GF_FREE (brick_tmpstr);
+ if (bricks_dict)
+ dict_unref (bricks_dict);
+
return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-etcd.c b/xlators/mgmt/glusterd/src/glusterd-etcd.c
new file mode 100644
index 000000000..656ea3b9b
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.c
@@ -0,0 +1,87 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "glusterfs.h"
+#include "run.h"
+#include "glusterd-etcd.h"
+
+#define GLUSTERD_ETCD_DIR "/var/lib/glusterd/etcd"
+#define GLUSTERD_ETCD_CMD "/root/etcd/bin/etcd"
+
+/*
+ * start_etcd:
+ * Spawn GLUSTERD_ETCD_CMD as a child process.
+ *
+ * The child is started with "-name <this_host>", "-data-dir
+ * GLUSTERD_ETCD_DIR", "-bind-addr <hostname>:4001" and "-peer-addr
+ * <hostname>:7001", where <hostname> is whatever gethostname() reports.
+ * When other_host is non-NULL, "-peers <other_host>:7001" is appended as
+ * well; otherwise etcd is started standalone.
+ *
+ * GLUSTERD_ETCD_DIR is created (mode 0700) if it does not exist yet.
+ *
+ * RETURN VALUE:
+ * the runner's chpid on success.
+ * -1 if gethostname(), mkdir() or runner_start() fails.
+ */
+pid_t
+start_etcd (char *this_host, char *other_host)
+{
+ runner_t runner;
+ char me[256];
+
+ if (gethostname(me,sizeof(me)-1) != 0) {
+ gf_log (__func__, GF_LOG_ERROR, "gethostname failed?!?");
+ return -1;
+ }
+ /* terminate explicitly: the last byte was never offered to gethostname */
+ me[sizeof(me)-1] = '\0';
+
+ /* an already existing data directory is not an error */
+ if ((mkdir(GLUSTERD_ETCD_DIR,0700) < 0) && (errno != EEXIST)) {
+ gf_log (__func__, GF_LOG_ERROR,
+ "failed to create %s", GLUSTERD_ETCD_DIR);
+ return -1;
+ }
+
+ runinit (&runner);
+ /* the value for the trailing "-bind-addr" is the next argument added */
+ runner_add_args (&runner, GLUSTERD_ETCD_CMD,
+ "-name", this_host,
+ "-data-dir", GLUSTERD_ETCD_DIR,
+ "-bind-addr", NULL);
+ runner_argprintf( &runner, "%s:4001", me);
+ runner_add_arg (&runner, "-peer-addr");
+ runner_argprintf (&runner, "%s:7001", me);
+ if (other_host) {
+ runner_add_arg (&runner, "-peers");
+ runner_argprintf (&runner, "%s:7001", other_host);
+ gf_log (__func__, GF_LOG_INFO, "starting etcd via %s", other_host);
+ } else {
+ gf_log (__func__, GF_LOG_INFO, "starting etcd standalone");
+ }
+
+ /*
+ * Runner_run would wait for it. Runner_run_nowait would not wait,
+ * but would detach it so thoroughly that it won't die when we do.
+ * Also, runner->chpid would be the PID of the transient middle
+ * process, not the one we might actually need to kill later. This
+ * seems to do exactly what we need.
+ */
+ if (runner_start(&runner) != 0) {
+ gf_log (__func__, GF_LOG_ERROR,
+ "failed to start %s", GLUSTERD_ETCD_CMD);
+ return -1;
+ }
+
+ return runner.chpid;
+}
+
+/*
+ * stop_etcd:
+ * Send SIGKILL to the process identified by pid and reap it with
+ * waitpid().  Non-positive pids are ignored.  The results of kill()
+ * and waitpid() are deliberately discarded.
+ */
+void
+stop_etcd (pid_t pid)
+{
+        if (pid <= 0)
+                return;
+
+        gf_log (__func__, GF_LOG_INFO, "killing etcd %d", pid);
+        (void) kill (pid, SIGKILL);
+        (void) waitpid (pid, NULL, 0);
+}
+
+/*
+ * nuke_etcd_dir:
+ * Run "rm -rf GLUSTERD_ETCD_DIR" through runcmd().  The command's
+ * result is deliberately ignored.
+ */
+void
+nuke_etcd_dir (void)
+{
+ (void)runcmd("rm","-rf",GLUSTERD_ETCD_DIR,NULL);
+}
diff --git a/xlators/features/marker/utils/src/procdiggy.h b/xlators/mgmt/glusterd/src/glusterd-etcd.h
index 56dfc4eb2..9459f6bbd 100644
--- a/xlators/features/marker/utils/src/procdiggy.h
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -7,14 +7,17 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
-#ifdef __NetBSD__
-#include <sys/syslimits.h>
-#endif /* __NetBSD__ */
-#define PROC "/proc"
+#ifndef _GLUSTERD_ETCD_H_
+#define _GLUSTERD_ETCD_H_
-pid_t pidinfo (pid_t pid, char **name);
+#include <sys/types.h>
+#include "glusterfs.h"
-int prociter (int (*proch) (pid_t pid, pid_t ppid, char *name, void *data),
- void *data);
+pid_t start_etcd (char *this_host, char *other_host);
+void stop_etcd (pid_t pid);
+
+void nuke_etcd_dir (void);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
index 7e2ab833f..9433a128e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
+++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
@@ -25,6 +25,34 @@
#include <signal.h>
+static int
+dict_get_param (dict_t *dict, char *key, char **param);
+
+/*
+ * Config options that only accept a fixed set of values.
+ *
+ * op_name        - option name; gsync_verify_config_options lets a '_'
+ *                  here match a '-' in the name supplied by the user.
+ * no_of_pos_vals - number of entries populated in .values.
+ * case_sensitive - _gf_true: values are compared with strcmp();
+ *                  _gf_false: values are compared with strcasecmp().
+ * values         - the accepted values.
+ *
+ * The table is terminated by an entry whose op_name is NULL.
+ */
+struct gsync_config_opt_vals_ gsync_confopt_vals[] = {
+ {.op_name = "change_detector",
+ .no_of_pos_vals = 2,
+ .case_sensitive = _gf_true,
+ .values = {"xsync", "changelog"},
+ },
+ {.op_name = "special_sync_mode",
+ .no_of_pos_vals = 2,
+ .case_sensitive = _gf_true,
+ .values = {"partial", "recover"}
+ },
+ {.op_name = "log-level",
+ .no_of_pos_vals = 5,
+ .case_sensitive = _gf_false,
+ .values = {"critical", "error", "warning", "info", "debug"}
+ },
+ {.op_name = "use-tarssh",
+ .no_of_pos_vals = 6,
+ .case_sensitive = _gf_false,
+ .values = {"true", "false", "0", "1", "yes", "no"}
+ },
+ /* sentinel: marks the end of the table */
+ {.op_name = NULL,
+ },
+};
+
static char *gsync_reserved_opts[] = {
"gluster-command-dir",
"pid-file",
@@ -33,10 +61,161 @@ static char *gsync_reserved_opts[] = {
"session-owner",
"state-socket-unencoded",
"socketdir",
+ "ignore-deletes",
+ "local-id",
+ "local-path",
+ "slave-id",
+ NULL
+};
+
+static char *gsync_no_restart_opts[] = {
+ "checkpoint",
NULL
};
int
+__glusterd_handle_sys_exec (rpcsvc_request_t *req)
+{
+        /*
+         * CLI handler for GD_OP_SYS_EXEC.
+         *
+         * Decodes the gf_cli_req carried by req, unserializes its
+         * dictionary (if any), stores this node's UUID in it under
+         * "host-uuid" and passes the dictionary on to
+         * glusterd_op_begin_synctask().  When anything fails, an error
+         * response is sent to the CLI through
+         * glusterd_op_send_cli_response() and its result is returned.
+         */
+        int32_t          ret       = 0;
+        dict_t          *dict      = NULL;
+        gf_cli_req       cli_req   = {{0},};
+        glusterd_op_t    cli_op    = GD_OP_SYS_EXEC;
+        glusterd_conf_t *priv      = NULL;
+        char            *host_uuid = NULL;
+        char             err_str[2048] = {0,};
+        xlator_t        *this      = NULL;
+
+        GF_ASSERT (req);
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        ret = xdr_to_generic (req->msg[0], &cli_req,
+                              (xdrproc_t)xdr_gf_cli_req);
+        if (ret < 0) {
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        if (cli_req.dict.dict_len) {
+                dict = dict_new ();
+                if (!dict) {
+                        /*
+                         * ret still holds the non-negative decode result
+                         * here; force a failure so that the error path
+                         * below replies to the CLI.
+                         */
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_unserialize (cli_req.dict.dict_val,
+                                        cli_req.dict.dict_len,
+                                        &dict);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "failed to "
+                                "unserialize req-buffer to dictionary");
+                        snprintf (err_str, sizeof (err_str), "Unable to decode "
+                                  "the command");
+                        goto out;
+                }
+                /* the dict now owns the decoded buffer */
+                dict->extra_stdfree = cli_req.dict.dict_val;
+
+                host_uuid = gf_strdup (uuid_utoa(MY_UUID));
+                if (host_uuid == NULL) {
+                        snprintf (err_str, sizeof (err_str), "Failed to get "
+                                  "the uuid of local glusterd");
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_set_dynstr (dict, "host-uuid", host_uuid);
+                if (ret)
+                        goto out;
+        }
+
+        ret = glusterd_op_begin_synctask (req, cli_op, dict);
+
+out:
+        if (ret) {
+                if (err_str[0] == '\0')
+                        snprintf (err_str, sizeof (err_str),
+                                  "Operation failed");
+                ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
+                                                     dict, err_str);
+        }
+        return ret;
+}
+
+/*
+ * __glusterd_handle_copy_file:
+ * CLI handler for GD_OP_COPY_FILE.
+ *
+ * Decodes the gf_cli_req carried by req, unserializes its dictionary
+ * (if any), stores this node's UUID in it under "host-uuid" and passes
+ * the dictionary on to glusterd_op_begin_synctask().  When anything
+ * fails, an error response is sent to the CLI through
+ * glusterd_op_send_cli_response() and its result is returned.
+ */
+int
+__glusterd_handle_copy_file (rpcsvc_request_t *req)
+{
+        int32_t          ret       = 0;
+        dict_t          *dict      = NULL;
+        gf_cli_req       cli_req   = {{0},};
+        glusterd_op_t    cli_op    = GD_OP_COPY_FILE;
+        glusterd_conf_t *priv      = NULL;
+        char            *host_uuid = NULL;
+        char             err_str[2048] = {0,};
+        xlator_t        *this      = NULL;
+
+        GF_ASSERT (req);
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        ret = xdr_to_generic (req->msg[0], &cli_req,
+                              (xdrproc_t)xdr_gf_cli_req);
+        if (ret < 0) {
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        if (cli_req.dict.dict_len) {
+                dict = dict_new ();
+                if (!dict) {
+                        /*
+                         * ret still holds the non-negative decode result
+                         * here; force a failure so that the error path
+                         * below replies to the CLI.
+                         */
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_unserialize (cli_req.dict.dict_val,
+                                        cli_req.dict.dict_len,
+                                        &dict);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "failed to "
+                                "unserialize req-buffer to dictionary");
+                        snprintf (err_str, sizeof (err_str), "Unable to decode "
+                                  "the command");
+                        goto out;
+                }
+                /* the dict now owns the decoded buffer */
+                dict->extra_stdfree = cli_req.dict.dict_val;
+
+                host_uuid = gf_strdup (uuid_utoa(MY_UUID));
+                if (host_uuid == NULL) {
+                        snprintf (err_str, sizeof (err_str), "Failed to get "
+                                  "the uuid of local glusterd");
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_set_dynstr (dict, "host-uuid", host_uuid);
+                if (ret)
+                        goto out;
+        }
+
+        ret = glusterd_op_begin_synctask (req, cli_op, dict);
+
+out:
+        if (ret) {
+                if (err_str[0] == '\0')
+                        snprintf (err_str, sizeof (err_str),
+                                  "Operation failed");
+                ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
+                                                     dict, err_str);
+        }
+        return ret;
+}
+
+int
__glusterd_handle_gsync_set (rpcsvc_request_t *req)
{
int32_t ret = 0;
@@ -100,13 +279,13 @@ __glusterd_handle_gsync_set (rpcsvc_request_t *req)
ret = dict_get_str (dict, "master", &master);
if (ret < 0) {
gf_log (this->name, GF_LOG_INFO, "master not found, while "
- "handling"GEOREP" options");
+ "handling "GEOREP" options");
master = "(No Master)";
}
ret = dict_get_str (dict, "slave", &slave);
if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO, "slave not not found, while"
+ gf_log (this->name, GF_LOG_INFO, "slave not found, while "
"handling "GEOREP" options");
slave = "(No Slave)";
}
@@ -120,6 +299,10 @@ __glusterd_handle_gsync_set (rpcsvc_request_t *req)
}
switch (type) {
+ case GF_GSYNC_OPTION_TYPE_CREATE:
+ strncpy (operation, "create", sizeof (operation));
+ cli_op = GD_OP_GSYNC_CREATE;
+ break;
case GF_GSYNC_OPTION_TYPE_START:
strncpy (operation, "start", sizeof (operation));
@@ -136,12 +319,9 @@ __glusterd_handle_gsync_set (rpcsvc_request_t *req)
case GF_GSYNC_OPTION_TYPE_STATUS:
strncpy (operation, "status", sizeof (operation));
break;
- case GF_GSYNC_OPTION_TYPE_ROTATE:
- strncpy (operation, "rotate", sizeof(operation));
- break;
}
- ret = glusterd_op_begin_synctask (req, GD_OP_GSYNC_SET, dict);
+ ret = glusterd_op_begin_synctask (req, cli_op, dict);
out:
if (ret) {
@@ -154,6 +334,17 @@ out:
return ret;
}
+/*
+ * Entry point for the sys-exec request: runs __glusterd_handle_sys_exec
+ * through glusterd_big_locked_handler() and returns its result.
+ */
+int
+glusterd_handle_sys_exec (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_sys_exec);
+}
+
+/*
+ * Entry point for the copy-file request: runs __glusterd_handle_copy_file
+ * through glusterd_big_locked_handler() and returns its result.
+ */
+int
+glusterd_handle_copy_file (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_copy_file);
+}
int
glusterd_handle_gsync_set (rpcsvc_request_t *req)
@@ -444,7 +635,7 @@ _fcbk_conftodict (char *resbuf, size_t blen, FILE *fp, void *data)
}
static int
-glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t *dict)
+glusterd_gsync_get_config (char *master, char *slave, char *conf_path, dict_t *dict)
{
/* key + value, where value must be able to accommodate a path */
char resbuf[256 + PATH_MAX] = {0,};
@@ -452,7 +643,7 @@ glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t *
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master);
runner_add_args (&runner, slave, "--config-get-all", NULL);
@@ -462,13 +653,13 @@ glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t *
static int
glusterd_gsync_get_param_file (char *prmfile, const char *param, char *master,
- char *slave, char *gl_workdir)
+ char *slave, char *conf_path)
{
runner_t runner = {0,};
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master);
runner_add_args (&runner, slave, "--config-get", NULL);
runner_argprintf (&runner, "%s-file", param);
@@ -477,83 +668,14 @@ glusterd_gsync_get_param_file (char *prmfile, const char *param, char *master,
}
static int
-glusterd_gsync_get_session_owner (char *master, char *slave, char *session_owner,
- char *gl_workdir)
-{
- runner_t runner = {0,};
-
- runinit(&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
- runner_argprintf (&runner, ":%s", master);
- runner_add_args (&runner, slave, "--config-get", "session-owner",
- NULL);
-
- return glusterd_query_extutil (session_owner, &runner);
-}
-
-/* check whether @slave is local or remote. normalized
- * urls starting with ssh are considered to be remote
- * @returns
- * 1 if slave is remote
- * 0 is slave is local
- */
-static int
-glusterd_gsync_slave_is_remote (char *slave)
-{
- int ret = 0;
- char *ssh_pos = NULL;
-
- ssh_pos = strstr(slave, "ssh://");
- if ( ssh_pos && ((ssh_pos - slave) == 0) )
- ret = 1;
-
- return ret;
-}
-
-static int
-glusterd_gsync_get_slave_log_file (char *master, char *slave, char *log_file)
-{
- int ret = -1;
- runner_t runner = {0,};
- char uuid_str[64] = {0,};
- glusterd_conf_t *priv = NULL;
- char *gl_workdir = NULL;
-
- GF_ASSERT(THIS);
- GF_ASSERT(THIS->private);
-
- priv = THIS->private;
-
- GF_VALIDATE_OR_GOTO("gsyncd", master, out);
- GF_VALIDATE_OR_GOTO("gsyncd", slave, out);
-
- gl_workdir = priv->workdir;
-
- /* get the session owner for the master-slave session */
- ret = glusterd_gsync_get_session_owner (master, slave, uuid_str,
- gl_workdir);
- if (ret)
- goto out;
-
- /* get the log file for the slave */
- runinit(&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
- runner_argprintf (&runner, "--session-owner=%s", uuid_str);
- runner_add_args (&runner, slave, "--config-get", "log-file", NULL);
-
- ret = glusterd_query_extutil (log_file, &runner);
-
- out:
- return ret;
-}
-
-static int
-gsyncd_getpidfile (char *master, char *slave, char *pidfile)
+gsyncd_getpidfile (char *master, char *slave, char *pidfile, char *conf_path)
{
int ret = -1;
glusterd_conf_t *priv = NULL;
+ char *confpath = NULL;
+ char conf_buf[PATH_MAX] = "";
+ struct stat stbuf = {0,};
+
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
@@ -563,8 +685,22 @@ gsyncd_getpidfile (char *master, char *slave, char *pidfile)
GF_VALIDATE_OR_GOTO ("gsync", master, out);
GF_VALIDATE_OR_GOTO ("gsync", slave, out);
+ ret = lstat (conf_path, &stbuf);
+ if (!ret) {
+ gf_log ("", GF_LOG_DEBUG, "Using passed config template(%s).",
+ conf_path);
+ confpath = conf_path;
+ } else {
+ ret = snprintf (conf_buf, sizeof(conf_buf) - 1,
+ "%s/"GSYNC_CONF_TEMPLATE, priv->workdir);
+ conf_buf[ret] = '\0';
+ confpath = conf_buf;
+ gf_log ("", GF_LOG_DEBUG, "Using default config template(%s).",
+ confpath);
+ }
+
ret = glusterd_gsync_get_param_file (pidfile, "pid", master,
- slave, priv->workdir);
+ slave, confpath);
if (ret == -1) {
ret = -2;
gf_log ("", GF_LOG_WARNING, "failed to create the pidfile string");
@@ -578,32 +714,6 @@ gsyncd_getpidfile (char *master, char *slave, char *pidfile)
}
static int
-glusterd_gsyncd_getlogfile (char *master, char *slave, char *log_file)
-{
- int ret = -1;
- glusterd_conf_t *priv = NULL;
-
- GF_ASSERT (THIS);
- GF_ASSERT (THIS->private);
-
- priv = THIS->private;
-
- GF_VALIDATE_OR_GOTO ("gsync", master, out);
- GF_VALIDATE_OR_GOTO ("gsync", slave, out);
-
- ret = glusterd_gsync_get_param_file (log_file, "log", master,
- slave, priv->workdir);
- if (ret == -1) {
- ret = -2;
- gf_log ("", GF_LOG_WARNING, "failed to gsyncd logfile");
- goto out;
- }
-
- out:
- return ret;
-}
-
-static int
gsync_status_byfd (int fd)
{
GF_ASSERT (fd >= -1);
@@ -620,12 +730,12 @@ gsync_status_byfd (int fd)
* return -1 when not running
*/
int
-gsync_status (char *master, char *slave, int *status)
+gsync_status (char *master, char *slave, char *conf_path, int *status)
{
char pidfile[PATH_MAX] = {0,};
int fd = -1;
- fd = gsyncd_getpidfile (master, slave, pidfile);
+ fd = gsyncd_getpidfile (master, slave, pidfile, conf_path);
if (fd == -2)
return -1;
@@ -662,16 +772,48 @@ out:
}
static int
-gsync_verify_config_options (dict_t *dict, char **op_errstr)
+glusterd_verify_gsyncd_spawn (char *master, char *slave)
{
- char **resopt = NULL;
- int i = 0;
- char *subop = NULL;
- char *slave = NULL;
- char *op_name = NULL;
- char *op_value = NULL;
- char *t = NULL;
- gf_boolean_t banned = _gf_true;
+ int ret = 0;
+ runner_t runner = {0,};
+
+ runinit (&runner);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ "--verify", "spawning", NULL);
+ runner_argprintf (&runner, ":%s", master);
+ runner_add_args (&runner, slave, NULL);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ ret = runner_start (&runner);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "spawning child failed");
+ ret = -1;
+ goto out;
+ }
+
+ if (runner_end (&runner) != 0)
+ ret = -1;
+
+out:
+ gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ return ret;
+}
+
+static int
+gsync_verify_config_options (dict_t *dict, char **op_errstr, char *volname)
+{
+ char **resopt = NULL;
+ int i = 0;
+ int ret = -1;
+ char *subop = NULL;
+ char *slave = NULL;
+ char *op_name = NULL;
+ char *op_value = NULL;
+ char *t = NULL;
+ char errmsg[PATH_MAX] = "";
+ gf_boolean_t banned = _gf_true;
+ gf_boolean_t op_match = _gf_true;
+ gf_boolean_t val_match = _gf_true;
+ struct gsync_config_opt_vals_ *conf_vals = NULL;
if (dict_get_str (dict, "subop", &subop) != 0) {
gf_log ("", GF_LOG_WARNING, "missing subop");
@@ -695,6 +837,12 @@ gsync_verify_config_options (dict_t *dict, char **op_errstr)
}
if (runcmd (GSYNCD_PREFIX"/gsyncd", "--config-check", op_name, NULL)) {
+ ret = glusterd_verify_gsyncd_spawn (volname, slave);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to spawn gsyncd");
+ return 0;
+ }
+
gf_log ("", GF_LOG_WARNING, "Invalid option %s", op_name);
*op_errstr = gf_strdup ("Invalid option");
@@ -739,35 +887,113 @@ gsync_verify_config_options (dict_t *dict, char **op_errstr)
}
}
+ /* Check options in gsync_confopt_vals for invalid values */
+ for (conf_vals = gsync_confopt_vals; conf_vals->op_name; conf_vals++) {
+ op_match = _gf_true;
+ for (i = 0; conf_vals->op_name[i] && op_name[i]; i++) {
+ if (conf_vals->op_name[i] == op_name[i] ||
+ (conf_vals->op_name[i] == '_' && op_name[i] == '-'))
+ continue;
+ op_match = _gf_false;
+ }
+
+ if (op_match) {
+ if (!op_value)
+ goto out;
+ val_match = _gf_false;
+ for (i = 0; i < conf_vals->no_of_pos_vals; i++) {
+ if(conf_vals->case_sensitive){
+ if (!strcmp (conf_vals->values[i], op_value))
+ val_match = _gf_true;
+ } else {
+ if (!strcasecmp (conf_vals->values[i], op_value))
+ val_match = _gf_true;
+ }
+ }
+
+ if (!val_match) {
+ ret = snprintf (errmsg, sizeof(errmsg) - 1,
+ "Invalid value(%s) for"
+ " option %s", op_value,
+ op_name);
+ errmsg[ret] = '\0';
+
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ return -1;
+ }
+ }
+ }
+out:
return 0;
}
static int
glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo,
- char *slave, dict_t *rsp_dict, char *node);
+ char *slave, char *conf_path,
+ dict_t *rsp_dict, char *node);
static int
_get_status_mst_slv (dict_t *this, char *key, data_t *value, void *data)
{
glusterd_gsync_status_temp_t *param = NULL;
char *slave = NULL;
- int ret = 0;
+ char *slave_buf = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *errmsg = NULL;
+ char conf_path[PATH_MAX] = "";
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
param = (glusterd_gsync_status_temp_t *)data;
GF_ASSERT (param);
GF_ASSERT (param->volinfo);
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ goto out;
+ }
+
slave = strchr(value->data, ':');
- if (slave)
- slave ++;
- else
+ if (!slave)
return 0;
+ slave++;
+
+ ret = glusterd_get_slave_info (slave, &slave_ip, &slave_vol, &errmsg);
+ if (ret) {
+ if (errmsg)
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch "
+ "slave details. Error: %s", errmsg);
+ else
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf (conf_path, sizeof(conf_path) - 1,
+ "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+ priv->workdir, param->volinfo->volname,
+ slave_ip, slave_vol);
+ conf_path[ret] = '\0';
ret = glusterd_get_gsync_status_mst_slv(param->volinfo,
- slave, param->rsp_dict,
+ slave, conf_path,
+ param->rsp_dict,
param->node);
- return 0;
+out:
+
+ GF_FREE (errmsg);
+
+ if (slave_buf)
+ GF_FREE(slave_buf);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret);
+ return ret;
}
@@ -788,19 +1014,22 @@ static int
glusterd_remove_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
char **op_errstr)
{
+ int zero_slave_entries = _gf_true;
int ret = 0;
char *slavekey = NULL;
GF_ASSERT (volinfo);
GF_ASSERT (slave);
- ret = glusterd_get_slave (volinfo, slave, &slavekey);
- if (ret < 0) {
- ret++;
- goto out;
- }
-
- dict_del (volinfo->gsync_slaves, slavekey);
+ do {
+ ret = glusterd_get_slave (volinfo, slave, &slavekey);
+ if (ret < 0 && zero_slave_entries) {
+ ret++;
+ goto out;
+ }
+ zero_slave_entries = _gf_false;
+ dict_del (volinfo->gsync_slaves, slavekey);
+ } while (ret >= 0);
ret = glusterd_store_volinfo (volinfo,
GLUSTERD_VOLINFO_VER_AC_INCREMENT);
@@ -853,8 +1082,9 @@ glusterd_gsync_get_uuid (char *slave, glusterd_volinfo_t *vol,
return ret;
}
-static int
+int
glusterd_check_gsync_running_local (char *master, char *slave,
+ char *conf_path,
gf_boolean_t *is_run)
{
int ret = -1;
@@ -865,7 +1095,7 @@ glusterd_check_gsync_running_local (char *master, char *slave,
GF_ASSERT (is_run);
*is_run = _gf_false;
- ret = gsync_status (master, slave, &ret_status);
+ ret = gsync_status (master, slave, conf_path, &ret_status);
if (ret == 0 && ret_status == 0) {
*is_run = _gf_true;
} else if (ret == -1) {
@@ -882,7 +1112,8 @@ glusterd_check_gsync_running_local (char *master, char *slave,
static int
glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
- char *host_uuid, char **op_errstr)
+ char *host_uuid, char **op_errstr,
+ gf_boolean_t is_force)
{
int ret = 0;
int maxslv = 0;
@@ -905,7 +1136,8 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
case -1:
break;
default:
- GF_ASSERT (ret > 0);
+ if (!is_force)
+ GF_ASSERT (ret > 0);
ret = dict_get_str (volinfo->gsync_slaves, slavekey, &slaveentry);
GF_ASSERT (ret == 0);
@@ -914,13 +1146,23 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
* assert an uuid mismatch
*/
t = strtail (slaveentry, host_uuid);
- GF_ASSERT (!t || *t != ':');
+ if (!is_force)
+ GF_ASSERT (!t || *t != ':');
+
+ if (is_force) {
+ gf_log ("", GF_LOG_DEBUG, GEOREP" has already been "
+ "invoked for the %s (master) and %s (slave)."
+ " Allowing without saving info again due to"
+ " force command.", volinfo->volname, slave);
+ ret = 0;
+ goto out;
+ }
gf_log ("", GF_LOG_ERROR, GEOREP" has already been invoked for "
"the %s (master) and %s (slave) "
"from a different machine",
volinfo->volname, slave);
- *op_errstr = gf_strdup (GEOREP" already running in an an"
+ *op_errstr = gf_strdup (GEOREP" already running in "
"another machine");
ret = -1;
goto out;
@@ -953,23 +1195,26 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
return ret;
}
-
static int
glusterd_op_verify_gsync_start_options (glusterd_volinfo_t *volinfo,
- char *slave, char **op_errstr)
+ char *slave, char *conf_path,
+ char *statefile, char **op_errstr,
+ gf_boolean_t is_force)
{
int ret = -1;
gf_boolean_t is_running = _gf_false;
char msg[2048] = {0};
uuid_t uuid = {0};
- glusterd_conf_t *priv = NULL;
- xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ struct stat stbuf = {0,};
this = THIS;
GF_ASSERT (volinfo);
GF_ASSERT (slave);
GF_ASSERT (op_errstr);
+ GF_ASSERT (conf_path);
GF_ASSERT (this && this->private);
priv = this->private;
@@ -979,26 +1224,56 @@ glusterd_op_verify_gsync_start_options (glusterd_volinfo_t *volinfo,
"before "GEOREP" start", volinfo->volname);
goto out;
}
+
+ ret = lstat (statefile, &stbuf);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Session between %s and %s has"
+ " not been created. Please create session and retry.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ /* Check if the gsync slave info is stored. If not
+ * session has not been created */
+ ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Session between %s and %s has"
+ " not been created. Please create session and retry.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ goto out;
+ }
+
+ if (is_force) {
+ ret = 0;
+ goto out;
+ }
+
/*Check if the gsync is already started in cmd. inited host
* If so initiate add it into the glusterd's priv*/
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if ((ret == 0) && (uuid_compare (MY_UUID, uuid) == 0)) {
- ret = glusterd_check_gsync_running_local (volinfo->volname,
- slave, &is_running);
- if (ret) {
- snprintf (msg, sizeof (msg), GEOREP" start option "
- "validation failed ");
- goto out;
- }
- if (_gf_true == is_running) {
- snprintf (msg, sizeof (msg), GEOREP " session between"
- " %s & %s already started", volinfo->volname,
- slave);
- ret = -1;
- goto out;
- }
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (ret) {
+ snprintf (msg, sizeof (msg), GEOREP" start option "
+ "validation failed ");
+ goto out;
+ }
+ if (_gf_true == is_running) {
+ snprintf (msg, sizeof (msg), GEOREP " session between"
+ " %s & %s already started", volinfo->volname,
+ slave);
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Unable to spawn gsyncd");
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
}
- ret = 0;
out:
if (ret && (msg[0] != '\0')) {
*op_errstr = gf_strdup (msg);
@@ -1022,13 +1297,168 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag)
return 0;
}
+/*
+ * is_geo_rep_active:
+ * Look up the session's state_file through its gsyncd configuration and
+ * read the monitor status from it.  *is_active is set to 0 when the
+ * status is "Stopped" or "Not Started", and to 1 for anything else.
+ * A state file that cannot be read is treated as status "defunct",
+ * which also counts as active.
+ *
+ * RETURN VALUE:
+ *  0: *is_active has been set.
+ * -1: the dict could not be created, or the configuration or its
+ *     state_file entry could not be fetched.
+ */
+
+static int
+is_geo_rep_active (glusterd_volinfo_t *volinfo, char *slave,
+                   char *conf_path, int *is_active)
+{
+        char      status_buf[PATH_MAX] = "";
+        char     *state_path           = NULL;
+        char     *master_vol           = NULL;
+        dict_t   *conf_dict            = NULL;
+        xlator_t *this                 = NULL;
+        int       ret                  = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        master_vol = volinfo->volname;
+
+        conf_dict = dict_new ();
+        if (!conf_dict) {
+                gf_log ("", GF_LOG_ERROR, "Not able to create dict.");
+                goto out;
+        }
+
+        if (glusterd_gsync_get_config (master_vol, slave, conf_path,
+                                       conf_dict)) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get configuration data "
+                        "for %s(master), %s(slave)", master_vol, slave);
+                ret = -1;
+                goto out;
+        }
+
+        if (dict_get_param (conf_dict, "state_file", &state_path)) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name "
+                        "for %s(master), %s(slave). Please check gsync "
+                        "config file.", master_vol, slave);
+                ret = -1;
+                goto out;
+        }
+
+        if (glusterd_gsync_read_frm_status (state_path, status_buf,
+                                            sizeof (status_buf)) <= 0) {
+                gf_log ("", GF_LOG_ERROR, "Unable to read the status "
+                        "file for %s(master), %s(slave)", master_vol, slave);
+                strncpy (status_buf, "defunct", sizeof (status_buf));
+        }
+
+        /* only the two explicit idle states count as inactive */
+        if (strcmp (status_buf, "Stopped") != 0 &&
+            strcmp (status_buf, "Not Started") != 0)
+                *is_active = 1;
+        else
+                *is_active = 0;
+
+        ret = 0;
+out:
+        if (conf_dict)
+                dict_destroy (conf_dict);
+        return ret;
+}
+
+/*
+ * _get_slave_status:
+ * Called for each slave in the volume from dict_foreach.
+ * It builds the session's gsyncd.conf path and calls is_geo_rep_active
+ * to get the monitor status, which is stored in param->is_active.
+ *
+ * data must point to a gsync_status_param_t; dict and key are unused.
+ *
+ * RETURN VALUE:
+ *  0: On success of is_geo_rep_active.
+ *     When geo-rep was already found active by a previous call.
+ *     When the value holds no slave (no ':' separator).
+ * -1: On error, including a config path that does not fit in PATH_MAX.
+ */
+
+int
+_get_slave_status (dict_t *dict, char *key, data_t *value, void *data)
+{
+        gsync_status_param_t *param     = NULL;
+        char                 *slave     = NULL;
+        char                 *slave_ip  = NULL;
+        char                 *slave_vol = NULL;
+        char                 *errmsg    = NULL;
+        char                  conf_path[PATH_MAX] = "";
+        int                   ret       = -1;
+        glusterd_conf_t      *priv      = NULL;
+        xlator_t             *this      = NULL;
+
+        param = (gsync_status_param_t *)data;
+
+        GF_ASSERT (param);
+        GF_ASSERT (param->volinfo);
+
+        /* an earlier slave was already active: nothing more to check */
+        if (param->is_active) {
+                ret = 0;
+                goto out;
+        }
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        if (this)
+                priv = this->private;
+        if (priv == NULL) {
+                gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+                goto out;
+        }
+
+        /* the slave url is whatever follows the first ':' in the value */
+        slave = strchr(value->data, ':');
+        if (!slave) {
+                ret = 0;
+                goto out;
+        }
+        slave++;
+
+        /*
+         * NOTE(review): slave_ip and slave_vol are filled in by
+         * glusterd_get_slave_info; if that helper allocates them they are
+         * never released here - confirm ownership.
+         */
+        ret = glusterd_get_slave_info (slave, &slave_ip, &slave_vol, &errmsg);
+        if (ret) {
+                if (errmsg)
+                        gf_log ("", GF_LOG_ERROR, "Unable to fetch "
+                                "slave details. Error: %s", errmsg);
+                else
+                        gf_log ("", GF_LOG_ERROR,
+                                "Unable to fetch slave details.");
+                ret = -1;
+                goto out;
+        }
+
+        /*
+         * snprintf() returns the length the full string would have had, so
+         * a value >= the buffer size means truncation.  Reject it rather
+         * than using the return value as an index into conf_path.
+         */
+        ret = snprintf (conf_path, sizeof (conf_path),
+                        "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+                        priv->workdir, param->volinfo->volname,
+                        slave_ip, slave_vol);
+        if (ret < 0 || (size_t) ret >= sizeof (conf_path)) {
+                gf_log ("", GF_LOG_ERROR, "Unable to assign conf_path.");
+                ret = -1;
+                goto out;
+        }
+
+        ret = is_geo_rep_active (param->volinfo, slave, conf_path,
+                                 &param->is_active);
+out:
+        GF_FREE(errmsg);
+        return ret;
+}
+
static int
glusterd_op_verify_gsync_running (glusterd_volinfo_t *volinfo,
- char *slave, char **op_errstr)
+ char *slave, char *conf_path,
+ char **op_errstr)
{
- int ret = -1;
- char msg[2048] = {0};
- uuid_t uuid = {0};
+ int pfd = -1;
+ int ret = -1;
+ char msg[2048] = {0};
+ char pidfile[PATH_MAX] = {0,};
GF_ASSERT (THIS && THIS->private);
GF_ASSERT (volinfo);
@@ -1041,12 +1471,25 @@ glusterd_op_verify_gsync_running (glusterd_volinfo_t *volinfo,
goto out;
}
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if (ret == -1) {
- snprintf (msg, sizeof (msg), GEOREP" session between %s & %s"
- " not active", volinfo->volname, slave);
+
+ pfd = gsyncd_getpidfile (volinfo->volname, slave, pidfile, conf_path);
+ if (pfd == -2) {
+ gf_log ("", GF_LOG_ERROR, GEOREP" stop validation "
+ "failed for %s & %s", volinfo->volname, slave);
+ ret = -1;
goto out;
}
+ if (gsync_status_byfd (pfd) == -1) {
+ snprintf (msg, sizeof (msg), GEOREP" session b/w %s & %s is not"
+ " running on this node.", volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ ret = -1;
+ /* monitor gsyncd already dead */
+ goto out;
+ }
+
+ if (pfd < 0)
+ goto out;
ret = 0;
out:
@@ -1066,6 +1509,18 @@ glusterd_verify_gsync_status_opts (dict_t *dict, char **op_errstr)
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
int ret = 0;
+ char *conf_path = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
+ goto out;
+ }
ret = dict_get_str (dict, "master", &volname);
if (ret < 0) {
@@ -1090,16 +1545,25 @@ glusterd_verify_gsync_status_opts (dict_t *dict, char **op_errstr)
goto out;
}
- out:
+ ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip,
+ &slave_vol, &conf_path,
+ op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave or confpath details.");
+ ret = -1;
+ goto out;
+ }
+
+out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
-
}
int
glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr,
- char **master, char **slave)
+ char **master, char **slave, char **host_uuid)
{
int ret = -1;
@@ -1124,6 +1588,14 @@ glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr,
}
}
+ if (host_uuid) {
+ ret = dict_get_str (dict, "host-uuid", host_uuid);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_WARNING, "host_uuid not found");
+ *op_errstr = gf_strdup ("host_uuid not found");
+ goto out;
+ }
+ }
ret = 0;
out:
@@ -1132,17 +1604,647 @@ out:
}
int
+glusterd_op_stage_sys_exec (dict_t *dict, char **op_errstr)
+{
+        /*
+         * Staging check for the helper named by the "command" key of dict.
+         * The name must not contain a '/', and
+         * GSYNCD_PREFIX"/peer_<command>" must be an executable regular
+         * file.
+         *
+         * Returns 0 when the helper passed all checks; non-zero otherwise,
+         * with *op_errstr set to a gf_strdup'ed description of the failure.
+         */
+        char             errmsg[PATH_MAX]       = "";
+        char            *command                = NULL;
+        char             command_path[PATH_MAX] = "";
+        struct stat      st                     = {0,};
+        int              ret                    = -1;
+        glusterd_conf_t *conf                   = NULL;
+        xlator_t        *this                   = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        if (conf->op_version < 2) {
+                gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+                /* errmsg is copied into *op_errstr under "out"; copying it
+                 * here as well would leak the first copy. */
+                snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+                          " support the required op version.");
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "command", &command);
+        if (ret) {
+                strcpy (errmsg, "internal error");
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to get command from dict");
+                goto out;
+        }
+
+        /* enforce local occurrence of the command */
+        if (strchr (command, '/')) {
+                strcpy (errmsg, "invalid command name");
+                ret = -1;
+                goto out;
+        }
+
+        /* "command" comes from the request dict and its length is not
+         * checked above, so format with a bound and reject names that do
+         * not fit instead of overflowing command_path. */
+        ret = snprintf (command_path, sizeof (command_path),
+                        GSYNCD_PREFIX"/peer_%s", command);
+        if ((ret < 0) || ((size_t) ret >= sizeof (command_path))) {
+                strcpy (errmsg, "invalid command name");
+                ret = -1;
+                goto out;
+        }
+
+        /* check if it's executable */
+        ret = access (command_path, X_OK);
+        if (!ret)
+                /* check if it's a regular file */
+                ret = stat (command_path, &st);
+        if (!ret && !S_ISREG (st.st_mode))
+                ret = -1;
+
+out:
+        if (ret) {
+                /* no specific message so far: the helper was not usable */
+                if (errmsg[0] == '\0')
+                        snprintf (errmsg, sizeof (errmsg), "%s not found.",
+                                  command);
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+        }
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * glusterd_op_stage_copy_file:
+ *      Staging check for the copy-file operation. Requires op-version 2
+ *      or later. On the node whose uuid equals the "host-uuid" entry of
+ *      dict, <workdir>/<"source" entry> must exist and be a regular file;
+ *      on every other node nothing further is checked.
+ *
+ * RETURN VALUE:
+ *      0 on success, non-zero otherwise (*op_errstr is set on most,
+ *      but not all, failure paths).
+ */
+int
+glusterd_op_stage_copy_file (dict_t *dict, char **op_errstr)
+{
+        glusterd_conf_t *priv               = NULL;
+        char            *host_uuid          = NULL;
+        char            *filename           = NULL;
+        char             my_uuid[64]        = {0};
+        char             src_path[PATH_MAX] = "";
+        char             errmsg[PATH_MAX]   = "";
+        struct stat      src_stat           = {0,};
+        int              ret                = -1;
+
+        priv = THIS ? THIS->private : NULL;
+        if (!priv) {
+                gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+                *op_errstr = gf_strdup ("glusterd defunct");
+                goto out;
+        }
+
+        if (priv->op_version < 2) {
+                gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+                snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+                          " support the required op version.");
+                *op_errstr = gf_strdup (errmsg);
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "host-uuid", &host_uuid);
+        if (ret < 0) {
+                gf_log ("", GF_LOG_ERROR, "Unable to fetch"
+                        " host-uuid from dict.");
+                goto out;
+        }
+
+        uuid_utoa_r (MY_UUID, my_uuid);
+        if (strcmp (my_uuid, host_uuid) != 0) {
+                /* host-uuid names another node: no source file to check */
+                ret = 0;
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "source", &filename);
+        if (ret < 0) {
+                gf_log ("", GF_LOG_ERROR, "Unable to fetch"
+                        " filename from dict.");
+                *op_errstr = gf_strdup ("command unsuccessful");
+                goto out;
+        }
+
+        snprintf (src_path, sizeof(src_path), "%s/%s",
+                  priv->workdir, filename);
+
+        ret = lstat (src_path, &src_stat);
+        if (ret) {
+                snprintf (errmsg, sizeof (errmsg), "Source file"
+                          " does not exist in %s", priv->workdir);
+                *op_errstr = gf_strdup (errmsg);
+                goto out;
+        }
+
+        if (S_ISREG(src_stat.st_mode)) {
+                ret = 0;
+                goto out;
+        }
+
+        snprintf (errmsg, sizeof (errmsg), "Source file"
+                  " is not a regular file.");
+        *op_errstr = gf_strdup (errmsg);
+        gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+        ret = -1;
+
+out:
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * glusterd_get_statefile_name:
+ *      Looks up the "state_file" option of the geo-rep session between
+ *      volinfo->volname and slave. The configuration is read from
+ *      conf_path when lstat() on it succeeds, otherwise from
+ *      <workdir>/GSYNC_CONF_TEMPLATE.
+ *
+ * RETURN VALUE:
+ *      0 on success, with *statefile pointing to a gf_strdup'ed copy of
+ *      the option value; non-zero otherwise (*statefile is left alone).
+ */
+int
+glusterd_get_statefile_name (glusterd_volinfo_t *volinfo, char *slave,
+                             char *conf_path, char **statefile)
+{
+        glusterd_conf_t *priv               = NULL;
+        int              ret                = -1;
+        char            *master             = NULL;
+        char            *buf                = NULL;
+        dict_t          *confd              = NULL;
+        char            *confpath           = NULL;
+        char             conf_buf[PATH_MAX] = "";
+        struct stat      stbuf              = {0,};
+
+        GF_ASSERT (THIS);
+        GF_ASSERT (THIS->private);
+        GF_ASSERT (volinfo);
+
+        master = volinfo->volname;
+
+        confd = dict_new ();
+        if (!confd) {
+                gf_log ("", GF_LOG_ERROR, "Unable to create new dict");
+                goto out;
+        }
+
+        priv = THIS->private;
+
+        ret = lstat (conf_path, &stbuf);
+        if (!ret) {
+                gf_log ("", GF_LOG_INFO, "Using passed config template(%s).",
+                        conf_path);
+                confpath = conf_path;
+        } else {
+                /* snprintf() returns the length it would have written, so
+                 * it must not be used to index conf_buf: on truncation
+                 * that index lies outside the buffer. snprintf() already
+                 * NUL-terminates; just reject a truncated path. */
+                ret = snprintf (conf_buf, sizeof (conf_buf),
+                                "%s/"GSYNC_CONF_TEMPLATE, priv->workdir);
+                if ((ret < 0) || ((size_t) ret >= sizeof (conf_buf))) {
+                        gf_log ("", GF_LOG_ERROR, "Path of the default "
+                                "config template is too long.");
+                        ret = -1;
+                        goto out;
+                }
+                confpath = conf_buf;
+                gf_log ("", GF_LOG_INFO, "Using default config template(%s).",
+                        confpath);
+        }
+
+        ret = glusterd_gsync_get_config (master, slave, confpath,
+                                         confd);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get configuration data"
+                        "for %s(master), %s(slave)", master, slave);
+                goto out;
+
+        }
+
+        ret = dict_get_param (confd, "state_file", &buf);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name.");
+                goto out;
+        }
+
+        /* buf belongs to confd, which is destroyed below: hand out a copy */
+        *statefile = gf_strdup(buf);
+        if (!*statefile) {
+                gf_log ("", GF_LOG_ERROR, "Unable to gf_strdup.");
+                ret = -1;
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (confd)
+                dict_destroy (confd);
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d ", ret);
+        return ret;
+}
+
+/*
+ * glusterd_create_status_file:
+ *      Runs GSYNCD_PREFIX"/gsyncd --create <status> -c
+ *      <workdir>/GEOREP/<master>_<slave_ip>_<slave_vol>/gsyncd.conf
+ *      :<master> <slave>". The big lock is released while the command
+ *      runs and re-taken afterwards.
+ *
+ * RETURN VALUE:
+ *      0 when the command succeeded, -1 otherwise.
+ */
+static int
+glusterd_create_status_file (char *master, char *slave, char *slave_ip,
+                             char *slave_vol, char *status)
+{
+        glusterd_conf_t *priv   = NULL;
+        runner_t         runner = {0,};
+        int              ret    = -1;
+
+        priv = THIS ? THIS->private : NULL;
+        if (!priv) {
+                gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+                goto out;
+        }
+
+        if (status == NULL) {
+                gf_log ("", GF_LOG_ERROR, "Status Empty");
+                goto out;
+        }
+        gf_log ("", GF_LOG_DEBUG, "slave = %s", slave);
+
+        /* assemble the gsyncd command line */
+        runinit (&runner);
+        runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--create",
+                         status, "-c", NULL);
+        runner_argprintf (&runner, "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+                          priv->workdir, master, slave_ip, slave_vol);
+        runner_argprintf (&runner, ":%s", master);
+        runner_add_args (&runner, slave, NULL);
+
+        /* do not hold the big lock across the external command */
+        synclock_unlock (&priv->big_lock);
+        ret = runner_run (&runner);
+        synclock_lock (&priv->big_lock);
+
+        if (ret != 0) {
+                gf_log ("", GF_LOG_ERROR, "Creating status file failed.");
+                ret = -1;
+        }
+
+out:
+        gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+        return ret;
+}
+
+/*
+ * glusterd_verify_slave:
+ *      Runs GSYNCD_PREFIX"/gverify.sh <volname> <slave_ip> <slave>
+ *      <log file>" with the big lock released. When the script fails,
+ *      its message is read back from the log file: a leading
+ *      "FORCE_BLOCKER|" token sets *is_force_blocker to 1, anything else
+ *      sets it to 0, and the message text is gf_strdup'ed into
+ *      *op_errstr. The log file is unlinked before returning.
+ *
+ * RETURN VALUE:
+ *      0 when the script succeeded, -1 otherwise. *is_force_blocker is
+ *      only written when a message could be read from the log file.
+ */
+static int
+glusterd_verify_slave (char *volname, char *slave_ip, char *slave,
+                       char **op_errstr, gf_boolean_t *is_force_blocker)
+{
+        int32_t          ret                     = -1;
+        runner_t         runner                  = {0,};
+        char             log_file_path[PATH_MAX] = "";
+        char             buf[PATH_MAX]           = "";
+        char            *tmp                     = NULL;
+        char            *save_ptr                = NULL;
+        glusterd_conf_t *priv                    = NULL;
+
+        GF_ASSERT (volname);
+        GF_ASSERT (slave_ip);
+        GF_ASSERT (slave);
+
+        if (THIS)
+                priv = THIS->private;
+        if (priv == NULL) {
+                gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+                goto out;
+        }
+
+        snprintf (log_file_path, sizeof(log_file_path),
+                  DEFAULT_LOG_FILE_DIRECTORY"/create_verify_log");
+
+        runinit (&runner);
+        runner_add_args  (&runner, GSYNCD_PREFIX"/gverify.sh", NULL);
+        runner_argprintf (&runner, "%s", volname);
+        runner_argprintf (&runner, "%s", slave_ip);
+        runner_argprintf (&runner, "%s", slave);
+        runner_argprintf (&runner, "%s", log_file_path);
+        runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+        synclock_unlock (&priv->big_lock);
+        ret = runner_run (&runner);
+        synclock_lock (&priv->big_lock);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Not a valid slave");
+                ret = glusterd_gsync_read_frm_status (log_file_path,
+                                                      buf, sizeof(buf));
+                if (ret <= 0) {
+                        gf_log ("", GF_LOG_ERROR, "Unable to read from %s",
+                                log_file_path);
+                        /* gverify.sh has already failed: a 0 from the read
+                         * must not be handed back as success. */
+                        ret = -1;
+                        goto out;
+                }
+
+                /* Tokenize the error message from gverify.sh to figure out
+                 * if the error is a force blocker or not. strtok_r()
+                 * yields NULL when buf holds nothing but '|' characters,
+                 * so check before comparing. */
+                tmp = strtok_r (buf, "|", &save_ptr);
+                if (tmp && !strcmp (tmp, "FORCE_BLOCKER"))
+                        *is_force_blocker = 1;
+                else {
+                        /* No FORCE_BLOCKER flag present so all that is
+                         * present is the error message. */
+                        *is_force_blocker = 0;
+                        if (tmp)
+                                *op_errstr = gf_strdup (tmp);
+                        ret = -1;
+                        goto out;
+                }
+
+                /* Copy rest of the error message to op_errstr */
+                tmp = strtok_r (NULL, "|", &save_ptr);
+                if (tmp)
+                        *op_errstr = gf_strdup (tmp);
+                ret = -1;
+                goto out;
+        }
+        ret = 0;
+out:
+        unlink (log_file_path);
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * glusterd_mountbroker_check:
+ *      Validates an optional "user@" prefix of *slave_ip. Without an '@'
+ *      the string is accepted unchanged. With one, the part before the
+ *      '@' must be "root" and the part after it must be non-empty and
+ *      free of further '@' characters; *slave_ip is then replaced by a
+ *      gf_strdup'ed copy of the host part.
+ *
+ *      NOTE: the string *slave_ip points to on entry is modified in
+ *      place (its '@' is overwritten by strtok_r()).
+ *
+ * RETURN VALUE:
+ *      0 on success, -1 otherwise (*op_errstr, when non-NULL, receives a
+ *      gf_strdup'ed message for the validation failures).
+ */
+int
+glusterd_mountbroker_check (char **slave_ip, char **op_errstr)
+{
+        int   ret              = -1;
+        char *tmp              = NULL;
+        char *save_ptr         = NULL;
+        char *username         = NULL;
+        char *host             = NULL;
+        char  errmsg[PATH_MAX] = "";
+
+        GF_ASSERT (slave_ip);
+        GF_ASSERT (*slave_ip);
+
+        /* Checking if hostname has user specified */
+        host = strstr (*slave_ip, "@");
+        if (!host) {
+                gf_log ("", GF_LOG_DEBUG, "No username provided.");
+                ret = 0;
+                goto out;
+        }
+
+        /* "@host", "user@" and "@" would make strtok_r() below return
+         * NULL for the username and/or the host; reject them up front so
+         * that neither strcmp() nor gf_strdup() is handed a NULL. */
+        if ((host == *slave_ip) || (host[1] == '\0')) {
+                snprintf (errmsg, sizeof (errmsg),
+                          "Invalid username or hostname (%s).", *slave_ip);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                ret = -1;
+                if (op_errstr)
+                        *op_errstr = gf_strdup (errmsg);
+                goto out;
+        }
+
+        /* Moving the host past the '@' and checking if the
+         * actual hostname also has '@' */
+        host++;
+        if (strstr (host, "@")) {
+                gf_log ("", GF_LOG_DEBUG, "host = %s", host);
+                /* snprintf() NUL-terminates by itself; its return value is
+                 * not a safe index into errmsg when the output was
+                 * truncated. */
+                snprintf (errmsg, sizeof (errmsg),
+                          "Invalid Hostname (%s).", host);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                ret = -1;
+                if (op_errstr)
+                        *op_errstr = gf_strdup (errmsg);
+                goto out;
+        }
+
+        /* Fetching the username and hostname
+         * and checking if the username is non-root */
+        username = strtok_r (*slave_ip, "@", &save_ptr);
+        tmp = strtok_r (NULL, "@", &save_ptr);
+        if (strcmp (username, "root")) {
+                snprintf (errmsg, sizeof (errmsg),
+                          "Non-root username (%s@%s) not allowed.",
+                          username, tmp);
+                if (op_errstr)
+                        *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR,
+                        "Non-Root username not allowed.");
+                ret = -1;
+                goto out;
+        }
+
+        *slave_ip = gf_strdup (tmp);
+        if (!*slave_ip) {
+                gf_log ("", GF_LOG_ERROR, "Out of memory");
+                ret = -1;
+                goto out;
+        }
+
+        ret = 0;
+out:
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * glusterd_op_stage_gsync_create:
+ *      Staging check for creating a geo-replication session between the
+ *      master volume and the slave named in dict.
+ *
+ *      On every node: op-version must be at least 2, the master volume
+ *      must exist, the slave/conf-path details and the state-file name
+ *      must be obtainable, the session must not exist already (unless
+ *      "force" is set) and gsyncd must be spawnable.
+ *
+ *      Only on the node whose uuid equals "host-uuid": all peers of the
+ *      volume must be up (unless "force" is set), the slave must pass
+ *      glusterd_verify_slave() (failures that are not force blockers are
+ *      tolerated with "force"), and, when "push_pem" is set, the common
+ *      pem file and the create hook-script must exist as regular files.
+ *
+ * RETURN VALUE:
+ *      0 on success, non-zero otherwise; *op_errstr receives a
+ *      gf_strdup'ed message on most failure paths.
+ */
+int
+glusterd_op_stage_gsync_create (dict_t *dict, char **op_errstr)
+{
+        char               *down_peerstr              = NULL;
+        char               *slave                     = NULL;
+        char               *volname                   = NULL;
+        char               *host_uuid                 = NULL;
+        char               *statefile                 = NULL;
+        char               *slave_ip                  = NULL;
+        char               *slave_vol                 = NULL;
+        char               *conf_path                 = NULL;
+        char                errmsg[PATH_MAX]          = "";
+        char                common_pem_file[PATH_MAX] = "";
+        char                hook_script[PATH_MAX]     = "";
+        char                uuid_str [64]             = "";
+        int                 ret                       = -1;
+        int                 is_pem_push               = -1;
+        gf_boolean_t        is_force                  = -1;
+        gf_boolean_t        is_force_blocker          = -1;
+        gf_boolean_t        exists                    = _gf_false;
+        glusterd_conf_t    *conf                      = NULL;
+        glusterd_volinfo_t *volinfo                   = NULL;
+        struct stat         stbuf                     = {0,};
+        xlator_t           *this                      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        ret = glusterd_op_gsync_args_get (dict, op_errstr, &volname,
+                                          &slave, &host_uuid);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Unable to fetch arguments");
+                gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+                return -1;
+        }
+
+        if (conf->op_version < 2) {
+                gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+                /* errmsg is copied into *op_errstr under "out"; copying it
+                 * here as well would leak the first copy. */
+                snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+                          " support the required op version.");
+                ret = -1;
+                goto out;
+        }
+
+        exists = glusterd_check_volume_exists (volname);
+        ret = glusterd_volinfo_find (volname, &volinfo);
+        if ((ret) || (!exists)) {
+                gf_log ("", GF_LOG_WARNING, "volume name does not exist");
+                snprintf (errmsg, sizeof(errmsg), "Volume name %s does not"
+                          " exist", volname);
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+                return -1;
+        }
+
+        ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip,
+                                                   &slave_vol, &conf_path,
+                                                   op_errstr);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to fetch slave or confpath details.");
+                ret = -1;
+                goto out;
+        }
+
+        is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
+        uuid_utoa_r (MY_UUID, uuid_str);
+        if (!strcmp (uuid_str, host_uuid)) {
+                ret = glusterd_are_vol_all_peers_up (volinfo,
+                                                     &conf->peers,
+                                                     &down_peerstr);
+                if ((ret == _gf_false) && !is_force) {
+                        snprintf (errmsg, sizeof (errmsg), "Peer %s,"
+                                  " which is a part of %s volume, is"
+                                  " down. Please bring up the peer and"
+                                  " retry.", down_peerstr,
+                                  volinfo->volname);
+                        gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                        *op_errstr = gf_strdup (errmsg);
+                        GF_FREE (down_peerstr);
+                        down_peerstr = NULL;
+                        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+                        return -1;
+                } else if (ret == _gf_false) {
+                        gf_log ("", GF_LOG_INFO, "Peer %s,"
+                                " which is a part of %s volume, is"
+                                " down. Force creating geo-rep session."
+                                " On bringing up the peer, re-run"
+                                " \"gluster system:: execute"
+                                " gsec_create\" and \"gluster volume"
+                                " geo-replication %s %s create push-pem"
+                                " force\"", down_peerstr, volinfo->volname,
+                                volinfo->volname, slave);
+                        /* the peer name is released on the non-force path
+                         * above; release it here too */
+                        GF_FREE (down_peerstr);
+                        down_peerstr = NULL;
+                }
+
+                /* Checking if slave host is pingable, has proper passwordless
+                 * ssh login setup, slave volume is created, slave vol is empty,
+                 * and if it has enough memory and bypass in case of force if
+                 * the error is not a force blocker */
+                ret = glusterd_verify_slave (volname, slave_ip, slave_vol,
+                                             op_errstr, &is_force_blocker);
+                if (ret) {
+                        /* glusterd_verify_slave() can fail without filling
+                         * *op_errstr, so never pass it to "%s" unchecked */
+                        if (is_force && !is_force_blocker) {
+                                gf_log ("", GF_LOG_INFO, "%s is not a valid slave"
+                                        " volume. Error: %s. Force creating geo-rep"
+                                        " session.", slave,
+                                        *op_errstr ? *op_errstr : "N/A");
+                        } else {
+                                gf_log ("", GF_LOG_ERROR,
+                                        "%s is not a valid slave volume. Error: %s",
+                                        slave,
+                                        *op_errstr ? *op_errstr : "N/A");
+                                ret = -1;
+                                goto out;
+                        }
+                }
+
+                ret = dict_get_int32 (dict, "push_pem", &is_pem_push);
+                if (!ret && is_pem_push) {
+                        /* snprintf() NUL-terminates by itself; its return
+                         * value is not a safe index into the buffer when
+                         * the output was truncated. */
+                        snprintf (common_pem_file, sizeof (common_pem_file),
+                                  "%s"GLUSTERD_COMMON_PEM_PUB_FILE,
+                                  conf->workdir);
+
+                        snprintf (hook_script, sizeof (hook_script),
+                                  "%s"GLUSTERD_CREATE_HOOK_SCRIPT,
+                                  conf->workdir);
+
+                        ret = lstat (common_pem_file, &stbuf);
+                        if (ret) {
+                                snprintf (errmsg, sizeof (errmsg), "%s"
+                                          " required for push-pem is"
+                                          " not present. Please run"
+                                          " \"gluster system:: execute"
+                                          " gsec_create\"", common_pem_file);
+                                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                                ret = -1;
+                                goto out;
+                        }
+
+                        /* Check the pem file's type while stbuf still
+                         * describes it: the lstat() of the hook-script
+                         * below reuses stbuf. */
+                        if (!S_ISREG(stbuf.st_mode)) {
+                                snprintf (errmsg, sizeof (errmsg), "%s"
+                                          " required for push-pem is"
+                                          " not a regular file. Please run"
+                                          " \"gluster system:: execute"
+                                          " gsec_create\"", common_pem_file);
+                                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                                ret = -1;
+                                goto out;
+                        }
+
+                        ret = lstat (hook_script, &stbuf);
+                        if (ret) {
+                                snprintf (errmsg, sizeof (errmsg),
+                                          "The hook-script (%s) required "
+                                          "for push-pem is not present. "
+                                          "Please install the hook-script "
+                                          "and retry", hook_script);
+                                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                                ret = -1;
+                                goto out;
+                        }
+
+                        /* A hook-script that is not a regular file was
+                         * rejected before as well, but reported as a
+                         * problem with the pem file. */
+                        if (!S_ISREG(stbuf.st_mode)) {
+                                snprintf (errmsg, sizeof (errmsg),
+                                          "The hook-script (%s) required "
+                                          "for push-pem is not a regular "
+                                          "file. Please install the "
+                                          "hook-script and retry",
+                                          hook_script);
+                                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                                ret = -1;
+                                goto out;
+                        }
+                }
+        }
+
+        ret = glusterd_get_statefile_name (volinfo, slave, conf_path, &statefile);
+        if (ret) {
+                if (!strstr(slave, "::"))
+                        snprintf (errmsg, sizeof (errmsg),
+                                  "%s is not a valid slave url.", slave);
+                else
+                        snprintf (errmsg, sizeof (errmsg), "Please check gsync "
+                                  "config file. Unable to get statefile's name");
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_str (dict, "statefile", statefile);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to store statefile path");
+                goto out;
+        }
+
+        ret = lstat (statefile, &stbuf);
+        if (!ret && !is_force) {
+                snprintf (errmsg, sizeof (errmsg), "Session between %s"
+                          " and %s is already created.",
+                          volinfo->volname, slave);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                ret = -1;
+                goto out;
+        } else if (!ret)
+                gf_log ("", GF_LOG_INFO, "Session between %s"
+                        " and %s is already created. Force"
+                        " creating again.", volinfo->volname, slave);
+
+        ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave);
+        if (ret) {
+                snprintf (errmsg, sizeof (errmsg), "Unable to spawn gsyncd.");
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                goto out;
+        }
+
+        ret = 0;
+out:
+
+        /* single place where errmsg is turned into *op_errstr */
+        if (ret && errmsg[0] != '\0')
+                *op_errstr = gf_strdup (errmsg);
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+int
glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
{
int ret = 0;
int type = 0;
char *volname = NULL;
char *slave = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *down_peerstr = NULL;
+ char *statefile = NULL;
+ char *path_list = NULL;
+ char *conf_path = NULL;
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
char errmsg[PATH_MAX] = {0,};
dict_t *ctx = NULL;
+ gf_boolean_t is_force = 0;
+ gf_boolean_t is_force_blocker = -1;
+ gf_boolean_t is_running = _gf_false;
+ uuid_t uuid = {0};
+ char uuid_str [64] = {0};
+ char *host_uuid = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ struct stat stbuf = {0,};
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
ret = dict_get_int32 (dict, "type", &type);
if (ret < 0) {
@@ -1151,25 +2253,26 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
goto out;
}
- switch (type) {
- case GF_GSYNC_OPTION_TYPE_STATUS:
+ if (type == GF_GSYNC_OPTION_TYPE_STATUS) {
ret = glusterd_verify_gsync_status_opts (dict, op_errstr);
-
goto out;
- case GF_GSYNC_OPTION_TYPE_CONFIG:
- ret = gsync_verify_config_options (dict, op_errstr);
+ }
+ ret = glusterd_op_gsync_args_get (dict, op_errstr,
+ &volname, &slave, &host_uuid);
+ if (ret)
goto out;
- case GF_GSYNC_OPTION_TYPE_ROTATE:
- /* checks same as status mode */
- ret = glusterd_verify_gsync_status_opts(dict, op_errstr);
- goto out;
- }
+ uuid_utoa_r (MY_UUID, uuid_str);
- ret = glusterd_op_gsync_args_get (dict, op_errstr, &volname, &slave);
- if (ret)
+ if (conf->op_version < 2) {
+ gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+ snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+ " support the required op version.");
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
goto out;
+ }
exists = glusterd_check_volume_exists (volname);
ret = glusterd_volinfo_find (volname, &volinfo);
@@ -1182,12 +2285,96 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
goto out;
}
+ ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip,
+ &slave_vol, &conf_path,
+ op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave or confpath details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_get_statefile_name (volinfo, slave, conf_path, &statefile);
+ if (ret) {
+ /* Checking if slave host is pingable, has proper passwordless
+ * ssh login setup */
+ ret = glusterd_verify_slave (volname, slave_ip, slave_vol,
+ op_errstr, &is_force_blocker);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "%s is not a valid slave volume. Error: %s",
+ slave, *op_errstr);
+ goto out;
+ }
+
+ if (!strstr(slave, "::"))
+ snprintf (errmsg, sizeof (errmsg),
+ "%s is not a valid slave url.", slave);
+ else
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to get statefile's name");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "statefile", statefile);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store statefile path");
+ goto out;
+ }
+
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
+ /* Allowing stop force to bypass the statefile check
+ * as this command acts as a fail safe method to stop geo-rep
+ * session. */
+ if ((type == GF_GSYNC_OPTION_TYPE_CONFIG) ||
+ ((type == GF_GSYNC_OPTION_TYPE_STOP) && !is_force) ||
+ (type == GF_GSYNC_OPTION_TYPE_DELETE)) {
+ ret = lstat (statefile, &stbuf);
+ if (ret) {
+ snprintf (errmsg, sizeof(errmsg), "Geo-replication"
+ " session between %s and %s does not exist.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s. statefile = %s",
+ errmsg, statefile);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* Check if all peers that are a part of the volume are up or not */
+ if ((type == GF_GSYNC_OPTION_TYPE_DELETE) ||
+ ((type == GF_GSYNC_OPTION_TYPE_STOP) && !is_force)) {
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = glusterd_are_vol_all_peers_up (volinfo,
+ &conf->peers,
+ &down_peerstr);
+ if (ret == _gf_false) {
+ snprintf (errmsg, sizeof (errmsg), "Peer %s,"
+ " which is a part of %s volume, is"
+ " down. Please bring up the peer and"
+ " retry.", down_peerstr,
+ volinfo->volname);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ GF_FREE (down_peerstr);
+ down_peerstr = NULL;
+ goto out;
+ }
+ }
+ }
+
switch (type) {
case GF_GSYNC_OPTION_TYPE_START:
/* don't attempt to start gsync if replace-brick is
* in progress */
if (glusterd_is_rb_ongoing (volinfo)) {
- snprintf (errmsg, sizeof(errmsg),"replace-brick is in"
+ snprintf (errmsg, sizeof(errmsg), "replace-brick is in"
" progress, not starting geo-replication");
*op_errstr = gf_strdup (errmsg);
ret = -1;
@@ -1195,16 +2382,18 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
}
ret = glusterd_op_verify_gsync_start_options (volinfo, slave,
- op_errstr);
+ conf_path, statefile,
+ op_errstr, is_force);
if (ret)
goto out;
ctx = glusterd_op_get_ctx();
if (ctx) {
- /*gsyncd does a fuse mount to start the geo-rep session*/
+ /* gsyncd does a fuse mount to start
+ * the geo-rep session */
if (!glusterd_is_fuse_available ()) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to open"
- " /dev/fuse (%s), geo-replication start"
- " failed", strerror (errno));
+ gf_log ("glusterd", GF_LOG_ERROR, "Unable to "
+ "open /dev/fuse (%s), geo-replication "
+ "start failed", strerror (errno));
snprintf (errmsg, sizeof(errmsg),
"fuse unvailable");
*op_errstr = gf_strdup (errmsg);
@@ -1215,17 +2404,72 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
break;
case GF_GSYNC_OPTION_TYPE_STOP:
- ret = glusterd_op_verify_gsync_running (volinfo, slave,
- op_errstr);
+ if (!is_force) {
+ ret = glusterd_op_verify_gsync_running (volinfo, slave,
+ conf_path,
+ op_errstr);
+ if (ret) {
+ ret = glusterd_get_local_brickpaths (volinfo,
+ &path_list);
+ if (path_list)
+ ret = -1;
+ }
+ }
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_CONFIG:
+ ret = gsync_verify_config_options (dict, op_errstr, volname);
+ goto out;
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_DELETE:
+ /* Check if the gsync session is still running
+ * If so ask the user to stop geo-replication first.*/
+ ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
+ if (ret) {
+ snprintf (errmsg, sizeof(errmsg), "Geo-replication"
+ " session between %s and %s does not exist.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ } else {
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (_gf_true == is_running) {
+ snprintf (errmsg, sizeof (errmsg), GEOREP
+ " session between %s & %s is "
+ "still active. Please stop the "
+ "session and retry.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to spawn gsyncd");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ }
+
break;
}
out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
static int
-stop_gsync (char *master, char *slave, char **msg)
+stop_gsync (char *master, char *slave, char **msg,
+ char *conf_path, gf_boolean_t is_force)
{
int32_t ret = 0;
int pfd = -1;
@@ -1237,19 +2481,16 @@ stop_gsync (char *master, char *slave, char **msg)
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
- pfd = gsyncd_getpidfile (master, slave, pidfile);
- if (pfd == -2) {
+ pfd = gsyncd_getpidfile (master, slave, pidfile, conf_path);
+ if (pfd == -2 && !is_force) {
gf_log ("", GF_LOG_ERROR, GEOREP" stop validation "
" failed for %s & %s", master, slave);
ret = -1;
goto out;
}
- if (gsync_status_byfd (pfd) == -1) {
+ if (gsync_status_byfd (pfd) == -1 && !is_force) {
gf_log ("", GF_LOG_ERROR, "gsyncd b/w %s & %s is not"
" running", master, slave);
- if (msg)
- *msg = gf_strdup ("Warning: "GEOREP" session was "
- "defunct at stop time");
/* monitor gsyncd already dead */
goto out;
}
@@ -1284,16 +2525,100 @@ stop_gsync (char *master, char *slave, char **msg)
out:
sys_close (pfd);
+
+ if (is_force)
+ ret = 0;
return ret;
}
-static int
-glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
- dict_t *resp_dict);
+/*
+ * glusterd_gsync_op_already_set:
+ *      This function checks whether the op_value is same as in the
+ *      gsyncd.conf file. When the configured value is one of
+ *      true/1/yes/false/0/no the comparison is done on the boolean
+ *      meaning (any op_value other than true/1/yes counts as false);
+ *      otherwise the two strings are compared verbatim.
+ *
+ * RETURN VALUE:
+ *      0 : op_value matches the conf file.
+ *      1 : op_value does not match the conf file, op_param is not
+ *          found in the conf file, or no op_value was given.
+ *     -1 : error
+ */
+
+int
+glusterd_gsync_op_already_set (char* master, char* slave, char* conf_path,
+                               char* op_name, char* op_value)
+{
+        dict_t       *confd       = NULL;
+        char         *op_val_buf  = NULL;
+        int32_t       op_val_conf = 0;
+        int32_t       op_val_cli  = 0;
+        int32_t       ret         = -1;
+        gf_boolean_t  is_bool     = _gf_true;
+
+        /* glusterd_gsync_configure() calls this function even when it has
+         * no op_value; without one there is nothing that could already be
+         * set, and strcmp()/"%s" below must not see a NULL pointer. */
+        if (!op_value)
+                return 1;
+
+        confd = dict_new ();
+        if (!confd) {
+                gf_log ("", GF_LOG_ERROR, "Not able to create dict.");
+                return -1;
+        }
+
+        ret = glusterd_gsync_get_config (master, slave, conf_path,
+                                         confd);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get configuration data"
+                        "for %s(master), %s(slave)", master, slave);
+                goto out;
+        }
+
+        ret = dict_get_param (confd, op_name, &op_val_buf);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "Unable to get op_value "
+                        "for %s(master), %s(slave). Please check gsync "
+                        "config file.", master, slave);
+                ret = 1;
+                goto out;
+        }
+
+        gf_log("",GF_LOG_DEBUG, "val_cli:%s  val_conf:%s",op_value,op_val_buf);
+
+        /* classify the configured value */
+        if (!strcmp(op_val_buf,"true") || !strcmp(op_val_buf,"1")
+            || !strcmp(op_val_buf,"yes")) {
+                op_val_conf = 1;
+        } else if(!strcmp(op_val_buf,"false") || !strcmp(op_val_buf,"0")
+                  || !strcmp(op_val_buf,"no")) {
+                op_val_conf = 0;
+        } else {
+                is_bool = _gf_false;
+        }
+
+        if (is_bool) {
+                if (!strcmp(op_value,"true") || !strcmp(op_value,"1")
+                    || !strcmp(op_value,"yes")) {
+                        op_val_cli = 1;
+                } else {
+                        op_val_cli = 0;
+                }
+
+                if ( op_val_cli == op_val_conf ) {
+                        ret = 0;
+                        goto out;
+                }
+        } else {
+                if (!strcmp(op_val_buf,op_value)) {
+                        ret = 0;
+                        goto out;
+                }
+        }
+
+        ret = 1;
+
+out:
+        dict_unref(confd);
+        return ret;
+}
static int
glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
- dict_t *dict, dict_t *resp_dict, char **op_errstr)
+ char *path_list, dict_t *dict,
+ dict_t *resp_dict, char **op_errstr)
{
int32_t ret = -1;
char *op_name = NULL;
@@ -1302,6 +2627,13 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
glusterd_conf_t *priv = NULL;
char *subop = NULL;
char *master = NULL;
+ char *conf_path = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ struct stat stbuf = {0, };
+ gf_boolean_t restart_required = _gf_true;
+ char **resopt = NULL;
+ gf_boolean_t op_already_set = _gf_false;
GF_ASSERT (slave);
GF_ASSERT (op_errstr);
@@ -1336,10 +2668,17 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
goto out;
}
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
+ goto out;
+ }
+
master = "";
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir);
+ runner_argprintf (&runner, "%s", conf_path);
if (volinfo) {
master = volinfo->volname;
runner_argprintf (&runner, ":%s", master);
@@ -1349,6 +2688,24 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
runner_add_arg (&runner, op_name);
if (op_value)
runner_add_arg (&runner, op_value);
+
+ if ( strcmp(op_name,"checkpoint") != 0 ) {
+ ret = glusterd_gsync_op_already_set(master,slave,conf_path,
+ op_name,op_value);
+ if (ret == -1) {
+ gf_log ("", GF_LOG_WARNING,
+ "glusterd_gsync_op_already_set failed.");
+ gf_asprintf (op_errstr, GEOREP" config-%s failed for "
+ "%s %s", subop, master, slave);
+ goto out;
+ }
+ if (ret == 0) {
+ gf_log("", GF_LOG_DEBUG, "op_value is already set");
+ op_already_set = _gf_true;
+ goto out;
+ }
+ }
+
synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
synclock_lock (&priv->big_lock);
@@ -1362,22 +2719,65 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
goto out;
}
+
+ if ((!strcmp (op_name, "state_file")) && (op_value)) {
+
+ ret = lstat (op_value, &stbuf);
+ if (ret) {
+ ret = dict_get_str (dict, "slave_ip", &slave_ip);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave IP.");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "slave_vol", &slave_vol);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave volume name.");
+ goto out;
+ }
+
+ ret = glusterd_create_status_file (volinfo->volname, slave,
+ slave_ip, slave_vol,
+ "Switching Status File");
+ if (ret || lstat (op_value, &stbuf)) {
+ gf_log ("", GF_LOG_ERROR, "Unable to create %s"
+ ". Error : %s", op_value,
+ strerror (errno));
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
ret = 0;
gf_asprintf (op_errstr, "config-%s successful", subop);
out:
- if (!ret && volinfo) {
+ if (!ret && volinfo && !op_already_set) {
+ for (resopt = gsync_no_restart_opts; *resopt; resopt++) {
+ restart_required = _gf_true;
+ if (!strcmp ((*resopt), op_name)){
+ restart_required = _gf_false;
+ break;
+ }
+ }
+
+ if (restart_required) {
ret = glusterd_check_restart_gsync_session (volinfo, slave,
- resp_dict);
+ resp_dict, path_list,
+ conf_path, 0);
if (ret)
- *op_errstr = gf_strdup ("internal error");
+ *op_errstr = gf_strdup ("internal error");
+ }
}
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-static int
+int
glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen)
{
int ret = 0;
@@ -1401,7 +2801,6 @@ glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen)
char *p = buf + len - 1;
while (isspace (*p))
*p-- = '\0';
- ret = 0;
}
} else if (ret < 0)
gf_log ("", GF_LOG_ERROR, "Status file of gsyncd is corrupt");
@@ -1411,20 +2810,146 @@ glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen)
}
static int
-glusterd_gsync_fetch_status_extra (char *path, char *buf, size_t blen)
+dict_get_param (dict_t *dict, char *key, char **param)
+{
+        /*
+         * Looks up key in dict; when that fails, retries with a copy of
+         * key in which every '-' and '_' is replaced by the opposite of
+         * the first such character found (so "state_file" is retried as
+         * "state-file" and vice versa).
+         *
+         * Returns 0 with *param set on success; -1 when the copy cannot
+         * be allocated or key has no '-'/'_'; otherwise the result of the
+         * second dict_get_str().
+         */
+        char *dk  = NULL;
+        char *s   = NULL;
+        char  x   = '\0';
+        int   ret = 0;
+
+        if (dict_get_str (dict, key, param) == 0)
+                return 0;
+
+        dk = gf_strdup (key);
+        /* it is the copy whose allocation can fail, not key */
+        if (!dk)
+                return -1;
+
+        s = strpbrk (dk, "-_");
+        if (!s) {
+                /* nothing to swap; leave through "out" so dk is freed */
+                ret = -1;
+                goto out;
+        }
+        x = (*s == '-') ? '_' : '-';
+        *s++ = x;
+        while ((s = strpbrk (s, "-_")))
+                *s++ = x;
+
+        ret = dict_get_str (dict, dk, param);
+
+out:
+        GF_FREE (dk);
+        return ret;
+}
+
+/* Copies the NUL-terminated string src into dst (dst_size bytes),
+ * truncating when it does not fit; dst is always NUL-terminated. */
+static void
+gsync_status_copy_field (char *dst, size_t dst_size, const char *src)
+{
+        size_t len = 0;
+
+        if (!dst_size)
+                return;
+
+        len = strlen (src);
+        if (len >= dst_size)
+                len = dst_size - 1;
+        memcpy (dst, src, len);
+        dst[len] = '\0';
+}
+
+/* Returns the value part of a "key: value" field, or NULL when the field
+ * or its value is missing. The field is modified in place. When
+ * quote_tail is non-zero the first character of the value and its last
+ * quote_tail characters are dropped. */
+static char *
+gsync_status_field_value (char *field, size_t quote_tail)
+{
+        char   *value    = NULL;
+        char   *save_ptr = NULL;
+        size_t  len      = 0;
+
+        if (!field)
+                return NULL;
+
+        /* drop the key; an empty delimiter set yields the whole rest */
+        if (!strtok_r (field, ":", &save_ptr))
+                return NULL;
+        value = strtok_r (NULL, "", &save_ptr);
+        if (!value)
+                return NULL;
+
+        /* skip the character between the ':' and the value */
+        value++;
+
+        if (quote_tail && (*value != '\0')) {
+                value++;
+                len = strlen (value);
+                value[(len > quote_tail) ? (len - quote_tail) : 0] = '\0';
+        }
+
+        return value;
+}
+
+/*
+ * glusterd_parse_gsync_status:
+ *      Splits buf (modified in place) at ',' into at most 8
+ *      "key: value" fields and stores the values, in order, in
+ *      slave_node, files_syncd, purges_remaining, total_files_skipped,
+ *      files_remaining, worker_status, bytes_remaining and crawl_status
+ *      of sts_val. A missing field or value is stored as "N/A"; fields
+ *      beyond the eighth are ignored.
+ *
+ *      For fields 0, 5 and 7 the first character of the value and its
+ *      last character (last two for field 7) are dropped - presumably
+ *      the quotes and the closing bracket of gsyncd's status output;
+ *      TODO confirm against the producer of buf.
+ *
+ *      NOTE(review): sizeof is applied to the members of sts_val, which
+ *      assumes they are char arrays like checkpoint_status is treated
+ *      elsewhere in this file - confirm against gf_gsync_status_t.
+ *
+ * RETURN VALUE:
+ *      0 on success, -1 when buf is NULL or memory runs out.
+ */
+static int
+glusterd_parse_gsync_status (char *buf, gf_gsync_status_t *sts_val)
+{
+        int      ret           = -1;
+        int      i             = -1;
+        int      num_of_fields = 8;
+        size_t   quote_tail    = 0;
+        char    *token         = NULL;
+        char    *value         = NULL;
+        char   **tokens        = NULL;
+        char    *save_ptr      = NULL;
+        char     na_buf[]      = "N/A";
+
+        if (!buf) {
+                gf_log ("", GF_LOG_ERROR, "Empty buf");
+                goto out;
+        }
+
+        tokens = calloc (num_of_fields, sizeof (char *));
+        if (!tokens) {
+                gf_log ("", GF_LOG_ERROR, "Out of memory");
+                goto out;
+        }
+
+        /* never store more fields than tokens[] has room for */
+        for (i = 0, token = strtok_r (buf, ",", &save_ptr);
+             token && (i < num_of_fields);
+             i++, token = strtok_r (NULL, ",", &save_ptr)) {
+                tokens[i] = gf_strdup (token);
+                if (!tokens[i]) {
+                        gf_log ("", GF_LOG_ERROR, "Out of memory");
+                        goto out;
+                }
+        }
+
+        for (i = 0; i < num_of_fields; i++) {
+                if (i == 7)
+                        quote_tail = 2;
+                else if ((i == 0) || (i == 5))
+                        quote_tail = 1;
+                else
+                        quote_tail = 0;
+
+                value = gsync_status_field_value (tokens[i], quote_tail);
+                if (!value)
+                        value = na_buf;
+
+                switch (i) {
+                case 0:
+                        gsync_status_copy_field (sts_val->slave_node,
+                                        sizeof (sts_val->slave_node),
+                                        value);
+                        break;
+                case 1:
+                        gsync_status_copy_field (sts_val->files_syncd,
+                                        sizeof (sts_val->files_syncd),
+                                        value);
+                        break;
+                case 2:
+                        gsync_status_copy_field (sts_val->purges_remaining,
+                                        sizeof (sts_val->purges_remaining),
+                                        value);
+                        break;
+                case 3:
+                        gsync_status_copy_field (sts_val->total_files_skipped,
+                                        sizeof (sts_val->total_files_skipped),
+                                        value);
+                        break;
+                case 4:
+                        gsync_status_copy_field (sts_val->files_remaining,
+                                        sizeof (sts_val->files_remaining),
+                                        value);
+                        break;
+                case 5:
+                        gsync_status_copy_field (sts_val->worker_status,
+                                        sizeof (sts_val->worker_status),
+                                        value);
+                        break;
+                case 6:
+                        gsync_status_copy_field (sts_val->bytes_remaining,
+                                        sizeof (sts_val->bytes_remaining),
+                                        value);
+                        break;
+                case 7:
+                        gsync_status_copy_field (sts_val->crawl_status,
+                                        sizeof (sts_val->crawl_status),
+                                        value);
+                        break;
+                default:
+                        break;
+                }
+        }
+
+        ret = 0;
+out:
+        /* tokens is NULL when buf was NULL or calloc() failed */
+        if (tokens) {
+                for (i = 0; i < num_of_fields; i++)
+                        if (tokens[i])
+                                GF_FREE(tokens[i]);
+                /* the array itself came from calloc(), not GF_CALLOC */
+                free (tokens);
+        }
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+static int
+glusterd_gsync_fetch_status_extra (char *path, gf_gsync_status_t *sts_val)
{
char sockpath[PATH_MAX] = {0,};
struct sockaddr_un sa = {0,};
- size_t l = 0;
int s = -1;
struct pollfd pfd = {0,};
int ret = 0;
- l = strlen (buf);
- /* seek to end of data in buf */
- buf += l;
- blen -= l;
-
glusterd_set_socket_filepath (path, sockpath, sizeof (sockpath));
strncpy(sa.sun_path, sockpath, sizeof(sa.sun_path));
@@ -1452,74 +2977,56 @@ glusterd_gsync_fetch_status_extra (char *path, char *buf, size_t blen)
ret = -1;
goto out;
}
- ret = read(s, buf, blen);
+ ret = read(s, sts_val->checkpoint_status,
+ sizeof(sts_val->checkpoint_status));
/* we expect a terminating 0 byte */
- if (ret == 0 || (ret > 0 && buf[ret - 1]))
+ if (ret == 0 || (ret > 0 && sts_val->checkpoint_status[ret - 1]))
ret = -1;
- if (ret > 0)
+ if (ret > 0) {
ret = 0;
+ }
- out:
+out:
close (s);
return ret;
}
-static int
-dict_get_param (dict_t *dict, char *key, char **param)
-{
- char *dk = NULL;
- char *s = NULL;
- char x = '\0';
- int ret = 0;
-
- if (dict_get_str (dict, key, param) == 0)
- return 0;
-
- dk = gf_strdup (key);
- if (!key)
- return -1;
-
- s = strpbrk (dk, "-_");
- if (!s)
- return -1;
- x = (*s == '-') ? '_' : '-';
- *s++ = x;
- while ((s = strpbrk (s, "-_")))
- *s++ = x;
-
- ret = dict_get_str (dict, dk, param);
-
- GF_FREE (dk);
- return ret;
-}
-
-static int
-glusterd_read_status_file (char *master, char *slave,
- dict_t *dict, char *node)
+int
+glusterd_read_status_file (glusterd_volinfo_t *volinfo, char *slave,
+ char *conf_path, dict_t *dict, char *node)
{
- glusterd_conf_t *priv = NULL;
- int ret = 0;
- char *statefile = NULL;
- char buf[1024] = {0, };
- char nds[1024] = {0, };
- char mst[1024] = {0, };
- char slv[1024] = {0, };
- char sts[1024] = {0, };
- char *bufp = NULL;
- dict_t *confd = NULL;
- int gsync_count = 0;
- int status = 0;
- char *dyn_node = NULL;
+ char brick_state_file[PATH_MAX] = "";
+ char brick_path[PATH_MAX] = "";
+ char *georep_session_wrkng_dir = NULL;
+ char *master = NULL;
+ char tmp[1024] = "";
+ char sts_val_name[1024] = "";
+ char monitor_status[NAME_MAX] = "";
+ char *statefile = NULL;
+ char *socketfile = NULL;
+ dict_t *confd = NULL;
+ int gsync_count = 0;
+ int i = 0;
+ int ret = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ gf_gsync_status_t *sts_val = NULL;
+ glusterd_conf_t *priv = NULL;
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
+ GF_ASSERT (volinfo);
+
+ master = volinfo->volname;
confd = dict_new ();
- if (!dict)
+ if (!dict) {
+ gf_log ("", GF_LOG_ERROR, "Not able to create dict.");
return -1;
+ }
priv = THIS->private;
- ret = glusterd_gsync_get_config (master, slave, priv->workdir,
+
+ ret = glusterd_gsync_get_config (master, slave, conf_path,
confd);
if (ret) {
gf_log ("", GF_LOG_ERROR, "Unable to get configuration data"
@@ -1528,116 +3035,188 @@ glusterd_read_status_file (char *master, char *slave,
}
- ret = gsync_status (master, slave, &status);
- if (ret == 0 && status == -1) {
- strncpy (buf, "defunct", sizeof (buf));
- goto done;
- } else if (ret == -1)
- goto out;
-
ret = dict_get_param (confd, "state_file", &statefile);
- if (ret)
- goto out;
- ret = glusterd_gsync_read_frm_status (statefile, buf, sizeof (buf));
if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name "
+ "for %s(master), %s(slave). Please check gsync "
+ "config file.", master, slave);
+ goto out;
+ }
+
+ ret = glusterd_gsync_read_frm_status (statefile, monitor_status,
+ sizeof (monitor_status));
+ if (ret <= 0) {
gf_log ("", GF_LOG_ERROR, "Unable to read the status"
"file for %s(master), %s(slave)", master, slave);
- strncpy (buf, "defunct", sizeof (buf));
- goto done;
+ strncpy (monitor_status, "defunct", sizeof (monitor_status));
}
- if (strcmp (buf, "OK") != 0)
- goto done;
- ret = dict_get_param (confd, "state_socket_unencoded", &statefile);
- if (ret)
+ ret = dict_get_param (confd, "georep_session_working_dir",
+ &georep_session_wrkng_dir);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get geo-rep session's "
+ "working directory name for %s(master), %s(slave). "
+ "Please check gsync config file.", master, slave);
goto out;
- ret = glusterd_gsync_fetch_status_extra (statefile, buf, sizeof (buf));
+ }
+
+ ret = dict_get_param (confd, "state_socket_unencoded", &socketfile);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch extra status"
- "for %s(master), %s(slave)", master, slave);
- /* there is a slight chance that this occurs due to race
- * -- in that case, the following options all seem bad:
- *
- * - suppress irregurlar behavior by just leaving status
- * on "OK"
- * - freak out users with a misleading "defunct"
- * - overload the meaning of the regular error signal
- * mechanism of gsyncd, that is, when status is "faulty"
- *
- * -- so we just come up with something new...
- */
- strncpy (buf, "N/A", sizeof (buf));
- goto done;
+ gf_log ("", GF_LOG_ERROR, "Unable to get socket file's name "
+ "for %s(master), %s(slave). Please check gsync "
+ "config file.", master, slave);
+ goto out;
}
- done:
ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
-
if (ret)
- gsync_count = 1;
- else
- gsync_count++;
+ gsync_count = 0;
- (void) snprintf (nds, sizeof (nds), "node%d", gsync_count);
- dyn_node = gf_strdup (node);
- if (!dyn_node)
- goto out;
- ret = dict_set_dynstr (dict, nds, dyn_node);
- if (ret) {
- GF_FREE (dyn_node);
- goto out;
- }
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
- snprintf (mst, sizeof (mst), "master%d", gsync_count);
- master = gf_strdup (master);
- if (!master)
- goto out;
- ret = dict_set_dynstr (dict, mst, master);
- if (ret) {
- GF_FREE (master);
- goto out;
- }
+ sts_val = GF_CALLOC (1, sizeof(gf_gsync_status_t),
+ gf_common_mt_gsync_status_t);
+ if (!sts_val) {
+ gf_log ("", GF_LOG_ERROR, "Out Of Memory");
+ goto out;
+ }
- snprintf (slv, sizeof (slv), "slave%d", gsync_count);
- slave = gf_strdup (slave);
- if (!slave)
- goto out;
- ret = dict_set_dynstr (dict, slv, slave);
- if (ret) {
- GF_FREE (slave);
- goto out;
- }
+ /* Creating the brick state file's path */
+ memset(brick_state_file, '\0', PATH_MAX);
+ memcpy (brick_path, brickinfo->path, PATH_MAX - 1);
+ for (i = 0; i < strlen(brick_path) - 1; i++)
+ if (brick_path[i] == '/')
+ brick_path[i] = '_';
+ ret = snprintf(brick_state_file, PATH_MAX - 1, "%s%s.status",
+ georep_session_wrkng_dir, brick_path);
+ brick_state_file[ret] = '\0';
+
+ gf_log ("", GF_LOG_DEBUG, "brick_state_file = %s", brick_state_file);
+
+ memset (tmp, '\0', sizeof(tmp));
+
+ ret = glusterd_gsync_read_frm_status (brick_state_file,
+ tmp, sizeof (tmp));
+ if (ret <= 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read the status"
+ "file for %s brick for %s(master), %s(slave) "
+ "session", brickinfo->path, master, slave);
+ memcpy (sts_val->slave_node, slave, strlen(slave));
+ sts_val->slave_node[strlen(slave)] = '\0';
+ ret = snprintf (sts_val->worker_status, sizeof(sts_val->worker_status), "N/A");
+ sts_val->worker_status[ret] = '\0';
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ ret = snprintf (sts_val->files_syncd, sizeof(sts_val->files_syncd), "N/A");
+ sts_val->files_syncd[ret] = '\0';
+ ret = snprintf (sts_val->purges_remaining, sizeof(sts_val->purges_remaining), "N/A");
+ sts_val->purges_remaining[ret] = '\0';
+ ret = snprintf (sts_val->total_files_skipped, sizeof(sts_val->total_files_skipped), "N/A");
+ sts_val->total_files_skipped[ret] = '\0';
+ ret = snprintf (sts_val->files_remaining, sizeof(sts_val->files_remaining), "N/A");
+ sts_val->files_remaining[ret] = '\0';
+ ret = snprintf (sts_val->bytes_remaining, sizeof(sts_val->bytes_remaining), "N/A");
+ sts_val->bytes_remaining[ret] = '\0';
+ goto store_status;
+ }
- snprintf (sts, sizeof (slv), "status%d", gsync_count);
- bufp = gf_strdup (buf);
- if (!bufp)
- goto out;
- ret = dict_set_dynstr (dict, sts, bufp);
- if (ret) {
- GF_FREE (bufp);
- goto out;
+ ret = glusterd_gsync_fetch_status_extra (socketfile, sts_val);
+ if (ret || strlen(sts_val->checkpoint_status) == 0) {
+ gf_log ("", GF_LOG_DEBUG, "No checkpoint status"
+ "for %s(master), %s(slave)", master, slave);
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ }
+
+ ret = glusterd_parse_gsync_status (tmp, sts_val);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to parse the gsync status for %s",
+ brickinfo->path);
+ memcpy (sts_val->slave_node, slave, strlen(slave));
+ sts_val->slave_node[strlen(slave)] = '\0';
+ ret = snprintf (sts_val->worker_status, sizeof(sts_val->worker_status), "N/A");
+ sts_val->worker_status[ret] = '\0';
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ ret = snprintf (sts_val->files_syncd, sizeof(sts_val->files_syncd), "N/A");
+ sts_val->files_syncd[ret] = '\0';
+ ret = snprintf (sts_val->purges_remaining, sizeof(sts_val->purges_remaining), "N/A");
+ sts_val->purges_remaining[ret] = '\0';
+ ret = snprintf (sts_val->total_files_skipped, sizeof(sts_val->total_files_skipped), "N/A");
+ sts_val->total_files_skipped[ret] = '\0';
+ ret = snprintf (sts_val->files_remaining, sizeof(sts_val->files_remaining), "N/A");
+ sts_val->files_remaining[ret] = '\0';
+ ret = snprintf (sts_val->bytes_remaining, sizeof(sts_val->bytes_remaining), "N/A");
+ sts_val->bytes_remaining[ret] = '\0';
+ }
+
+store_status:
+ if ((strcmp (monitor_status, "Stable"))) {
+ memcpy (sts_val->worker_status, monitor_status, strlen(monitor_status));
+ sts_val->worker_status[strlen(monitor_status)] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ }
+
+ if (strcmp (sts_val->worker_status, "Active")) {
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ }
+
+ if (!strcmp (sts_val->slave_node, "N/A")) {
+ memcpy (sts_val->slave_node, slave, strlen(slave));
+ sts_val->slave_node[strlen(slave)] = '\0';
+ }
+
+ memcpy (sts_val->node, node, strlen(node));
+ sts_val->node[strlen(node)] = '\0';
+ memcpy (sts_val->brick, brickinfo->path, strlen(brickinfo->path));
+ sts_val->brick[strlen(brickinfo->path)] = '\0';
+ memcpy (sts_val->master, master, strlen(master));
+ sts_val->master[strlen(master)] = '\0';
+
+ snprintf (sts_val_name, sizeof (sts_val_name), "status_value%d", gsync_count);
+ ret = dict_set_bin (dict, sts_val_name, sts_val, sizeof(gf_gsync_status_t));
+ if (ret) {
+ GF_FREE (sts_val);
+ goto out;
+ }
+
+ gsync_count++;
+ sts_val = NULL;
}
+
ret = dict_set_int32 (dict, "gsync-count", gsync_count);
if (ret)
goto out;
- ret = 0;
- out:
+out:
dict_destroy (confd);
- gf_log ("", GF_LOG_DEBUG, "Returning %d ", ret);
- return ret;
+ return 0;
}
-static int
+int
glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
- dict_t *resp_dict)
+ dict_t *resp_dict, char *path_list,
+ char *conf_path, gf_boolean_t is_force)
{
int ret = 0;
- uuid_t uuid = {0, };
glusterd_conf_t *priv = NULL;
char *status_msg = NULL;
+ gf_boolean_t is_running = _gf_false;
GF_ASSERT (volinfo);
GF_ASSERT (slave);
@@ -1646,18 +3225,22 @@ glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
priv = THIS->private;
- if (glusterd_gsync_get_uuid (slave, volinfo, uuid))
- /* session does not exist, nothing to do */
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (!ret && (_gf_true != is_running))
+ /* gsynd not running, nothing to do */
goto out;
- if (uuid_compare (MY_UUID, uuid) == 0) {
- ret = stop_gsync (volinfo->volname, slave, &status_msg);
- if (ret == 0 && status_msg)
- ret = dict_set_str (resp_dict, "gsync-status",
- status_msg);
- if (ret == 0)
- ret = glusterd_start_gsync (volinfo, slave,
- uuid_utoa(MY_UUID), NULL);
- }
+
+ ret = stop_gsync (volinfo->volname, slave, &status_msg,
+ conf_path, is_force);
+ if (ret == 0 && status_msg)
+ ret = dict_set_str (resp_dict, "gsync-status",
+ status_msg);
+ if (ret == 0)
+ ret = glusterd_start_gsync (volinfo, slave, path_list,
+ conf_path, uuid_utoa(MY_UUID),
+ NULL);
out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
@@ -1665,7 +3248,7 @@ glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
}
static int32_t
-glusterd_marker_create_volfile (glusterd_volinfo_t *volinfo)
+glusterd_marker_changelog_create_volfile (glusterd_volinfo_t *volinfo)
{
int32_t ret = 0;
@@ -1689,58 +3272,82 @@ out:
}
static int
-glusterd_set_marker_gsync (glusterd_volinfo_t *volinfo)
+glusterd_set_gsync_knob (glusterd_volinfo_t *volinfo, char *key, int *vc) /*
+ * Make sure the boolean volume option `key` is switched on for `volinfo`.
+ *
+ * The current value is read with glusterd_volinfo_get_boolean(); when it is
+ * off, *vc is set to 1 and the option is set to "on" through
+ * glusterd_gsync_volinfo_dict_set(). When the option is already on, *vc is
+ * left untouched and 0 is returned.
+ *
+ * Returns 0 on success, -1 when the option cannot be read or "on" cannot be
+ * duplicated, otherwise whatever glusterd_gsync_volinfo_dict_set() returns.
+ */
{
- int ret = -1;
- int marker_set = _gf_false;
- char *gsync_status = NULL;
+ int ret = -1;
+ int conf_enabled = _gf_false;
+ char *knob_on = NULL;
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
- marker_set = glusterd_volinfo_get_boolean (volinfo, VKEY_MARKER_XTIME);
- if (marker_set == -1) {
- gf_log ("", GF_LOG_ERROR, "failed to get the marker status");
- ret = -1;
+ conf_enabled = glusterd_volinfo_get_boolean (volinfo, key);
+ if (conf_enabled == -1) {
+ gf_log ("", GF_LOG_ERROR,
+ "failed to get key %s from volinfo", key);
goto out;
}
- if (marker_set == _gf_false) {
- gsync_status = gf_strdup ("on");
- if (gsync_status == NULL) {
+ ret = 0; /* the option could be read: default to success */
+ if (conf_enabled == _gf_false) {
+ *vc = 1; /* flagged before the change is attempted, so it stays set even if a step below fails */
+ knob_on = gf_strdup ("on");
+ if (knob_on == NULL) {
ret = -1;
goto out;
}
ret = glusterd_gsync_volinfo_dict_set (volinfo,
- VKEY_MARKER_XTIME, gsync_status);
- if (ret < 0)
- goto out;
-
- ret = glusterd_marker_create_volfile (volinfo);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Setting dict failed");
- goto out;
- }
+ key, knob_on); /* NOTE(review): presumably the volinfo dict takes ownership of knob_on -- confirm */
}
- ret = 0;
-out:
+ out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+/*
+ * Enable every volume option a geo-replication session relies on and, when
+ * at least one of them had to be switched on, regenerate the volfiles once
+ * at the end.
+ *
+ * Returns 0 on success, otherwise the first non-zero status reported by
+ * glusterd_set_gsync_knob() or glusterd_marker_changelog_create_volfile().
+ */
+static int
+glusterd_set_gsync_confs (glusterd_volinfo_t *volinfo)
+{
+        /*
+         * VKEY_MARKER_XTIME_FORCE (ignore-pid-check) is enabled blindly as
+         * it could be needed for cascading setups.
+         */
+        char   *knobs[] = { VKEY_MARKER_XTIME,
+                            VKEY_MARKER_XTIME_FORCE,
+                            VKEY_CHANGELOG };
+        size_t  nknobs  = sizeof (knobs) / sizeof (knobs[0]);
+        size_t  idx     = 0;
+        int     touched = 0;
+        int     status  = -1;
+
+        for (idx = 0; idx < nknobs; idx++) {
+                status = glusterd_set_gsync_knob (volinfo, knobs[idx],
+                                                  &touched);
+                if (status)
+                        return status;
+        }
+
+        if (touched)
+                status = glusterd_marker_changelog_create_volfile (volinfo);
+
+        return status;
+}
static int
glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo,
- char *slave, dict_t *rsp_dict,
- char *node)
+ char *slave, char *conf_path,
+ dict_t *rsp_dict, char *node)
{
+ char *statefile = NULL;
uuid_t uuid = {0, };
glusterd_conf_t *priv = NULL;
int ret = 0;
+ struct stat stbuf = {0, };
GF_ASSERT (volinfo);
GF_ASSERT (slave);
@@ -1750,19 +3357,38 @@ glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo,
priv = THIS->private;
ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if ((ret == 0) && (uuid_compare (MY_UUID, uuid) != 0))
- goto out;
-
if (ret) {
- ret = 0;
gf_log ("", GF_LOG_INFO, "geo-replication status %s %s :"
"session is not active", volinfo->volname, slave);
- goto out;
+
+ ret = glusterd_get_statefile_name (volinfo, slave,
+ conf_path, &statefile);
+ if (ret) {
+ if (!strstr(slave, "::"))
+ gf_log ("", GF_LOG_INFO,
+ "%s is not a valid slave url.", slave);
+ else
+ gf_log ("", GF_LOG_INFO, "Unable to get"
+ " statefile's name");
+ ret = 0;
+ goto out;
+ }
+
+ ret = lstat (statefile, &stbuf);
+ if (ret) {
+ gf_log ("", GF_LOG_INFO, "%s statefile not present.",
+ statefile);
+ ret = 0;
+ goto out;
+ }
}
- ret = glusterd_read_status_file (volinfo->volname,
- slave, rsp_dict, node);
- out:
+ ret = glusterd_read_status_file (volinfo, slave, conf_path,
+ rsp_dict, node);
+out:
+ if (statefile)
+ GF_FREE (statefile);
+
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -1813,6 +3439,7 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
char *slave = NULL;
char *volname = NULL;
+ char *conf_path = NULL;
char errmsg[PATH_MAX] = {0, };
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
@@ -1850,7 +3477,14 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
goto out;
}
- ret = glusterd_get_gsync_status_mst_slv (volinfo, slave,
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
+ goto out;
+ }
+
+ ret = glusterd_get_gsync_status_mst_slv (volinfo, slave, conf_path,
rsp_dict, my_hostname);
out:
@@ -1859,319 +3493,431 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
}
+/*
+ * Delete a geo-replication session.
+ *
+ * Runs "GSYNCD_PREFIX/gsyncd --delete -c <conf_path> [:<master>] <slave>"
+ * (conf_path is read from dict["conf_path"]; the ":<master>" argument is only
+ * added when volinfo is given) with the big lock released while the command
+ * runs, then removes the session directory
+ * <workdir>/GEOREP/<master>_<slave_ip>_<slave_vol>. A directory that does not
+ * exist is not treated as an error.
+ *
+ * volinfo may be NULL; master is then the empty string. path_list is not
+ * used by this function.
+ *
+ * Returns 0 on success (with *op_errstr set to "delete successful"),
+ * non-zero on failure.
+ */
static int
-glusterd_send_sigstop (pid_t pid)
-{
- int ret = 0;
- ret = kill (pid, SIGSTOP);
- if (ret)
- gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGSTOP signal");
- return ret;
-}
-
-static int
-glusterd_send_sigcont (pid_t pid)
+glusterd_gsync_delete (glusterd_volinfo_t *volinfo, char *slave, char *slave_ip,
+ char *slave_vol, char *path_list, dict_t *dict,
+ dict_t *resp_dict, char **op_errstr)
{
- int ret = 0;
- ret = kill (pid, SIGCONT);
- if (ret)
- gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGCONT signal");
- return ret;
-}
-
-/*
- * Log rotations flow is something like this:
- * - Send SIGSTOP to process group (this will stop monitor/worker process
- * and also the slave if it's local)
- * - Rotate log file for monitor/worker
- * - Rotate log file for slave if it's local
- * - Send SIGCONT to the process group. Monitor wakes up, kills the worker
- * (this is done in the SIGCONT handler), which results in the termination
- * of the slave (local/remote). After returning from signal handler,
- * monitor detects absence of worker and starts it again, which in-turn
- * starts the slave.
- */
-static int
-glusterd_send_log_rotate_signal (pid_t pid, char *logfile1, char *logfile2)
-{
- int ret = 0;
- char rlogfile[PATH_MAX] = {0,};
- time_t rottime = 0;
-
- ret = glusterd_send_sigstop (-pid);
- rottime = time (NULL);
+ int32_t ret = -1;
+ runner_t runner = {0,};
+ glusterd_conf_t *priv = NULL;
+ char *master = NULL;
+ char *gl_workdir = NULL;
+ char geo_rep_dir[PATH_MAX] = "";
+ char *conf_path = NULL;
- snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile1,
- (uint64_t) rottime);
- ret = rename (logfile1, rlogfile);
- if (ret)
- gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep log file");
+ GF_ASSERT (slave);
+ GF_ASSERT (slave_ip);
+ GF_ASSERT (slave_vol);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (dict);
+ GF_ASSERT (resp_dict);
- if (!*logfile2) {
- gf_log ("", GF_LOG_DEBUG, "Slave is not local,"
- " skipping rotation");
- ret = 0;
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
goto out;
}
- (void) snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile2,
- (uint64_t) rottime);
- ret = rename (logfile2, rlogfile);
- if (ret)
- gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep slave"
- " log file");
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
+ goto out;
+ }
- out:
- ret = glusterd_send_sigcont (-pid);
+ gl_workdir = priv->workdir;
+ master = "";
+ runinit (&runner);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ "--delete", "-c", NULL);
+ runner_argprintf (&runner, "%s", conf_path);
- return ret;
-}
+ if (volinfo) {
+ master = volinfo->volname;
+ runner_argprintf (&runner, ":%s", master);
+ }
+ runner_add_arg (&runner, slave);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ /* do not hold the big lock while the external command runs */
+ synclock_unlock (&priv->big_lock);
+ ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "gsyncd failed to "
+ "delete session info for %s and %s peers",
+ master, slave);
-static int
-glusterd_get_pid_from_file (char *master, char *slave, pid_t *pid)
-{
- int ret = -1;
- int pfd = 0;
- char pidfile[PATH_MAX] = {0,};
- char buff[1024] = {0,};
+ gf_asprintf (op_errstr, "gsyncd failed to "
+ "delete session info for %s and %s peers",
+ master, slave);
- pfd = gsyncd_getpidfile (master, slave, pidfile);
- if (pfd == -2) {
- gf_log ("", GF_LOG_ERROR, GEOREP" log-rotate validation "
- " failed for %s & %s", master, slave);
- goto out;
- }
- if (gsync_status_byfd (pfd) == -1) {
- gf_log ("", GF_LOG_ERROR, "gsyncd b/w %s & %s is not"
- " running", master, slave);
goto out;
}
- if (pfd < 0)
- goto out;
+ /*
+ * Use master rather than volinfo->volname: volinfo may be NULL here
+ * (see the check above), in which case master is "".
+ */
+ ret = snprintf (geo_rep_dir, sizeof(geo_rep_dir),
+ "%s/"GEOREP"/%s_%s_%s", gl_workdir,
+ master, slave_ip, slave_vol);
+ /* a truncated path must not be handed to rmdir() */
+ if ((ret < 0) || ((size_t) ret >= sizeof(geo_rep_dir))) {
+ gf_log ("", GF_LOG_ERROR, "Geo Rep Dir path is too long.");
+ ret = -1;
+ goto out;
+ }
- ret = read (pfd, buff, 1024);
- if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, GEOREP" cannot read pid from pid-file");
- goto out;
+ ret = rmdir (geo_rep_dir);
+ if (ret) {
+ if (errno == ENOENT)
+ gf_log ("", GF_LOG_DEBUG, "Geo Rep Dir(%s) Not Present.",
+ geo_rep_dir);
+ else {
+ gf_log ("", GF_LOG_ERROR, "Unable to delete "
+ "Geo Rep Dir(%s). Error: %s", geo_rep_dir,
+ strerror (errno));
+ goto out;
+ }
}
-
- *pid = strtol (buff, NULL, 10);
ret = 0;
+ gf_asprintf (op_errstr, "delete successful");
+
out:
- sys_close(pfd);
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-static int
-glusterd_do_gsync_log_rotate (char *master, char *slave, uuid_t *uuid, char **op_errstr)
+/*
+ * Run the helper program GSYNCD_PREFIX/peer_<command> and hand its output
+ * back to the caller.
+ *
+ * dict supplies "command" and, optionally, "cmd_args_count" together with
+ * "cmd_arg_1" .. "cmd_arg_N". Every line the helper prints on stdout is stored
+ * in rsp_dict as "output_<k>" (trailing newline stripped) and "output_count"
+ * is kept equal to the number of lines actually stored. The big lock is
+ * released while the helper runs and re-taken on every exit path after that.
+ *
+ * Returns 0 on success, non-zero on failure; *op_errstr is set for the
+ * "glusterd defunct", start and end failures.
+ */
+int
+glusterd_op_sys_exec (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
- int ret = 0;
- glusterd_conf_t *priv = NULL;
- pid_t pid = 0;
- char log_file1[PATH_MAX] = {0,};
- char log_file2[PATH_MAX] = {0,};
+ char buf[PATH_MAX] = "";
+ char cmd_arg_name[PATH_MAX] = "";
+ char output_name[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ char *ptr = NULL;
+ char *bufp = NULL;
+ char *command = NULL;
+ char **cmd_args = NULL;
+ int ret = -1;
+ int i = -1;
+ int cmd_args_count = 0;
+ int output_count = 0;
+ size_t len = 0;
+ glusterd_conf_t *priv = NULL;
+ runner_t runner = {0,};
- GF_ASSERT (THIS);
- GF_ASSERT (THIS->private);
-
- priv = THIS->private;
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
- ret = glusterd_get_pid_from_file (master, slave, &pid);
- if (ret)
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
goto out;
+ }
- /* log file */
- ret = glusterd_gsyncd_getlogfile (master, slave, log_file1);
- if (ret)
+ ret = dict_get_str (dict, "command", &command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get command from dict");
goto out;
+ }
- /* check if slave is local or remote */
- ret = glusterd_gsync_slave_is_remote (slave);
- if (ret)
- goto do_rotate;
-
- /* slave log file - slave is local and it's log can be rotated */
- ret = glusterd_gsync_get_slave_log_file (master, slave, log_file2);
+ ret = dict_get_int32 (dict, "cmd_args_count", &cmd_args_count);
if (ret)
- goto out;
-
- do_rotate:
- ret = glusterd_send_log_rotate_signal (pid, log_file1, log_file2);
-
- out:
- if (ret && op_errstr)
- *op_errstr = gf_strdup("Error rotating log file");
- return ret;
-}
-
-static int
-glusterd_do_gsync_log_rotation_mst_slv (glusterd_volinfo_t *volinfo, char *slave,
- char **op_errstr)
-{
- uuid_t uuid = {0, };
- glusterd_conf_t *priv = NULL;
- int ret = 0;
- char errmsg[1024] = {0,};
- xlator_t *this = NULL;
+ gf_log ("", GF_LOG_INFO, "No cmd_args_count");
+
+ /* a negative count must never reach the allocator */
+ if (cmd_args_count > 0) {
+ cmd_args = GF_CALLOC (cmd_args_count, sizeof (char*),
+ gf_common_mt_char);
+ if (!cmd_args) {
+ gf_log ("", GF_LOG_ERROR, "Unable to calloc. "
+ "Errno = %s", strerror(errno));
+ goto out;
+ }
- GF_ASSERT (volinfo);
- GF_ASSERT (slave);
- GF_ASSERT (THIS);
- this = THIS;
- GF_ASSERT (this->private);
- priv = this->private;
+ for (i=1; i <= cmd_args_count; i++) {
+ memset (cmd_arg_name, '\0', sizeof(cmd_arg_name));
+ snprintf (cmd_arg_name, sizeof(cmd_arg_name),
+ "cmd_arg_%d", i);
+ ret = dict_get_str (dict, cmd_arg_name, &cmd_args[i-1]);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get %s in dict",
+ cmd_arg_name);
+ goto out;
+ }
+ }
+ }
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if ((ret == 0) && (uuid_compare (MY_UUID, uuid) != 0))
+ runinit (&runner);
+ /* NOTE(review): command is used verbatim in the program path; confirm
+ * that callers never let it contain '/' or ".." components. */
+ runner_argprintf (&runner, GSYNCD_PREFIX"/peer_%s", command);
+ for (i=0; i < cmd_args_count; i++)
+ runner_add_arg (&runner, cmd_args[i]);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ synclock_unlock (&priv->big_lock);
+ ret = runner_start (&runner);
+ if (ret == -1) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to "
+ "execute command. Error : %s",
+ strerror (errno));
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ synclock_lock (&priv->big_lock);
goto out;
+ }
+
+ do {
+ ptr = fgets(buf, sizeof(buf), runner_chio (&runner, STDOUT_FILENO));
+ if (ptr) {
+ ret = dict_get_int32 (rsp_dict, "output_count", &output_count);
+ if (ret)
+ output_count = 1;
+ else
+ output_count++;
+ memset (output_name, '\0', sizeof (output_name));
+ snprintf (output_name, sizeof (output_name),
+ "output_%d", output_count);
+ /* strip the trailing newline; len may be 0 if the
+ * helper emitted a NUL byte first */
+ len = strlen (buf);
+ if ((len > 0) && (buf[len - 1] == '\n'))
+ buf[len - 1] = '\0';
+ bufp = gf_strdup (buf);
+ if (!bufp) {
+ /* nothing to store: leave output_count alone */
+ gf_log ("", GF_LOG_ERROR, "gf_strdup failed.");
+ continue;
+ }
+ ret = dict_set_dynstr (rsp_dict, output_name, bufp);
+ if (ret) {
+ GF_FREE (bufp);
+ gf_log ("", GF_LOG_ERROR, "output set failed.");
+ /* the line was not stored, so do not count it */
+ continue;
+ }
+ ret = dict_set_int32 (rsp_dict, "output_count", output_count);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "output_count set failed.");
+ }
+ } while (ptr);
+ ret = runner_end (&runner);
if (ret) {
- snprintf(errmsg, sizeof(errmsg), "geo-replication session b/w %s %s not active",
- volinfo->volname, slave);
- gf_log (this->name, GF_LOG_WARNING, "%s", errmsg);
- if (op_errstr)
- *op_errstr = gf_strdup(errmsg);
+ snprintf (errmsg, sizeof (errmsg), "Unable to "
+ "end. Error : %s",
+ strerror (errno));
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ synclock_lock (&priv->big_lock);
goto out;
}
+ synclock_lock (&priv->big_lock);
- ret = glusterd_do_gsync_log_rotate (volinfo->volname, slave, &uuid, op_errstr);
+ ret = 0;
+out:
+ /* only the pointer array is ours; the strings belong to dict */
+ if (cmd_args) {
+ GF_FREE (cmd_args);
+ cmd_args = NULL;
+ }
- out:
- gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-static int
-_iterate_log_rotate_mst_slv (dict_t *this, char *key, data_t *value, void *data)
+/*
+ * Copy the file <workdir>/<dict["source"]> from one node to the others.
+ *
+ * On the node whose UUID equals dict["host-uuid"] the file is read and its
+ * contents, size and mode are stored in dict as "common_pem_contents",
+ * "contents_size" and "file_mode". On every other node those three values
+ * are read back from dict and the file is (re)created with that content and
+ * mode.
+ *
+ * The read buffer belongs to this function until dict_set_bin() succeeds;
+ * the descriptor and any still-owned buffer are released on every exit path.
+ *
+ * Returns 0 on success, non-zero on failure (with *op_errstr set for most
+ * failures).
+ */
+int
+glusterd_op_copy_file (dict_t *dict, char **op_errstr)
{
- glusterd_gsync_status_temp_t *param = NULL;
- char *slave = NULL;
+ char abs_filename[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ char *filename = NULL;
+ char *host_uuid = NULL;
+ char uuid_str [64] = {0};
+ char *contents = NULL;
+ char buf[1024] = "";
+ int ret = -1;
+ int fd = -1;
+ int bytes_writen = 0;
+ int bytes_read = 0;
+ int contents_size = -1;
+ int file_mode = -1;
+ gf_boolean_t contents_owned = _gf_false;
+ glusterd_conf_t *priv = NULL;
+ struct stat stbuf = {0,};
- param = (glusterd_gsync_status_temp_t *) data;
- GF_ASSERT (param);
- GF_ASSERT (param->volinfo);
-
- slave = strchr (value->data, ':');
- if (slave)
- slave++;
- else {
- gf_log ("", GF_LOG_ERROR, "geo-replication log-rotate: slave (%s) "
- "not conforming to format", slave);
- return -1;
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
+ goto out;
}
- (void) glusterd_do_gsync_log_rotation_mst_slv (param->volinfo, slave, NULL);
- return 0;
-}
+ ret = dict_get_str (dict, "host-uuid", &host_uuid);
+ if (ret < 0)
+ goto out;
-static int
-glusterd_do_gsync_log_rotation_mst (glusterd_volinfo_t *volinfo)
-{
- glusterd_gsync_status_temp_t param = {0, };
+ ret = dict_get_str (dict, "source", &filename);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch"
+ " filename from dict.");
+ *op_errstr = gf_strdup ("command unsuccessful");
+ goto out;
+ }
+ snprintf (abs_filename, sizeof(abs_filename),
+ "%s/%s", priv->workdir, filename);
- GF_ASSERT (volinfo);
+ uuid_utoa_r (MY_UUID, uuid_str);
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = lstat (abs_filename, &stbuf);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Source file"
+ " does not exist in %s", priv->workdir);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- param.volinfo = volinfo;
- dict_foreach (volinfo->gsync_slaves, _iterate_log_rotate_mst_slv, &param);
- return 0;
-}
+ contents = GF_CALLOC(1, stbuf.st_size+1, gf_common_mt_char);
+ if (!contents) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to allocate memory");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+ /* ours to free until dict_set_bin() below succeeds */
+ contents_owned = _gf_true;
-static int
-glusterd_rotate_gsync_all ()
-{
- int32_t ret = 0;
- glusterd_conf_t *priv = NULL;
- glusterd_volinfo_t *volinfo = NULL;
+ fd = open (abs_filename, O_RDONLY);
+ if (fd < 0) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to open %s",
+ abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- GF_ASSERT (THIS);
- priv = THIS->private;
+ do {
+ ret = read (fd, buf, sizeof(buf));
+ if (ret > 0) {
+ /* The file may have grown since lstat(); never
+ * copy past the st_size+1 bytes allocated. The
+ * size check below then reports the mismatch. */
+ if ((off_t) bytes_read + ret > stbuf.st_size) {
+ bytes_read += ret;
+ break;
+ }
+ memcpy (contents+bytes_read, buf, ret);
+ bytes_read += ret;
+ memset (buf, '\0', sizeof(buf));
+ }
+ } while (ret > 0);
- GF_ASSERT (priv);
+ if (bytes_read != stbuf.st_size) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to read all "
+ "the data from %s", abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- list_for_each_entry (volinfo, &priv->volumes, vol_list) {
- ret = glusterd_do_gsync_log_rotation_mst (volinfo);
- if (ret)
+ ret = dict_set_int32 (dict, "contents_size", stbuf.st_size);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to set"
+ " contents size in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
goto out;
- }
+ }
- out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
- return ret;
-}
+ ret = dict_set_int32 (dict, "file_mode",
+ (int32_t)stbuf.st_mode);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to set"
+ " file mode in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
-static int
-glusterd_rotate_gsync_logs (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
-{
- char *slave = NULL;
- char *volname = NULL;
- char errmsg[1024] = {0,};
- gf_boolean_t exists = _gf_false;
- glusterd_volinfo_t *volinfo = NULL;
- char **linearr = NULL;
- int ret = 0;
+ ret = dict_set_bin (dict, "common_pem_contents",
+ contents, stbuf.st_size);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to set"
+ " pem contents in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+ /* the dict now holds the buffer; do not free it at out: */
+ contents_owned = _gf_false;
+ close (fd);
+ fd = -1;
+ } else {
+ ret = dict_get_bin (dict, "common_pem_contents",
+ (void **) &contents);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to get"
+ " pem contents in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- ret = dict_get_str (dict, "master", &volname);
- if (ret < 0) {
- ret = glusterd_rotate_gsync_all ();
- goto out;
- }
+ ret = dict_get_int32 (dict, "contents_size", &contents_size);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to get"
+ " contents size in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- exists = glusterd_check_volume_exists (volname);
- ret = glusterd_volinfo_find (volname, &volinfo);
- if ((ret) || (!exists)) {
- snprintf (errmsg, sizeof(errmsg), "Volume %s does not"
- " exist", volname);
- gf_log ("", GF_LOG_WARNING, "%s", errmsg);
- *op_errstr = gf_strdup (errmsg);
- ret = -1;
- goto out;
- }
+ ret = dict_get_int32 (dict, "file_mode", &file_mode);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to get"
+ " file mode in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- ret = dict_get_str (dict, "slave", &slave);
- if (ret < 0) {
- ret = glusterd_do_gsync_log_rotation_mst (volinfo);
- goto out;
- }
+ fd = open (abs_filename, O_WRONLY | O_TRUNC | O_CREAT, 0600);
+ if (fd < 0) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to open %s",
+ abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- /* for the given slave use the normalized url */
- ret = glusterd_urltransform_single (slave, "normalize", &linearr);
- if (ret == -1)
- goto out;
+ bytes_writen = write (fd, contents, contents_size);
- ret = glusterd_do_gsync_log_rotation_mst_slv (volinfo, linearr[0],
- op_errstr);
- if (ret)
- gf_log ("gsyncd", GF_LOG_ERROR, "gsyncd log-rotate failed for"
- " %s & %s", volname, slave);
+ if (bytes_writen != contents_size) {
+ snprintf (errmsg, sizeof (errmsg), "Failed to write"
+ " to %s", abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- glusterd_urltransform_free (linearr, 1);
- out:
+ /* still best-effort, but a failure is no longer silent */
+ if (fchmod (fd, file_mode))
+ gf_log ("", GF_LOG_WARNING, "Unable to set mode on "
+ "%s. Error: %s", abs_filename, strerror (errno));
+ close (fd);
+ fd = -1;
+ }
+
+ ret = 0;
+out:
+ /* release whatever an early exit left behind */
+ if (fd >= 0)
+ close (fd);
+ if (contents_owned)
+ GF_FREE (contents);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-
int
glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
int32_t ret = -1;
int32_t type = -1;
- dict_t *ctx = NULL;
- dict_t *resp_dict = NULL;
char *host_uuid = NULL;
char *slave = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
char *volname = NULL;
+ char *path_list = NULL;
glusterd_volinfo_t *volinfo = NULL;
glusterd_conf_t *priv = NULL;
+ gf_boolean_t is_force = _gf_false;
char *status_msg = NULL;
- uuid_t uuid = {0, };
+ gf_boolean_t is_running = _gf_false;
+ char *conf_path = NULL;
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
GF_ASSERT (dict);
GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
priv = THIS->private;
@@ -2183,37 +3929,67 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
if (ret < 0)
goto out;
- ctx = glusterd_op_get_ctx ();
- resp_dict = ctx ? ctx : rsp_dict;
- GF_ASSERT (resp_dict);
-
if (type == GF_GSYNC_OPTION_TYPE_STATUS) {
- ret = glusterd_get_gsync_status (dict, op_errstr, resp_dict);
+ ret = glusterd_get_gsync_status (dict, op_errstr, rsp_dict);
goto out;
}
- if (type == GF_GSYNC_OPTION_TYPE_ROTATE) {
- ret = glusterd_rotate_gsync_logs (dict, op_errstr, resp_dict);
+ ret = dict_get_str (dict, "slave", &slave);
+ if (ret < 0)
goto out;
+ ret = dict_get_str (dict, "slave_ip", &slave_ip);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch slave volume name.");
+ goto out;
}
- ret = dict_get_str (dict, "slave", &slave);
- if (ret < 0)
+ ret = dict_get_str (dict, "slave_vol", &slave_vol);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch slave volume name.");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
goto out;
+ }
if (dict_get_str (dict, "master", &volname) == 0) {
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "Volinfo for %s (master) not found",
- volname);
+ gf_log ("", GF_LOG_WARNING, "Volinfo for %s (master)"
+ " not found", volname);
goto out;
}
+
+ ret = glusterd_get_local_brickpaths (volinfo, &path_list);
}
if (type == GF_GSYNC_OPTION_TYPE_CONFIG) {
- ret = glusterd_gsync_configure (volinfo, slave, dict, resp_dict,
- op_errstr);
+ ret = glusterd_gsync_configure (volinfo, slave, path_list,
+ dict, rsp_dict, op_errstr);
+ if (!ret) {
+ ret = dict_set_str (rsp_dict, "conf_path", conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store conf_file_path.");
+ goto out;
+ }
+ }
+ goto out;
+ }
+
+ if (type == GF_GSYNC_OPTION_TYPE_DELETE) {
+ ret = glusterd_remove_slave_in_info(volinfo, slave, op_errstr);
+ if (ret && !is_force && path_list)
+ goto out;
+
+ ret = glusterd_gsync_delete (volinfo, slave, slave_ip,
+ slave_vol, path_list, dict,
+ rsp_dict, op_errstr);
goto out;
}
@@ -2222,48 +3998,671 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
goto out;
}
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
if (type == GF_GSYNC_OPTION_TYPE_START) {
- ret = glusterd_set_marker_gsync (volinfo);
+ ret = glusterd_set_gsync_confs (volinfo);
if (ret != 0) {
- gf_log ("", GF_LOG_WARNING, "marker start failed");
- *op_errstr = gf_strdup ("failed to initialize indexing");
+ gf_log ("", GF_LOG_WARNING, "marker/changelog"
+ " start failed");
+ *op_errstr = gf_strdup ("Index initialization failed");
ret = -1;
goto out;
}
- ret = glusterd_store_slave_in_info(volinfo, slave,
- host_uuid, op_errstr);
- if (ret)
- goto out;
- ret = glusterd_start_gsync (volinfo, slave, host_uuid,
- op_errstr);
+ ret = glusterd_start_gsync (volinfo, slave, path_list,
+ conf_path, host_uuid, op_errstr);
}
if (type == GF_GSYNC_OPTION_TYPE_STOP) {
-
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if (ret) {
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (!ret && !is_force && path_list &&
+ (_gf_true != is_running)) {
gf_log ("", GF_LOG_WARNING, GEOREP" is not set up for"
"%s(master) and %s(slave)", volname, slave);
*op_errstr = strdup (GEOREP" is not set up");
goto out;
}
- ret = glusterd_remove_slave_in_info(volinfo, slave, op_errstr);
- if (ret)
+ ret = stop_gsync (volname, slave, &status_msg,
+ conf_path, is_force);
+ if (ret == 0 && status_msg)
+ ret = dict_set_str (rsp_dict, "gsync-status",
+ status_msg);
+ if (ret != 0 && !is_force && path_list)
+ *op_errstr = gf_strdup ("internal error");
+
+ if (!ret) {
+ ret = glusterd_create_status_file (volinfo->volname,
+ slave, slave_ip,
+ slave_vol,"Stopped");
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to update"
+ "state_file. Error : %s",
+ strerror (errno));
+ }
+ }
+ }
+
+out:
+ if (path_list) {
+ GF_FREE (path_list);
+ path_list = NULL;
+ }
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+/*
+ * Derive the slave host, slave volume and gsyncd config-file path of a
+ * geo-replication session and record them in @dict.
+ *
+ * Reads the "slave" URL from @dict, splits it with glusterd_get_slave_info()
+ * and stores the results under the keys "slave_ip", "slave_vol" and
+ * "conf_path".  The config path has the form
+ *   <workdir>/GEOREP/<mastervol>_<slave_ip>_<slave_vol>/gsyncd.conf
+ *
+ * @volinfo:   master volume; only its volname is used here.
+ * @dict:      source of "slave", destination of the three derived keys.
+ * @slave_ip:  out, host part of the slave URL.
+ * @slave_vol: out, volume part of the slave URL.
+ * @conf_path: out, gf_strdup()'d config-file path.
+ * @op_errstr: passed through to glusterd_get_slave_info().
+ *
+ * Returns 0 on success and non-zero on failure.  Out-parameters assigned
+ * before a failure are left set.
+ * NOTE(review): the same pointers are also handed to dict_set_str(); confirm
+ * who is expected to free them.
+ */
+int
+glusterd_get_slave_details_confpath (glusterd_volinfo_t *volinfo, dict_t *dict,
+                                     char **slave_ip, char **slave_vol,
+                                     char **conf_path, char **op_errstr)
+{
+        int              ret                = -1;
+        char             confpath[PATH_MAX] = "";
+        glusterd_conf_t *priv               = NULL;
+        char            *slave              = NULL;
+
+        GF_ASSERT (THIS);
+        priv = THIS->private;
+        GF_ASSERT (priv);
+
+        ret = dict_get_str (dict, "slave", &slave);
+        if (ret || !slave) {
+                gf_log ("", GF_LOG_ERROR, "Unable to fetch slave from dict");
+                ret = -1;
+                goto out;
+        }
+
+        ret = glusterd_get_slave_info (slave, slave_ip, slave_vol, op_errstr);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to fetch slave details.");
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_str (dict, "slave_ip", *slave_ip);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to store slave IP.");
+                goto out;
+        }
+
+        ret = dict_set_str (dict, "slave_vol", *slave_vol);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to store slave volume name.");
+                goto out;
+        }
+
+        /* snprintf() NUL-terminates on its own and returns the length the
+         * full string would have had, so its result must never be used as an
+         * index into the buffer.  Treat truncation as an error instead of
+         * silently working with a wrong path. */
+        ret = snprintf (confpath, sizeof (confpath),
+                        "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+                        priv->workdir, volinfo->volname,
+                        *slave_ip, *slave_vol);
+        if ((ret < 0) || (ret >= (int) sizeof (confpath))) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Conf file path is too long.");
+                ret = -1;
+                goto out;
+        }
+
+        *conf_path = gf_strdup (confpath);
+        if (!(*conf_path)) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to gf_strdup. Error: %s", strerror (errno));
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_str (dict, "conf_path", *conf_path);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to store conf_path");
+                goto out;
+        }
+
+out:
+        gf_log ("", GF_LOG_DEBUG,"Returning %d", ret);
+        return ret;
+
+}
+
+/*
+ * Split a slave URL into its host and volume parts.
+ *
+ * The URL is first normalized with glusterd_urltransform_single(); the
+ * second "/"-separated token of the normalized form is then split on ":"
+ * into host and volume.  The host is additionally passed through
+ * glusterd_mountbroker_check().
+ *
+ * @slave:     slave URL as given by the user.
+ * @slave_ip:  out, gf_strdup()'d host part.
+ * @slave_vol: out, gf_strdup()'d volume part.
+ * @op_errstr: out, set to a gf_strdup()'d message when normalization fails;
+ *             also handed to glusterd_mountbroker_check().
+ *
+ * Returns 0 on success and non-zero on failure.  *slave_ip may already be
+ * set when a later step fails.
+ */
+int
+glusterd_get_slave_info (char *slave, char **slave_ip,
+                         char **slave_vol, char **op_errstr)
+{
+        char     *tmp              = NULL;
+        char     *save_ptr         = NULL;
+        char    **linearr          = NULL;
+        int32_t   ret              = -1;
+        char      errmsg[PATH_MAX] = "";
+
+        ret = glusterd_urltransform_single (slave, "normalize",
+                                            &linearr);
+        if (ret == -1) {
+                /* snprintf() NUL-terminates by itself; its return value can
+                 * exceed the buffer size for a long URL and must not be used
+                 * as an index.  ret stays -1 for the caller. */
+                snprintf (errmsg, sizeof (errmsg),
+                          "Invalid Url: %s", slave);
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "Failed to normalize url");
+                goto out;
+        }
+
+        /* Skip the first "/"-delimited token; the second one is expected to
+         * hold "<host>:<volume>". */
+        tmp = strtok_r (linearr[0], "/", &save_ptr);
+        tmp = strtok_r (NULL, "/", &save_ptr);
+        slave = strtok_r (tmp, ":", &save_ptr);
+        if (slave) {
+                ret = glusterd_mountbroker_check (&slave, op_errstr);
+                if (ret) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Invalid slave url: %s", *op_errstr);
+                        goto out;
+                }
+
+                *slave_ip = gf_strdup (slave);
+                if (!*slave_ip) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Failed to gf_strdup");
+                        ret = -1;
                         goto out;
+                }
+                gf_log ("", GF_LOG_DEBUG, "Slave IP : %s", *slave_ip);
+                ret = 0;
+        } else {
+                gf_log ("", GF_LOG_ERROR, "Invalid slave name");
+                ret = -1;
+                goto out;
+        }
- if (uuid_compare (MY_UUID, uuid) != 0) {
+        slave = strtok_r (NULL, ":", &save_ptr);
+        if (slave) {
+                *slave_vol = gf_strdup (slave);
+                if (!*slave_vol) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Failed to gf_strdup");
+                        ret = -1;
                         goto out;
                 }
+                gf_log ("", GF_LOG_DEBUG, "Slave Vol : %s", *slave_vol);
+                ret = 0;
+        } else {
+                gf_log ("", GF_LOG_ERROR, "Invalid slave name");
+                /* ret is 0 here from the host step; a URL without a volume
+                 * part must be reported as a failure. */
+                ret = -1;
+                goto out;
+        }
- ret = stop_gsync (volname, slave, &status_msg);
- if (ret == 0 && status_msg)
- ret = dict_set_str (resp_dict, "gsync-status",
- status_msg);
- if (ret != 0)
- *op_errstr = gf_strdup ("internal error");
+out:
+        /* Host and volume were copied with gf_strdup(), so the normalized
+         * URL array is no longer needed on any path. */
+        if (linearr)
+                glusterd_urltransform_free (linearr, 1);
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+static void /* Start a gsyncd "--config-set-rx" command line in @runner.
+ *
+ * Initialises @runner with runinit() and queues, in order: the gsyncd binary
+ * under GSYNCD_PREFIX, "-c", @conf_path and "--config-set-rx".  Nothing is
+ * run here; the callers visible in this file append the option name, its
+ * value and the trailing "." arguments before running the command.
+ */
+runinit_gsyncd_setrx (runner_t *runner, char *conf_path)
+{
+ runinit (runner);
+ runner_add_args (runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
+ runner_argprintf (runner, "%s", conf_path);
+ runner_add_arg (runner, "--config-set-rx");
+}
+
+/*
+ * Probe for a usable gsyncd binary by running "gsyncd --version" and
+ * checking that the first line of its output mentions "gsyncd".
+ *
+ * @valid_state: out, written only on failure: 0 when the binary could not be
+ *               started because it does not exist (ENOENT), -1 for every
+ *               other failure.
+ *
+ * Returns 0 when gsyncd answered as expected, -1 otherwise.
+ */
+static int
+glusterd_check_gsync_present (int *valid_state)
+{
+        char      first_line[PATH_MAX] = {0, };
+        runner_t  runner               = {0,};
+        char     *got                  = NULL;
+        int       ret                  = 0;
+
+        runinit (&runner);
+        runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--version", NULL);
+        runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+
+        ret = runner_start (&runner);
+        if (ret == -1) {
+                /* errno is examined before anything is logged */
+                if (errno == ENOENT) {
+                        gf_log ("glusterd", GF_LOG_INFO, GEOREP
+                                " module not installed in the system");
+                        *valid_state = 0;
+                } else {
+                        gf_log ("glusterd", GF_LOG_ERROR, GEOREP
+                                " module not working as desired");
+                        *valid_state = -1;
+                }
+                goto out;
+        }
+
+        got = fgets (first_line, sizeof (first_line),
+                     runner_chio (&runner, STDOUT_FILENO));
+        /* No output at all and output without "gsyncd" in it are treated
+         * the same way. */
+        if (!got || !strstr (first_line, "gsyncd")) {
+                ret = -1;
+                gf_log ("glusterd", GF_LOG_ERROR, GEOREP" module not "
+                        "working as desired");
+                *valid_state = -1;
+                goto out;
+        }
+
+        ret = 0;
+ out:
+
+        runner_end (&runner);
+
+        gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+
+}
+
+/*
+ * Populate the gsyncd configuration file @conf_path with the default
+ * geo-replication settings.
+ *
+ * Every setting is written by running
+ *   gsyncd -c <conf_path> --config-set-rx <option> <value> <"."...>
+ * once per option: first the master-side defaults (two trailing
+ * arguments), then the slave-side defaults (one trailing argument).
+ * NOTE(review): the trailing "." / "^ssh:" arguments look like peer-matching
+ * patterns -- confirm against gsyncd's --config-set-rx handling.
+ *
+ * RUN_GSYNCD_CMD runs the prepared command, ends the runner, and jumps to
+ * `out` with ret == -1 if the command failed.
+ *
+ * @conf:      glusterd configuration; only conf->workdir is used.
+ * @conf_path: path of the gsyncd config file to write.
+ *
+ * Returns 0 on success and -1 on failure.  When gsyncd is not installed
+ * (glusterd_check_gsync_present() reports state 0) nothing is written and 0
+ * is returned.
+ */
+static int
+create_conf_file (glusterd_conf_t *conf, char *conf_path)
+#define RUN_GSYNCD_CMD do { \
+        ret = runner_run_reuse (&runner); \
+        if (ret == -1) { \
+                runner_log (&runner, "glusterd", GF_LOG_ERROR, "command failed"); \
+                runner_end (&runner); \
+                goto out; \
+        } \
+        runner_end (&runner); \
+} while (0)
+{
+        int       ret                 = 0;
+        runner_t  runner              = {0,};
+        char      georepdir[PATH_MAX] = {0,};
+        int       valid_state         = 0;
+
+        valid_state = -1;
+        ret = glusterd_check_gsync_present (&valid_state);
+        if (-1 == ret) {
+                ret = valid_state;
+                goto out;
+        }
+
+        /* snprintf() NUL-terminates by itself and returns the untruncated
+         * length, so the result is only checked, never used as an index. */
+        ret = snprintf (georepdir, sizeof (georepdir), "%s/"GEOREP,
+                        conf->workdir);
+        if ((ret < 0) || (ret >= (int) sizeof (georepdir))) {
+                gf_log ("glusterd", GF_LOG_ERROR,
+                        "Path %s/"GEOREP" is too long", conf->workdir);
+                ret = -1;
+                goto out;
+        }
+
+        /************
+         * master pre-configuration
+         ************/
+
+        /* remote-gsyncd */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "remote-gsyncd", GSYNCD_PREFIX"/gsyncd", ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "remote-gsyncd", "/nonexistent/gsyncd",
+                         ".", "^ssh:", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* gluster-command-dir */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "gluster-command-dir", SBIN_DIR"/",
+                         ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* gluster-params */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "gluster-params",
+                         "aux-gfid-mount",
+                         ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* ssh-command */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg (&runner, "ssh-command");
+        runner_argprintf (&runner,
+                          "ssh -oPasswordAuthentication=no "
+                          "-oStrictHostKeyChecking=no "
+                          "-i %s/secret.pem", georepdir);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* ssh-command tar */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg (&runner, "ssh-command-tar");
+        runner_argprintf (&runner,
+                          "ssh -oPasswordAuthentication=no "
+                          "-oStrictHostKeyChecking=no "
+                          "-i %s/tar_ssh.pem", georepdir);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* pid-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg (&runner, "pid-file");
+        runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.pid", georepdir);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* geo-rep-working-dir */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg (&runner, "georep-session-working-dir");
+        runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/", georepdir);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* state-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg (&runner, "state-file");
+        runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.status", georepdir);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* state-detail-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg (&runner, "state-detail-file");
+        runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status", georepdir);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* state-socket */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg (&runner, "state-socket-unencoded");
+        runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.socket", georepdir);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* socketdir */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "socketdir", GLUSTERD_SOCK_DIR, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* log-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner,
+                         "log-file",
+                         DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}.log",
+                         ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* gluster-log-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner,
+                         "gluster-log-file",
+                         DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}${local_id}.gluster.log",
+                         ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* ignore-deletes */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "ignore-deletes", "true", ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* special-sync-mode */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "special-sync-mode", "partial", ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* change-detector == changelog */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* working-dir */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_arg(&runner, "working-dir");
+        runner_argprintf(&runner, "%s/${mastervol}/${eSlave}",
+                         DEFAULT_VAR_RUN_DIRECTORY);
+        runner_add_args (&runner, ".", ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /************
+         * slave pre-configuration
+         ************/
+
+        /* gluster-command-dir */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "gluster-command-dir", SBIN_DIR"/",
+                         ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* gluster-params */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner, "gluster-params",
+                         "aux-gfid-mount",
+                         ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* log-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner,
+                         "log-file",
+                         DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/${session_owner}:${eSlave}.log",
+                         ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* MountBroker log-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner,
+                         "log-file-mbr",
+                         DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/mbr/${session_owner}:${eSlave}.log",
+                         ".", NULL);
+        RUN_GSYNCD_CMD;
+
+        /* gluster-log-file */
+        runinit_gsyncd_setrx (&runner, conf_path);
+        runner_add_args (&runner,
+                         "gluster-log-file",
+                         DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/${session_owner}:${eSlave}.gluster.log",
+                         ".", NULL);
+        RUN_GSYNCD_CMD;
+
+ out:
+        return ret ? -1 : 0;
+}
+
+/*
+ * Create the directories and files a geo-replication session needs on this
+ * node.
+ *
+ * In order: the session directory
+ * <workdir>/GEOREP/<mastervol>_<slave_ip>_<slave_vol>, the log directory
+ * DEFAULT_LOG_FILE_DIRECTORY/GEOREP/<mastervol>, the gsyncd config file
+ * (via create_conf_file()) and the status file (via
+ * glusterd_create_status_file(), initial state "Not Started").  A config or
+ * status file that already exists is left untouched.
+ *
+ * @volinfo:   master volume; only its volname is used here.
+ * @dict:      must carry "conf_path" and "statefile".
+ * @slave:     slave URL, passed to glusterd_create_status_file().
+ * @slave_ip:  slave host.
+ * @slave_vol: slave volume name.
+ * @op_errstr: out, set to a gf_strdup()'d message on failure.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int
+glusterd_create_essential_dir_files (glusterd_volinfo_t *volinfo, dict_t *dict,
+                                     char *slave, char *slave_ip,
+                                     char *slave_vol, char **op_errstr)
+{
+        int              ret              = -1;
+        char            *conf_path        = NULL;
+        char            *statefile        = NULL;
+        char             buf[PATH_MAX]    = "";
+        char             errmsg[PATH_MAX] = "";
+        glusterd_conf_t *conf             = NULL;
+        struct stat      stbuf            = {0,};
+
+        GF_ASSERT (THIS);
+        conf = THIS->private;
+
+        ret = dict_get_str (dict, "conf_path", &conf_path);
+        if (ret) {
+                snprintf (errmsg, sizeof (errmsg),
+                          "Unable to fetch conf file path.");
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "statefile", &statefile);
+        if (ret) {
+                snprintf (errmsg, sizeof (errmsg),
+                          "Unable to fetch statefile path.");
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                goto out;
+        }
+
+        /* snprintf() NUL-terminates by itself and returns the untruncated
+         * length; check it rather than using it as an index into buf. */
+        ret = snprintf (buf, sizeof (buf), "%s/"GEOREP"/%s_%s_%s",
+                        conf->workdir, volinfo->volname, slave_ip, slave_vol);
+        if ((ret < 0) || (ret >= (int) sizeof (buf))) {
+                snprintf (errmsg, sizeof (errmsg),
+                          "Session directory path is too long.");
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                ret = -1;
+                goto out;
+        }
+        ret = mkdir_p (buf, 0777, _gf_true);
+        if (ret) {
+                snprintf (errmsg, sizeof (errmsg), "Unable to create %s"
+                          ". Error : %s", buf, strerror (errno));
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                goto out;
+        }
+
+        ret = snprintf (buf, sizeof (buf),
+                        DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/%s",
+                        volinfo->volname);
+        if ((ret < 0) || (ret >= (int) sizeof (buf))) {
+                snprintf (errmsg, sizeof (errmsg),
+                          "Log directory path is too long.");
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                ret = -1;
+                goto out;
+        }
+        ret = mkdir_p (buf, 0777, _gf_true);
+        if (ret) {
+                snprintf (errmsg, sizeof (errmsg), "Unable to create %s"
+                          ". Error : %s", buf, strerror (errno));
+                *op_errstr = gf_strdup (errmsg);
+                gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                goto out;
+        }
+
+        ret = lstat (conf_path, &stbuf);
+        if (!ret) {
+                gf_log ("", GF_LOG_DEBUG, "Session already running."
+                        " Not creating config file again.");
+        } else {
+                ret = create_conf_file (conf, conf_path);
+                if (ret || lstat (conf_path, &stbuf)) {
+                        snprintf (errmsg, sizeof (errmsg), "Failed to create"
+                                  " config file(%s).", conf_path);
+                        *op_errstr = gf_strdup (errmsg);
+                        gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                        /* create_conf_file() can return 0 without the file
+                         * existing; that must not be reported as success. */
+                        ret = -1;
+                        goto out;
+                }
+        }
+
+        ret = lstat (statefile, &stbuf);
+        if (!ret) {
+                gf_log ("", GF_LOG_DEBUG, "Session already running."
+                        " Not creating status file again.");
+                goto out;
+        } else {
+                ret = glusterd_create_status_file (volinfo->volname, slave,
+                                                   slave_ip, slave_vol,
+                                                   "Not Started");
+                if (ret || lstat (statefile, &stbuf)) {
+                        snprintf (errmsg, sizeof (errmsg), "Unable to create %s"
+                                  ". Error : %s", statefile, strerror (errno));
+                        *op_errstr = gf_strdup (errmsg);
+                        gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+                        ret = -1;
+                        goto out;
+                }
+        }
+
+out:
+        gf_log ("", GF_LOG_DEBUG,"Returning %d", ret);
+        return ret;
+}
+
+int
+glusterd_op_gsync_create (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+ char common_pem_file[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ char hooks_args[PATH_MAX] = "";
+ char uuid_str [64] = "";
+ char *host_uuid = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *arg_buf = NULL;
+ char *volname = NULL;
+ char *slave = NULL;
+ int32_t ret = -1;
+ int32_t is_pem_push = -1;
+ gf_boolean_t is_force = -1;
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+
+ GF_ASSERT (THIS);
+ conf = THIS->private;
+ GF_ASSERT (conf);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ ret = glusterd_op_gsync_args_get (dict, op_errstr,
+ &volname, &slave, &host_uuid);
+ if (ret)
+ goto out;
+
+ snprintf (common_pem_file, sizeof(common_pem_file),
+ "%s"GLUSTERD_COMMON_PEM_PUB_FILE, conf->workdir);
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Volinfo for %s"
+ " (master) not found", volname);
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "slave_vol", &slave_vol);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to fetch slave volume name.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "slave_ip", &slave_ip);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to fetch slave IP.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
+ uuid_utoa_r (MY_UUID, uuid_str);
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = dict_get_int32 (dict, "push_pem", &is_pem_push);
+ if (!ret && is_pem_push) {
+ gf_log ("", GF_LOG_DEBUG, "Trying to setup"
+ " pem files in slave");
+ is_pem_push = 1;
+ } else
+ is_pem_push = 0;
+
+ snprintf(hooks_args, sizeof(hooks_args),
+ "is_push_pem=%d pub_file=%s slave_ip=%s",
+ is_pem_push, common_pem_file, slave_ip);
+
+ } else
+ snprintf(hooks_args, sizeof(hooks_args),
+ "This argument will stop the hooks script");
+
+ arg_buf = gf_strdup (hooks_args);
+ if (!arg_buf) {
+ gf_log ("", GF_LOG_ERROR, "Failed to"
+ " gf_strdup");
+ if (is_force) {
+ ret = 0;
+ goto create_essentials;
+ }
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "hooks_args", arg_buf);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Failed to set"
+ " hooks_args in dict.");
+ if (is_force) {
+ ret = 0;
+ goto create_essentials;
+ }
+ goto out;
+ }
+
+create_essentials:
+
+ ret = glusterd_create_essential_dir_files (volinfo, dict, slave,
+ slave_ip, slave_vol,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_slave_in_info (volinfo, slave,
+ host_uuid, op_errstr,
+ is_force);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to store"
+ " slave info.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
}
out:
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index b6aaffcd1..e0373c774 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -33,6 +33,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "glusterd-store.h"
+#include "glusterd-locks.h"
#include "glusterd1-xdr.h"
#include "cli1-xdr.h"
@@ -49,11 +50,14 @@
#include "globals.h"
#include "glusterd-syncop.h"
+#include "glusterd-etcd.h"
#ifdef HAVE_BD_XLATOR
#include <lvm2app.h>
#endif
+extern glusterd_op_info_t opinfo;
+
int glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata,
rpc_clnt_event_t event,
void *data, rpc_clnt_notify_t notify_fn)
@@ -254,13 +258,14 @@ glusterd_add_peer_detail_to_dict (glusterd_peerinfo_t *peerinfo,
int ret = -1;
char key[256] = {0, };
+ char *peer_uuid_str = NULL;
GF_ASSERT (peerinfo);
GF_ASSERT (friends);
snprintf (key, 256, "friend%d.uuid", count);
- uuid_utoa_r (peerinfo->uuid, peerinfo->uuid_str);
- ret = dict_set_str (friends, key, peerinfo->uuid_str);
+ peer_uuid_str = gd_peer_uuid_str (peerinfo);
+ ret = dict_set_str (friends, key, peer_uuid_str);
if (ret)
goto out;
@@ -306,10 +311,23 @@ _build_option_key (dict_t *d, char *k, data_t *v, void *tmp)
char reconfig_key[256] = {0, };
struct args_pack *pack = NULL;
int ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
pack = tmp;
if (strcmp (k, GLUSTERD_GLOBAL_OPT_VERSION) == 0)
return 0;
+
+ if (priv->op_version > GD_OP_VERSION_MIN) {
+ if ((strcmp (k, "features.limit-usage") == 0) ||
+ (strcmp (k, "features.soft-limit") == 0))
+ return 0;
+ }
snprintf (reconfig_key, 256, "volume%d.option.%s",
pack->vol_count, k);
ret = dict_set_str (pack->dict, reconfig_key, v->data);
@@ -334,7 +352,7 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
char *volume_id_str = NULL;
struct args_pack pack = {0,};
xlator_t *this = NULL;
-
+ GF_UNUSED int caps = 0;
GF_ASSERT (volinfo);
GF_ASSERT (volumes);
@@ -359,6 +377,21 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
+ /* As of now, the snap volumes are also displayed as part of
+ volume info command. So this change is to display whether
+ the volume is original volume or the snap_volume. If
+ displaying of snap volumes in volume info o/p is not needed
+ this should be removed.
+ */
+ snprintf (key, 256, "volume%d.snap_volume", count);
+ ret = dict_set_int32 (volumes, key, volinfo->is_snap_volume);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "failed to set whether "
+ "the volume is a snap volume or actual volume (%s)",
+ volinfo->volname);
+ goto out;
+ }
+
snprintf (key, 256, "volume%d.brick_count", count);
ret = dict_set_int32 (volumes, key, volinfo->brick_count);
if (ret)
@@ -399,14 +432,92 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
goto out;
#ifdef HAVE_BD_XLATOR
- snprintf (key, 256, "volume%d.backend", count);
- ret = dict_set_int32 (volumes, key, volinfo->backend);
- if (ret)
- goto out;
+ if (volinfo->caps) {
+ caps = 0;
+ snprintf (key, 256, "volume%d.xlator0", count);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ if (volinfo->caps & CAPS_BD)
+ snprintf (buf, 256, "BD");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+
+ if (volinfo->caps & CAPS_THIN) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "thin");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ if (volinfo->caps & CAPS_OFFLOAD_COPY) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "offload_copy");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ if (volinfo->caps & CAPS_OFFLOAD_SNAPSHOT) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "offload_snapshot");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ if (volinfo->caps & CAPS_OFFLOAD_ZERO) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "offload_zerofill");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ }
#endif
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
char brick[1024] = {0,};
+ char brick_uuid[64] = {0,};
snprintf (key, 256, "volume%d.brick%d", count, i);
snprintf (brick, 1024, "%s:%s", brickinfo->hostname,
brickinfo->path);
@@ -414,6 +525,25 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
ret = dict_set_dynstr (volumes, key, buf);
if (ret)
goto out;
+ snprintf (key, 256, "volume%d.brick%d.uuid", count, i);
+ snprintf (brick_uuid, 64, "%s", uuid_utoa (brickinfo->uuid));
+ buf = gf_strdup (brick_uuid);
+ if (!buf)
+ goto out;
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret)
+ goto out;
+
+#ifdef HAVE_BD_XLATOR
+ if (volinfo->caps & CAPS_BD) {
+ snprintf (key, 256, "volume%d.vg%d", count, i);
+ snprintf (brick, 1024, "%s", brickinfo->vg);
+ buf = gf_strdup (brick);
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret)
+ goto out;
+ }
+#endif
i++;
}
@@ -477,10 +607,16 @@ int32_t
glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
char *err_str, size_t err_len)
{
- int32_t ret = -1;
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
- int32_t locked = 0;
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ int32_t locked = 0;
+ char *tmp = NULL;
+ char *volname = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_op_info_t txn_op_info = {{0},};
+ glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
GF_ASSERT (req);
GF_ASSERT ((op > GD_OP_NONE) && (op < GD_OP_MAX));
@@ -491,33 +627,122 @@ glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
priv = this->private;
GF_ASSERT (priv);
- ret = glusterd_lock (MY_UUID);
+ dict = ctx;
+
+ /* Generate a transaction-id for this operation and
+ * save it in the dict. This transaction id distinguishes
+ * each transaction, and helps separate opinfos in the
+ * op state machine. */
+ ret = glusterd_generate_txn_id (dict, &txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to generate transaction id");
+ goto out;
+ }
+
+ /* Save the MY_UUID as the originator_uuid. This originator_uuid
+ * will be used by is_origin_glusterd() to determine if a node
+ * is the originator node for a command. */
+ ret = glusterd_set_originator_uuid (dict);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "Unable to acquire lock on localhost, ret: %d", ret);
- snprintf (err_str, err_len, "Another transaction is in progress. "
- "Please try again after sometime.");
+ "Failed to set originator_uuid.");
goto out;
}
+ /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ ret = glusterd_lock (MY_UUID);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock on localhost, ret: %d",
+ ret);
+ snprintf (err_str, err_len,
+ "Another transaction is in progress. "
+ "Please try again after sometime.");
+ goto out;
+ }
+ } else {
+ /* If no volname is given as a part of the command, locks will
+ * not be held */
+ ret = dict_get_str (dict, "volname", &tmp);
+ if (ret) {
+ gf_log ("", GF_LOG_INFO,
+ "No Volume name present. "
+ "Locks not being held.");
+ goto local_locking_done;
+ } else {
+ /* Use a copy of volname, as cli response will be
+ * sent before the unlock, and the volname in the
+ * dict, might be removed */
+ volname = gf_strdup (tmp);
+ if (!volname)
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_lock (volname, MY_UUID, "vol");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock for %s", volname);
+ snprintf (err_str, err_len,
+ "Another transaction is in progress for %s. "
+ "Please try again after sometime.", volname);
+ goto out;
+ }
+ }
+
locked = 1;
gf_log (this->name, GF_LOG_DEBUG, "Acquired lock on localhost");
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_START_LOCK, NULL);
+local_locking_done:
+
+ /* If no volname is given as a part of the command, locks will
+ * not be held, hence sending stage event. */
+ if (volname || (priv->op_version < GD_OP_VERSION_4))
+ event_type = GD_OP_EVENT_START_LOCK;
+ else {
+ txn_op_info.state.state = GD_OP_STATE_LOCK_SENT;
+ event_type = GD_OP_EVENT_ALL_ACC;
+ }
+
+ /* Save opinfo for this transaction with the transaction id */
+ glusterd_txn_opinfo_init (&txn_op_info, NULL, &op, ctx, req);
+
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+ if (ctx)
+ dict_unref (ctx);
+ goto out;
+ }
+
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, ctx);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to acquire cluster"
" lock.");
goto out;
}
- glusterd_op_set_op (op);
- glusterd_op_set_ctx (ctx);
- glusterd_op_set_req (req);
-
-
out:
- if (locked && ret)
- glusterd_unlock (MY_UUID);
+ if (locked && ret) {
+ /* Based on the op-version, we release the
+ * cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4)
+ glusterd_unlock (MY_UUID);
+ else {
+ ret = glusterd_mgmt_v3_unlock (volname, MY_UUID,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ ret = -1;
+ }
+ }
+
+ if (volname)
+ GF_FREE (volname);
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -526,16 +751,25 @@ out:
int
__glusterd_handle_cluster_lock (rpcsvc_request_t *req)
{
- gd1_mgmt_cluster_lock_req lock_req = {{0},};
- int32_t ret = -1;
- glusterd_op_lock_ctx_t *ctx = NULL;
- glusterd_peerinfo_t *peerinfo = NULL;
- xlator_t *this = NULL;
+ dict_t *op_ctx = NULL;
+ int32_t ret = -1;
+ gd1_mgmt_cluster_lock_req lock_req = {{0},};
+ glusterd_op_lock_ctx_t *ctx = NULL;
+ glusterd_op_t op = GD_OP_EVENT_LOCK;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_op_info_t txn_op_info = {{0},};
+ glusterd_conf_t *priv = NULL;
+ uuid_t *txn_id = NULL;
+ xlator_t *this = NULL;
this = THIS;
GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
ret = xdr_to_generic (req->msg[0], &lock_req,
(xdrproc_t)xdr_gd1_mgmt_cluster_lock_req);
if (ret < 0) {
@@ -565,8 +799,29 @@ __glusterd_handle_cluster_lock (rpcsvc_request_t *req)
uuid_copy (ctx->uuid, lock_req.uuid);
ctx->req = req;
+ ctx->dict = NULL;
+
+ op_ctx = dict_new ();
+ if (!op_ctx) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set new dict");
+ goto out;
+ }
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK, ctx);
+ glusterd_txn_opinfo_init (&txn_op_info, NULL, &op, op_ctx, req);
+
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+ dict_unref (txn_op_info.op_ctx);
+ goto out;
+ }
+
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK, txn_id, ctx);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to inject event GD_OP_EVENT_LOCK");
out:
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -642,11 +897,19 @@ __glusterd_handle_stage_op (rpcsvc_request_t *req)
gd1_mgmt_stage_op_req op_req = {{0},};
glusterd_peerinfo_t *peerinfo = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_op_info_t txn_op_info = {{0},};
+ glusterd_op_sm_state_info_t state = {0,};
+ glusterd_conf_t *priv = NULL;
this = THIS;
GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
ret = xdr_to_generic (req->msg[0], &op_req,
(xdrproc_t)xdr_gd1_mgmt_stage_op_req);
if (ret < 0) {
@@ -670,7 +933,36 @@ __glusterd_handle_stage_op (rpcsvc_request_t *req)
if (ret)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_OP, req_ctx);
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
+ /* In cases where there is no volname, the receivers won't have a
+ * transaction opinfo created, as for those operations, the locking
+ * phase where the transaction opinfos are created, won't be called. */
+ ret = glusterd_get_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No transaction's opinfo set");
+
+ state.state = GD_OP_STATE_LOCKED;
+ glusterd_txn_opinfo_init (&txn_op_info, &state,
+ &op_req.op, req_ctx->dict, req);
+
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+ dict_unref (req_ctx->dict);
+ goto out;
+ }
+ }
+
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_OP,
+ txn_id, req_ctx);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to inject event GD_OP_EVENT_STAGE_OP");
out:
free (op_req.buf.buf_val);//malloced by xdr
@@ -694,11 +986,17 @@ __glusterd_handle_commit_op (rpcsvc_request_t *req)
gd1_mgmt_commit_op_req op_req = {{0},};
glusterd_peerinfo_t *peerinfo = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
this = THIS;
GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
ret = xdr_to_generic (req->msg[0], &op_req,
(xdrproc_t)xdr_gd1_mgmt_commit_op_req);
if (ret < 0) {
@@ -724,11 +1022,12 @@ __glusterd_handle_commit_op (rpcsvc_request_t *req)
if (ret)
goto out;
- ret = glusterd_op_init_ctx (op_req.op);
- if (ret)
- goto out;
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_OP, req_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_OP,
+ txn_id, req_ctx);
out:
free (op_req.buf.buf_val);//malloced by xdr
@@ -747,17 +1046,19 @@ int
__glusterd_handle_cli_probe (rpcsvc_request_t *req)
{
int32_t ret = -1;
- gf1_cli_probe_req cli_req = {0,};
- glusterd_peerinfo_t *peerinfo = NULL;
- gf_boolean_t run_fsm = _gf_true;
- xlator_t *this = NULL;
- char *bind_name = NULL;
+ gf_cli_req cli_req = {{0,},};
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gf_boolean_t run_fsm = _gf_true;
+ xlator_t *this = NULL;
+ char *bind_name = NULL;
+ dict_t *dict = NULL;
+ char *hostname = NULL;
+ int port = 0;
GF_ASSERT (req);
this = THIS;
- ret = xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf1_cli_probe_req);
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
if (ret < 0) {
//failed to decode msg;
gf_log ("", GF_LOG_ERROR, "xdr decoding error");
@@ -765,63 +1066,80 @@ __glusterd_handle_cli_probe (rpcsvc_request_t *req)
goto out;
}
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len, &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "unserialize req-buffer to dictionary");
+ goto out;
+ }
+ }
+
+ ret = dict_get_str (dict, "hostname", &hostname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get hostname");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "port", &port);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get port");
+ goto out;
+ }
+
if (glusterd_is_any_volume_in_server_quorum (this) &&
!does_gd_meet_server_quorum (this)) {
glusterd_xfer_cli_probe_resp (req, -1, GF_PROBE_QUORUM_NOT_MET,
- NULL,
- cli_req.hostname, cli_req.port);
+ NULL, hostname, port, dict);
gf_log (this->name, GF_LOG_ERROR, "Quorum does not meet, "
"rejecting operation");
ret = 0;
goto out;
}
- gf_cmd_log ("peer probe", " on host %s:%d", cli_req.hostname,
- cli_req.port);
gf_log ("glusterd", GF_LOG_INFO, "Received CLI probe req %s %d",
- cli_req.hostname, cli_req.port);
+ hostname, port);
if (dict_get_str(this->options,"transport.socket.bind-address",
&bind_name) == 0) {
gf_log ("glusterd", GF_LOG_DEBUG,
"only checking probe address vs. bind address");
- ret = glusterd_is_same_address(bind_name,cli_req.hostname);
+ ret = gf_is_same_address (bind_name, hostname);
}
else {
- ret = glusterd_is_local_addr(cli_req.hostname);
+ ret = gf_is_local_addr (hostname);
}
if (ret) {
- glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_LOCALHOST, NULL,
- cli_req.hostname, cli_req.port);
+ glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_LOCALHOST,
+ NULL, hostname, port, dict);
ret = 0;
goto out;
}
- if (!(ret = glusterd_friend_find_by_hostname(cli_req.hostname,
- &peerinfo))) {
- if (strcmp (peerinfo->hostname, cli_req.hostname) == 0) {
+ if (!(ret = glusterd_friend_find_by_hostname (hostname, &peerinfo))) {
+ if (strcmp (peerinfo->hostname, hostname) == 0) {
gf_log ("glusterd", GF_LOG_DEBUG, "Probe host %s port "
- "%d already a peer", cli_req.hostname,
- cli_req.port);
+ "%d already a peer", hostname, port);
glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_FRIEND,
- NULL, cli_req.hostname,
- cli_req.port);
+ NULL, hostname, port,
+ dict);
goto out;
}
}
- ret = glusterd_probe_begin (req, cli_req.hostname, cli_req.port);
-
- gf_cmd_log ("peer probe","on host %s:%d %s",cli_req.hostname,
- cli_req.port, (ret) ? "FAILED" : "SUCCESS");
+ ret = glusterd_probe_begin (req, hostname, port, dict);
if (ret == GLUSTERD_CONNECTION_AWAITED) {
//fsm should be run after connection establishes
run_fsm = _gf_false;
ret = 0;
}
+
out:
- free (cli_req.hostname);//its malloced by xdr
+ free (cli_req.dict.dict_val);
if (run_fsm) {
glusterd_friend_sm ();
@@ -841,11 +1159,17 @@ int
__glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
{
int32_t ret = -1;
- gf1_cli_deprobe_req cli_req = {0,};
- uuid_t uuid = {0};
- int op_errno = 0;
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
+ gf_cli_req cli_req = {{0,},};
+ uuid_t uuid = {0};
+ int op_errno = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ dict_t *dict = NULL;
+ char *hostname = NULL;
+ int port = 0;
+ int flags = 0;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_volinfo_t *tmp = NULL;
this = THIS;
GF_ASSERT (this);
@@ -854,16 +1178,45 @@ __glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
GF_ASSERT (req);
ret = xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf1_cli_deprobe_req);
+ (xdrproc_t)xdr_gf_cli_req);
if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
}
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len, &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "unserialize req-buffer to dictionary");
+ goto out;
+ }
+ }
+
gf_log ("glusterd", GF_LOG_INFO, "Received CLI deprobe req");
- ret = glusterd_hostname_to_uuid (cli_req.hostname, uuid);
+ ret = dict_get_str (dict, "hostname", &hostname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get hostname");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "port", &port);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get port");
+ goto out;
+ }
+ ret = dict_get_int32 (dict, "flags", &flags);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get flags");
+ goto out;
+ }
+
+ ret = glusterd_hostname_to_uuid (hostname, uuid);
if (ret) {
op_errno = GF_DEPROBE_NOT_FRIEND;
goto out;
@@ -875,23 +1228,31 @@ __glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
goto out;
}
- if (!(cli_req.flags & GF_CLI_FLAG_OP_FORCE)) {
- if (!uuid_is_null (uuid)) {
- /* Check if peers are connected, except peer being detached*/
- if (!glusterd_chk_peers_connected_befriended (uuid)) {
- ret = -1;
- op_errno = GF_DEPROBE_FRIEND_DOWN;
- goto out;
- }
- ret = glusterd_all_volume_cond_check (
- glusterd_friend_brick_belongs,
- -1, &uuid);
- if (ret) {
- op_errno = GF_DEPROBE_BRICK_EXIST;
- goto out;
- }
+ if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+ /* Check if peers are connected, except peer being
+ * detached*/
+ if (!glusterd_chk_peers_connected_befriended (uuid)) {
+ ret = -1;
+ op_errno = GF_DEPROBE_FRIEND_DOWN;
+ goto out;
+ }
+ }
+
+ /* Check for if volumes exist with some bricks on the peer being
+ * detached. It's not a problem if a volume contains none or all
+ * of its bricks on the peer being detached
+ */
+ list_for_each_entry_safe (volinfo, tmp, &priv->volumes,
+ vol_list) {
+ ret = glusterd_friend_contains_vol_bricks (volinfo,
+ uuid);
+ if (ret == 1) {
+ op_errno = GF_DEPROBE_BRICK_EXIST;
+ goto out;
}
+ }
+ if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
if (glusterd_is_any_volume_in_server_quorum (this) &&
!does_gd_meet_server_quorum (this)) {
gf_log (this->name, GF_LOG_ERROR, "Quorum does not "
@@ -903,23 +1264,19 @@ __glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
}
if (!uuid_is_null (uuid)) {
- ret = glusterd_deprobe_begin (req, cli_req.hostname,
- cli_req.port, uuid);
+ ret = glusterd_deprobe_begin (req, hostname, port, uuid, dict);
} else {
- ret = glusterd_deprobe_begin (req, cli_req.hostname,
- cli_req.port, NULL);
+ ret = glusterd_deprobe_begin (req, hostname, port, NULL, dict);
}
- gf_cmd_log ("peer deprobe", "on host %s:%d %s", cli_req.hostname,
- cli_req.port, (ret) ? "FAILED" : "SUCCESS");
out:
+ free (cli_req.dict.dict_val);
+
if (ret) {
ret = glusterd_xfer_cli_deprobe_resp (req, ret, op_errno, NULL,
- cli_req.hostname);
+ hostname, dict);
}
- free (cli_req.hostname);//malloced by xdr
-
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -1048,27 +1405,32 @@ glusterd_handle_cli_get_volume (rpcsvc_request_t *req)
__glusterd_handle_cli_get_volume);
}
-#ifdef HAVE_BD_XLATOR
int
-__glusterd_handle_cli_bd_op (rpcsvc_request_t *req)
+__glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
{
- int32_t ret = -1;
- gf_cli_req cli_req = { {0,} };
- dict_t *dict = NULL;
- char *volname = NULL;
- char op_errstr[2048] = {0,};
- glusterd_op_t cli_op = GD_OP_BD_OP;
+ int ret = -1;
+ dict_t *dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ uuid_t uuid = {0};
+ gf_cli_rsp rsp = {0,};
+ gf_cli_req cli_req = {{0,}};
+ char msg_str[2048] = {0,};
GF_ASSERT (req);
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+
ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
if (ret < 0) {
- /* failed to decode msg */
+ //failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- gf_log ("glusterd", GF_LOG_DEBUG, "Received bd op req");
+ gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid reset req");
if (cli_req.dict.dict_len) {
/* Unserialize the dictionary */
@@ -1081,58 +1443,84 @@ __glusterd_handle_cli_bd_op (rpcsvc_request_t *req)
gf_log ("glusterd", GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (msg_str, sizeof (msg_str), "Unable to decode "
+ "the buffer");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
}
}
- ret = dict_get_str (dict, "volname", &volname);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to get volname");
+ /* In the above section if dict_unserialize is successful, ret is set
+ * to zero.
+ */
+ ret = -1;
+ // Do not allow peer reset if there are any volumes in the cluster
+ if (!list_empty (&priv->volumes)) {
+ snprintf (msg_str, sizeof (msg_str), "volumes are already "
+ "present in the cluster. Resetting uuid is not "
+ "allowed");
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg_str);
goto out;
}
- ret = glusterd_op_begin (req, GD_OP_BD_OP, dict, op_errstr,
- sizeof (op_errstr));
- gf_cmd_log ("bd op: %s", ((ret == 0) ? "SUCCESS": "FAILED"));
-out:
- if (ret && dict)
- dict_unref (dict);
+ // Do not allow peer reset if trusted storage pool is already formed
+ if (!list_empty (&priv->peers)) {
+ snprintf (msg_str, sizeof (msg_str),"trusted storage pool "
+ "has been already formed. Please detach this peer "
+ "from the pool and reset its uuid.");
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg_str);
+ goto out;
+ }
- glusterd_friend_sm ();
- glusterd_op_sm ();
+ uuid_copy (uuid, priv->uuid);
+ ret = glusterd_uuid_generate_save ();
+ if (!uuid_compare (uuid, MY_UUID)) {
+ snprintf (msg_str, sizeof (msg_str), "old uuid and the new uuid"
+ " are same. Try gluster peer reset again");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg_str);
+ ret = -1;
+ goto out;
+ }
+
+out:
if (ret) {
- if (op_errstr[0] == '\0')
- snprintf (op_errstr, sizeof (op_errstr),
- "Operation failed");
- ret = glusterd_op_send_cli_response (cli_op, ret, 0,
- req, NULL, op_errstr);
+ rsp.op_ret = -1;
+ if (msg_str[0] == '\0')
+ snprintf (msg_str, sizeof (msg_str), "Operation "
+ "failed");
+ rsp.op_errstr = msg_str;
+ ret = 0;
+ } else {
+ rsp.op_errstr = "";
}
+ glusterd_to_cli (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gf_cli_rsp, dict);
+
return ret;
}
int
-glusterd_handle_cli_bd_op (rpcsvc_request_t *req)
+glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
{
- return glusterd_big_locked_handler (req, __glusterd_handle_cli_bd_op);
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_uuid_reset);
}
-#endif
int
-__glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
+__glusterd_handle_cli_uuid_get (rpcsvc_request_t *req)
{
- int ret = -1;
- dict_t *dict = NULL;
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
- uuid_t uuid = {0};
- gf_cli_rsp rsp = {0,};
- gf_cli_req cli_req = {{0,}};
+ int ret = -1;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ gf_cli_rsp rsp = {0,};
+ gf_cli_req cli_req = {{0,}};
char msg_str[2048] = {0,};
+ char uuid_str[64] = {0,};
GF_ASSERT (req);
@@ -1142,16 +1530,18 @@ __glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
if (ret < 0) {
- //failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid reset req");
+ gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid get req");
if (cli_req.dict.dict_len) {
- /* Unserialize the dictionary */
dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
ret = dict_unserialize (cli_req.dict.dict_val,
cli_req.dict.dict_len,
@@ -1163,44 +1553,35 @@ __glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
snprintf (msg_str, sizeof (msg_str), "Unable to decode "
"the buffer");
goto out;
+
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
+
}
}
- /* In the above section if dict_unserialize is successful, ret is set
- * to zero.
- */
- ret = -1;
- // Do not allow peer reset if there are any volumes in the cluster
- if (!list_empty (&priv->volumes)) {
- snprintf (msg_str, sizeof (msg_str), "volumes are already "
- "present in the cluster. Resetting uuid is not "
- "allowed");
- gf_log (this->name, GF_LOG_WARNING, "%s", msg_str);
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ ret = -1;
goto out;
}
- // Do not allow peer reset if trusted storage pool is already formed
- if (!list_empty (&priv->peers)) {
- snprintf (msg_str, sizeof (msg_str),"trusted storage pool "
- "has been already formed. Please detach this peer "
- "from the pool and reset its uuid.");
- gf_log (this->name, GF_LOG_WARNING, "%s", msg_str);
+ uuid_utoa_r (MY_UUID, uuid_str);
+ ret = dict_set_str (rsp_dict, "uuid", uuid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set uuid in "
+ "dictionary.");
goto out;
}
- uuid_copy (uuid, priv->uuid);
- ret = glusterd_uuid_generate_save ();
-
- if (!uuid_compare (uuid, MY_UUID)) {
- snprintf (msg_str, sizeof (msg_str), "old uuid and the new uuid"
- " are same. Try gluster peer reset again");
- gf_log (this->name, GF_LOG_ERROR, "%s", msg_str);
- ret = -1;
+ ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
+ &rsp.dict.dict_len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize "
+ "dictionary.");
goto out;
}
-
+ ret = 0;
out:
if (ret) {
rsp.op_ret = -1;
@@ -1208,22 +1589,22 @@ out:
snprintf (msg_str, sizeof (msg_str), "Operation "
"failed");
rsp.op_errstr = msg_str;
- ret = 0;
+
} else {
rsp.op_errstr = "";
+
}
glusterd_to_cli (req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_cli_rsp, dict);
- return ret;
+ return 0;
}
-
int
-glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
+glusterd_handle_cli_uuid_get (rpcsvc_request_t *req)
{
return glusterd_big_locked_handler (req,
- __glusterd_handle_cli_uuid_reset);
+ __glusterd_handle_cli_uuid_get);
}
int
@@ -1547,7 +1928,7 @@ __glusterd_handle_sync_volume (rpcsvc_request_t *req)
gf_log (this->name, GF_LOG_INFO, "Received volume sync req "
"for volume %s", (flags & GF_CLI_SYNC_ALL) ? "all" : volname);
- if (glusterd_is_local_addr (hostname)) {
+ if (gf_is_local_addr (hostname)) {
ret = -1;
snprintf (msg, sizeof (msg), "sync from localhost"
" not allowed");
@@ -1706,6 +2087,56 @@ glusterd_op_unlock_send_resp (rpcsvc_request_t *req, int32_t status)
}
int
+glusterd_op_mgmt_v3_lock_send_resp (rpcsvc_request_t *req, uuid_t *txn_id,
+ int32_t status)
+{
+
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ int ret = -1;
+
+ GF_ASSERT (req);
+ GF_ASSERT (txn_id);
+ glusterd_get_uuid (&rsp.uuid);
+ rsp.op_ret = status;
+ if (rsp.op_ret)
+ rsp.op_errno = errno;
+ uuid_copy (rsp.txn_id, *txn_id);
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to mgmt_v3 lock, ret: %d",
+ ret);
+
+ return ret;
+}
+
+int
+glusterd_op_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req, uuid_t *txn_id,
+ int32_t status)
+{
+
+ gd1_mgmt_v3_unlock_rsp rsp = {{0},};
+ int ret = -1;
+
+ GF_ASSERT (req);
+ GF_ASSERT (txn_id);
+ rsp.op_ret = status;
+ if (rsp.op_ret)
+ rsp.op_errno = errno;
+ glusterd_get_uuid (&rsp.uuid);
+ uuid_copy (rsp.txn_id, *txn_id);
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to mgmt_v3 unlock, ret: %d",
+ ret);
+
+ return ret;
+}
+
+int
__glusterd_handle_cluster_unlock (rpcsvc_request_t *req)
{
gd1_mgmt_cluster_unlock_req unlock_req = {{0}, };
@@ -1713,11 +2144,17 @@ __glusterd_handle_cluster_unlock (rpcsvc_request_t *req)
glusterd_op_lock_ctx_t *ctx = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
this = THIS;
GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
ret = xdr_to_generic (req->msg[0], &unlock_req,
(xdrproc_t)xdr_gd1_mgmt_cluster_unlock_req);
if (ret < 0) {
@@ -1747,8 +2184,9 @@ __glusterd_handle_cluster_unlock (rpcsvc_request_t *req)
}
uuid_copy (ctx->uuid, unlock_req.uuid);
ctx->req = req;
+ ctx->dict = NULL;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK, ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK, txn_id, ctx);
out:
glusterd_friend_sm ();
@@ -2194,7 +2632,18 @@ __glusterd_handle_probe_query (rpcsvc_request_t *req)
gf_log ("", GF_LOG_ERROR, "Failed to add peer %s",
remote_hostname);
rsp.op_errno = GF_PROBE_ADD_FAILED;
+ goto respond;
}
+ gf_log (THIS->name, GF_LOG_INFO,
+ "joining, should point etcd at %s", remote_hostname);
+ /*
+ * We should have started a standalone etcd before. Now we
+ * need a new one, with a new config.
+ */
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+ conf->etcd_pid = start_etcd (uuid_utoa(MY_UUID),
+ remote_hostname);
}
respond:
@@ -2339,8 +2788,10 @@ __glusterd_handle_mount (rpcsvc_request_t *req)
gf1_cli_mount_rsp rsp = {0,};
dict_t *dict = NULL;
int ret = 0;
+ glusterd_conf_t *priv = NULL;
GF_ASSERT (req);
+ priv = THIS->private;
ret = xdr_to_generic (req->msg[0], &mnt_req,
(xdrproc_t)xdr_gf1_cli_mount_req);
@@ -2373,8 +2824,10 @@ __glusterd_handle_mount (rpcsvc_request_t *req)
}
}
+ synclock_unlock (&priv->big_lock);
rsp.op_ret = glusterd_do_mount (mnt_req.label, dict,
&rsp.path, &rsp.op_errno);
+ synclock_lock (&priv->big_lock);
out:
if (!rsp.path)
@@ -2724,7 +3177,8 @@ out:
}
int
-glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port)
+glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
+ dict_t *dict)
{
int ret = -1;
glusterd_peerinfo_t *peerinfo = NULL;
@@ -2740,6 +3194,7 @@ glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port)
" for host: %s (%d)", hoststr, port);
args.mode = GD_MODE_ON;
args.req = req;
+ args.dict = dict;
ret = glusterd_friend_add ((char *)hoststr, port,
GD_FRIEND_STATE_DEFAULT,
NULL, &peerinfo, 0, &args);
@@ -2761,11 +3216,11 @@ glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port)
ret = glusterd_friend_sm_inject_event (event);
glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_SUCCESS,
NULL, (char*)hoststr,
- port);
+ port, dict);
}
} else {
glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_FRIEND, NULL,
- (char*)hoststr, port);
+ (char*)hoststr, port, dict);
}
out:
@@ -2775,7 +3230,7 @@ out:
int
glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
- uuid_t uuid)
+ uuid_t uuid, dict_t *dict)
{
int ret = -1;
glusterd_peerinfo_t *peerinfo = NULL;
@@ -2816,6 +3271,7 @@ glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
ctx->hostname = gf_strdup (hoststr);
ctx->port = port;
ctx->req = req;
+ ctx->dict = dict;
event->ctx = ctx;
@@ -2894,49 +3350,217 @@ glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *myhostname,
return ret;
}
+static void
+set_probe_error_str (int op_ret, int op_errno, char *op_errstr, char *errstr,
+ size_t len, char *hostname, int port)
+{
+ if ((op_errstr) && (strcmp (op_errstr, ""))) {
+ snprintf (errstr, len, "%s", op_errstr);
+ return;
+ }
+
+ if (!op_ret) {
+ switch (op_errno) {
+ case GF_PROBE_LOCALHOST:
+ snprintf (errstr, len, "Probe on localhost not "
+ "needed");
+ break;
+
+ case GF_PROBE_FRIEND:
+ snprintf (errstr, len, "Host %s port %d already"
+ " in peer list", hostname, port);
+ break;
+
+ default:
+ if (op_errno != 0)
+ snprintf (errstr, len, "Probe returned "
+ "with unknown errno %d",
+ op_errno);
+ break;
+ }
+ } else {
+ switch (op_errno) {
+ case GF_PROBE_ANOTHER_CLUSTER:
+ snprintf (errstr, len, "%s is already part of "
+ "another cluster", hostname);
+ break;
+
+ case GF_PROBE_VOLUME_CONFLICT:
+ snprintf (errstr, len, "Atleast one volume on "
+ "%s conflicts with existing volumes "
+ "in the cluster", hostname);
+ break;
+
+ case GF_PROBE_UNKNOWN_PEER:
+ snprintf (errstr, len, "%s responded with "
+ "'unknown peer' error, this could "
+ "happen if %s doesn't have localhost "
+ "in its peer database", hostname,
+ hostname);
+ break;
+
+ case GF_PROBE_ADD_FAILED:
+ snprintf (errstr, len, "Failed to add peer "
+ "information on %s", hostname);
+ break;
+
+ case GF_PROBE_SAME_UUID:
+ snprintf (errstr, len, "Peer uuid (host %s) is "
+ "same as local uuid", hostname);
+ break;
+
+ case GF_PROBE_QUORUM_NOT_MET:
+ snprintf (errstr, len, "Cluster quorum is not "
+ "met. Changing peers is not allowed "
+ "in this state");
+ break;
+
+ case GF_PROBE_MISSED_SNAP_CONFLICT:
+ snprintf (errstr, len, "Failed to update "
+ "list of missed snapshots from "
+ "peer %s", hostname);
+ break;
+
+ case GF_PROBE_SNAP_CONFLICT:
+ snprintf (errstr, len, "Conflict in comparing "
+ "list of snapshots from "
+ "peer %s", hostname);
+ break;
+
+ default:
+ snprintf (errstr, len, "Probe returned with "
+ "unknown errno %d", op_errno);
+ break;
+ }
+ }
+}
+
int
glusterd_xfer_cli_probe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr, char *hostname,
- int port)
+ int port, dict_t *dict)
{
- gf1_cli_probe_rsp rsp = {0, };
+ gf_cli_rsp rsp = {0,};
int32_t ret = -1;
+ char errstr[2048] = {0,};
+ char *cmd_str = NULL;
+ xlator_t *this = THIS;
GF_ASSERT (req);
+ GF_ASSERT (this);
+
+ (void) set_probe_error_str (op_ret, op_errno, op_errstr, errstr,
+ sizeof (errstr), hostname, port);
+
+ if (dict) {
+ ret = dict_get_str (dict, "cmd-str", &cmd_str);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "command string");
+ }
rsp.op_ret = op_ret;
rsp.op_errno = op_errno;
- rsp.op_errstr = op_errstr ? op_errstr : "";
- rsp.hostname = hostname;
- rsp.port = port;
+ rsp.op_errstr = (errstr[0] != '\0') ? errstr : "";
+
+ gf_cmd_log ("", "%s : %s %s %s", cmd_str,
+ (op_ret) ? "FAILED" : "SUCCESS",
+ (errstr[0] != '\0') ? ":" : " ",
+ (errstr[0] != '\0') ? errstr : " ");
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
- (xdrproc_t)xdr_gf1_cli_probe_rsp);
+ (xdrproc_t)xdr_gf_cli_rsp);
- gf_log ("glusterd", GF_LOG_INFO, "Responded to CLI, ret: %d",ret);
+ if (dict)
+ dict_unref (dict);
+ gf_log (this->name, GF_LOG_DEBUG, "Responded to CLI, ret: %d",ret);
return ret;
}
+static void
+set_deprobe_error_str (int op_ret, int op_errno, char *op_errstr, char *errstr,
+ size_t len, char *hostname)
+{
+ if ((op_errstr) && (strcmp (op_errstr, ""))) {
+ snprintf (errstr, len, "%s", op_errstr);
+ return;
+ }
+
+ if (op_ret) {
+ switch (op_errno) {
+ case GF_DEPROBE_LOCALHOST:
+ snprintf (errstr, len, "%s is localhost",
+ hostname);
+ break;
+
+ case GF_DEPROBE_NOT_FRIEND:
+ snprintf (errstr, len, "%s is not part of "
+ "cluster", hostname);
+ break;
+
+ case GF_DEPROBE_BRICK_EXIST:
+ snprintf (errstr, len, "Brick(s) with the peer "
+ "%s exist in cluster", hostname);
+ break;
+
+ case GF_DEPROBE_FRIEND_DOWN:
+ snprintf (errstr, len, "One of the peers is "
+ "probably down. Check with "
+ "'peer status'");
+ break;
+
+ case GF_DEPROBE_QUORUM_NOT_MET:
+ snprintf (errstr, len, "Cluster quorum is not "
+ "met. Changing peers is not allowed "
+ "in this state");
+ break;
+
+ default:
+ snprintf (errstr, len, "Detach returned with "
+ "unknown errno %d", op_errno);
+ break;
+
+ }
+ }
+}
+
+
int
glusterd_xfer_cli_deprobe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr,
- char *hostname)
+ char *hostname, dict_t *dict)
{
- gf1_cli_deprobe_rsp rsp = {0, };
+ gf_cli_rsp rsp = {0,};
int32_t ret = -1;
+ char *cmd_str = NULL;
+ char errstr[2048] = {0,};
GF_ASSERT (req);
+ (void) set_deprobe_error_str (op_ret, op_errno, op_errstr, errstr,
+ sizeof (errstr), hostname);
+
+ if (dict) {
+ ret = dict_get_str (dict, "cmd-str", &cmd_str);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to get "
+ "command string");
+ }
+
rsp.op_ret = op_ret;
rsp.op_errno = op_errno;
- rsp.op_errstr = op_errstr ? op_errstr : "";
- rsp.hostname = hostname;
+ rsp.op_errstr = (errstr[0] != '\0') ? errstr : "";
+
+ gf_cmd_log ("", "%s : %s %s %s", cmd_str,
+ (op_ret) ? "FAILED" : "SUCCESS",
+ (errstr[0] != '\0') ? ":" : " ",
+ (errstr[0] != '\0') ? errstr : " ");
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
- (xdrproc_t)xdr_gf1_cli_deprobe_rsp);
+ (xdrproc_t)xdr_gf_cli_rsp);
- gf_log ("glusterd", GF_LOG_INFO, "Responded to CLI, ret: %d",ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to CLI, ret: %d",ret);
return ret;
}
@@ -3134,10 +3758,13 @@ __glusterd_handle_status_volume (rpcsvc_request_t *req)
glusterd_op_t cli_op = GD_OP_STATUS_VOLUME;
char err_str[2048] = {0,};
xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
GF_ASSERT (req);
this = THIS;
GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
if (ret < 0) {
@@ -3178,6 +3805,14 @@ __glusterd_handle_status_volume (rpcsvc_request_t *req)
"Received status volume req for volume %s", volname);
}
+ if ((cmd & GF_CLI_STATUS_QUOTAD) &&
+ (conf->op_version == GD_OP_VERSION_MIN)) {
+ snprintf (err_str, sizeof (err_str), "The cluster is operating "
+ "at version 1. Getting the status of quotad is not "
+ "allowed in this state.");
+ ret = -1;
+ goto out;
+ }
ret = glusterd_op_begin_synctask (req, GD_OP_STATUS_VOLUME, dict);
@@ -3278,17 +3913,110 @@ glusterd_handle_cli_clearlocks_volume (rpcsvc_request_t *req)
__glusterd_handle_cli_clearlocks_volume);
}
+static int
+get_volinfo_from_brickid (char *brickid, glusterd_volinfo_t **volinfo)
+{
+ int ret = -1;
+ char *volid_str = NULL;
+ char *brick = NULL;
+ char *brickid_dup = NULL;
+ uuid_t volid = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (brickid);
+
+ brickid_dup = gf_strdup (brickid);
+ if (!brickid_dup)
+ goto out;
+
+ volid_str = brickid_dup;
+ brick = strchr (brickid_dup, ':');
+ if (!brick) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid brickid");
+ goto out;
+ }
+
+ *brick = '\0';
+ brick++;
+ uuid_parse (volid_str, volid);
+ ret = glusterd_volinfo_find_by_volume_id (volid, volinfo);
+ if (ret) {
+ /* Check if it is a snapshot volume */
+ ret = glusterd_snap_volinfo_find_by_volume_id (volid, volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to find volinfo");
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ GF_FREE (brickid_dup);
+ return ret;
+}
+
+static int
+get_brickinfo_from_brickid (char *brickid, glusterd_brickinfo_t **brickinfo)
+{
+ glusterd_volinfo_t *volinfo = NULL;
+ char *volid_str = NULL;
+ char *brick = NULL;
+ char *brickid_dup = NULL;
+ uuid_t volid = {0};
+ int ret = -1;
+
+ brickid_dup = gf_strdup (brickid);
+ if (!brickid_dup)
+ goto out;
+
+ volid_str = brickid_dup;
+ brick = strchr (brickid_dup, ':');
+ if (!volid_str || !brick)
+ goto out;
+
+ *brick = '\0';
+ brick++;
+ uuid_parse (volid_str, volid);
+ ret = glusterd_volinfo_find_by_volume_id (volid, &volinfo);
+ if (ret) {
+ /* Check if it a snapshot volume */
+ ret = glusterd_snap_volinfo_find_by_volume_id (volid, &volinfo);
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo,
+ brickinfo);
+ if (ret)
+ goto out;
+
+ ret = 0;
+out:
+ GF_FREE (brickid_dup);
+ return ret;
+}
+
int
__glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
- rpc_clnt_event_t event, void *data)
+ rpc_clnt_event_t event, void *data)
{
- xlator_t *this = NULL;
- glusterd_conf_t *conf = NULL;
- int ret = 0;
- glusterd_brickinfo_t *brickinfo = NULL;
+ char *brickid = NULL;
+ int ret = 0;
+ glusterd_conf_t *conf = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ xlator_t *this = NULL;
+
+ brickid = mydata;
+ if (!brickid)
+ return 0;
- brickinfo = mydata;
- if (!brickinfo)
+ ret = get_brickinfo_from_brickid (brickid, &brickinfo);
+ if (ret)
return 0;
this = THIS;
@@ -3298,23 +4026,63 @@ __glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
switch (event) {
case RPC_CLNT_CONNECT:
- gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_CONNECT");
+ /* If a node on coming back up, already starts a brick
+ * before the handshake, and the notification comes after
+ * the handshake is done, then we need to check if this
+ * is a restored brick with a snapshot pending. If so, we
+ * need to stop the brick
+ */
+ if (brickinfo->snap_status == -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Snapshot is pending on %s:%s. "
+ "Hence not starting the brick",
+ brickinfo->hostname,
+ brickinfo->path);
+ ret = get_volinfo_from_brickid (brickid, &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get volinfo from "
+ "brickid(%s)", brickid);
+ goto out;
+ }
+
+ ret = glusterd_brick_stop (volinfo, brickinfo,
+ _gf_false);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Unable to stop %s:%s",
+ brickinfo->hostname, brickinfo->path);
+ goto out;
+ }
+
+ break;
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "Connected to %s:%s",
+ brickinfo->hostname, brickinfo->path);
glusterd_set_brick_status (brickinfo, GF_BRICK_STARTED);
ret = default_notify (this, GF_EVENT_CHILD_UP, NULL);
break;
case RPC_CLNT_DISCONNECT:
- gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_DISCONNECT");
+ if (GF_BRICK_STARTED == brickinfo->status)
+ gf_log (this->name, GF_LOG_INFO, "Disconnected from "
+ "%s:%s", brickinfo->hostname, brickinfo->path);
+
glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
break;
+ case RPC_CLNT_DESTROY:
+ GF_FREE (mydata);
+ mydata = NULL;
+ break;
default:
gf_log (this->name, GF_LOG_TRACE,
"got some other RPC event %d", event);
break;
}
+out:
return ret;
}
@@ -3382,11 +4150,13 @@ glusterd_friend_remove_notify (glusterd_peerctx_t *peerctx)
glusterd_peerinfo_t *peerinfo = peerctx->peerinfo;
rpcsvc_request_t *req = peerctx->args.req;
char *errstr = peerctx->errstr;
+ dict_t *dict = NULL;
GF_ASSERT (peerctx);
peerinfo = peerctx->peerinfo;
req = peerctx->args.req;
+ dict = peerctx->args.dict;
errstr = peerctx->errstr;
ret = glusterd_friend_sm_new_event (GD_FRIEND_EVENT_REMOVE_FRIEND,
@@ -3400,7 +4170,8 @@ glusterd_friend_remove_notify (glusterd_peerctx_t *peerctx)
}
glusterd_xfer_cli_probe_resp (req, -1, ENOTCONN, errstr,
- peerinfo->hostname, peerinfo->port);
+ peerinfo->hostname,
+ peerinfo->port, dict);
new_event->peerinfo = peerinfo;
ret = glusterd_friend_sm_inject_event (new_event);
@@ -3425,6 +4196,8 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
glusterd_peerinfo_t *peerinfo = NULL;
glusterd_peerctx_t *peerctx = NULL;
gf_boolean_t quorum_action = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
+ uuid_t uuid;
peerctx = mydata;
if (!peerctx)
@@ -3452,6 +4225,31 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_DISCONNECT %d",
peerinfo->state.state);
+ if (peerinfo->connected) {
+ if (conf->op_version < GD_OP_VERSION_4) {
+ glusterd_get_lock_owner (&uuid);
+ if (!uuid_is_null (uuid) &&
+ !uuid_compare (peerinfo->uuid, uuid))
+ glusterd_unlock (peerinfo->uuid);
+ } else {
+ list_for_each_entry (volinfo, &conf->volumes,
+ vol_list) {
+ ret = glusterd_mgmt_v3_unlock
+ (volinfo->volname,
+ peerinfo->uuid,
+ "vol");
+ if (ret)
+ gf_log (this->name,
+ GF_LOG_TRACE,
+ "Lock not released "
+ "for %s",
+ volinfo->volname);
+ }
+ }
+
+ ret = 0;
+ }
+
if ((peerinfo->quorum_contrib != QUORUM_DOWN) &&
(peerinfo->state.state == GD_FRIEND_STATE_BEFRIENDED)) {
peerinfo->quorum_contrib = QUORUM_DOWN;
@@ -3501,11 +4299,11 @@ glusterd_null (rpcsvc_request_t *req)
}
rpcsvc_actor_t gd_svc_mgmt_actors[] = {
- [GLUSTERD_MGMT_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0},
- [GLUSTERD_MGMT_CLUSTER_LOCK] = { "CLUSTER_LOCK", GLUSTERD_MGMT_CLUSTER_LOCK, glusterd_handle_cluster_lock, NULL, 0},
- [GLUSTERD_MGMT_CLUSTER_UNLOCK] = { "CLUSTER_UNLOCK", GLUSTERD_MGMT_CLUSTER_UNLOCK, glusterd_handle_cluster_unlock, NULL, 0},
- [GLUSTERD_MGMT_STAGE_OP] = { "STAGE_OP", GLUSTERD_MGMT_STAGE_OP, glusterd_handle_stage_op, NULL, 0},
- [GLUSTERD_MGMT_COMMIT_OP] = { "COMMIT_OP", GLUSTERD_MGMT_COMMIT_OP, glusterd_handle_commit_op, NULL, 0},
+ [GLUSTERD_MGMT_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_CLUSTER_LOCK] = { "CLUSTER_LOCK", GLUSTERD_MGMT_CLUSTER_LOCK, glusterd_handle_cluster_lock, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_CLUSTER_UNLOCK] = { "CLUSTER_UNLOCK", GLUSTERD_MGMT_CLUSTER_UNLOCK, glusterd_handle_cluster_unlock, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_STAGE_OP] = { "STAGE_OP", GLUSTERD_MGMT_STAGE_OP, glusterd_handle_stage_op, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_COMMIT_OP] = { "COMMIT_OP", GLUSTERD_MGMT_COMMIT_OP, glusterd_handle_commit_op, NULL, 0, DRC_NA},
};
struct rpcsvc_program gd_svc_mgmt_prog = {
@@ -3518,11 +4316,11 @@ struct rpcsvc_program gd_svc_mgmt_prog = {
};
rpcsvc_actor_t gd_svc_peer_actors[] = {
- [GLUSTERD_FRIEND_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0},
- [GLUSTERD_PROBE_QUERY] = { "PROBE_QUERY", GLUSTERD_PROBE_QUERY, glusterd_handle_probe_query, NULL, 0},
- [GLUSTERD_FRIEND_ADD] = { "FRIEND_ADD", GLUSTERD_FRIEND_ADD, glusterd_handle_incoming_friend_req, NULL, 0},
- [GLUSTERD_FRIEND_REMOVE] = { "FRIEND_REMOVE", GLUSTERD_FRIEND_REMOVE, glusterd_handle_incoming_unfriend_req, NULL, 0},
- [GLUSTERD_FRIEND_UPDATE] = { "FRIEND_UPDATE", GLUSTERD_FRIEND_UPDATE, glusterd_handle_friend_update, NULL, 0},
+ [GLUSTERD_FRIEND_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0, DRC_NA},
+ [GLUSTERD_PROBE_QUERY] = { "PROBE_QUERY", GLUSTERD_PROBE_QUERY, glusterd_handle_probe_query, NULL, 0, DRC_NA},
+ [GLUSTERD_FRIEND_ADD] = { "FRIEND_ADD", GLUSTERD_FRIEND_ADD, glusterd_handle_incoming_friend_req, NULL, 0, DRC_NA},
+ [GLUSTERD_FRIEND_REMOVE] = { "FRIEND_REMOVE", GLUSTERD_FRIEND_REMOVE, glusterd_handle_incoming_unfriend_req, NULL, 0, DRC_NA},
+ [GLUSTERD_FRIEND_UPDATE] = { "FRIEND_UPDATE", GLUSTERD_FRIEND_UPDATE, glusterd_handle_friend_update, NULL, 0, DRC_NA},
};
struct rpcsvc_program gd_svc_peer_prog = {
@@ -3537,38 +4335,39 @@ struct rpcsvc_program gd_svc_peer_prog = {
rpcsvc_actor_t gd_svc_cli_actors[] = {
- [GLUSTER_CLI_PROBE] = { "CLI_PROBE", GLUSTER_CLI_PROBE, glusterd_handle_cli_probe, NULL, 0},
- [GLUSTER_CLI_CREATE_VOLUME] = { "CLI_CREATE_VOLUME", GLUSTER_CLI_CREATE_VOLUME, glusterd_handle_create_volume, NULL, 0},
- [GLUSTER_CLI_DEFRAG_VOLUME] = { "CLI_DEFRAG_VOLUME", GLUSTER_CLI_DEFRAG_VOLUME, glusterd_handle_defrag_volume, NULL, 0},
- [GLUSTER_CLI_DEPROBE] = { "FRIEND_REMOVE", GLUSTER_CLI_DEPROBE, glusterd_handle_cli_deprobe, NULL, 0},
- [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0},
- [GLUSTER_CLI_UUID_RESET] = { "UUID_RESET", GLUSTER_CLI_UUID_RESET, glusterd_handle_cli_uuid_reset, NULL, 0},
- [GLUSTER_CLI_START_VOLUME] = { "START_VOLUME", GLUSTER_CLI_START_VOLUME, glusterd_handle_cli_start_volume, NULL, 0},
- [GLUSTER_CLI_STOP_VOLUME] = { "STOP_VOLUME", GLUSTER_CLI_STOP_VOLUME, glusterd_handle_cli_stop_volume, NULL, 0},
- [GLUSTER_CLI_DELETE_VOLUME] = { "DELETE_VOLUME", GLUSTER_CLI_DELETE_VOLUME, glusterd_handle_cli_delete_volume, NULL, 0},
- [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0},
- [GLUSTER_CLI_ADD_BRICK] = { "ADD_BRICK", GLUSTER_CLI_ADD_BRICK, glusterd_handle_add_brick, NULL, 0},
- [GLUSTER_CLI_REPLACE_BRICK] = { "REPLACE_BRICK", GLUSTER_CLI_REPLACE_BRICK, glusterd_handle_replace_brick, NULL, 0},
- [GLUSTER_CLI_REMOVE_BRICK] = { "REMOVE_BRICK", GLUSTER_CLI_REMOVE_BRICK, glusterd_handle_remove_brick, NULL, 0},
- [GLUSTER_CLI_LOG_ROTATE] = { "LOG FILENAME", GLUSTER_CLI_LOG_ROTATE, glusterd_handle_log_rotate, NULL, 0},
- [GLUSTER_CLI_SET_VOLUME] = { "SET_VOLUME", GLUSTER_CLI_SET_VOLUME, glusterd_handle_set_volume, NULL, 0},
- [GLUSTER_CLI_SYNC_VOLUME] = { "SYNC_VOLUME", GLUSTER_CLI_SYNC_VOLUME, glusterd_handle_sync_volume, NULL, 0},
- [GLUSTER_CLI_RESET_VOLUME] = { "RESET_VOLUME", GLUSTER_CLI_RESET_VOLUME, glusterd_handle_reset_volume, NULL, 0},
- [GLUSTER_CLI_FSM_LOG] = { "FSM_LOG", GLUSTER_CLI_FSM_LOG, glusterd_handle_fsm_log, NULL, 0},
- [GLUSTER_CLI_GSYNC_SET] = { "GSYNC_SET", GLUSTER_CLI_GSYNC_SET, glusterd_handle_gsync_set, NULL, 0},
- [GLUSTER_CLI_PROFILE_VOLUME] = { "STATS_VOLUME", GLUSTER_CLI_PROFILE_VOLUME, glusterd_handle_cli_profile_volume, NULL, 0},
- [GLUSTER_CLI_QUOTA] = { "QUOTA", GLUSTER_CLI_QUOTA, glusterd_handle_quota, NULL, 0},
- [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1},
- [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0},
- [GLUSTER_CLI_MOUNT] = { "MOUNT", GLUSTER_CLI_MOUNT, glusterd_handle_mount, NULL, 1},
- [GLUSTER_CLI_UMOUNT] = { "UMOUNT", GLUSTER_CLI_UMOUNT, glusterd_handle_umount, NULL, 1},
- [GLUSTER_CLI_HEAL_VOLUME] = { "HEAL_VOLUME", GLUSTER_CLI_HEAL_VOLUME, glusterd_handle_cli_heal_volume, NULL, 0},
- [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", GLUSTER_CLI_STATEDUMP_VOLUME, glusterd_handle_cli_statedump_volume, NULL, 0},
- [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0},
- [GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", GLUSTER_CLI_CLRLOCKS_VOLUME, glusterd_handle_cli_clearlocks_volume, NULL, 0},
-#ifdef HAVE_BD_XLATOR
- [GLUSTER_CLI_BD_OP] = {"BD_OP", GLUSTER_CLI_BD_OP, glusterd_handle_cli_bd_op, NULL, 0},
-#endif
+ [GLUSTER_CLI_PROBE] = { "CLI_PROBE", GLUSTER_CLI_PROBE, glusterd_handle_cli_probe, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_CREATE_VOLUME] = { "CLI_CREATE_VOLUME", GLUSTER_CLI_CREATE_VOLUME, glusterd_handle_create_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_DEFRAG_VOLUME] = { "CLI_DEFRAG_VOLUME", GLUSTER_CLI_DEFRAG_VOLUME, glusterd_handle_defrag_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_DEPROBE] = { "FRIEND_REMOVE", GLUSTER_CLI_DEPROBE, glusterd_handle_cli_deprobe, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_UUID_RESET] = { "UUID_RESET", GLUSTER_CLI_UUID_RESET, glusterd_handle_cli_uuid_reset, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_UUID_GET] = { "UUID_GET", GLUSTER_CLI_UUID_GET, glusterd_handle_cli_uuid_get, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_START_VOLUME] = { "START_VOLUME", GLUSTER_CLI_START_VOLUME, glusterd_handle_cli_start_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_STOP_VOLUME] = { "STOP_VOLUME", GLUSTER_CLI_STOP_VOLUME, glusterd_handle_cli_stop_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_DELETE_VOLUME] = { "DELETE_VOLUME", GLUSTER_CLI_DELETE_VOLUME, glusterd_handle_cli_delete_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_ADD_BRICK] = { "ADD_BRICK", GLUSTER_CLI_ADD_BRICK, glusterd_handle_add_brick, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_REPLACE_BRICK] = { "REPLACE_BRICK", GLUSTER_CLI_REPLACE_BRICK, glusterd_handle_replace_brick, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_REMOVE_BRICK] = { "REMOVE_BRICK", GLUSTER_CLI_REMOVE_BRICK, glusterd_handle_remove_brick, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LOG_ROTATE] = { "LOG FILENAME", GLUSTER_CLI_LOG_ROTATE, glusterd_handle_log_rotate, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SET_VOLUME] = { "SET_VOLUME", GLUSTER_CLI_SET_VOLUME, glusterd_handle_set_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SYNC_VOLUME] = { "SYNC_VOLUME", GLUSTER_CLI_SYNC_VOLUME, glusterd_handle_sync_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_RESET_VOLUME] = { "RESET_VOLUME", GLUSTER_CLI_RESET_VOLUME, glusterd_handle_reset_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_FSM_LOG] = { "FSM_LOG", GLUSTER_CLI_FSM_LOG, glusterd_handle_fsm_log, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GSYNC_SET] = { "GSYNC_SET", GLUSTER_CLI_GSYNC_SET, glusterd_handle_gsync_set, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_PROFILE_VOLUME] = { "STATS_VOLUME", GLUSTER_CLI_PROFILE_VOLUME, glusterd_handle_cli_profile_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_QUOTA] = { "QUOTA", GLUSTER_CLI_QUOTA, glusterd_handle_quota, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_MOUNT] = { "MOUNT", GLUSTER_CLI_MOUNT, glusterd_handle_mount, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_UMOUNT] = { "UMOUNT", GLUSTER_CLI_UMOUNT, glusterd_handle_umount, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_HEAL_VOLUME] = { "HEAL_VOLUME", GLUSTER_CLI_HEAL_VOLUME, glusterd_handle_cli_heal_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", GLUSTER_CLI_STATEDUMP_VOLUME, glusterd_handle_cli_statedump_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", GLUSTER_CLI_CLRLOCKS_VOLUME, glusterd_handle_cli_clearlocks_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_COPY_FILE] = {"COPY_FILE", GLUSTER_CLI_COPY_FILE, glusterd_handle_copy_file, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SYS_EXEC] = {"SYS_EXEC", GLUSTER_CLI_SYS_EXEC, glusterd_handle_sys_exec, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SNAP] = {"SNAP", GLUSTER_CLI_SNAP, glusterd_handle_snapshot, NULL, 0, DRC_NA},
};
struct rpcsvc_program gd_svc_cli_prog = {
@@ -3579,3 +4378,24 @@ struct rpcsvc_program gd_svc_cli_prog = {
.actors = gd_svc_cli_actors,
.synctask = _gf_true,
};
+
+/* This is a minimal RPC prog, which contains only the readonly RPC procs from
+ * the cli rpcsvc
+ */
+rpcsvc_actor_t gd_svc_cli_actors_ro[] = {
+ [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_UUID_GET] = { "UUID_GET", GLUSTER_CLI_UUID_GET, glusterd_handle_cli_uuid_get, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0, DRC_NA},
+};
+
+struct rpcsvc_program gd_svc_cli_prog_ro = {
+ .progname = "GlusterD svc cli read-only",
+ .prognum = GLUSTER_CLI_PROGRAM,
+ .progver = GLUSTER_CLI_VERSION,
+ .numactors = GLUSTER_CLI_MAXVALUE,
+ .actors = gd_svc_cli_actors_ro,
+ .synctask = _gf_true,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
index 9124c46ee..5078526e9 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
@@ -30,26 +30,145 @@
extern struct rpc_clnt_program gd_peer_prog;
extern struct rpc_clnt_program gd_mgmt_prog;
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
#define TRUSTED_PREFIX "trusted-"
typedef ssize_t (*gfs_serialize_t) (struct iovec outmsg, void *data);
+static int
+get_snap_volname_and_volinfo (const char *volpath, char **volname,
+ glusterd_volinfo_t **volinfo)
+{
+ int ret = -1;
+ char *save_ptr = NULL;
+ char *str_token = NULL;
+ char *snapname = NULL;
+ char *volname_token = NULL;
+ char *vol = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (volpath);
+ GF_ASSERT (volinfo);
+
+ str_token = gf_strdup (volpath);
+ if (NULL == str_token) {
+ goto out;
+ }
+
+ /* The input volpath has one of the formats below:
+ * /snaps/<snapname>/<volname>.<hostname>
+ * or
+ * /snaps/<snapname>/<parent-volname>
+ * We need to extract the snapname and the volume name */
+
+ /*split string by "/" */
+ strtok_r (str_token, "/", &save_ptr);
+ snapname = strtok_r(NULL, "/", &save_ptr);
+ if (!snapname) {
+ gf_log(this->name, GF_LOG_ERROR, "Invalid path: %s", volpath);
+ goto out;
+ }
+
+ volname_token = strtok_r(NULL, "/", &save_ptr);
+ if (!volname_token) {
+ gf_log(this->name, GF_LOG_ERROR, "Invalid path: %s", volpath);
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to "
+ "fetch snap %s", snapname);
+ goto out;
+ }
+
+ /* Find out whether it is a parent volume name or a snap volume
+ * name. glusterd_volinfo_find will succeed if volname_token
+ * is a parent volname
+ */
+ ret = glusterd_volinfo_find (volname_token, volinfo);
+ if (ret) {
+ *volname = gf_strdup (volname_token);
+ if (NULL == *volname) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_snap_volinfo_find (volname_token, snap,
+ volinfo);
+ if (ret) {
+ /* Split the volume name */
+ vol = strtok_r (volname_token, ".", &save_ptr);
+ if (!vol) {
+ gf_log(this->name, GF_LOG_ERROR, "Invalid "
+ "volname (%s)", volname_token);
+ goto out;
+ }
+
+ ret = glusterd_snap_volinfo_find (vol, snap, volinfo);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to "
+ "fetch snap volume from volname (%s)",
+ vol);
+ goto out;
+ }
+ }
+ } else {
+ /*volname_token is parent volname*/
+ ret = glusterd_snap_volinfo_find_from_parent_volname (
+ volname_token, snap, volinfo);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to "
+ "fetch snap volume from parent "
+ "volname (%s)", volname_token);
+ goto out;
+ }
+
+ /* Since volname_token is a parent volname we should
+ * get the snap volname here*/
+ *volname = gf_strdup ((*volinfo)->volname);
+ if (NULL == *volname) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+out:
+ if (ret && NULL != *volname) {
+ GF_FREE (*volname);
+ *volname = NULL;
+ }
+ return ret;
+}
+
static size_t
build_volfile_path (const char *volname, char *path,
size_t path_len, char *trusted_str)
{
- struct stat stbuf = {0,};
- int32_t ret = -1;
- glusterd_conf_t *priv = NULL;
- char *vol = NULL;
- char *dup_volname = NULL;
- char *free_ptr = NULL;
- char *tmp = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char *server = NULL;
-
- priv = THIS->private;
+ struct stat stbuf = {0,};
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ char *vol = NULL;
+ char *dup_volname = NULL;
+ char *free_ptr = NULL;
+ char *save_ptr = NULL;
+ char *str_token = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *server = NULL;
+ const char *volname_ptr = NULL;
+ char path_prefix [PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volname);
+ GF_ASSERT (path);
if (strstr (volname, "gluster/")) {
server = strchr (volname, '/') + 1;
@@ -57,6 +176,22 @@ build_volfile_path (const char *volname, char *path,
path, path_len);
ret = 1;
goto out;
+ } else if ((str_token = strstr (volname, "/snaps/"))) {
+ ret = get_snap_volname_and_volinfo (str_token, &dup_volname,
+ &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get snap"
+ " volinfo from path (%s)", volname);
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (path_prefix, sizeof (path_prefix), "%s/snaps/%s",
+ priv->workdir, volinfo->snapshot->snapname);
+
+ free_ptr = dup_volname;
+ volname_ptr = dup_volname;
+ goto gotvolinfo;
} else if (volname[0] != '/') {
/* Normal behavior */
dup_volname = gf_strdup (volname);
@@ -67,39 +202,53 @@ build_volfile_path (const char *volname, char *path,
dup_volname = gf_strdup (&volname[1]);
}
+ if (!dup_volname) {
+ gf_log(THIS->name, GF_LOG_ERROR, "strdup failed");
+ ret = -1;
+ goto out;
+ }
free_ptr = dup_volname;
+ volname_ptr = volname;
+
+ snprintf (path_prefix, sizeof (path_prefix), "%s/vols",
+ priv->workdir);
ret = glusterd_volinfo_find (dup_volname, &volinfo);
+
if (ret) {
/* Split the volume name */
- vol = strtok_r (dup_volname, ".", &tmp);
+ vol = strtok_r (dup_volname, ".", &save_ptr);
if (!vol)
goto out;
+
ret = glusterd_volinfo_find (vol, &volinfo);
if (ret)
goto out;
}
+gotvolinfo:
if (!glusterd_auth_get_username (volinfo))
trusted_str = NULL;
- ret = snprintf (path, path_len, "%s/vols/%s/%s.vol",
- priv->workdir, volinfo->volname, volname);
+ ret = snprintf (path, path_len, "%s/%s/%s.vol", path_prefix,
+ volinfo->volname, volname_ptr);
if (ret == -1)
goto out;
ret = stat (path, &stbuf);
if ((ret == -1) && (errno == ENOENT)) {
- snprintf (path, path_len, "%s/vols/%s/%s%s-fuse.vol",
- priv->workdir, volinfo->volname,
- (trusted_str ? trusted_str : ""), dup_volname);
+ snprintf (path, path_len, "%s/%s/%s%s-fuse.vol",
+ path_prefix, volinfo->volname,
+ (trusted_str ? trusted_str : ""),
+ dup_volname);
+
ret = stat (path, &stbuf);
}
if ((ret == -1) && (errno == ENOENT)) {
- snprintf (path, path_len, "%s/vols/%s/%s-tcp.vol",
- priv->workdir, volinfo->volname, volname);
+ snprintf (path, path_len, "%s/%s/%s-tcp.vol",
+ path_prefix, volinfo->volname, volname_ptr);
}
ret = 1;
@@ -244,16 +393,22 @@ __server_getspec (rpcsvc_request_t *req)
}
trans = req->trans;
+ /* addrstr will be empty for cli socket connections */
ret = rpcsvc_transport_peername (trans, (char *)&addrstr,
sizeof (addrstr));
if (ret)
goto fail;
- tmp = strrchr (addrstr, ':');
- *tmp = '\0';
+ tmp = strrchr (addrstr, ':');
+ if (tmp)
+ *tmp = '\0';
- /* we trust the local admin */
- if (glusterd_is_local_addr (addrstr)) {
+ /* The trusted volfiles are given to glusterd-owned processes like the NFS
+ * server, self-heal daemon etc., so that they are not inadvertently
+ * blocked by an auth.{allow,reject} setting. The trusted volfile is not
+ * meant for external users.
+ */
+ if (strlen (addrstr) && gf_is_local_addr (addrstr)) {
ret = build_volfile_path (volume, filename,
sizeof (filename),
@@ -408,12 +563,16 @@ gd_validate_cluster_op_version (xlator_t *this, int cluster_op_version,
goto out;
}
- if (cluster_op_version < conf->op_version) {
+ /* The peer can only reduce its op-version when it doesn't have any
+ * volumes. Reducing op-version when it already contains volumes can
+ * lead to inconsistencies in the cluster
+ */
+ if ((cluster_op_version < conf->op_version) &&
+ !list_empty (&conf->volumes)) {
gf_log (this->name, GF_LOG_ERROR,
- "operating version %d is less than the currently "
- "running version (%d) on the machine (as per peer "
- "request from %s)", cluster_op_version,
- conf->op_version, peerid);
+ "cannot reduce operating version to %d from current "
+ "version %d as volumes exist (as per peer request from "
+ "%s)", cluster_op_version, conf->op_version, peerid);
goto out;
}
@@ -582,11 +741,9 @@ glusterd_mgmt_hndsk_versions_ack (rpcsvc_request_t *req)
}
rpcsvc_actor_t gluster_handshake_actors[] = {
- [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, NULL, NULL, 0},
- [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC,
- server_getspec, NULL, 0},
- [GF_HNDSK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_HNDSK_EVENT_NOTIFY,
- server_event_notify, NULL, 0},
+ [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, NULL, NULL, 0, DRC_NA},
+ [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA},
+ [GF_HNDSK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_HNDSK_EVENT_NOTIFY, server_event_notify, NULL, 0, DRC_NA},
};
@@ -598,6 +755,19 @@ struct rpcsvc_program gluster_handshake_prog = {
.numactors = GF_HNDSK_MAXVALUE,
};
+/* A minimal RPC program just for the cli getspec command */
+rpcsvc_actor_t gluster_cli_getspec_actors[] = {
+ [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA},
+};
+
+struct rpcsvc_program gluster_cli_getspec_prog = {
+ .progname = "Gluster Handshake (CLI Getspec)",
+ .prognum = GLUSTER_HNDSK_PROGRAM,
+ .progver = GLUSTER_HNDSK_VERSION,
+ .actors = gluster_cli_getspec_actors,
+ .numactors = GF_HNDSK_MAXVALUE,
+};
+
char *glusterd_dump_proc[GF_DUMP_MAXVALUE] = {
[GF_DUMP_NULL] = "NULL",
@@ -677,6 +847,7 @@ glusterd_event_connected_inject (glusterd_peerctx_t *peerctx)
ctx->hostname = gf_strdup (peerinfo->hostname);
ctx->port = peerinfo->port;
ctx->req = peerctx->args.req;
+ ctx->dict = peerctx->args.dict;
event->peerinfo = peerinfo;
event->ctx = ctx;
@@ -734,16 +905,6 @@ gd_validate_peer_op_version (xlator_t *this, glusterd_peerinfo_t *peerinfo,
goto out;
}
- /* If peer is already operating at a higher op_version reject it.
- * Cluster cannot be moved to higher op_version to accomodate a peer.
- */
- if (peer_op_version > conf->op_version) {
- ret = gf_asprintf (errstr, "Peer %s is already at a higher "
- "op-version", peerinfo->hostname);
- ret = -1;
- goto out;
- }
-
ret = 0;
out:
gf_log (this->name , GF_LOG_DEBUG, "Peer %s %s", peerinfo->hostname,
@@ -800,6 +961,7 @@ __glusterd_mgmt_hndsk_version_ack_cbk (struct rpc_req *req, struct iovec *iov,
*/
peerinfo->mgmt = &gd_mgmt_prog;
peerinfo->peer = &gd_peer_prog;
+ peerinfo->mgmt_v3 = &gd_mgmt_v3_prog;
ret = default_notify (this, GF_EVENT_CHILD_UP, NULL);
@@ -1026,6 +1188,15 @@ glusterd_set_clnt_mgmt_program (glusterd_peerinfo_t *peerinfo,
peerinfo->peer->progname, peerinfo->peer->prognum,
peerinfo->peer->progver);
}
+
+ if (peerinfo->mgmt_v3) {
+ gf_log ("", GF_LOG_INFO,
+ "Using Program %s, Num (%d), Version (%d)",
+ peerinfo->mgmt_v3->progname,
+ peerinfo->mgmt_v3->prognum,
+ peerinfo->mgmt_v3->progver);
+ }
+
ret = 0;
out:
return ret;
diff --git a/xlators/mgmt/glusterd/src/glusterd-hooks.c b/xlators/mgmt/glusterd/src/glusterd-hooks.c
index a61e1e85f..78730a564 100644
--- a/xlators/mgmt/glusterd/src/glusterd-hooks.c
+++ b/xlators/mgmt/glusterd/src/glusterd-hooks.c
@@ -49,6 +49,7 @@ char glusterd_hook_dirnames[GD_OP_MAX][256] =
[GD_OP_RESET_VOLUME] = EMPTY,
[GD_OP_SYNC_VOLUME] = EMPTY,
[GD_OP_LOG_ROTATE] = EMPTY,
+ [GD_OP_GSYNC_CREATE] = "gsync-create",
[GD_OP_GSYNC_SET] = EMPTY,
[GD_OP_PROFILE_VOLUME] = EMPTY,
[GD_OP_QUOTA] = EMPTY,
@@ -142,6 +143,24 @@ glusterd_hooks_get_hooks_cmd_subdir (glusterd_op_t op)
return glusterd_hook_dirnames[op];
}
+void
+glusterd_hooks_add_working_dir (runner_t *runner, glusterd_conf_t *priv)
+{
+ runner_argprintf (runner, "--gd-workdir=%s", priv->workdir);
+}
+
+void
+glusterd_hooks_add_op (runner_t *runner, char *op)
+{
+ runner_argprintf (runner, "--volume-op=%s", op);
+}
+
+void
+glusterd_hooks_add_hooks_version (runner_t* runner)
+{
+ runner_argprintf (runner, "--version=%d", GLUSTERD_HOOK_VER);
+}
+
int
glusterd_hooks_set_volume_args (dict_t *dict, runner_t *runner)
{
@@ -162,7 +181,7 @@ glusterd_hooks_set_volume_args (dict_t *dict, runner_t *runner)
goto out;
runner_add_arg (runner, "-o");
- for (i = 1; (ret == 0); i++) {
+ for (i = 1; ret == 0; i++) {
snprintf (query, sizeof (query), "key%d", i);
ret = dict_get_str (dict, query, &key);
if (ret)
@@ -185,6 +204,7 @@ static int
glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op,
dict_t *op_ctx, glusterd_commit_hook_type_t type)
{
+ char *hooks_args = NULL;
int vol_count = 0;
gf_boolean_t truth = _gf_false;
glusterd_volinfo_t *voliter = NULL;
@@ -214,6 +234,11 @@ glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op,
runner_argprintf (runner, "--first=%s",
truth? "yes":"no");
+
+ glusterd_hooks_add_hooks_version (runner);
+ glusterd_hooks_add_op (runner, "start");
+ glusterd_hooks_add_working_dir (runner, priv);
+
break;
case GD_OP_STOP_VOLUME:
@@ -236,6 +261,23 @@ glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op,
ret = glusterd_hooks_set_volume_args (op_ctx, runner);
break;
+ case GD_OP_GSYNC_CREATE:
+ ret = dict_get_str (op_ctx, "hooks_args", &hooks_args);
+ if (ret)
+ gf_log ("", GF_LOG_DEBUG,
+ "No Hooks Arguments.");
+ else
+ gf_log ("", GF_LOG_DEBUG,
+ "Hooks Args = %s", hooks_args);
+ if (hooks_args)
+ runner_argprintf (runner, "%s", hooks_args);
+ break;
+
+ case GD_OP_ADD_BRICK:
+ glusterd_hooks_add_hooks_version (runner);
+ glusterd_hooks_add_op (runner, "add-brick");
+ glusterd_hooks_add_working_dir (runner, priv);
+
default:
break;
diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.c b/xlators/mgmt/glusterd/src/glusterd-locks.c
new file mode 100644
index 000000000..28358aa55
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-locks.c
@@ -0,0 +1,656 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "common-utils.h"
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-locks.h"
+#include "run.h"
+#include "syscall.h"
+
+#include <signal.h>
+
+#define GF_MAX_LOCKING_ENTITIES 2
+
+/* Valid entities that the mgmt_v3 lock can hold locks upon *
+ * To add newer entities to be locked, we can just add more *
+ * entries to this table along with the type and default value */
+glusterd_valid_entities valid_types[] = {
+ { "vol", _gf_true },
+ { "snap", _gf_false },
+ { NULL },
+};
+
+/* Checks if the lock request is for a valid entity */
+gf_boolean_t
+glusterd_mgmt_v3_is_type_valid (char *type)
+{
+ int32_t i = 0;
+ gf_boolean_t ret = _gf_false;
+
+ GF_ASSERT (type);
+
+ for (i = 0; valid_types[i].type; i++) {
+ if (!strcmp (type, valid_types[i].type)) {
+ ret = _gf_true;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Initialize the global mgmt_v3 lock list(dict) when
+ * glusterd is spawned */
+int32_t
+glusterd_mgmt_v3_lock_init ()
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ priv->mgmt_v3_lock = dict_new ();
+ if (!priv->mgmt_v3_lock)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/* Destroy the global mgmt_v3 lock list(dict) when
+ * glusterd cleanup is performed */
+void
+glusterd_mgmt_v3_lock_fini ()
+{
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (priv->mgmt_v3_lock)
+ dict_unref (priv->mgmt_v3_lock);
+}
+
+int32_t
+glusterd_get_mgmt_v3_lock_owner (char *key, uuid_t *uuid)
+{
+ int32_t ret = -1;
+ glusterd_mgmt_v3_lock_obj *lock_obj = NULL;
+ glusterd_conf_t *priv = NULL;
+ uuid_t no_owner = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!key || !uuid) {
+ gf_log (this->name, GF_LOG_ERROR, "key or uuid is null.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_bin (priv->mgmt_v3_lock, key, (void **) &lock_obj);
+ if (!ret)
+ uuid_copy (*uuid, lock_obj->lock_owner);
+ else
+ uuid_copy (*uuid, no_owner);
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* This function is called with the locked_count and type, to *
+ * release all the acquired locks. */
+static int32_t
+glusterd_release_multiple_locks_per_entity (dict_t *dict, uuid_t uuid,
+ int32_t locked_count,
+ char *type)
+{
+ char name_buf[PATH_MAX] = "";
+ char *name = NULL;
+ int32_t i = -1;
+ int32_t op_ret = 0;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT (dict);
+ GF_ASSERT (type);
+
+ if (locked_count == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No %s locked as part of this transaction",
+ type);
+ goto out;
+ }
+
+ /* Release all the locks held */
+ for (i = 0; i < locked_count; i++) {
+ snprintf (name_buf, sizeof(name_buf),
+ "%sname%d", type, i+1);
+
+ /* Looking for volname1, volname2 or snapname1, *
+ * snapname2 as key in the dict */
+ ret = dict_get_str (dict, name_buf, &name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get %s locked_count = %d",
+ name_buf, locked_count);
+ op_ret = ret;
+ continue;
+ }
+
+ ret = glusterd_mgmt_v3_unlock (name, uuid, type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release lock for %s.",
+ name);
+ op_ret = ret;
+ }
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", op_ret);
+ return op_ret;
+}
+
+/* Given the count and type of the entity this function acquires *
+ * locks on multiple elements of the same entity. For example: *
+ * If type is "vol" this function tries to acquire locks on multiple *
+ * volumes */
+static int32_t
+glusterd_acquire_multiple_locks_per_entity (dict_t *dict, uuid_t uuid,
+ int32_t count, char *type)
+{
+ char name_buf[PATH_MAX] = "";
+ char *name = NULL;
+ int32_t i = -1;
+ int32_t ret = -1;
+ int32_t locked_count = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT (dict);
+ GF_ASSERT (type);
+
+ /* Locking one element after other */
+ for (i = 0; i < count; i++) {
+ snprintf (name_buf, sizeof(name_buf),
+ "%sname%d", type, i+1);
+
+ /* Looking for volname1, volname2 or snapname1, *
+ * snapname2 as key in the dict */
+ ret = dict_get_str (dict, name_buf, &name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get %s count = %d",
+ name_buf, count);
+ break;
+ }
+
+ ret = glusterd_mgmt_v3_lock (name, uuid, type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire lock for %s %s "
+ "on behalf of %s. Reversing "
+ "this transaction", type, name,
+ uuid_utoa(uuid));
+ break;
+ }
+ locked_count++;
+ }
+
+ if (count == locked_count) {
+ /* If all locking ops went successfully, return as success */
+ ret = 0;
+ goto out;
+ }
+
+ /* If we failed to lock one element, unlock others and return failure */
+ ret = glusterd_release_multiple_locks_per_entity (dict, uuid,
+ locked_count,
+ type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release multiple %s locks",
+ type);
+ }
+ ret = -1;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Release the lock(s) of one entity type (for example "vol").              *
+ * Nothing is done when hold_<type>_locks evaluates to false in dict        *
+ * (default_value is used when the key is absent). If <type>count is        *
+ * present, that many elements are unlocked; otherwise the single element   *
+ * named by <type>name is unlocked. Returns 0 on success, non-zero on       *
+ * failure.                                                                 */
+static int32_t
+glusterd_mgmt_v3_unlock_entity (dict_t *dict, uuid_t uuid, char *type,
+                                gf_boolean_t default_value)
+{
+        char            key[PATH_MAX]   = "";
+        char           *elem            = NULL;
+        int32_t         nelems          = -1;
+        int32_t         ret             = -1;
+        gf_boolean_t    held            = _gf_false;
+        xlator_t       *this            = THIS;
+
+        GF_ASSERT(this);
+        GF_ASSERT (dict);
+        GF_ASSERT (type);
+
+        snprintf (key, sizeof(key), "hold_%s_locks", type);
+        held = dict_get_str_boolean (dict, key, default_value);
+        if (held == _gf_false) {
+                /* No locks were taken for this entity, nothing to undo */
+                ret = 0;
+                goto out;
+        }
+
+        /* volcount / snapcount decides between the two modes */
+        snprintf (key, sizeof(key), "%scount", type);
+        if (dict_get_int32 (dict, key, &nelems) == 0) {
+                /* Unlock the elements one after another */
+                ret = glusterd_release_multiple_locks_per_entity (dict,
+                                                                  uuid,
+                                                                  nelems,
+                                                                  type);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to release all %s locks", type);
+                }
+        } else {
+                /* No count in the dict: exactly one element to unlock */
+                snprintf (key, sizeof(key), "%sname",
+                          type);
+                ret = dict_get_str (dict, key, &elem);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to fetch %sname", type);
+                        goto out;
+                }
+
+                ret = glusterd_mgmt_v3_unlock (elem, uuid, type);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to release lock for %s %s "
+                                "on behalf of %s.", type, elem,
+                                uuid_utoa(uuid));
+                }
+        }
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Acquire the lock(s) of one entity type (for example "vol").              *
+ * Nothing is done when hold_<type>_locks evaluates to false in dict        *
+ * (default_value is used when the key is absent). If <type>count is        *
+ * present, that many elements are locked; otherwise the single element     *
+ * named by <type>name is locked. Returns 0 on success, non-zero on         *
+ * failure.                                                                 */
+static int32_t
+glusterd_mgmt_v3_lock_entity (dict_t *dict, uuid_t uuid, char *type,
+                              gf_boolean_t default_value)
+{
+        char            key[PATH_MAX]   = "";
+        char           *elem            = NULL;
+        int32_t         nelems          = -1;
+        int32_t         ret             = -1;
+        gf_boolean_t    wanted          = _gf_false;
+        xlator_t       *this            = THIS;
+
+        GF_ASSERT(this);
+        GF_ASSERT (dict);
+        GF_ASSERT (type);
+
+        snprintf (key, sizeof(key), "hold_%s_locks", type);
+        wanted = dict_get_str_boolean (dict, key, default_value);
+        if (wanted == _gf_false) {
+                /* This entity is not to be locked */
+                ret = 0;
+                goto out;
+        }
+
+        /* volcount / snapcount decides between the two modes */
+        snprintf (key, sizeof(key), "%scount", type);
+        if (dict_get_int32 (dict, key, &nelems) == 0) {
+                /* Lock the elements one after another */
+                ret = glusterd_acquire_multiple_locks_per_entity (dict,
+                                                                  uuid,
+                                                                  nelems,
+                                                                  type);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to acquire all %s locks", type);
+                }
+        } else {
+                /* No count in the dict: exactly one element to lock */
+                snprintf (key, sizeof(key), "%sname",
+                          type);
+                ret = dict_get_str (dict, key, &elem);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to fetch %sname", type);
+                        goto out;
+                }
+
+                ret = glusterd_mgmt_v3_lock (elem, uuid, type);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to acquire lock for %s %s "
+                                "on behalf of %s.", type, elem,
+                                uuid_utoa(uuid));
+                }
+        }
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Release the locks of every entity type listed in valid_types (volumes,  *
+ * snaps, ...) on behalf of uuid. All types are attempted even if one       *
+ * fails; the return value is 0 when all succeeded, otherwise the result    *
+ * of the last failing type. A NULL dict yields -1.                         */
+int32_t
+glusterd_multiple_mgmt_v3_unlock (dict_t *dict, uuid_t uuid)
+{
+        int32_t         idx     = 0;
+        int32_t         rc      = 0;
+        int32_t         ret     = -1;
+        xlator_t       *this    = THIS;
+
+        GF_ASSERT(this);
+
+        if (!dict) {
+                gf_log (this->name, GF_LOG_ERROR, "dict is null.");
+                goto out;
+        }
+
+        ret = 0;
+        for (idx = 0; valid_types[idx].type != NULL; idx++) {
+                rc = glusterd_mgmt_v3_unlock_entity
+                                        (dict, uuid,
+                                         valid_types[idx].type,
+                                         valid_types[idx].default_value);
+                if (rc == 0)
+                        continue;
+
+                /* Remember the failure but keep unlocking the rest */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to unlock all %s",
+                        valid_types[idx].type);
+                ret = rc;
+        }
+
+out:
+        gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/* Acquire the locks of every entity type listed in valid_types (volumes,  *
+ * snaps, ...) on behalf of uuid. Locking stops at the first type that      *
+ * fails; the types locked before it are then unlocked again and -1 is      *
+ * returned. Success (0) requires GF_MAX_LOCKING_ENTITIES types to have     *
+ * been locked. A NULL dict yields -1.                                      */
+int32_t
+glusterd_multiple_mgmt_v3_lock (dict_t *dict, uuid_t uuid)
+{
+        int32_t         idx     = 0;
+        int32_t         locked  = 0;
+        int32_t         ret     = -1;
+        xlator_t       *this    = THIS;
+
+        GF_ASSERT(this);
+
+        if (!dict) {
+                gf_log (this->name, GF_LOG_ERROR, "dict is null.");
+                goto out;
+        }
+
+        /* Lock one entity type after the other */
+        for (idx = 0; valid_types[idx].type != NULL; idx++) {
+                ret = glusterd_mgmt_v3_lock_entity
+                                        (dict, uuid,
+                                         valid_types[idx].type,
+                                         valid_types[idx].default_value);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to lock all %s",
+                                valid_types[idx].type);
+                        break;
+                }
+                locked++;
+        }
+
+        if (locked != GF_MAX_LOCKING_ENTITIES) {
+                /* Roll back the entity types that did get locked */
+                for (idx = 0; idx < locked; idx++) {
+                        ret = glusterd_mgmt_v3_unlock_entity
+                                        (dict, uuid,
+                                         valid_types[idx].type,
+                                         valid_types[idx].default_value);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Unable to unlock all %s",
+                                        valid_types[idx].type);
+                        }
+                }
+                ret = -1;
+        } else {
+                /* Every entity type was locked */
+                ret = 0;
+        }
+
+out:
+        gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/* Acquire the mgmt_v3 lock for element "name" of entity "type" on behalf  *
+ * of uuid. The lock is recorded in priv->mgmt_v3_lock under the key        *
+ * "<name>_<type>".                                                         *
+ *                                                                          *
+ * Returns 0 on success. Fails with a non-zero value when name or type is   *
+ * NULL, the type is not a valid entity, the key does not fit in PATH_MAX   *
+ * bytes, the current owner cannot be looked up, the lock is already held,  *
+ * or the lock object cannot be allocated/stored.                           */
+int32_t
+glusterd_mgmt_v3_lock (const char *name, uuid_t uuid, char *type)
+{
+        char                            key[PATH_MAX]   = "";
+        int32_t                         ret             = -1;
+        glusterd_mgmt_v3_lock_obj      *lock_obj        = NULL;
+        glusterd_conf_t                *priv            = NULL;
+        gf_boolean_t                    is_valid        = _gf_true;
+        uuid_t                          owner           = {0};
+        xlator_t                       *this            = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (!name || !type) {
+                gf_log (this->name, GF_LOG_ERROR, "name or type is null.");
+                ret = -1;
+                goto out;
+        }
+
+        is_valid = glusterd_mgmt_v3_is_type_valid (type);
+        if (is_valid != _gf_true) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Invalid entity. Cannot perform locking "
+                        "operation on %s types", type);
+                ret = -1;
+                goto out;
+        }
+
+        /* snprintf returns the length it wanted to write, so truncation  *
+         * shows up as a value >= sizeof(key), not as a length mismatch.  *
+         * A truncated key must be rejected: two long names could         *
+         * otherwise collapse onto the same lock.                         */
+        ret = snprintf (key, sizeof(key), "%s_%s", name, type);
+        if (ret < 0 || (size_t) ret >= sizeof(key)) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "Unable to create key");
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Trying to acquire lock of %s %s for %s as %s",
+                type, name, uuid_utoa (uuid), key);
+
+        ret = glusterd_get_mgmt_v3_lock_owner (key, &owner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Unable to get mgmt_v3 lock owner");
+                goto out;
+        }
+
+        /* If the lock has already been held for the given volume
+         * we fail */
+        if (!uuid_is_null (owner)) {
+                gf_log (this->name, GF_LOG_WARNING, "Lock for %s held by %s",
+                        name, uuid_utoa (owner));
+                ret = -1;
+                goto out;
+        }
+
+        lock_obj = GF_CALLOC (1, sizeof(glusterd_mgmt_v3_lock_obj),
+                              gf_common_mt_mgmt_v3_lock_obj_t);
+        if (!lock_obj) {
+                ret = -1;
+                goto out;
+        }
+
+        uuid_copy (lock_obj->lock_owner, uuid);
+
+        ret = dict_set_bin (priv->mgmt_v3_lock, key, lock_obj,
+                            sizeof(glusterd_mgmt_v3_lock_obj));
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set lock owner in mgmt_v3 lock");
+                /* lock_obj is known to be non-NULL here */
+                GF_FREE (lock_obj);
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Lock for %s %s successfully held by %s",
+                type, name, uuid_utoa (uuid));
+
+        ret = 0;
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Release the mgmt_v3 lock for element "name" of entity "type", which     *
+ * must currently be held by uuid. The lock is looked up in                 *
+ * priv->mgmt_v3_lock under the key "<name>_<type>" and deleted from it.    *
+ *                                                                          *
+ * Returns 0 on success. Fails with a non-zero value when name or type is   *
+ * NULL, the type is not a valid entity, the key does not fit in PATH_MAX   *
+ * bytes, the owner cannot be looked up, the lock is not held, or it is     *
+ * held by a different uuid.                                                */
+int32_t
+glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type)
+{
+        char                    key[PATH_MAX]   = "";
+        int32_t                 ret             = -1;
+        gf_boolean_t            is_valid        = _gf_true;
+        glusterd_conf_t        *priv            = NULL;
+        uuid_t                  owner           = {0};
+        xlator_t               *this            = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (!name || !type) {
+                gf_log (this->name, GF_LOG_ERROR, "name or type is null.");
+                ret = -1;
+                goto out;
+        }
+
+        is_valid = glusterd_mgmt_v3_is_type_valid (type);
+        if (is_valid != _gf_true) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Invalid entity. Cannot perform unlocking "
+                        "operation on %s types", type);
+                ret = -1;
+                goto out;
+        }
+
+        /* snprintf returns the length it wanted to write, so truncation  *
+         * shows up as a value >= sizeof(key), not as a length mismatch.  *
+         * A truncated key could name somebody else's lock.               */
+        ret = snprintf (key, sizeof(key), "%s_%s",
+                        name, type);
+        if (ret < 0 || (size_t) ret >= sizeof(key)) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to create key");
+                ret = -1;
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Trying to release lock of %s %s for %s as %s",
+                type, name, uuid_utoa (uuid), key);
+
+        ret = glusterd_get_mgmt_v3_lock_owner (key, &owner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Unable to get mgmt_v3 lock owner");
+                goto out;
+        }
+
+        if (uuid_is_null (owner)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Lock for %s %s not held", type, name);
+                ret = -1;
+                goto out;
+        }
+
+        /* Only the owner may release; the non-zero uuid_compare result *
+         * is returned as the error value                               */
+        ret = uuid_compare (uuid, owner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "Lock owner mismatch. "
+                        "Lock for %s %s held by %s",
+                        type, name, uuid_utoa (owner));
+                goto out;
+        }
+
+        /* Removing the mgmt_v3 lock from the global list */
+        dict_del (priv->mgmt_v3_lock, key);
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Lock for %s %s successfully released",
+                type, name);
+
+        ret = 0;
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.h b/xlators/mgmt/glusterd/src/glusterd-locks.h
new file mode 100644
index 000000000..b9cc8c0d1
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-locks.h
@@ -0,0 +1,51 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_LOCKS_H_
+#define _GLUSTERD_LOCKS_H_
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* Value stored in the mgmt_v3 lock dictionary for each held lock: the    *
+ * uuid on whose behalf the lock was taken                                 */
+typedef struct glusterd_mgmt_v3_lock_object_ {
+        uuid_t              lock_owner;
+} glusterd_mgmt_v3_lock_obj;
+
+/* Describes one lockable entity type. Presumably the element type of the *
+ * valid_types table walked by glusterd_multiple_mgmt_v3_lock/unlock,      *
+ * which stops at the first entry whose type is NULL -- TODO confirm       */
+typedef struct glusterd_mgmt_v3_lock_valid_entities {
+        char          *type;          /* Entity type like vol, snap */
+        gf_boolean_t   default_value; /* The default value that  *
+                                       * determines if the locks *
+                                       * should be held for that *
+                                       * entity */
+} glusterd_valid_entities;
+
+/* NOTE(review): presumably create and destroy the mgmt_v3 lock store;    *
+ * the definitions are not visible here -- confirm                         */
+int32_t
+glusterd_mgmt_v3_lock_init (void);
+
+void
+glusterd_mgmt_v3_lock_fini (void);
+
+/* Looks up the owner of a lock. The lock/unlock routines call it with    *
+ * the "<name>_<type>" key and treat a null uuid as "not held"             */
+int32_t
+glusterd_get_mgmt_v3_lock_owner (char *volname, uuid_t *uuid);
+
+/* Acquire the lock on element "name" of entity "type" (e.g. "vol") on    *
+ * behalf of uuid. Returns 0 on success, non-zero on failure               */
+int32_t
+glusterd_mgmt_v3_lock (const char *name, uuid_t uuid, char *type);
+
+/* Release the lock on element "name" of entity "type"; uuid must be the  *
+ * current owner. Returns 0 on success, non-zero on failure                */
+int32_t
+glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type);
+
+/* Acquire the locks of all entity types described in dict; on failure    *
+ * the entity types already locked are released again and -1 is returned   */
+int32_t
+glusterd_multiple_mgmt_v3_lock (dict_t *dict, uuid_t uuid);
+
+/* Release the locks of all entity types described in dict. Returns 0     *
+ * when everything was released, non-zero otherwise                        */
+int32_t
+glusterd_multiple_mgmt_v3_unlock (dict_t *dict, uuid_t uuid);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-log-ops.c b/xlators/mgmt/glusterd/src/glusterd-log-ops.c
index 0136cddc9..33bd95c03 100644
--- a/xlators/mgmt/glusterd/src/glusterd-log-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-log-ops.c
@@ -24,7 +24,7 @@
#include <signal.h>
int
-glusterd_handle_log_rotate (rpcsvc_request_t *req)
+__glusterd_handle_log_rotate (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -90,6 +90,13 @@ out:
return ret;
}
+/* Entry point for log-rotate requests: passes req together with          *
+ * __glusterd_handle_log_rotate to glusterd_big_locked_handler and         *
+ * returns its result                                                      */
+int
+glusterd_handle_log_rotate (rpcsvc_request_t *req)
+{
+        return glusterd_big_locked_handler (req,
+                                            __glusterd_handle_log_rotate);
+}
+
/* op-sm */
int
glusterd_op_stage_log_rotate (dict_t *dict, char **op_errstr)
diff --git a/xlators/mgmt/glusterd/src/glusterd-mem-types.h b/xlators/mgmt/glusterd/src/glusterd-mem-types.h
index 98216e28a..e6f6a0333 100644
--- a/xlators/mgmt/glusterd/src/glusterd-mem-types.h
+++ b/xlators/mgmt/glusterd/src/glusterd-mem-types.h
@@ -66,7 +66,10 @@ typedef enum gf_gld_mem_types_ {
gf_gld_mt_hooks_stub_t = gf_common_mt_end + 50,
gf_gld_mt_hooks_priv_t = gf_common_mt_end + 51,
gf_gld_mt_mop_commit_req_t = gf_common_mt_end + 52,
- gf_gld_mt_end = gf_common_mt_end + 53,
+ gf_gld_mt_int = gf_common_mt_end + 53,
+ gf_gld_mt_snap_t = gf_common_mt_end + 54,
+ gf_gld_mt_missed_snapinfo_t = gf_common_mt_end + 55,
+ gf_gld_mt_end = gf_common_mt_end + 56,
} gf_gld_mem_types_t;
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c
new file mode 100644
index 000000000..81c5aa579
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c
@@ -0,0 +1,936 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+/* rpc related syncops */
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "glusterd1-xdr.h"
+#include "glusterd-syncop.h"
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-op-sm.h"
+
+/* Actor for the GLUSTERD_MGMT_V3_NULL procedure: ignores the request and *
+ * reports success                                                         */
+static int
+glusterd_mgmt_v3_null (rpcsvc_request_t *req)
+{
+        return 0;
+}
+
+/* Send the reply to a mgmt_v3 lock request. status is sent as op_ret;    *
+ * when it is non-zero the current errno is sent as op_errno. Returns the  *
+ * result of glusterd_submit_reply.                                        */
+static int
+glusterd_mgmt_v3_lock_send_resp (rpcsvc_request_t *req, int32_t status)
+{
+        gd1_mgmt_v3_lock_rsp    reply   = {{0},};
+        int                     ret     = -1;
+        xlator_t               *this    = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        reply.op_ret = status;
+        if (reply.op_ret != 0) {
+                reply.op_errno = errno;
+        }
+
+        glusterd_get_uuid (&reply.uuid);
+
+        ret = glusterd_submit_reply (req, &reply, NULL, 0, NULL,
+                                     (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Responded to mgmt_v3 lock, ret: %d", ret);
+
+        return ret;
+}
+
+/* Synctask flavour of the lock handler: takes the mgmt_v3 locks named in *
+ * ctx->dict for ctx->uuid and immediately replies to the peer with the    *
+ * outcome. The return value is that of sending the reply, not of the      *
+ * locking itself.                                                         */
+static int
+glusterd_synctasked_mgmt_v3_lock (rpcsvc_request_t *req,
+                                  gd1_mgmt_v3_lock_req *lock_req,
+                                  glusterd_op_lock_ctx_t *ctx)
+{
+        int32_t         lock_ret = -1;
+        int32_t         ret      = -1;
+        xlator_t       *this     = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (ctx);
+        GF_ASSERT (ctx->dict);
+
+        /* Trying to acquire multiple mgmt_v3 locks */
+        lock_ret = glusterd_multiple_mgmt_v3_lock (ctx->dict, ctx->uuid);
+        if (lock_ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to acquire mgmt_v3 locks for %s",
+                        uuid_utoa (ctx->uuid));
+        }
+
+        ret = glusterd_mgmt_v3_lock_send_resp (req, lock_ret);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Op-state-machine flavour of the lock handler: records the transaction  *
+ * opinfo under lock_req->txn_id, injects GD_OP_EVENT_LOCK carrying ctx,   *
+ * then runs the friend and op state machines (also when an earlier step   *
+ * failed). Returns 0 on success, non-zero otherwise.                      */
+static int
+glusterd_op_state_machine_mgmt_v3_lock (rpcsvc_request_t *req,
+                                        gd1_mgmt_v3_lock_req *lock_req,
+                                        glusterd_op_lock_ctx_t *ctx)
+{
+        int32_t                 ret     = -1;
+        xlator_t               *this    = THIS;
+        glusterd_op_info_t      opinfo  = {{0},};
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        glusterd_txn_opinfo_init (&opinfo, NULL, &lock_req->op,
+                                  ctx->dict, req);
+
+        ret = glusterd_set_txn_opinfo (&lock_req->txn_id, &opinfo);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set transaction's opinfo");
+        } else {
+                ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK,
+                                                   &lock_req->txn_id, ctx);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to inject event GD_OP_EVENT_LOCK");
+                }
+        }
+
+        /* The state machines are kicked regardless of the outcome */
+        glusterd_friend_sm ();
+        glusterd_op_sm ();
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Handle a mgmt_v3 lock request from a peer.                              *
+ * Decodes the request, rejects senders that are not in the cluster,       *
+ * builds a lock context (sender uuid, request, unserialized dict) and     *
+ * dispatches it either to the synctask path (dict has is_synctasked set)  *
+ * or to the op state machine.                                             *
+ * The context is freed here on any failure and after the synctask path,   *
+ * which does not take ownership of it. Returns 0 on success, non-zero     *
+ * otherwise.                                                              */
+static int
+glusterd_handle_mgmt_v3_lock_fn (rpcsvc_request_t *req)
+{
+        gd1_mgmt_v3_lock_req    lock_req        = {{0},};
+        int32_t                 ret             = -1;
+        glusterd_peerinfo_t    *peerinfo        = NULL;
+        glusterd_op_lock_ctx_t *ctx             = NULL;
+        xlator_t               *this            = NULL;
+        gf_boolean_t            is_synctasked   = _gf_false;
+        gf_boolean_t            free_ctx        = _gf_false;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        ret = xdr_to_generic (req->msg[0], &lock_req,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to decode lock "
+                        "request received from peer");
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "Received mgmt_v3 lock req "
+                "from uuid: %s", uuid_utoa (lock_req.uuid));
+
+        if (glusterd_friend_find_by_uuid (lock_req.uuid, &peerinfo)) {
+                gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+                        "belong to the cluster. Ignoring request.",
+                        uuid_utoa (lock_req.uuid));
+                ret = -1;
+                goto out;
+        }
+
+        ctx = GF_CALLOC (1, sizeof (*ctx), gf_gld_mt_op_lock_ctx_t);
+        if (!ctx) {
+                ret = -1;
+                goto out;
+        }
+
+        uuid_copy (ctx->uuid, lock_req.uuid);
+        ctx->req = req;
+
+        ctx->dict = dict_new ();
+        if (!ctx->dict) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_unserialize (lock_req.dict.dict_val,
+                                lock_req.dict.dict_len, &ctx->dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to unserialize the dictionary");
+                goto out;
+        }
+
+        is_synctasked = dict_get_str_boolean (ctx->dict,
+                                              "is_synctasked", _gf_false);
+        if (is_synctasked) {
+                ret = glusterd_synctasked_mgmt_v3_lock (req, &lock_req, ctx);
+                /* The above function does not take ownership of ctx.
+                 * Therefore we need to free the ctx explicitly. */
+                free_ctx = _gf_true;
+        }
+        else {
+                ret = glusterd_op_state_machine_mgmt_v3_lock (req, &lock_req,
+                                                              ctx);
+        }
+
+out:
+
+        /* ctx is still NULL when we bailed out before allocating it   *
+         * (decode failure, unknown peer, failed allocation), so it    *
+         * must be tested before ctx->dict is looked at                */
+        if (ctx && (ret || free_ctx)) {
+                if (ctx->dict)
+                        dict_unref (ctx->dict);
+                GF_FREE (ctx);
+        }
+
+        free (lock_req.dict.dict_val);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Send the reply to a pre-validation request: op, status (as op_ret),    *
+ * the error string ("" when op_errstr is NULL) and the serialized         *
+ * rsp_dict. Returns a negative value if the dict cannot be serialized,    *
+ * otherwise the result of glusterd_submit_reply.                          */
+static int
+glusterd_mgmt_v3_pre_validate_send_resp (rpcsvc_request_t *req,
+                                         int32_t op, int32_t status,
+                                         char *op_errstr, dict_t *rsp_dict)
+{
+        gd1_mgmt_v3_pre_val_rsp         reply   = {{0},};
+        int                             ret     = -1;
+        xlator_t                       *this    = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        reply.op_ret = status;
+        glusterd_get_uuid (&reply.uuid);
+        reply.op = op;
+        reply.op_errstr = op_errstr ? op_errstr : "";
+
+        ret = dict_allocate_and_serialize (rsp_dict, &reply.dict.dict_val,
+                                           &reply.dict.dict_len);
+        if (ret >= 0) {
+                ret = glusterd_submit_reply (req, &reply, NULL, 0, NULL,
+                                             (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp);
+
+                /* The serialized buffer is only needed for the submit */
+                GF_FREE (reply.dict.dict_val);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get serialized length of dict");
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Responded to pre validation, ret: %d", ret);
+        return ret;
+}
+
+/* Handle a mgmt_v3 pre-validation request from a peer.                    *
+ * Decodes the request, rejects senders that are not in the cluster,       *
+ * unserializes the request dict, runs gd_mgmt_v3_pre_validate_fn and      *
+ * sends its result back. The return value reflects whether the reply      *
+ * could be sent (or an earlier setup failure), not the validation         *
+ * outcome itself. All locally acquired resources are released on every    *
+ * path.                                                                   */
+static int
+glusterd_handle_pre_validate_fn (rpcsvc_request_t *req)
+{
+        int32_t                  ret       = -1;
+        gd1_mgmt_v3_pre_val_req  op_req    = {{0},};
+        glusterd_peerinfo_t     *peerinfo  = NULL;
+        xlator_t                *this      = NULL;
+        char                    *op_errstr = NULL;
+        dict_t                  *dict      = NULL;
+        dict_t                  *rsp_dict  = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        ret = xdr_to_generic (req->msg[0], &op_req,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_req);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to decode pre validation "
+                        "request received from peer");
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+                gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+                        "belong to the cluster. Ignoring request.",
+                        uuid_utoa (op_req.uuid));
+                ret = -1;
+                goto out;
+        }
+
+        dict = dict_new ();
+        if (!dict) {
+                /* ret still holds the non-negative decode result here; *
+                 * make the allocation failure visible to the caller    */
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_unserialize (op_req.dict.dict_val,
+                                op_req.dict.dict_len, &dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to unserialize the dictionary");
+                goto out;
+        }
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get new dictionary");
+                /* Leave through out so dict and the decoded request *
+                 * buffer are released                                */
+                ret = -1;
+                goto out;
+        }
+
+        ret = gd_mgmt_v3_pre_validate_fn (op_req.op, dict, &op_errstr,
+                                          rsp_dict);
+
+        /* NOTE(review): op_req.op comes off the wire and indexes       *
+         * gd_op_list unchecked -- confirm it is range-checked elsewhere */
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Pre Validation failed on operation %s",
+                        gd_op_list[op_req.op]);
+        }
+
+        ret = glusterd_mgmt_v3_pre_validate_send_resp (req, op_req.op,
+                                                       ret, op_errstr,
+                                                       rsp_dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to send Pre Validation "
+                        "response for operation %s",
+                        gd_op_list[op_req.op]);
+                goto out;
+        }
+
+out:
+        if (op_errstr && (strcmp (op_errstr, "")))
+                GF_FREE (op_errstr);
+
+        free (op_req.dict.dict_val);
+
+        if (dict)
+                dict_unref (dict);
+
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Send the reply to a brick-op request: op, status (as op_ret), the      *
+ * error string ("" when op_errstr is NULL) and the serialized rsp_dict.   *
+ * Returns a negative value if the dict cannot be serialized, otherwise    *
+ * the result of glusterd_submit_reply.                                    */
+static int
+glusterd_mgmt_v3_brick_op_send_resp (rpcsvc_request_t *req,
+                                     int32_t op, int32_t status,
+                                     char *op_errstr, dict_t *rsp_dict)
+{
+        gd1_mgmt_v3_brick_op_rsp        reply   = {{0},};
+        int                             ret     = -1;
+        xlator_t                       *this    = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        reply.op_ret = status;
+        glusterd_get_uuid (&reply.uuid);
+        reply.op = op;
+        reply.op_errstr = op_errstr ? op_errstr : "";
+
+        ret = dict_allocate_and_serialize (rsp_dict, &reply.dict.dict_val,
+                                           &reply.dict.dict_len);
+        if (ret >= 0) {
+                ret = glusterd_submit_reply (req, &reply, NULL, 0, NULL,
+                                             (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp);
+
+                /* The serialized buffer is only needed for the submit */
+                GF_FREE (reply.dict.dict_val);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get serialized length of dict");
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Responded to brick op, ret: %d", ret);
+        return ret;
+}
+
+/* Handle a mgmt_v3 brick-op request from a peer.                          *
+ * Decodes the request, rejects senders that are not in the cluster,       *
+ * unserializes the request dict, runs gd_mgmt_v3_brick_op_fn and sends    *
+ * its result back. The return value reflects whether the reply could be   *
+ * sent (or an earlier setup failure), not the brick-op outcome itself.    *
+ * All locally acquired resources are released on every path.              */
+static int
+glusterd_handle_brick_op_fn (rpcsvc_request_t *req)
+{
+        int32_t                  ret       = -1;
+        gd1_mgmt_v3_brick_op_req op_req    = {{0},};
+        glusterd_peerinfo_t     *peerinfo  = NULL;
+        xlator_t                *this      = NULL;
+        char                    *op_errstr = NULL;
+        dict_t                  *dict      = NULL;
+        dict_t                  *rsp_dict  = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        ret = xdr_to_generic (req->msg[0], &op_req,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_req);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to decode brick op "
+                        "request received from peer");
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+                gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+                        "belong to the cluster. Ignoring request.",
+                        uuid_utoa (op_req.uuid));
+                ret = -1;
+                goto out;
+        }
+
+        dict = dict_new ();
+        if (!dict) {
+                /* ret still holds the non-negative decode result here; *
+                 * make the allocation failure visible to the caller    */
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_unserialize (op_req.dict.dict_val,
+                                op_req.dict.dict_len, &dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to unserialize the dictionary");
+                goto out;
+        }
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get new dictionary");
+                /* Leave through out so dict and the decoded request *
+                 * buffer are released                                */
+                ret = -1;
+                goto out;
+        }
+
+        ret = gd_mgmt_v3_brick_op_fn (op_req.op, dict, &op_errstr,
+                                      rsp_dict);
+
+        /* NOTE(review): op_req.op comes off the wire and indexes       *
+         * gd_op_list unchecked -- confirm it is range-checked elsewhere */
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Brick Op failed on operation %s",
+                        gd_op_list[op_req.op]);
+        }
+
+        ret = glusterd_mgmt_v3_brick_op_send_resp (req, op_req.op,
+                                                   ret, op_errstr,
+                                                   rsp_dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to send brick op "
+                        "response for operation %s",
+                        gd_op_list[op_req.op]);
+                goto out;
+        }
+
+out:
+        if (op_errstr && (strcmp (op_errstr, "")))
+                GF_FREE (op_errstr);
+
+        free (op_req.dict.dict_val);
+
+        if (dict)
+                dict_unref (dict);
+
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Send the reply to a commit request: op, status (as op_ret), the error  *
+ * string ("" when op_errstr is NULL) and the serialized rsp_dict.         *
+ * Returns a negative value if the dict cannot be serialized, otherwise    *
+ * the result of glusterd_submit_reply.                                    */
+static int
+glusterd_mgmt_v3_commit_send_resp (rpcsvc_request_t *req,
+                                   int32_t op, int32_t status,
+                                   char *op_errstr, dict_t *rsp_dict)
+{
+        gd1_mgmt_v3_commit_rsp          reply   = {{0},};
+        int                             ret     = -1;
+        xlator_t                       *this    = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        reply.op_ret = status;
+        glusterd_get_uuid (&reply.uuid);
+        reply.op = op;
+        reply.op_errstr = op_errstr ? op_errstr : "";
+
+        ret = dict_allocate_and_serialize (rsp_dict, &reply.dict.dict_val,
+                                           &reply.dict.dict_len);
+        if (ret >= 0) {
+                ret = glusterd_submit_reply (req, &reply, NULL, 0, NULL,
+                                             (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp);
+
+                /* The serialized buffer is only needed for the submit */
+                GF_FREE (reply.dict.dict_val);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get serialized length of dict");
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "Responded to commit, ret: %d", ret);
+        return ret;
+}
+
+/* Handle a mgmt_v3 commit request from a peer.                            *
+ * Decodes the request, rejects senders that are not in the cluster,       *
+ * unserializes the request dict, runs gd_mgmt_v3_commit_fn and sends its  *
+ * result back. The return value reflects whether the reply could be sent  *
+ * (or an earlier setup failure), not the commit outcome itself.           *
+ * All locally acquired resources are released on every path.              */
+static int
+glusterd_handle_commit_fn (rpcsvc_request_t *req)
+{
+        int32_t                  ret       = -1;
+        gd1_mgmt_v3_commit_req   op_req    = {{0},};
+        glusterd_peerinfo_t     *peerinfo  = NULL;
+        xlator_t                *this      = NULL;
+        char                    *op_errstr = NULL;
+        dict_t                  *dict      = NULL;
+        dict_t                  *rsp_dict  = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        ret = xdr_to_generic (req->msg[0], &op_req,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_commit_req);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to decode commit "
+                        "request received from peer");
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+                gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+                        "belong to the cluster. Ignoring request.",
+                        uuid_utoa (op_req.uuid));
+                ret = -1;
+                goto out;
+        }
+
+        dict = dict_new ();
+        if (!dict) {
+                /* ret still holds the non-negative decode result here; *
+                 * make the allocation failure visible to the caller    */
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_unserialize (op_req.dict.dict_val,
+                                op_req.dict.dict_len, &dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to unserialize the dictionary");
+                goto out;
+        }
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get new dictionary");
+                /* Leave through out so dict and the decoded request *
+                 * buffer are released                                */
+                ret = -1;
+                goto out;
+        }
+
+        ret = gd_mgmt_v3_commit_fn (op_req.op, dict, &op_errstr,
+                                    rsp_dict);
+
+        /* NOTE(review): op_req.op comes off the wire and indexes       *
+         * gd_op_list unchecked -- confirm it is range-checked elsewhere */
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "commit failed on operation %s",
+                        gd_op_list[op_req.op]);
+        }
+
+        ret = glusterd_mgmt_v3_commit_send_resp (req, op_req.op,
+                                                 ret, op_errstr,
+                                                 rsp_dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to send commit "
+                        "response for operation %s",
+                        gd_op_list[op_req.op]);
+                goto out;
+        }
+
+out:
+        if (op_errstr && (strcmp (op_errstr, "")))
+                GF_FREE (op_errstr);
+
+        free (op_req.dict.dict_val);
+
+        if (dict)
+                dict_unref (dict);
+
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Send the reply to a post-validation request: op, status (as op_ret),   *
+ * the error string ("" when op_errstr is NULL) and the serialized         *
+ * rsp_dict. Returns a negative value if the dict cannot be serialized,    *
+ * otherwise the result of glusterd_submit_reply.                          */
+static int
+glusterd_mgmt_v3_post_validate_send_resp (rpcsvc_request_t *req,
+                                          int32_t op, int32_t status,
+                                          char *op_errstr, dict_t *rsp_dict)
+{
+        gd1_mgmt_v3_post_val_rsp        reply   = {{0},};
+        int                             ret     = -1;
+        xlator_t                       *this    = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        reply.op_ret = status;
+        glusterd_get_uuid (&reply.uuid);
+        reply.op = op;
+        reply.op_errstr = op_errstr ? op_errstr : "";
+
+        ret = dict_allocate_and_serialize (rsp_dict, &reply.dict.dict_val,
+                                           &reply.dict.dict_len);
+        if (ret >= 0) {
+                ret = glusterd_submit_reply (req, &reply, NULL, 0, NULL,
+                                             (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp);
+
+                /* The serialized buffer is only needed for the submit */
+                GF_FREE (reply.dict.dict_val);
+        } else {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get serialized length of dict");
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Responded to post validation, ret: %d", ret);
+        return ret;
+}
+
+/* Handle a mgmt_v3 post-validation request from a peer.                   *
+ * Decodes the request, rejects senders that are not in the cluster,       *
+ * unserializes the request dict, runs gd_mgmt_v3_post_validate_fn (which  *
+ * also receives the op_ret carried in the request) and sends its result   *
+ * back. The return value reflects whether the reply could be sent (or an  *
+ * earlier setup failure), not the validation outcome itself.              *
+ * All locally acquired resources are released on every path.              */
+static int
+glusterd_handle_post_validate_fn (rpcsvc_request_t *req)
+{
+        int32_t                   ret       = -1;
+        gd1_mgmt_v3_post_val_req  op_req    = {{0},};
+        glusterd_peerinfo_t      *peerinfo  = NULL;
+        xlator_t                 *this      = NULL;
+        char                     *op_errstr = NULL;
+        dict_t                   *dict      = NULL;
+        dict_t                   *rsp_dict  = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        ret = xdr_to_generic (req->msg[0], &op_req,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_post_val_req);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to decode post validation "
+                        "request received from peer");
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+                gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+                        "belong to the cluster. Ignoring request.",
+                        uuid_utoa (op_req.uuid));
+                ret = -1;
+                goto out;
+        }
+
+        dict = dict_new ();
+        if (!dict) {
+                /* ret still holds the non-negative decode result here; *
+                 * make the allocation failure visible to the caller    */
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_unserialize (op_req.dict.dict_val,
+                                op_req.dict.dict_len, &dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to unserialize the dictionary");
+                goto out;
+        }
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get new dictionary");
+                /* Leave through out so dict and the decoded request *
+                 * buffer are released                                */
+                ret = -1;
+                goto out;
+        }
+
+        ret = gd_mgmt_v3_post_validate_fn (op_req.op, op_req.op_ret, dict,
+                                           &op_errstr, rsp_dict);
+
+        /* NOTE(review): op_req.op comes off the wire and indexes       *
+         * gd_op_list unchecked -- confirm it is range-checked elsewhere */
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Post Validation failed on operation %s",
+                        gd_op_list[op_req.op]);
+        }
+
+        ret = glusterd_mgmt_v3_post_validate_send_resp (req, op_req.op,
+                                                        ret, op_errstr,
+                                                        rsp_dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to send Post Validation "
+                        "response for operation %s",
+                        gd_op_list[op_req.op]);
+                goto out;
+        }
+
+out:
+        if (op_errstr && (strcmp (op_errstr, "")))
+                GF_FREE (op_errstr);
+
+        free (op_req.dict.dict_val);
+
+        if (dict)
+                dict_unref (dict);
+
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Send the reply to a mgmt_v3 unlock request. status is sent as op_ret;  *
+ * when it is non-zero the current errno is sent as op_errno. Returns the  *
+ * result of glusterd_submit_reply.                                        */
+static int
+glusterd_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req, int32_t status)
+{
+        gd1_mgmt_v3_unlock_rsp  reply   = {{0},};
+        int                     ret     = -1;
+        xlator_t               *this    = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        reply.op_ret = status;
+        if (reply.op_ret != 0) {
+                reply.op_errno = errno;
+        }
+
+        glusterd_get_uuid (&reply.uuid);
+
+        ret = glusterd_submit_reply (req, &reply, NULL, 0, NULL,
+                                     (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Responded to mgmt_v3 unlock, ret: %d", ret);
+
+        return ret;
+}
+
+/* Synctask flavour of the unlock handler: releases the mgmt_v3 locks     *
+ * named in ctx->dict for ctx->uuid and immediately replies to the peer    *
+ * with the outcome. The return value is that of sending the reply, not    *
+ * of the unlocking itself.                                                */
+static int
+glusterd_syctasked_mgmt_v3_unlock (rpcsvc_request_t *req,
+                                   gd1_mgmt_v3_unlock_req *unlock_req,
+                                   glusterd_op_lock_ctx_t *ctx)
+{
+        int32_t         unlock_ret = -1;
+        int32_t         ret        = -1;
+        xlator_t       *this       = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (ctx);
+
+        /* Trying to release multiple mgmt_v3 locks */
+        unlock_ret = glusterd_multiple_mgmt_v3_unlock (ctx->dict, ctx->uuid);
+        if (unlock_ret != 0)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to release mgmt_v3 locks for %s",
+                        uuid_utoa(ctx->uuid));
+
+        ret = glusterd_mgmt_v3_unlock_send_resp (req, unlock_ret);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+
+/* Op-state-machine flavour of the unlock handler: injects                *
+ * GD_OP_EVENT_UNLOCK carrying ctx for lock_req->txn_id, then runs the     *
+ * friend and op state machines (also when the injection failed).          *
+ * Returns the result of the injection.                                    */
+static int
+glusterd_op_state_machine_mgmt_v3_unlock (rpcsvc_request_t *req,
+                                          gd1_mgmt_v3_unlock_req *lock_req,
+                                          glusterd_op_lock_ctx_t *ctx)
+{
+        int32_t         ret     = -1;
+        xlator_t       *this    = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK,
+                                           &lock_req->txn_id, ctx);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to inject event GD_OP_EVENT_UNLOCK");
+        }
+
+        /* The state machines are kicked regardless of the outcome */
+        glusterd_friend_sm ();
+        glusterd_op_sm ();
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Handle a mgmt_v3 unlock request from a peer.                            *
+ * Decodes the request, rejects senders that are not in the cluster,       *
+ * builds a lock context (sender uuid, request, unserialized dict) and     *
+ * dispatches it either to the synctask path (dict has is_synctasked set)  *
+ * or to the op state machine.                                             *
+ * The context is freed here on any failure and after the synctask path,   *
+ * which does not take ownership of it. Returns 0 on success, non-zero     *
+ * otherwise.                                                              */
+static int
+glusterd_handle_mgmt_v3_unlock_fn (rpcsvc_request_t *req)
+{
+        gd1_mgmt_v3_unlock_req  lock_req        = {{0},};
+        int32_t                 ret             = -1;
+        glusterd_op_lock_ctx_t *ctx             = NULL;
+        glusterd_peerinfo_t    *peerinfo        = NULL;
+        xlator_t               *this            = NULL;
+        gf_boolean_t            is_synctasked   = _gf_false;
+        gf_boolean_t            free_ctx        = _gf_false;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+
+        ret = xdr_to_generic (req->msg[0], &lock_req,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to decode unlock "
+                        "request received from peer");
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "Received volume unlock req "
+                "from uuid: %s", uuid_utoa (lock_req.uuid));
+
+        if (glusterd_friend_find_by_uuid (lock_req.uuid, &peerinfo)) {
+                gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+                        "belong to the cluster. Ignoring request.",
+                        uuid_utoa (lock_req.uuid));
+                ret = -1;
+                goto out;
+        }
+
+        ctx = GF_CALLOC (1, sizeof (*ctx), gf_gld_mt_op_lock_ctx_t);
+        if (!ctx) {
+                ret = -1;
+                goto out;
+        }
+
+        uuid_copy (ctx->uuid, lock_req.uuid);
+        ctx->req = req;
+
+        ctx->dict = dict_new ();
+        if (!ctx->dict) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_unserialize (lock_req.dict.dict_val,
+                                lock_req.dict.dict_len, &ctx->dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to unserialize the dictionary");
+                goto out;
+        }
+
+        is_synctasked = dict_get_str_boolean (ctx->dict,
+                                              "is_synctasked", _gf_false);
+        if (is_synctasked) {
+                ret = glusterd_syctasked_mgmt_v3_unlock (req, &lock_req, ctx);
+                /* The above function does not take ownership of ctx.
+                 * Therefore we need to free the ctx explicitly. */
+                free_ctx = _gf_true;
+        }
+        else {
+                ret = glusterd_op_state_machine_mgmt_v3_unlock (req, &lock_req,
+                                                                ctx);
+        }
+
+out:
+
+        /* ctx is still NULL when we bailed out before allocating it   *
+         * (decode failure, unknown peer, failed allocation), so it    *
+         * must be tested before ctx->dict is looked at                */
+        if (ctx && (ret || free_ctx)) {
+                if (ctx->dict)
+                        dict_unref (ctx->dict);
+                GF_FREE (ctx);
+        }
+
+        free (lock_req.dict.dict_val);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Actor for GLUSTERD_MGMT_V3_LOCK: passes req together with              *
+ * glusterd_handle_mgmt_v3_lock_fn to glusterd_big_locked_handler and      *
+ * returns its result                                                      */
+int
+glusterd_handle_mgmt_v3_lock (rpcsvc_request_t *req)
+{
+        return glusterd_big_locked_handler (req,
+                                            glusterd_handle_mgmt_v3_lock_fn);
+}
+
+/* Actor for GLUSTERD_MGMT_V3_PRE_VALIDATE: passes req together with      *
+ * glusterd_handle_pre_validate_fn to glusterd_big_locked_handler and      *
+ * returns its result                                                      */
+static int
+glusterd_handle_pre_validate (rpcsvc_request_t *req)
+{
+        return glusterd_big_locked_handler (req,
+                                            glusterd_handle_pre_validate_fn);
+}
+
+/* Actor for GLUSTERD_MGMT_V3_BRICK_OP: passes req together with          *
+ * glusterd_handle_brick_op_fn to glusterd_big_locked_handler and returns  *
+ * its result                                                              */
+static int
+glusterd_handle_brick_op (rpcsvc_request_t *req)
+{
+        return glusterd_big_locked_handler (req,
+                                            glusterd_handle_brick_op_fn);
+}
+
+/* Actor for GLUSTERD_MGMT_V3_COMMIT: passes req together with            *
+ * glusterd_handle_commit_fn to glusterd_big_locked_handler and returns    *
+ * its result                                                              */
+static int
+glusterd_handle_commit (rpcsvc_request_t *req)
+{
+        return glusterd_big_locked_handler (req,
+                                            glusterd_handle_commit_fn);
+}
+
+/* Actor for GLUSTERD_MGMT_V3_POST_VALIDATE: passes req together with     *
+ * glusterd_handle_post_validate_fn to glusterd_big_locked_handler and     *
+ * returns its result                                                      */
+static int
+glusterd_handle_post_validate (rpcsvc_request_t *req)
+{
+        return glusterd_big_locked_handler (req,
+                                            glusterd_handle_post_validate_fn);
+}
+
+/* Actor for GLUSTERD_MGMT_V3_UNLOCK: passes req together with            *
+ * glusterd_handle_mgmt_v3_unlock_fn to glusterd_big_locked_handler and    *
+ * returns its result                                                      */
+int
+glusterd_handle_mgmt_v3_unlock (rpcsvc_request_t *req)
+{
+        return glusterd_big_locked_handler (req,
+                                            glusterd_handle_mgmt_v3_unlock_fn);
+}
+
+/* Actor table of the mgmt_v3 program, indexed by procedure number: each  *
+ * entry gives the procedure's name, its number and the handler defined    *
+ * above that serves it                                                    */
+rpcsvc_actor_t gd_svc_mgmt_v3_actors[] = {
+        [GLUSTERD_MGMT_V3_NULL]          = { "NULL",           GLUSTERD_MGMT_V3_NULL,          glusterd_mgmt_v3_null,          NULL, 0, DRC_NA},
+        [GLUSTERD_MGMT_V3_LOCK]          = { "MGMT_V3_LOCK",   GLUSTERD_MGMT_V3_LOCK,          glusterd_handle_mgmt_v3_lock,   NULL, 0, DRC_NA},
+        [GLUSTERD_MGMT_V3_PRE_VALIDATE]  = { "PRE_VAL",        GLUSTERD_MGMT_V3_PRE_VALIDATE,  glusterd_handle_pre_validate,   NULL, 0, DRC_NA},
+        [GLUSTERD_MGMT_V3_BRICK_OP]      = { "BRCK_OP",        GLUSTERD_MGMT_V3_BRICK_OP,      glusterd_handle_brick_op,       NULL, 0, DRC_NA},
+        [GLUSTERD_MGMT_V3_COMMIT]        = { "COMMIT",         GLUSTERD_MGMT_V3_COMMIT,        glusterd_handle_commit,         NULL, 0, DRC_NA},
+        [GLUSTERD_MGMT_V3_POST_VALIDATE] = { "POST_VAL",       GLUSTERD_MGMT_V3_POST_VALIDATE, glusterd_handle_post_validate,  NULL, 0, DRC_NA},
+        [GLUSTERD_MGMT_V3_UNLOCK]        = { "MGMT_V3_UNLOCK", GLUSTERD_MGMT_V3_UNLOCK,        glusterd_handle_mgmt_v3_unlock, NULL, 0, DRC_NA},
+};
+
+/* RPC program descriptor for the mgmt_v3 service: program number and     *
+ * version, the actor table above and its size; the synctask flag is set   */
+struct rpcsvc_program gd_svc_mgmt_v3_prog = {
+        .progname  = "GlusterD svc mgmt v3",
+        .prognum   = GD_MGMT_PROGRAM,
+        .progver   = GD_MGMT_V3_VERSION,
+        .numactors = GLUSTERD_MGMT_V3_MAXVALUE,
+        .actors    = gd_svc_mgmt_v3_actors,
+        .synctask  = _gf_true,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-mgmt.c
new file mode 100644
index 000000000..5295f889e
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.c
@@ -0,0 +1,1899 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+/* rpc related syncops */
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "glusterd1-xdr.h"
+#include "glusterd-syncop.h"
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-op-sm.h"
+
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
+
+
+/*
+ * Record a failed mgmt_v3 reply in @args.
+ *
+ * Does nothing when @op_ret is zero.  Otherwise stores @op_ret and
+ * @op_errno in @args, logs a one-line description of the failure and
+ * appends that line to args->errstr (newline separated), so failures
+ * reported for several peers accumulate into a single message.
+ *
+ * @op_errstr  optional detail text; when NULL or empty a generic "check
+ *             the log file" hint is used instead.
+ * @op_code    GLUSTERD_MGMT_V3_* procedure that failed; selects the
+ *             wording.  An unrecognised code yields an empty description.
+ * @peerinfo   peer the reply belongs to; its hostname identifies the peer.
+ *             May be NULL, in which case @uuid is printed instead.
+ *
+ * snprintf() is always given the full buffer size and its return value is
+ * never used as an index: it reports the length the text would have had
+ * without truncation, so indexing with it can write outside the buffer
+ * (and for an unrecognised @op_code the index was stale or -1).  Text that
+ * does not fit is truncated.
+ */
+static void
+gd_mgmt_v3_collate_errors (struct syncargs *args, int op_ret, int op_errno,
+                           char *op_errstr, int op_code,
+                           glusterd_peerinfo_t *peerinfo, u_char *uuid)
+{
+        char       *peer_str          = NULL;
+        const char *phase             = NULL;
+        char        err_str[PATH_MAX] = "Please check log file for details.";
+        char        op_err[PATH_MAX]  = "";
+        xlator_t   *this              = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (args);
+        GF_ASSERT (uuid);
+
+        if (!op_ret)
+                return;
+
+        args->op_ret = op_ret;
+        args->op_errno = op_errno;
+
+        if (peerinfo)
+                peer_str = peerinfo->hostname;
+        else
+                peer_str = uuid_utoa (uuid);
+
+        if (op_errstr && strcmp (op_errstr, ""))
+                snprintf (err_str, sizeof (err_str), "Error: %s", op_errstr);
+
+        switch (op_code) {
+        case GLUSTERD_MGMT_V3_LOCK:
+                phase = "Locking";
+                break;
+        case GLUSTERD_MGMT_V3_PRE_VALIDATE:
+                phase = "Pre Validation";
+                break;
+        case GLUSTERD_MGMT_V3_BRICK_OP:
+                phase = "Brick ops";
+                break;
+        case GLUSTERD_MGMT_V3_COMMIT:
+                phase = "Commit";
+                break;
+        case GLUSTERD_MGMT_V3_POST_VALIDATE:
+                phase = "Post Validation";
+                break;
+        case GLUSTERD_MGMT_V3_UNLOCK:
+                phase = "Unlocking";
+                break;
+        default:
+                /* Unknown procedure: op_err stays empty. */
+                break;
+        }
+
+        if (phase)
+                snprintf (op_err, sizeof (op_err), "%s failed on %s. %s",
+                          phase, peer_str, err_str);
+
+        /* err_str is reused from here on as the accumulated message; its
+         * previous content has already been copied into op_err. */
+        if (args->errstr) {
+                snprintf (err_str, sizeof (err_str), "%s\n%s",
+                          args->errstr, op_err);
+                GF_FREE (args->errstr);
+                args->errstr = NULL;
+        } else {
+                snprintf (err_str, sizeof (err_str), "%s", op_err);
+        }
+
+        gf_log (this->name, GF_LOG_ERROR, "%s", op_err);
+        args->errstr = gf_strdup (err_str);
+}
+
+/*
+ * Run the pre-validation step for @op on this node.
+ *
+ * Only GD_OP_SNAP has a step: glusterd_snapshot_prevalidate() is called
+ * with @dict, @op_errstr and @rsp_dict.  Any other op is accepted as is.
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by the
+ * snapshot pre-validation.
+ */
+int32_t
+gd_mgmt_v3_pre_validate_fn (glusterd_op_t op, dict_t *dict,
+                            char **op_errstr, dict_t *rsp_dict)
+{
+        xlator_t *this = THIS;
+        int32_t   ret  = 0;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+
+        if (op == GD_OP_SNAP) {
+                ret = glusterd_snapshot_prevalidate (dict, op_errstr,
+                                                     rsp_dict);
+                if (ret)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Snapshot Prevalidate Failed");
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret);
+        return ret;
+}
+
+/*
+ * Run the brick-op step for @op on this node.
+ *
+ * Only GD_OP_SNAP has a step: glusterd_snapshot_brickop() is called with
+ * @dict, @op_errstr and @rsp_dict.  Any other op is accepted as is.
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by the
+ * snapshot brick op.
+ */
+int32_t
+gd_mgmt_v3_brick_op_fn (glusterd_op_t op, dict_t *dict,
+                        char **op_errstr, dict_t *rsp_dict)
+{
+        xlator_t *this = THIS;
+        int32_t   ret  = 0;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+
+        if (op == GD_OP_SNAP) {
+                ret = glusterd_snapshot_brickop (dict, op_errstr, rsp_dict);
+                if (ret)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "snapshot brickop failed");
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "OP = %d. Returning %d", op, ret);
+        return ret;
+}
+
+/*
+ * Run the commit step for @op on this node.
+ *
+ * Only GD_OP_SNAP has a step: glusterd_snapshot() is called with @dict,
+ * @op_errstr and @rsp_dict.  Any other op is accepted as is.
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by the
+ * snapshot commit.
+ */
+int32_t
+gd_mgmt_v3_commit_fn (glusterd_op_t op, dict_t *dict,
+                      char **op_errstr, dict_t *rsp_dict)
+{
+        xlator_t *this = THIS;
+        int32_t   ret  = 0;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+
+        if (op == GD_OP_SNAP) {
+                ret = glusterd_snapshot (dict, op_errstr, rsp_dict);
+                if (ret)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Snapshot Commit Failed");
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret);
+        return ret;
+}
+
+/*
+ * Run the post-validation step for @op on this node.
+ *
+ * Only GD_OP_SNAP has a step: glusterd_snapshot_postvalidate() is called
+ * with @dict, @op_ret, @op_errstr and @rsp_dict.  Any other op is accepted
+ * as is.
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by the
+ * snapshot post-validation.
+ */
+int32_t
+gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict,
+                             char **op_errstr, dict_t *rsp_dict)
+{
+        xlator_t *this = THIS;
+        int32_t   ret  = 0;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+
+        if (op == GD_OP_SNAP) {
+                ret = glusterd_snapshot_postvalidate (dict, op_ret,
+                                                      op_errstr, rsp_dict);
+                if (ret)
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "postvalidate operation failed");
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "OP = %d. Returning %d", op, ret);
+        return ret;
+}
+
+/*
+ * Reply handler for a mgmt_v3 lock request.
+ *
+ * The syncargs and the peer are taken off the frame (frame->local and
+ * frame->cookie) before the reply status is examined, because the error
+ * collation below needs @args even when the request itself failed.  The
+ * same ordering is used by the callbacks of the other phases.
+ *
+ * A failed RPC is reported as ENOTCONN, a missing reply buffer as EINVAL;
+ * otherwise op_ret/op_errno come from the decoded reply.  The outcome is
+ * handed to gd_mgmt_v3_collate_errors(), the decoded dict buffer is freed,
+ * the frame's stack is destroyed and the barrier on @args is woken.
+ *
+ * @count is not used.  Always returns 0.
+ */
+int32_t
+gd_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+                        int count, void *myframe)
+{
+        gd1_mgmt_v3_lock_rsp  rsp      = {{0},};
+        call_frame_t         *frame    = myframe;
+        struct syncargs      *args     = NULL;
+        glusterd_peerinfo_t  *peerinfo = NULL;
+        int32_t               op_ret   = -1;
+        int32_t               op_errno = -1;
+        xlator_t             *this     = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (myframe);
+
+        args = frame->local;
+        peerinfo = frame->cookie;
+        frame->local = NULL;
+        frame->cookie = NULL;
+
+        if (-1 == req->rpc_status) {
+                op_errno = ENOTCONN;
+        } else if (!iov) {
+                gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+                op_errno = EINVAL;
+        } else if (xdr_to_generic (*iov, &rsp,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp) >= 0) {
+                uuid_copy (args->uuid, rsp.uuid);
+                op_ret = rsp.op_ret;
+                op_errno = rsp.op_errno;
+        }
+
+        gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+                                   GLUSTERD_MGMT_V3_LOCK,
+                                   peerinfo, rsp.uuid);
+        free (rsp.dict.dict_val);
+        STACK_DESTROY (frame->root);
+        synctask_barrier_wake(args);
+        return 0;
+}
+
+/* RPC reply callback for lock requests: forwards its arguments together
+ * with gd_mgmt_v3_lock_cbk_fn to glusterd_big_locked_cbk() and returns
+ * that call's result. */
+int32_t
+gd_mgmt_v3_lock_cbk (struct rpc_req *req, struct iovec *iov,
+                     int count, void *myframe)
+{
+        int32_t ret = glusterd_big_locked_cbk (req, iov, count, myframe,
+                                               gd_mgmt_v3_lock_cbk_fn);
+
+        return ret;
+}
+
+/*
+ * Send a mgmt_v3 lock request to one peer.
+ *
+ * @op_ctx is serialised into the request, which also carries @my_uuid and
+ * @op.  The request is submitted on peerinfo->rpc with gd_mgmt_v3_lock_cbk
+ * as the reply callback and @args/@peerinfo as the values passed through
+ * to it.  conf->big_lock is released around the submit call and taken
+ * again afterwards.  @recv_uuid is not used.
+ *
+ * Returns the serialisation error if serialising failed, otherwise the
+ * return value of gd_syncop_submit_request().
+ */
+int
+gd_mgmt_v3_lock (glusterd_op_t op, dict_t *op_ctx,
+                 glusterd_peerinfo_t *peerinfo,
+                 struct syncargs *args, uuid_t my_uuid,
+                 uuid_t recv_uuid)
+{
+        gd1_mgmt_v3_lock_req  req  = {{0},};
+        xlator_t             *this = THIS;
+        glusterd_conf_t      *conf = this->private;
+        int32_t               ret  = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (op_ctx);
+        GF_ASSERT (peerinfo);
+        GF_ASSERT (args);
+
+        ret = dict_allocate_and_serialize (op_ctx, &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (!ret) {
+                uuid_copy (req.uuid, my_uuid);
+                req.op = op;
+
+                synclock_unlock (&conf->big_lock);
+                ret = gd_syncop_submit_request (peerinfo->rpc, &req, args,
+                                                peerinfo, &gd_mgmt_v3_prog,
+                                                GLUSTERD_MGMT_V3_LOCK,
+                                                gd_mgmt_v3_lock_cbk,
+                                                (xdrproc_t) xdr_gd1_mgmt_v3_lock_req);
+                synclock_lock (&conf->big_lock);
+        }
+
+        GF_FREE (req.dict.dict_val);
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Lock phase of a mgmt_v3 transaction.
+ *
+ * Takes the mgmt_v3 locks described by @dict on the local node and, when
+ * @npeers is non-zero, sends a lock request to every peer on
+ * conf->xaction_peers and waits for all replies.
+ *
+ * *@is_acquired is set to _gf_true as soon as the local locks are held,
+ * even if locking on a peer fails afterwards.
+ *
+ * On failure the available detail (errors collated from the peers, or
+ * else whatever *@op_errstr held on entry) is written to the log and
+ * *@op_errstr is replaced by a generic "another transaction is in
+ * progress" message, or NULL if that message cannot be allocated.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+glusterd_mgmt_v3_initiate_lockdown (glusterd_conf_t *conf, glusterd_op_t op,
+                                    dict_t *dict, char **op_errstr, int npeers,
+                                    gf_boolean_t *is_acquired)
+{
+        glusterd_peerinfo_t *peerinfo  = NULL;
+        int32_t              ret       = -1;
+        int32_t              peer_cnt  = 0;
+        struct syncargs      args      = {0};
+        struct list_head    *peers     = NULL;
+        uuid_t               peer_uuid = {0};
+        xlator_t            *this      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (conf);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (is_acquired);
+
+        peers = &conf->xaction_peers;
+
+        /* Trying to acquire multiple mgmt_v3 locks on local node */
+        ret = glusterd_multiple_mgmt_v3_lock (dict, MY_UUID);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to acquire mgmt_v3 locks on localhost");
+                goto out;
+        }
+
+        *is_acquired = _gf_true;
+
+        if (!npeers) {
+                ret = 0;
+                goto out;
+        }
+
+        /* Sending mgmt_v3 lock req to other nodes in the cluster */
+        gd_syncargs_init (&args, NULL);
+        synctask_barrier_init((&args));
+        peer_cnt = 0;
+        list_for_each_entry (peerinfo, peers, op_peers_list) {
+                gd_mgmt_v3_lock (op, dict, peerinfo, &args,
+                                 MY_UUID, peer_uuid);
+                peer_cnt++;
+        }
+        gd_synctask_barrier_wait((&args), peer_cnt);
+
+        ret = args.op_ret;
+
+        gf_log (this->name, GF_LOG_DEBUG, "Sent lock op req for %s "
+                "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+        if (ret) {
+                /* The detail is only logged: the caller always receives the
+                 * generic message below.  Logging args.errstr directly
+                 * avoids a temporary copy that used to be leaked. */
+                if (args.errstr)
+                        gf_log (this->name, GF_LOG_ERROR, "%s",
+                                args.errstr);
+                else if (*op_errstr)
+                        gf_log (this->name, GF_LOG_ERROR, "%s",
+                                *op_errstr);
+
+                ret = gf_asprintf (op_errstr,
+                                   "Another transaction is in progress. "
+                                   "Please try again after sometime.");
+
+                if (ret == -1)
+                        *op_errstr = NULL;
+
+                ret = -1;
+        }
+
+        /* The collated peer errors are owned by this function. */
+        GF_FREE (args.errstr);
+
+        return ret;
+}
+
+/*
+ * Merge a pre-validation response dictionary @rsp into @aggr.
+ *
+ * Only GD_OP_SNAP is supported; it is delegated to
+ * glusterd_snap_pre_validate_use_rsp_dict().  Any other op is logged as
+ * invalid.
+ *
+ * Returns 0 on success, the delegate's non-zero value on failure, or -1
+ * for an unsupported op.
+ */
+int
+glusterd_pre_validate_aggr_rsp_dict (glusterd_op_t op,
+                                     dict_t *aggr, dict_t *rsp)
+{
+        xlator_t *this = THIS;
+        int32_t   ret  = 0;
+
+        GF_ASSERT (this);
+        GF_ASSERT (aggr);
+        GF_ASSERT (rsp);
+
+        if (op != GD_OP_SNAP) {
+                gf_log (this->name, GF_LOG_ERROR, "Invalid op (%s)",
+                        gd_op_list[op]);
+                return -1;
+        }
+
+        ret = glusterd_snap_pre_validate_use_rsp_dict (aggr, rsp);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to aggregate prevalidate "
+                        "response dictionaries.");
+
+        return ret;
+}
+
+/*
+ * Reply handler for a mgmt_v3 pre-validate request.
+ *
+ * Takes the syncargs and the peer off the frame, decodes the reply and,
+ * when it carries a dictionary, unserialises it and merges it into
+ * args->dict while holding args->lock_dict.  The outcome is handed to
+ * gd_mgmt_v3_collate_errors(); afterwards the frame's stack is destroyed
+ * and the barrier on @args is woken.
+ *
+ * A failed RPC is reported as ENOTCONN and a missing reply buffer as
+ * EINVAL.  If the merge fails although the peer reported success, the
+ * merge error becomes the op_ret.
+ *
+ * @count is not used.  Always returns 0.
+ */
+int32_t
+gd_mgmt_v3_pre_validate_cbk_fn (struct rpc_req *req, struct iovec *iov,
+                                int count, void *myframe)
+{
+        int32_t                  ret      = -1;
+        struct syncargs         *args     = NULL;
+        glusterd_peerinfo_t     *peerinfo = NULL;
+        gd1_mgmt_v3_pre_val_rsp  rsp      = {{0},};
+        call_frame_t            *frame    = NULL;
+        int32_t                  op_ret   = -1;
+        int32_t                  op_errno = -1;
+        dict_t                  *rsp_dict = NULL;
+        xlator_t                *this     = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (myframe);
+
+        frame = myframe;
+        args = frame->local;
+        peerinfo = frame->cookie;
+        frame->local = NULL;
+        frame->cookie = NULL;
+
+        if (-1 == req->rpc_status) {
+                op_errno = ENOTCONN;
+                goto out;
+        }
+
+        if (!iov) {
+                gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+                op_errno = EINVAL;
+                /* Without this jump the decode below dereferences the
+                 * NULL iov. */
+                goto out;
+        }
+
+        ret = xdr_to_generic (*iov, &rsp,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp);
+        if (ret < 0)
+                goto out;
+
+        if (rsp.dict.dict_len) {
+                /* Unserialize the dictionary */
+                rsp_dict = dict_new ();
+
+                ret = dict_unserialize (rsp.dict.dict_val,
+                                        rsp.dict.dict_len,
+                                        &rsp_dict);
+                if (ret < 0) {
+                        free (rsp.dict.dict_val);
+                        goto out;
+                } else {
+                        /* The decoded buffer is handed to the dictionary
+                         * through its extra_stdfree field. */
+                        rsp_dict->extra_stdfree = rsp.dict.dict_val;
+                }
+        }
+
+        uuid_copy (args->uuid, rsp.uuid);
+        pthread_mutex_lock (&args->lock_dict);
+        {
+                ret = glusterd_pre_validate_aggr_rsp_dict (rsp.op, args->dict,
+                                                           rsp_dict);
+        }
+        pthread_mutex_unlock (&args->lock_dict);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "%s",
+                        "Failed to aggregate response from "
+                        " node/brick");
+                if (!rsp.op_ret)
+                        op_ret = ret;
+                else {
+                        op_ret = rsp.op_ret;
+                        op_errno = rsp.op_errno;
+                }
+        } else {
+                op_ret = rsp.op_ret;
+                op_errno = rsp.op_errno;
+        }
+
+out:
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+                                   GLUSTERD_MGMT_V3_PRE_VALIDATE,
+                                   peerinfo, rsp.uuid);
+
+        if (rsp.op_errstr)
+                free (rsp.op_errstr);
+
+        STACK_DESTROY (frame->root);
+        synctask_barrier_wake(args);
+        return 0;
+}
+
+/* RPC reply callback for pre-validate requests: forwards its arguments
+ * together with gd_mgmt_v3_pre_validate_cbk_fn to glusterd_big_locked_cbk()
+ * and returns that call's result. */
+int32_t
+gd_mgmt_v3_pre_validate_cbk (struct rpc_req *req, struct iovec *iov,
+                             int count, void *myframe)
+{
+        int32_t ret = glusterd_big_locked_cbk (req, iov, count, myframe,
+                                               gd_mgmt_v3_pre_validate_cbk_fn);
+
+        return ret;
+}
+
+/*
+ * Send a mgmt_v3 pre-validate request to one peer.
+ *
+ * @op_ctx is serialised into the request, which also carries @my_uuid and
+ * @op.  The request is submitted on peerinfo->rpc with
+ * gd_mgmt_v3_pre_validate_cbk as the reply callback and @args/@peerinfo as
+ * the values passed through to it.  conf->big_lock is released around the
+ * submit call and taken again afterwards.  @recv_uuid is not used.
+ *
+ * Returns the serialisation error if serialising failed, otherwise the
+ * return value of gd_syncop_submit_request().
+ */
+int
+gd_mgmt_v3_pre_validate_req (glusterd_op_t op, dict_t *op_ctx,
+                             glusterd_peerinfo_t *peerinfo,
+                             struct syncargs *args, uuid_t my_uuid,
+                             uuid_t recv_uuid)
+{
+        gd1_mgmt_v3_pre_val_req  req  = {{0},};
+        xlator_t                *this = THIS;
+        glusterd_conf_t         *conf = this->private;
+        int32_t                  ret  = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (op_ctx);
+        GF_ASSERT (peerinfo);
+        GF_ASSERT (args);
+
+        ret = dict_allocate_and_serialize (op_ctx, &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (!ret) {
+                uuid_copy (req.uuid, my_uuid);
+                req.op = op;
+
+                synclock_unlock (&conf->big_lock);
+                ret = gd_syncop_submit_request (peerinfo->rpc, &req, args,
+                                                peerinfo, &gd_mgmt_v3_prog,
+                                                GLUSTERD_MGMT_V3_PRE_VALIDATE,
+                                                gd_mgmt_v3_pre_validate_cbk,
+                                                (xdrproc_t) xdr_gd1_mgmt_v3_pre_val_req);
+                synclock_lock (&conf->big_lock);
+        }
+
+        GF_FREE (req.dict.dict_val);
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Pre-validation phase of a mgmt_v3 transaction.
+ *
+ * Runs gd_mgmt_v3_pre_validate_fn() on the local node, merges the local
+ * response into @req_dict and then, when @npeers is non-zero, sends a
+ * pre-validate request to every peer on conf->xaction_peers and waits for
+ * all replies.
+ *
+ * When the local step fails without producing a message, *@op_errstr is
+ * set to a generic one.  When a peer fails, *@op_errstr is set to a copy
+ * of the errors collated from the peers.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_pre_validate (glusterd_conf_t *conf, glusterd_op_t op,
+                               dict_t *req_dict, char **op_errstr, int npeers)
+{
+        int32_t              ret       = -1;
+        int32_t              peer_cnt  = 0;
+        dict_t              *rsp_dict  = NULL;
+        glusterd_peerinfo_t *peerinfo  = NULL;
+        struct syncargs      args      = {0};
+        struct list_head    *peers     = NULL;
+        uuid_t               peer_uuid = {0};
+        xlator_t            *this      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (conf);
+        GF_ASSERT (req_dict);
+        GF_ASSERT (op_errstr);
+
+        peers = &conf->xaction_peers;
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to create response dictionary");
+                goto out;
+        }
+
+        /* Pre Validation on local node */
+        ret = gd_mgmt_v3_pre_validate_fn (op, req_dict, op_errstr,
+                                          rsp_dict);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Pre Validation failed for "
+                        "operation %s on local node",
+                        gd_op_list[op]);
+
+                if (*op_errstr == NULL) {
+                        ret = gf_asprintf (op_errstr,
+                                           "Pre-validation failed "
+                                           "on localhost. Please "
+                                           "check log file for details");
+                        if (ret == -1)
+                                *op_errstr = NULL;
+
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        ret = glusterd_pre_validate_aggr_rsp_dict (op, req_dict,
+                                                   rsp_dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "%s",
+                        "Failed to aggregate response from "
+                        " node/brick");
+                goto out;
+        }
+
+        dict_unref (rsp_dict);
+        rsp_dict = NULL;
+
+        if (!npeers) {
+                ret = 0;
+                goto out;
+        }
+
+        /* Sending Pre Validation req to other nodes in the cluster */
+        gd_syncargs_init (&args, req_dict);
+        synctask_barrier_init((&args));
+        peer_cnt = 0;
+        list_for_each_entry (peerinfo, peers, op_peers_list) {
+                gd_mgmt_v3_pre_validate_req (op, req_dict, peerinfo, &args,
+                                             MY_UUID, peer_uuid);
+                peer_cnt++;
+        }
+        gd_synctask_barrier_wait((&args), peer_cnt);
+
+        if (args.op_ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Pre Validation failed on peers");
+
+                if (args.errstr)
+                        *op_errstr = gf_strdup (args.errstr);
+        }
+
+        ret = args.op_ret;
+
+        gf_log (this->name, GF_LOG_DEBUG, "Sent pre validation req for %s "
+                "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+        /* The failure paths above arrive here still holding rsp_dict. */
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        /* *op_errstr received a copy; the collated original is ours. */
+        GF_FREE (args.errstr);
+
+        return ret;
+}
+
+/*
+ * Build the request dictionary for a mgmt_v3 operation.
+ *
+ * A new dictionary is created and, for GD_OP_SNAP, filled with a copy of
+ * @dict; for any other op it is left empty.  On success the dictionary is
+ * stored in *@req.  @op_errstr is not written.
+ *
+ * Returns 0 on success, -1 if the dictionary cannot be created.
+ */
+int
+glusterd_mgmt_v3_build_payload (dict_t **req, char **op_errstr, dict_t *dict,
+                                glusterd_op_t op)
+{
+        xlator_t *this    = THIS;
+        dict_t   *payload = NULL;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (dict);
+
+        payload = dict_new ();
+        if (!payload)
+                return -1;
+
+        if (op == GD_OP_SNAP)
+                dict_copy (dict, payload);
+
+        *req = payload;
+        return 0;
+}
+
+/*
+ * Reply handler for a mgmt_v3 brick-op request.
+ *
+ * Takes the syncargs and the peer off the frame, then looks at the RPC
+ * status before touching @iov, since @iov can be NULL when the request
+ * failed.  A failed RPC is reported as ENOTCONN, a missing reply buffer as
+ * EINVAL; otherwise op_ret/op_errno come from the decoded reply.
+ *
+ * The outcome is handed to gd_mgmt_v3_collate_errors(), the decoded error
+ * string and dict buffer are freed, the frame's stack is destroyed and the
+ * barrier on @args is woken.
+ *
+ * @count is not used.  Always returns 0.
+ */
+int32_t
+gd_mgmt_v3_brick_op_cbk_fn (struct rpc_req *req, struct iovec *iov,
+                            int count, void *myframe)
+{
+        gd1_mgmt_v3_brick_op_rsp  rsp      = {{0},};
+        call_frame_t             *frame    = myframe;
+        struct syncargs          *args     = NULL;
+        glusterd_peerinfo_t      *peerinfo = NULL;
+        int32_t                   op_ret   = -1;
+        int32_t                   op_errno = -1;
+        xlator_t                 *this     = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (myframe);
+
+        args = frame->local;
+        peerinfo = frame->cookie;
+        frame->local = NULL;
+        frame->cookie = NULL;
+
+        if (-1 == req->rpc_status) {
+                op_errno = ENOTCONN;
+        } else if (!iov) {
+                gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+                op_errno = EINVAL;
+        } else if (xdr_to_generic (*iov, &rsp,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp) >= 0) {
+                uuid_copy (args->uuid, rsp.uuid);
+                op_ret = rsp.op_ret;
+                op_errno = rsp.op_errno;
+        }
+
+        gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+                                   GLUSTERD_MGMT_V3_BRICK_OP,
+                                   peerinfo, rsp.uuid);
+
+        free (rsp.op_errstr);
+        free (rsp.dict.dict_val);
+
+        STACK_DESTROY (frame->root);
+        synctask_barrier_wake(args);
+        return 0;
+}
+
+/* RPC reply callback for brick-op requests: forwards its arguments
+ * together with gd_mgmt_v3_brick_op_cbk_fn to glusterd_big_locked_cbk() and
+ * returns that call's result. */
+int32_t
+gd_mgmt_v3_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
+                         int count, void *myframe)
+{
+        int32_t ret = glusterd_big_locked_cbk (req, iov, count, myframe,
+                                               gd_mgmt_v3_brick_op_cbk_fn);
+
+        return ret;
+}
+
+/*
+ * Send a mgmt_v3 brick-op request to one peer.
+ *
+ * @op_ctx is serialised into the request, which also carries @my_uuid and
+ * @op.  The request is submitted on peerinfo->rpc with
+ * gd_mgmt_v3_brick_op_cbk as the reply callback and @args/@peerinfo as the
+ * values passed through to it.  conf->big_lock is released around the
+ * submit call and taken again afterwards.  @recv_uuid is not used.
+ *
+ * Returns the serialisation error if serialising failed, otherwise the
+ * return value of gd_syncop_submit_request().
+ */
+int
+gd_mgmt_v3_brick_op_req (glusterd_op_t op, dict_t *op_ctx,
+                         glusterd_peerinfo_t *peerinfo,
+                         struct syncargs *args, uuid_t my_uuid,
+                         uuid_t recv_uuid)
+{
+        gd1_mgmt_v3_brick_op_req  req  = {{0},};
+        xlator_t                 *this = THIS;
+        glusterd_conf_t          *conf = this->private;
+        int32_t                   ret  = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (op_ctx);
+        GF_ASSERT (peerinfo);
+        GF_ASSERT (args);
+
+        ret = dict_allocate_and_serialize (op_ctx, &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (!ret) {
+                uuid_copy (req.uuid, my_uuid);
+                req.op = op;
+
+                synclock_unlock (&conf->big_lock);
+                ret = gd_syncop_submit_request (peerinfo->rpc, &req, args,
+                                                peerinfo, &gd_mgmt_v3_prog,
+                                                GLUSTERD_MGMT_V3_BRICK_OP,
+                                                gd_mgmt_v3_brick_op_cbk,
+                                                (xdrproc_t) xdr_gd1_mgmt_v3_brick_op_req);
+                synclock_lock (&conf->big_lock);
+        }
+
+        GF_FREE (req.dict.dict_val);
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Brick-op phase of a mgmt_v3 transaction.
+ *
+ * Runs gd_mgmt_v3_brick_op_fn() on the local node and then, when @npeers
+ * is non-zero, sends a brick-op request to every peer on
+ * conf->xaction_peers and waits for all replies.
+ *
+ * When the local step fails without producing a message, *@op_errstr is
+ * set to a generic one.  When a peer fails, *@op_errstr is set to a copy
+ * of the errors collated from the peers.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_brick_op (glusterd_conf_t *conf, glusterd_op_t op,
+                           dict_t *req_dict, char **op_errstr, int npeers)
+{
+        int32_t              ret       = -1;
+        int32_t              peer_cnt  = 0;
+        dict_t              *rsp_dict  = NULL;
+        glusterd_peerinfo_t *peerinfo  = NULL;
+        struct syncargs      args      = {0};
+        struct list_head    *peers     = NULL;
+        uuid_t               peer_uuid = {0};
+        xlator_t            *this      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (conf);
+        GF_ASSERT (req_dict);
+        GF_ASSERT (op_errstr);
+
+        peers = &conf->xaction_peers;
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to create response dictionary");
+                goto out;
+        }
+
+        /* Perform brick op on local node */
+        ret = gd_mgmt_v3_brick_op_fn (op, req_dict, op_errstr,
+                                      rsp_dict);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Brick ops failed for "
+                        "operation %s on local node",
+                        gd_op_list[op]);
+
+                if (*op_errstr == NULL) {
+                        ret = gf_asprintf (op_errstr,
+                                           "Brick ops failed "
+                                           "on localhost. Please "
+                                           "check log file for details");
+                        if (ret == -1)
+                                *op_errstr = NULL;
+
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        dict_unref (rsp_dict);
+        rsp_dict = NULL;
+
+        if (!npeers) {
+                ret = 0;
+                goto out;
+        }
+
+        /* Sending brick op req to other nodes in the cluster */
+        gd_syncargs_init (&args, NULL);
+        synctask_barrier_init((&args));
+        peer_cnt = 0;
+        list_for_each_entry (peerinfo, peers, op_peers_list) {
+                gd_mgmt_v3_brick_op_req (op, req_dict, peerinfo, &args,
+                                         MY_UUID, peer_uuid);
+                peer_cnt++;
+        }
+        gd_synctask_barrier_wait((&args), peer_cnt);
+
+        if (args.op_ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Brick ops failed on peers");
+
+                if (args.errstr)
+                        *op_errstr = gf_strdup (args.errstr);
+        }
+
+        ret = args.op_ret;
+
+        gf_log (this->name, GF_LOG_DEBUG, "Sent brick op req for %s "
+                "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+        /* The local-failure path arrives here still holding rsp_dict. */
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        /* *op_errstr received a copy; the collated original is ours. */
+        GF_FREE (args.errstr);
+
+        return ret;
+}
+
+/*
+ * Reply handler for a mgmt_v3 commit request.
+ *
+ * Takes the syncargs and the peer off the frame, decodes the reply and,
+ * when it carries a dictionary, unserialises it and merges it into
+ * args->dict with glusterd_syncop_aggr_rsp_dict() while holding
+ * args->lock_dict. The outcome is handed to gd_mgmt_v3_collate_errors();
+ * afterwards the frame's stack is destroyed and the barrier on args is
+ * woken. count is not used. Always returns 0.
+ *
+ * NOTE(review): the pre-validate, brick-op and post-validate callbacks
+ * release rsp.op_errstr, this one does not - confirm whether
+ * gd1_mgmt_v3_commit_rsp carries an op_errstr that should be freed here.
+ */
+int32_t
+gd_mgmt_v3_commit_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int32_t ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_commit_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = -1;
+ dict_t *rsp_dict = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (myframe);
+
+ /* args is needed by the error collation below even when the request
+ failed, so it is fetched before the status is examined. */
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (!iov) {
+ gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp);
+ if (ret < 0)
+ goto out;
+
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ rsp_dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret < 0) {
+ free (rsp.dict.dict_val);
+ goto out;
+ } else {
+ /* The decoded buffer is handed to the dictionary through its
+ extra_stdfree field. */
+ rsp_dict->extra_stdfree = rsp.dict.dict_val;
+ }
+ }
+
+ uuid_copy (args->uuid, rsp.uuid);
+ pthread_mutex_lock (&args->lock_dict);
+ {
+ ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict,
+ rsp_dict);
+ }
+ pthread_mutex_unlock (&args->lock_dict);
+
+ /* A merge failure only becomes the op_ret when the peer itself
+ reported success; otherwise the peer's own result is kept. */
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ if (!rsp.op_ret)
+ op_ret = ret;
+ else {
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+ }
+ } else {
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+ }
+
+out:
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_COMMIT,
+ peerinfo, rsp.uuid);
+
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+/* RPC reply callback for commit requests: forwards its arguments together
+ * with gd_mgmt_v3_commit_cbk_fn to glusterd_big_locked_cbk() and returns
+ * that call's result. */
+int32_t
+gd_mgmt_v3_commit_cbk (struct rpc_req *req, struct iovec *iov,
+                       int count, void *myframe)
+{
+        int32_t ret = glusterd_big_locked_cbk (req, iov, count, myframe,
+                                               gd_mgmt_v3_commit_cbk_fn);
+
+        return ret;
+}
+
+/*
+ * Send a mgmt_v3 commit request to one peer.
+ *
+ * @op_ctx is serialised into the request, which also carries @my_uuid and
+ * @op.  The request is submitted on peerinfo->rpc with
+ * gd_mgmt_v3_commit_cbk as the reply callback and @args/@peerinfo as the
+ * values passed through to it.  conf->big_lock is released around the
+ * submit call and taken again afterwards.  @recv_uuid is not used.
+ *
+ * Returns the serialisation error if serialising failed, otherwise the
+ * return value of gd_syncop_submit_request().
+ */
+int
+gd_mgmt_v3_commit_req (glusterd_op_t op, dict_t *op_ctx,
+                       glusterd_peerinfo_t *peerinfo,
+                       struct syncargs *args, uuid_t my_uuid,
+                       uuid_t recv_uuid)
+{
+        gd1_mgmt_v3_commit_req  req  = {{0},};
+        xlator_t               *this = THIS;
+        glusterd_conf_t        *conf = this->private;
+        int32_t                 ret  = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (op_ctx);
+        GF_ASSERT (peerinfo);
+        GF_ASSERT (args);
+
+        ret = dict_allocate_and_serialize (op_ctx, &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (!ret) {
+                uuid_copy (req.uuid, my_uuid);
+                req.op = op;
+
+                synclock_unlock (&conf->big_lock);
+                ret = gd_syncop_submit_request (peerinfo->rpc, &req, args,
+                                                peerinfo, &gd_mgmt_v3_prog,
+                                                GLUSTERD_MGMT_V3_COMMIT,
+                                                gd_mgmt_v3_commit_cbk,
+                                                (xdrproc_t) xdr_gd1_mgmt_v3_commit_req);
+                synclock_lock (&conf->big_lock);
+        }
+
+        GF_FREE (req.dict.dict_val);
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Commit phase of a mgmt_v3 transaction.
+ *
+ * Runs gd_mgmt_v3_commit_fn() on the local node with @req_dict, merges the
+ * local response into @op_ctx and then, when @npeers is non-zero, sends a
+ * commit request to every peer on conf->xaction_peers and waits for all
+ * replies.  The peers are sent @req_dict; @op_ctx is the dictionary given
+ * to gd_syncargs_init() for their replies.
+ *
+ * When the local step fails without producing a message, *@op_errstr is
+ * set to a generic one.  When a peer fails, *@op_errstr is set to a copy
+ * of the errors collated from the peers.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_commit (glusterd_conf_t *conf, glusterd_op_t op,
+                         dict_t *op_ctx, dict_t *req_dict,
+                         char **op_errstr, int npeers)
+{
+        int32_t              ret       = -1;
+        int32_t              peer_cnt  = 0;
+        dict_t              *rsp_dict  = NULL;
+        glusterd_peerinfo_t *peerinfo  = NULL;
+        struct syncargs      args      = {0};
+        struct list_head    *peers     = NULL;
+        uuid_t               peer_uuid = {0};
+        xlator_t            *this      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (conf);
+        GF_ASSERT (op_ctx);
+        GF_ASSERT (req_dict);
+        GF_ASSERT (op_errstr);
+
+        peers = &conf->xaction_peers;
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to create response dictionary");
+                goto out;
+        }
+
+        /* Commit on local node */
+        ret = gd_mgmt_v3_commit_fn (op, req_dict, op_errstr,
+                                    rsp_dict);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Commit failed for "
+                        "operation %s on local node",
+                        gd_op_list[op]);
+
+                if (*op_errstr == NULL) {
+                        ret = gf_asprintf (op_errstr,
+                                           "Commit failed "
+                                           "on localhost. Please "
+                                           "check log file for details.");
+                        if (ret == -1)
+                                *op_errstr = NULL;
+
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx,
+                                             rsp_dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "%s",
+                        "Failed to aggregate response from "
+                        " node/brick");
+                goto out;
+        }
+
+        dict_unref (rsp_dict);
+        rsp_dict = NULL;
+
+        if (!npeers) {
+                ret = 0;
+                goto out;
+        }
+
+        /* Sending commit req to other nodes in the cluster */
+        gd_syncargs_init (&args, op_ctx);
+        synctask_barrier_init((&args));
+        peer_cnt = 0;
+        list_for_each_entry (peerinfo, peers, op_peers_list) {
+                gd_mgmt_v3_commit_req (op, req_dict, peerinfo, &args,
+                                       MY_UUID, peer_uuid);
+                peer_cnt++;
+        }
+        gd_synctask_barrier_wait((&args), peer_cnt);
+
+        if (args.op_ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Commit failed on peers");
+
+                if (args.errstr)
+                        *op_errstr = gf_strdup (args.errstr);
+        }
+
+        ret = args.op_ret;
+
+        gf_log (this->name, GF_LOG_DEBUG, "Sent commit req for %s to %d "
+                "peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+        /* The failure paths above arrive here still holding rsp_dict. */
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        /* *op_errstr received a copy; the collated original is ours. */
+        GF_FREE (args.errstr);
+
+        return ret;
+}
+
+/*
+ * Reply handler for a mgmt_v3 post-validate request.
+ *
+ * Takes the syncargs and the peer off the frame before the reply status is
+ * examined.  A failed RPC is reported as ENOTCONN, a missing reply buffer
+ * as EINVAL; otherwise op_ret/op_errno come from the decoded reply.
+ *
+ * The outcome is handed to gd_mgmt_v3_collate_errors(), the decoded error
+ * string and dict buffer are freed, the frame's stack is destroyed and the
+ * barrier on @args is woken.
+ *
+ * @count is not used.  Always returns 0.
+ */
+int32_t
+gd_mgmt_v3_post_validate_cbk_fn (struct rpc_req *req, struct iovec *iov,
+                                 int count, void *myframe)
+{
+        gd1_mgmt_v3_post_val_rsp  rsp      = {{0},};
+        call_frame_t             *frame    = myframe;
+        struct syncargs          *args     = NULL;
+        glusterd_peerinfo_t      *peerinfo = NULL;
+        int32_t                   op_ret   = -1;
+        int32_t                   op_errno = -1;
+        xlator_t                 *this     = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (myframe);
+
+        args = frame->local;
+        peerinfo = frame->cookie;
+        frame->local = NULL;
+        frame->cookie = NULL;
+
+        if (-1 == req->rpc_status) {
+                op_errno = ENOTCONN;
+        } else if (!iov) {
+                gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+                op_errno = EINVAL;
+        } else if (xdr_to_generic (*iov, &rsp,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp) >= 0) {
+                uuid_copy (args->uuid, rsp.uuid);
+                op_ret = rsp.op_ret;
+                op_errno = rsp.op_errno;
+        }
+
+        gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+                                   GLUSTERD_MGMT_V3_POST_VALIDATE,
+                                   peerinfo, rsp.uuid);
+
+        free (rsp.op_errstr);
+        free (rsp.dict.dict_val);
+
+        STACK_DESTROY (frame->root);
+        synctask_barrier_wake(args);
+        return 0;
+}
+
+/* RPC reply callback for post-validate requests: forwards its arguments
+ * together with gd_mgmt_v3_post_validate_cbk_fn to
+ * glusterd_big_locked_cbk() and returns that call's result. */
+int32_t
+gd_mgmt_v3_post_validate_cbk (struct rpc_req *req, struct iovec *iov,
+                              int count, void *myframe)
+{
+        int32_t ret = glusterd_big_locked_cbk (req, iov, count, myframe,
+                                               gd_mgmt_v3_post_validate_cbk_fn);
+
+        return ret;
+}
+
+/*
+ * Send a mgmt_v3 post-validate request to one peer.
+ *
+ * @op_ctx is serialised into the request, which also carries @my_uuid,
+ * @op and @op_ret.  The request is submitted on peerinfo->rpc with
+ * gd_mgmt_v3_post_validate_cbk as the reply callback and @args/@peerinfo
+ * as the values passed through to it.  conf->big_lock is released around
+ * the submit call and taken again afterwards.  @recv_uuid is not used.
+ *
+ * Returns the serialisation error if serialising failed, otherwise the
+ * return value of gd_syncop_submit_request().
+ */
+int
+gd_mgmt_v3_post_validate_req (glusterd_op_t op, int32_t op_ret, dict_t *op_ctx,
+                              glusterd_peerinfo_t *peerinfo,
+                              struct syncargs *args, uuid_t my_uuid,
+                              uuid_t recv_uuid)
+{
+        gd1_mgmt_v3_post_val_req  req  = {{0},};
+        xlator_t                 *this = THIS;
+        glusterd_conf_t          *conf = this->private;
+        int32_t                   ret  = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (op_ctx);
+        GF_ASSERT (peerinfo);
+        GF_ASSERT (args);
+
+        ret = dict_allocate_and_serialize (op_ctx, &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (!ret) {
+                uuid_copy (req.uuid, my_uuid);
+                req.op = op;
+                req.op_ret = op_ret;
+
+                synclock_unlock (&conf->big_lock);
+                ret = gd_syncop_submit_request (peerinfo->rpc, &req, args,
+                                                peerinfo, &gd_mgmt_v3_prog,
+                                                GLUSTERD_MGMT_V3_POST_VALIDATE,
+                                                gd_mgmt_v3_post_validate_cbk,
+                                                (xdrproc_t) xdr_gd1_mgmt_v3_post_val_req);
+                synclock_lock (&conf->big_lock);
+        }
+
+        GF_FREE (req.dict.dict_val);
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Post-validation phase of a mgmt_v3 transaction.
+ *
+ * Copies @dict into @req_dict, runs gd_mgmt_v3_post_validate_fn() on the
+ * local node with @op_ret and then, when @npeers is non-zero, sends a
+ * post-validate request to every peer on conf->xaction_peers and waits for
+ * all replies.
+ *
+ * When the local step fails without producing a message, *@op_errstr is
+ * set to a generic one.  When a peer fails, *@op_errstr is set to a copy
+ * of the errors collated from the peers.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_mgmt_v3_post_validate (glusterd_conf_t *conf, glusterd_op_t op,
+                                int32_t op_ret, dict_t *dict, dict_t *req_dict,
+                                char **op_errstr, int npeers)
+{
+        int32_t              ret       = -1;
+        int32_t              peer_cnt  = 0;
+        dict_t              *rsp_dict  = NULL;
+        glusterd_peerinfo_t *peerinfo  = NULL;
+        struct syncargs      args      = {0};
+        struct list_head    *peers     = NULL;
+        uuid_t               peer_uuid = {0};
+        xlator_t            *this      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (conf);
+        GF_ASSERT (dict);
+        GF_ASSERT (req_dict);
+        GF_ASSERT (op_errstr);
+
+        peers = &conf->xaction_peers;
+        GF_ASSERT (peers);
+
+        rsp_dict = dict_new ();
+        if (!rsp_dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to create response dictionary");
+                goto out;
+        }
+
+        /* Copy the contents of dict like missed snaps info to req_dict */
+        dict_copy (dict, req_dict);
+
+        /* Post Validation on local node */
+        ret = gd_mgmt_v3_post_validate_fn (op, op_ret, req_dict, op_errstr,
+                                           rsp_dict);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Post Validation failed for "
+                        "operation %s on local node",
+                        gd_op_list[op]);
+
+                if (*op_errstr == NULL) {
+                        ret = gf_asprintf (op_errstr,
+                                           "Post-validation failed "
+                                           "on localhost. Please check "
+                                           "log file for details");
+                        if (ret == -1)
+                                *op_errstr = NULL;
+
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        dict_unref (rsp_dict);
+        rsp_dict = NULL;
+
+        if (!npeers) {
+                ret = 0;
+                goto out;
+        }
+
+        /* Sending Post Validation req to other nodes in the cluster */
+        gd_syncargs_init (&args, req_dict);
+        synctask_barrier_init((&args));
+        peer_cnt = 0;
+        list_for_each_entry (peerinfo, peers, op_peers_list) {
+                gd_mgmt_v3_post_validate_req (op, op_ret, req_dict, peerinfo,
+                                              &args, MY_UUID, peer_uuid);
+                peer_cnt++;
+        }
+        gd_synctask_barrier_wait((&args), peer_cnt);
+
+        if (args.op_ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Post Validation failed on peers");
+
+                if (args.errstr)
+                        *op_errstr = gf_strdup (args.errstr);
+        }
+
+        ret = args.op_ret;
+
+        gf_log (this->name, GF_LOG_DEBUG, "Sent post validation req for %s "
+                "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+        /* The local-failure path arrives here still holding rsp_dict. */
+        if (rsp_dict)
+                dict_unref (rsp_dict);
+
+        /* *op_errstr received a copy; the collated original is ours. */
+        GF_FREE (args.errstr);
+
+        return ret;
+}
+
+/*
+ * Reply handler for a mgmt_v3 unlock request.
+ *
+ * Takes the syncargs and the peer off the frame before the reply status is
+ * examined.  A failed RPC is reported as ENOTCONN, a missing reply buffer
+ * as EINVAL; otherwise op_ret/op_errno come from the decoded reply.
+ *
+ * The outcome is handed to gd_mgmt_v3_collate_errors(), the decoded dict
+ * buffer is freed, the frame's stack is destroyed and the barrier on @args
+ * is woken.
+ *
+ * @count is not used.  Always returns 0.
+ */
+int32_t
+gd_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+                          int count, void *myframe)
+{
+        gd1_mgmt_v3_unlock_rsp  rsp      = {{0},};
+        call_frame_t           *frame    = myframe;
+        struct syncargs        *args     = NULL;
+        glusterd_peerinfo_t    *peerinfo = NULL;
+        int32_t                 op_ret   = -1;
+        int32_t                 op_errno = -1;
+        xlator_t               *this     = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (myframe);
+
+        args = frame->local;
+        peerinfo = frame->cookie;
+        frame->local = NULL;
+        frame->cookie = NULL;
+
+        if (-1 == req->rpc_status) {
+                op_errno = ENOTCONN;
+        } else if (!iov) {
+                gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+                op_errno = EINVAL;
+        } else if (xdr_to_generic (*iov, &rsp,
+                                   (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp) >= 0) {
+                uuid_copy (args->uuid, rsp.uuid);
+                op_ret = rsp.op_ret;
+                op_errno = rsp.op_errno;
+        }
+
+        gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+                                   GLUSTERD_MGMT_V3_UNLOCK,
+                                   peerinfo, rsp.uuid);
+        free (rsp.dict.dict_val);
+        STACK_DESTROY (frame->root);
+        synctask_barrier_wake(args);
+        return 0;
+}
+
+/* RPC reply callback for unlock requests: forwards its arguments together
+ * with gd_mgmt_v3_unlock_cbk_fn to glusterd_big_locked_cbk() and returns
+ * that call's result. */
+int32_t
+gd_mgmt_v3_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+                       int count, void *myframe)
+{
+        int32_t ret = glusterd_big_locked_cbk (req, iov, count, myframe,
+                                               gd_mgmt_v3_unlock_cbk_fn);
+
+        return ret;
+}
+
+/*
+ * Send a mgmt_v3 unlock request to one peer.
+ *
+ * @op_ctx is serialised into the request, which also carries @my_uuid and
+ * @op.  The request is submitted on peerinfo->rpc with
+ * gd_mgmt_v3_unlock_cbk as the reply callback and @args/@peerinfo as the
+ * values passed through to it.  conf->big_lock is released around the
+ * submit call and taken again afterwards.  @recv_uuid is not used.
+ *
+ * Returns the serialisation error if serialising failed, otherwise the
+ * return value of gd_syncop_submit_request().
+ */
+int
+gd_mgmt_v3_unlock (glusterd_op_t op, dict_t *op_ctx,
+                   glusterd_peerinfo_t *peerinfo,
+                   struct syncargs *args, uuid_t my_uuid,
+                   uuid_t recv_uuid)
+{
+        gd1_mgmt_v3_unlock_req  req  = {{0},};
+        xlator_t               *this = THIS;
+        glusterd_conf_t        *conf = this->private;
+        int32_t                 ret  = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (op_ctx);
+        GF_ASSERT (peerinfo);
+        GF_ASSERT (args);
+
+        ret = dict_allocate_and_serialize (op_ctx, &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (!ret) {
+                uuid_copy (req.uuid, my_uuid);
+                req.op = op;
+
+                synclock_unlock (&conf->big_lock);
+                ret = gd_syncop_submit_request (peerinfo->rpc, &req, args,
+                                                peerinfo, &gd_mgmt_v3_prog,
+                                                GLUSTERD_MGMT_V3_UNLOCK,
+                                                gd_mgmt_v3_unlock_cbk,
+                                                (xdrproc_t) xdr_gd1_mgmt_v3_unlock_req);
+                synclock_lock (&conf->big_lock);
+        }
+
+        GF_FREE (req.dict.dict_val);
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Send mgmt_v3 unlock requests to every peer on conf->xaction_peers and
+ * wait for all of them to complete.
+ *
+ * Nothing is sent when the lock was not acquired in this transaction
+ * (returns -1) or when there are no peers (returns 0). Otherwise the
+ * collated op_ret of the unlock requests is returned. When the unlock
+ * failed, the caller's op_ret is 0 and an error string was collated, a
+ * copy of that string is stored in *op_errstr.
+ */
+int
+glusterd_mgmt_v3_release_peer_locks (glusterd_conf_t *conf, glusterd_op_t op,
+                                     dict_t *dict, int32_t op_ret,
+                                     char **op_errstr, int npeers,
+                                     gf_boolean_t is_acquired)
+{
+        struct syncargs          args      = {0};
+        uuid_t                   peer_uuid = {0};
+        struct list_head        *peers     = NULL;
+        glusterd_peerinfo_t     *peerinfo  = NULL;
+        xlator_t                *this      = NULL;
+        int32_t                  sent      = 0;
+        int32_t                  ret       = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (conf);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+
+        peers = &conf->xaction_peers;
+
+        /* No lock was taken during this transaction: nothing to release */
+        if (!is_acquired)
+                return ret;
+
+        if (!npeers)
+                return 0;
+
+        /* Fan the unlock request out to the other nodes in the cluster */
+        gd_syncargs_init (&args, NULL);
+        synctask_barrier_init((&args));
+        list_for_each_entry (peerinfo, peers, op_peers_list) {
+                gd_mgmt_v3_unlock (op, dict, peerinfo, &args,
+                                   MY_UUID, peer_uuid);
+                sent++;
+        }
+        gd_synctask_barrier_wait((&args), sent);
+
+        ret = args.op_ret;
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unlock failed on peers");
+
+                if (!op_ret && args.errstr)
+                        *op_errstr = gf_strdup (args.errstr);
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG, "Sent unlock op req for %s "
+                "to %d peers. Returning %d", gd_op_list[op], sent, ret);
+
+        return ret;
+}
+
+/*
+ * Drive a complete mgmt_v3 transaction for 'op' from this node: lockdown,
+ * payload build, pre-validate, commit, post-validate, peer unlock and
+ * local unlock, followed by the response to the CLI.
+ *
+ * req  - CLI request, answered through glusterd_op_send_cli_response()
+ * op   - operation being driven
+ * dict - operation dictionary; "originator_uuid" and "is_synctasked" are
+ *        added to it here
+ *
+ * Always returns 0. The outcome of the transaction is reported to the CLI
+ * as op_ret (0 on success, non-zero on failure) together with op_errstr.
+ */
+int32_t
+glusterd_mgmt_v3_initiate_all_phases (rpcsvc_request_t *req, glusterd_op_t op,
+                                      dict_t *dict)
+{
+        int32_t                  ret             = -1;
+        int32_t                  op_ret          = -1;
+        int32_t                  npeers          = 0;
+        dict_t                  *req_dict        = NULL;
+        dict_t                  *tmp_dict        = NULL;
+        glusterd_conf_t         *conf            = NULL;
+        char                    *op_errstr       = NULL;
+        xlator_t                *this            = NULL;
+        gf_boolean_t             is_acquired     = _gf_false;
+        uuid_t                  *originator_uuid = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (dict);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        /* Save the MY_UUID as the originator_uuid. This originator_uuid
+         * will be used by is_origin_glusterd() to determine if a node
+         * is the originator node for a command. */
+        originator_uuid = GF_CALLOC (1, sizeof(uuid_t),
+                                     gf_common_mt_uuid_t);
+        if (!originator_uuid) {
+                ret = -1;
+                goto out;
+        }
+
+        uuid_copy (*originator_uuid, MY_UUID);
+        ret = dict_set_bin (dict, "originator_uuid",
+                            originator_uuid, sizeof (uuid_t));
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set originator_uuid.");
+                /* The dict did not take the buffer, so it is still ours
+                 * to release (same handling as glusterd_generate_txn_id) */
+                GF_FREE (originator_uuid);
+                originator_uuid = NULL;
+                goto out;
+        }
+
+        /* Marking the operation as complete synctasked */
+        ret = dict_set_int32 (dict, "is_synctasked", _gf_true);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set synctasked flag.");
+                goto out;
+        }
+
+        /* Use a copy at local unlock as cli response will be sent before
+         * the unlock and the volname in the dict might be removed */
+        tmp_dict = dict_new();
+        if (!tmp_dict) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to create dict");
+                /* ret is still 0 from the call above; without this the
+                 * CLI would be told that the operation succeeded */
+                ret = -1;
+                goto out;
+        }
+        dict_copy (dict, tmp_dict);
+
+        /* BUILD PEERS LIST */
+        INIT_LIST_HEAD (&conf->xaction_peers);
+        npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op);
+
+        /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */
+        ret = glusterd_mgmt_v3_initiate_lockdown (conf, op, dict, &op_errstr,
+                                                  npeers, &is_acquired);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "mgmt_v3 lockdown failed.");
+                goto out;
+        }
+
+        /* BUILD PAYLOAD */
+        ret = glusterd_mgmt_v3_build_payload (&req_dict, &op_errstr, dict, op);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+                        gd_op_list[op]);
+                if (op_errstr == NULL)
+                        gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+                goto out;
+        }
+
+        /* PRE-COMMIT VALIDATE PHASE */
+        ret = glusterd_mgmt_v3_pre_validate (conf, op, req_dict,
+                                             &op_errstr, npeers);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Pre Validation Failed");
+                goto out;
+        }
+
+        /* COMMIT OP PHASE */
+        ret = glusterd_mgmt_v3_commit (conf, op, dict, req_dict,
+                                       &op_errstr, npeers);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Commit Op Failed");
+                goto out;
+        }
+
+        /* POST-COMMIT VALIDATE PHASE */
+        /* As of now, post_validate is not handling any other
+           commands other than snapshot. So as of now, I am
+           sending 0 (op_ret as 0).
+        */
+        ret = glusterd_mgmt_v3_post_validate (conf, op, 0, dict, req_dict,
+                                              &op_errstr, npeers);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Post Validation Failed");
+                goto out;
+        }
+
+        ret = 0;
+out:
+        op_ret = ret;
+        /* UNLOCK PHASE FOR PEERS*/
+        (void) glusterd_mgmt_v3_release_peer_locks (conf, op, dict,
+                                                    op_ret, &op_errstr,
+                                                    npeers, is_acquired);
+
+        /* LOCAL VOLUME(S) UNLOCK */
+        if (is_acquired) {
+                /* Trying to release multiple mgmt_v3 locks */
+                ret = glusterd_multiple_mgmt_v3_unlock (tmp_dict, MY_UUID);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to release mgmt_v3 locks on localhost");
+                        op_ret = ret;
+                }
+        }
+
+        /* SEND CLI RESPONSE */
+        glusterd_op_send_cli_response (op, op_ret, 0, req, dict, op_errstr);
+
+        if (req_dict)
+                dict_unref (req_dict);
+
+        if (tmp_dict)
+                dict_unref (tmp_dict);
+
+        if (op_errstr) {
+                GF_FREE (op_errstr);
+                op_errstr = NULL;
+        }
+
+        return 0;
+}
+
+/*
+ * Drive a mgmt_v3 snapshot transaction for 'op' from this node: lockdown,
+ * payload build, pre-validate, brick op with "barrier" set to 1, commit,
+ * brick op with "barrier" set to 0, post-validate, peer unlock and local
+ * unlock, followed by the response to the CLI.
+ *
+ * req  - CLI request, answered through glusterd_op_send_cli_response()
+ * op   - operation being driven
+ * dict - operation dictionary; "originator_uuid" and "is_synctasked" are
+ *        added to it here
+ *
+ * 'success' is only set once the commit phase has passed; in every other
+ * case op_ret is forced to -1 before post-validate runs. When the commit
+ * fails its error string is parked in tmp_errstr so that it, and not a
+ * later unbarrier error, is what the CLI receives.
+ *
+ * Always returns 0. The outcome is reported to the CLI as op_ret together
+ * with op_errstr.
+ */
+int32_t
+glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op,
+                                       dict_t *dict)
+{
+        int32_t                  ret             = -1;
+        int32_t                  op_ret          = -1;
+        int32_t                  npeers          = 0;
+        dict_t                  *req_dict        = NULL;
+        dict_t                  *tmp_dict        = NULL;
+        glusterd_conf_t         *conf            = NULL;
+        char                    *op_errstr       = NULL;
+        xlator_t                *this            = NULL;
+        gf_boolean_t             is_acquired     = _gf_false;
+        uuid_t                  *originator_uuid = NULL;
+        gf_boolean_t             success         = _gf_false;
+        char                    *tmp_errstr      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (dict);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        /* Save the MY_UUID as the originator_uuid. This originator_uuid
+         * will be used by is_origin_glusterd() to determine if a node
+         * is the originator node for a command. */
+        originator_uuid = GF_CALLOC (1, sizeof(uuid_t),
+                                     gf_common_mt_uuid_t);
+        if (!originator_uuid) {
+                ret = -1;
+                goto out;
+        }
+
+        uuid_copy (*originator_uuid, MY_UUID);
+        ret = dict_set_bin (dict, "originator_uuid",
+                            originator_uuid, sizeof (uuid_t));
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set originator_uuid.");
+                /* The dict did not take the buffer, so it is still ours
+                 * to release (same handling as glusterd_generate_txn_id) */
+                GF_FREE (originator_uuid);
+                originator_uuid = NULL;
+                goto out;
+        }
+
+        /* Marking the operation as complete synctasked */
+        ret = dict_set_int32 (dict, "is_synctasked", _gf_true);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set synctasked flag.");
+                goto out;
+        }
+
+        /* Use a copy at local unlock as cli response will be sent before
+         * the unlock and the volname in the dict might be removed */
+        tmp_dict = dict_new();
+        if (!tmp_dict) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to create dict");
+                /* ret is still 0 from the call above; record the failure */
+                ret = -1;
+                goto out;
+        }
+        dict_copy (dict, tmp_dict);
+
+        /* BUILD PEERS LIST */
+        INIT_LIST_HEAD (&conf->xaction_peers);
+        npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op);
+
+        /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */
+        ret = glusterd_mgmt_v3_initiate_lockdown (conf, op, dict, &op_errstr,
+                                                  npeers, &is_acquired);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "mgmt_v3 lockdown failed.");
+                goto out;
+        }
+
+        /* BUILD PAYLOAD */
+        ret = glusterd_mgmt_v3_build_payload (&req_dict, &op_errstr, dict, op);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+                        gd_op_list[op]);
+                if (op_errstr == NULL)
+                        gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+                goto out;
+        }
+
+        /* PRE-COMMIT VALIDATE PHASE */
+        ret = glusterd_mgmt_v3_pre_validate (conf, op, req_dict,
+                                             &op_errstr, npeers);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Pre Validation Failed");
+                goto out;
+        }
+
+        /* BRICK OP PHASE for initiating barrier*/
+        ret = dict_set_int32 (req_dict, "barrier", 1);
+        if (ret)
+                goto out;
+        ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict,
+                                         &op_errstr, npeers);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Brick Ops Failed");
+                goto unbarrier;
+        }
+
+        /* COMMIT OP PHASE */
+        /* TODO: As of now, the plan is to do quorum check before sending the
+           commit fop and if the quorum succeeds, then commit is sent to all
+           the other glusterds.
+           snap create functionality now creates the in memory and on disk
+           objects for the snapshot (marking them as incomplete), takes the lvm
+           snapshot and then updates the status of the in memory and on disk
+           snap objects as complete. Suppose one of the glusterds goes down
+           after taking the lvm snapshot, but before updating the snap object,
+           then treat it as a snapshot create failure and trigger cleanup.
+           i.e the number of commit responses received by the originator
+           glusterd should be the same as the number of peers it has sent the
+           request to (i.e npeers variable). If not, then originator glusterd
+           will initiate cleanup in post-validate fop.
+           Question: What if one of the other glusterds goes down as explained
+           above and along with it the originator glusterd also goes down?
+           Who will initiate the cleanup?
+        */
+        ret = glusterd_mgmt_v3_commit (conf, op, dict, req_dict,
+                                       &op_errstr, npeers);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Commit Op Failed");
+                /* If the main op fails, we should save the error string.
+                   Because, op_errstr will be used for unbarrier and
+                   unlock ops also. We might lose the actual error that
+                   caused the failure.
+                */
+                tmp_errstr = op_errstr;
+                op_errstr = NULL;
+                goto unbarrier;
+        }
+
+        success = _gf_true;
+unbarrier:
+        /* BRICK OP PHASE for removing the barrier*/
+        ret = dict_set_int32 (req_dict, "barrier", 0);
+        if (ret)
+                goto out;
+        ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict,
+                                         &op_errstr, npeers);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Brick Ops Failed");
+                goto out;
+        }
+
+        ret = 0;
+
+out:
+        op_ret = ret;
+
+        if (success == _gf_false)
+                op_ret = -1;
+
+        /* POST-COMMIT VALIDATE PHASE */
+        ret = glusterd_mgmt_v3_post_validate (conf, op, op_ret, dict, req_dict,
+                                              &op_errstr, npeers);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Post Validation Failed");
+                op_ret = -1;
+        }
+
+        /* UNLOCK PHASE FOR PEERS*/
+        (void) glusterd_mgmt_v3_release_peer_locks (conf, op, dict,
+                                                    op_ret, &op_errstr,
+                                                    npeers, is_acquired);
+
+        /* If the commit op (snapshot taking) failed, then the error is stored
+           in tmp_errstr and unbarrier is called. Suppose, if unbarrier also
+           fails, then the error happened in unbarrier is logged and freed.
+           The error happened in commit op, which is stored in tmp_errstr
+           is sent to cli.
+        */
+        if (tmp_errstr) {
+                if (op_errstr) {
+                        /* The two literals are concatenated by the compiler,
+                         * so the separating space has to be explicit */
+                        gf_log (this->name, GF_LOG_ERROR, "unbarrier brick op "
+                                "failed with the error %s", op_errstr);
+                        GF_FREE (op_errstr);
+                        op_errstr = NULL;
+                }
+                op_errstr = tmp_errstr;
+        }
+
+        /* LOCAL VOLUME(S) UNLOCK */
+        if (is_acquired) {
+                /* Trying to release multiple mgmt_v3 locks */
+                ret = glusterd_multiple_mgmt_v3_unlock (tmp_dict, MY_UUID);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to release mgmt_v3 locks on localhost");
+                        op_ret = ret;
+                }
+        }
+
+        /* SEND CLI RESPONSE */
+        glusterd_op_send_cli_response (op, op_ret, 0, req, dict, op_errstr);
+
+        if (req_dict)
+                dict_unref (req_dict);
+
+        if (tmp_dict)
+                dict_unref (tmp_dict);
+
+        if (op_errstr) {
+                GF_FREE (op_errstr);
+                op_errstr = NULL;
+        }
+
+        return 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-mgmt.h
new file mode 100644
index 000000000..b185a9bec
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.h
@@ -0,0 +1,45 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_MGMT_H_
+#define _GLUSTERD_MGMT_H_
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+int32_t
+gd_mgmt_v3_pre_validate_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_brick_op_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_commit_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+glusterd_mgmt_v3_initiate_all_phases (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict);
+
+int32_t
+glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict);
+
+int
+glusterd_snap_pre_validate_use_rsp_dict (dict_t *dst, dict_t *src);
+
+#endif /* _GLUSTERD_MGMT_H_ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-mountbroker.c b/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
index 0d67d1303..4ce441da8 100644
--- a/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
+++ b/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
@@ -231,7 +231,6 @@ parse_mount_pattern_desc (gf_mount_spec_t *mspec, char *pdesc)
const char *georep_mnt_desc_template =
"SUP("
- "xlator-option=\\*-dht.assert-no-child-down=true "
"volfile-server=localhost "
"client-pid=%d "
"user-map-root=%s "
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index dbc23525f..9b130b4c6 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -37,6 +37,7 @@
#include "glusterd-store.h"
#include "glusterd-hooks.h"
#include "glusterd-volgen.h"
+#include "glusterd-locks.h"
#include "syscall.h"
#include "cli1-xdr.h"
#include "common-utils.h"
@@ -67,6 +68,253 @@
static struct list_head gd_op_sm_queue;
pthread_mutex_t gd_op_sm_lock;
glusterd_op_info_t opinfo = {{0},};
+
+/*
+ * Create the dictionary stored in priv->glusterd_txn_opinfo and zero
+ * priv->global_txn_id.
+ *
+ * Returns 0 on success, -1 when the dictionary cannot be created.
+ */
+int32_t
+glusterd_txn_opinfo_dict_init ()
+{
+        xlator_t        *this = THIS;
+        glusterd_conf_t *priv = NULL;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        priv->glusterd_txn_opinfo = dict_new ();
+        if (priv->glusterd_txn_opinfo == NULL)
+                return -1;
+
+        memset (priv->global_txn_id, '\0', sizeof(uuid_t));
+
+        return 0;
+}
+
+/*
+ * Drop the reference held on priv->glusterd_txn_opinfo, if the dictionary
+ * exists.
+ */
+void
+glusterd_txn_opinfo_dict_fini ()
+{
+        xlator_t        *this = THIS;
+        glusterd_conf_t *priv = NULL;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (priv->glusterd_txn_opinfo == NULL)
+                return;
+
+        dict_unref (priv->glusterd_txn_opinfo);
+}
+
+/*
+ * Fill in an opinfo from the optional pieces supplied by the caller.
+ *
+ * state, op and req are copied only when they are non-NULL; the matching
+ * opinfo fields are left untouched otherwise. op_ctx is always written:
+ * a new reference to the dictionary when one is given, NULL when not.
+ */
+void
+glusterd_txn_opinfo_init (glusterd_op_info_t *opinfo,
+                          glusterd_op_sm_state_info_t *state,
+                          glusterd_op_t *op,
+                          dict_t *op_ctx,
+                          rpcsvc_request_t *req)
+{
+        GF_ASSERT (opinfo);
+
+        if (state != NULL)
+                opinfo->state = *state;
+
+        if (op != NULL)
+                opinfo->op = *op;
+
+        opinfo->op_ctx = (op_ctx != NULL) ? dict_ref(op_ctx) : NULL;
+
+        if (req != NULL)
+                opinfo->req = req;
+}
+
+/*
+ * Allocate a transaction id, store it in 'dict' under "transaction_id"
+ * and hand it back through *txn_id.
+ *
+ * When priv->op_version is below GD_OP_VERSION_4 the id is a copy of
+ * priv->global_txn_id, otherwise a freshly generated uuid.
+ *
+ * Returns 0 on success. On failure a non-zero value is returned, any
+ * allocated id is freed and *txn_id is left NULL.
+ */
+int32_t
+glusterd_generate_txn_id (dict_t *dict, uuid_t **txn_id)
+{
+        xlator_t        *this = THIS;
+        glusterd_conf_t *priv = NULL;
+        int32_t          rc   = -1;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+        GF_ASSERT (dict);
+
+        *txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+        if (*txn_id == NULL)
+                return rc;
+
+        if (priv->op_version >= GD_OP_VERSION_4)
+                uuid_generate (**txn_id);
+        else
+                uuid_copy (**txn_id, priv->global_txn_id);
+
+        rc = dict_set_bin (dict, "transaction_id",
+                           *txn_id, sizeof (**txn_id));
+        if (rc) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Failed to set transaction id.");
+                /* The id was not stored in the dict: release it here */
+                GF_FREE (*txn_id);
+                *txn_id = NULL;
+                return rc;
+        }
+
+        gf_log ("", GF_LOG_DEBUG,
+                "Transaction_id = %s", uuid_utoa (**txn_id));
+
+        return rc;
+}
+
+/*
+ * Look up the opinfo stored for a transaction id and copy it to *opinfo.
+ *
+ * The lookup key in priv->glusterd_txn_opinfo is the string form of
+ * *txn_id. Returns 0 on success, -1 when either pointer is NULL, and the
+ * dict_get_bin() result when the id has no entry.
+ */
+int32_t
+glusterd_get_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo)
+{
+        glusterd_txn_opinfo_obj *stored = NULL;
+        xlator_t                *this   = THIS;
+        glusterd_conf_t         *priv   = NULL;
+        int32_t                  rc     = -1;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (txn_id == NULL || opinfo == NULL) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Empty transaction id or opinfo received.");
+                rc = -1;
+        } else {
+                rc = dict_get_bin(priv->glusterd_txn_opinfo,
+                                  uuid_utoa (*txn_id),
+                                  (void **) &stored);
+                if (rc) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Unable to get transaction opinfo "
+                                "for transaction ID : %s",
+                                uuid_utoa (*txn_id));
+                } else {
+                        (*opinfo) = stored->opinfo;
+
+                        gf_log ("", GF_LOG_DEBUG,
+                                "Successfully got opinfo for transaction ID : %s",
+                                uuid_utoa (*txn_id));
+                }
+        }
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", rc);
+        return rc;
+}
+
+/*
+ * Store *opinfo as the opinfo of a transaction id.
+ *
+ * The entry in priv->glusterd_txn_opinfo is keyed by the string form of
+ * *txn_id. When the id has no entry yet, a new object is allocated and
+ * added to the dictionary; an existing entry is overwritten in place.
+ *
+ * Returns 0 on success and non-zero on failure: a NULL txn_id or opinfo,
+ * an allocation failure, or a failure to add the new entry. An object
+ * allocated here that could not be added is freed before returning.
+ */
+int32_t
+glusterd_set_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo)
+{
+        int32_t                  ret        = -1;
+        glusterd_txn_opinfo_obj *opinfo_obj = NULL;
+        glusterd_conf_t         *priv       = NULL;
+        xlator_t                *this       = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (!txn_id) {
+                gf_log ("", GF_LOG_ERROR, "Empty transaction id received.");
+                ret = -1;
+                goto out;
+        }
+
+        /* *opinfo is copied below; reject NULL up front, the same way
+         * glusterd_get_txn_opinfo() validates its arguments */
+        if (!opinfo) {
+                gf_log ("", GF_LOG_ERROR, "Empty opinfo received.");
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_bin(priv->glusterd_txn_opinfo,
+                           uuid_utoa (*txn_id),
+                           (void **) &opinfo_obj);
+        if (ret) {
+                /* First opinfo for this transaction: create its entry */
+                opinfo_obj = GF_CALLOC (1, sizeof(glusterd_txn_opinfo_obj),
+                                        gf_common_mt_txn_opinfo_obj_t);
+                if (!opinfo_obj) {
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_set_bin(priv->glusterd_txn_opinfo,
+                                   uuid_utoa (*txn_id), opinfo_obj,
+                                   sizeof(glusterd_txn_opinfo_obj));
+                if (ret) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Unable to set opinfo for transaction ID : %s",
+                                uuid_utoa (*txn_id));
+                        goto out;
+                }
+        }
+
+        opinfo_obj->opinfo = (*opinfo);
+
+        gf_log ("", GF_LOG_DEBUG,
+                "Successfully set opinfo for transaction ID : %s",
+                uuid_utoa (*txn_id));
+        ret = 0;
+out:
+        /* A failure with a non-NULL object can only be an object that was
+         * allocated above and never made it into the dictionary */
+        if (ret)
+                if (opinfo_obj)
+                        GF_FREE (opinfo_obj);
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Remove the opinfo stored for a transaction id.
+ *
+ * The stored opinfo is fetched first; its op_ctx reference, when present,
+ * is dropped and the entry is then deleted from
+ * priv->glusterd_txn_opinfo.
+ *
+ * Returns 0 on success, -1 for a NULL txn_id, and the lookup result when
+ * the id has no stored opinfo.
+ */
+int32_t
+glusterd_clear_txn_opinfo (uuid_t *txn_id)
+{
+        glusterd_op_info_t   cached = {{0},};
+        xlator_t            *this   = THIS;
+        glusterd_conf_t     *priv   = NULL;
+        int32_t              rc     = -1;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (txn_id == NULL) {
+                gf_log ("", GF_LOG_ERROR, "Empty transaction id received.");
+                rc = -1;
+        } else {
+                rc = glusterd_get_txn_opinfo (txn_id, &cached);
+                if (rc) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Transaction opinfo not found");
+                } else {
+                        if (cached.op_ctx)
+                                dict_unref (cached.op_ctx);
+
+                        dict_del(priv->glusterd_txn_opinfo,
+                                 uuid_utoa (*txn_id));
+
+                        gf_log ("", GF_LOG_DEBUG,
+                                "Successfully cleared opinfo for transaction ID : %s",
+                                uuid_utoa (*txn_id));
+                }
+        }
+
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", rc);
+        return rc;
+}
+
static int glusterfs_port = GLUSTERD_DEFAULT_PORT;
static char *glusterd_op_sm_state_names[] = {
"Default",
@@ -147,14 +395,48 @@ glusterd_is_volume_started (glusterd_volinfo_t *volinfo)
}
static int
-glusterd_op_sm_inject_all_acc ()
+glusterd_op_sm_inject_all_acc (uuid_t *txn_id)
{
int32_t ret = -1;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, txn_id, NULL);
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+/*
+ * Reject attempts to switch quota on or off through 'volume set'.
+ *
+ * Keys other than "quota" and "features.quota" are accepted (returns 0).
+ * For those two keys the value is parsed as a boolean: a parse failure
+ * returns the parser's result with errstr untouched, otherwise errstr
+ * receives a deprecation message pointing at the matching
+ * 'volume quota ... enable|disable' command and -1 is returned.
+ */
+static int
+glusterd_check_quota_cmd (char *key, char *value, char *errstr, size_t size)
+{
+        gf_boolean_t    enable = _gf_false;
+        int             rc     = -1;
+
+        if ((strcmp (key, "quota") != 0) &&
+            (strcmp (key, "features.quota") != 0))
+                return 0;
+
+        rc = gf_string2boolean (value, &enable);
+        if (rc)
+                return rc;
+
+        if (enable)
+                snprintf (errstr, size," 'gluster "
+                          "volume set <VOLNAME> %s %s' is "
+                          "deprecated. Use 'gluster volume "
+                          "quota <VOLNAME> enable' instead.",
+                          key, value);
+        else
+                snprintf (errstr, size, " 'gluster "
+                          "volume set <VOLNAME> %s %s' is "
+                          "deprecated. Use 'gluster volume "
+                          "quota <VOLNAME> disable' instead.",
+                          key, value);
+
+        return -1;
+}
+
int
glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickinfo,
gd1_mgmt_brick_op_req **req, dict_t *dict)
@@ -235,20 +517,20 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
brick_req->name = gf_strdup (name);
break;
-
-#ifdef HAVE_BD_XLATOR
- case GD_OP_BD_OP:
- {
+ case GD_OP_SNAP:
brick_req = GF_CALLOC (1, sizeof (*brick_req),
gf_gld_mt_mop_brick_req_t);
if (!brick_req)
goto out;
- brick_req->op = GLUSTERD_BRICK_BD_OP;
- brick_req->name = "";
- }
+ brick_req->op = GLUSTERD_VOLUME_BARRIER_OP;
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ snprintf (name, 1024, "%s-server",volname);
+ brick_req->name = gf_strdup (name);
+
break;
-#endif
default:
goto out;
break;
@@ -333,6 +615,10 @@ glusterd_validate_quorum_options (xlator_t *this, char *fullkey, char *value,
if (!glusterd_is_quorum_option (fullkey))
goto out;
key = strchr (fullkey, '.');
+ if (key == NULL) {
+ ret = -1;
+ goto out;
+ }
key++;
opt = xlator_volume_option_get (this, key);
ret = xlator_option_validate (this, key, value, opt, op_errstr);
@@ -420,7 +706,7 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
* This check is not done on the originator glusterd. The originator
* glusterd sets this value.
*/
- origin_glusterd = is_origin_glusterd ();
+ origin_glusterd = is_origin_glusterd (dict);
if (!origin_glusterd) {
/* Check for v3.3.x origin glusterd */
@@ -557,6 +843,10 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
}
}
+ ret = glusterd_check_quota_cmd (key, value, errstr, sizeof (errstr));
+ if (ret)
+ goto out;
+
if (is_key_glusterd_hooks_friendly (key))
continue;
@@ -730,7 +1020,7 @@ glusterd_op_stage_reset_volume (dict_t *dict, char **op_errstr)
{
int ret = 0;
char *volname = NULL;
- gf_boolean_t exists = _gf_false;
+ int exists = 0;
char msg[2048] = {0};
char *key = NULL;
char *key_fixed = NULL;
@@ -778,6 +1068,7 @@ glusterd_op_stage_reset_volume (dict_t *dict, char **op_errstr)
ret = -1;
goto out;
}
+
if (!exists) {
ret = snprintf (msg, sizeof (msg),
"Option %s does not exist", key);
@@ -828,7 +1119,7 @@ glusterd_op_stage_sync_volume (dict_t *dict, char **op_errstr)
goto out;
}
- if (glusterd_is_local_addr (hostname)) {
+ if (gf_is_local_addr (hostname)) {
//volname is not present in case of sync all
ret = dict_get_str (dict, "volname", &volname);
if (!ret) {
@@ -901,18 +1192,24 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
if (cmd & GF_CLI_STATUS_ALL)
goto out;
+ if ((cmd & GF_CLI_STATUS_QUOTAD) &&
+ (priv->op_version == GD_OP_VERSION_MIN)) {
+ snprintf (msg, sizeof (msg), "The cluster is operating at "
+ "version 1. Getting the status of quotad is not "
+ "allowed in this state.");
+ ret = -1;
+ goto out;
+ }
+
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- snprintf (msg, sizeof(msg), "Volume %s does not exist",
- volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
+ snprintf (msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
ret = -1;
goto out;
}
@@ -925,7 +1222,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
if (!ret) {
snprintf (msg, sizeof (msg), "Volume %s is not started",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
ret = -1;
goto out;
}
@@ -940,7 +1236,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg),
"NFS server is disabled for volume %s",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
} else if ((cmd & GF_CLI_STATUS_SHD) != 0) {
@@ -949,7 +1244,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg),
"Volume %s is not of type replicate",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
@@ -961,10 +1255,15 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg),
"Self-heal Daemon is disabled for volume %s",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
-
+ } else if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+ if (!glusterd_is_volume_quota_enabled (volinfo)) {
+ ret = -1;
+ snprintf (msg, sizeof (msg), "Volume %s does not have "
+ "quota enabled", volname);
+ goto out;
+ }
} else if ((cmd & GF_CLI_STATUS_BRICK) != 0) {
ret = dict_get_str (dict, "brick", &brick);
if (ret)
@@ -975,8 +1274,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
if (ret) {
snprintf (msg, sizeof(msg), "No brick %s in"
" volume %s", brick, volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
-
ret = -1;
goto out;
}
@@ -992,7 +1289,7 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
*op_errstr = gf_strdup ("Validation Failed for Status");
}
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning: %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning: %d", ret);
return ret;
}
@@ -1101,14 +1398,17 @@ _delete_reconfig_opt (dict_t *this, char *key, data_t *value, void *data)
GF_ASSERT (data);
is_force = (int32_t*)data;
- if (*is_force != 1 &&
- (_gf_true == glusterd_check_voloption_flags (key,
- OPT_FLAG_FORCE))) {
+ if (*is_force != 1) {
+ if (_gf_true == glusterd_check_voloption_flags (key,
+ OPT_FLAG_FORCE)) {
/* indicate to caller that we don't set the option
* due to being protected
*/
- *is_force = -1;
- goto out;
+ *is_force = *is_force | GD_OP_PROTECTED;
+ goto out;
+ } else {
+ *is_force = *is_force | GD_OP_UNPROTECTED;
+ }
}
gf_log ("", GF_LOG_DEBUG, "deleting dict with key=%s,value=%s",
@@ -1160,8 +1460,9 @@ glusterd_options_reset (glusterd_volinfo_t *volinfo, char *key,
_delete_reconfig_opt (volinfo->dict, key, value, is_force);
}
- ret = glusterd_create_volfiles_and_notify_services (volinfo);
+ gd_update_volume_op_versions (volinfo);
+ ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Unable to create volfile for"
" 'volume reset'");
@@ -1276,7 +1577,7 @@ out:
}
static int
-glusterd_op_reset_volume (dict_t *dict, char **op_errstr)
+glusterd_op_reset_volume (dict_t *dict, char **op_rspstr)
{
glusterd_volinfo_t *volinfo = NULL;
int ret = -1;
@@ -1331,14 +1632,20 @@ glusterd_op_reset_volume (dict_t *dict, char **op_errstr)
quorum_action = _gf_true;
ret = glusterd_options_reset (volinfo, key, &is_force);
- if (is_force == -1) {
- ret = -1;
- gf_asprintf(op_errstr, "'%s' is protected. To reset use 'force'.",
- key);
+ if (ret == -1) {
+ gf_asprintf(op_rspstr, "Volume reset : failed");
+ } else if (is_force & GD_OP_PROTECTED) {
+ if (is_force & GD_OP_UNPROTECTED) {
+ gf_asprintf (op_rspstr, "All unprotected fields were"
+ " reset. To reset the protected fields,"
+ " use 'force'.");
+ } else {
+ ret = -1;
+ gf_asprintf (op_rspstr, "'%s' is protected. To reset"
+ " use 'force'.", key);
+ }
}
- gd_update_volume_op_versions (volinfo);
-
out:
GF_FREE (key_fixed);
if (quorum_action)
@@ -1366,14 +1673,25 @@ glusterd_stop_bricks (glusterd_volinfo_t *volinfo)
int
glusterd_start_bricks (glusterd_volinfo_t *volinfo)
{
- glusterd_brickinfo_t *brickinfo = NULL;
+ int ret = -1;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- if (glusterd_brick_start (volinfo, brickinfo, _gf_false))
- return -1;
+ ret = glusterd_brick_start (volinfo, brickinfo, _gf_false);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Failed to start %s:%s for %s",
+ brickinfo->hostname, brickinfo->path,
+ volinfo->volname);
+ goto out;
+ }
}
- return 0;
+ ret = 0;
+out:
+ return ret;
}
static int
@@ -1428,10 +1746,6 @@ glusterd_op_set_all_volume_options (xlator_t *this, dict_t *dict)
if (ret)
goto out;
- dup_value = gf_strdup (value);
- if (!dup_value)
- goto out;
-
ret = glusterd_store_options (this, dup_opt);
if (ret)
goto out;
@@ -1446,10 +1760,18 @@ glusterd_op_set_all_volume_options (xlator_t *this, dict_t *dict)
else
next_version = NULL;
+ dup_value = gf_strdup (value);
+ if (!dup_value)
+ goto out;
+
ret = dict_set_dynstr (conf->opts, key, dup_value);
if (ret)
goto out;
+ else
+ dup_value = NULL; /* Protect the allocation from GF_FREE */
+
out:
+ GF_FREE (dup_value);
GF_FREE (key_fixed);
if (dup_opt)
dict_unref (dup_opt);
@@ -1476,6 +1798,7 @@ glusterd_op_set_volume (dict_t *dict)
char str[50] = {0, };
char *op_errstr = NULL;
gf_boolean_t global_opt = _gf_false;
+ gf_boolean_t global_opts_set = _gf_false;
glusterd_volinfo_t *voliter = NULL;
int32_t dict_count = 0;
gf_boolean_t check_op_version = _gf_false;
@@ -1497,10 +1820,12 @@ glusterd_op_set_volume (dict_t *dict)
if (dict_count == 0) {
ret = glusterd_volset_help (NULL, &op_errstr);
if (ret) {
- op_errstr = (op_errstr)? op_errstr:
- "Volume set help internal error";
- gf_log (this->name, GF_LOG_ERROR, "%s", op_errstr);
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ (op_errstr)? op_errstr:
+ "Volume set help internal error");
}
+
+ GF_FREE(op_errstr);
goto out;
}
@@ -1537,7 +1862,6 @@ glusterd_op_set_volume (dict_t *dict)
for (count = 1; ret != -1 ; count++) {
- global_opt = _gf_false;
sprintf (str, "key%d", count);
ret = dict_get_str (dict, str, &key);
if (ret)
@@ -1585,8 +1909,11 @@ glusterd_op_set_volume (dict_t *dict)
}
}
- if (glusterd_check_globaloption (key))
+ global_opt = _gf_false;
+ if (glusterd_check_globaloption (key)) {
global_opt = _gf_true;
+ global_opts_set = _gf_true;
+ }
if (!global_opt)
value = gf_strdup (value);
@@ -1629,7 +1956,21 @@ glusterd_op_set_volume (dict_t *dict)
goto out;
}
- if (!global_opt) {
+ /* Update the cluster op-version before regenerating volfiles so that
+ * correct volfiles are generated
+ */
+ if (new_op_version > priv->op_version) {
+ priv->op_version = new_op_version;
+ ret = glusterd_store_global_info (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store op-version");
+ goto out;
+ }
+ }
+
+ if (!global_opts_set) {
+ gd_update_volume_op_versions (volinfo);
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1651,11 +1992,11 @@ glusterd_op_set_volume (dict_t *dict)
goto out;
}
}
- gd_update_volume_op_versions (volinfo);
} else {
list_for_each_entry (voliter, &priv->volumes, vol_list) {
volinfo = voliter;
+ gd_update_volume_op_versions (volinfo);
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1678,17 +2019,6 @@ glusterd_op_set_volume (dict_t *dict)
goto out;
}
}
- gd_update_volume_op_versions (volinfo);
- }
- }
-
- if (new_op_version > priv->op_version) {
- priv->op_version = new_op_version;
- ret = glusterd_store_global_info (this);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to store op-version");
- goto out;
}
}
@@ -1728,7 +2058,7 @@ glusterd_op_sync_volume (dict_t *dict, char **op_errstr,
goto out;
}
- if (!glusterd_is_local_addr (hostname)) {
+ if (!gf_is_local_addr (hostname)) {
ret = 0;
goto out;
}
@@ -1752,12 +2082,12 @@ glusterd_op_sync_volume (dict_t *dict, char **op_errstr,
if (volname) {
ret = glusterd_add_volume_to_dict (volinfo, rsp_dict,
- 1);
+ 1, "volume");
vol_count = 1;
} else {
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
- ret = glusterd_add_volume_to_dict (volinfo,
- rsp_dict, count);
+ ret = glusterd_add_volume_to_dict (volinfo, rsp_dict,
+ count, "volume");
if (ret)
goto out;
@@ -1899,6 +2229,105 @@ out:
}
static int
+_add_brick_name_to_dict (dict_t *dict, char *key, glusterd_brickinfo_t *brick)
+{
+ int ret = -1;
+ char tmp[1024] = {0,};
+ char *brickname = NULL;
+ xlator_t *this = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (key);
+ GF_ASSERT (brick);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ snprintf (tmp, sizeof (tmp), "%s:%s", brick->hostname, brick->path);
+ brickname = gf_strdup (tmp);
+ if (!brickname) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to dup brick name");
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dict, key, brickname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add brick name to dict");
+ goto out;
+ }
+ brickname = NULL;
+out:
+ if (brickname)
+ GF_FREE (brickname);
+ return ret;
+}
+
+static int
+_add_remove_bricks_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo,
+ char *prefix)
+{
+ int ret = -1;
+ int count = 0;
+ int i = 0;
+ char brick_key[1024] = {0,};
+ char dict_key[1024] ={0,};
+ char *brick = NULL;
+ xlator_t *this = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (volinfo);
+ GF_ASSERT (prefix);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = dict_get_int32 (volinfo->rebal.dict, "count", &count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get brick count");
+ goto out;
+ }
+
+ snprintf (dict_key, sizeof (dict_key), "%s.count", prefix);
+ ret = dict_set_int32 (dict, dict_key, count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set brick count in dict");
+ goto out;
+ }
+
+ for (i = 1; i <= count; i++) {
+ memset (brick_key, 0, sizeof (brick_key));
+ snprintf (brick_key, sizeof (brick_key), "brick%d", i);
+
+ ret = dict_get_str (volinfo->rebal.dict, brick_key, &brick);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get %s", brick_key);
+ goto out;
+ }
+
+ memset (dict_key, 0, sizeof (dict_key));
+ snprintf (dict_key, sizeof (dict_key), "%s.%s", prefix,
+ brick_key);
+ ret = dict_set_str (dict, dict_key, brick);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add brick to dict");
+ goto out;
+ }
+ brick = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/* This adds the respective task-id and all available parameters of a task into
+ * a dictionary
+ */
+static int
_add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index)
{
@@ -1915,13 +2344,34 @@ _add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index)
GF_ASSERT (this);
switch (op) {
- case GD_OP_REBALANCE:
case GD_OP_REMOVE_BRICK:
+ snprintf (key, sizeof (key), "task%d", index);
+ ret = _add_remove_bricks_to_dict (dict, volinfo, key);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add remove bricks to dict");
+ goto out;
+ }
+ case GD_OP_REBALANCE:
uuid_str = gf_strdup (uuid_utoa (volinfo->rebal.rebalance_id));
status = volinfo->rebal.defrag_status;
break;
case GD_OP_REPLACE_BRICK:
+ snprintf (key, sizeof (key), "task%d.src-brick", index);
+ ret = _add_brick_name_to_dict (dict, key,
+ volinfo->rep_brick.src_brick);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+
+ snprintf (key, sizeof (key), "task%d.dst-brick", index);
+ ret = _add_brick_name_to_dict (dict, key,
+ volinfo->rep_brick.dst_brick);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+
uuid_str = gf_strdup (uuid_utoa (volinfo->rep_brick.rb_id));
status = volinfo->rep_brick.rb_status;
break;
@@ -1934,8 +2384,7 @@ _add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index)
}
snprintf (key, sizeof (key), "task%d.type", index);
- ret = dict_set_str (dict, key,
- (char *)gd_op_list[op]);
+ ret = dict_set_str (dict, key, (char *)gd_op_list[op]);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Error setting task type in dict");
@@ -1945,7 +2394,6 @@ _add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index)
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "task%d.id", index);
-
if (!uuid_str)
goto out;
ret = dict_set_dynstr (dict, key, uuid_str);
@@ -1972,6 +2420,50 @@ out:
}
static int
+glusterd_aggregate_task_status (dict_t *rsp_dict, glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ int tasks = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ if (!uuid_is_null (volinfo->rebal.rebalance_id)) {
+ ret = _add_task_to_dict (rsp_dict, volinfo, volinfo->rebal.op,
+ tasks);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add task details to dict");
+ goto out;
+ }
+ tasks++;
+ }
+
+ if (!uuid_is_null (volinfo->rep_brick.rb_id)) {
+ ret = _add_task_to_dict (rsp_dict, volinfo, GD_OP_REPLACE_BRICK,
+ tasks);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add task details to dict");
+ goto out;
+ }
+ tasks++;
+ }
+
+ ret = dict_set_int32 (rsp_dict, "tasks", tasks);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting tasks count in dict");
+ goto out;
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int
glusterd_op_status_volume (dict_t *dict, char **op_errstr,
dict_t *rsp_dict)
{
@@ -1991,7 +2483,6 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
gf_boolean_t nfs_disabled = _gf_false;
gf_boolean_t shd_enabled = _gf_true;
gf_boolean_t origin_glusterd = _gf_false;
- int tasks = 0;
this = THIS;
GF_ASSERT (this);
@@ -2001,13 +2492,13 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
GF_ASSERT (dict);
- origin_glusterd = is_origin_glusterd ();
+ origin_glusterd = is_origin_glusterd (dict);
ret = dict_get_uint32 (dict, "cmd", &cmd);
if (ret)
goto out;
- if (is_origin_glusterd ()) {
+ if (origin_glusterd) {
ret = 0;
if ((cmd & GF_CLI_STATUS_ALL)) {
ret = glusterd_get_all_volnames (rsp_dict);
@@ -2052,6 +2543,14 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
other_count++;
node_count++;
+ } else if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+ ret = glusterd_add_node_to_dict ("quotad", rsp_dict, 0,
+ vol_opts);
+ if (ret)
+ goto out;
+ other_count++;
+ node_count++;
+
} else if ((cmd & GF_CLI_STATUS_BRICK) != 0) {
ret = dict_get_str (dict, "brick", &brick);
if (ret)
@@ -2074,6 +2573,10 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
brick_index);
node_count++;
+ } else if ((cmd & GF_CLI_STATUS_TASKS) != 0) {
+ ret = glusterd_aggregate_task_status (rsp_dict, volinfo);
+ goto out;
+
} else {
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
brick_index++;
@@ -2123,6 +2626,17 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
goto out;
other_count++;
node_count++;
+ other_index++;
+ }
+ if (glusterd_is_volume_quota_enabled (volinfo)) {
+ ret = glusterd_add_node_to_dict ("quotad",
+ rsp_dict,
+ other_index,
+ vol_opts);
+ if (ret)
+ goto out;
+ other_count++;
+ node_count++;
}
}
}
@@ -2147,35 +2661,16 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
}
/* Active tasks */
- if (((cmd & GF_CLI_STATUS_MASK) != GF_CLI_STATUS_NONE) ||
- !origin_glusterd)
+ /* Tasks are added only for normal volume status request for either a
+ * single volume or all volumes
+ */
+ if (!glusterd_status_has_tasks (cmd))
goto out;
- if (!uuid_is_null (volinfo->rebal.rebalance_id)) {
- ret = _add_task_to_dict (rsp_dict, volinfo, volinfo->rebal.op,
- tasks);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to add task details to dict");
- goto out;
- }
- tasks++;
- }
- if (!uuid_is_null (volinfo->rep_brick.rb_id)) {
- ret = _add_task_to_dict (rsp_dict, volinfo, GD_OP_REPLACE_BRICK,
- tasks);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to add task details to dict");
- goto out;
- }
- tasks++;
- }
-
- ret = dict_set_int32 (rsp_dict, "tasks", tasks);
+ ret = glusterd_aggregate_task_status (rsp_dict, volinfo);
if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "Error setting tasks count in dict");
+ goto out;
+ ret = 0;
out:
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -2202,6 +2697,7 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx)
xlator_t *this = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
uint32_t pending_count = 0;
+ dict_t *dict = NULL;
this = THIS;
priv = this->private;
@@ -2216,27 +2712,61 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx)
(glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
continue;
- proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_LOCK];
- if (proc->fn) {
- ret = proc->fn (NULL, this, peerinfo);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING, "Failed to "
- "send lock request for operation "
- "'Volume %s' to peer %s",
- gd_op_list[opinfo.op],
- peerinfo->hostname);
- continue;
+ /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ proc = &peerinfo->mgmt->proctable
+ [GLUSTERD_MGMT_CLUSTER_LOCK];
+ if (proc->fn) {
+ ret = proc->fn (NULL, this, peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send lock request "
+ "for operation 'Volume %s' to "
+ "peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ continue;
+ }
+ pending_count++;
+ }
+ } else {
+ dict = glusterd_op_get_ctx ();
+ dict_ref (dict);
+
+ proc = &peerinfo->mgmt_v3->proctable
+ [GLUSTERD_MGMT_V3_LOCK];
+ if (proc->fn) {
+ ret = dict_set_static_ptr (dict, "peerinfo",
+ peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set peerinfo");
+ dict_unref (dict);
+ goto out;
+ }
+
+ ret = proc->fn (NULL, this, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send mgmt_v3 lock "
+ "request for operation "
+ "'Volume %s' to peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ dict_unref (dict);
+ continue;
+ }
+ pending_count++;
}
- pending_count++;
}
}
opinfo.pending_count = pending_count;
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_all_acc ();
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
+out:
gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
-
return ret;
}
@@ -2249,17 +2779,12 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx)
xlator_t *this = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
uint32_t pending_count = 0;
+ dict_t *dict = NULL;
this = THIS;
priv = this->private;
GF_ASSERT (priv);
- /*ret = glusterd_unlock (MY_UUID);
-
- if (ret)
- goto out;
- */
-
list_for_each_entry (peerinfo, &priv->peers, uuid_list) {
GF_ASSERT (peerinfo);
@@ -2269,29 +2794,63 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx)
(glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
continue;
- proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_UNLOCK];
- if (proc->fn) {
- ret = proc->fn (NULL, this, peerinfo);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING, "Failed to "
- "send unlock request for operation "
- "'Volume %s' to peer %s",
- gd_op_list[opinfo.op],
- peerinfo->hostname);
- continue;
+ /* Based on the op_version,
+ * release the cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ proc = &peerinfo->mgmt->proctable
+ [GLUSTERD_MGMT_CLUSTER_UNLOCK];
+ if (proc->fn) {
+ ret = proc->fn (NULL, this, peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send unlock request "
+ "for operation 'Volume %s' to "
+ "peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ continue;
+ }
+ pending_count++;
+ }
+ } else {
+ dict = glusterd_op_get_ctx ();
+ dict_ref (dict);
+
+ proc = &peerinfo->mgmt_v3->proctable
+ [GLUSTERD_MGMT_V3_UNLOCK];
+ if (proc->fn) {
+ ret = dict_set_static_ptr (dict, "peerinfo",
+ peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set peerinfo");
+ dict_unref (dict);
+ goto out;
+ }
+
+ ret = proc->fn (NULL, this, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send volume unlock "
+ "request for operation "
+ "'Volume %s' to peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ dict_unref (dict);
+ continue;
+ }
+ pending_count++;
}
- pending_count++;
}
}
opinfo.pending_count = pending_count;
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_all_acc ();
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
+out:
gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
-
return ret;
-
}
static int
@@ -2303,7 +2862,8 @@ glusterd_op_ac_ack_drain (glusterd_op_sm_event_t *event, void *ctx)
opinfo.pending_count--;
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, NULL);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
@@ -2319,43 +2879,95 @@ glusterd_op_ac_send_unlock_drain (glusterd_op_sm_event_t *event, void *ctx)
static int
glusterd_op_ac_lock (glusterd_op_sm_event_t *event, void *ctx)
{
- glusterd_op_lock_ctx_t *lock_ctx = NULL;
- int32_t ret = 0;
+ int32_t ret = 0;
+ char *volname = NULL;
+ glusterd_op_lock_ctx_t *lock_ctx = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (event);
GF_ASSERT (ctx);
+ this = THIS;
+ priv = this->private;
+
lock_ctx = (glusterd_op_lock_ctx_t *)ctx;
- ret = glusterd_lock (lock_ctx->uuid);
+ /* If the req came from a node running on older op_version
+ * the dict won't be present. Based on it acquiring a cluster
+ * or mgmt_v3 lock */
+ if (lock_ctx->dict == NULL) {
+ ret = glusterd_lock (lock_ctx->uuid);
+ glusterd_op_lock_send_resp (lock_ctx->req, ret);
+ } else {
+ ret = dict_get_str (lock_ctx->dict, "volname", &volname);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire volname");
+ else {
+ ret = glusterd_mgmt_v3_lock (volname, lock_ctx->uuid,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock for %s",
+ volname);
+ }
- gf_log (THIS->name, GF_LOG_DEBUG, "Lock Returned %d", ret);
+ glusterd_op_mgmt_v3_lock_send_resp (lock_ctx->req,
+ &event->txn_id, ret);
- glusterd_op_lock_send_resp (lock_ctx->req, ret);
+ dict_unref (lock_ctx->dict);
+ }
+ gf_log (THIS->name, GF_LOG_DEBUG, "Lock Returned %d", ret);
return ret;
}
static int
glusterd_op_ac_unlock (glusterd_op_sm_event_t *event, void *ctx)
{
- int ret = 0;
- glusterd_op_lock_ctx_t *lock_ctx = NULL;
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
+ int32_t ret = 0;
+ char *volname = NULL;
+ glusterd_op_lock_ctx_t *lock_ctx = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
GF_ASSERT (event);
GF_ASSERT (ctx);
this = THIS;
priv = this->private;
+
lock_ctx = (glusterd_op_lock_ctx_t *)ctx;
- ret = glusterd_unlock (lock_ctx->uuid);
+ /* If the req came from a node running on older op_version
+ * the dict won't be present. Based on it releasing the cluster
+ * or mgmt_v3 lock */
+ if (lock_ctx->dict == NULL) {
+ ret = glusterd_unlock (lock_ctx->uuid);
+ glusterd_op_unlock_send_resp (lock_ctx->req, ret);
+ } else {
+ ret = dict_get_str (lock_ctx->dict, "volname", &volname);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire volname");
+ else {
+ ret = glusterd_mgmt_v3_unlock (volname, lock_ctx->uuid,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ }
+
+ glusterd_op_mgmt_v3_unlock_send_resp (lock_ctx->req,
+ &event->txn_id, ret);
- gf_log (this->name, GF_LOG_DEBUG, "Unlock Returned %d", ret);
+ dict_unref (lock_ctx->dict);
+ }
- glusterd_op_unlock_send_resp (lock_ctx->req, ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Unlock Returned %d", ret);
if (priv->pending_quorum_action)
glusterd_do_quorum_action ();
@@ -2393,7 +3005,8 @@ glusterd_op_ac_rcvd_lock_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC,
+ &event->txn_id, NULL);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -2501,12 +3114,13 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
}
break;
+ case GD_OP_GSYNC_CREATE:
case GD_OP_GSYNC_SET:
{
ret = glusterd_op_gsync_args_get (dict,
&errstr,
&volname,
- NULL);
+ NULL, NULL);
if (ret == 0) {
ret = glusterd_dict_set_volid
(dict, volname, op_errstr);
@@ -2596,9 +3210,6 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
case GD_OP_STATEDUMP_VOLUME:
case GD_OP_CLEARLOCKS_VOLUME:
case GD_OP_DEFRAG_BRICK_VOLUME:
-#ifdef HAVE_BD_XLATOR
- case GD_OP_BD_OP:
-#endif
{
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
@@ -2619,6 +3230,18 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
}
break;
+ case GD_OP_COPY_FILE:
+ {
+ dict_copy (dict, req_dict);
+ break;
+ }
+
+ case GD_OP_SYS_EXEC:
+ {
+ dict_copy (dict, req_dict);
+ break;
+ }
+
default:
break;
}
@@ -2640,7 +3263,7 @@ glusterd_is_get_op (xlator_t *this, glusterd_op_t op, dict_t *dict)
if (op == GD_OP_STATUS_VOLUME)
return _gf_true;
- if ((op == GD_OP_SET_VOLUME)) {
+ if (op == GD_OP_SET_VOLUME) {
//check for set volume help
ret = dict_get_str (dict, "volname", &volname);
if (volname &&
@@ -2815,7 +3438,8 @@ out:
if (dict)
dict_unref (dict);
if (ret) {
- glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ &event->txn_id, NULL);
opinfo.op_ret = ret;
}
@@ -2824,7 +3448,7 @@ out:
opinfo.pending_count);
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_all_acc ();
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
@@ -2833,10 +3457,10 @@ out:
}
static int32_t
-glusterd_op_start_rb_timer (dict_t *dict)
+glusterd_op_start_rb_timer (dict_t *dict, uuid_t *txn_id)
{
int32_t op = 0;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
glusterd_conf_t *priv = NULL;
int32_t ret = -1;
dict_t *rb_ctx = NULL;
@@ -2852,12 +3476,12 @@ glusterd_op_start_rb_timer (dict_t *dict)
}
if (op != GF_REPLACE_OP_START) {
- ret = glusterd_op_sm_inject_all_acc ();
+ ret = glusterd_op_sm_inject_all_acc (txn_id);
goto out;
}
timeout.tv_sec = 5;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
rb_ctx = dict_copy (dict, rb_ctx);
@@ -2867,6 +3491,17 @@ glusterd_op_start_rb_timer (dict_t *dict)
ret = -1;
goto out;
}
+
+ ret = dict_set_bin (rb_ctx, "transaction_id",
+ txn_id, sizeof (*txn_id));
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to set transaction id.");
+ goto out;
+ } else
+ gf_log ("", GF_LOG_DEBUG,
+ "transaction_id = %s", uuid_utoa (*txn_id));
+
priv->timer = gf_timer_call_after (THIS->ctx, timeout,
glusterd_do_replace_brick,
(void *) rb_ctx);
@@ -2935,6 +3570,97 @@ out:
return ret;
}
+static int
+reassign_defrag_status (dict_t *dict, char *key, gf_defrag_status_t *status)
+{
+ int ret = 0;
+
+ if (!*status)
+ return ret;
+
+ switch (*status) {
+ case GF_DEFRAG_STATUS_STARTED:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED;
+ break;
+
+ case GF_DEFRAG_STATUS_STOPPED:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED;
+ break;
+
+ case GF_DEFRAG_STATUS_COMPLETE:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE;
+ break;
+
+ case GF_DEFRAG_STATUS_FAILED:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED;
+ break;
+ default:
+ break;
+ }
+
+ ret = dict_set_int32(dict, key, *status);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to reset defrag %s in dict", key);
+
+ return ret;
+}
+
+/* Check and reassign the defrag_status enum got from the rebalance process
+ * of all peers so that the rebalance-status CLI command can display if a
+ * full-rebalance or just a fix-layout was carried out.
+ */
+static int
+glusterd_op_check_peer_defrag_status (dict_t *dict, int count)
+{
+ glusterd_volinfo_t *volinfo = NULL;
+ gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED;
+ char key[256] = {0,};
+ char *volname = NULL;
+ int ret = -1;
+ int i = 1;
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING, "Unable to get volume name");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
+ goto out;
+ }
+
+ if (volinfo->rebal.defrag_cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ /* Fix layout was not issued; we don't need to reassign
+ the status */
+ ret = 0;
+ goto out;
+ }
+
+ do {
+ memset (key, 0, 256);
+ snprintf (key, 256, "status-%d", i);
+ ret = dict_get_int32 (dict, key, (int32_t *)&status);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to get defrag %s", key);
+ goto out;
+ }
+ ret = reassign_defrag_status (dict, key, &status);
+ if (ret)
+ goto out;
+ i++;
+ } while (i <= count);
+
+ ret = 0;
+out:
+ return ret;
+
+}
+
/* This function is used to modify the op_ctx dict before sending it back
* to cli. This is useful in situations like changing the peer uuids to
* hostnames etc.
@@ -2997,6 +3723,38 @@ glusterd_op_modify_op_ctx (glusterd_op_t op, void *ctx)
count = brick_index_max + other_count + 1;
+ /* add 'brick%d.peerid' into op_ctx with value of 'brick%d.path'.
+ nfs/sshd like services have this additional uuid */
+ {
+ char key[1024];
+ char *uuid_str = NULL;
+ char *uuid = NULL;
+ int i;
+
+ for (i = brick_index_max + 1; i < count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", i);
+ ret = dict_get_str (op_ctx, key, &uuid_str);
+ if (!ret) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "brick%d.peerid", i);
+ uuid = gf_strdup (uuid_str);
+ if (!uuid) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unable to create dup of"
+ " uuid_str");
+ continue;
+ }
+ ret = dict_set_dynstr (op_ctx, key,
+ uuid);
+ if (ret != 0) {
+ GF_FREE (uuid);
+ }
+ }
+ }
+ }
+
ret = glusterd_op_volume_dict_uuid_to_hostname (op_ctx,
"brick%d.path",
0, count);
@@ -3038,12 +3796,49 @@ glusterd_op_modify_op_ctx (glusterd_op_t op, void *ctx)
goto out;
}
+ /* add 'node-name-%d' into op_ctx with value uuid_str.
+ this will be used to convert to hostname later */
+ {
+ char key[1024];
+ char *uuid_str = NULL;
+ char *uuid = NULL;
+ int i;
+
+ for (i = 1; i <= count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "node-uuid-%d", i);
+ ret = dict_get_str (op_ctx, key, &uuid_str);
+ if (!ret) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "node-name-%d", i);
+ uuid = gf_strdup (uuid_str);
+ if (!uuid) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unable to create dup of"
+ " uuid_str");
+ continue;
+ }
+ ret = dict_set_dynstr (op_ctx, key,
+ uuid);
+ if (ret != 0) {
+ GF_FREE (uuid);
+ }
+ }
+ }
+ }
+
ret = glusterd_op_volume_dict_uuid_to_hostname (op_ctx,
- "node-uuid-%d",
+ "node-name-%d",
1, (count + 1));
if (ret)
gf_log (this->name, GF_LOG_WARNING,
"Failed uuid to hostname conversion");
+
+ ret = glusterd_op_check_peer_defrag_status (op_ctx, count);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to reset defrag status for fix-layout");
break;
default:
@@ -3196,17 +3991,19 @@ out:
if (dict)
dict_unref (dict);
if (ret) {
- glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ &event->txn_id, NULL);
opinfo.op_ret = ret;
}
if (!opinfo.pending_count) {
if (op == GD_OP_REPLACE_BRICK) {
- ret = glusterd_op_start_rb_timer (op_dict);
+ ret = glusterd_op_start_rb_timer (op_dict,
+ &event->txn_id);
} else {
glusterd_op_modify_op_ctx (op, NULL);
- ret = glusterd_op_sm_inject_all_acc ();
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
}
goto err;
}
@@ -3231,7 +4028,8 @@ glusterd_op_ac_rcvd_stage_op_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_ACC,
+ &event->txn_id, NULL);
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -3252,7 +4050,8 @@ glusterd_op_ac_stage_op_failed (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, NULL);
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -3273,7 +4072,8 @@ glusterd_op_ac_commit_op_failed (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, NULL);
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -3316,7 +4116,8 @@ glusterd_op_ac_brick_op_failed (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.brick_pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, ev_ctx->commit_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, ev_ctx->commit_ctx);
out:
if (ev_ctx->rsp_dict)
@@ -3358,7 +4159,7 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx)
goto out;
}
- ret = glusterd_op_start_rb_timer (op_ctx);
+ ret = glusterd_op_start_rb_timer (op_ctx, &event->txn_id);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Couldn't start "
"replace-brick operation.");
@@ -3373,10 +4174,14 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx)
out:
if (commit_ack_inject) {
if (ret)
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ &event->txn_id,
+ NULL);
else if (!opinfo.pending_count) {
glusterd_op_modify_op_ctx (op, NULL);
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL);
+ ret = glusterd_op_sm_inject_event
+ (GD_OP_EVENT_COMMIT_ACC,
+ &event->txn_id, NULL);
}
/*else do nothing*/
}
@@ -3397,7 +4202,8 @@ glusterd_op_ac_rcvd_unlock_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC,
+ &event->txn_id, NULL);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -3431,7 +4237,7 @@ glusterd_op_reset_ctx ()
}
int32_t
-glusterd_op_txn_complete ()
+glusterd_op_txn_complete (uuid_t *txn_id)
{
int32_t ret = -1;
glusterd_conf_t *priv = NULL;
@@ -3441,6 +4247,7 @@ glusterd_op_txn_complete ()
rpcsvc_request_t *req = NULL;
void *ctx = NULL;
char *op_errstr = NULL;
+ char *volname = NULL;
xlator_t *this = NULL;
this = THIS;
@@ -3463,14 +4270,30 @@ glusterd_op_txn_complete ()
glusterd_op_reset_ctx ();
glusterd_op_clear_errstr ();
- ret = glusterd_unlock (MY_UUID);
-
- /* unlock cant/shouldnt fail here!! */
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Unable to clear local lock, ret: %d", ret);
+ /* Based on the op-version, we release the cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ ret = glusterd_unlock (MY_UUID);
+ /* unlock cant/shouldnt fail here!! */
+ if (ret)
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Unable to clear local lock, ret: %d", ret);
+ else
+ gf_log (this->name, GF_LOG_DEBUG, "Cleared local lock");
} else {
- gf_log (this->name, GF_LOG_DEBUG, "Cleared local lock");
+ ret = dict_get_str (ctx, "volname", &volname);
+ if (ret)
+ gf_log ("", GF_LOG_INFO,
+ "No Volume name present. "
+ "Locks have not been held.");
+
+ if (volname) {
+ ret = glusterd_mgmt_v3_unlock (volname, MY_UUID,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ }
}
ret = glusterd_op_send_cli_response (op, op_ret,
@@ -3489,6 +4312,13 @@ glusterd_op_txn_complete ()
if (priv->pending_quorum_action)
glusterd_do_quorum_action ();
+
+ /* Clearing the transaction opinfo */
+ ret = glusterd_clear_txn_opinfo (txn_id);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to clear transaction's opinfo");
+
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -3500,7 +4330,7 @@ glusterd_op_ac_unlocked_all (glusterd_op_sm_event_t *event, void *ctx)
GF_ASSERT (event);
- ret = glusterd_op_txn_complete ();
+ ret = glusterd_op_txn_complete (&event->txn_id);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -3517,6 +4347,7 @@ glusterd_op_ac_stage_op (glusterd_op_sm_event_t *event, void *ctx)
char *op_errstr = NULL;
dict_t *dict = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
this = THIS;
GF_ASSERT (this);
@@ -3542,9 +4373,27 @@ glusterd_op_ac_stage_op (glusterd_op_sm_event_t *event, void *ctx)
status);
}
+ txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+
+ if (txn_id)
+ uuid_copy (*txn_id, event->txn_id);
+ else {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_bin (rsp_dict, "transaction_id",
+ txn_id, sizeof(*txn_id));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set transaction id.");
+ goto out;
+ }
+
ret = glusterd_op_stage_send_resp (req_ctx->req, req_ctx->op,
status, op_errstr, rsp_dict);
+out:
if (op_errstr && (strcmp (op_errstr, "")))
GF_FREE (op_errstr);
@@ -3606,6 +4455,7 @@ glusterd_op_ac_commit_op (glusterd_op_sm_event_t *event, void *ctx)
dict_t *dict = NULL;
dict_t *rsp_dict = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
this = THIS;
GF_ASSERT (this);
@@ -3635,10 +4485,27 @@ glusterd_op_ac_commit_op (glusterd_op_sm_event_t *event, void *ctx)
"'Volume %s' failed: %d", gd_op_list[req_ctx->op],
status);
+ txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+
+ if (txn_id)
+ uuid_copy (*txn_id, event->txn_id);
+ else {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_bin (rsp_dict, "transaction_id",
+ txn_id, sizeof(*txn_id));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set transaction id.");
+ goto out;
+ }
+
ret = glusterd_op_commit_send_resp (req_ctx->req, req_ctx->op,
status, op_errstr, rsp_dict);
- glusterd_op_fini_ctx ();
+out:
if (op_errstr && (strcmp (op_errstr, "")))
GF_FREE (op_errstr);
@@ -3667,7 +4534,6 @@ glusterd_op_ac_send_commit_failed (glusterd_op_sm_event_t *event, void *ctx)
opinfo.op_ret, opinfo.op_errstr,
op_ctx);
- glusterd_op_fini_ctx ();
if (opinfo.op_errstr && (strcmp (opinfo.op_errstr, ""))) {
GF_FREE (opinfo.op_errstr);
opinfo.op_errstr = NULL;
@@ -3752,6 +4618,10 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
ret = glusterd_op_stage_sync_volume (dict, op_errstr);
break;
+ case GD_OP_GSYNC_CREATE:
+ ret = glusterd_op_stage_gsync_create (dict, op_errstr);
+ break;
+
case GD_OP_GSYNC_SET:
ret = glusterd_op_stage_gsync_set (dict, op_errstr);
break;
@@ -3761,7 +4631,8 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
break;
case GD_OP_QUOTA:
- ret = glusterd_op_stage_quota (dict, op_errstr);
+ ret = glusterd_op_stage_quota (dict, op_errstr,
+ rsp_dict);
break;
case GD_OP_STATUS_VOLUME:
@@ -3785,18 +4656,21 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
ret = glusterd_op_stage_clearlocks_volume (dict,
op_errstr);
break;
-#ifdef HAVE_BD_XLATOR
- case GD_OP_BD_OP:
- ret = glusterd_op_stage_bd (dict, op_errstr);
+
+ case GD_OP_COPY_FILE:
+ ret = glusterd_op_stage_copy_file (dict, op_errstr);
break;
-#endif
+
+ case GD_OP_SYS_EXEC:
+ ret = glusterd_op_stage_sys_exec (dict, op_errstr);
+ break;
+
default:
gf_log (this->name, GF_LOG_ERROR, "Unknown op %s",
gd_op_list[op]);
}
- gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
-
+ gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret);
return ret;
}
@@ -3854,6 +4728,11 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
ret = glusterd_op_sync_volume (dict, op_errstr, rsp_dict);
break;
+ case GD_OP_GSYNC_CREATE:
+ ret = glusterd_op_gsync_create (dict, op_errstr,
+ rsp_dict);
+ break;
+
case GD_OP_GSYNC_SET:
ret = glusterd_op_gsync_set (dict, op_errstr, rsp_dict);
break;
@@ -3888,11 +4767,15 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
ret = glusterd_op_clearlocks_volume (dict, op_errstr,
rsp_dict);
break;
-#ifdef HAVE_BD_XLATOR
- case GD_OP_BD_OP:
- ret = 0;
+
+ case GD_OP_COPY_FILE:
+ ret = glusterd_op_copy_file (dict, op_errstr);
break;
-#endif
+
+ case GD_OP_SYS_EXEC:
+ ret = glusterd_op_sys_exec (dict, op_errstr, rsp_dict);
+ break;
+
default:
gf_log (this->name, GF_LOG_ERROR, "Unknown op %s",
gd_op_list[op]);
@@ -3901,11 +4784,12 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
if (ret == 0)
glusterd_op_commit_hook (op, dict, GD_COMMIT_HOOK_POST);
- gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+
static int
glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr,
struct list_head *selected)
@@ -4222,24 +5106,95 @@ out:
}
+/*
+ * Find which replica set the brick named by a per-replica command belongs to.
+ *
+ * Reads "per-replica-cmd-hostname" and "per-replica-cmd-path" from @dict,
+ * walks @volinfo->bricks in list order and, for the first brick whose path
+ * and hostname both match, computes (position in the list) / replica_count.
+ *
+ * Returns the zero-based replica-set index, or -1 when @volinfo or @dict is
+ * NULL, the replica count is not positive, either key is missing from @dict,
+ * or no brick matches.
+ */
int
+get_replica_index_for_per_replica_cmd (glusterd_volinfo_t *volinfo,
+                                       dict_t *dict)
+{
+        int                   ret               = 0;
+        char                 *hostname          = NULL;
+        char                 *path              = NULL;
+        int                   index             = 0;
+        glusterd_brickinfo_t *brickinfo         = NULL;
+        int                   cmd_replica_index = -1;
+        int                   replica_count     = -1;
+
+        if (!volinfo || !dict) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "per-replica-cmd-hostname", &hostname);
+        if (ret)
+                goto out;
+        ret = dict_get_str (dict, "per-replica-cmd-path", &path);
+        if (ret)
+                goto out;
+
+        replica_count = volinfo->replica_count;
+        /* replica_count is the divisor below; never divide by zero */
+        if (replica_count <= 0) {
+                ret = -1;
+                goto out;
+        }
+
+        list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+                if (uuid_is_null (brickinfo->uuid))
+                        (void)glusterd_resolve_brick (brickinfo);
+                if (!strcmp (brickinfo->path, path) &&
+                    !strcmp (brickinfo->hostname, hostname)) {
+                        /* consecutive groups of replica_count bricks share
+                         * one replica-set index */
+                        cmd_replica_index = index / replica_count;
+                        goto out;
+                }
+                index++;
+        }
+
+out:
+        if (ret)
+                cmd_replica_index = -1;
+
+        return cmd_replica_index;
+}
+
+int
_select_rxlators_with_local_bricks (xlator_t *this, glusterd_volinfo_t *volinfo,
- dict_t *dict)
+ dict_t *dict, cli_cmd_type type)
{
glusterd_brickinfo_t *brickinfo = NULL;
glusterd_conf_t *priv = NULL;
- int index = 1;
+ int index = 0;
int rxlator_count = 0;
int replica_count = 0;
gf_boolean_t add = _gf_false;
+ int ret = 0;
+ int cmd_replica_index = -1;
priv = this->private;
replica_count = volinfo->replica_count;
+
+ if (type == PER_REPLICA) {
+
+ cmd_replica_index = get_replica_index_for_per_replica_cmd
+ (volinfo, dict);
+ if (cmd_replica_index == -1) {
+ ret = -1;
+ goto err;
+ }
+ }
+
+ index = 1;
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
if (uuid_is_null (brickinfo->uuid))
(void)glusterd_resolve_brick (brickinfo);
- if (!uuid_compare (MY_UUID, brickinfo->uuid))
- add = _gf_true;
+ switch (type) {
+ case ALL_REPLICA:
+ if (!uuid_compare (MY_UUID, brickinfo->uuid))
+ add = _gf_true;
+ break;
+ case PER_REPLICA:
+ if (!uuid_compare (MY_UUID, brickinfo->uuid) &&
+ ((index-1)/replica_count == cmd_replica_index))
+
+ add = _gf_true;
+ break;
+ }
+
if (index % replica_count == 0) {
if (add) {
_add_rxlator_to_dict (dict, volinfo->volname,
@@ -4252,6 +5207,10 @@ _select_rxlators_with_local_bricks (xlator_t *this, glusterd_volinfo_t *volinfo,
index++;
}
+err:
+ if (ret)
+ rxlator_count = -1;
+
return rxlator_count;
}
@@ -4292,9 +5251,10 @@ _select_rxlators_for_full_self_heal (xlator_t *this,
return rxlator_count;
}
-#ifdef HAVE_BD_XLATOR
+
static int
-glusterd_bricks_select_bd (dict_t *dict, char **op_errstr)
+glusterd_bricks_select_snap (dict_t *dict, char **op_errstr,
+ struct list_head *selected)
{
int ret = -1;
glusterd_conf_t *priv = NULL;
@@ -4312,31 +5272,31 @@ glusterd_bricks_select_bd (dict_t *dict, char **op_errstr)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get volname");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get"
+ " volname");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret)
goto out;
- pending_node = GF_CALLOC (1, sizeof (*pending_node),
- gf_gld_mt_pending_node_t);
- if (!pending_node) {
- ret = -1;
- goto out;
- }
-
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
brick_index++;
if (uuid_compare (brickinfo->uuid, MY_UUID) ||
!glusterd_is_brick_started (brickinfo)) {
continue;
}
+ pending_node = GF_CALLOC (1, sizeof (*pending_node),
+ gf_gld_mt_pending_node_t);
+ if (!pending_node) {
+ ret = -1;
+ goto out;
+ }
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
pending_node->index = brick_index;
list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ selected);
pending_node = NULL;
}
@@ -4346,10 +5306,10 @@ out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning ret %d", ret);
return ret;
}
-#endif
static int
-fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo)
+fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo,
+ cli_cmd_type type, dict_t *req_dict)
{
glusterd_brickinfo_t *brickinfo = NULL;
char msg[1024] = {0,};
@@ -4358,10 +5318,22 @@ fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo)
int index = 0;
int ret = 0;
xlator_t *this = NULL;
+ int cmd_replica_index = -1;
this = THIS;
snprintf (msg, sizeof (msg), "self-heal-daemon is not running on");
+ if (type == PER_REPLICA) {
+ cmd_replica_index = get_replica_index_for_per_replica_cmd
+ (volinfo, req_dict);
+ if (cmd_replica_index == -1) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Could not find the "
+ "replica index for per replica type command");
+ ret = -1;
+ goto out;
+ }
+ }
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
if (uuid_is_null (brickinfo->uuid))
(void)glusterd_resolve_brick (brickinfo);
@@ -4370,6 +5342,14 @@ fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo)
index++;
continue;
}
+
+ if (type == PER_REPLICA) {
+ if (cmd_replica_index != (index/volinfo->replica_count)) {
+ index++;
+ continue;
+ }
+
+ }
snprintf (key, sizeof (key), "%d-status",index);
snprintf (value, sizeof (value), "%s %s",msg,
uuid_utoa(MY_UUID));
@@ -4438,21 +5418,49 @@ glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr,
goto out;
}
+ switch (heal_op) {
+ case GF_AFR_OP_INDEX_SUMMARY:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ if (!glusterd_is_nodesvc_online ("glustershd")) {
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Received "
+ "empty ctx.");
+ goto out;
+ }
- if (!glusterd_is_nodesvc_online ("glustershd") &&
- (heal_op == GF_AFR_OP_INDEX_SUMMARY)) {
-
- if (!rsp_dict) {
- gf_log (this->name, GF_LOG_ERROR, "Received empty "
- "ctx.");
+ ret = fill_shd_status_for_local_bricks (rsp_dict,
+ volinfo,
+ ALL_REPLICA,
+ dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "fill the shd status for the local "
+ "bricks");
goto out;
+
}
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ if (!glusterd_is_nodesvc_online ("glustershd")) {
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Received "
+ "empty ctx.");
+ goto out;
+ }
+ ret = fill_shd_status_for_local_bricks (rsp_dict,
+ volinfo,
+ PER_REPLICA,
+ dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "fill the shd status for the local"
+ " bricks.");
+ goto out;
- ret = fill_shd_status_for_local_bricks (rsp_dict, volinfo);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Unable to fill the shd"
- " status for the local bricks");
- goto out;
+ }
+ break;
+ default:
+ break;
}
@@ -4462,14 +5470,28 @@ glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr,
volinfo,
dict);
break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ rxlator_count = _select_rxlators_with_local_bricks (this,
+ volinfo,
+ dict,
+ PER_REPLICA);
+ break;
default:
rxlator_count = _select_rxlators_with_local_bricks (this,
volinfo,
- dict);
+ dict,
+ ALL_REPLICA);
break;
}
if (!rxlator_count)
goto out;
+ if (rxlator_count == -1){ /* error sentinel from the rxlator selection helper */
+ gf_log (this->name, GF_LOG_ERROR, "Could not determine the "
+ "translator count");
+ ret = -1;
+ goto out;
+ }
+
ret = dict_set_int32 (dict, "count", rxlator_count);
if (ret)
goto out;
@@ -4538,9 +5560,6 @@ out:
return ret;
}
-
-
-
static int
glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr,
struct list_head *selected)
@@ -4580,6 +5599,7 @@ glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr,
case GF_CLI_STATUS_CALLPOOL:
case GF_CLI_STATUS_NFS:
case GF_CLI_STATUS_SHD:
+ case GF_CLI_STATUS_QUOTAD:
break;
default:
goto out;
@@ -4661,6 +5681,25 @@ glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr,
list_add_tail (&pending_node->list, selected);
ret = 0;
+ } else if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+ if (!glusterd_is_nodesvc_online ("quotad")) {
+ gf_log (this->name, GF_LOG_ERROR, "Quotad is not "
+ "running");
+ ret = -1;
+ goto out;
+ }
+ pending_node = GF_CALLOC (1, sizeof (*pending_node),
+ gf_gld_mt_pending_node_t);
+ if (!pending_node) {
+ ret = -1;
+ goto out;
+ }
+ pending_node->node = priv->quotad;
+ pending_node->type = GD_NODE_QUOTAD;
+ pending_node->index = 0;
+ list_add_tail (&pending_node->list, selected);
+
+ ret = 0;
} else {
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
brick_index++;
@@ -4731,7 +5770,8 @@ glusterd_op_ac_send_brick_op (glusterd_op_sm_event_t *event, void *ctx)
if (!opinfo.pending_count && !opinfo.brick_pending_count) {
glusterd_clear_pending_nodes (&opinfo.pending_bricks);
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, req_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, req_ctx);
}
out:
@@ -4785,7 +5825,8 @@ glusterd_op_ac_rcvd_brick_op_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.brick_pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, ev_ctx->commit_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, &event->txn_id,
+ ev_ctx->commit_ctx);
out:
if (ev_ctx->rsp_dict)
@@ -4837,11 +5878,9 @@ glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr,
ret = glusterd_bricks_select_rebalance_volume (dict, op_errstr,
selected);
break;
-#ifdef HAVE_BD_XLATOR
- case GD_OP_BD_OP:
- ret = glusterd_bricks_select_bd (dict, op_errstr);
+ case GD_OP_SNAP:
+ ret = glusterd_bricks_select_snap (dict, op_errstr, selected);
break;
-#endif
default:
break;
}
@@ -5163,7 +6202,7 @@ glusterd_op_sm_new_event (glusterd_op_sm_event_type_t event_type,
int
glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type,
- void *ctx)
+ uuid_t *txn_id, void *ctx)
{
int32_t ret = -1;
glusterd_op_sm_event_t *event = NULL;
@@ -5178,6 +6217,9 @@ glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type,
event->ctx = ctx;
+ if (txn_id)
+ uuid_copy (event->txn_id, *txn_id);
+
gf_log (THIS->name, GF_LOG_DEBUG, "Enqueue event: '%s'",
glusterd_op_sm_event_name_get (event->event));
list_add_tail (&event->list, &gd_op_sm_queue);
@@ -5238,6 +6280,7 @@ glusterd_op_sm ()
glusterd_op_sm_t *state = NULL;
glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
xlator_t *this = NULL;
+ glusterd_op_info_t txn_op_info;
this = THIS;
GF_ASSERT (this);
@@ -5258,6 +6301,20 @@ glusterd_op_sm ()
"type: '%s'",
glusterd_op_sm_event_name_get(event_type));
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s",
+ uuid_utoa (event->txn_id));
+
+ ret = glusterd_get_txn_opinfo (&event->txn_id,
+ &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get transaction's opinfo");
+ glusterd_destroy_op_event_ctx (event);
+ GF_FREE (event);
+ continue;
+ } else
+ opinfo = txn_op_info;
+
state = glusterd_op_state_table[opinfo.state.state];
GF_ASSERT (state);
@@ -5288,8 +6345,27 @@ glusterd_op_sm ()
return ret;
}
+ if ((state[event_type].next_state ==
+ GD_OP_STATE_DEFAULT) &&
+ (event_type == GD_OP_EVENT_UNLOCK)) {
+ /* Clearing the transaction opinfo */
+ ret = glusterd_clear_txn_opinfo(&event->txn_id);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to clear "
+ "transaction's opinfo");
+ } else {
+ ret = glusterd_set_txn_opinfo (&event->txn_id,
+ &opinfo);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set "
+ "transaction's opinfo");
+ }
+
glusterd_destroy_op_event_ctx (event);
GF_FREE (event);
+
}
}
@@ -5343,52 +6419,6 @@ glusterd_op_clear_op (glusterd_op_t op)
}
int32_t
-glusterd_op_init_ctx (glusterd_op_t op)
-{
- int ret = 0;
- dict_t *dict = NULL;
- xlator_t *this = NULL;
-
- this = THIS;
- GF_ASSERT (this);
- GF_ASSERT (GD_OP_NONE < op && op < GD_OP_MAX);
-
- if (_gf_false == glusterd_need_brick_op (op)) {
- gf_log (this->name, GF_LOG_DEBUG, "Received op: %s, returning",
- gd_op_list[op]);
- goto out;
- }
- dict = dict_new ();
- if (dict == NULL) {
- ret = -1;
- goto out;
- }
- ret = glusterd_op_set_ctx (dict);
- if (ret)
- goto out;
-out:
- gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-
-
-
-int32_t
-glusterd_op_fini_ctx ()
-{
- dict_t *dict = NULL;
-
- dict = glusterd_op_get_ctx ();
- if (dict)
- dict_unref (dict);
-
- glusterd_op_reset_ctx ();
- return 0;
-}
-
-
-
-int32_t
glusterd_op_free_ctx (glusterd_op_t op, void *ctx)
{
@@ -5414,9 +6444,6 @@ glusterd_op_free_ctx (glusterd_op_t op, void *ctx)
case GD_OP_STATEDUMP_VOLUME:
case GD_OP_CLEARLOCKS_VOLUME:
case GD_OP_DEFRAG_BRICK_VOLUME:
-#ifdef HAVE_BD_XLATOR
- case GD_OP_BD_OP:
-#endif
dict_unref (ctx);
break;
default:
@@ -5445,4 +6472,3 @@ glusterd_op_sm_init ()
pthread_mutex_init (&gd_op_sm_lock, NULL);
return 0;
}
-
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
index df8b8c141..4a73b08f4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.h
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
@@ -15,8 +15,8 @@
#include "config.h"
#endif
-#ifndef GSYNC_CONF
-#define GSYNC_CONF GEOREP"/gsyncd.conf"
+#ifndef GSYNC_CONF_TEMPLATE
+#define GSYNC_CONF_TEMPLATE GEOREP"/gsyncd_template.conf"
#endif
#include <pthread.h>
@@ -32,6 +32,8 @@
#include "protocol-common.h"
#define GD_VOLUME_NAME_MAX 256
+#define GD_OP_PROTECTED (0x02)
+#define GD_OP_UNPROTECTED (0x04)
typedef enum glusterd_op_sm_state_ {
GD_OP_STATE_DEFAULT = 0,
@@ -75,6 +77,7 @@ struct glusterd_op_sm_event_ {
struct list_head list;
void *ctx;
glusterd_op_sm_event_type_t event;
+ uuid_t txn_id;
};
typedef struct glusterd_op_sm_event_ glusterd_op_sm_event_t;
@@ -117,6 +120,7 @@ typedef struct glusterd_op_log_filename_ctx_ glusterd_op_log_filename_ctx_t;
struct glusterd_op_lock_ctx_ {
uuid_t uuid;
+ dict_t *dict;
rpcsvc_request_t *req;
};
@@ -162,12 +166,27 @@ typedef struct glusterd_gsync_status_temp {
glusterd_volinfo_t *volinfo;
char *node;
}glusterd_gsync_status_temp_t;
+
+typedef struct gsync_status_param {
+ int is_active;
+ glusterd_volinfo_t *volinfo;
+}gsync_status_param_t;
+
+typedef struct glusterd_txn_opinfo_object_ {
+ glusterd_op_info_t opinfo;
+} glusterd_txn_opinfo_obj;
+
+typedef enum cli_cmd_type_ { /* which replica sets a CLI command addresses */
+ PER_REPLICA, /* only the replica set of the brick named in the request dict */
+ ALL_REPLICA, /* no replica-set restriction */
+ } cli_cmd_type;
+
int
glusterd_op_sm_new_event (glusterd_op_sm_event_type_t event_type,
glusterd_op_sm_event_t **new_event);
int
glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type,
- void *ctx);
+ uuid_t *txn_id, void *ctx);
int
glusterd_op_sm_init ();
@@ -251,10 +270,7 @@ glusterd_op_init_commit_rsp_dict (glusterd_op_t op);
void
glusterd_op_modify_op_ctx (glusterd_op_t op, void *op_ctx);
-int32_t
-glusterd_op_init_ctx (glusterd_op_t op);
-int32_t
-glusterd_op_fini_ctx ();
+
int32_t
glusterd_volume_stats_read_perf (char *brick_path, int32_t blk_size,
int32_t blk_count, double *throughput, double *time);
@@ -270,7 +286,7 @@ glusterd_are_all_volumes_stopped ();
int
glusterd_stop_bricks (glusterd_volinfo_t *volinfo);
int
-gsync_status (char *master, char *slave, int *status);
+gsync_status (char *master, char *slave, char *conf_path, int *status);
int
glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag);
@@ -278,4 +294,21 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag);
int
glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict,
dict_t *op_ctx);
+#ifdef HAVE_BD_XLATOR
+int
+glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg);
+#endif
+
+int32_t
+glusterd_get_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo);
+
+int32_t
+glusterd_set_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo);
+
+int32_t
+glusterd_clear_txn_opinfo (uuid_t *txn_id);
+
+int32_t
+glusterd_generate_txn_id (dict_t *dict, uuid_t **txn_id);
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
index aab6744a4..a153ca1a9 100644
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -52,8 +52,8 @@ pmap_port_isfree (int port)
}
-struct pmap_registry *
-pmap_registry_new (void)
+static struct pmap_registry *
+pmap_registry_new (xlator_t *this)
{
struct pmap_registry *pmap = NULL;
int i = 0;
@@ -69,8 +69,8 @@ pmap_registry_new (void)
pmap->ports[i].type = GF_PMAP_PORT_FOREIGN;
}
- pmap->base_port = GF_IANA_PRIV_PORTS_START;
- pmap->last_alloc = GF_IANA_PRIV_PORTS_START;
+ pmap->base_port = pmap->last_alloc =
+ ((glusterd_conf_t *)(this->private))->base_port;
return pmap;
}
@@ -86,7 +86,7 @@ pmap_registry_get (xlator_t *this)
pmap = priv->pmap;
if (!pmap) {
- pmap = pmap_registry_new ();
+ pmap = pmap_registry_new (this);
if (!pmap)
return NULL;
priv->pmap = pmap;
@@ -474,17 +474,12 @@ gluster_pmap_signout (rpcsvc_request_t *req)
}
rpcsvc_actor_t gluster_pmap_actors[] = {
- [GF_PMAP_NULL] = {"NULL", GF_PMAP_NULL, NULL, NULL, 0},
- [GF_PMAP_PORTBYBRICK] = {"PORTBYBRICK", GF_PMAP_PORTBYBRICK,
- gluster_pmap_portbybrick, NULL, 0},
- [GF_PMAP_BRICKBYPORT] = {"BRICKBYPORT", GF_PMAP_BRICKBYPORT,
- gluster_pmap_brickbyport, NULL, 0},
- [GF_PMAP_SIGNIN] = {"SIGNIN", GF_PMAP_SIGNIN,
- gluster_pmap_signin, NULL, 0},
- [GF_PMAP_SIGNOUT] = {"SIGNOUT", GF_PMAP_SIGNOUT,
- gluster_pmap_signout, NULL, 0},
- [GF_PMAP_SIGNUP] = {"SIGNUP", GF_PMAP_SIGNUP,
- gluster_pmap_signup, NULL, 0},
+ [GF_PMAP_NULL] = {"NULL", GF_PMAP_NULL, NULL, NULL, 0, DRC_NA},
+ [GF_PMAP_PORTBYBRICK] = {"PORTBYBRICK", GF_PMAP_PORTBYBRICK, gluster_pmap_portbybrick, NULL, 0, DRC_NA},
+ [GF_PMAP_BRICKBYPORT] = {"BRICKBYPORT", GF_PMAP_BRICKBYPORT, gluster_pmap_brickbyport, NULL, 0, DRC_NA},
+ [GF_PMAP_SIGNIN] = {"SIGNIN", GF_PMAP_SIGNIN, gluster_pmap_signin, NULL, 0, DRC_NA},
+ [GF_PMAP_SIGNOUT] = {"SIGNOUT", GF_PMAP_SIGNOUT, gluster_pmap_signout, NULL, 0, DRC_NA},
+ [GF_PMAP_SIGNUP] = {"SIGNUP", GF_PMAP_SIGNUP, gluster_pmap_signup, NULL, 0, DRC_NA},
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-quota.c b/xlators/mgmt/glusterd/src/glusterd-quota.c
index 318267199..7f798ad26 100644
--- a/xlators/mgmt/glusterd/src/glusterd-quota.c
+++ b/xlators/mgmt/glusterd/src/glusterd-quota.c
@@ -21,25 +21,51 @@
#include "glusterd-utils.h"
#include "glusterd-volgen.h"
#include "run.h"
+#include "syscall.h"
+#include "byte-order.h"
+#include "compat-errno.h"
#include <sys/wait.h>
+#include <dlfcn.h>
+
+/* Any negative pid marks the crawl mount as a special client */
+#define QUOTA_CRAWL_PID "-100"
+
+const char *gd_quota_op_list[GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT+1] = { /* printable quota sub-command names, indexed by GF_QUOTA_OPTION_TYPE_* */
+ [GF_QUOTA_OPTION_TYPE_NONE] = "none",
+ [GF_QUOTA_OPTION_TYPE_ENABLE] = "enable",
+ [GF_QUOTA_OPTION_TYPE_DISABLE] = "disable",
+ [GF_QUOTA_OPTION_TYPE_LIMIT_USAGE] = "limit-usage",
+ [GF_QUOTA_OPTION_TYPE_REMOVE] = "remove",
+ [GF_QUOTA_OPTION_TYPE_LIST] = "list",
+ [GF_QUOTA_OPTION_TYPE_VERSION] = "version",
+ [GF_QUOTA_OPTION_TYPE_ALERT_TIME] = "alert-time",
+ [GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT] = "soft-timeout",
+ [GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT] = "hard-timeout",
+ [GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT] = "default-soft-limit",
+};
int
+glusterd_store_quota_config (glusterd_volinfo_t *volinfo, char *path,
+ char *gfid_str, int opcode, char **op_errstr);
+int
__glusterd_handle_quota (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
dict_t *dict = NULL;
glusterd_op_t cli_op = GD_OP_QUOTA;
- char operation[256] = {0, };
char *volname = NULL;
int32_t type = 0;
char msg[2048] = {0,};
xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
GF_ASSERT (req);
this = THIS;
GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
if (ret < 0) {
@@ -82,23 +108,16 @@ __glusterd_handle_quota (rpcsvc_request_t *req)
goto out;
}
- switch (type) {
- case GF_QUOTA_OPTION_TYPE_ENABLE:
- strncpy (operation, "enable", sizeof (operation));
- break;
-
- case GF_QUOTA_OPTION_TYPE_DISABLE:
- strncpy (operation, "disable", sizeof (operation));
- break;
-
- case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
- strncpy (operation, "limit-usage", sizeof (operation));
- break;
+ if ((conf->op_version == GD_OP_VERSION_MIN) &&
+ (type > GF_QUOTA_OPTION_TYPE_VERSION)) {
+ snprintf (msg, sizeof (msg), "Cannot execute command. The "
+ "cluster is operating at version %d. Quota command %s "
+ "is unavailable in this version", conf->op_version,
+ gd_quota_op_list[type]);
+ ret = -1;
+ goto out;
+ }
- case GF_QUOTA_OPTION_TYPE_REMOVE:
- strncpy (operation, "remove", sizeof (operation));
- break;
- }
ret = glusterd_op_begin_synctask (req, GD_OP_QUOTA, dict);
out:
@@ -132,7 +151,6 @@ glusterd_check_if_quota_trans_enabled (glusterd_volinfo_t *volinfo)
}
if (flag == _gf_false) {
- gf_log ("", GF_LOG_ERROR, "first enable the quota translator");
ret = -1;
goto out;
}
@@ -141,107 +159,9 @@ out:
return ret;
}
-/* At the end of the function, the variable found will be set
- * to true if the path to be removed was present in the limit-list,
- * else will be false.
- */
-int32_t
-_glusterd_quota_remove_limits (char **quota_limits, char *path,
- gf_boolean_t *found)
-{
- int ret = 0;
- int i = 0;
- int size = 0;
- int len = 0;
- int pathlen = 0;
- int skiplen = 0;
- int flag = 0;
- char *limits = NULL;
- char *qlimits = NULL;
-
- if (found != NULL)
- *found = _gf_false;
-
- if (*quota_limits == NULL)
- return -1;
-
- qlimits = *quota_limits;
-
- pathlen = strlen (path);
-
- len = strlen (qlimits);
-
- limits = GF_CALLOC (len + 1, sizeof (char), gf_gld_mt_char);
- if (!limits)
- return -1;
-
- while (i < len) {
- if (!memcmp ((void *) &qlimits [i], (void *)path, pathlen))
- if (qlimits [i + pathlen] == ':') {
- flag = 1;
- if (found != NULL)
- *found = _gf_true;
- }
-
- while (qlimits [i + size] != ',' &&
- qlimits [i + size] != '\0')
- size++;
-
- if (!flag) {
- memcpy ((void *) &limits [i], (void *) &qlimits [i], size + 1);
- } else {
- skiplen = size + 1;
- size = len - i - size;
- memcpy ((void *) &limits [i], (void *) &qlimits [i + skiplen], size);
- break;
- }
-
- i += size + 1;
- size = 0;
- }
-
- if (!flag) {
- ret = 1;
- } else {
- len = strlen (limits);
-
- if (len == 0) {
- GF_FREE (qlimits);
-
- *quota_limits = NULL;
-
- goto out;
- }
-
- if (limits[len - 1] == ',') {
- limits[len - 1] = '\0';
- len --;
- }
-
- GF_FREE (qlimits);
-
- qlimits = GF_CALLOC (len + 1, sizeof (char), gf_gld_mt_char);
-
- if (!qlimits) {
- ret = -1;
- goto out;
- }
-
- memcpy ((void *) qlimits, (void *) limits, len + 1);
-
- *quota_limits = qlimits;
-
- ret = 0;
- }
-
-out:
- GF_FREE (limits);
-
- return ret;
-}
-
int32_t
-glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname)
+glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname,
+ int type)
{
pid_t pid;
int32_t ret = 0;
@@ -260,6 +180,8 @@ glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname)
runner_add_args (&runner, SBIN_DIR"/glusterfs",
"-s", "localhost",
"--volfile-id", volname,
+ "--use-readdirp=no",
+ "--client-pid", QUOTA_CRAWL_PID,
"-l", DEFAULT_LOG_FILE_DIRECTORY"/quota-crawl.log",
mountdir, NULL);
@@ -292,7 +214,19 @@ glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname)
exit (EXIT_FAILURE);
}
runinit (&runner);
- runner_add_args (&runner, "/usr/bin/find", "find", ".", NULL);
+
+ if (type == GF_QUOTA_OPTION_TYPE_ENABLE) {
+ /* NOTE(review): "find" here looks like an extra start path rather than argv[0]; the DISABLE branch omits it - confirm */
+ runner_add_args (&runner, "/usr/bin/find", "find", ".",
+ NULL);
+ } else if (type == GF_QUOTA_OPTION_TYPE_DISABLE) {
+ /* find's -exec ends at a bare ";" argument; the arguments are not shell-parsed, so a "\\" would reach setfattr as a bogus path */
+ runner_add_args (&runner, "/usr/bin/find", ".",
+ "-exec", "/usr/bin/setfattr", "-n",
+ VIRTUAL_QUOTA_XATTR_CLEANUP_KEY, "-v",
+ "1", "{}", ";", NULL);
+ }
+
if (runner_start (&runner) == -1)
_exit (EXIT_FAILURE);
@@ -312,114 +246,39 @@ out:
return ret;
}
-char *
-glusterd_quota_get_limit_value (char *quota_limits, char *path)
-{
- int32_t i, j, k, l, len;
- int32_t pat_len, diff;
- char *ret_str = NULL;
-
- len = strlen (quota_limits);
- pat_len = strlen (path);
- i = 0;
- j = 0;
-
- while (i < len) {
- j = i;
- k = 0;
- while (path [k] == quota_limits [j]) {
- j++;
- k++;
- }
-
- l = j;
-
- while (quota_limits [j] != ',' &&
- quota_limits [j] != '\0')
- j++;
-
- if (quota_limits [l] == ':' && pat_len == (l - i)) {
- diff = j - i;
- ret_str = GF_CALLOC (diff + 1, sizeof (char),
- gf_gld_mt_char);
-
- strncpy (ret_str, &quota_limits [i], diff);
-
- break;
- }
- i = ++j; //skip ','
- }
-
- return ret_str;
-}
-
-char*
-_glusterd_quota_get_limit_usages (glusterd_volinfo_t *volinfo,
- char *path, char **op_errstr)
-{
- int32_t ret = 0;
- char *quota_limits = NULL;
- char *ret_str = NULL;
-
- if (volinfo == NULL)
- return NULL;
-
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
- if (ret)
- return NULL;
- if (quota_limits == NULL) {
- ret_str = NULL;
- *op_errstr = gf_strdup ("Limit not set on any directory");
- } else if (path == NULL)
- ret_str = gf_strdup (quota_limits);
- else
- ret_str = glusterd_quota_get_limit_value (quota_limits, path);
-
- return ret_str;
-}
-
int32_t
-glusterd_quota_get_limit_usages (glusterd_conf_t *priv,
- glusterd_volinfo_t *volinfo,
- char *volname,
- dict_t *dict,
- char **op_errstr,
- dict_t *rsp_dict)
+glusterd_quota_get_default_soft_limit (glusterd_volinfo_t *volinfo,
+ dict_t *rsp_dict)
{
- int32_t i = 0;
- int32_t ret = 0;
- int32_t count = 0;
- char *path = NULL;
- char cmd_str [1024] = {0, };
- char *ret_str = NULL;
+ int32_t ret = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char *default_limit = NULL;
+ char *val = NULL;
if (rsp_dict == NULL)
- return 0;
-
- ret = dict_get_int32 (dict, "count", &count);
- if (ret < 0)
- goto out;
+ return -1;
- if (count == 0) {
- ret_str = _glusterd_quota_get_limit_usages (volinfo, NULL,
- op_errstr);
- } else {
- i = 0;
- while (count--) {
- snprintf (cmd_str, 1024, "path%d", i++);
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
- ret = dict_get_str (dict, cmd_str, &path);
- if (ret < 0)
- goto out;
+ ret = glusterd_volinfo_get (volinfo, "features.default-soft-limit",
+ &default_limit);
+ if (default_limit)
+ val = gf_strdup (default_limit);
+ else
+ val = gf_strdup ("80%");
- ret_str = _glusterd_quota_get_limit_usages (volinfo, path, op_errstr);
- }
+ ret = dict_set_dynstr (rsp_dict, "default-soft-limit", val);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set default "
+ "soft-limit into dict");
+ goto out;
}
+ ret = 0;
- if (ret_str) {
- ret = dict_set_dynstr (rsp_dict, "limit_list", ret_str);
- }
out:
return ret;
}
@@ -430,54 +289,79 @@ glusterd_quota_enable (glusterd_volinfo_t *volinfo, char **op_errstr,
{
int32_t ret = -1;
char *quota_status = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", crawl, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, crawl, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
if (glusterd_is_volume_started (volinfo) == 0) {
*op_errstr = gf_strdup ("Volume is stopped, start volume "
"to enable quota.");
+ ret = -1;
goto out;
}
ret = glusterd_check_if_quota_trans_enabled (volinfo);
if (ret == 0) {
*op_errstr = gf_strdup ("Quota is already enabled");
+ ret = -1;
goto out;
}
quota_status = gf_strdup ("on");
if (!quota_status) {
- gf_log ("", GF_LOG_ERROR, "memory allocation failed");
- *op_errstr = gf_strdup ("Enabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "memory allocation failed");
+ ret = -1;
goto out;
}
- ret = dict_set_dynstr (volinfo->dict, VKEY_FEATURES_QUOTA, quota_status);
+ ret = dict_set_dynstr (volinfo->dict, VKEY_FEATURES_QUOTA,
+ quota_status);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "dict set failed");
- *op_errstr = gf_strdup ("Enabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "dict set failed");
goto out;
}
- *op_errstr = gf_strdup ("Enabling quota has been successful");
-
*crawl = _gf_true;
+ ret = glusterd_store_quota_config (volinfo, NULL, NULL,
+ GF_QUOTA_OPTION_TYPE_ENABLE,
+ op_errstr);
+
ret = 0;
out:
+ if (ret && op_errstr && !*op_errstr)
+ gf_asprintf (op_errstr, "Enabling quota on volume %s has been "
+ "unsuccessful", volinfo->volname);
return ret;
}
int32_t
-glusterd_quota_disable (glusterd_volinfo_t *volinfo, char **op_errstr)
+glusterd_quota_disable (glusterd_volinfo_t *volinfo, char **op_errstr,
+ gf_boolean_t *crawl)
{
- int32_t ret = -1;
- char *quota_status = NULL, *quota_limits = NULL;
+ int32_t ret = -1;
+ int i = 0;
+ char *quota_status = NULL;
+ char *value = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char *quota_options[] = {"features.soft-timeout",
+ "features.hard-timeout",
+ "features.alert-time",
+ "features.default-soft-limit", NULL};
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
ret = glusterd_check_if_quota_trans_enabled (volinfo);
if (ret == -1) {
@@ -487,190 +371,637 @@ glusterd_quota_disable (glusterd_volinfo_t *volinfo, char **op_errstr)
quota_status = gf_strdup ("off");
if (!quota_status) {
- gf_log ("", GF_LOG_ERROR, "memory allocation failed");
- *op_errstr = gf_strdup ("Disabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "memory allocation failed");
+ ret = -1;
goto out;
}
ret = dict_set_dynstr (volinfo->dict, VKEY_FEATURES_QUOTA, quota_status);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "dict set failed");
- *op_errstr = gf_strdup ("Disabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "dict set failed");
goto out;
}
- *op_errstr = gf_strdup ("Disabling quota has been successful");
+ for (i = 0; quota_options [i]; i++) {
+ ret = glusterd_volinfo_get (volinfo, quota_options[i], &value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO, "failed to get option"
+ " %s",
+ quota_options[i]);
+ } else {
+ dict_del (volinfo->dict, quota_options[i]);
+ }
+ }
+
+ //Remove aux mount of the volume on every node in the cluster
+ ret = glusterd_remove_auxiliary_mount (volinfo->volname);
+ if (ret)
+ goto out;
+
+ *crawl = _gf_true;
+
+ (void) glusterd_clean_up_quota_store (volinfo);
+
+ ret = 0;
+out:
+ if (ret && op_errstr && !*op_errstr)
+ gf_asprintf (op_errstr, "Disabling quota on volume %s has been "
+ "unsuccessful", volinfo->volname);
+ return ret;
+}
+
+
+static int
+glusterd_set_quota_limit (char *volname, char *path, char *hard_limit,
+ char *soft_limit, char **op_errstr)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ char abspath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+ double soft_lim = 0;
+
+ typedef struct quota_limits {
+ int64_t hl;
+ int64_t sl;
+ } __attribute__ ((__packed__)) quota_limits_t;
+
+ quota_limits_t existing_limit = {0,};
+ quota_limits_t new_limit = {0,};
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
+ GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (abspath, volname, path);
+ ret = gf_lstat_dir (abspath, NULL);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "failed to get the quota limits");
+ gf_asprintf (op_errstr, "Failed to find the directory %s. "
+ "Reason : %s", abspath, strerror (errno));
+ goto out;
+ }
+
+ if (!soft_limit) {
+ ret = sys_lgetxattr (abspath,
+ "trusted.glusterfs.quota.limit-set",
+ (void *)&existing_limit,
+ sizeof (existing_limit));
+ if (ret < 0) {
+ switch (errno) {
+ case ENOATTR:
+ existing_limit.sl = -1;
+ break;
+ default:
+ gf_asprintf (op_errstr, "Failed to get the xattr "
+ "'trusted.glusterfs.quota.limit-set' from "
+ "%s. Reason : %s", abspath,
+ strerror (errno));
+ goto out;
+ }
+ } else {
+ existing_limit.hl = ntoh64 (existing_limit.hl);
+ existing_limit.sl = ntoh64 (existing_limit.sl);
+ }
+ new_limit.sl = existing_limit.sl;
+
} else {
- GF_FREE (quota_limits);
+ ret = gf_string2percent (soft_limit, &soft_lim);
+ if (ret)
+ goto out;
+ new_limit.sl = soft_lim;
}
- dict_del (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE);
+ new_limit.sl = hton64 (new_limit.sl);
+
+ ret = gf_string2bytesize_uint64 (hard_limit, (uint64_t*)&new_limit.hl);
+ if (ret)
+ goto out;
+
+ new_limit.hl = hton64 (new_limit.hl);
+
+ ret = sys_lsetxattr (abspath, "trusted.glusterfs.quota.limit-set",
+ (char *)(void *)&new_limit, sizeof (new_limit), 0);
+ if (ret) {
+ gf_asprintf (op_errstr, "setxattr of "
+ "'trusted.glusterfs.quota.limit-set' failed on %s."
+ " Reason : %s", abspath, strerror (errno));
+ goto out;
+ }
+ ret = 0;
out:
return ret;
}
-int32_t
-glusterd_quota_limit_usage (glusterd_volinfo_t *volinfo, dict_t *dict, char **op_errstr)
+static int
+glusterd_update_quota_conf_version (glusterd_volinfo_t *volinfo)
+{
+ volinfo->quota_conf_version++;
+ return 0;
+}
+
+/*The function glusterd_find_gfid_match () does the following:
+ * Given a buffer of gfids, the number of bytes read and the key gfid that needs
+ * to be found, the function compares 16 bytes at a time from @buf against
+ * @gfid.
+ *
+ * What happens when the match is found:
+ * i. If the function was called as part of 'limit-usage' operation, the call
+ * returns with write_byte_count = bytes_read
+ *ii. If the function was called as part of 'quota remove' operation, @buf
+ * is modified in memory such that the match is deleted from the buffer, and
+ * also @write_byte_count is set to original buf size minus the sixteen bytes
+ * that were deleted as part of 'remove'.
+ *
+ * What happens when the match is not found in the current buffer:
+ * The function returns with write_byte_count = bytes_read, which means to say
+ * that the caller of this function must write the entire buffer to the tmp file
+ * and continue the search.
+ */
+static gf_boolean_t
+glusterd_find_gfid_match (uuid_t gfid, unsigned char *buf, size_t bytes_read,
+ int opcode, size_t *write_byte_count)
+{
+ int gfid_index = 0;
+ int shift_count = 0;
+ unsigned char tmp_buf[17] = {0,};
+
+ while (gfid_index != bytes_read) {
+ memcpy ((void *)tmp_buf, (void *)&buf[gfid_index], 16);
+ if (!uuid_compare (gfid, tmp_buf)) {
+ if (opcode == GF_QUOTA_OPTION_TYPE_REMOVE) {
+ shift_count = bytes_read - (gfid_index + 16);
+ memmove ((void *)&buf[gfid_index],
+ (void *)&buf[gfid_index+16],
+ shift_count);
+ *write_byte_count = bytes_read - 16;
+ } else {
+ *write_byte_count = bytes_read;
+ }
+ return _gf_true;
+ } else {
+ gfid_index+=16;
+ }
+ }
+ if (gfid_index == bytes_read)
+ *write_byte_count = bytes_read;
+
+ return _gf_false;
+}
+
+/* The function glusterd_copy_to_tmp_file() reads the "remaining" bytes from
+ * the source fd and writes them to destination fd, at the rate of 128K bytes
+ * of read+write at a time.
+ */
+
+static int
+glusterd_copy_to_tmp_file (int src_fd, int dst_fd)
{
- int32_t ret = -1;
- char *path = NULL;
- char *limit = NULL;
- char *value = NULL;
- char msg [1024] = {0,};
- char *quota_limits = NULL;
+ int ret = 0;
+ size_t entry_sz = 131072;
+ ssize_t bytes_read = 0;
+ unsigned char buf[131072] = {0,};
+ xlator_t *this = NULL;
- GF_VALIDATE_OR_GOTO ("glusterd", dict, out);
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ this = THIS;
+ GF_ASSERT (this);
- ret = glusterd_check_if_quota_trans_enabled (volinfo);
- if (ret == -1) {
- *op_errstr = gf_strdup ("Quota is disabled, please enable "
- "quota");
+ while ((bytes_read = read (src_fd, (void *)&buf, entry_sz)) > 0) {
+ if (bytes_read % 16 != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "quota.conf "
+ "corrupted");
+ ret = -1;
+ goto out;
+ }
+ ret = write (dst_fd, (void *) buf, bytes_read);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "write into quota.conf failed. Reason : %s",
+ strerror (errno));
+ goto out;
+ }
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int
+glusterd_store_quota_config (glusterd_volinfo_t *volinfo, char *path,
+ char *gfid_str, int opcode, char **op_errstr)
+{
+ int ret = -1;
+ int fd = -1;
+ int conf_fd = -1;
+ size_t entry_sz = 131072;
+ ssize_t bytes_read = 0;
+ size_t bytes_to_write = 0;
+ unsigned char buf[131072] = {0,};
+ uuid_t gfid = {0,};
+ xlator_t *this = NULL;
+ gf_boolean_t found = _gf_false;
+ gf_boolean_t modified = _gf_false;
+ gf_boolean_t is_file_empty = _gf_false;
+ gf_boolean_t is_first_read = _gf_true;
+ glusterd_conf_t *conf = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ glusterd_store_create_quota_conf_sh_on_absence (volinfo);
+
+ fd = gf_store_mkstemp (volinfo->quota_conf_shandle);
+ if (fd < 0) {
+ ret = -1;
goto out;
}
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "failed to get the quota limits");
- *op_errstr = gf_strdup ("failed to set limit");
+ conf_fd = open (volinfo->quota_conf_shandle->path, O_RDONLY);
+ if (conf_fd == -1) {
+ ret = -1;
goto out;
}
- ret = dict_get_str (dict, "path", &path);
+ ret = glusterd_store_quota_conf_skip_header (this, conf_fd);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch quota limits" );
- *op_errstr = gf_strdup ("failed to set limit");
goto out;
}
- ret = dict_get_str (dict, "limit", &limit);
+ ret = glusterd_store_quota_conf_stamp_header (this, fd);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch quota limits" );
- *op_errstr = gf_strdup ("failed to set limit");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to add header to tmp "
+ "file.");
goto out;
}
- if (quota_limits) {
- ret = _glusterd_quota_remove_limits (&quota_limits, path, NULL);
- if (ret == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
- *op_errstr = gf_strdup ("failed to set limit");
+ /* Just create an empty quota.conf file if the operation is 'enable' */
+ if (GF_QUOTA_OPTION_TYPE_ENABLE == opcode) {
+ modified = _gf_true;
+ goto out;
+ }
+
+ /* Check if gfid_str is given for opts other than ENABLE */
+ if (!gfid_str) {
+ ret = -1;
+ goto out;
+ }
+ uuid_parse (gfid_str, gfid);
+
+ for (;;) {
+ bytes_read = read (conf_fd, (void*)&buf, entry_sz);
+ if (bytes_read <= 0) {
+ /*The flag @is_first_read is TRUE when the loop is
+ * entered, and is set to false if the first read
+ * reads non-zero bytes of data. The flag is used to
+ * detect if quota.conf is an empty file, but for the
+ * header. This is done to log appropriate error message
+ * when 'quota remove' is attempted when there are no
+ * limits set on the given volume.
+ */
+ if (is_first_read)
+ is_file_empty = _gf_true;
+ break;
+ }
+ if ((bytes_read % 16) != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "quota.conf "
+ "corrupted");
+ ret = -1;
goto out;
}
- }
+ found = glusterd_find_gfid_match (gfid, buf, bytes_read, opcode,
+ &bytes_to_write);
- if (quota_limits == NULL) {
- ret = gf_asprintf (&value, "%s:%s", path, limit);
+ ret = write (fd, (void *) buf, bytes_to_write);
if (ret == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
- *op_errstr = gf_strdup ("failed to set limit");
+ gf_log (this->name, GF_LOG_ERROR,
+ "write into quota.conf failed. Reason : %s",
+ strerror (errno));
goto out;
}
- } else {
- ret = gf_asprintf (&value, "%s,%s:%s",
- quota_limits, path, limit);
- if (ret == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
- *op_errstr = gf_strdup ("failed to set limit");
+
+ /*If the match is found in this iteration, copy the rest of
+ * quota.conf into quota.conf.tmp and break.
+ * Else continue with the search.
+ */
+ if (found) {
+ ret = glusterd_copy_to_tmp_file (conf_fd, fd);
+ if (ret)
+ goto out;
+ break;
+ }
+ is_first_read = _gf_false;
+ }
+
+ switch (opcode) {
+ case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+ if (!found) {
+ ret = write (fd, gfid, 16);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "write into quota.conf failed. "
+ "Reason : %s",
+ strerror (errno));
+ goto out;
+ }
+ modified = _gf_true;
+ }
+ break;
+
+ case GF_QUOTA_OPTION_TYPE_REMOVE:
+ if (is_file_empty) {
+ gf_asprintf (op_errstr, "Cannot remove limit on"
+ " %s. The quota configuration file"
+ " for volume %s is empty.", path,
+ volinfo->volname);
+ ret = -1;
+ goto out;
+ } else {
+ if (!found) {
+ gf_asprintf (op_errstr, "Error. gfid %s"
+ " for path %s not found in"
+ " store", gfid_str, path);
+ ret = -1;
+ goto out;
+ } else {
+ modified = _gf_true;
+ }
+ }
+ break;
+
+ default:
+ ret = 0;
+ break;
+ }
+
+ if (modified)
+ glusterd_update_quota_conf_version (volinfo);
+
+ ret = 0;
+out:
+ if (conf_fd != -1) {
+ close (conf_fd);
+ }
+
+ if (fd != -1) {
+ close (fd);
+ }
+
+ if (ret && (fd > 0)) {
+ gf_store_unlink_tmppath (volinfo->quota_conf_shandle);
+ } else if (!ret) {
+ ret = gf_store_rename_tmppath (volinfo->quota_conf_shandle);
+ if (modified) {
+ ret = glusterd_compute_cksum (volinfo, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "compute cksum for quota conf file");
+ goto out;
+ }
+
+ ret = glusterd_store_save_quota_version_and_cksum
+ (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "store quota version and cksum");
+ goto out;
+ }
+ }
+ }
+
+ return ret;
+}
+
+int32_t
+glusterd_quota_limit_usage (glusterd_volinfo_t *volinfo, dict_t *dict,
+ int opcode, char **op_errstr)
+{
+ int32_t ret = -1;
+ char *path = NULL;
+ char *hard_limit = NULL;
+ char *soft_limit = NULL;
+ char *gfid_str = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
+
+ ret = glusterd_check_if_quota_trans_enabled (volinfo);
+ if (ret == -1) {
+ *op_errstr = gf_strdup ("Quota is disabled, please enable "
+ "quota");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "path", &path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch path");
+ goto out;
+ }
+ ret = gf_canonicalize_path (path);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, "hard-limit", &hard_limit);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch hard limit");
+ goto out;
+ }
+
+ if (dict_get (dict, "soft-limit")) {
+ ret = dict_get_str (dict, "soft-limit", &soft_limit);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch "
+ "soft limit");
goto out;
}
+ }
+
+ if (is_origin_glusterd (dict)) {
+ ret = glusterd_set_quota_limit (volinfo->volname, path,
+ hard_limit, soft_limit,
+ op_errstr);
+ if (ret)
+ goto out;
+ }
- GF_FREE (quota_limits);
+ ret = dict_get_str (dict, "gfid", &gfid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get gfid of path "
+ "%s", path);
+ goto out;
}
- quota_limits = value;
+ ret = glusterd_store_quota_config (volinfo, path, gfid_str, opcode,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ ret = 0;
+out:
+
+ if (ret && op_errstr && !*op_errstr)
+ gf_asprintf (op_errstr, "Failed to set hard limit on path %s "
+ "for volume %s", path, volinfo->volname);
+ return ret;
+}
+
+static int
+glusterd_remove_quota_limit (char *volname, char *path, char **op_errstr)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ char abspath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
- ret = dict_set_str (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE,
- quota_limits);
+ GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (abspath, volname, path);
+ ret = gf_lstat_dir (abspath, NULL);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to set quota limits" );
- *op_errstr = gf_strdup ("failed to set limit");
+ gf_asprintf (op_errstr, "Failed to find the directory %s. "
+ "Reason : %s", abspath, strerror (errno));
goto out;
}
- snprintf (msg, 1024, "limit set on %s", path);
- *op_errstr = gf_strdup (msg);
+ ret = sys_lremovexattr (abspath, "trusted.glusterfs.quota.limit-set");
+ if (ret) {
+ gf_asprintf (op_errstr, "removexattr failed on %s. Reason : %s",
+ abspath, strerror (errno));
+ goto out;
+ }
ret = 0;
+
out:
return ret;
}
int32_t
-glusterd_quota_remove_limits (glusterd_volinfo_t *volinfo, dict_t *dict, char **op_errstr)
+glusterd_quota_remove_limits (glusterd_volinfo_t *volinfo, dict_t *dict,
+ int opcode, char **op_errstr)
{
int32_t ret = -1;
- char str [PATH_MAX + 1024] = {0,};
- char *quota_limits = NULL;
char *path = NULL;
- gf_boolean_t flag = _gf_false;
+ char *gfid_str = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
- GF_VALIDATE_OR_GOTO ("glusterd", dict, out);
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
ret = glusterd_check_if_quota_trans_enabled (volinfo);
if (ret == -1) {
- *op_errstr = gf_strdup ("Quota is disabled, please enable quota");
+ *op_errstr = gf_strdup ("Quota is disabled, please enable "
+ "quota");
goto out;
}
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
+ ret = dict_get_str (dict, "path", &path);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "failed to get the quota limits");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch path");
goto out;
}
- ret = dict_get_str (dict, "path", &path);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch quota limits" );
+ ret = gf_canonicalize_path (path);
+ if (ret)
goto out;
+
+ if (is_origin_glusterd (dict)) {
+ ret = glusterd_remove_quota_limit (volinfo->volname, path,
+ op_errstr);
+ if (ret)
+ goto out;
}
- ret = _glusterd_quota_remove_limits (&quota_limits, path, &flag);
- if (ret == -1) {
- if (flag == _gf_true)
- snprintf (str, sizeof (str), "Removing limit on %s has "
- "been unsuccessful", path);
- else
- snprintf (str, sizeof (str), "%s has no limit set", path);
- *op_errstr = gf_strdup (str);
+ ret = dict_get_str (dict, "gfid", &gfid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get gfid of path "
+ "%s", path);
goto out;
- } else {
- if (flag == _gf_true)
- snprintf (str, sizeof (str), "Removed quota limit on "
- "%s", path);
- else
- snprintf (str, sizeof (str), "no limit set on %s",
- path);
- *op_errstr = gf_strdup (str);
- }
-
- if (quota_limits) {
- ret = dict_set_str (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE,
- quota_limits);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to set quota limits" );
- goto out;
- }
- } else {
- dict_del (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE);
}
+ ret = glusterd_store_quota_config (volinfo, path, gfid_str, opcode,
+ op_errstr);
+ if (ret)
+ goto out;
+
+
ret = 0;
out:
return ret;
}
+int
+glusterd_set_quota_option (glusterd_volinfo_t *volinfo, dict_t *dict,
+ char *key, char **op_errstr)
+{
+ int ret = 0;
+ char *value = NULL;
+ xlator_t *this = NULL;
+ char *option = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = glusterd_check_if_quota_trans_enabled (volinfo);
+ if (ret == -1) {
+ gf_asprintf (op_errstr, "Cannot set %s. Quota on volume %s is "
+ "disabled", key, volinfo->volname);
+ return -1;
+ }
+
+ ret = dict_get_str (dict, "value", &value);
+ if(ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Option value absent.");
+ return -1;
+ }
+
+ option = gf_strdup (value);
+ ret = dict_set_dynstr (volinfo->dict, key, option);
+ if(ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set option %s",
+ key);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+glusterd_quotad_op (int opcode)
+{
+ int ret = -1;
+
+ switch (opcode) {
+ case GF_QUOTA_OPTION_TYPE_ENABLE:
+ case GF_QUOTA_OPTION_TYPE_DISABLE:
+
+ if (glusterd_all_volumes_with_quota_stopped ())
+ ret = glusterd_quotad_stop ();
+ else
+ ret = glusterd_check_generate_start_quotad ();
+ break;
+
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
int
glusterd_op_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
@@ -681,76 +1012,127 @@ glusterd_op_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
int type = -1;
gf_boolean_t start_crawl = _gf_false;
glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (dict);
GF_ASSERT (op_errstr);
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name " );
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
+ gf_asprintf (op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
goto out;
}
ret = dict_get_int32 (dict, "type", &type);
- if (type == GF_QUOTA_OPTION_TYPE_ENABLE) {
- ret = glusterd_quota_enable (volinfo, op_errstr, &start_crawl);
- if (ret < 0)
- goto out;
-
- goto create_vol;
+ if ((priv->op_version == GD_OP_VERSION_MIN) &&
+ (type > GF_QUOTA_OPTION_TYPE_VERSION)) {
+ gf_asprintf (op_errstr, "Volume quota failed. The cluster is "
+ "operating at version %d. Quota command"
+ " %s is unavailable in this version.",
+ priv->op_version,
+ gd_quota_op_list[type]);
+ ret = -1;
+ goto out;
}
- if (type == GF_QUOTA_OPTION_TYPE_DISABLE) {
- ret = glusterd_quota_disable (volinfo, op_errstr);
- if (ret < 0)
- goto out;
+ switch (type) {
+ case GF_QUOTA_OPTION_TYPE_ENABLE:
+ ret = glusterd_quota_enable (volinfo, op_errstr,
+ &start_crawl);
+ if (ret < 0)
+ goto out;
+ break;
- goto create_vol;
- }
+ case GF_QUOTA_OPTION_TYPE_DISABLE:
+ ret = glusterd_quota_disable (volinfo, op_errstr,
+ &start_crawl);
+ if (ret < 0)
+ goto out;
+
+ break;
- if (type == GF_QUOTA_OPTION_TYPE_LIMIT_USAGE) {
- ret = glusterd_quota_limit_usage (volinfo, dict, op_errstr);
- if (ret < 0)
+ case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+ ret = glusterd_quota_limit_usage (volinfo, dict, type,
+ op_errstr);
goto out;
- goto create_vol;
- }
+ case GF_QUOTA_OPTION_TYPE_REMOVE:
+ ret = glusterd_quota_remove_limits (volinfo, dict, type,
+ op_errstr);
+ goto out;
- if (type == GF_QUOTA_OPTION_TYPE_REMOVE) {
- ret = glusterd_quota_remove_limits (volinfo, dict, op_errstr);
- if (ret < 0)
+ case GF_QUOTA_OPTION_TYPE_LIST:
+ ret = glusterd_check_if_quota_trans_enabled (volinfo);
+ if (ret == -1) {
+ *op_errstr = gf_strdup ("Cannot list limits, "
+ "quota is disabled");
+ goto out;
+ }
+ ret = glusterd_quota_get_default_soft_limit (volinfo,
+ rsp_dict);
goto out;
- goto create_vol;
- }
+ case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.soft-timeout",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
- if (type == GF_QUOTA_OPTION_TYPE_LIST) {
- ret = glusterd_check_if_quota_trans_enabled (volinfo);
- if (ret == -1) {
- *op_errstr = gf_strdup ("cannot list the limits, "
- "quota is disabled");
- goto out;
- }
+ case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.hard-timeout",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
- ret = glusterd_quota_get_limit_usages (priv, volinfo, volname,
- dict, op_errstr, rsp_dict);
+ case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.alert-time",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
- goto out;
+ case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.default-soft-limit",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
+
+ default:
+ gf_asprintf (op_errstr, "Quota command failed. Invalid "
+ "opcode");
+ ret = -1;
+ goto out;
}
-create_vol:
+
+ if (priv->op_version > GD_OP_VERSION_MIN) {
+ ret = glusterd_quotad_op (type);
+ if (ret)
+ goto out;
+ }
+
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to re-create volfile for"
- " 'quota'");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to re-create "
+ "volfiles");
ret = -1;
goto out;
}
@@ -759,81 +1141,311 @@ create_vol:
if (ret)
goto out;
- if (GLUSTERD_STATUS_STARTED == volinfo->status)
- ret = glusterd_check_generate_start_nfs ();
+ if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+ if (priv->op_version == GD_OP_VERSION_MIN)
+ ret = glusterd_check_generate_start_nfs ();
+ }
- ret = 0;
+ if (rsp_dict && start_crawl == _gf_true)
+ glusterd_quota_initiate_fs_crawl (priv, volname, type);
+ ret = 0;
out:
- if (rsp_dict && start_crawl == _gf_true)
- glusterd_quota_initiate_fs_crawl (priv, volname);
+ return ret;
+}
+
+/*
+ * glusterd_get_gfid_from_brick() fetches the 'trusted.gfid' attribute of @path
+ * from each brick in the backend and places the same in the rsp_dict with the
+ * keys being gfid0, gfid1, gfid2 and so on. The absence of @path in the backend
+ * is not treated as error.
+ */
+static int
+glusterd_get_gfid_from_brick (dict_t *dict, glusterd_volinfo_t *volinfo,
+ dict_t *rsp_dict, char **op_errstr)
+{
+ int ret = -1;
+ int count = 0;
+ char *path = NULL;
+ char backend_path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char key[256] = {0,};
+ char *gfid_str = NULL;
+ uuid_t gfid;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_str (dict, "path", &path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get path");
+ goto out;
+ }
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ ret = glusterd_resolve_brick (brickinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
+ brickinfo->hostname, brickinfo->path);
+ goto out;
+ }
+
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ if (brickinfo->vg[0])
+ continue;
+
+ snprintf (backend_path, sizeof (backend_path), "%s%s",
+ brickinfo->path, path);
+
+ ret = gf_lstat_dir (backend_path, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO, "Failed to find "
+ "directory %s. Reason : %s", backend_path,
+ strerror (errno));
+ ret = 0;
+ continue;
+ }
+ ret = sys_lgetxattr (backend_path, GFID_XATTR_KEY, gfid, 16);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_INFO, "Failed to get "
+ "extended attribute %s for directory %s. "
+ "Reason : %s", GFID_XATTR_KEY, backend_path,
+ strerror (errno));
+ ret = 0;
+ continue;
+ }
+ snprintf (key, sizeof (key), "gfid%d", count);
+
+ gfid_str = gf_strdup (uuid_utoa (gfid));
+ if (!gfid_str) {
+ ret = -1;
+ goto out;
+ }
- if (rsp_dict && *op_errstr) {
- ret = dict_set_dynstr (rsp_dict, "errstr", *op_errstr);
+ ret = dict_set_dynstr (rsp_dict, key, gfid_str);
if (ret) {
- GF_FREE (*op_errstr);
- gf_log ("", GF_LOG_DEBUG,
- "failed to set error message in ctx");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to place "
+ "gfid of %s in dict", backend_path);
+ GF_FREE (gfid_str);
+ goto out;
}
- *op_errstr = NULL;
+ count++;
+ }
+
+ ret = dict_set_int32 (rsp_dict, "count", count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set count");
+ goto out;
}
+ ret = 0;
+out:
return ret;
}
+static int
+_glusterd_validate_quota_opts (dict_t *dict, int type, char **errstr)
+{
+ int ret = -1;
+ xlator_t *this = THIS;
+ void *quota_xl = NULL;
+ volume_opt_list_t opt_list = {{0},};
+ volume_option_t *opt = NULL;
+ char *key = NULL;
+ char *value = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (this);
+
+ ret = xlator_volopt_dynload ("features/quota", &quota_xl, &opt_list);
+ if (ret)
+ goto out;
+
+ switch (type) {
+ case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+ case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+ key = (char *)gd_quota_op_list[type];
+ break;
+ default:
+ ret = -1;
+ goto out;
+ }
+
+ opt = xlator_volume_option_get_list (&opt_list, key);
+ if (!opt) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Unknown option: %s", key);
+ goto out;
+ }
+ ret = dict_get_str (dict, "value", &value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Value not found for key %s",
+ key);
+ goto out;
+ }
+
+ ret = xlator_option_validate (this, key, value, opt, errstr);
+
+out:
+ if (quota_xl) {
+ dlclose (quota_xl);
+ quota_xl = NULL;
+ }
+ return ret;
+}
int
-glusterd_op_stage_quota (dict_t *dict, char **op_errstr)
+glusterd_op_stage_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
- int ret = 0;
- char *volname = NULL;
- gf_boolean_t exists = _gf_false;
- int type = 0;
- dict_t *ctx = NULL;
+ int ret = 0;
+ char *volname = NULL;
+ gf_boolean_t exists = _gf_false;
+ int type = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *hard_limit_str = NULL;
+ uint64_t hard_limit = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (dict);
GF_ASSERT (op_errstr);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
exists = glusterd_check_volume_exists (volname);
if (!exists) {
- gf_log ("", GF_LOG_ERROR, "Volume with name: %s "
- "does not exist",
- volname);
- *op_errstr = gf_strdup ("Invalid volume name");
+ gf_asprintf (op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+ ret = -1;
+ goto out;
+ }
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_asprintf (op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+ goto out;
+ }
+
+ if (!glusterd_is_volume_started (volinfo)) {
+ *op_errstr = gf_strdup ("Volume is stopped, start volume "
+ "before executing quota command.");
ret = -1;
goto out;
}
ret = dict_get_int32 (dict, "type", &type);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get 'type' for quota op");
- *op_errstr = gf_strdup ("Volume quota failed, internal error "
- ", unable to get type of operation");
+ *op_errstr = gf_strdup ("Volume quota failed, internal error, "
+ "unable to get type of operation");
+ goto out;
+ }
+
+ if ((!glusterd_is_volume_quota_enabled (volinfo)) &&
+ (type != GF_QUOTA_OPTION_TYPE_ENABLE)) {
+ *op_errstr = gf_strdup ("Quota is disabled, please enable "
+ "quota");
+ ret = -1;
goto out;
}
+ if ((priv->op_version == GD_OP_VERSION_MIN) &&
+ (type > GF_QUOTA_OPTION_TYPE_VERSION)) {
+ gf_asprintf (op_errstr, "Volume quota failed. The cluster is "
+ "operating at version %d. Quota command"
+ " %s is unavailable in this version.",
+ priv->op_version,
+ gd_quota_op_list[type]);
+ ret = -1;
+ goto out;
+ }
- ctx = glusterd_op_get_ctx();
- if (ctx && (type == GF_QUOTA_OPTION_TYPE_ENABLE
- || type == GF_QUOTA_OPTION_TYPE_LIST)) {
+ if ((GF_QUOTA_OPTION_TYPE_ENABLE != type) &&
+ (glusterd_check_if_quota_trans_enabled (volinfo) != 0)) {
+ ret = -1;
+ gf_asprintf (op_errstr, "Quota is not enabled on volume %s",
+ volname);
+ goto out;
+ }
+
+ switch (type) {
+ case GF_QUOTA_OPTION_TYPE_ENABLE:
+ case GF_QUOTA_OPTION_TYPE_LIST:
/* Fuse mount req. only for enable & list-usage options*/
- if (!glusterd_is_fuse_available ()) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to open /dev/"
- "fuse (%s), quota command failed",
- strerror (errno));
+ if (is_origin_glusterd (dict) &&
+ !glusterd_is_fuse_available ()) {
*op_errstr = gf_strdup ("Fuse unavailable");
ret = -1;
goto out;
}
+ break;
+
+ case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+ ret = dict_get_str (dict, "hard-limit", &hard_limit_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Faild to get hard-limit from dict");
+ goto out;
+ }
+ ret = gf_string2bytesize_uint64 (hard_limit_str, &hard_limit);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to convert hard-limit string to value");
+ goto out;
+ }
+ if (hard_limit > UINT64_MAX) {
+ ret = -1;
+ ret = gf_asprintf (op_errstr, "Hard-limit %s is greater"
+ " than %"PRId64"bytes. Please set a "
+ "smaller limit.", hard_limit_str,
+ INT64_MAX);
+ gf_log (this->name, GF_LOG_ERROR, "hard-limit %s "
+ "greater than INT64_MAX", hard_limit_str);
+ goto out;
+ }
+ /*The break statement is missing here to allow intentional fall
+ * through of code execution to the next switch case
+ */
+
+ case GF_QUOTA_OPTION_TYPE_REMOVE:
+ ret = glusterd_get_gfid_from_brick (dict, volinfo, rsp_dict,
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
+
+ case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+ case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+ ret = _glusterd_validate_quota_opts (dict, type, op_errstr);
+ if (ret)
+ goto out;
+ break;
+
+ default:
+ break;
}
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ ret = 0;
- return ret;
+ out:
+ if (ret && op_errstr && *op_errstr)
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
index ebb9e7dd4..bdedf4c04 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -42,12 +42,27 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe);
int
glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
- size_t len)
+ size_t len, glusterd_op_t op)
{
- int ret = -1;
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ /* Check only if operation is not remove-brick */
+ if ((GD_OP_REMOVE_BRICK != op) &&
+ !gd_is_remove_brick_committed (volinfo)) {
+ gf_log (this->name, GF_LOG_DEBUG, "A remove-brick task on "
+ "volume %s is not yet committed", volinfo->volname);
+ snprintf (op_errstr, len, "A remove-brick task on volume %s is"
+ " not yet committed. Either commit or stop the "
+ "remove-brick task.", volinfo->volname);
+ goto out;
+ }
if (glusterd_is_defrag_on (volinfo)) {
- gf_log ("glusterd", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"rebalance on volume %s already started",
volinfo->volname);
snprintf (op_errstr, len, "Rebalance on %s is already started",
@@ -57,7 +72,7 @@ glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
if (glusterd_is_rb_started (volinfo) ||
glusterd_is_rb_paused (volinfo)) {
- gf_log ("glusterd", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Rebalance failed as replace brick is in progress on volume %s",
volinfo->volname);
snprintf (op_errstr, len, "Rebalance failed as replace brick is in progress on "
@@ -66,7 +81,7 @@ glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
}
ret = 0;
out:
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -111,7 +126,7 @@ __glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
UNLOCK (&defrag->lock);
gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_CONNECT",
- rpc->conn.trans->name);
+ rpc->conn.name);
break;
}
@@ -126,20 +141,18 @@ __glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
}
UNLOCK (&defrag->lock);
- if (!glusterd_is_service_running (pidfile, NULL)) {
+ if (!gf_is_service_running (pidfile, NULL)) {
if (volinfo->rebal.defrag_status ==
GF_DEFRAG_STATUS_STARTED) {
volinfo->rebal.defrag_status =
GF_DEFRAG_STATUS_FAILED;
- } else {
- volinfo->rebal.defrag_cmd = 0;
}
}
glusterd_store_perform_node_state_store (volinfo);
if (defrag->rpc) {
- rpc_clnt_unref (defrag->rpc);
+ glusterd_rpc_clnt_unref (priv, defrag->rpc);
defrag->rpc = NULL;
}
if (defrag->cbk_fn)
@@ -148,9 +161,12 @@ __glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
GF_FREE (defrag);
gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_DISCONNECT",
- rpc->conn.trans->name);
+ rpc->conn.name);
break;
}
+ case RPC_CLNT_DESTROY:
+ glusterd_volinfo_unref (volinfo);
+ break;
default:
gf_log ("", GF_LOG_TRACE,
"got some other RPC event %d", event);
@@ -182,7 +198,6 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
char sockfile[PATH_MAX] = {0,};
char pidfile[PATH_MAX] = {0,};
char logfile[PATH_MAX] = {0,};
- dict_t *options = NULL;
char valgrind_logfile[PATH_MAX] = {0,};
priv = THIS->private;
@@ -190,7 +205,7 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
GF_ASSERT (volinfo);
GF_ASSERT (op_errstr);
- ret = glusterd_defrag_start_validate (volinfo, op_errstr, len);
+ ret = glusterd_defrag_start_validate (volinfo, op_errstr, len, op);
if (ret)
goto out;
if (!volinfo->rebal.defrag)
@@ -204,6 +219,7 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
defrag->cmd = cmd;
+ volinfo->rebal.defrag_cmd = cmd;
volinfo->rebal.op = op;
LOCK_INIT (&defrag->lock);
@@ -221,7 +237,7 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
goto out;
}
- GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo, priv);
+ GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo);
GLUSTERD_GET_DEFRAG_PID_FILE (pidfile, volinfo, priv);
snprintf (logfile, PATH_MAX, "%s/%s-rebalance.log",
DEFAULT_LOG_FILE_DIRECTORY, volinfo->volname);
@@ -272,26 +288,10 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
sleep (5);
- /* Setting frame-timeout to 10mins (600seconds).
- * Unix domain sockets ensures that the connection is reliable. The
- * default timeout of 30mins used for unreliable network connections is
- * too long for unix domain socket connections.
- */
- ret = rpc_transport_unix_options_build (&options, sockfile, 600);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Unix options build failed");
- goto out;
- }
-
- synclock_unlock (&priv->big_lock);
- ret = glusterd_rpc_create (&defrag->rpc, options,
- glusterd_defrag_notify, volinfo);
- synclock_lock (&priv->big_lock);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "RPC create failed");
- goto out;
- }
+ ret = glusterd_rebalance_rpc_create (volinfo, _gf_false);
+ //FIXME: this cbk is passed as NULL in all occurrences. May be
+ //we never needed it.
if (cbk)
defrag->cbk_fn = cbk;
@@ -303,28 +303,54 @@ out:
int
glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
- glusterd_conf_t *priv, int cmd)
+ gf_boolean_t reconnect)
{
dict_t *options = NULL;
char sockfile[PATH_MAX] = {0,};
int ret = -1;
- glusterd_defrag_info_t *defrag = NULL;
+ glusterd_defrag_info_t *defrag = volinfo->rebal.defrag;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ struct stat buf = {0,};
- if (!volinfo->rebal.defrag)
- volinfo->rebal.defrag =
- GF_CALLOC (1, sizeof (*volinfo->rebal.defrag),
- gf_gld_mt_defrag_info);
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
- if (!volinfo->rebal.defrag)
+ //rebalance process is not started
+ if (!defrag)
goto out;
- defrag = volinfo->rebal.defrag;
-
- defrag->cmd = cmd;
-
- LOCK_INIT (&defrag->lock);
-
- GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo, priv);
+ //rpc obj for rebalance process already in place.
+ if (defrag->rpc) {
+ ret = 0;
+ goto out;
+ }
+ GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo);
+ /* If reconnecting check if defrag sockfile exists in the new location
+ * in /var/run/ , if it does not try the old location
+ */
+ if (reconnect) {
+ ret = sys_stat (sockfile, &buf);
+ /* TODO: Remove this once we don't need backward compatibility
+ * with the older path
+ */
+ if (ret && (errno == ENOENT)) {
+ gf_log (this->name, GF_LOG_WARNING, "Rebalance sockfile "
+ "%s does not exist. Trying old path.",
+ sockfile);
+ GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD (sockfile, volinfo,
+ priv);
+ ret =sys_stat (sockfile, &buf);
+ if (ret && (ENOENT == errno)) {
+ gf_log (this->name, GF_LOG_ERROR, "Rebalance "
+ "sockfile %s does not exist.",
+ sockfile);
+ goto out;
+ }
+ }
+ }
/* Setting frame-timeout to 10mins (600seconds).
* Unix domain sockets ensures that the connection is reliable. The
@@ -337,6 +363,7 @@ glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
goto out;
}
+ glusterd_volinfo_ref (volinfo);
synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&defrag->rpc, options,
glusterd_defrag_notify, volinfo);
@@ -518,7 +545,7 @@ glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr)
case GF_DEFRAG_CMD_START:
case GF_DEFRAG_CMD_START_LAYOUT_FIX:
case GF_DEFRAG_CMD_START_FORCE:
- if (is_origin_glusterd ()) {
+ if (is_origin_glusterd (dict)) {
op_ctx = glusterd_op_get_ctx ();
if (!op_ctx) {
ret = -1;
@@ -544,8 +571,9 @@ glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr)
ret = 0;
}
}
- ret = glusterd_defrag_start_validate (volinfo,
- msg, sizeof (msg));
+ ret = glusterd_defrag_start_validate (volinfo, msg,
+ sizeof (msg),
+ GD_OP_REBALANCE);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"start validate failed");
@@ -641,6 +669,12 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
case GF_DEFRAG_CMD_START:
case GF_DEFRAG_CMD_START_LAYOUT_FIX:
case GF_DEFRAG_CMD_START_FORCE:
+ /* Reset defrag status to 'NOT STARTED' whenever a
+ * remove-brick/rebalance command is issued to remove
+ * stale information from previous run.
+ */
+ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+
ret = dict_get_str (dict, GF_REBALANCE_TID_KEY, &task_id_str);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG, "Missing rebalance "
@@ -648,15 +682,20 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
ret = 0;
} else {
uuid_parse (task_id_str, volinfo->rebal.rebalance_id) ;
+ volinfo->rebal.op = GD_OP_REBALANCE;
}
+ if (!gd_should_i_start_rebalance (volinfo))
+ break;
ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg),
cmd, NULL, GD_OP_REBALANCE);
- break;
+ break;
case GF_DEFRAG_CMD_STOP:
- /* Clear task-id only on explicitly stopping the
- * rebalance process.
+ /* Clear task-id only on explicitly stopping rebalance.
+ * Also clear the stored operation, so it doesn't cause trouble
+ * with future rebalance/remove-brick starts
*/
uuid_clear (volinfo->rebal.rebalance_id);
+ volinfo->rebal.op = GD_OP_NONE;
/* Fall back to the old volume file in case of decommission*/
list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks,
diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
index c1506033b..ed6d7fd57 100644
--- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
+++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
@@ -31,6 +31,7 @@
DEFAULT_VAR_RUN_DIRECTORY"/%s-"RB_CLIENT_MOUNTPOINT, \
volinfo->volname);
+extern uuid_t global_txn_id;
int
glusterd_get_replace_op_str (gf1_cli_replace_op op, char *op_str)
@@ -268,13 +269,6 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
goto out;
}
- if (volinfo->backend == GD_VOL_BK_BD) {
- snprintf (msg, sizeof (msg), "replace brick not supported "
- "for Block backend volume");
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
if (GLUSTERD_STATUS_STARTED != volinfo->status) {
ret = -1;
snprintf (msg, sizeof (msg), "volume: %s is not started",
@@ -332,7 +326,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
ret = -1;
goto out;
}
- if (is_origin_glusterd ()) {
+ if (is_origin_glusterd (dict)) {
if (!ctx) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR,
@@ -441,7 +435,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
}
}
- if (glusterd_is_local_addr (src_brickinfo->hostname)) {
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
gf_log (this->name, GF_LOG_DEBUG,
"I AM THE SOURCE HOST");
if (src_brickinfo->port && rsp_dict) {
@@ -457,7 +451,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, src_brickinfo,
priv);
if ((replace_op != GF_REPLACE_OP_COMMIT_FORCE) &&
- !glusterd_is_service_running (pidfile, NULL)) {
+ !gf_is_service_running (pidfile, NULL)) {
snprintf(msg, sizeof(msg), "Source brick %s:%s "
"is not online.", src_brickinfo->hostname,
src_brickinfo->path);
@@ -518,7 +512,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
}
if (!glusterd_is_rb_ongoing (volinfo) &&
- glusterd_is_local_addr (host)) {
+ gf_is_local_addr (host)) {
ret = glusterd_validate_and_create_brickpath (dst_brickinfo,
volinfo->volume_id,
op_errstr, is_force);
@@ -526,7 +520,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
goto out;
}
- if (!glusterd_is_local_addr (host)) {
+ if (!gf_is_local_addr (host)) {
ret = glusterd_friend_find (NULL, host, &peerinfo);
if (ret) {
snprintf (msg, sizeof (msg), "%s, is not a friend",
@@ -553,7 +547,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
}
if (replace_op == GF_REPLACE_OP_START &&
- glusterd_is_local_addr (volinfo->rep_brick.dst_brick->hostname)) {
+ gf_is_local_addr (volinfo->rep_brick.dst_brick->hostname)) {
port = pmap_registry_alloc (THIS);
if (!port) {
gf_log (THIS->name, GF_LOG_CRITICAL,
@@ -694,7 +688,7 @@ rb_src_brick_restart (glusterd_volinfo_t *volinfo,
sleep (2);
ret = glusterd_volume_start_glusterfs (volinfo, src_brickinfo,
- _gf_false);
+ _gf_false);
if (ret) {
gf_log ("", GF_LOG_ERROR, "Unable to start "
"glusterfs, ret: %d", ret);
@@ -1414,7 +1408,7 @@ rb_update_srcbrick_port (glusterd_brickinfo_t *src_brickinfo, dict_t *rsp_dict,
if (src_port)
src_brickinfo->port = src_port;
- if (glusterd_is_local_addr (src_brickinfo->hostname)) {
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
gf_log ("", GF_LOG_INFO,
"adding src-brick port no");
@@ -1468,7 +1462,7 @@ rb_update_dstbrick_port (glusterd_brickinfo_t *dst_brickinfo, dict_t *rsp_dict,
dst_brickinfo->port = dst_port;
- if (glusterd_is_local_addr (dst_brickinfo->hostname)) {
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
gf_log ("", GF_LOG_INFO,
"adding dst-brick port no");
@@ -1522,6 +1516,9 @@ glusterd_op_perform_replace_brick (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
+ strncpy (new_brickinfo->brick_id, old_brickinfo->brick_id,
+ sizeof (new_brickinfo->brick_id));
+
list_add_tail (&new_brickinfo->brick_list,
&old_brickinfo->brick_list);
@@ -1638,7 +1635,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
/* Set task-id, if available, in op_ctx dict for operations
* other than start
*/
- if (is_origin_glusterd ()) {
+ if (is_origin_glusterd (dict)) {
ctx = glusterd_op_get_ctx();
if (!ctx) {
gf_log (this->name, GF_LOG_ERROR, "Failed to "
@@ -1676,7 +1673,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
uuid_parse (task_id_str, volinfo->rep_brick.rb_id);
}
- if (glusterd_is_local_addr (dst_brickinfo->hostname)) {
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
gf_log (this->name, GF_LOG_INFO,
"I AM THE DESTINATION HOST");
if (!glusterd_is_rb_paused (volinfo)) {
@@ -1696,7 +1693,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
}
- if (glusterd_is_local_addr (src_brickinfo->hostname)) {
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
ret = rb_src_brick_restart (volinfo, src_brickinfo,
1);
if (ret) {
@@ -1706,7 +1703,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
}
}
- if (glusterd_is_local_addr (dst_brickinfo->hostname)) {
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
gf_log (this->name, GF_LOG_INFO,
"adding dst-brick port no");
@@ -1737,7 +1734,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
/* fall through */
case GF_REPLACE_OP_COMMIT_FORCE:
{
- if (glusterd_is_local_addr (dst_brickinfo->hostname)) {
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
gf_log (this->name, GF_LOG_DEBUG,
"I AM THE DESTINATION HOST");
ret = rb_kill_destination_brick (volinfo,
@@ -1818,7 +1815,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
}
}
- if (glusterd_is_local_addr (src_brickinfo->hostname)) {
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
ret = rb_src_brick_restart (volinfo, src_brickinfo,
0);
if (ret) {
@@ -1829,7 +1826,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
}
}
- if (glusterd_is_local_addr (dst_brickinfo->hostname)) {
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
gf_log (this->name, GF_LOG_INFO,
"I AM THE DESTINATION HOST");
ret = rb_kill_destination_brick (volinfo, dst_brickinfo);
@@ -1901,14 +1898,17 @@ glusterd_do_replace_brick (void *data)
glusterd_brickinfo_t *src_brickinfo = NULL;
glusterd_brickinfo_t *dst_brickinfo = NULL;
glusterd_conf_t *priv = NULL;
+ uuid_t *txn_id = NULL;
int ret = 0;
dict = data;
GF_ASSERT (THIS);
-
priv = THIS->private;
+ GF_ASSERT (priv);
+
+ txn_id = &priv->global_txn_id;
if (priv->timer) {
gf_timer_call_cancel (THIS->ctx, priv->timer);
@@ -1920,6 +1920,10 @@ glusterd_do_replace_brick (void *data)
gf_log ("", GF_LOG_DEBUG,
"Replace brick operation detected");
+ ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
ret = dict_get_int32 (dict, "operation", &op);
if (ret) {
gf_log ("", GF_LOG_DEBUG,
@@ -2015,9 +2019,15 @@ glusterd_do_replace_brick (void *data)
out:
if (ret)
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ txn_id, NULL);
else
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC,
+ txn_id, NULL);
- glusterd_op_sm ();
+ synclock_lock (&priv->big_lock);
+ {
+ glusterd_op_sm ();
+ }
+ synclock_unlock (&priv->big_lock);
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
index 01fb77775..babd5a3be 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
@@ -33,6 +33,7 @@
extern glusterd_op_info_t opinfo;
+extern uuid_t global_txn_id;
int32_t
glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
@@ -84,6 +85,7 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
}
break;
}
+ case GD_OP_GSYNC_CREATE:
case GD_OP_GSYNC_SET:
{
if (ctx) {
@@ -95,13 +97,6 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
break;
}
- case GD_OP_QUOTA:
- {
- if (ctx && !op_errstr) {
- ret = dict_get_str (ctx, "errstr", &errstr);
- }
- break;
- }
case GD_OP_PROFILE_VOLUME:
{
if (ctx && dict_get_int32 (ctx, "count", &count)) {
@@ -141,11 +136,27 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
case GD_OP_LIST_VOLUME:
case GD_OP_CLEARLOCKS_VOLUME:
case GD_OP_HEAL_VOLUME:
- case GD_OP_BD_OP:
+ case GD_OP_QUOTA:
+ case GD_OP_SNAP:
{
/*nothing specific to be done*/
break;
}
+ case GD_OP_COPY_FILE:
+ {
+ if (ctx)
+ ret = dict_get_str (ctx, "errstr", &errstr);
+ break;
+ }
+ case GD_OP_SYS_EXEC:
+ {
+ if (ctx) {
+ ret = dict_get_str (ctx, "errstr", &errstr);
+ ret = dict_set_str (ctx, "glusterd_workdir",
+ conf->workdir);
+ }
+ break;
+ }
}
rsp.op_ret = op_ret;
@@ -233,7 +244,8 @@ __glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov,
glusterd_xfer_cli_probe_resp (ctx->req, rsp.op_ret,
rsp.op_errno,
rsp.op_errstr,
- ctx->hostname, ctx->port);
+ ctx->hostname, ctx->port,
+ ctx->dict);
}
glusterd_destroy_probe_ctx (ctx);
@@ -261,7 +273,8 @@ __glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov,
glusterd_xfer_cli_probe_resp (ctx->req, rsp.op_ret,
rsp.op_errno,
rsp.op_errstr,
- ctx->hostname, ctx->port);
+ ctx->hostname, ctx->port,
+ ctx->dict);
}
glusterd_destroy_probe_ctx (ctx);
@@ -390,7 +403,7 @@ out:
if (ctx->req)//reverse probe doesn't have req
ret = glusterd_xfer_cli_probe_resp (ctx->req, op_ret, op_errno,
NULL, ctx->hostname,
- ctx->port);
+ ctx->port, ctx->dict);
if (!ret) {
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -488,7 +501,7 @@ inject:
respond:
ret = glusterd_xfer_cli_deprobe_resp (ctx->req, op_ret, op_errno, NULL,
- ctx->hostname);
+ ctx->hostname, ctx->dict);
if (!ret && move_sm_now) {
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -563,11 +576,17 @@ __glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
glusterd_peerinfo_t *peerinfo = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
this = THIS;
GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
if (-1 == req->rpc_status) {
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
@@ -607,7 +626,7 @@ out:
event_type = GD_OP_EVENT_RCVD_ACC;
}
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -626,6 +645,168 @@ glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
__glusterd_cluster_lock_cbk);
}
+static int32_t
+glusterd_mgmt_v3_lock_peers_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ int ret = -1;
+ int32_t op_ret = -1;
+ glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ /*
+ * Reply handler for a mgmt_v3 lock request: decodes the response, logs
+ * whether the peer accepted (ACC) or rejected (RJT), checks that the
+ * responder is a known peer, then injects GD_OP_EVENT_RCVD_ACC or
+ * GD_OP_EVENT_RCVD_RJT tagged with the transaction id carried in the
+ * response.
+ *
+ * Returns -1 if the RPC failed, the xdr_to_generic result if decoding
+ * failed, the glusterd_friend_find result for an unknown peer, and
+ * otherwise the result of glusterd_op_sm_inject_event.
+ */
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ /*
+ * NOTE(review): this path and the decode-failure path below jump
+ * straight to 'out', which is placed after the event injection. No
+ * ACC/RJT event is injected for this peer on these paths, and the
+ * values stored in rsp here are not read again before returning.
+ * Confirm the op state machine does not keep waiting for this reply.
+ */
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to decode mgmt_v3 lock "
+ "response received from peer");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ op_ret = rsp.op_ret;
+
+ txn_id = &rsp.txn_id; /* points into the local rsp; only valid inside this call */
+
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received mgmt_v3 lock %s from uuid: %s",
+ (op_ret) ? "RJT" : "ACC", uuid_utoa (rsp.uuid));
+
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "mgmt_v3 lock response received "
+ "from unknown peer: %s. Ignoring response",
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
+ /*
+ * A non-zero op_ret is treated as a rejection: it is recorded in the
+ * file-level opinfo together with a generic "try again" message.
+ */
+ if (op_ret) {
+ event_type = GD_OP_EVENT_RCVD_RJT;
+ opinfo.op_ret = op_ret;
+ opinfo.op_errstr = gf_strdup ("Another transaction could be in "
+ "progress. Please try again after"
+ " sometime.");
+ } else {
+ event_type = GD_OP_EVENT_RCVD_ACC;
+ }
+
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
+ /* Run the friend and op state machines only if the injection succeeded. */
+ if (!ret) {
+ glusterd_friend_sm ();
+ glusterd_op_sm ();
+ }
+
+out:
+ GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe)); /* reached on every path */
+ return ret;
+}
+/*
+ * Callback passed to glusterd_submit_request for GLUSTERD_MGMT_V3_LOCK
+ * requests. It only forwards its arguments, together with the real handler
+ * glusterd_mgmt_v3_lock_peers_cbk_fn, to glusterd_big_locked_cbk and returns
+ * that call's result.
+ * NOTE(review): presumably glusterd_big_locked_cbk runs the handler with
+ * glusterd's big lock held; confirm in its definition.
+ */
+int32_t
+glusterd_mgmt_v3_lock_peers_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ glusterd_mgmt_v3_lock_peers_cbk_fn);
+}
+/*
+ * Reply handler for a mgmt_v3 unlock request: decodes the response, logs
+ * whether the peer accepted (ACC) or rejected (RJT), checks that the
+ * responder is a known peer, then injects GD_OP_EVENT_RCVD_ACC or
+ * GD_OP_EVENT_RCVD_RJT tagged with the transaction id carried in the
+ * response.
+ *
+ * Returns -1 if the RPC failed, the xdr_to_generic result if decoding failed,
+ * the glusterd_friend_find result for an unknown peer, and otherwise the
+ * result of glusterd_op_sm_inject_event.
+ */
+static int32_t
+glusterd_mgmt_v3_unlock_peers_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gd1_mgmt_v3_unlock_rsp rsp = {{0},};
+ int ret = -1;
+ int32_t op_ret = -1;
+ glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ /*
+ * NOTE(review): this path and the decode-failure path below jump
+ * straight to 'out', which is placed after the event injection. No
+ * ACC/RJT event is injected for this peer on these paths, and the
+ * values stored in rsp here are not read again before returning.
+ * Confirm the op state machine does not keep waiting for this reply.
+ */
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to decode mgmt_v3 unlock "
+ "response received from peer");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ op_ret = rsp.op_ret;
+
+ txn_id = &rsp.txn_id; /* points into the local rsp; only valid inside this call */
+
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received mgmt_v3 unlock %s from uuid: %s",
+ (op_ret) ? "RJT" : "ACC",
+ uuid_utoa (rsp.uuid));
+
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "mgmt_v3 unlock response received "
+ "from unknown peer: %s. Ignoring response",
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
+ /*
+ * A non-zero op_ret is treated as a rejection and recorded in the
+ * file-level opinfo.
+ * NOTE(review): the message text is the same one the lock handler
+ * uses; confirm it is the intended wording for a failed unlock.
+ */
+ if (op_ret) {
+ event_type = GD_OP_EVENT_RCVD_RJT;
+ opinfo.op_ret = op_ret;
+ opinfo.op_errstr = gf_strdup ("Another transaction could be in "
+ "progress. Please try again after"
+ " sometime.");
+ } else {
+ event_type = GD_OP_EVENT_RCVD_ACC;
+ }
+
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
+ /* Run the friend and op state machines only if the injection succeeded. */
+ if (!ret) {
+ glusterd_friend_sm ();
+ glusterd_op_sm ();
+ }
+
+out:
+ GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe)); /* reached on every path */
+ return ret;
+}
+/*
+ * Callback passed to glusterd_submit_request for GLUSTERD_MGMT_V3_UNLOCK
+ * requests. It only forwards its arguments, together with the real handler
+ * glusterd_mgmt_v3_unlock_peers_cbk_fn, to glusterd_big_locked_cbk and
+ * returns that call's result.
+ * NOTE(review): presumably glusterd_big_locked_cbk runs the handler with
+ * glusterd's big lock held; confirm in its definition.
+ */
+int32_t
+glusterd_mgmt_v3_unlock_peers_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ glusterd_mgmt_v3_unlock_peers_cbk_fn);
+}
+
int32_t
__glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
@@ -636,11 +817,17 @@ __glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov,
glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
glusterd_peerinfo_t *peerinfo = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
this = THIS;
GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
if (-1 == req->rpc_status) {
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
@@ -677,7 +864,7 @@ out:
event_type = GD_OP_EVENT_RCVD_ACC;
}
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -709,6 +896,7 @@ __glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
char err_str[2048] = {0};
char *peer_str = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
this = THIS;
GF_ASSERT (this);
@@ -761,11 +949,17 @@ out:
"Received stage %s from uuid: %s",
(op_ret) ? "RJT" : "ACC", uuid_utoa (rsp.uuid));
+ ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
if (ret) {
gf_log (this->name, GF_LOG_CRITICAL, "Stage response received "
- "from unknown peer: %s", uuid_utoa (rsp.uuid));
+ "from unknown peer: %s. Ignoring response.",
+ uuid_utoa (rsp.uuid));
+ goto out;
}
if (op_ret) {
@@ -796,7 +990,7 @@ out:
break;
}
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -836,6 +1030,8 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
char err_str[2048] = {0};
char *peer_str = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+
this = THIS;
GF_ASSERT (this);
@@ -889,6 +1085,10 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
"Received commit %s from uuid: %s",
(op_ret)?"RJT":"ACC", uuid_utoa (rsp.uuid));
+ ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
if (ret) {
@@ -968,7 +1168,7 @@ __glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
}
out:
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -1042,12 +1242,12 @@ int32_t
glusterd_rpc_friend_add (call_frame_t *frame, xlator_t *this,
void *data)
{
- gd1_mgmt_friend_req req = {{0},};
- int ret = 0;
- glusterd_peerinfo_t *peerinfo = NULL;
- glusterd_conf_t *priv = NULL;
- glusterd_friend_sm_event_t *event = NULL;
- dict_t *vols = NULL;
+ gd1_mgmt_friend_req req = {{0},};
+ int ret = 0;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_friend_sm_event_t *event = NULL;
+ dict_t *peer_data = NULL;
if (!frame || !this || !data) {
@@ -1062,15 +1262,37 @@ glusterd_rpc_friend_add (call_frame_t *frame, xlator_t *this,
peerinfo = event->peerinfo;
- ret = glusterd_build_volume_dict (&vols);
- if (ret)
+ ret = glusterd_add_volumes_to_export_dict (&peer_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to add list of volumes "
+ "in the peer_data dict for handshake");
goto out;
+ }
+
+ if (priv->op_version >= GD_OP_VERSION_4) {
+ ret = glusterd_add_missed_snaps_to_export_dict (peer_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to add list of missed snapshots "
+ "in the peer_data dict for handshake");
+ goto out;
+ }
+
+ ret = glusterd_add_snapshots_to_export_dict (peer_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to add list of snapshots "
+ "in the peer_data dict for handshake");
+ goto out;
+ }
+ }
uuid_copy (req.uuid, MY_UUID);
req.hostname = peerinfo->hostname;
req.port = peerinfo->port;
- ret = dict_allocate_and_serialize (vols, &req.vols.vols_val,
+ ret = dict_allocate_and_serialize (peer_data, &req.vols.vols_val,
&req.vols.vols_len);
if (ret)
goto out;
@@ -1084,8 +1306,8 @@ glusterd_rpc_friend_add (call_frame_t *frame, xlator_t *this,
out:
GF_FREE (req.vols.vols_val);
- if (vols)
- dict_unref (vols);
+ if (peer_data)
+ dict_unref (peer_data);
gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -1205,6 +1427,139 @@ out:
}
+/*
+ * Build and submit a mgmt_v3 lock request to a single peer.
+ *
+ * frame: not used here; a fresh frame is created for the request.
+ * this:  the calling xlator; when NULL the function returns -1 at once.
+ * data:  a dict_t that must hold the target peer under "peerinfo" and the
+ *        transaction id under "transaction_id". "peerinfo" is deleted from
+ *        the dict before the dict is serialized into the request.
+ *
+ * Returns the result of glusterd_submit_request, or a non-zero value when
+ * the request could not be built.
+ */
int32_t
+glusterd_mgmt_v3_lock_peers (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gd1_mgmt_v3_lock_req req = {{0},};
+ int ret = -1;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ call_frame_t *dummy_frame = NULL;
+ dict_t *dict = NULL;
+ uuid_t *txn_id = NULL;
+
+ if (!this)
+ goto out;
+
+ dict = data;
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_ptr (dict, "peerinfo", VOID (&peerinfo));
+ if (ret)
+ goto out;
+
+ //peerinfo should not be in payload
+ dict_del (dict, "peerinfo");
+
+ glusterd_get_uuid (&req.uuid);
+
+ ret = dict_allocate_and_serialize (dict, &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict "
+ "to request buffer");
+ goto out;
+ }
+
+ /* Sending valid transaction ID to peers */
+ ret = dict_get_bin (dict, "transaction_id",
+ (void **)&txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get transaction id.");
+ goto out;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Transaction_id = %s", uuid_utoa (*txn_id));
+ uuid_copy (req.txn_id, *txn_id);
+ }
+
+ dummy_frame = create_frame (this, this->ctx->pool);
+ if (!dummy_frame) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame,
+ peerinfo->mgmt_v3,
+ GLUSTERD_MGMT_V3_LOCK, NULL,
+ this, glusterd_mgmt_v3_lock_peers_cbk,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+out:
+ /* 'this' is NULL when we get here from the guard at the top, so it
+ * must not be dereferenced for the log domain. */
+ gf_log (this ? this->name : "glusterd", GF_LOG_DEBUG,
+ "Returning %d", ret);
+ /* The serialized dict buffer belongs to this function and was never
+ * released. Free it on every path, the same cleanup
+ * glusterd_rpc_friend_add does for req.vols.vols_val.
+ * NOTE(review): assumes glusterd_submit_request has finished encoding
+ * req by the time it returns. */
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
+/*
+ * Build and submit a mgmt_v3 unlock request to a single peer.
+ *
+ * frame: not used here; a fresh frame is created for the request.
+ * this:  the calling xlator; when NULL the function returns -1 at once.
+ * data:  a dict_t that must hold the target peer under "peerinfo" and the
+ *        transaction id under "transaction_id". "peerinfo" is deleted from
+ *        the dict before the dict is serialized into the request.
+ *
+ * Returns the result of glusterd_submit_request, or a non-zero value when
+ * the request could not be built.
+ */
+int32_t
+glusterd_mgmt_v3_unlock_peers (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gd1_mgmt_v3_unlock_req req = {{0},};
+ int ret = -1;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ call_frame_t *dummy_frame = NULL;
+ dict_t *dict = NULL;
+ uuid_t *txn_id = NULL;
+
+ if (!this)
+ goto out;
+
+ dict = data;
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_ptr (dict, "peerinfo", VOID (&peerinfo));
+ if (ret)
+ goto out;
+
+ //peerinfo should not be in payload
+ dict_del (dict, "peerinfo");
+
+ glusterd_get_uuid (&req.uuid);
+
+ ret = dict_allocate_and_serialize (dict, &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict "
+ "to request buffer");
+ goto out;
+ }
+
+ /* Sending valid transaction ID to peers */
+ ret = dict_get_bin (dict, "transaction_id",
+ (void **)&txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get transaction id.");
+ goto out;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Transaction_id = %s", uuid_utoa (*txn_id));
+ uuid_copy (req.txn_id, *txn_id);
+ }
+
+ dummy_frame = create_frame (this, this->ctx->pool);
+ if (!dummy_frame) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame,
+ peerinfo->mgmt_v3,
+ GLUSTERD_MGMT_V3_UNLOCK, NULL,
+ this, glusterd_mgmt_v3_unlock_peers_cbk,
+ (xdrproc_t)
+ xdr_gd1_mgmt_v3_unlock_req);
+out:
+ /* 'this' is NULL when we get here from the guard at the top, so it
+ * must not be dereferenced for the log domain. */
+ gf_log (this ? this->name : "glusterd", GF_LOG_DEBUG,
+ "Returning %d", ret);
+ /* The serialized dict buffer belongs to this function and was never
+ * released. Free it on every path, the same cleanup
+ * glusterd_rpc_friend_add does for req.vols.vols_val.
+ * NOTE(review): assumes glusterd_submit_request has finished encoding
+ * req by the time it returns. */
+ GF_FREE (req.dict.dict_val);
+ return ret;
+}
+
+int32_t
glusterd_cluster_unlock (call_frame_t *frame, xlator_t *this,
void *data)
{
@@ -1367,11 +1722,16 @@ __glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
glusterd_req_ctx_t *req_ctx = NULL;
glusterd_pending_node_t *node = NULL;
xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
this = THIS;
GF_ASSERT (this);
-
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+
+ txn_id = &priv->global_txn_id;
frame = myframe;
req_ctx = frame->local;
@@ -1429,6 +1789,11 @@ __glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
}
}
out:
+
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
ev_ctx = GF_CALLOC (1, sizeof (*ev_ctx), gf_gld_mt_brick_rsp_ctx_t);
GF_ASSERT (ev_ctx);
if (op_ret) {
@@ -1441,7 +1806,7 @@ out:
ev_ctx->pending_node = frame->cookie;
ev_ctx->rsp_dict = dict;
ev_ctx->commit_ctx = frame->local;
- ret = glusterd_op_sm_inject_event (event_type, ev_ctx);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, ev_ctx);
if (!ret) {
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -1464,8 +1829,9 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
int32_t
glusterd_brick_op (call_frame_t *frame, xlator_t *this,
- void *data)
+ void *data)
{
+
gd1_mgmt_brick_op_req *req = NULL;
int ret = 0;
glusterd_conf_t *priv = NULL;
@@ -1476,6 +1842,7 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this,
glusterd_req_ctx_t *req_ctx = NULL;
struct rpc_clnt *rpc = NULL;
dict_t *op_ctx = NULL;
+ uuid_t *txn_id = NULL;
if (!this) {
ret = -1;
@@ -1484,6 +1851,8 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this,
priv = this->private;
GF_ASSERT (priv);
+ txn_id = &priv->global_txn_id;
+
req_ctx = data;
GF_ASSERT (req_ctx);
INIT_LIST_HEAD (&opinfo.pending_bricks);
@@ -1498,14 +1867,20 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this,
goto out;
}
+
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
list_for_each_entry (pending_node, &opinfo.pending_bricks, list) {
dummy_frame = create_frame (this, this->ctx->pool);
if (!dummy_frame)
continue;
if ((pending_node->type == GD_NODE_NFS) ||
+ (pending_node->type == GD_NODE_QUOTAD) ||
((pending_node->type == GD_NODE_SHD) &&
- (req_ctx->op == GD_OP_STATUS_VOLUME)))
+ (req_ctx->op == GD_OP_STATUS_VOLUME)))
ret = glusterd_node_op_build_payload
(req_ctx->op,
(gd1_mgmt_brick_op_req **)&req,
@@ -1575,7 +1950,8 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this,
out:
if (ret) {
- glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, data);
+ glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ txn_id, data);
opinfo.op_ret = ret;
}
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -1603,6 +1979,12 @@ struct rpc_clnt_procedure gd_mgmt_actors[GLUSTERD_MGMT_MAXVALUE] = {
[GLUSTERD_MGMT_COMMIT_OP] = {"COMMIT_OP", glusterd_commit_op},
};
+/* Client procedure table for the mgmt v3 program. Each slot is indexed by a
+ * GLUSTERD_MGMT_V3_* procedure number and holds the procedure name together
+ * with the function registered for it; the NULL procedure has no function.
+ */
+struct rpc_clnt_procedure gd_mgmt_v3_actors[GLUSTERD_MGMT_V3_MAXVALUE] = {
+ [GLUSTERD_MGMT_V3_NULL] = {"NULL", NULL },
+ [GLUSTERD_MGMT_V3_LOCK] = {"MGMT_V3_LOCK", glusterd_mgmt_v3_lock_peers},
+ [GLUSTERD_MGMT_V3_UNLOCK] = {"MGMT_V3_UNLOCK", glusterd_mgmt_v3_unlock_peers},
+};
+
struct rpc_clnt_program gd_mgmt_prog = {
.progname = "glusterd mgmt",
.prognum = GD_MGMT_PROGRAM,
@@ -1627,4 +2009,10 @@ struct rpc_clnt_program gd_peer_prog = {
.numproc = GLUSTERD_FRIEND_MAXVALUE,
};
-
+/* RPC client program descriptor for mgmt v3. It uses the GD_MGMT_PROGRAM
+ * program number with version GD_MGMT_V3_VERSION and dispatches through the
+ * gd_mgmt_v3_actors table, which has GLUSTERD_MGMT_V3_MAXVALUE slots.
+ */
+struct rpc_clnt_program gd_mgmt_v3_prog = {
+ .progname = "glusterd mgmt v3",
+ .prognum = GD_MGMT_PROGRAM,
+ .progver = GD_MGMT_V3_VERSION,
+ .proctable = gd_mgmt_v3_actors,
+ .numproc = GLUSTERD_MGMT_V3_MAXVALUE,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.c b/xlators/mgmt/glusterd/src/glusterd-sm.c
index 86ce86a3e..7a8b2c94f 100644
--- a/xlators/mgmt/glusterd/src/glusterd-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.c
@@ -34,6 +34,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "glusterd-store.h"
+#include "glusterd-etcd.h"
static struct list_head gd_friend_sm_queue;
@@ -400,7 +401,8 @@ glusterd_ac_send_friend_remove_req (glusterd_friend_sm_event_t *event,
if (ctx)
ret = glusterd_xfer_cli_deprobe_resp (ctx->req, ret, 0,
NULL,
- ctx->hostname);
+ ctx->hostname,
+ ctx->dict);
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -526,6 +528,9 @@ out:
return ret;
}
+/* Clean up stale volumes on the peer being detached. The volumes which have
+ * bricks on other peers are stale with respect to the detached peer.
+ */
static int
glusterd_peer_detach_cleanup (glusterd_conf_t *priv)
{
@@ -537,6 +542,12 @@ glusterd_peer_detach_cleanup (glusterd_conf_t *priv)
list_for_each_entry_safe (volinfo,tmp_volinfo,
&priv->volumes, vol_list) {
+ /* The peer detach checks make sure that, at this point in the
+ * detach process, there are only volumes contained completely
+ * within or completely outside the detached peer.
+ * The only stale volumes at this point are the ones
+ * completely outside the peer and can be safely deleted.
+ */
if (!glusterd_friend_contains_vol_bricks (volinfo,
MY_UUID)) {
gf_log (THIS->name, GF_LOG_INFO,
@@ -595,6 +606,9 @@ glusterd_ac_handle_friend_remove_req (glusterd_friend_sm_event_t *event,
"Peer detach cleanup was not successful");
ret = 0;
}
+ gf_log (THIS->name, GF_LOG_INFO, "detached, stopping etcd");
+ stop_etcd(priv->etcd_pid);
+ nuke_etcd_dir();
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
@@ -638,10 +652,14 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
glusterd_friend_update_ctx_t *new_ev_ctx = NULL;
glusterd_friend_sm_event_t *new_event = NULL;
glusterd_friend_sm_event_type_t event_type = GD_FRIEND_EVENT_NONE;
+ glusterd_conf_t *conf = NULL;
int status = 0;
int32_t op_ret = -1;
int32_t op_errno = 0;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (ctx);
ev_ctx = ctx;
uuid_copy (uuid, ev_ctx->uuid);
@@ -649,6 +667,9 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
GF_ASSERT (peerinfo);
uuid_copy (peerinfo->uuid, ev_ctx->uuid);
+ conf = this->private;
+ GF_ASSERT (conf);
+
//Build comparison logic here.
ret = glusterd_compare_friend_data (ev_ctx->vols, &status,
peerinfo->hostname);
@@ -664,6 +685,31 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
op_ret = -1;
}
+ /* Compare missed_snapshot list with the peer *
+ * if volume comparison is successful */
+ if ((op_ret == 0) &&
+ (conf->op_version >= GD_OP_VERSION_4)) {
+ ret = glusterd_import_friend_missed_snap_list (ev_ctx->vols);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to import peer's "
+ "missed_snaps_list.");
+ event_type = GD_FRIEND_EVENT_LOCAL_RJT;
+ op_errno = GF_PROBE_MISSED_SNAP_CONFLICT;
+ op_ret = -1;
+ }
+
+ ret = glusterd_compare_friend_snapshots (ev_ctx->vols,
+ peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Conflict in comparing peer's snapshots");
+ event_type = GD_FRIEND_EVENT_LOCAL_RJT;
+ op_errno = GF_PROBE_SNAP_CONFLICT;
+ op_ret = -1;
+ }
+ }
+
ret = glusterd_friend_sm_new_event (event_type, &new_event);
if (ret) {
@@ -691,6 +737,13 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
peerinfo->hostname, ev_ctx->port,
op_ret, op_errno);
+ // apply a deterministic function to decide via whom we should join the cluster
+ if (strcmp(peerinfo->hostname, ev_ctx->hostname) > 0) {
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+ conf->etcd_pid = start_etcd (uuid_utoa(MY_UUID), peerinfo->hostname);
+ }
+
out:
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.h b/xlators/mgmt/glusterd/src/glusterd-sm.h
index 0af45deb6..b9bedbe69 100644
--- a/xlators/mgmt/glusterd/src/glusterd-sm.h
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.h
@@ -27,14 +27,7 @@
#include "byte-order.h"
//#include "glusterd.h"
#include "rpcsvc.h"
-
-struct glusterd_store_handle_ {
- char *path;
- int fd;
- FILE *read;
-};
-
-typedef struct glusterd_store_handle_ glusterd_store_handle_t;
+#include "store.h"
typedef enum gd_quorum_contribution_ {
QUORUM_NONE,
@@ -93,7 +86,9 @@ typedef struct glusterd_sm_tr_log_ {
struct glusterd_peerinfo_ {
uuid_t uuid;
- char uuid_str[50];
+ char uuid_str[50]; /* Retrieve this using
+ * gd_peer_uuid_str ()
+ */
glusterd_peer_state_info_t state;
char *hostname;
int port;
@@ -102,11 +97,13 @@ struct glusterd_peerinfo_ {
struct rpc_clnt *rpc;
rpc_clnt_prog_t *mgmt;
rpc_clnt_prog_t *peer;
+ rpc_clnt_prog_t *mgmt_v3;
int connected;
- glusterd_store_handle_t *shandle;
+ gf_store_handle_t *shandle;
glusterd_sm_tr_log_t sm_log;
gf_boolean_t quorum_action;
gd_quorum_contrib_t quorum_contrib;
+ gf_boolean_t locked;
};
typedef struct glusterd_peerinfo_ glusterd_peerinfo_t;
@@ -120,6 +117,7 @@ typedef enum glusterd_ev_gen_mode_ {
typedef struct glusterd_peer_ctx_args_ {
rpcsvc_request_t *req;
glusterd_ev_gen_mode_t mode;
+ dict_t *dict;
} glusterd_peerctx_args_t;
typedef struct glusterd_peer_ctx_ {
@@ -186,6 +184,7 @@ typedef struct glusterd_probe_ctx_ {
char *hostname;
rpcsvc_request_t *req;
int port;
+ dict_t *dict;
} glusterd_probe_ctx_t;
int
glusterd_friend_sm_new_event (glusterd_friend_sm_event_type_t event_type,
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
new file mode 100644
index 000000000..0e824a022
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
@@ -0,0 +1,5787 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <inttypes.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <signal.h>
+
+
+#if !defined(__NetBSD__) && !defined(GF_DARWIN_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+
+#include "globals.h"
+#include "compat.h"
+#include "protocol-common.h"
+#include "xlator.h"
+#include "logging.h"
+#include "timer.h"
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-store.h"
+#include "run.h"
+#include "glusterd-volgen.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-syncop.h"
+
+#include "syscall.h"
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+
+#include "lvm-defaults.h"
+
+char snap_mount_folder[PATH_MAX];
+
+/* Record missed snapshot operations for bricks hosted on unreachable peers.
+ *
+ * Every brick of @vol is visited in list order. Bricks owned by this node
+ * are only counted. For a brick owned by a peer from @peers that is either
+ * not connected or not in the befriended state, an entry is added to
+ * @rsp_dict via glusterd_add_missed_snaps_to_dict (), tagged with the
+ * brick's 1-based position in the volume and the operation @op.
+ *
+ * Returns 0 on success, or the helper's non-zero result on the first
+ * failure.
+ */
+static int32_t
+glusterd_find_missed_snap (dict_t *rsp_dict, glusterd_volinfo_t *vol,
+                           struct list_head *peers, int32_t op)
+{
+        int32_t               brick_num = 0;
+        int32_t               ret       = -1;
+        xlator_t             *this      = NULL;
+        glusterd_peerinfo_t  *peer      = NULL;
+        glusterd_brickinfo_t *brick     = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (peers);
+        GF_ASSERT (vol);
+
+        list_for_each_entry (brick, &vol->bricks, brick_list) {
+                /* 1-based position of the current brick in the volume */
+                brick_num++;
+
+                /* Local bricks cannot have missed the operation */
+                if (uuid_compare (brick->uuid, MY_UUID) == 0)
+                        continue;
+
+                list_for_each_entry (peer, peers, uuid_list) {
+                        /* Only the peer owning this brick is relevant */
+                        if (uuid_compare (peer->uuid, brick->uuid) != 0)
+                                continue;
+
+                        /* A connected, befriended peer misses nothing */
+                        if (peer->connected &&
+                            (peer->state.state ==
+                             GD_FRIEND_STATE_BEFRIENDED))
+                                continue;
+
+                        ret = glusterd_add_missed_snaps_to_dict (rsp_dict,
+                                                                 vol, brick,
+                                                                 brick_num,
+                                                                 op);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to add missed snapshot "
+                                        "info for %s:%s in the "
+                                        "rsp_dict", brick->hostname,
+                                        brick->path);
+                                goto out;
+                        }
+                }
+        }
+
+        ret = 0;
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Restore the volume captured by a snapshot.
+ *
+ * @param dict      dictionary containing the snapshot restore request
+ *                  (the snapshot is looked up by its "snapname" key)
+ * @param op_errstr on some failures an allocated error message is stored
+ *                  here
+ * @param rsp_dict  response dictionary; on the originator node it also
+ *                  receives missed-restore information for unreachable
+ *                  peers
+ * @return Negative value on failure and 0 on success
+ */
+int
+glusterd_snapshot_restore (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+        int                 ret        = -1;
+        char               *snapname   = NULL;
+        xlator_t           *this       = NULL;
+        glusterd_volinfo_t *snap_vol   = NULL;
+        glusterd_volinfo_t *parent_vol = NULL;
+        glusterd_snap_t    *snap       = NULL;
+        glusterd_conf_t    *priv       = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get snap name");
+                goto out;
+        }
+
+        snap = glusterd_find_snap_by_name (snapname);
+        if (!snap) {
+                ret = gf_asprintf (op_errstr, "Snap (%s) not found",
+                                   snapname);
+                /* A negative ret means the message could not be built;
+                 * it is returned as is in that case. */
+                if (ret >= 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        /* TODO : As of now there is only one volume in a snapshot.
+         * Change this when multiple volume snapshot is introduced
+         */
+        snap_vol = list_entry (snap->volumes.next, glusterd_volinfo_t,
+                               vol_list);
+
+        ret = glusterd_volinfo_find (snap_vol->parent_volname, &parent_vol);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not get volinfo of %s",
+                        snap_vol->parent_volname);
+                goto out;
+        }
+
+        /* Only the originator checks whether any peer holding snap
+         * bricks is down. */
+        if (is_origin_glusterd (dict) == _gf_true) {
+                ret = glusterd_find_missed_snap (rsp_dict, snap_vol,
+                                                 &priv->peers,
+                                                 GF_SNAP_OPTION_TYPE_RESTORE);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to find missed snap restores");
+                        goto out;
+                }
+        }
+
+        ret = gd_restore_snap_volume (rsp_dict, parent_vol, snap_vol);
+        if (ret) {
+                /* op_errstr is not updated here; the callee is assumed
+                 * to have done that on failure. */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to restore snap for %s", snapname);
+                goto out;
+        }
+
+        ret = 0;
+
+        /* TODO: Need to check if we need to delete the snap after the
+         * operation is successful or not. Also need to persist the state
+         * of restore operation in the store.
+         */
+out:
+        return ret;
+}
+
+/* Validate a snapshot restore request before the restore is carried out.
+ *
+ * The snapshot named by "snapname" must exist and must not already be
+ * restored, and every volume listed as "volname1" .. "volname<volcount>"
+ * must exist and be stopped. The snapshot name is copied to @rsp_dict.
+ *
+ * @param dict      dictionary containing the snapshot restore request
+ * @param op_errstr on validation failure an allocated error message is
+ *                  stored here
+ * @param rsp_dict  response dictionary
+ * @return Negative value on failure and 0 on success
+ */
+int32_t
+glusterd_snapshot_restore_prevalidate (dict_t *dict, char **op_errstr,
+                                       dict_t *rsp_dict)
+{
+        int                 ret           = -1;
+        int32_t             idx           = 0;
+        int32_t             volcount      = 0;
+        char                key[PATH_MAX] = {0, };
+        char               *volname       = NULL;
+        char               *snapname      = NULL;
+        glusterd_volinfo_t *volinfo       = NULL;
+        glusterd_snap_t    *snap          = NULL;
+        xlator_t           *this          = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get snap name");
+                goto out;
+        }
+
+        snap = glusterd_find_snap_by_name (snapname);
+        if (!snap) {
+                ret = gf_asprintf (op_errstr, "Snap (%s) not found",
+                                   snapname);
+                if (ret >= 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        if (snap->snap_restored) {
+                ret = gf_asprintf (op_errstr,
+                                   "Snap (%s) is already restored",
+                                   snapname);
+                if (ret >= 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        ret = dict_set_str (rsp_dict, "snapname", snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set snap name(%s)", snapname);
+                goto out;
+        }
+
+        ret = dict_get_int32 (dict, "volcount", &volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get volume count");
+                goto out;
+        }
+
+        /* Restore is only allowed while all volumes that are part of the
+         * snapshot are stopped. Volume keys are numbered from 1. */
+        idx = 1;
+        while (idx <= volcount) {
+                snprintf (key, sizeof (key), "volname%d", idx);
+                ret = dict_get_str (dict, key, &volname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get volume name");
+                        goto out;
+                }
+
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (ret) {
+                        ret = gf_asprintf (op_errstr, "Volume (%s) not found",
+                                           volname);
+                        if (ret >= 0) {
+                                gf_log (this->name, GF_LOG_ERROR, "%s",
+                                        *op_errstr);
+                                ret = -1;
+                        }
+                        goto out;
+                }
+
+                if (glusterd_is_volume_started (volinfo)) {
+                        ret = gf_asprintf (op_errstr, "Volume (%s) has been "
+                                           "started. Volume needs to be "
+                                           "stopped before restoring "
+                                           "a snapshot.", volname);
+                        if (ret >= 0) {
+                                gf_log (this->name, GF_LOG_ERROR, "%s",
+                                        *op_errstr);
+                                ret = -1;
+                        }
+                        goto out;
+                }
+
+                ++idx;
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/* Validate a requested snap-max-hard-limit value.
+ *
+ * @param dict      request dictionary (only asserted to be non-NULL here)
+ * @param volname   volume the limit applies to, or NULL for the
+ *                  system-wide limit
+ * @param value     requested limit
+ * @param op_errstr on failure receives a gf_strdup'ed error message
+ * @return 0 when the value is acceptable, -1 otherwise
+ *
+ * The limit may not be configured on a snap volume. A per-volume limit is
+ * bounded by the configured system limit (conf->snap_max_hard_limit); the
+ * system limit itself is bounded by GLUSTERD_SNAPS_MAX_HARD_LIMIT.
+ * A @volname that cannot be found is not treated as an error here.
+ */
+int
+snap_max_hard_limits_validate (dict_t *dict, char *volname,
+                               uint64_t value, char **op_errstr)
+{
+        char                err_str[PATH_MAX] = "";
+        glusterd_conf_t    *conf              = NULL;
+        glusterd_volinfo_t *volinfo           = NULL;
+        int                 ret               = -1;
+        uint64_t            max_limit         = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+        xlator_t           *this              = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+
+        conf = this->private;
+
+        GF_ASSERT (conf);
+
+        if (volname) {
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (!ret && volinfo->is_snap_volume) {
+                        ret = -1;
+                        snprintf (err_str, sizeof (err_str),
+                                  "%s is a snap volume. Configuring "
+                                  "snap-max-hard-limit for a snap "
+                                  "volume is prohibited.", volname);
+                        goto out;
+                }
+
+                /* A volume can never exceed the system-wide setting */
+                max_limit = conf->snap_max_hard_limit;
+        }
+
+        /* value is unsigned, so only the upper bound needs checking */
+        if (value > max_limit) {
+                ret = -1;
+                snprintf (err_str, sizeof (err_str),
+                          "Invalid snap-max-hard-limit %"PRIu64
+                          ". Expected range 0 - %"PRIu64,
+                          value, max_limit);
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (ret) {
+                *op_errstr = gf_strdup (err_str);
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+        }
+        return ret;
+}
+
+/* Validate a "snapshot config" request before it is applied.
+ *
+ * @param dict      request dictionary. "config-command" is mandatory;
+ *                  "snap-max-hard-limit", "snap-max-soft-limit" and
+ *                  "volname" are optional.
+ * @param op_errstr on failure receives a gf_strdup'ed error message
+ * @return 0 when the request is acceptable, non-zero otherwise
+ *
+ * For GF_SNAP_CONFIG_TYPE_SET a non-zero hard limit is checked with
+ * snap_max_hard_limits_validate () and a non-zero soft limit must not
+ * exceed GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT. Other commands need no
+ * validation here.
+ */
+int
+glusterd_snapshot_config_prevalidate (dict_t *dict, char **op_errstr)
+{
+        char               *volname           = NULL;
+        glusterd_volinfo_t *volinfo           = NULL;
+        xlator_t           *this              = NULL;
+        int                 ret               = -1;
+        int                 config_command    = 0;
+        char                err_str[PATH_MAX] = {0,};
+        glusterd_conf_t    *conf              = NULL;
+        uint64_t            hard_limit        = 0;
+        uint64_t            soft_limit        = 0;
+        uint64_t            max_soft_limit    =
+                                        GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+
+        conf = this->private;
+
+        GF_ASSERT (conf);
+
+        ret = dict_get_int32 (dict, "config-command", &config_command);
+        if (ret) {
+                snprintf (err_str, sizeof (err_str),
+                          "failed to get config-command type");
+                goto out;
+        }
+
+        /* These keys are optional: when one is absent the corresponding
+         * variable keeps its zero/NULL default and the check is skipped. */
+        (void) dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit);
+        (void) dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit);
+        (void) dict_get_str (dict, "volname", &volname);
+
+        if (volname) {
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (ret) {
+                        snprintf (err_str, sizeof (err_str),
+                                  "Volume %s does not exist.", volname);
+                        goto out;
+                }
+        }
+
+        switch (config_command) {
+        case GF_SNAP_CONFIG_TYPE_SET:
+                if (hard_limit) {
+                        /* Validations for snap-max-hard-limits; the callee
+                         * fills op_errstr itself on failure. */
+                        ret = snap_max_hard_limits_validate (dict, volname,
+                                                             hard_limit,
+                                                             op_errstr);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "snap-max-hard-limit validation "
+                                        "failed.");
+                                goto out;
+                        }
+                }
+
+                /* soft_limit is unsigned, so only the upper bound matters */
+                if (soft_limit > max_soft_limit) {
+                        ret = -1;
+                        snprintf (err_str, sizeof (err_str), "Invalid "
+                                  "snap-max-soft-limit %"PRIu64
+                                  ". Expected range 0 - %"PRIu64,
+                                  soft_limit, max_soft_limit);
+                        goto out;
+                }
+                break;
+        default:
+                break;
+        }
+
+        ret = 0;
+out:
+
+        if (ret && err_str[0] != '\0') {
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+                *op_errstr = gf_strdup (err_str);
+        }
+
+        return ret;
+}
+
+/* Merge snapshot-create pre-validation results from @src into @dst.
+ *
+ * For every volume 1..volcount in @src, and every brick index below that
+ * volume's "vol<N>_brickcount", the brick directory and snapshot device
+ * strings are duplicated into @dst. In @dst they are keyed by the brick's
+ * recorded order ("vol<N>.brick<j>.order") rather than by its index in
+ * @src.
+ *
+ * A volume without a brick count, or a brick without a brick directory, is
+ * skipped. Any other lookup, allocation or store failure aborts the merge.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+glusterd_snap_create_pre_val_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+        char     *src_brick_dir            = NULL;
+        char     *src_device               = NULL;
+        char     *dup                      = NULL;
+        char      key[PATH_MAX]            = "";
+        char      count_key[PATH_MAX]      = "";
+        char      order_key[PATH_MAX]      = "";
+        int       ret                      = -1;
+        int64_t   vol                      = 0;
+        int64_t   brick                    = 0;
+        int64_t   volume_count             = 0;
+        int64_t   brick_count              = 0;
+        int64_t   brick_order              = 0;
+        xlator_t *this                     = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (dst);
+        GF_ASSERT (src);
+
+        ret = dict_get_int64 (src, "volcount", &volume_count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get the volume count");
+                goto out;
+        }
+
+        /* Volumes are numbered from 1 in the dictionary keys */
+        for (vol = 1; vol <= volume_count; vol++) {
+                snprintf (count_key, sizeof (count_key) - 1,
+                          "vol%"PRId64"_brickcount", vol);
+                ret = dict_get_int64 (src, count_key, &brick_count);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_TRACE,
+                                "No bricks for this volume in this dict");
+                        continue;
+                }
+
+                for (brick = 0; brick < brick_count; brick++) {
+                        /* Read this brick's details from the source dict */
+                        snprintf (key, sizeof (key) - 1,
+                                  "vol%"PRId64".brickdir%"PRId64, vol, brick);
+                        ret = dict_get_ptr (src, key,
+                                            (void **)&src_brick_dir);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "Unable to fetch %s", key);
+                                continue;
+                        }
+
+                        snprintf (key, sizeof (key) - 1,
+                                  "vol%"PRId64".brick_snapdevice%"PRId64,
+                                  vol, brick);
+                        ret = dict_get_ptr (src, key, (void **)&src_device);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Unable to fetch snap_device");
+                                goto out;
+                        }
+
+                        snprintf (order_key, sizeof (order_key) - 1,
+                                  "vol%"PRId64".brick%"PRId64".order",
+                                  vol, brick);
+                        ret = dict_get_int64 (src, order_key, &brick_order);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to get brick order");
+                                goto out;
+                        }
+
+                        /* Store copies in dst, keyed by the brick order */
+                        snprintf (key, sizeof (key) - 1,
+                                  "vol%"PRId64".brickdir%"PRId64,
+                                  vol, brick_order);
+                        dup = gf_strdup (src_brick_dir);
+                        if (!dup) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Out Of Memory");
+                                ret = -1;
+                                goto out;
+                        }
+                        ret = dict_set_dynstr (dst, key, dup);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to set %s", key);
+                                GF_FREE (dup);
+                                goto out;
+                        }
+
+                        snprintf (key, sizeof (key) - 1,
+                                  "vol%"PRId64".brick_snapdevice%"PRId64,
+                                  vol, brick_order);
+                        dup = gf_strdup (src_device);
+                        if (!dup) {
+                                ret = -1;
+                                goto out;
+                        }
+                        ret = dict_set_dynstr (dst, key, dup);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to set %s", key);
+                                GF_FREE (dup);
+                                goto out;
+                        }
+                }
+        }
+
+        ret = 0;
+out:
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Fold a peer's snapshot pre-validation response (@src) into @dst.
+ *
+ * The snapshot command is read from the "type" key of @dst. Only
+ * GF_SNAP_OPTION_TYPE_CREATE has data to merge, which is delegated to
+ * glusterd_snap_create_pre_val_use_rsp_dict (); every other command is a
+ * no-op.
+ *
+ * Returns 0 on success, non-zero if either dict is NULL, the type is
+ * missing, or the merge fails.
+ */
+int
+glusterd_snap_pre_validate_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+        int       ret     = -1;
+        int32_t   command = 0;
+        xlator_t *this    = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        if (!dst || !src) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Source or Destination dict is empty.");
+                goto out;
+        }
+
+        ret = dict_get_int32 (dst, "type", &command);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "unable to get the type of the snapshot command");
+                goto out;
+        }
+
+        if (command == GF_SNAP_OPTION_TYPE_CREATE) {
+                ret = glusterd_snap_create_pre_val_use_rsp_dict (dst, src);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to use rsp dict");
+                        goto out;
+                }
+        }
+
+        ret = 0;
+out:
+        gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/* Validate a snapshot create request and collect per-brick details.
+ *
+ * @param dict      request dictionary: "volcount", "snapname",
+ *                  "volname<N>" and "vol<N>_volid" for N = 1..volcount
+ * @param op_errstr on failure receives a gf_strdup'ed error message when
+ *                  one was prepared
+ * @param rsp_dict  response dictionary; for every local, started brick it
+ *                  receives the snapshot device path, the brick directory
+ *                  relative to its mount point and the brick's order in
+ *                  the volume, plus per-volume brick counts and "volcount"
+ * @return 0 on success, non-zero on failure
+ *
+ * The snapshot name must be unused. Each volume must exist, be started,
+ * have no rebalance running, not be a snap volume itself, and be below the
+ * effective snapshot limit (the smaller of the volume limit and the
+ * system limit).
+ */
+int
+glusterd_snapshot_create_prevalidate (dict_t *dict, char **op_errstr,
+                                      dict_t *rsp_dict)
+{
+        char                 *volname                  = NULL;
+        char                 *snapname                 = NULL;
+        char                 *device                   = NULL;
+        char                 *tmpstr                   = NULL;
+        char                 *brick_dir                = NULL;
+        char                  snap_brick_dir[PATH_MAX] = "";
+        char                 *mnt_pt                   = NULL;
+        char                  key[PATH_MAX]            = "";
+        char                  snap_volname[64]         = "";
+        char                  err_str[PATH_MAX]        = "";
+        int                   ret                      = -1;
+        int64_t               i                        = 0;
+        int64_t               volcount                 = 0;
+        int64_t               brick_count              = 0;
+        int64_t               brick_order              = 0;
+        glusterd_brickinfo_t *brickinfo                = NULL;
+        glusterd_volinfo_t   *volinfo                  = NULL;
+        xlator_t             *this                     = NULL;
+        uuid_t               *snap_volid               = NULL;
+        gf_loglevel_t         loglevel                 = GF_LOG_ERROR;
+        glusterd_conf_t      *conf                     = NULL;
+        int64_t               effective_max_limit      = 0;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (op_errstr);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        ret = dict_get_int64 (dict, "volcount", &volcount);
+        if (ret) {
+                snprintf (err_str, sizeof (err_str), "Failed to "
+                          "get the volume count");
+                goto out;
+        }
+        if (volcount <= 0) {
+                snprintf (err_str, sizeof (err_str),
+                          "Invalid volume count %"PRId64" supplied",
+                          volcount);
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                snprintf (err_str, sizeof (err_str), "Failed to get snapname");
+                goto out;
+        }
+
+        if (glusterd_find_snap_by_name (snapname)) {
+                ret = -1;
+                snprintf (err_str, sizeof (err_str), "Snap %s already exists",
+                          snapname);
+                goto out;
+        }
+
+        for (i = 1; i <= volcount; i++) {
+                snprintf (key, sizeof (key), "volname%"PRId64, i);
+                ret = dict_get_str (dict, key, &volname);
+                if (ret) {
+                        snprintf (err_str, sizeof (err_str),
+                                  "failed to get volume name");
+                        goto out;
+                }
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (ret) {
+                        snprintf (err_str, sizeof (err_str),
+                                  "Volume (%s) does not exist ", volname);
+                        goto out;
+                }
+
+                /* All of the state checks below fail with -1 */
+                ret = -1;
+                if (!glusterd_is_volume_started (volinfo)) {
+                        snprintf (err_str, sizeof (err_str), "volume %s is "
+                                  "not started", volinfo->volname);
+                        loglevel = GF_LOG_WARNING;
+                        goto out;
+                }
+                if (glusterd_is_defrag_on (volinfo)) {
+                        snprintf (err_str, sizeof (err_str),
+                                  "rebalance process is running for the "
+                                  "volume %s", volname);
+                        loglevel = GF_LOG_WARNING;
+                        goto out;
+                }
+                /* TODO: Also check whether geo replication is running */
+
+                if (volinfo->is_snap_volume == _gf_true) {
+                        snprintf (err_str, sizeof (err_str),
+                                  "Volume %s is a snap volume", volname);
+                        loglevel = GF_LOG_WARNING;
+                        goto out;
+                }
+
+                /* The stricter of the volume and system limits applies */
+                if (volinfo->snap_max_hard_limit < conf->snap_max_hard_limit)
+                        effective_max_limit = volinfo->snap_max_hard_limit;
+                else
+                        effective_max_limit = conf->snap_max_hard_limit;
+
+                if (volinfo->snap_count >= effective_max_limit) {
+                        snprintf (err_str, sizeof (err_str),
+                                  "The number of existing snaps has reached "
+                                  "the effective maximum limit of %"PRId64" ,"
+                                  "for the volume %s", effective_max_limit,
+                                  volname);
+                        loglevel = GF_LOG_WARNING;
+                        goto out;
+                }
+
+                snprintf (key, sizeof (key) - 1, "vol%"PRId64"_volid", i);
+                ret = dict_get_bin (dict, key, (void **)&snap_volid);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to fetch snap_volid");
+                        goto out;
+                }
+
+                /* snap volume uuid is used as lvm snapshot name.
+                   This will avoid restrictions on snapshot names
+                   provided by user */
+                GLUSTERD_GET_UUID_NOHYPHEN (snap_volname, *snap_volid);
+
+                /* brick_count numbers the local bricks seen so far;
+                 * brick_order is the brick's position among all bricks
+                 * of the volume. */
+                brick_count = 0;
+                brick_order = 0;
+                /* Adding snap bricks mount paths to the dict */
+                list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+                        if (uuid_compare (brickinfo->uuid, MY_UUID)) {
+                                brick_order++;
+                                continue;
+                        }
+
+                        if (!glusterd_is_brick_started (brickinfo)) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "brick %s:%s is not started",
+                                        brickinfo->hostname,
+                                        brickinfo->path);
+                                brick_order++;
+                                brick_count++;
+                                continue;
+                        }
+
+                        device = glusterd_get_brick_mount_details (brickinfo);
+                        if (!device) {
+                                snprintf (err_str, sizeof (err_str),
+                                          "getting device name for the brick "
+                                          "%s:%s failed", brickinfo->hostname,
+                                          brickinfo->path);
+                                ret = -1;
+                                goto out;
+                        }
+
+                        /* NOTE(review): the pointer returned above is
+                         * overwritten here; confirm that the helper does
+                         * not expect the caller to release it. */
+                        device = glusterd_build_snap_device_path (device,
+                                                                  snap_volname);
+                        if (!device) {
+                                snprintf (err_str, sizeof (err_str),
+                                          "cannot copy the snapshot device "
+                                          "name (volname: %s, snapname: %s)",
+                                          volinfo->volname, snapname);
+                                loglevel = GF_LOG_WARNING;
+                                ret = -1;
+                                goto out;
+                        }
+
+                        snprintf (key, sizeof (key),
+                                  "vol%"PRId64".brick_snapdevice%"PRId64, i,
+                                  brick_count);
+                        ret = dict_set_dynstr (rsp_dict, key, device);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to set %s", key);
+                                GF_FREE (device);
+                                goto out;
+                        }
+
+                        ret = glusterd_get_brick_root (brickinfo->path,
+                                                       &mnt_pt);
+                        if (ret) {
+                                snprintf (err_str, sizeof (err_str),
+                                          "could not get the root of the "
+                                          "brick path %s",
+                                          brickinfo->path);
+                                loglevel = GF_LOG_WARNING;
+                                goto out;
+                        }
+                        if (strncmp (brickinfo->path, mnt_pt,
+                                     strlen (mnt_pt))) {
+                                snprintf (err_str, sizeof (err_str),
+                                          "brick: %s brick mount: %s",
+                                          brickinfo->path, mnt_pt);
+                                loglevel = GF_LOG_WARNING;
+                                /* ret is 0 from the successful lookup
+                                 * above; this mismatch is a failure and
+                                 * must be reported as one. */
+                                ret = -1;
+                                goto out;
+                        }
+
+                        /* Brick directory relative to the mount point.
+                         * Skip the separator only when one is present, so
+                         * a brick at the mount point itself, or a mount
+                         * point of "/", is handled correctly. */
+                        brick_dir = &brickinfo->path[strlen (mnt_pt)];
+                        if (*brick_dir == '/')
+                                brick_dir++;
+
+                        snprintf (snap_brick_dir, sizeof (snap_brick_dir),
+                                  "/%s", brick_dir);
+
+                        tmpstr = gf_strdup (snap_brick_dir);
+                        if (!tmpstr) {
+                                ret = -1;
+                                goto out;
+                        }
+                        snprintf (key, sizeof (key),
+                                  "vol%"PRId64".brickdir%"PRId64, i,
+                                  brick_count);
+                        ret = dict_set_dynstr (rsp_dict, key, tmpstr);
+                        if (ret) {
+                                /* tmpstr is released at 'out' */
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to set %s", key);
+                                goto out;
+                        }
+                        tmpstr = NULL;
+
+                        snprintf (key, sizeof (key) - 1,
+                                  "vol%"PRId64".brick%"PRId64".order",
+                                  i, brick_count);
+                        ret = dict_set_int64 (rsp_dict, key, brick_order);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to set %s", key);
+                                goto out;
+                        }
+
+                        brick_count++;
+                        brick_order++;
+                }
+                snprintf (key, sizeof (key) - 1, "vol%"PRId64"_brickcount", i);
+                ret = dict_set_int64 (rsp_dict, key, brick_count);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set %s",
+                                key);
+                        goto out;
+                }
+        }
+
+        ret = dict_set_int64 (rsp_dict, "volcount", volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set volcount");
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (ret)
+                GF_FREE (tmpstr);
+
+        if (ret && err_str[0] != '\0') {
+                gf_log (this->name, loglevel, "%s", err_str);
+                *op_errstr = gf_strdup (err_str);
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Allocate and initialise an empty snapshot object.
+ *
+ * The object is zero-allocated, its lock and both list heads are
+ * initialised, the name is left empty and the status is set to
+ * GD_SNAP_STATUS_INIT.
+ *
+ * Returns the new object, or NULL if the allocation or the lock
+ * initialisation fails.
+ */
+glusterd_snap_t*
+glusterd_new_snap_object()
+{
+        glusterd_snap_t *snap = NULL;
+
+        snap = GF_CALLOC (1, sizeof (*snap), gf_gld_mt_snap_t);
+        if (!snap)
+                return NULL;
+
+        if (LOCK_INIT (&snap->lock)) {
+                gf_log (THIS->name, GF_LOG_ERROR, "Failed initiating"
+                        " snap lock");
+                GF_FREE (snap);
+                return NULL;
+        }
+
+        INIT_LIST_HEAD (&snap->snap_list);
+        INIT_LIST_HEAD (&snap->volumes);
+        snap->snapname[0] = 0;
+        snap->snap_status = GD_SNAP_STATUS_INIT;
+
+        return snap;
+}
+
+/* Link a snapshot volume into its snapshot and into its origin volume.
+ *
+ * @snap_vol is appended to the volume list of the snapshot it belongs to
+ * and, under the origin volume's lock, inserted into @origin_vol's list of
+ * snap volumes using glusterd_compare_snap_vol_time as the ordering
+ * function; the origin's snap_count is incremented.
+ *
+ * Returns 0 on success and -1 if either argument is NULL.
+ */
+int32_t
+glusterd_list_add_snapvol (glusterd_volinfo_t *origin_vol,
+                           glusterd_volinfo_t *snap_vol)
+{
+        int              ret   = -1;
+        glusterd_snap_t *owner = NULL;
+
+        GF_VALIDATE_OR_GOTO ("glusterd", origin_vol, out);
+        GF_VALIDATE_OR_GOTO ("glusterd", snap_vol, out);
+
+        owner = snap_vol->snapshot;
+        GF_ASSERT (owner);
+
+        /* Snapshot side: plain append */
+        list_add_tail (&snap_vol->vol_list, &owner->volumes);
+
+        /* Origin side: ordered insert and counter update under the lock */
+        LOCK (&origin_vol->lock);
+        {
+                list_add_order (&snap_vol->snapvol_list,
+                                &origin_vol->snap_volumes,
+                                glusterd_compare_snap_vol_time);
+                origin_vol->snap_count++;
+        }
+        UNLOCK (&origin_vol->lock);
+
+        gf_log (THIS->name, GF_LOG_DEBUG, "Snap %s added to the list",
+                owner->snapname);
+        ret = 0;
+out:
+        return ret;
+}
+
+/* Look up a snapshot by name in this glusterd's snapshot list.
+ *
+ * Returns the first snapshot whose snapname equals @snapname, or NULL when
+ * there is none.
+ */
+glusterd_snap_t*
+glusterd_find_snap_by_name (char *snapname)
+{
+        glusterd_snap_t *cursor = NULL;
+        glusterd_snap_t *match  = NULL;
+        glusterd_conf_t *priv   = NULL;
+
+        priv = THIS->private;
+        GF_ASSERT (priv);
+        GF_ASSERT (snapname);
+
+        list_for_each_entry (cursor, &priv->snapshots, snap_list) {
+                if (strcmp (cursor->snapname, snapname) != 0)
+                        continue;
+
+                gf_log (THIS->name, GF_LOG_DEBUG, "Found "
+                        "snap %s (%s)", cursor->snapname,
+                        uuid_utoa (cursor->snap_id));
+                match = cursor;
+                break;
+        }
+
+        return match;
+}
+
+/* Look up a snapshot by UUID in this glusterd's snapshot list.
+ *
+ * Returns the first snapshot whose snap_id equals @snap_id, or NULL when
+ * @snap_id is the null UUID or no snapshot matches.
+ */
+glusterd_snap_t*
+glusterd_find_snap_by_id (uuid_t snap_id)
+{
+        glusterd_snap_t *cursor = NULL;
+        glusterd_snap_t *match  = NULL;
+        glusterd_conf_t *priv   = NULL;
+
+        priv = THIS->private;
+        GF_ASSERT (priv);
+
+        /* A null id can never identify a snapshot */
+        if (uuid_is_null (snap_id))
+                return NULL;
+
+        list_for_each_entry (cursor, &priv->snapshots, snap_list) {
+                if (uuid_compare (cursor->snap_id, snap_id) != 0)
+                        continue;
+
+                gf_log (THIS->name, GF_LOG_DEBUG, "Found "
+                        "snap %s (%s)", cursor->snapname,
+                        uuid_utoa (cursor->snap_id));
+                match = cursor;
+                break;
+        }
+
+        return match;
+}
+
+/* Tear down the LVM snapshot backing one brick of a snapshot volume.
+ *
+ * Steps, in order:
+ *   1. if the brick process recorded in the brick's pidfile is running,
+ *      send it SIGKILL (a process that is already gone is not an error);
+ *   2. run "umount" on @mount_pt;
+ *   3. run LVM_REMOVE with "-f" on @snap_device.
+ *
+ * Returns 0 on success, -1 if @brickinfo is NULL, otherwise the non-zero
+ * result of the step that failed.
+ */
+int
+glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol,
+                                 glusterd_brickinfo_t *brickinfo,
+                                 const char *mount_pt, const char *snap_device)
+{
+        int              ret                     = -1;
+        xlator_t        *this                    = NULL;
+        glusterd_conf_t *priv                    = NULL;
+        runner_t         runner                  = {0,};
+        char             description[1024]       = {0, };
+        char             brick_pidfile[PATH_MAX] = {0, };
+        pid_t            brick_pid               = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (!brickinfo) {
+                gf_log (this->name, GF_LOG_ERROR, "brickinfo NULL");
+                goto out;
+        }
+        GF_ASSERT (snap_vol);
+        GF_ASSERT (mount_pt);
+        GF_ASSERT (snap_device);
+
+        /* Step 1: stop the brick process, if there is one */
+        GLUSTERD_GET_BRICK_PIDFILE (brick_pidfile, snap_vol, brickinfo, priv);
+        if (gf_is_service_running (brick_pidfile, &brick_pid)) {
+                ret = kill (brick_pid, SIGKILL);
+                if (ret && errno != ESRCH) {
+                        gf_log (this->name, GF_LOG_ERROR, "Unable to kill pid "
+                                "%d reason : %s", brick_pid, strerror(errno));
+                        goto out;
+                }
+        }
+
+        /* Step 2: unmount the snapshot brick.
+         *
+         * The big lock is deliberately not dropped around runner_run ().
+         * Dropping it is only needed when the spawned command is a
+         * glusterfs process that has to talk back to glusterd; umount and
+         * the lvm tools never do that.
+         */
+        runinit (&runner);
+        snprintf (description, sizeof (description),
+                  "umount the snapshot mounted path %s", mount_pt);
+        runner_add_args (&runner, "umount", mount_pt, NULL);
+        runner_log (&runner, "", GF_LOG_DEBUG, description);
+
+        ret = runner_run (&runner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "unmounting the "
+                        "path %s (brick: %s) failed (%s)", mount_pt,
+                        brickinfo->path, strerror (errno));
+                goto out;
+        }
+
+        /* Step 3: remove the LVM snapshot device */
+        runinit (&runner);
+        snprintf (description, sizeof (description),
+                  "remove snapshot of the brick %s:%s, "
+                  "device: %s", brickinfo->hostname, brickinfo->path,
+                  snap_device);
+        runner_add_args (&runner, LVM_REMOVE, "-f", snap_device, NULL);
+        runner_log (&runner, "", GF_LOG_DEBUG, description);
+
+        ret = runner_run (&runner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "removing snapshot of the "
+                        "brick (%s:%s) of device %s failed",
+                        brickinfo->hostname, brickinfo->path, snap_device);
+                goto out;
+        }
+
+out:
+        return ret;
+}
+
+/* Remove the backend LVM snapshots of every brick of @snap_vol that lives
+ * on this node.
+ *
+ * Bricks owned by other nodes are skipped. For a local brick whose
+ * snap_status is -1 no removal is attempted; when @rsp_dict is given and
+ * the volume is a snap volume, a missed-delete entry is recorded in
+ * @rsp_dict instead.
+ *
+ * @param rsp_dict response dictionary for missed-snap entries (may be NULL)
+ * @param snap_vol volume whose brick snapshots are removed
+ * @return 0 on success (including the "nothing to do" case), non-zero on
+ *         the first failure
+ */
+int32_t
+glusterd_lvm_snapshot_remove (dict_t *rsp_dict, glusterd_volinfo_t *snap_vol)
+{
+        xlator_t             *this       = NULL;
+        glusterd_brickinfo_t *brickinfo  = NULL;
+        struct mntent        *mnt_entry  = NULL;
+        FILE                 *mtab       = NULL;
+        char                 *brick_root = NULL;
+        int32_t               brick_num  = 0;   /* 1-based brick position */
+        int32_t               ret        = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (snap_vol);
+
+        if ((snap_vol->is_snap_volume == _gf_false) &&
+            (uuid_is_null (snap_vol->restored_from_snap))) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Not a snap volume, or a restored snap volume.");
+                ret = 0;
+                goto out;
+        }
+
+        list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
+                brick_num++;
+
+                if (uuid_compare (brickinfo->uuid, MY_UUID) != 0) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "%s:%s belongs to a different node",
+                                brickinfo->hostname, brickinfo->path);
+                        continue;
+                }
+
+                if (brickinfo->snap_status == -1) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "snapshot was pending. lvm not present "
+                                "for brick %s:%s of the snap %s.",
+                                brickinfo->hostname, brickinfo->path,
+                                snap_vol->snapshot->snapname);
+
+                        if ((rsp_dict == NULL) ||
+                            (snap_vol->is_snap_volume != _gf_true))
+                                continue;
+
+                        /* Adding missed delete to the dict */
+                        ret = glusterd_add_missed_snaps_to_dict
+                                        (rsp_dict, snap_vol, brickinfo,
+                                         brick_num,
+                                         GF_SNAP_OPTION_TYPE_DELETE);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to add missed snapshot "
+                                        "info for %s:%s in the "
+                                        "rsp_dict", brickinfo->hostname,
+                                        brickinfo->path);
+                                goto out;
+                        }
+                        continue;
+                }
+
+                /* NOTE(review): brick_root is filled in by
+                 * glusterd_get_brick_root and never released here; if that
+                 * helper allocates, this leaks once per brick - confirm. */
+                ret = glusterd_get_brick_root (brickinfo->path, &brick_root);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "getting the root "
+                                "of the brick for volume %s (snap %s) failed ",
+                                snap_vol->volname,
+                                snap_vol->snapshot->snapname);
+                        goto out;
+                }
+
+                /* NOTE(review): mtab is passed by value and never assigned
+                 * in this function, so the endmntent() below cannot run
+                 * from here; whatever stream the helper opens must be
+                 * closed elsewhere - confirm against the helper. */
+                mnt_entry = glusterd_get_mnt_entry_info (brick_root, mtab);
+                if (mnt_entry == NULL) {
+                        gf_log (this->name, GF_LOG_WARNING, "getting the mount"
+                                " entry for the brick %s:%s of the snap %s "
+                                "(volume: %s) failed", brickinfo->hostname,
+                                brickinfo->path, snap_vol->snapshot->snapname,
+                                snap_vol->volname);
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = glusterd_do_lvm_snapshot_remove (snap_vol, brickinfo,
+                                                       brick_root,
+                                                       mnt_entry->mnt_fsname);
+                if (mtab)
+                        endmntent (mtab);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "failed to "
+                                "remove the snapshot %s (%s)",
+                                brickinfo->path, mnt_entry->mnt_fsname);
+                        goto out;
+                }
+        }
+
+        ret = 0;
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+
+/* Remove one snap volume: stop its local bricks, optionally remove the
+ * backend LVM snapshots, delete it from the store, drop it from the parent
+ * volume's snap count and finally delete the in-memory volinfo.
+ *
+ * @param rsp_dict   response dictionary handed to the LVM removal step
+ * @param snap_vol   snap volume to remove
+ * @param remove_lvm when true the backend LVM snapshots are removed too
+ * @param force      when true a failing step is remembered but the
+ *                   remaining steps are still attempted
+ * @return 0 on success; otherwise the error of the step that aborted the
+ *         sequence, or (with @force) the last recorded failure
+ */
+int32_t
+glusterd_snap_volume_remove (dict_t *rsp_dict,
+                             glusterd_volinfo_t *snap_vol,
+                             gf_boolean_t remove_lvm,
+                             gf_boolean_t force)
+{
+        int                   ret        = -1;
+        int                   save_ret   = 0;
+        glusterd_brickinfo_t *brickinfo  = NULL;
+        glusterd_volinfo_t   *origin_vol = NULL;
+        xlator_t             *this       = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (snap_vol);
+
+        if (!snap_vol) {
+                gf_log(this->name, GF_LOG_WARNING, "snap_vol in NULL");
+                ret = -1;
+                goto out;
+        }
+
+        /* Stop only the bricks hosted on this node. */
+        list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
+                if (uuid_compare (brickinfo->uuid, MY_UUID))
+                        continue;
+
+                ret = glusterd_brick_stop (snap_vol, brickinfo, _gf_false);
+                if (ret) {
+                        gf_log(this->name, GF_LOG_WARNING, "Failed to stop "
+                               "brick for volume %s", snap_vol->volname);
+                        save_ret = ret;
+
+                        /* Continue to cleaning up the snap in case of error
+                           if force flag is enabled */
+                        if (!force)
+                                goto out;
+                }
+        }
+
+        /* Only remove the backend lvm when required */
+        if (remove_lvm) {
+                ret = glusterd_lvm_snapshot_remove (rsp_dict, snap_vol);
+                if (ret) {
+                        gf_log(this->name, GF_LOG_WARNING, "Failed to remove "
+                               "lvm snapshot volume %s", snap_vol->volname);
+                        save_ret = ret;
+                        if (!force)
+                                goto out;
+                }
+        }
+
+        ret = glusterd_store_delete_volume (snap_vol);
+        if (ret) {
+                gf_log(this->name, GF_LOG_WARNING, "Failed to remove volume %s "
+                       "from store", snap_vol->volname);
+                save_ret = ret;
+                if (!force)
+                        goto out;
+        }
+
+        if (!list_empty(&snap_vol->snapvol_list)) {
+                ret = glusterd_volinfo_find (snap_vol->parent_volname,
+                                             &origin_vol);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+                                "parent volinfo %s for volume %s",
+                                snap_vol->parent_volname, snap_vol->volname);
+                        save_ret = ret;
+                        if (!force)
+                                goto out;
+                } else {
+                        /* Only touch the parent when the lookup succeeded:
+                         * with force set and a failed lookup, origin_vol is
+                         * still NULL and must not be dereferenced. */
+                        origin_vol->snap_count--;
+                }
+        }
+
+        ret = glusterd_volinfo_delete (snap_vol);
+        if (ret) {
+                gf_log(this->name, GF_LOG_WARNING, "Failed to remove volinfo "
+                       "%s ", snap_vol->volname);
+                save_ret = ret;
+                if (!force)
+                        goto out;
+        }
+
+        /* With force, report an earlier failure even if later steps passed. */
+        if (save_ret)
+                ret = save_ret;
+out:
+        gf_log (this->name, GF_LOG_TRACE, "returning %d", ret);
+        return ret;
+}
+
+/* Unlink a snap object from the lists it is part of, destroy its lock and
+ * free it together with its description.
+ *
+ * @param snap snap object to delete; must not be used after a successful
+ *             return
+ * @return 0 on success, -1 when @snap is NULL
+ */
+int32_t
+glusterd_snapobject_delete (glusterd_snap_t *snap)
+{
+        if (snap == NULL) {
+                gf_log(THIS->name, GF_LOG_WARNING, "snap is NULL");
+                return -1;
+        }
+
+        list_del_init (&snap->snap_list);
+        list_del_init (&snap->volumes);
+        /* A failed lock destroy is only logged; the object is freed anyway.
+         * The message previously read "lockof snap" (missing space). */
+        if (LOCK_DESTROY(&snap->lock))
+                gf_log (THIS->name, GF_LOG_WARNING, "Failed destroying lock "
+                        "of snap %s", snap->snapname);
+
+        GF_FREE (snap->description);
+        GF_FREE (snap);
+
+        return 0;
+}
+
+/* Remove a whole snapshot: every snap volume attached to it, its store
+ * entry and finally the in-memory snap object.
+ *
+ * @param rsp_dict   response dictionary passed on to the volume removal
+ * @param snap       snapshot to remove
+ * @param remove_lvm when true the backend LVM snapshots are removed too
+ * @param force      when true a failing step is remembered but the
+ *                   remaining steps are still attempted
+ * @return 0 on success, non-zero otherwise
+ */
+int32_t
+glusterd_snap_remove (dict_t *rsp_dict,
+                      glusterd_snap_t *snap,
+                      gf_boolean_t remove_lvm,
+                      gf_boolean_t force)
+{
+        xlator_t           *this      = NULL;
+        glusterd_volinfo_t *vol       = NULL;
+        glusterd_volinfo_t *next_vol  = NULL;
+        int                 first_err = 0;
+        int                 ret       = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (snap);
+
+        if (snap == NULL) {
+                gf_log(this->name, GF_LOG_WARNING, "snap is NULL");
+                ret = -1;
+                goto out;
+        }
+
+        /* The _safe iterator is required: each volume is unlinked and
+         * freed while the list is being walked. */
+        list_for_each_entry_safe (vol, next_vol, &snap->volumes, vol_list) {
+                ret = glusterd_snap_volume_remove (rsp_dict, vol,
+                                                   remove_lvm, force);
+                if (ret == 0)
+                        continue;
+
+                gf_log(this->name, GF_LOG_WARNING, "Failed to remove "
+                       "volinfo %s for snap %s", vol->volname,
+                       snap->snapname);
+                first_err = ret;
+
+                /* Without force the first failure ends the cleanup. */
+                if (!force)
+                        goto out;
+        }
+
+        ret = glusterd_store_delete_snap (snap);
+        if (ret != 0) {
+                gf_log(this->name, GF_LOG_WARNING, "Failed to remove snap %s "
+                       "from store", snap->snapname);
+                first_err = ret;
+                if (!force)
+                        goto out;
+        }
+
+        ret = glusterd_snapobject_delete (snap);
+        if (ret != 0)
+                gf_log (this->name, GF_LOG_WARNING, "Failed to delete "
+                        "snap object %s", snap->snapname);
+
+        /* A recorded failure wins over a later success. */
+        if (first_err != 0)
+                ret = first_err;
+out:
+        gf_log (THIS->name, GF_LOG_TRACE, "returning %d", ret);
+        return ret;
+}
+
+/* Store the details of one snap volume in @dict, with every key prefixed
+ * by @keyprefix: "<prefix>.volname", ".vol-id", ".vol-status",
+ * ".snaps-available" and ".snapcount"; when @detail is non-zero also
+ * ".origin-volname".
+ *
+ * @param dict      dictionary receiving the key/value pairs
+ * @param snap_vol  snap volume being described
+ * @param keyprefix prefix for every key written
+ * @param detail    non-zero to also add the parent volume name
+ * @return 0 on success, non-zero on failure
+ */
+static int
+glusterd_snapshot_get_snapvol_detail (dict_t *dict,
+                                      glusterd_volinfo_t *snap_vol,
+                                      char *keyprefix, int detail)
+{
+        int                 ret           = -1;
+        int                 snap_limit    = 0;
+        char                key[PATH_MAX] = {0,};
+        char               *value         = NULL;
+        glusterd_volinfo_t *origin_vol    = NULL;
+        glusterd_conf_t    *conf          = NULL;
+        xlator_t           *this          = NULL;
+
+        this = THIS;
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        GF_ASSERT (dict);
+        GF_ASSERT (snap_vol);
+        GF_ASSERT (keyprefix);
+
+        /* Volume Name */
+        value = gf_strdup (snap_vol->volname);
+        if (!value)
+                goto out;
+
+        snprintf (key, sizeof (key), "%s.volname", keyprefix);
+        ret = dict_set_dynstr (dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+                        "volume name in dictionary: %s", key);
+                goto out;
+        }
+        /* Stored in the dict now; must not be freed at out. */
+        value = NULL;
+
+        /* Volume ID */
+        value = gf_strdup (uuid_utoa (snap_vol->volume_id));
+        if (NULL == value) {
+                ret = -1;
+                goto out;
+        }
+
+        snprintf (key, sizeof (key), "%s.vol-id", keyprefix);
+        ret = dict_set_dynstr (dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+                        "volume id in dictionary: %s", key);
+                goto out;
+        }
+        value = NULL;
+
+        /* volume status */
+        snprintf (key, sizeof (key), "%s.vol-status", keyprefix);
+        switch (snap_vol->status) {
+        case GLUSTERD_STATUS_STARTED:
+                ret = dict_set_str (dict, key, "Started");
+                break;
+        case GLUSTERD_STATUS_STOPPED:
+                ret = dict_set_str (dict, key, "Stopped");
+                break;
+        case GD_SNAP_STATUS_NONE:
+                ret = dict_set_str (dict, key, "None");
+                break;
+        default:
+                gf_log (this->name, GF_LOG_ERROR, "Invalid volume status");
+                ret = -1;
+                goto out;
+        }
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set volume status"
+                        " in dictionary: %s", key);
+                goto out;
+        }
+
+        ret = glusterd_volinfo_find (snap_vol->parent_volname, &origin_vol);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to get the parent "
+                        "volinfo for the volume %s", snap_vol->volname);
+                goto out;
+        }
+
+        /* Snaps available: the effective limit is the smaller of the
+         * system-wide and the per-volume hard limit. */
+        if (conf->snap_max_hard_limit < origin_vol->snap_max_hard_limit) {
+                snap_limit = conf->snap_max_hard_limit;
+                gf_log(this->name, GF_LOG_DEBUG, "system snap-max-hard-limit is"
+                       " lesser than volume snap-max-hard-limit, "
+                       "snap-max-hard-limit value is set to %d", snap_limit);
+        } else {
+                snap_limit = origin_vol->snap_max_hard_limit;
+                gf_log(this->name, GF_LOG_DEBUG, "volume snap-max-hard-limit is"
+                       " lesser than system snap-max-hard-limit, "
+                       "snap-max-hard-limit value is set to %d", snap_limit);
+        }
+
+        /* Never report a negative number of available snaps. */
+        snprintf (key, sizeof (key), "%s.snaps-available", keyprefix);
+        if (snap_limit > origin_vol->snap_count)
+                ret = dict_set_int32 (dict, key,
+                                      snap_limit - origin_vol->snap_count);
+        else
+                ret = dict_set_int32 (dict, key, 0);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set available snaps");
+                goto out;
+        }
+
+        snprintf (key, sizeof (key), "%s.snapcount", keyprefix);
+        ret = dict_set_int32 (dict, key, origin_vol->snap_count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Could not save snapcount");
+                goto out;
+        }
+
+        /* ret is 0 here, so the short form returns success. */
+        if (!detail)
+                goto out;
+
+        /* Parent volume name */
+        value = gf_strdup (snap_vol->parent_volname);
+        if (!value) {
+                /* ret is still 0 from the previous call; without this an
+                 * allocation failure would be reported as success. */
+                ret = -1;
+                goto out;
+        }
+
+        snprintf (key, sizeof (key), "%s.origin-volname", keyprefix);
+        ret = dict_set_dynstr (dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set parent "
+                        "volume name in dictionary: %s", key);
+                goto out;
+        }
+        value = NULL;
+
+        ret = 0;
+out:
+        if (value)
+                GF_FREE (value);
+
+        return ret;
+}
+
+/* Store the details of one snapshot in @dict, with every key prefixed by
+ * @keyprefix: "<prefix>.snapname", ".snap-id", ".snap-time", optionally
+ * ".snap-desc", ".snap-status", per-volume details under
+ * "<prefix>.vol<N>" and finally "<prefix>.vol-count".
+ *
+ * @param dict      dictionary receiving the key/value pairs
+ * @param snap      snapshot being described
+ * @param keyprefix prefix for every key written
+ * @param volinfo   when non-NULL only this volume is described (short
+ *                  form); when NULL every volume of @snap is described
+ *                  in detail
+ * @return 0 on success, non-zero on failure
+ */
+static int
+glusterd_snapshot_get_snap_detail (dict_t *dict, glusterd_snap_t *snap,
+                                   char *keyprefix, glusterd_volinfo_t *volinfo)
+{
+        int                 ret           = -1;
+        int                 volcount      = 0;
+        char                key[PATH_MAX] = {0,};
+        char               *value         = NULL;
+        char               *timestr       = NULL;
+        struct tm          *tmptr         = NULL;
+        glusterd_volinfo_t *snap_vol      = NULL;
+        glusterd_volinfo_t *tmp_vol       = NULL;
+        xlator_t           *this          = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (dict);
+        GF_ASSERT (snap);
+        GF_ASSERT (keyprefix);
+
+        /* Snap Name */
+        value = gf_strdup (snap->snapname);
+        if (!value)
+                goto out;
+
+        snprintf (key, sizeof (key), "%s.snapname", keyprefix);
+        ret = dict_set_dynstr (dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+                        "snap name in dictionary");
+                goto out;
+        }
+        /* Stored in the dict now; must not be freed at out. */
+        value = NULL;
+
+        /* Snap ID */
+        value = gf_strdup (uuid_utoa (snap->snap_id));
+        if (NULL == value) {
+                ret = -1;
+                goto out;
+        }
+
+        snprintf (key, sizeof (key), "%s.snap-id", keyprefix);
+        ret = dict_set_dynstr (dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+                        "snap id in dictionary");
+                goto out;
+        }
+        value = NULL;
+
+        /* Snap creation time, formatted as local time. */
+        tmptr = localtime (&(snap->time_stamp));
+        if (NULL == tmptr) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to convert "
+                        "time_t to *tm");
+                ret = -1;
+                goto out;
+        }
+
+        timestr = GF_CALLOC (1, PATH_MAX, gf_gld_mt_char);
+        if (NULL == timestr) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = strftime (timestr, PATH_MAX, "%Y-%m-%d %H:%M:%S", tmptr);
+        if (0 == ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to convert time_t "
+                        "to string");
+                ret = -1;
+                goto out;
+        }
+
+        snprintf (key, sizeof (key), "%s.snap-time", keyprefix);
+        ret = dict_set_dynstr (dict, key, timestr);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+                        "snap time stamp in dictionary");
+                goto out;
+        }
+        timestr = NULL;
+
+        /* If snap description is provided then add that into dictionary */
+        if (NULL != snap->description) {
+                value = gf_strdup (snap->description);
+                if (NULL == value) {
+                        ret = -1;
+                        goto out;
+                }
+
+                snprintf (key, sizeof (key), "%s.snap-desc", keyprefix);
+                ret = dict_set_dynstr (dict, key, value);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+                                "snap description in dictionary");
+                        goto out;
+                }
+                value = NULL;
+        }
+
+        snprintf (key, sizeof (key), "%s.snap-status", keyprefix);
+        switch (snap->snap_status) {
+        case GD_SNAP_STATUS_INIT:
+                ret = dict_set_str (dict, key, "Init");
+                break;
+        case GD_SNAP_STATUS_IN_USE:
+                ret = dict_set_str (dict, key, "In-use");
+                break;
+        case GD_SNAP_STATUS_DECOMMISSION:
+                ret = dict_set_str (dict, key, "Decommisioned");
+                break;
+        case GD_SNAP_STATUS_RESTORED:
+                ret = dict_set_str (dict, key, "Restored");
+                break;
+        case GD_SNAP_STATUS_NONE:
+                ret = dict_set_str (dict, key, "None");
+                break;
+        default:
+                gf_log (this->name, GF_LOG_ERROR, "Invalid snap status");
+                ret = -1;
+                goto out;
+        }
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set snap status "
+                        "in dictionary");
+                goto out;
+        }
+
+        if (volinfo) {
+                /* Short form: describe only the volume the caller gave. */
+                volcount = 1;
+                snprintf (key, sizeof (key), "%s.vol%d", keyprefix, volcount);
+                ret = glusterd_snapshot_get_snapvol_detail (dict,
+                                                        volinfo, key, 0);
+                if (ret) {
+                        /* snap_vol is still NULL in this branch; the
+                         * volume in question is volinfo. */
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "get volume detail %s for snap %s",
+                                volinfo->volname, snap->snapname);
+                        goto out;
+                }
+                goto done;
+        }
+
+        /* Long form: describe every volume of the snapshot in detail. */
+        list_for_each_entry_safe (snap_vol, tmp_vol, &snap->volumes, vol_list) {
+                volcount++;
+                snprintf (key, sizeof (key), "%s.vol%d", keyprefix, volcount);
+                ret = glusterd_snapshot_get_snapvol_detail (dict,
+                                                        snap_vol, key, 1);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "get volume detail %s for snap %s",
+                                snap_vol->volname, snap->snapname);
+                        goto out;
+                }
+        }
+
+done:
+        snprintf (key, sizeof (key), "%s.vol-count", keyprefix);
+        ret = dict_set_int32 (dict, key, volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set %s",
+                        key);
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (value)
+                GF_FREE (value);
+
+        if (timestr)
+                GF_FREE(timestr);
+
+        return ret;
+}
+
+/* Add the details of every known snapshot to @dict under the prefixes
+ * "snap1", "snap2", ... and record how many were added as "snap-count".
+ *
+ * @param dict dictionary receiving the snapshot details
+ * @return 0 on success, non-zero on failure
+ */
+static int
+glusterd_snapshot_get_all_snap_info (dict_t *dict)
+{
+        xlator_t        *this             = NULL;
+        glusterd_conf_t *conf             = NULL;
+        glusterd_snap_t *cur              = NULL;
+        glusterd_snap_t *next             = NULL;
+        char             prefix[PATH_MAX] = {0,};
+        int              nsnaps           = 0;
+        int              ret              = -1;
+
+        this = THIS;
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        /* General parameter validation */
+        GF_ASSERT (dict);
+
+        list_for_each_entry_safe (cur, next, &conf->snapshots, snap_list) {
+                nsnaps++;
+                snprintf (prefix, sizeof (prefix), "snap%d", nsnaps);
+
+                ret = glusterd_snapshot_get_snap_detail (dict, cur, prefix,
+                                                         NULL);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+                                "snapdetail for snap %s", cur->snapname);
+                        return ret;
+                }
+        }
+
+        ret = dict_set_int32 (dict, "snap-count", nsnaps);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+                return ret;
+        }
+
+        return 0;
+}
+
+/* Fill @dict with the snapshot information of one origin volume:
+ * "snaps-available", "origin-volname", the details of each of its snaps
+ * under "snap1", "snap2", ... and the total as "snap-count".
+ *
+ * @param dict    dictionary receiving the information
+ * @param volname name of the origin volume
+ * @param err_str buffer receiving a message when the volume is unknown
+ * @param len     size of @err_str
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_get_info_by_volume (dict_t *dict, char *volname,
+                                      char *err_str, size_t len)
+{
+        int                 ret           = -1;
+        int                 snapcount     = 0;
+        int                 snap_limit    = 0;
+        char               *value         = NULL;
+        char                key[PATH_MAX] = "";
+        glusterd_volinfo_t *volinfo       = NULL;
+        glusterd_volinfo_t *snap_vol      = NULL;
+        glusterd_volinfo_t *tmp_vol       = NULL;
+        glusterd_conf_t    *conf          = NULL;
+        xlator_t           *this          = NULL;
+
+        this = THIS;
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        GF_ASSERT (dict);
+        GF_ASSERT (volname);
+
+        ret = glusterd_volinfo_find (volname, &volinfo);
+        if (ret) {
+                snprintf (err_str, len, "Volume (%s) does not exist", volname);
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+                goto out;
+        }
+
+        /* Snaps available: the effective limit is the smaller of the
+         * system-wide and the per-volume hard limit. */
+        if (conf->snap_max_hard_limit < volinfo->snap_max_hard_limit) {
+                snap_limit = conf->snap_max_hard_limit;
+                gf_log(this->name, GF_LOG_DEBUG, "system snap-max-hard-limit is"
+                       " lesser than volume snap-max-hard-limit, "
+                       "snap-max-hard-limit value is set to %d", snap_limit);
+        } else {
+                snap_limit = volinfo->snap_max_hard_limit;
+                gf_log(this->name, GF_LOG_DEBUG, "volume snap-max-hard-limit is"
+                       " lesser than system snap-max-hard-limit, "
+                       "snap-max-hard-limit value is set to %d", snap_limit);
+        }
+
+        /* Never report a negative number of available snaps. */
+        if (snap_limit > volinfo->snap_count)
+                ret = dict_set_int32 (dict, "snaps-available",
+                                      snap_limit - volinfo->snap_count);
+        else
+                ret = dict_set_int32 (dict, "snaps-available", 0);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set available snaps");
+                goto out;
+        }
+
+        /* Origin volume name */
+        value = gf_strdup (volinfo->volname);
+        if (!value) {
+                /* ret is still 0 from the previous call; without this an
+                 * allocation failure would be reported as success. */
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_dynstr (dict, "origin-volname", value);
+        if (ret) {
+                /* key is still empty at this point, so name the dictionary
+                 * key explicitly in the message. */
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set parent "
+                        "volume name in dictionary: %s", "origin-volname");
+                goto out;
+        }
+        value = NULL;
+
+        list_for_each_entry_safe (snap_vol, tmp_vol, &volinfo->snap_volumes,
+                                  snapvol_list) {
+                snapcount++;
+                snprintf (key, sizeof (key), "snap%d", snapcount);
+                ret = glusterd_snapshot_get_snap_detail (dict,
+                                                         snap_vol->snapshot,
+                                                         key, snap_vol);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+                                "snapdetail for snap %s",
+                                snap_vol->snapshot->snapname);
+                        goto out;
+                }
+        }
+        ret = dict_set_int32 (dict, "snap-count", snapcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (value)
+                GF_FREE (value);
+
+        return ret;
+}
+
+/* This function will be called from RPC handler routine.
+ * This function is responsible for getting the requested
+ * snapshot info into the dictionary.
+ *
+ * The "cmd" key of @dict selects the request: info of all snaps, of one
+ * snap (key "snapname") or of all snaps of one volume (key "volname").
+ * Any other value is rejected.
+ *
+ * @param req     RPC request object. Required for sending a response back.
+ * @param op      glusterd operation. Required for sending a response back.
+ * @param dict    pointer to dictionary which will contain both
+ *                request and response key-pair values.
+ * @param err_str buffer receiving a message when the snap or volume does
+ *                not exist
+ * @param len     size of @err_str
+ * @return -1 on error and 0 on success
+ */
+int
+glusterd_handle_snapshot_info (rpcsvc_request_t *req, glusterd_op_t op,
+                               dict_t *dict, char *err_str, size_t len)
+{
+        int              ret         = -1;
+        int8_t           snap_driven = 1;
+        char            *volname     = NULL;
+        char            *snapname    = NULL;
+        glusterd_snap_t *snap        = NULL;
+        xlator_t        *this        = NULL;
+        int32_t          cmd         = GF_SNAP_INFO_TYPE_ALL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        GF_VALIDATE_OR_GOTO (this->name, req, out);
+        GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+        ret = dict_get_int32 (dict, "cmd", &cmd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get type "
+                        "of snapshot info");
+                goto out;
+        }
+
+        switch (cmd) {
+        case GF_SNAP_INFO_TYPE_ALL:
+        {
+                ret = glusterd_snapshot_get_all_snap_info (dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get info of all snaps");
+                        goto out;
+                }
+                break;
+        }
+
+        case GF_SNAP_INFO_TYPE_SNAP:
+        {
+                ret = dict_get_str (dict, "snapname", &snapname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get snap name");
+                        goto out;
+                }
+
+                ret = dict_set_int32 (dict, "snap-count", 1);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to set snapcount");
+                        goto out;
+                }
+
+                snap = glusterd_find_snap_by_name (snapname);
+                if (!snap) {
+                        snprintf (err_str, len,
+                                  "Snap (%s) does not exist", snapname);
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "%s", err_str);
+                        ret = -1;
+                        goto out;
+                }
+                ret = glusterd_snapshot_get_snap_detail (dict, snap,
+                                                        "snap1", NULL);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get snap detail of snap "
+                                "%s", snap->snapname);
+                        goto out;
+                }
+                break;
+        }
+
+        case GF_SNAP_INFO_TYPE_VOL:
+        {
+                ret = dict_get_str (dict, "volname", &volname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get volname");
+                        goto out;
+                }
+                ret = glusterd_snapshot_get_info_by_volume (dict,
+                                                volname, err_str, len);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get volume info of volume "
+                                "%s", volname);
+                        goto out;
+                }
+                /* The response is organised per volume, not per snap. */
+                snap_driven = 0;
+                break;
+        }
+
+        default:
+        {
+                /* Reject unknown request types instead of answering with
+                 * an empty but successful response. */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unknown snapshot info type %d", cmd);
+                ret = -1;
+                goto out;
+        }
+        }
+
+        ret = dict_set_int8 (dict, "snap-driven", snap_driven);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set snap-driven");
+                goto out;
+        }
+
+        /* If everything is successful then send the response back to cli.
+         * In case of failure the caller of this function will take care
+           of the response */
+        ret = glusterd_op_send_cli_response (op, 0, 0, req, dict, err_str);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to send cli "
+                        "response");
+                goto out;
+        }
+
+        ret = 0;
+
+out:
+        return ret;
+}
+
+/* Store the name of every known snapshot in @dict as "snapname1",
+ * "snapname2", ... and the number of names as "snap-count".
+ *
+ * @param dict dictionary receiving the names
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_get_all_snapnames (dict_t *dict)
+{
+        xlator_t        *this          = NULL;
+        glusterd_conf_t *conf          = NULL;
+        glusterd_snap_t *cur           = NULL;
+        glusterd_snap_t *next          = NULL;
+        char            *name_copy     = NULL;
+        char             key[PATH_MAX] = {0,};
+        int              nsnaps        = 0;
+        int              ret           = -1;
+
+        this = THIS;
+        conf = this->private;
+        GF_ASSERT (conf);
+        GF_ASSERT (dict);
+
+        list_for_each_entry_safe (cur, next, &conf->snapshots, snap_list) {
+                nsnaps++;
+
+                name_copy = gf_strdup (cur->snapname);
+                if (name_copy == NULL) {
+                        gf_log (this->name, GF_LOG_ERROR, "strdup failed");
+                        return -1;
+                }
+
+                snprintf (key, sizeof (key), "snapname%d", nsnaps);
+                ret = dict_set_dynstr (dict, key, name_copy);
+                if (ret != 0) {
+                        /* The copy was not stored; release it here. */
+                        GF_FREE (name_copy);
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set %s",
+                                key);
+                        return ret;
+                }
+        }
+
+        ret = dict_set_int32 (dict, "snap-count", nsnaps);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+                return ret;
+        }
+
+        return 0;
+}
+
+/* Store the names of all snapshots taken of @volinfo in @dict as
+ * "snapname1", "snapname2", ... and the number of names as "snap-count".
+ *
+ * @param dict    dictionary receiving the names
+ * @param volinfo origin volume whose snapshots are listed
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_snapshot_get_vol_snapnames (dict_t *dict, glusterd_volinfo_t *volinfo)
+{
+        xlator_t           *this          = NULL;
+        glusterd_volinfo_t *cur           = NULL;
+        glusterd_volinfo_t *next          = NULL;
+        char               *name_copy     = NULL;
+        char                key[PATH_MAX] = {0,};
+        int                 nsnaps        = 0;
+        int                 ret           = -1;
+
+        this = THIS;
+        GF_ASSERT (dict);
+        GF_ASSERT (volinfo);
+
+        list_for_each_entry_safe (cur, next,
+                                  &volinfo->snap_volumes, snapvol_list) {
+                nsnaps++;
+
+                name_copy = gf_strdup (cur->snapshot->snapname);
+                if (name_copy == NULL) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "strdup failed");
+                        return -1;
+                }
+
+                snprintf (key, sizeof (key), "snapname%d", nsnaps);
+                ret = dict_set_dynstr (dict, key, name_copy);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "set %s", key);
+                        /* The copy was not stored; release it here. */
+                        GF_FREE (name_copy);
+                        return ret;
+                }
+        }
+
+        ret = dict_set_int32 (dict, "snap-count", nsnaps);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+                return ret;
+        }
+
+        return 0;
+}
+
+/* Handle a snapshot list request: put either all snapshot names or, when
+ * @dict carries a "volname", the snapshot names of that volume into @dict
+ * and send the response to the cli.
+ *
+ * @param req     RPC request object, needed to send the response
+ * @param op      glusterd operation, needed to send the response
+ * @param dict    request dictionary; also receives the response values
+ * @param err_str buffer receiving a message when the volume is unknown
+ * @param len     size of @err_str
+ * @return 0 on success, non-zero on failure
+ */
+int
+glusterd_handle_snapshot_list (rpcsvc_request_t *req, glusterd_op_t op,
+                               dict_t *dict, char *err_str, size_t len)
+{
+        xlator_t           *this    = NULL;
+        glusterd_volinfo_t *volinfo = NULL;
+        char               *volname = NULL;
+        int                 ret     = -1;
+
+        this = THIS;
+
+        GF_VALIDATE_OR_GOTO (this->name, req, out);
+        GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+        /* Ignore error for getting volname as it is optional */
+        ret = dict_get_str (dict, "volname", &volname);
+
+        if (volname != NULL) {
+                /* Restrict the listing to the named volume. */
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (ret != 0) {
+                        snprintf (err_str, len,
+                                  "Volume (%s) does not exist", volname);
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "%s", err_str);
+                        goto out;
+                }
+
+                ret = glusterd_snapshot_get_vol_snapnames (dict, volinfo);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get snapshot list for volume %s",
+                                volname);
+                        goto out;
+                }
+        } else {
+                ret = glusterd_snapshot_get_all_snapnames (dict);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get snapshot list");
+                        goto out;
+                }
+        }
+
+        /* On success the response goes back to the cli from here; on
+           failure the caller of this function sends the response. */
+        ret = glusterd_op_send_cli_response (op, 0, 0, req, dict, err_str);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to send cli "
+                        "response");
+                goto out;
+        }
+
+        ret = 0;
+
+out:
+        return ret;
+}
+
+/* This is a snapshot create handler function. This function will be
+ * executed in the originator node. This function is responsible for
+ * calling mgmt_v3 framework to do the actual snap creation on all the bricks
+ *
+ * Before the phases are started the request dictionary is extended with a
+ * "snap-id", a "snap-time" and, for every volume i in 1..volcount, the
+ * keys "volume<i>_username", "volume<i>_password" and "vol<i>_volid".
+ *
+ * @param req           RPC request object
+ * @param op            gluster operation
+ * @param dict          dictionary containing snapshot create request
+ * @param err_str       In case of an err this string should be populated
+ * @param len           length of err_str buffer
+ *
+ * @return              Negative value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_create (rpcsvc_request_t *req, glusterd_op_t op,
+                                 dict_t *dict, char *err_str, size_t len)
+{
+        int           ret           = -1;
+        char         *volname       = NULL;
+        char         *snapname      = NULL;
+        int64_t       volcount      = 0;
+        xlator_t     *this          = NULL;
+        char          key[PATH_MAX] = "";
+        char         *username      = NULL;
+        char         *password      = NULL;
+        uuid_t       *uuid_ptr      = NULL;
+        uuid_t        tmp_uuid      = {0};
+        int           i             = 0;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (req);
+        GF_ASSERT (dict);
+        GF_ASSERT (err_str);
+
+        ret = dict_get_int64 (dict, "volcount", &volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to "
+                        "get the volume count");
+                goto out;
+        }
+        if (volcount <= 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Invalid volume count %"PRId64
+                        " supplied", volcount);
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to get the snapname");
+                goto out;
+        }
+
+        if (strlen(snapname) >= GLUSTERD_MAX_SNAP_NAME) {
+                snprintf (err_str, len, "snapname cannot exceed 255 "
+                          "characters");
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+                ret = -1;
+                goto out;
+        }
+
+        /* Snapshot id: heap allocated because the dict keeps the pointer. */
+        uuid_ptr = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+        if (!uuid_ptr) {
+                gf_log (this->name, GF_LOG_ERROR, "Out Of Memory");
+                ret = -1;
+                goto out;
+        }
+
+        uuid_generate (*uuid_ptr);
+        ret = dict_set_bin (dict, "snap-id", uuid_ptr, sizeof(uuid_t));
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to set snap-id");
+                GF_FREE (uuid_ptr);
+                goto out;
+        }
+        uuid_ptr = NULL;
+
+        ret = dict_set_int64 (dict, "snap-time", (int64_t)time(NULL));
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to set snap-time");
+                goto out;
+        }
+
+        for (i = 1; i <= volcount; i++) {
+                snprintf (key, sizeof (key), "volname%d", i);
+                ret = dict_get_str (dict, key, &volname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get volume name");
+                        goto out;
+                }
+
+                /* generate internal username and password for the snap*/
+                uuid_generate (tmp_uuid);
+                username = gf_strdup (uuid_utoa (tmp_uuid));
+                if (!username) {
+                        /* Fail explicitly instead of handing NULL on. */
+                        gf_log (this->name, GF_LOG_ERROR, "Out Of Memory");
+                        ret = -1;
+                        goto out;
+                }
+                snprintf (key, sizeof(key), "volume%d_username", i);
+                ret = dict_set_dynstr (dict, key, username);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set snap "
+                                "username for volume %s", volname);
+                        GF_FREE (username);
+                        goto out;
+                }
+                username = NULL;
+
+                uuid_generate (tmp_uuid);
+                password = gf_strdup (uuid_utoa (tmp_uuid));
+                if (!password) {
+                        gf_log (this->name, GF_LOG_ERROR, "Out Of Memory");
+                        ret = -1;
+                        goto out;
+                }
+                snprintf (key, sizeof(key), "volume%d_password", i);
+                ret = dict_set_dynstr (dict, key, password);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set snap "
+                                "password for volume %s", volname);
+                        GF_FREE (password);
+                        goto out;
+                }
+                password = NULL;
+
+                /* Volume id of the snap volume to be created. */
+                uuid_ptr = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+                if (!uuid_ptr) {
+                        gf_log (this->name, GF_LOG_ERROR, "Out Of Memory");
+                        ret = -1;
+                        goto out;
+                }
+
+                snprintf (key, sizeof(key) - 1, "vol%d_volid", i);
+                uuid_generate (*uuid_ptr);
+                ret = dict_set_bin (dict, key, uuid_ptr, sizeof(uuid_t));
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to set snap_volid");
+                        GF_FREE (uuid_ptr);
+                        goto out;
+                }
+                uuid_ptr = NULL;
+        }
+
+        ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to initiate snap "
+                        "phases");
+        }
+
+out:
+        return ret;
+}
+
+/* This is a snapshot status handler function. This function will be
+ * executed in a originator node. This function is responsible for
+ * calling mgmt v3 framework to get the actual snapshot status from
+ * all the bricks
+ *
+ * The "cmd" key of @dict selects the request: status of all snaps, of one
+ * snap (key "snapname") or of all snaps of one volume (key "volname").
+ * For the "all" and "volume" forms the snap names are stored in @dict as
+ * "snapname1", "snapname2", ... together with "snapcount".
+ *
+ * @param req           RPC request object
+ * @param op            gluster operation
+ * @param dict          dictionary containing snapshot status request
+ * @param err_str       In case of an err this string should be populated
+ * @param len           length of err_str buffer
+ *
+ * return :  0  in case of success.
+ *          -1  in case of failure.
+ *
+ */
+int
+glusterd_handle_snapshot_status (rpcsvc_request_t *req, glusterd_op_t op,
+                                 dict_t *dict, char *err_str, size_t len)
+{
+        int                 ret           = -1;
+        char               *volname       = NULL;
+        char               *snapname      = NULL;
+        char               *buf           = NULL;
+        glusterd_conf_t    *conf          = NULL;
+        xlator_t           *this          = NULL;
+        int32_t             cmd           = -1;
+        int                 i             = 0;
+        char                key[PATH_MAX] = "";
+        glusterd_volinfo_t *volinfo       = NULL;
+        glusterd_snap_t    *snap          = NULL;
+        glusterd_volinfo_t *snap_volinfo  = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+
+        GF_ASSERT (conf);
+        GF_ASSERT (req);
+        GF_ASSERT (dict);
+        GF_ASSERT (err_str);
+
+        ret = dict_get_int32 (dict, "cmd", &cmd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Could not get status type");
+                goto out;
+        }
+        switch (cmd) {
+        case GF_SNAP_STATUS_TYPE_ALL:
+        {
+                /* IF we give "gluster snapshot status"
+                 * then lock is held on all snaps.
+                 * This is the place where necessary information
+                 * (snapname and snapcount)is populated in dictionary
+                 * for locking.
+                 */
+                i = 1;  /* keys are numbered from 1 */
+                list_for_each_entry (snap, &conf->snapshots, snap_list)
+                {
+                        snprintf (key, sizeof (key), "snapname%d", i);
+                        buf = gf_strdup (snap->snapname);
+                        if (!buf) {
+                                ret = -1;
+                                goto out;
+                        }
+                        ret = dict_set_dynstr (dict, key, buf);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Could not save snapname (%s) "
+                                        "in the dictionary",
+                                        snap->snapname);
+                                GF_FREE (buf);
+                                goto out;
+                        }
+
+                        buf = NULL;
+                        i++;
+                }
+
+                /* i is one past the last key written. */
+                ret = dict_set_int32 (dict, "snapcount", i - 1);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Could not "
+                                "save snapcount in the dictionary");
+                        goto out;
+                }
+                break;
+        }
+
+        case GF_SNAP_STATUS_TYPE_SNAP:
+        {
+                /* IF we give "gluster snapshot status <snapname>"
+                 * then lock is held on single snap.
+                 * This is the place where necessary information
+                 * (snapname)is populated in dictionary
+                 * for locking.
+                 */
+                ret = dict_get_str (dict, "snapname", &snapname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to fetch snap name");
+                        goto out;
+                }
+
+                snap = glusterd_find_snap_by_name (snapname);
+                if (!snap) {
+                        /* The message previously read "Snap (x)does not
+                         * exist" (missing space). */
+                        snprintf (err_str, len, "Snap (%s) "
+                                  "does not exist", snapname);
+                        gf_log(this->name, GF_LOG_ERROR,
+                                "%s", err_str);
+                        ret = -1;
+                        goto out;
+                }
+                break;
+        }
+        case GF_SNAP_STATUS_TYPE_VOL:
+                ret = dict_get_str (dict, "volname", &volname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to fetch volname");
+                        goto out;
+                }
+
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (ret) {
+                        snprintf (err_str, len, "Volume (%s) "
+                                  "does not exist", volname);
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "%s", err_str);
+                        goto out;
+                }
+
+                i = 1;  /* keys are numbered from 1 */
+                list_for_each_entry (snap_volinfo,
+                                &volinfo->snap_volumes, snapvol_list) {
+                        snprintf (key, sizeof (key), "snapname%d", i);
+
+                        buf = gf_strdup
+                                (snap_volinfo->snapshot->snapname);
+                        if (!buf) {
+                                ret = -1;
+                                goto out;
+                        }
+
+                        ret = dict_set_dynstr (dict, key, buf);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Could not save snapname");
+                                GF_FREE (buf);
+                                goto out;
+                        }
+
+                        buf = NULL;
+                        i++;
+                }
+
+                ret = dict_set_int32 (dict, "snapcount", i-1);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Could not save snapcount");
+                        goto out;
+                }
+                break;
+        default:
+        {
+                gf_log (this->name, GF_LOG_ERROR, "Unknown type");
+                ret = -1;
+                goto out;
+        }
+        }
+
+        /* Volume lock is not necessary for snapshot status, hence
+         * turning it off
+         */
+        ret = dict_set_int8 (dict, "hold_vol_locks", 0);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Setting volume lock "
+                        "flag failed");
+                goto out;
+        }
+
+        ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to initiate "
+                        "snap phases");
+                goto out;
+        }
+
+        ret = 0;
+
+out:
+        return ret;
+}
+
+
+/* Originator-side handler for a snapshot restore request.
+ *
+ * Looks up the snapshot named by the "snapname" key of @dict, stores the
+ * parent volume name of every volume of that snapshot in @dict under
+ * "volname1" .. "volname<N>" together with the total under "volcount",
+ * and then hands the request to glusterd_mgmt_v3_initiate_snap_phases().
+ *
+ * @param req      RPC request object
+ * @param op       gluster operation
+ * @param dict     dictionary containing the snapshot restore request
+ * @param err_str  buffer filled with a message when the snap is not found
+ * @param len      length of the err_str buffer
+ *
+ * @return 0 on success; -1 or the failing callee's return value otherwise
+ */
+int
+glusterd_handle_snapshot_restore (rpcsvc_request_t *req, glusterd_op_t op,
+                                  dict_t *dict, char *err_str, size_t len)
+{
+        xlator_t            *this          = NULL;
+        glusterd_conf_t     *conf          = NULL;
+        glusterd_snap_t     *snap          = NULL;
+        glusterd_volinfo_t  *vol           = NULL;
+        char                *snapname      = NULL;
+        char                *parent_name   = NULL;
+        char                 key[PATH_MAX] = "";
+        int32_t              volcount      = 0;
+        int                  ret           = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+
+        GF_ASSERT (conf);
+        GF_ASSERT (req);
+        GF_ASSERT (dict);
+        GF_ASSERT (err_str);
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get snapname");
+                return ret;
+        }
+
+        snap = glusterd_find_snap_by_name (snapname);
+        if (snap == NULL) {
+                snprintf (err_str, len, "Snap (%s) does not exist", snapname);
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+                return -1;
+        }
+
+        /* Keys are numbered starting from 1. */
+        list_for_each_entry (vol, &snap->volumes, vol_list) {
+                volcount++;
+                snprintf (key, sizeof (key), "volname%d", volcount);
+
+                parent_name = gf_strdup (vol->parent_volname);
+                if (parent_name == NULL)
+                        return -1;
+
+                ret = dict_set_dynstr (dict, key, parent_name);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Could not set parent volume name %s in the "
+                                "dict", vol->parent_volname);
+                        /* The copy is only released here when the dict
+                         * did not take it. */
+                        GF_FREE (parent_name);
+                        return ret;
+                }
+                parent_name = NULL;
+        }
+
+        ret = dict_set_int32 (dict, "volcount", volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not save volume count");
+                return ret;
+        }
+
+        ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to initiate snap phases");
+                return ret;
+        }
+
+        return 0;
+}
+
+/* Create the snapshot object for a snapshot create request, persist it and
+ * link it into the list of snapshots kept in the glusterd configuration.
+ *
+ * Reads from @dict:
+ *   "snapname"     (mandatory) name of the snapshot
+ *   "snap-id"      (mandatory) uuid of the snapshot
+ *   "snap-time"    (mandatory) creation time, must be > 0
+ *   "description"  (optional)  free text, duplicated into the object
+ *
+ * The request is rejected when the name does not fit into the snap object's
+ * name field, or when a snapshot with the same name or the same id already
+ * exists.
+ *
+ * @param dict      dictionary carrying the create request
+ * @param rsp_dict  response dictionary, passed to glusterd_snap_remove()
+ *                  when a partially set up object has to be undone
+ *
+ * @return the new snap object on success, NULL on any failure
+ */
+glusterd_snap_t*
+glusterd_create_snap_object (dict_t *dict, dict_t *rsp_dict)
+{
+        char              *snapname    = NULL;
+        uuid_t            *snap_id     = NULL;
+        char              *description = NULL;
+        glusterd_snap_t   *snap        = NULL;
+        xlator_t          *this        = NULL;
+        glusterd_conf_t   *priv        = NULL;
+        int                ret         = -1;
+        int64_t            time_stamp  = 0;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+
+        /* Fetch snapname, description, id and time from dict */
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snapname");
+                goto out;
+        }
+
+        /* The name is copied into a fixed-size field of the snap object
+         * below; refuse names that would overflow it. sizeof does not
+         * evaluate its operand, so snap being NULL here is fine, and since
+         * snap is still NULL nothing is torn down at "out".
+         */
+        if (strlen (snapname) >= sizeof (snap->snapname)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "snapname %s is too long", snapname);
+                ret = -1;
+                goto out;
+        }
+
+        /* Ignore ret value for description*/
+        ret = dict_get_str (dict, "description", &description);
+
+        ret = dict_get_bin (dict, "snap-id", (void **)&snap_id);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snap_id");
+                goto out;
+        }
+
+        ret = dict_get_int64 (dict, "snap-time", &time_stamp);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snap-time");
+                goto out;
+        }
+        if (time_stamp <= 0) {
+                ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "Invalid time-stamp: %"PRId64,
+                        time_stamp);
+                goto out;
+        }
+
+        /* Reject duplicates by name or by id. ret is 0 when entering the
+         * loop and only becomes non-zero if a duplicate is found.
+         */
+        list_for_each_entry (snap, &priv->snapshots, snap_list) {
+                if (!strcmp (snap->snapname, snapname) ||
+                    !uuid_compare (snap->snap_id, *snap_id)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Found duplicate snap %s (%s)",
+                                snap->snapname, uuid_utoa (snap->snap_id));
+                        ret = -1;
+                        break;
+                }
+        }
+        if (ret) {
+                /* snap points at the existing snapshot; it must not be
+                 * removed by the cleanup at "out". */
+                snap = NULL;
+                goto out;
+        }
+
+        snap = glusterd_new_snap_object ();
+        if (!snap) {
+                gf_log (this->name, GF_LOG_ERROR, "Could not create "
+                        "the snap object for snap %s", snapname);
+                ret = -1;
+                goto out;
+        }
+
+        /* Length was validated above, so this never truncates. */
+        snprintf (snap->snapname, sizeof (snap->snapname), "%s", snapname);
+        uuid_copy (snap->snap_id, *snap_id);
+        snap->time_stamp = (time_t)time_stamp;
+        /* Set the status as GD_SNAP_STATUS_INIT and once the backend snapshot
+           is taken and snap is really ready to use, set the status to
+           GD_SNAP_STATUS_IN_USE. This helps in identifying the incomplete
+           snapshots and cleaning them up.
+        */
+        snap->snap_status = GD_SNAP_STATUS_INIT;
+        if (description) {
+                snap->description = gf_strdup (description);
+                if (snap->description == NULL) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Saving the Snap Description Failed");
+                        ret = -1;
+                        goto out;
+                }
+        }
+
+        ret = glusterd_store_snap (snap);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "Could not store snap"
+                        "object %s", snap->snapname);
+                goto out;
+        }
+
+        list_add_order (&snap->snap_list, &priv->snapshots,
+                        glusterd_compare_snap_time);
+
+        gf_log (this->name, GF_LOG_TRACE, "Snap %s added to the list",
+                snap->snapname);
+
+        ret = 0;
+
+out:
+        if (ret) {
+                /* Undo a partially set up object. */
+                if (snap)
+                        glusterd_snap_remove (rsp_dict, snap,
+                                              _gf_true, _gf_true);
+                snap = NULL;
+        }
+
+        return snap;
+}
+
+/* Append one missed-snapshot record for @brickinfo to @rsp_dict.
+ *
+ * The record has the form
+ *   <brick-node-uuid>:<snap-uuid>=<snap-volname>:<brick#>:<brick-path>:<op>:<status>
+ * with the status set to GD_MISSED_SNAP_PENDING. It is stored under the key
+ * "missed_snaps_<n>", where <n> is the current value of "missed_snap_count"
+ * in @rsp_dict (0 if that key is absent); the counter is then incremented
+ * and written back.
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_add_missed_snaps_to_dict (dict_t *rsp_dict,
+                                   glusterd_volinfo_t *snap_vol,
+                                   glusterd_brickinfo_t *brickinfo,
+                                   int32_t brick_number, int32_t op)
+{
+        xlator_t  *this            = NULL;
+        char      *snap_uuid       = NULL;
+        char       entry[PATH_MAX] = "";
+        char       key[PATH_MAX]   = "";
+        int32_t    count           = -1;
+        int32_t    ret             = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (snap_vol);
+        GF_ASSERT (brickinfo);
+
+        /* uuid_utoa() is called a second time while formatting the entry,
+         * so keep a private copy of the snap uuid string first. */
+        snap_uuid = gf_strdup (uuid_utoa (snap_vol->snapshot->snap_id));
+        if (snap_uuid == NULL) {
+                ret = -1;
+                goto out;
+        }
+
+        snprintf (entry, sizeof (entry), "%s:%s=%s:%d:%s:%d:%d",
+                  uuid_utoa (brickinfo->uuid), snap_uuid, snap_vol->volname,
+                  brick_number, brickinfo->path, op, GD_MISSED_SNAP_PENDING);
+
+        /* A missing counter means this is the first entry. */
+        if (dict_get_int32 (rsp_dict, "missed_snap_count", &count))
+                count = 0;
+
+        snprintf (key, sizeof (key), "missed_snaps_%d", count);
+        ret = dict_set_dynstr_with_alloc (rsp_dict, key, entry);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set missed_snap_entry (%s) in the "
+                        "rsp_dict.", entry);
+                goto out;
+        }
+
+        count++;
+        ret = dict_set_int32 (rsp_dict, "missed_snap_count", count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set missed_snap_count for %s in the "
+                        "rsp_dict.", entry);
+                goto out;
+        }
+
+out:
+        if (snap_uuid != NULL)
+                GF_FREE (snap_uuid);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Build the device path of a snapshot logical volume.
+ *
+ * Runs "/sbin/lvs --noheadings -o vg_name <device>" to learn the volume
+ * group of @device and returns "/dev/<volume-group>/<snapname>".
+ *
+ * @param device    origin device whose volume group is queried
+ * @param snapname  name of the snapshot logical volume
+ *
+ * @return a gf_strdup()'ed path, or NULL on failure
+ */
+char *
+glusterd_build_snap_device_path (char *device, char *snapname)
+{
+        xlator_t  *this               = NULL;
+        char      *line               = NULL;
+        char      *snap_device        = NULL;
+        char       vg_name[PATH_MAX]  = "";
+        char       path[PATH_MAX]     = "";
+        char       msg[1024]          = "";
+        runner_t   runner             = {0,};
+        int        ret                = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        if (device == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "device is NULL");
+                return NULL;
+        }
+        if (snapname == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "snapname is NULL");
+                return NULL;
+        }
+
+        /* Ask lvs for the volume group and read its stdout via a pipe. */
+        runinit (&runner);
+        runner_add_args (&runner, "/sbin/lvs", "--noheadings", "-o", "vg_name",
+                         device, NULL);
+        runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+        snprintf (msg, sizeof (msg), "Get volume group for device %s", device);
+        runner_log (&runner, this->name, GF_LOG_DEBUG, msg);
+
+        ret = runner_start (&runner);
+        if (ret == -1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get volume group for device %s", device);
+                runner_end (&runner);
+                return NULL;
+        }
+
+        line = fgets (vg_name, sizeof (vg_name),
+                      runner_chio (&runner, STDOUT_FILENO));
+        if (line == NULL || vg_name[0] == '\0') {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get volume group for snap %s", snapname);
+                runner_end (&runner);
+                ret = -1;
+                return NULL;
+        }
+        runner_end (&runner);
+
+        /* The lvs output is passed through gf_trim() before use. */
+        snprintf (path, sizeof (path), "/dev/%s/%s", gf_trim (vg_name),
+                  snapname);
+
+        snap_device = gf_strdup (path);
+        if (snap_device == NULL) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Cannot copy the snapshot device name for "
+                        "snapname: %s)", snapname);
+        }
+
+        return snap_device;
+}
+
+/* Take an LVM snapshot of the device backing @brickinfo.
+ *
+ * Steps:
+ *   1. resolve the brick's device via glusterd_get_brick_mount_details();
+ *   2. scan the output of "lvcreate --help" for "setactivationskip" to
+ *      find out whether that option can be passed;
+ *   3. run "lvcreate -s <device> [--setactivationskip n] --name <volname>"
+ *      where <volname> is snap_vol->volname;
+ *   4. build the snapshot device path with
+ *      glusterd_build_snap_device_path().
+ *
+ * @param snap_vol   snapshot volume; its volname is used as the LV name
+ * @param brickinfo  brick of the origin volume to snapshot
+ *
+ * @return the snapshot device path (allocated by
+ *         glusterd_build_snap_device_path()), or NULL on failure
+ */
+char *
+glusterd_take_lvm_snapshot (glusterd_volinfo_t *snap_vol,
+                            glusterd_brickinfo_t *brickinfo)
+{
+        char             msg[NAME_MAX]    = "";
+        char             buf[PATH_MAX]    = "";
+        char            *snap_device      = NULL;
+        char            *ptr              = NULL;
+        char            *device           = NULL;
+        int              ret              = -1;
+        gf_boolean_t     match            = _gf_false;
+        runner_t         runner           = {0,};
+        xlator_t        *this             = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        if (!brickinfo) {
+                gf_log (this->name, GF_LOG_ERROR, "brickinfo NULL");
+                goto out;
+        }
+
+        /* snap_vol is dereferenced below for the LV name. */
+        if (!snap_vol) {
+                gf_log (this->name, GF_LOG_ERROR, "snap_vol NULL");
+                goto out;
+        }
+
+        /* NOTE(review): device is never released in this function; confirm
+         * whether glusterd_get_brick_mount_details() returns allocated
+         * memory that the caller is expected to free.
+         */
+        device = glusterd_get_brick_mount_details (brickinfo);
+        if (!device) {
+                gf_log (this->name, GF_LOG_ERROR, "getting device name for "
+                        "the brick %s:%s failed", brickinfo->hostname,
+                        brickinfo->path);
+                goto out;
+        }
+
+        /* Figuring out if setactivationskip flag is supported or not */
+        runinit (&runner);
+        snprintf (msg, sizeof (msg), "running lvcreate help");
+        runner_add_args (&runner, LVM_CREATE, "--help", NULL);
+        runner_log (&runner, "", GF_LOG_DEBUG, msg);
+        runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+        ret = runner_start (&runner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to run lvcreate help");
+                runner_end (&runner);
+                goto out;
+        }
+
+        /* Looking for setactivationskip in lvcreate --help */
+        do {
+                ptr = fgets (buf, sizeof (buf),
+                             runner_chio (&runner, STDOUT_FILENO));
+                if (ptr) {
+                        if (strstr (buf, "setactivationskip")) {
+                                match = _gf_true;
+                                break;
+                        }
+                }
+        } while (ptr != NULL);
+        runner_end (&runner);
+
+        /* Taking the actual snapshot */
+        runinit (&runner);
+        snprintf (msg, sizeof (msg), "taking snapshot of the brick %s:%s",
+                  brickinfo->hostname, brickinfo->path);
+        if (match == _gf_true)
+                runner_add_args (&runner, LVM_CREATE, "-s", device,
+                                 "--setactivationskip", "n", "--name",
+                                 snap_vol->volname, NULL);
+        else
+                runner_add_args (&runner, LVM_CREATE, "-s", device,
+                                 "--name", snap_vol->volname, NULL);
+        runner_log (&runner, "", GF_LOG_DEBUG, msg);
+        ret = runner_start (&runner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "taking snapshot of the "
+                        "brick (%s:%s) of device %s failed",
+                        brickinfo->hostname, brickinfo->path, device);
+                runner_end (&runner);
+                goto out;
+        }
+
+        /* runner_start() succeeding only means the command was launched;
+         * the outcome of lvcreate itself is reported by runner_end(). Do
+         * not hand out a device path for a snapshot that was not created.
+         */
+        ret = runner_end (&runner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "taking snapshot of the "
+                        "brick (%s:%s) of device %s failed",
+                        brickinfo->hostname, brickinfo->path, device);
+                goto out;
+        }
+
+        snap_device = glusterd_build_snap_device_path (device,
+                                                       snap_vol->volname);
+        if (!snap_device) {
+                gf_log (this->name, GF_LOG_WARNING, "Cannot copy the snapshot "
+                        "device name for snap %s (volume id: %s)",
+                        snap_vol->snapshot->snapname, snap_vol->volname);
+                ret = -1;
+                goto out;
+        }
+
+out:
+        return snap_device;
+}
+
+/* Create and mount the brick directory of one snapshot brick.
+ *
+ * The mount point is
+ *   <snap_mount_folder>/<snap volname>/brick<brick_count + 1>
+ * and the brick path is that mount point followed by @snap_brick_dir.
+ * After mounting @device there, the brick path is stat()ed and its
+ * GF_XATTR_VOL_ID_KEY extended attribute is replaced with the snapshot
+ * volume's id. On any failure an unmount of the mount point is attempted.
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_snap_brick_create (char *device, glusterd_volinfo_t *snap_volinfo,
+                            glusterd_brickinfo_t *original_brickinfo,
+                            int32_t brick_count, char *snap_brick_dir)
+{
+        xlator_t         *this                 = NULL;
+        glusterd_conf_t  *priv                 = NULL;
+        char              mount_dir[PATH_MAX]  = "";
+        char              brick_path[PATH_MAX] = "";
+        struct stat       st                   = {0, };
+        int32_t           ret                  = -1;
+
+        this = THIS;
+        priv = this->private;
+
+        GF_ASSERT (device);
+        GF_ASSERT (snap_volinfo);
+        GF_ASSERT (original_brickinfo);
+        GF_ASSERT (snap_brick_dir);
+
+        snprintf (mount_dir, sizeof (mount_dir), "%s/%s/brick%d",
+                  snap_mount_folder, snap_volinfo->volname, brick_count + 1);
+
+        snprintf (brick_path, sizeof (brick_path), "%s%s", mount_dir,
+                  snap_brick_dir);
+
+        ret = mkdir_p (mount_dir, 0777, _gf_true);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "creating the brick directory %s for the snapshot "
+                        "%s(device: %s) failed", mount_dir,
+                        snap_volinfo->volname, device);
+                goto out;
+        }
+
+        /* The snapshot device is mounted through the runner-based helper
+         * for now; a direct equivalent would be
+         *   mount (device, mount_dir, entry->mnt_type, MS_MGC_VAL, "nouuid");
+         */
+        ret = glusterd_mount_lvm_snapshot (device, mount_dir);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to mount lvm snapshot.");
+                goto out;
+        }
+
+        ret = stat (brick_path, &st);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "stat of the brick %s(brick mount: %s) failed (%s)",
+                        brick_path, mount_dir, strerror (errno));
+                goto out;
+        }
+
+        /* XATTR_REPLACE: the attribute is expected to exist already. */
+        ret = sys_lsetxattr (brick_path, GF_XATTR_VOL_ID_KEY,
+                             snap_volinfo->volume_id, 16, XATTR_REPLACE);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to set extended attribute %s on %s. Reason: "
+                        "%s, snap: %s", GF_XATTR_VOL_ID_KEY, brick_path,
+                        strerror (errno), snap_volinfo->volname);
+                goto out;
+        }
+
+out:
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "unmounting the snap brick mount %s", mount_dir);
+#if !defined(GF_DARWIN_HOST_OS)
+                umount (mount_dir);
+#else
+                unmount (mount_dir, 0);
+#endif
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Fill in @snap_brickinfo for one brick of the origin volume and append it
+ * to the brick list of @snap_vol.
+ *
+ * The snap brick directory is read from @dict under
+ * "vol<volcount>.brickdir<brick_count>" and stored in *@snap_brick_dir. If
+ * it is absent, or the local origin brick is not running, the brick is
+ * marked with snap_status -1 and keeps the origin brick's path; in that
+ * situation a missed-snapshot entry may be added to @rsp_dict. The snap
+ * device is read from "vol<volcount>.brick_snapdevice<brick_count>".
+ *
+ * @return 0 on success, non-zero on failure
+ */
+static int32_t
+glusterd_add_bricks_to_snap_volume (dict_t *dict, dict_t *rsp_dict,
+                                    glusterd_volinfo_t *snap_vol,
+                                    glusterd_brickinfo_t *original_brickinfo,
+                                    glusterd_brickinfo_t *snap_brickinfo,
+                                    char **snap_brick_dir, int64_t volcount,
+                                    int32_t brick_count)
+{
+        xlator_t      *this                 = NULL;
+        char          *device               = NULL;
+        char           dict_key[PATH_MAX]   = "";
+        char           brick_path[PATH_MAX] = "";
+        gf_boolean_t   record_missed        = _gf_false;
+        int32_t        ret                  = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (snap_vol);
+        GF_ASSERT (original_brickinfo);
+        GF_ASSERT (snap_brickinfo);
+        GF_ASSERT (snap_brick_dir);
+
+        snprintf (dict_key, sizeof (dict_key) - 1,
+                  "vol%"PRId64".brickdir%d", volcount, brick_count);
+        ret = dict_get_ptr (dict, dict_key, (void **)snap_brick_dir);
+        if (!ret) {
+                /* Brick path layout:
+                 * <snap_mount_folder>/<snap volname>/brick<N><snap-brick-dir>
+                 */
+                snprintf (brick_path, sizeof (brick_path), "%s/%s/brick%d%s",
+                          snap_mount_folder, snap_vol->volname,
+                          brick_count + 1, *snap_brick_dir);
+        } else {
+                /* Pending snapshot: keep the origin brick's path so the
+                 * missed snapshot can be mapped back to it later.
+                 */
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Unable to fetch snap mount path (%s). Using "
+                        "original brickinfo", dict_key);
+                snap_brickinfo->snap_status = -1;
+                strcpy (brick_path, original_brickinfo->path);
+
+                /* Only the originator records snaps missed on other
+                 * nodes. */
+                if (is_origin_glusterd (dict) == _gf_true)
+                        record_missed = _gf_true;
+        }
+
+        /* A local brick may have gone down after prevalidate. */
+        if ((snap_brickinfo->snap_status != -1) &&
+            (!uuid_compare (original_brickinfo->uuid, MY_UUID)) &&
+            (!glusterd_is_brick_started (original_brickinfo))) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "brick %s:%s is not started (snap: %s)",
+                        original_brickinfo->hostname,
+                        original_brickinfo->path,
+                        snap_vol->snapshot->snapname);
+
+                snap_brickinfo->snap_status = -1;
+                strcpy (brick_path, original_brickinfo->path);
+                record_missed = _gf_true;
+        }
+
+        if (record_missed) {
+                ret = glusterd_add_missed_snaps_to_dict
+                                        (rsp_dict, snap_vol,
+                                         original_brickinfo, brick_count + 1,
+                                         GF_SNAP_OPTION_TYPE_CREATE);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to add missed snapshot info for "
+                                "%s:%s in the rsp_dict",
+                                original_brickinfo->hostname,
+                                original_brickinfo->path);
+                        goto out;
+                }
+        }
+
+        snprintf (dict_key, sizeof (dict_key),
+                  "vol%"PRId64".brick_snapdevice%d", volcount, brick_count);
+        ret = dict_get_ptr (dict, dict_key, (void **)&device);
+        if (!ret) {
+                strcpy (snap_brickinfo->device_path, device);
+        } else {
+                /* A missing device is not fatal here; device_path is left
+                 * untouched and ret is overwritten just below.
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to fetch snap device (%s). Leaving empty",
+                        dict_key);
+        }
+
+        ret = gf_canonicalize_path (brick_path);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to canonicalize path");
+                goto out;
+        }
+
+        strcpy (snap_brickinfo->hostname, original_brickinfo->hostname);
+        strcpy (snap_brickinfo->path, brick_path);
+        uuid_copy (snap_brickinfo->uuid, original_brickinfo->uuid);
+        /* The snap brick keeps the origin brick's brick_id because AFR
+         * changelog names are derived from it. */
+        strcpy (snap_brickinfo->brick_id, original_brickinfo->brick_id);
+        list_add_tail (&snap_brickinfo->brick_list, &snap_vol->bricks);
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Snapshot one local brick: take the LVM snapshot of the origin brick's
+ * device and then create/mount the snapshot brick on top of it.
+ *
+ * A failure on a single brick fails this call; the comment in the original
+ * code leaves the overall decision to a later quorum check of the snapshot
+ * volume.
+ *
+ * @return 0 on success, non-zero on failure
+ */
+static int32_t
+glusterd_take_brick_snapshot (glusterd_volinfo_t *origin_vol,
+                              glusterd_volinfo_t *snap_vol, dict_t *rsp_dict,
+                              glusterd_brickinfo_t *original_brickinfo,
+                              glusterd_brickinfo_t *snap_brickinfo,
+                              char *snap_brick_dir, int32_t brick_count)
+{
+        xlator_t  *this        = NULL;
+        char      *snap_device = NULL;
+        int32_t    ret         = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (origin_vol);
+        GF_ASSERT (snap_vol);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (original_brickinfo);
+        GF_ASSERT (snap_brickinfo);
+        GF_ASSERT (snap_brick_dir);
+
+        snap_device = glusterd_take_lvm_snapshot (snap_vol,
+                                                  original_brickinfo);
+        if (snap_device == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to take snapshot of %s:%s",
+                        original_brickinfo->hostname,
+                        original_brickinfo->path);
+                goto out;
+        }
+
+        /* Mount the snapshot device and prepare the brick directory. */
+        ret = glusterd_snap_brick_create (snap_device, snap_vol,
+                                          original_brickinfo, brick_count,
+                                          snap_brick_dir);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "not able to create the brickinfo for the snap %s, "
+                        "volume %s", snap_vol->snapshot->snapname,
+                        origin_vol->volname);
+                goto out;
+        }
+
+out:
+        /* The device path string is owned by this function. */
+        if (snap_device != NULL)
+                GF_FREE (snap_device);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Create the snapshot volume of @origin_vol for snapshot @snap.
+ *
+ * Outline:
+ *   - read "volume<N>_username", "volume<N>_password" and "vol<N>_volid"
+ *     (N = @volcount) from @dict;
+ *   - duplicate the origin volinfo and turn the copy into a snap volume
+ *     (name derived from the volume id, parent volume name, credentials);
+ *   - add a brickinfo per origin brick and snapshot the local bricks that
+ *     are not marked with snap_status -1;
+ *   - store the volinfo, generate brick and client volfiles, register the
+ *     snap volume with the origin volume, start the local bricks and store
+ *     the volinfo again with status GLUSTERD_STATUS_STARTED.
+ *
+ * On failure a snap volume that was already created is removed again.
+ *
+ * @return the new snap volinfo on success, NULL on failure
+ */
+glusterd_volinfo_t *
+glusterd_do_snap_vol (glusterd_volinfo_t *origin_vol, glusterd_snap_t *snap,
+                      dict_t *dict, dict_t *rsp_dict, int64_t volcount)
+{
+        xlator_t              *this          = NULL;
+        glusterd_conf_t       *priv          = NULL;
+        glusterd_volinfo_t    *snap_vol      = NULL;
+        glusterd_brickinfo_t  *brick         = NULL;
+        glusterd_brickinfo_t  *new_brick     = NULL;
+        uuid_t                *volid         = NULL;
+        char                  *user          = NULL;
+        char                  *passwd        = NULL;
+        char                  *brick_dir     = NULL;
+        char                   key[PATH_MAX] = "";
+        int32_t                brick_idx     = 0;
+        int32_t                ret           = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+        GF_ASSERT (dict);
+        GF_ASSERT (origin_vol);
+        GF_ASSERT (rsp_dict);
+
+        /* Credentials and volume id come from the request dict. */
+        snprintf (key, sizeof (key), "volume%"PRId64"_username", volcount);
+        ret = dict_get_str (dict, key, &user);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get %s for snap %s", key, snap->snapname);
+                goto out;
+        }
+
+        snprintf (key, sizeof (key), "volume%"PRId64"_password", volcount);
+        ret = dict_get_str (dict, key, &passwd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get %s for snap %s", key, snap->snapname);
+                goto out;
+        }
+
+        snprintf (key, sizeof (key) - 1, "vol%"PRId64"_volid", volcount);
+        ret = dict_get_bin (dict, key, (void **)&volid);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to fetch snap_volid");
+                goto out;
+        }
+
+        /* The duplicate is requested without credentials (_gf_false); the
+         * ones from the dict are applied below.
+         */
+        ret = glusterd_volinfo_dup (origin_vol, &snap_vol, _gf_false);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to duplicate volinfo for the snapshot %s",
+                        snap->snapname);
+                goto out;
+        }
+
+        /* The hyphen-less uuid doubles as the lvm snapshot name, so user
+         * supplied snapshot names are not constrained by lvm. */
+        GLUSTERD_GET_UUID_NOHYPHEN (snap_vol->volname, *volid);
+        uuid_copy (snap_vol->volume_id, *volid);
+        snap_vol->is_snap_volume = _gf_true;
+        strcpy (snap_vol->parent_volname, origin_vol->volname);
+        snap_vol->snapshot = snap;
+
+        glusterd_auth_set_username (snap_vol, user);
+        glusterd_auth_set_password (snap_vol, passwd);
+
+        /* One snap brickinfo per origin brick. */
+        brick_idx = 0;
+        list_for_each_entry (brick, &origin_vol->bricks, brick_list) {
+                new_brick = NULL;
+
+                ret = glusterd_brickinfo_new (&new_brick);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "initializing the brick for the snap volume "
+                                "failed (snapname: %s)", snap->snapname);
+                        goto out;
+                }
+
+                ret = glusterd_add_bricks_to_snap_volume (dict, rsp_dict,
+                                                          snap_vol, brick,
+                                                          new_brick,
+                                                          &brick_dir,
+                                                          volcount,
+                                                          brick_idx);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to add the snap brick for %s:%s to "
+                                "the snap volume", brick->hostname,
+                                brick->path);
+                        GF_FREE (new_brick);
+                        goto out;
+                }
+
+                /* Only local bricks that are not flagged as missed get an
+                 * actual backend snapshot. */
+                if (!uuid_compare (brick->uuid, MY_UUID) &&
+                    (new_brick->snap_status != -1)) {
+                        ret = glusterd_take_brick_snapshot (origin_vol,
+                                                            snap_vol,
+                                                            rsp_dict, brick,
+                                                            new_brick,
+                                                            brick_dir,
+                                                            brick_idx);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to take snapshot for %s:%s",
+                                        brick->hostname, brick->path);
+                                goto out;
+                        }
+                }
+
+                brick_idx++;
+        }
+
+        /*TODO: the quorum check of the snap volume here */
+
+        ret = glusterd_store_volinfo (snap_vol,
+                                      GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to store snapshot volinfo (%s) for snap %s",
+                        snap_vol->volname, snap->snapname);
+                goto out;
+        }
+
+        ret = generate_brick_volfiles (snap_vol);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "generating the brick volfiles for the snap %s "
+                        "(volume: %s) failed", snap->snapname,
+                        origin_vol->volname);
+                goto out;
+        }
+
+        ret = generate_client_volfiles (snap_vol, GF_CLIENT_TRUSTED);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "generating the trusted client volfiles for the "
+                        "snap %s (volume: %s) failed", snap->snapname,
+                        origin_vol->volname);
+                goto out;
+        }
+
+        ret = generate_client_volfiles (snap_vol, GF_CLIENT_OTHER);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "generating the client volfiles for the snap %s "
+                        "(volume: %s) failed", snap->snapname,
+                        origin_vol->volname);
+                goto out;
+        }
+
+        ret = glusterd_list_add_snapvol (origin_vol, snap_vol);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not add the snap volume %s to the list",
+                        snap_vol->volname);
+                goto out;
+        }
+
+        /* Start the local snap bricks, skipping the missed ones. */
+        list_for_each_entry (brick, &snap_vol->bricks, brick_list) {
+                if (uuid_compare (brick->uuid, MY_UUID))
+                        continue;
+
+                if (brick->snap_status == -1) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "not starting snap brick %s:%s for for the "
+                                "snap %s (volume: %s)", brick->hostname,
+                                brick->path, snap->snapname,
+                                origin_vol->volname);
+                        continue;
+                }
+
+                ret = glusterd_brick_start (snap_vol, brick, _gf_true);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "starting the brick %s:%s for the snap %s "
+                                "(volume: %s) failed", brick->hostname,
+                                brick->path, snap->snapname,
+                                origin_vol->volname);
+                        goto out;
+                }
+        }
+
+        snap_vol->status = GLUSTERD_STATUS_STARTED;
+        ret = glusterd_store_volinfo (snap_vol,
+                                      GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to store snap volinfo");
+                goto out;
+        }
+
+out:
+        if (ret) {
+                /* Roll back whatever part of the snap volume exists. */
+                if (snap_vol != NULL)
+                        glusterd_snap_volume_remove (rsp_dict, snap_vol,
+                                                     _gf_true, _gf_true);
+                snap_vol = NULL;
+        }
+
+        return snap_vol;
+}
+
+/* Originator-side handler for a snapshot remove request.
+ *
+ * Looks up the snapshot named by "snapname" in @dict, stores the parent
+ * volume name of each of its volumes in @dict as "volname1" ..
+ * "volname<N>" plus the total as "volcount" (these are the names used for
+ * taking the mgmt_v3 lock), and then calls
+ * glusterd_mgmt_v3_initiate_snap_phases().
+ *
+ * @param req      RPC request object
+ * @param op       gluster operation
+ * @param dict     dictionary containing the snapshot remove request
+ * @param err_str  buffer filled with a message when the snap is not found
+ * @param len      length of the err_str buffer
+ *
+ * @return 0 on success; -1 or the failing callee's return value otherwise
+ */
+int
+glusterd_handle_snapshot_remove (rpcsvc_request_t *req, glusterd_op_t op,
+                                 dict_t *dict, char *err_str, size_t len)
+{
+        xlator_t            *this          = NULL;
+        glusterd_snap_t     *snap          = NULL;
+        glusterd_volinfo_t  *vol           = NULL;
+        glusterd_volinfo_t  *next          = NULL;
+        char                *snapname      = NULL;
+        char                *parent_name   = NULL;
+        char                 key[PATH_MAX] = "";
+        int64_t              nvols         = 0;
+        int                  ret           = -1;
+
+        this = THIS;
+
+        GF_ASSERT (req);
+        GF_ASSERT (dict);
+        GF_ASSERT (err_str);
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get snapname");
+                return ret;
+        }
+
+        snap = glusterd_find_snap_by_name (snapname);
+        if (snap == NULL) {
+                snprintf (err_str, len, "Snap (%s) does not exist", snapname);
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+                return -1;
+        }
+
+        /* Publish the parent volume names; keys are numbered from 1. */
+        list_for_each_entry_safe (vol, next, &snap->volumes, vol_list) {
+                nvols++;
+
+                parent_name = gf_strdup (vol->parent_volname);
+                if (parent_name == NULL) {
+                        gf_log (this->name, GF_LOG_ERROR, "strdup failed");
+                        return -1;
+                }
+
+                snprintf (key, sizeof (key), "volname%"PRId64, nvols);
+                ret = dict_set_dynstr (dict, key, parent_name);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to set volume name in dictionary");
+                        GF_FREE (parent_name);
+                        return ret;
+                }
+                parent_name = NULL;
+        }
+
+        ret = dict_set_int64 (dict, "volcount", nvols);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set volcount");
+                return ret;
+        }
+
+        ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to initiate snap phases");
+                return ret;
+        }
+
+        return 0;
+}
+
+/* Pre-validation for snapshot remove: succeeds only when @dict carries a
+ * "snapname" that names an existing snapshot.
+ *
+ * @param dict       request dictionary
+ * @param op_errstr  error string out-parameter (only checked for NULL here)
+ * @param rsp_dict   response dictionary (not used here)
+ *
+ * @return 0 when the snapshot exists, non-zero otherwise
+ */
+int
+glusterd_snapshot_remove_prevalidate (dict_t *dict, char **op_errstr,
+                                      dict_t *rsp_dict)
+{
+        xlator_t  *this = NULL;
+        char      *name = NULL;
+        int32_t    ret  = -1;
+
+        this = THIS;
+
+        if (dict == NULL || op_errstr == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "input parameters NULL");
+                return ret;
+        }
+
+        ret = dict_get_str (dict, "snapname", &name);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Getting the snap name failed");
+                return ret;
+        }
+
+        if (glusterd_find_snap_by_name (name) == NULL) {
+                gf_log (this->name, GF_LOG_ERROR, "Snap %s does not exist",
+                        name);
+                return -1;
+        }
+
+        return 0;
+}
+
+/* Pre-validation for snapshot status.
+ *
+ * Dispatches on the "cmd" value in @dict:
+ *   GF_SNAP_STATUS_TYPE_ALL   nothing to validate;
+ *   GF_SNAP_STATUS_TYPE_SNAP  "snapname" must name an existing snapshot;
+ *   GF_SNAP_STATUS_TYPE_VOL   "volname" must name an existing volume;
+ *   anything else             rejected as an invalid command.
+ *
+ * @param dict       request dictionary
+ * @param op_errstr  receives an allocated message when the snapshot or the
+ *                   volume is not found
+ * @param rsp_dict   response dictionary (not used here)
+ *
+ * @return 0 when the request is valid, non-zero otherwise
+ */
+int
+glusterd_snapshot_status_prevalidate (dict_t *dict, char **op_errstr,
+                                      dict_t *rsp_dict)
+{
+        int                     ret             =       -1;
+        char                   *snapname        =       NULL;
+        glusterd_conf_t        *conf            =       NULL;
+        xlator_t               *this            =       NULL;
+        int32_t                 cmd             =       -1;
+        glusterd_volinfo_t     *volinfo         =       NULL;
+        char                   *volname         =       NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+
+        GF_ASSERT (conf);
+        GF_ASSERT (op_errstr);
+        if (!dict) {
+                gf_log (this->name, GF_LOG_ERROR, "Input dict is NULL");
+                goto out;
+        }
+
+        ret = dict_get_int32 (dict, "cmd", &cmd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not fetch status cmd");
+                goto out;
+        }
+
+        switch (cmd) {
+                case GF_SNAP_STATUS_TYPE_ALL:
+                {
+                        break;
+                }
+                case GF_SNAP_STATUS_TYPE_SNAP:
+                {
+                        ret = dict_get_str (dict, "snapname", &snapname);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Could not fetch snapname");
+                                goto out;
+                        }
+
+                        if (!glusterd_find_snap_by_name (snapname)) {
+                                ret = gf_asprintf (op_errstr, "Snap (%s) "
+                                                  "not found", snapname);
+                                if (ret < 0) {
+                                        goto out;
+                                }
+                                ret = -1;
+                                gf_log (this->name, GF_LOG_ERROR, "Snap (%s) "
+                                        "not found", snapname);
+                                goto out;
+                        }
+                        break;
+                }
+                case GF_SNAP_STATUS_TYPE_VOL:
+                {
+                        ret = dict_get_str (dict, "volname", &volname);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Could not fetch volname");
+                                goto out;
+                        }
+
+                        ret = glusterd_volinfo_find (volname, &volinfo);
+                        if (ret) {
+                                /* Space added between the two literals so
+                                 * the message reads "Volume (x) not found".
+                                 */
+                                ret = gf_asprintf (op_errstr, "Volume (%s) "
+                                                  "not found", volname);
+                                if (ret < 0) {
+                                        goto out;
+                                }
+                                ret = -1;
+                                gf_log (this->name, GF_LOG_ERROR, "Volume "
+                                        "%s not present", volname);
+                                goto out;
+                        }
+                        break;
+
+                }
+                default:
+                {
+                        /* An unknown status type must fail validation
+                         * instead of falling through to the success path.
+                         */
+                        gf_log (this->name, GF_LOG_ERROR, "Invalid command");
+                        ret = -1;
+                        goto out;
+                }
+        }
+        ret = 0;
+
+out:
+        return ret;
+}
+
+/* Commit phase of snapshot remove.
+ *
+ * Looks up the snapshot named by "snapname" in @dict. On the originator
+ * node it first asks glusterd_find_missed_snap() to record, in @rsp_dict,
+ * deletes missed by peers (using the snapshot's first volume). It then
+ * removes the snapshot with glusterd_snap_remove() and stores a copy of
+ * the snapshot name in @rsp_dict under "snapname".
+ *
+ * @param dict       request dictionary
+ * @param op_errstr  error string out-parameter (only checked for NULL here)
+ * @param rsp_dict   response dictionary
+ *
+ * @return 0 on success, non-zero on failure
+ */
+int32_t
+glusterd_snapshot_remove_commit (dict_t *dict, char **op_errstr,
+                                 dict_t *rsp_dict)
+{
+        int32_t                 ret             = -1;
+        char                   *snapname        = NULL;
+        char                   *dup_snapname    = NULL;
+        glusterd_snap_t        *snap            = NULL;
+        glusterd_conf_t        *priv            = NULL;
+        glusterd_volinfo_t     *snap_volinfo    = NULL;
+        xlator_t               *this            = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (op_errstr);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (!dict || !op_errstr) {
+                gf_log (this->name, GF_LOG_ERROR, "input parameters NULL");
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Getting the snap name "
+                        "failed");
+                goto out;
+        }
+
+        snap = glusterd_find_snap_by_name (snapname);
+        if (!snap) {
+                gf_log (this->name, GF_LOG_ERROR, "Snap %s does not exist",
+                        snapname);
+                ret = -1;
+                goto out;
+        }
+
+        if (is_origin_glusterd (dict) == _gf_true) {
+                /* list_entry() on the list head never yields NULL; with an
+                 * empty list it would produce a pointer computed from the
+                 * head itself. Detect the empty list explicitly.
+                 */
+                if (snap->volumes.next == &snap->volumes) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to fetch snap_volinfo");
+                        ret = -1;
+                        goto out;
+                }
+
+                /* TODO : As of now there is only volume in snapshot.
+                 * Change this when multiple volume snapshot is introduced
+                 */
+                snap_volinfo = list_entry (snap->volumes.next,
+                                           glusterd_volinfo_t,
+                                           vol_list);
+
+                /* From origin glusterd check if      *
+                 * any peers with snap bricks is down */
+                ret = glusterd_find_missed_snap (rsp_dict, snap_volinfo,
+                                                 &priv->peers,
+                                                 GF_SNAP_OPTION_TYPE_DELETE);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to find missed snap deletes");
+                        goto out;
+                }
+        }
+
+        ret = glusterd_snap_remove (rsp_dict, snap, _gf_true, _gf_false);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to remove snap %s",
+                        snapname);
+                goto out;
+        }
+
+        /* snapname points into the request dict, so rsp_dict gets its own
+         * copy. */
+        dup_snapname = gf_strdup (snapname);
+        if (!dup_snapname) {
+                gf_log (this->name, GF_LOG_ERROR, "Strdup failed");
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_dynstr (rsp_dict, "snapname", dup_snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set the snapname");
+                GF_FREE (dup_snapname);
+                goto out;
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/*
+ * Removes the snapshot named by the "snapname" key of @dict, if it exists.
+ *
+ * A snapshot that cannot be found is not an error: the failure that led to
+ * the cleanup happened before the snap object was created, so there is
+ * nothing to undo and 0 is returned.
+ *
+ * @dict       request dictionary; must contain "snapname"
+ * @op_errstr  validated for non-NULL, not written by this function
+ * @rsp_dict   passed through to glusterd_snap_remove()
+ *
+ * Returns 0 on success (including "nothing to clean up"), non-zero on
+ * failure.
+ */
+int32_t
+glusterd_do_snap_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+        int32_t           ret  = -1;
+        char             *name = NULL;
+        xlator_t         *this = NULL;
+        glusterd_conf_t  *conf = NULL;
+        glusterd_snap_t  *snap = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        if (!dict || !op_errstr) {
+                gf_log (this->name, GF_LOG_ERROR, "input parameters NULL");
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "snapname", &name);
+        if (ret) {
+                /* No volinfo is available at this point, so the message
+                 * must not try to print a volume name.
+                 */
+                gf_log (this->name, GF_LOG_ERROR, "getting the snap "
+                        "name failed");
+                goto out;
+        }
+
+        /*
+          If the snapname is not found that means the failure happened at
+          staging, or in commit, before the snap object is created, in which
+          case there is nothing to cleanup. So set ret to 0.
+        */
+        snap = glusterd_find_snap_by_name (name);
+        if (!snap) {
+                gf_log (this->name, GF_LOG_INFO, "snap %s is not found", name);
+                ret = 0;
+                goto out;
+        }
+
+        ret = glusterd_snap_remove (rsp_dict, snap, _gf_true, _gf_true);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "removing the snap %s failed",
+                        name);
+                goto out;
+        }
+
+        ret = 0;
+
+out:
+
+        return ret;
+}
+
+/*
+ * Post-validate step run after a successful snapshot operation: when the
+ * request dictionary carries a "missed_snap_count", the missed snap entries
+ * are added to the in-memory list and the list is written to the store.
+ *
+ * A dictionary without "missed_snap_count" means there is nothing to record
+ * and is reported as success.
+ *
+ * Returns 0 on success, non-zero when adding or storing the entries fails.
+ */
+int32_t
+glusterd_snapshot_update_snaps_post_validate (dict_t *dict, char **op_errstr,
+                                              dict_t *rsp_dict)
+{
+        int32_t    ret       = -1;
+        int32_t    nr_missed = -1;
+        xlator_t  *this      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (op_errstr);
+
+        if (dict_get_int32 (dict, "missed_snap_count", &nr_missed) != 0) {
+                /* Key absent: no peer missed the operation. */
+                gf_log (this->name, GF_LOG_DEBUG, "No missed snaps");
+                ret = 0;
+        } else {
+                ret = glusterd_add_missed_snaps_to_list (dict, nr_missed);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to add missed snaps to list");
+                } else {
+                        ret = glusterd_store_update_missed_snaps ();
+                        if (ret != 0)
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to update missed_snaps_list");
+                }
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Commit handler for a snapshot create request.
+ *
+ * Reads "volcount" and "snapname" from @dict, stores a heap copy of the
+ * snapname in @rsp_dict, creates the snap object and then, for each
+ * "volname<i>" with i in 1..volcount, takes the snapshot of that volume via
+ * glusterd_do_snap_vol().  On success the snap is marked
+ * GD_SNAP_STATUS_IN_USE and written to the store.
+ *
+ * If anything fails after the snap object exists, the snap is removed again
+ * with glusterd_snap_remove() before returning.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_snapshot_create_commit (dict_t *dict, char **op_errstr,
+                                 dict_t *rsp_dict)
+{
+        int                  ret           = -1;
+        int64_t              i             = 0;
+        int64_t              volcount      = 0;
+        char                *snapname      = NULL;
+        char                *volname       = NULL;
+        char                *tmp_name      = NULL;
+        char                 key[PATH_MAX] = "";
+        xlator_t            *this          = NULL;
+        glusterd_snap_t     *snap          = NULL;
+        glusterd_volinfo_t  *origin_vol    = NULL;
+        glusterd_volinfo_t  *snap_vol      = NULL;
+        glusterd_conf_t     *priv          = NULL;
+
+        this = THIS;
+        GF_ASSERT(this);
+        GF_ASSERT(dict);
+        GF_ASSERT(op_errstr);
+        GF_ASSERT(rsp_dict);
+        priv = this->private;
+        GF_ASSERT(priv);
+
+        ret = dict_get_int64 (dict, "volcount", &volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to "
+                        "get the volume count");
+                goto out;
+        }
+
+        ret = dict_get_str (dict, "snapname", &snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snapname");
+                goto out;
+        }
+        tmp_name = gf_strdup (snapname);
+        if (!tmp_name) {
+                gf_log (this->name, GF_LOG_ERROR, "Out of memory");
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_dynstr (rsp_dict, "snapname", tmp_name);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set snapname in rsp_dict");
+                GF_FREE (tmp_name);
+                goto out;
+        }
+        tmp_name = NULL;
+
+        snap = glusterd_create_snap_object (dict, rsp_dict);
+        if (!snap) {
+                gf_log (this->name, GF_LOG_ERROR, "creating the "
+                        "snap object %s failed", snapname);
+                ret = -1;
+                goto out;
+        }
+
+        /* Volume keys in the request are numbered from 1. */
+        for (i = 1; i <= volcount; i++) {
+                snprintf (key, sizeof (key), "volname%"PRId64, i);
+                ret = dict_get_str (dict, key, &volname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to get volume name");
+                        goto out;
+                }
+
+                ret = glusterd_volinfo_find (volname, &origin_vol);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to get the volinfo for "
+                                "the volume %s", volname);
+                        goto out;
+                }
+
+                /* TODO: Create a stub where the bricks are
+                   added parallely by worker threads so that
+                   the snap creating happens parallely. */
+                snap_vol = glusterd_do_snap_vol (origin_vol, snap, dict,
+                                                 rsp_dict, i);
+                if (!snap_vol) {
+                        ret = -1;
+                        gf_log (this->name, GF_LOG_WARNING, "taking the "
+                                "snapshot of the volume %s failed", volname);
+                        goto out;
+                }
+        }
+
+        snap->snap_status = GD_SNAP_STATUS_IN_USE;
+        ret = glusterd_store_snap (snap);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "Could not store snap "
+                        "object %s", snap->snapname);
+                goto out;
+        }
+
+        ret = 0;
+
+out:
+        if (ret) {
+                /* Undo a partially created snapshot; the result of the
+                 * removal itself is not reported to the caller.
+                 */
+                if (snap)
+                        glusterd_snap_remove (rsp_dict, snap,
+                                              _gf_true, _gf_true);
+                snap = NULL;
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Applies and persists a new snap-max-hard-limit.
+ *
+ * @dict       request dictionary (only asserted here)
+ * @value      new hard limit
+ * @volname    volume to apply the limit to, or NULL to set the system-wide
+ *             limit kept in the glusterd configuration
+ * @op_errstr  on failure receives a heap copy of the error message
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+snap_max_hard_limit_set_commit (dict_t *dict, uint64_t value,
+                                char *volname, char **op_errstr)
+{
+        char                 err_str[PATH_MAX] = "";
+        glusterd_conf_t     *conf              = NULL;
+        glusterd_volinfo_t  *volinfo           = NULL;
+        int                  ret               = -1;
+        xlator_t            *this              = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        /* volname is deliberately not asserted: NULL selects the
+         * system-wide limit below.
+         */
+        GF_ASSERT (op_errstr);
+
+        conf = this->private;
+
+        GF_ASSERT (conf);
+
+        /* TODO: Initiate auto deletion when there is a limit change */
+        if (!volname) {
+                /* For system limit */
+                conf->snap_max_hard_limit = value;
+
+                ret = glusterd_store_global_info (this);
+                if (ret) {
+                        snprintf (err_str, PATH_MAX, "Failed to store "
+                                  "snap-max-hard-limit for system");
+                        goto out;
+                }
+        } else {
+                /* For one volume */
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (ret) {
+                        snprintf (err_str, PATH_MAX, "Failed to get the"
+                                  " volinfo for volume %s", volname);
+                        goto out;
+                }
+
+                volinfo->snap_max_hard_limit = value;
+
+                ret = glusterd_store_volinfo (volinfo,
+                                        GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+                if (ret) {
+                        snprintf (err_str, PATH_MAX, "Failed to store "
+                                  "snap-max-hard-limit for volume %s", volname);
+                        goto out;
+                }
+        }
+
+        ret = 0;
+out:
+        if (ret) {
+                *op_errstr = gf_strdup (err_str);
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+        }
+        return ret;
+}
+
+/*
+ * Writes the snapshot limits of one volume into @rsp_dict under the keys
+ * "volume<index>-volname", "-snap-max-hard-limit", "-active-hard-limit" and
+ * "-snap-max-soft-limit".  The active hard limit is the smaller of the
+ * volume limit and the system limit; the soft limit value is the configured
+ * soft-limit percentage of the active hard limit.
+ *
+ * On failure the message is written to @err_str (at most @err_len bytes) and
+ * the failing call's return value is returned.
+ */
+static int
+snap_max_limits_add_volume (dict_t *rsp_dict, glusterd_conf_t *conf,
+                            glusterd_volinfo_t *volinfo, uint64_t index,
+                            char *err_str, size_t err_len)
+{
+        char      buf[PATH_MAX]     = "";
+        int       ret               = -1;
+        uint64_t  snap_max_limit    = 0;
+        uint64_t  active_hard_limit = 0;
+        uint64_t  soft_limit_value  = 0;
+
+        snap_max_limit = volinfo->snap_max_hard_limit;
+        if (snap_max_limit > conf->snap_max_hard_limit)
+                active_hard_limit = conf->snap_max_hard_limit;
+        else
+                active_hard_limit = snap_max_limit;
+
+        soft_limit_value = (active_hard_limit *
+                            conf->snap_max_soft_limit) / 100;
+
+        snprintf (buf, sizeof (buf), "volume%"PRIu64"-volname", index);
+        ret = dict_set_str (rsp_dict, buf, volinfo->volname);
+        if (ret)
+                goto fail;
+
+        snprintf (buf, sizeof (buf),
+                  "volume%"PRIu64"-snap-max-hard-limit", index);
+        ret = dict_set_uint64 (rsp_dict, buf, snap_max_limit);
+        if (ret)
+                goto fail;
+
+        snprintf (buf, sizeof (buf),
+                  "volume%"PRIu64"-active-hard-limit", index);
+        ret = dict_set_uint64 (rsp_dict, buf, active_hard_limit);
+        if (ret)
+                goto fail;
+
+        snprintf (buf, sizeof (buf),
+                  "volume%"PRIu64"-snap-max-soft-limit", index);
+        ret = dict_set_uint64 (rsp_dict, buf, soft_limit_value);
+        if (ret)
+                goto fail;
+
+        return 0;
+
+fail:
+        /* buf still holds the key whose store failed. */
+        snprintf (err_str, err_len, "Failed to set %s", buf);
+        return ret;
+}
+
+/*
+ * Fills @rsp_dict with the snapshot limits to display.
+ *
+ * @rsp_dict   response dictionary to populate
+ * @volname    volume to report, or NULL to report every volume in the
+ *             configuration that is not itself a snapshot volume
+ * @op_errstr  on failure receives a heap copy of the error message
+ *
+ * Besides the per-volume keys, "voldisplaycount" (number of volumes
+ * reported) and the system-wide "snap-max-hard-limit" and
+ * "snap-max-soft-limit" are set.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+snap_max_limits_display_commit (dict_t *rsp_dict, char *volname,
+                                char **op_errstr)
+{
+        char                 err_str[PATH_MAX] = "";
+        glusterd_conf_t     *conf              = NULL;
+        glusterd_volinfo_t  *volinfo           = NULL;
+        int                  ret               = -1;
+        uint64_t             count             = 0;
+        xlator_t            *this              = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (rsp_dict);
+        /* volname is deliberately not asserted: NULL selects the
+         * all-volumes display below.
+         */
+        GF_ASSERT (op_errstr);
+
+        conf = this->private;
+
+        GF_ASSERT (conf);
+
+        if (!volname) {
+                /* For system limit */
+                list_for_each_entry (volinfo, &conf->volumes, vol_list) {
+                        if (volinfo->is_snap_volume == _gf_true)
+                                continue;
+
+                        ret = snap_max_limits_add_volume (rsp_dict, conf,
+                                                          volinfo, count,
+                                                          err_str,
+                                                          sizeof (err_str));
+                        if (ret)
+                                goto out;
+                        count++;
+                }
+        } else {
+                /* For one volume */
+                ret = glusterd_volinfo_find (volname, &volinfo);
+                if (ret) {
+                        snprintf (err_str, PATH_MAX, "Failed to get the"
+                                  " volinfo for volume %s", volname);
+                        goto out;
+                }
+
+                ret = snap_max_limits_add_volume (rsp_dict, conf, volinfo,
+                                                  count, err_str,
+                                                  sizeof (err_str));
+                if (ret)
+                        goto out;
+                count++;
+        }
+
+        ret = dict_set_uint64 (rsp_dict, "voldisplaycount", count);
+        if (ret) {
+                snprintf (err_str, PATH_MAX,
+                          "Failed to set voldisplaycount");
+                goto out;
+        }
+
+        ret = dict_set_uint64 (rsp_dict, "snap-max-hard-limit",
+                               conf->snap_max_hard_limit);
+        if (ret) {
+                snprintf (err_str, PATH_MAX,
+                          "Failed to set sys-snap-max-hard-limit ");
+                goto out;
+        }
+
+        ret = dict_set_uint64 (rsp_dict, "snap-max-soft-limit",
+                               conf->snap_max_soft_limit);
+        if (ret) {
+                snprintf (err_str, PATH_MAX,
+                          "Failed to set sys-snap-max-soft-limit ");
+                goto out;
+        }
+
+        ret = 0;
+out:
+        if (ret) {
+                *op_errstr = gf_strdup (err_str);
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+        }
+        return ret;
+}
+
+/*
+ * Commit handler for a snapshot config request.
+ *
+ * The "config-command" key of @dict selects the action:
+ *  - GF_SNAP_CONFIG_TYPE_SET: a non-zero "snap-max-hard-limit" is applied
+ *    through snap_max_hard_limit_set_commit() (per volume when "volname" is
+ *    present, system-wide otherwise); a non-zero "snap-max-soft-limit" is
+ *    stored in the glusterd configuration.
+ *  - GF_SNAP_CONFIG_DISPLAY: on the originating glusterd only, the limits
+ *    are written to @rsp_dict.
+ * Any other command is a no-op.
+ *
+ * "volname", "snap-max-hard-limit" and "snap-max-soft-limit" are optional;
+ * their absence is not an error.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_snapshot_config_commit (dict_t *dict, char **op_errstr,
+                                 dict_t *rsp_dict)
+{
+        char             *volname           = NULL;
+        xlator_t         *this              = NULL;
+        int               ret               = -1;
+        char              err_str[PATH_MAX] = {0,};
+        glusterd_conf_t  *conf              = NULL;
+        int               config_command    = 0;
+        uint64_t          hard_limit        = 0;
+        uint64_t          soft_limit        = 0;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+
+        conf = this->private;
+
+        GF_ASSERT (conf);
+
+        ret = dict_get_int32 (dict, "config-command", &config_command);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to get config-command type");
+                goto out;
+        }
+
+        /* The following keys are optional.  Their lookup results are
+         * discarded on purpose so that a missing key cannot leak into the
+         * value returned by this function.
+         */
+        (void) dict_get_str (dict, "volname", &volname);
+        (void) dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit);
+        (void) dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit);
+        ret = 0;
+
+        switch (config_command) {
+        case GF_SNAP_CONFIG_TYPE_SET:
+                if (hard_limit) {
+                        /* Commit ops for snap-max-hard-limit */
+                        ret = snap_max_hard_limit_set_commit (dict, hard_limit,
+                                                              volname,
+                                                              op_errstr);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "snap-max-hard-limit set "
+                                        "commit failed.");
+                                goto out;
+                        }
+                }
+
+                if (soft_limit) {
+                        /* For system limit */
+                        conf->snap_max_soft_limit = soft_limit;
+
+                        ret = glusterd_store_global_info (this);
+                        if (ret) {
+                                snprintf (err_str, PATH_MAX, "Failed to store "
+                                          "snap-max-soft-limit for system");
+                                *op_errstr = gf_strdup (err_str);
+                                gf_log (this->name, GF_LOG_ERROR, "%s",
+                                        err_str);
+                                goto out;
+                        }
+                }
+                break;
+
+        case GF_SNAP_CONFIG_DISPLAY:
+                /* Reading data from local node only */
+                if (!is_origin_glusterd (dict)) {
+                        ret = 0;
+                        break;
+                }
+
+                ret = snap_max_limits_display_commit (rsp_dict, volname,
+                                                      op_errstr);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "snap-max-limit "
+                                "display commit failed.");
+                        goto out;
+                }
+                break;
+        default:
+                break;
+        }
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Stores a heap copy of @field in @rsp_dict under "<key_prefix>.<suffix>".
+ * @errmsg is logged when the dictionary store fails.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int
+glusterd_lvm_save_field (dict_t *rsp_dict, char *key_prefix,
+                         const char *suffix, const char *field,
+                         const char *errmsg)
+{
+        int        ret           = -1;
+        char       key[PATH_MAX] = "";
+        char      *value         = NULL;
+        xlator_t  *this          = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        value = gf_strdup (field);
+        if (!value)
+                return -1;
+
+        ret = snprintf (key, sizeof (key), "%s.%s", key_prefix, suffix);
+        if (ret < 0) {
+                GF_FREE (value);
+                return ret;
+        }
+
+        ret = dict_set_dynstr (rsp_dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "%s", errmsg);
+                /* NOTE(review): freeing here mirrors the pre-existing
+                 * cleanup; confirm that dict_set_dynstr() does not release
+                 * the string itself on failure.
+                 */
+                GF_FREE (value);
+        }
+
+        return ret;
+}
+
+/*
+ * Runs "lvs" on the device backing @brickinfo and stores the reported
+ * volume group name, data percentage and logical volume size in @rsp_dict
+ * under "<key_prefix>.vgname", "<key_prefix>.data" and
+ * "<key_prefix>.lvsize".
+ *
+ * @rsp_dict    dictionary that receives the values
+ * @brickinfo   brick whose device is queried
+ * @volname     only asserted here
+ * @device      incoming value is ignored; the device is looked up with
+ *              glusterd_get_brick_mount_details()
+ * @key_prefix  prefix for the dictionary keys
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_get_brick_lvm_details (dict_t *rsp_dict,
+                                glusterd_brickinfo_t *brickinfo, char *volname,
+                                char *device, char *key_prefix)
+{
+
+        int               ret            = -1;
+        glusterd_conf_t  *priv           = NULL;
+        runner_t          runner         = {0,};
+        xlator_t         *this           = NULL;
+        char              msg[PATH_MAX]  = "";
+        char              buf[PATH_MAX]  = "";
+        char             *token          = NULL;
+        char             *saveptr        = NULL;
+
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (brickinfo);
+        GF_ASSERT (volname);
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        device = glusterd_get_brick_mount_details (brickinfo);
+        if (!device) {
+                gf_log (this->name, GF_LOG_ERROR, "Getting device name for "
+                        "the brick %s:%s failed", brickinfo->hostname,
+                        brickinfo->path);
+                goto out;
+        }
+        runinit (&runner);
+        snprintf (msg, sizeof (msg), "running lvs command, "
+                  "for getting snap status");
+        /* Using lvs command fetch the Volume Group name,
+         * Percentage of data filled and Logical Volume size
+         *
+         * "-o" argument is used to get the desired information,
+         * example : "lvs /dev/VolGroup/thin_vol -o vgname,lv_size",
+         * will get us Volume Group name and Logical Volume size.
+         *
+         * Here separator used is ":",
+         * for the above given command with separator ":",
+         * The output will be "vgname:lvsize"
+         */
+        runner_add_args (&runner, LVS, device, "--noheading", "-o",
+                         "vg_name,data_percent,lv_size",
+                         "--separator", ":", NULL);
+        runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+        runner_log (&runner, "", GF_LOG_DEBUG, msg);
+        ret = runner_start (&runner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not perform lvs action");
+                goto end;
+        }
+
+        /* Each output line is "vg_name:data_percent:lv_size".  strtok_r()
+         * keeps the tokenizer state local to this call.
+         */
+        while (fgets (buf, sizeof (buf),
+                      runner_chio (&runner, STDOUT_FILENO)) != NULL) {
+                token = strtok_r (buf, ":", &saveptr);
+                if (token != NULL) {
+                        /* lvs pads the first column with leading blanks. */
+                        while (token[0] == ' ')
+                                token++;
+                        if (token[0] == '\0') {
+                                ret = -1;
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Invalid vg entry");
+                                goto end;
+                        }
+                        ret = glusterd_lvm_save_field (rsp_dict, key_prefix,
+                                                "vgname", token,
+                                                "Could not save vgname ");
+                        if (ret)
+                                goto end;
+                }
+
+                token = strtok_r (NULL, ":", &saveptr);
+                if (token != NULL) {
+                        ret = glusterd_lvm_save_field (rsp_dict, key_prefix,
+                                                "data", token,
+                                                "Could not save data percent ");
+                        if (ret)
+                                goto end;
+                }
+
+                token = strtok_r (NULL, ":", &saveptr);
+                if (token != NULL) {
+                        ret = glusterd_lvm_save_field (rsp_dict, key_prefix,
+                                                "lvsize", token,
+                                                "Could not save lv size ");
+                        if (ret)
+                                goto end;
+                }
+        }
+
+        ret = 0;
+
+end:
+        runner_end (&runner);
+
+out:
+        return ret;
+}
+
+/*
+ * Stores the status of one snapshot brick in @rsp_dict.
+ *
+ * Keys written, all prefixed with "<keyprefix>.brick<index>":
+ *   ".path"    "<hostname>:<path>" of the brick
+ *   ".vgname"  "Pending Snapshot" when brickinfo->snap_status is -1; in
+ *              that case nothing else is stored
+ *   ".status"  "No" when the brick is GF_BRICK_STOPPED, "Yes" otherwise
+ *   ".pid"     pid read from the brick pidfile (only for a "Yes" brick)
+ * followed by the LVM details added by glusterd_get_brick_lvm_details().
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_get_single_brick_status (char **op_errstr, dict_t *rsp_dict,
+                                  char *keyprefix, int index,
+                                  glusterd_volinfo_t *snap_volinfo,
+                                  glusterd_brickinfo_t *brickinfo)
+{
+        int               ret                  = -1;
+        xlator_t         *this                 = NULL;
+        glusterd_conf_t  *priv                 = NULL;
+        char              key[PATH_MAX]        = "";
+        char             *device               = NULL;
+        char             *value                = NULL;
+        char              brick_path[PATH_MAX] = "";
+        char              pidfile[PATH_MAX]    = "";
+        pid_t             pid                  = -1;
+        int               brick_started        = 0;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (keyprefix);
+        GF_ASSERT (snap_volinfo);
+        GF_ASSERT (brickinfo);
+
+        ret = snprintf (key, sizeof (key), "%s.brick%d.path", keyprefix,
+                        index);
+        if (ret < 0) {
+                goto out;
+        }
+
+        ret = snprintf (brick_path, sizeof (brick_path),
+                        "%s:%s", brickinfo->hostname, brickinfo->path);
+        if (ret < 0) {
+                goto out;
+        }
+
+        value = gf_strdup (brick_path);
+        if (!value) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_set_dynstr (rsp_dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to store "
+                        "brick_path %s", brickinfo->path);
+                goto out;
+        }
+        /* The dictionary owns the path string from here on. */
+        value = NULL;
+
+        if (brickinfo->snap_status == -1) {
+                /* Setting vgname as "Pending Snapshot" */
+                value = gf_strdup ("Pending Snapshot");
+                if (!value) {
+                        ret = -1;
+                        goto out;
+                }
+
+                snprintf (key, sizeof (key), "%s.brick%d.vgname",
+                          keyprefix, index);
+                ret = dict_set_dynstr (rsp_dict, key, value);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Could not save vgname ");
+                        goto out;
+                }
+
+                ret = 0;
+                goto out;
+        }
+
+        ret = snprintf (key, sizeof (key), "%s.brick%d.status",
+                        keyprefix, index);
+        if (ret < 0) {
+                goto out;
+        }
+
+        brick_started = (brickinfo->status != GF_BRICK_STOPPED);
+
+        /* The status text is a heap copy, so it is handed over with
+         * dict_set_dynstr() like the other allocated strings here.
+         */
+        value = gf_strdup (brick_started ? "Yes" : "No");
+        if (!value) {
+                ret = -1;
+                goto out;
+        }
+        ret = dict_set_dynstr (rsp_dict, key, value);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not save brick status");
+                goto out;
+        }
+        value = NULL;
+
+        if (brick_started) {
+                GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_volinfo,
+                                            brickinfo, priv);
+                /* Only the pid is of interest; the running/not-running
+                 * result is not used.
+                 */
+                (void) gf_is_service_running (pidfile, &pid);
+
+                ret = snprintf (key, sizeof (key), "%s.brick%d.pid",
+                                keyprefix, index);
+                if (ret < 0) {
+                        goto out;
+                }
+
+                ret = dict_set_int32 (rsp_dict, key, pid);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Could not save pid %d", pid);
+                        goto out;
+                }
+        }
+
+        ret = snprintf (key, sizeof (key), "%s.brick%d",
+                        keyprefix, index);
+        if (ret < 0) {
+                goto out;
+        }
+
+        ret = glusterd_get_brick_lvm_details (rsp_dict, brickinfo,
+                                              snap_volinfo->volname,
+                                              device, key);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+                        "brick LVM details");
+                goto out;
+        }
+out:
+        if (ret && value) {
+                GF_FREE (value);
+        }
+
+        return ret;
+}
+
+/*
+ * Collects the brick status of every volume attached to @snap.
+ *
+ * For volume number v (counted from 0) the local bricks are reported through
+ * glusterd_get_single_brick_status() under the prefix "<keyprefix>.vol<v>",
+ * and "<keyprefix>.vol<v>.brickcount" is set afterwards.  Bricks that are not
+ * local are skipped but still consume a brick index.  Finally
+ * "<keyprefix>.volcount" is set to the number of volumes visited.
+ *
+ * NOTE(review): the brick counter keeps running across volumes instead of
+ * restarting at 0 for each one - confirm that this is intended once a
+ * snapshot can hold more than one volume.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_get_single_snap_status (char **op_errstr, dict_t *rsp_dict,
+                                 char *keyprefix, glusterd_snap_t *snap)
+{
+        int                    ret                = -1;
+        int                    nr_vols            = 0;
+        int                    nr_bricks          = 0;
+        char                   volkey[PATH_MAX]   = "";
+        char                   countkey[PATH_MAX] = "";
+        xlator_t              *this               = NULL;
+        glusterd_volinfo_t    *vol                = NULL;
+        glusterd_volinfo_t    *next_vol           = NULL;
+        glusterd_brickinfo_t  *brickinfo          = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (keyprefix);
+        GF_ASSERT (snap);
+
+        list_for_each_entry_safe (vol, next_vol, &snap->volumes, vol_list) {
+                ret = snprintf (volkey, sizeof (volkey), "%s.vol%d",
+                                keyprefix, nr_vols);
+                if (ret < 0)
+                        return ret;
+
+                list_for_each_entry (brickinfo, &vol->bricks, brick_list) {
+                        /* Only bricks hosted on this node are reported. */
+                        if (glusterd_is_local_brick (this, vol, brickinfo)) {
+                                ret = glusterd_get_single_brick_status (
+                                                op_errstr, rsp_dict, volkey,
+                                                nr_bricks, vol, brickinfo);
+                                if (ret) {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "Getting "
+                                                "single snap status failed");
+                                        return ret;
+                                }
+                        }
+                        nr_bricks++;
+                }
+
+                ret = snprintf (countkey, sizeof (countkey), "%s.brickcount",
+                                volkey);
+                if (ret < 0)
+                        return ret;
+
+                ret = dict_set_int32 (rsp_dict, countkey, nr_bricks);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Could not save brick count");
+                        return ret;
+                }
+                nr_vols++;
+        }
+
+        ret = snprintf (countkey, sizeof (countkey), "%s.volcount", keyprefix);
+        if (ret < 0)
+                return ret;
+
+        ret = dict_set_int32 (rsp_dict, countkey, nr_vols);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not save volcount");
+
+        return ret;
+}
+
+/*
+ * Writes the status of one snap object into @rsp_dict.
+ *
+ * Keys written under @keyprefix: ".snapname" (copy of the snap name),
+ * ".uuid" (textual snap id), the per-volume/brick details produced by
+ * glusterd_get_single_snap_status(), and finally ".volcount", which is set
+ * to 1.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_get_each_snap_object_status (char **op_errstr, dict_t *rsp_dict,
+                                      glusterd_snap_t *snap, char *keyprefix)
+{
+        int        ret               = -1;
+        char       dictkey[PATH_MAX] = "";
+        char      *copy              = NULL;
+        xlator_t  *this              = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (snap);
+        GF_ASSERT (keyprefix);
+
+        /* TODO : Get all the snap volume info present in snap object,
+         * as of now, There will be only one snapvolinfo per snap object
+         */
+
+        /* Snap name. */
+        ret = snprintf (dictkey, sizeof (dictkey), "%s.snapname", keyprefix);
+        if (ret < 0)
+                return ret;
+
+        copy = gf_strdup (snap->snapname);
+        if (copy == NULL)
+                return -1;
+
+        ret = dict_set_dynstr (rsp_dict, dictkey, copy);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Could not save "
+                        "snap name");
+                GF_FREE (copy);
+                return ret;
+        }
+
+        /* Snap UUID. */
+        ret = snprintf (dictkey, sizeof (dictkey), "%s.uuid", keyprefix);
+        if (ret < 0)
+                return ret;
+
+        copy = gf_strdup (uuid_utoa (snap->snap_id));
+        if (copy == NULL)
+                return -1;
+
+        ret = dict_set_dynstr (rsp_dict, dictkey, copy);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Could not save "
+                        "snap UUID");
+                GF_FREE (copy);
+                return ret;
+        }
+
+        /* Volume and brick details. */
+        ret = glusterd_get_single_snap_status (op_errstr, rsp_dict, keyprefix,
+                                               snap);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not get single snap status");
+                return ret;
+        }
+
+        ret = snprintf (dictkey, sizeof (dictkey), "%s.volcount", keyprefix);
+        if (ret < 0)
+                return ret;
+
+        ret = dict_set_int32 (rsp_dict, dictkey, 1);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR, "Could not save volcount");
+
+        return ret;
+}
+
+/*
+ * Reports the status of every snapshot taken of volume @volname.
+ *
+ * The n-th snapshot found on the volume's snap_volumes list is written to
+ * @rsp_dict under the prefix "status.snap<n>", and "status.snapcount"
+ * receives the number of snapshots reported.  @keyprefix is only asserted.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_get_snap_status_of_volume (char **op_errstr, dict_t *rsp_dict,
+                                    char *volname, char *keyprefix) {
+        int                  ret               = -1;
+        int                  nr_snaps          = 0;
+        char                 snapkey[PATH_MAX] = "";
+        xlator_t            *this              = NULL;
+        glusterd_conf_t     *priv              = NULL;
+        glusterd_volinfo_t  *volinfo           = NULL;
+        glusterd_volinfo_t  *snap_vol          = NULL;
+        glusterd_volinfo_t  *next_snap_vol     = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        GF_ASSERT (op_errstr);
+        GF_ASSERT (rsp_dict);
+        GF_ASSERT (volname);
+        GF_ASSERT (keyprefix);
+
+        ret = glusterd_volinfo_find (volname, &volinfo);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get volinfo of "
+                        "volume %s", volname);
+                return ret;
+        }
+
+        list_for_each_entry_safe (snap_vol, next_snap_vol,
+                                  &volinfo->snap_volumes, snapvol_list) {
+                ret = snprintf (snapkey, sizeof (snapkey), "status.snap%d",
+                                nr_snaps);
+                if (ret < 0)
+                        return ret;
+
+                ret = glusterd_get_each_snap_object_status (op_errstr,
+                                        rsp_dict, snap_vol->snapshot, snapkey);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Function : "
+                                "glusterd_get_single_snap_status failed");
+                        return ret;
+                }
+                nr_snaps++;
+        }
+
+        ret = dict_set_int32 (rsp_dict, "status.snapcount", nr_snaps);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to save snapcount");
+                return -1;
+        }
+
+        return ret;
+}
+
+/*
+ * Reports the status of every snapshot on the configuration's snapshot
+ * list.
+ *
+ * The n-th snapshot is written to @rsp_dict under the prefix
+ * "status.snap<n>", and "status.snapcount" receives the number of snapshots
+ * reported.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_get_all_snapshot_status (dict_t *dict, char **op_errstr,
+                              dict_t *rsp_dict)
+{
+        int32_t           nr_snaps          = 0;
+        int               ret               = -1;
+        char              snapkey[PATH_MAX] = "";
+        glusterd_conf_t  *priv              = NULL;
+        glusterd_snap_t  *cur               = NULL;
+        glusterd_snap_t  *next              = NULL;
+        xlator_t         *this              = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+
+        list_for_each_entry_safe (cur, next, &priv->snapshots, snap_list) {
+                ret = snprintf (snapkey, sizeof (snapkey), "status.snap%d",
+                                nr_snaps);
+                if (ret < 0)
+                        return ret;
+
+                ret = glusterd_get_each_snap_object_status (op_errstr,
+                                                    rsp_dict, cur, snapkey);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Could not get "
+                                "the details of a snap object: %s",
+                                cur->snapname);
+                        return ret;
+                }
+                nr_snaps++;
+        }
+
+        ret = dict_set_int32 (rsp_dict, "status.snapcount", nr_snaps);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Could not save snapcount");
+                return ret;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Commit handler for a snapshot status request.
+ *
+ * The "cmd" key of @dict is copied to @rsp_dict and selects what is
+ * reported:
+ *  - GF_SNAP_STATUS_TYPE_ALL:  every snapshot
+ *  - GF_SNAP_STATUS_TYPE_SNAP: the snapshot named by "snapname", written
+ *    under the prefix "status.snap0"; @op_errstr is set when the snapshot
+ *    does not exist
+ *  - GF_SNAP_STATUS_TYPE_VOL:  every snapshot of the volume named by
+ *    "volname"
+ * Any other value reports nothing and succeeds.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_snapshot_status_commit (dict_t *dict, char **op_errstr,
+                                 dict_t *rsp_dict)
+{
+        xlator_t         *this     = NULL;
+        int               ret      = -1;
+        glusterd_conf_t  *conf     = NULL;
+        int32_t           cmd      = -1;
+        char             *snapname = NULL;
+        glusterd_snap_t  *snap     = NULL;
+        char             *volname  = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (op_errstr);
+
+        conf = this->private;
+
+        GF_ASSERT (conf);
+        ret = dict_get_int32 (dict, "cmd", &cmd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get status cmd type");
+                goto out;
+        }
+
+        ret = dict_set_int32 (rsp_dict, "cmd", cmd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Could not save status cmd in rsp dictionary");
+                goto out;
+        }
+        switch (cmd) {
+        case GF_SNAP_STATUS_TYPE_ALL:
+        {
+                ret = glusterd_get_all_snapshot_status (dict, op_errstr,
+                                                        rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Unable to "
+                                "get snapshot status");
+                        goto out;
+                }
+                break;
+        }
+        case GF_SNAP_STATUS_TYPE_SNAP:
+        {
+
+                ret = dict_get_str (dict, "snapname", &snapname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Unable to "
+                                "get snap name");
+                        goto out;
+                }
+
+                snap = glusterd_find_snap_by_name (snapname);
+                if (!snap) {
+                        ret = gf_asprintf (op_errstr, "Snap (%s) "
+                                           "not found", snapname);
+                        if (ret < 0) {
+                                goto out;
+                        }
+                        ret = -1;
+                        gf_log (this->name, GF_LOG_ERROR, "Unable to "
+                                "get snap volinfo");
+                        goto out;
+                }
+                ret = glusterd_get_each_snap_object_status (op_errstr,
+                                          rsp_dict, snap, "status.snap0");
+                if (ret) {
+                        /* Report the snapshot that was actually queried. */
+                        gf_log (this->name, GF_LOG_ERROR, "Unable to "
+                                "get status of snap %s", snapname);
+                        goto out;
+                }
+                break;
+        }
+        case GF_SNAP_STATUS_TYPE_VOL:
+        {
+                ret = dict_get_str (dict, "volname", &volname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Unable to"
+                                " get volume name");
+                        goto out;
+                }
+
+                ret = glusterd_get_snap_status_of_volume (op_errstr,
+                                       rsp_dict, volname, "status.vol0");
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Function :"
+                                " glusterd_get_snap_status_of_volume "
+                                "failed");
+                        goto out;
+                }
+                break;
+        }
+        default:
+                /* Unknown status type: nothing to report. */
+                break;
+        }
+        ret = 0;
+out:
+        return ret;
+}
+
+/*
+ * Post-validate step of snapshot create.
+ *
+ * A non-zero @op_ret means the create failed, so whatever was created is
+ * removed with glusterd_do_snap_cleanup().  Otherwise the missed snap
+ * bookkeeping is updated through
+ * glusterd_snapshot_update_snaps_post_validate().
+ *
+ * Returns 0 on success, non-zero when the chosen step fails.
+ */
+int32_t
+glusterd_snapshot_create_postvalidate (dict_t *dict, int32_t op_ret,
+                                       char **op_errstr, dict_t *rsp_dict)
+{
+        int               ret  = -1;
+        xlator_t         *this = NULL;
+        glusterd_conf_t  *priv = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        if (!op_ret) {
+                /* Create succeeded: record any missed snaps. */
+                ret = glusterd_snapshot_update_snaps_post_validate (dict,
+                                                                op_errstr,
+                                                                rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "create snapshot");
+                        return ret;
+                }
+                return 0;
+        }
+
+        /* Create failed: undo it. */
+        ret = glusterd_do_snap_cleanup (dict, op_errstr, rsp_dict);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "cleanup operation "
+                        "failed");
+                return ret;
+        }
+
+        return 0;
+}
+
+/*
+ * Commit entry point for snapshot operations: reads the "type" key from
+ * @dict and dispatches to the matching commit handler (create, config,
+ * delete, restore or status).
+ *
+ * An unrecognised type is logged as a warning; the function then returns
+ * the result of the successful "type" lookup, i.e. 0.
+ *
+ * Returns 0 on success, or the non-zero result of the failing handler.
+ */
+int32_t
+glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+
+        xlator_t         *this         = NULL;
+        glusterd_conf_t  *priv         = NULL;
+        int32_t           snap_command = 0;
+        int               ret          = -1;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        ret = dict_get_int32 (dict, "type", &snap_command);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+                        "the snapshot command");
+                goto out;
+        }
+
+        switch (snap_command) {
+        case (GF_SNAP_OPTION_TYPE_CREATE):
+                ret = glusterd_snapshot_create_commit (dict, op_errstr,
+                                                       rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "create snapshot");
+                        goto out;
+                }
+                break;
+
+        case GF_SNAP_OPTION_TYPE_CONFIG:
+                ret = glusterd_snapshot_config_commit (dict, op_errstr,
+                                                       rsp_dict);
+                /* Without this check the "ret = 0" after the switch would
+                 * hide a failed config commit from the caller.
+                 */
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "commit snapshot config");
+                        goto out;
+                }
+                break;
+
+        case GF_SNAP_OPTION_TYPE_DELETE:
+                ret = glusterd_snapshot_remove_commit (dict, op_errstr,
+                                                       rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "delete snapshot");
+                        goto out;
+                }
+                break;
+
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+                ret = glusterd_snapshot_restore (dict, op_errstr,
+                                                 rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Failed to "
+                                "restore snapshot");
+                        goto out;
+                }
+
+                break;
+
+        case GF_SNAP_OPTION_TYPE_STATUS:
+                ret = glusterd_snapshot_status_commit (dict, op_errstr,
+                                                       rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                "show snapshot status");
+                        goto out;
+                }
+                break;
+
+
+        default:
+                gf_log (this->name, GF_LOG_WARNING, "invalid snap command");
+                goto out;
+        }
+
+        ret = 0;
+
+out:
+        return ret;
+}
+
+/*
+ * Brick-op phase of a snapshot operation.
+ *
+ * Only snapshot create does any work here: for every "volname<i>" with i in
+ * 1..volcount, the name is published under the plain "volname" key of @dict
+ * and gd_brick_op_phase() is run for GD_OP_SNAP.  The temporary "volname"
+ * key is deleted once all volumes have been processed.  Every other command
+ * type returns right after the "type" lookup.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_snapshot_brickop (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+        int        ret            = -1;
+        int64_t    nr_vols        = 0;
+        int64_t    idx            = 0;
+        char       volkey[1024]   = {0,};
+        char      *volname        = NULL;
+        int32_t    snap_command   = 0;
+        xlator_t  *this           = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+
+        ret = dict_get_int32 (dict, "type", &snap_command);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+                        "the snapshot command");
+                return ret;
+        }
+
+        /* Nothing to do on the bricks for anything but create. */
+        if (snap_command != GF_SNAP_OPTION_TYPE_CREATE)
+                return ret;
+
+        ret = dict_get_int64 (dict, "volcount", &nr_vols);
+        if (ret)
+                return ret;
+
+        for (idx = 1; idx <= nr_vols; idx++) {
+                snprintf (volkey, sizeof (volkey), "volname%"PRId64, idx);
+                ret = dict_get_str (dict, volkey, &volname);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to get volname");
+                        return ret;
+                }
+
+                ret = dict_set_str (dict, "volname", volname);
+                if (ret)
+                        return ret;
+
+                ret = gd_brick_op_phase (GD_OP_SNAP, NULL, dict,
+                                         op_errstr);
+                if (ret)
+                        return ret;
+
+                volname = NULL;
+        }
+
+        dict_del (dict, "volname");
+        return 0;
+}
+
+/*
+ * Pre-validation entry point for snapshot operations: reads the "type" key
+ * from @dict and runs the matching prevalidate routine (create, config,
+ * restore, delete or status).
+ *
+ * An unrecognised type is logged as a warning; the function then returns
+ * the result of the successful "type" lookup, i.e. 0.
+ *
+ * Returns 0 on success, or the non-zero result of the failing routine.
+ */
+int
+glusterd_snapshot_prevalidate (dict_t *dict, char **op_errstr,
+                               dict_t *rsp_dict)
+{
+        int        ret          = -1;
+        int        snap_command = 0;
+        xlator_t  *this         = NULL;
+
+        this = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (dict);
+        GF_ASSERT (rsp_dict);
+
+        ret = dict_get_int32 (dict, "type", &snap_command);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+                        "the snapshot command");
+                return ret;
+        }
+
+        switch (snap_command) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+                ret = glusterd_snapshot_create_prevalidate (dict, op_errstr,
+                                                            rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot create "
+                                "pre-validation failed");
+                        return ret;
+                }
+                break;
+
+        case GF_SNAP_OPTION_TYPE_CONFIG:
+                ret = glusterd_snapshot_config_prevalidate (dict, op_errstr);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot config "
+                                "pre-validation failed");
+                        return ret;
+                }
+                break;
+
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+                ret = glusterd_snapshot_restore_prevalidate (dict, op_errstr,
+                                                             rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot restore "
+                                        "validation failed");
+                        return ret;
+                }
+                break;
+
+        case GF_SNAP_OPTION_TYPE_DELETE:
+                ret = glusterd_snapshot_remove_prevalidate (dict, op_errstr,
+                                                            rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot remove "
+                                "validation failed");
+                        return ret;
+                }
+                break;
+
+        case GF_SNAP_OPTION_TYPE_STATUS:
+                ret = glusterd_snapshot_status_prevalidate (dict, op_errstr,
+                                                            rsp_dict);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot status "
+                                "validation failed");
+                        return ret;
+                }
+                break;
+
+        default:
+                /* ret still holds the 0 from the "type" lookup. */
+                gf_log (this->name, GF_LOG_WARNING, "invalid snap command");
+                return ret;
+        }
+
+        return 0;
+}
+
+/* Post-validation dispatcher for snapshot commands.
+ *
+ * Reads the int32 "type" key from dict. CREATE is forwarded to
+ * glusterd_snapshot_create_postvalidate (the only helper that receives
+ * op_ret); DELETE and RESTORE share
+ * glusterd_snapshot_update_snaps_post_validate.
+ *
+ * @param dict       request dictionary; must carry the "type" key
+ * @param op_ret     result of the preceding operation, forwarded to the
+ *                   create helper only
+ * @param op_errstr  passed through to the selected helper
+ * @param rsp_dict   passed through to the selected helper
+ *
+ * @return 0 when the selected helper returns 0; otherwise the non-zero
+ *         value of the failing helper or of the "type" lookup.
+ */
+int
+glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int snap_command = 0;
+ xlator_t *this = NULL;
+ int ret = -1;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+
+ ret = dict_get_int32 (dict, "type", &snap_command);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+ "the snapshot command");
+ goto out;
+ }
+
+ switch (snap_command) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ ret = glusterd_snapshot_create_postvalidate (dict, op_ret,
+ op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot create "
+ "post-validation failed");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ ret = glusterd_snapshot_update_snaps_post_validate (dict,
+ op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "update missed snaps list");
+ goto out;
+ }
+ break;
+
+ default:
+ /* NOTE(review): ret is still 0 here (the "type" lookup
+ * succeeded), so every command type not listed above is
+ * logged as invalid yet returns success. Other snapshot
+ * types presumably rely on this; verify against the callers
+ * before turning it into an error. */
+ gf_log (this->name, GF_LOG_WARNING, "invalid snap command");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/* Handler body for CLI "snapshot" requests.
+ *
+ * Decodes the XDR request into a dictionary, adds this node's uuid under
+ * "host-uuid", reads the int32 "type" key and dispatches to the matching
+ * glusterd_handle_snapshot_* routine (config requests go straight to
+ * glusterd_mgmt_v3_initiate_all_phases).
+ *
+ * Every failure before or during dispatch leaves ret non-zero, so the
+ * error path at "out" always sends a CLI response carrying err_str
+ * ("Operation failed" when no specific message was recorded).
+ *
+ * @param req  incoming RPC request
+ *
+ * @return 0 when the dispatched handler succeeds; on failure, the return
+ *         value of glusterd_op_send_cli_response.
+ */
+int
+glusterd_handle_snapshot_fn (rpcsvc_request_t *req)
+{
+        int32_t               ret            = 0;
+        dict_t               *dict           = NULL;
+        gf_cli_req            cli_req        = {{0},};
+        glusterd_op_t         cli_op         = GD_OP_SNAP;
+        int                   type           = 0;
+        glusterd_conf_t      *conf           = NULL;
+        char                 *host_uuid      = NULL;
+        char                  err_str[2048]  = {0,};
+        xlator_t             *this           = NULL;
+        char                 *volname        = NULL;
+
+        GF_ASSERT (req);
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        ret = xdr_to_generic (req->msg[0], &cli_req,
+                              (xdrproc_t)xdr_gf_cli_req);
+        if (ret < 0) {
+                req->rpc_err = GARBAGE_ARGS;
+                goto out;
+        }
+
+        if (cli_req.dict.dict_len > 0) {
+                dict = dict_new ();
+                if (!dict) {
+                        /* Do not rely on the leftover (non-negative)
+                         * xdr_to_generic result to signal failure. */
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_unserialize (cli_req.dict.dict_val,
+                                        cli_req.dict.dict_len,
+                                        &dict);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_ERROR, "failed to "
+                                "unserialize req-buffer to dictionary");
+                        snprintf (err_str, sizeof (err_str), "Unable to decode "
+                                  "the command");
+                        goto out;
+                }
+
+                dict->extra_stdfree = cli_req.dict.dict_val;
+
+                host_uuid = gf_strdup (uuid_utoa(MY_UUID));
+                if (host_uuid == NULL) {
+                        snprintf (err_str, sizeof (err_str), "Failed to get "
+                                  "the uuid of local glusterd");
+                        ret = -1;
+                        goto out;
+                }
+                ret = dict_set_dynstr (dict, "host-uuid", host_uuid);
+                if (ret) {
+                        /* The dict did not take the string; free it here. */
+                        GF_FREE (host_uuid);
+                        goto out;
+                }
+        } else {
+                gf_log (this->name, GF_LOG_ERROR, "request dict length is %d",
+                        cli_req.dict.dict_len);
+                /* An empty request cannot be dispatched: fail explicitly. */
+                ret = -1;
+                goto out;
+        }
+
+        ret = dict_get_int32 (dict, "type", &type);
+        if (ret < 0) {
+                snprintf (err_str, sizeof (err_str), "Command type not found");
+                gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+                goto out;
+        }
+
+        switch (type) {
+        case GF_SNAP_OPTION_TYPE_CREATE:
+                ret = glusterd_handle_snapshot_create (req, cli_op, dict,
+                                                err_str, sizeof (err_str));
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot create "
+                                "failed: %s", err_str);
+                }
+                break;
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+                ret = glusterd_handle_snapshot_restore (req, cli_op, dict,
+                                                err_str, sizeof (err_str));
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot restore "
+                                "failed: %s", err_str);
+                }
+                break;
+        case GF_SNAP_OPTION_TYPE_INFO:
+                ret = glusterd_handle_snapshot_info (req, cli_op, dict,
+                                                err_str, sizeof (err_str));
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot info "
+                                "failed");
+                }
+                break;
+        case GF_SNAP_OPTION_TYPE_LIST:
+                ret = glusterd_handle_snapshot_list (req, cli_op, dict,
+                                                err_str, sizeof (err_str));
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot list "
+                                "failed");
+                }
+                break;
+        case GF_SNAP_OPTION_TYPE_CONFIG:
+                /* TODO : Type of lock to be taken when we are setting
+                 * limits system wide
+                 */
+                ret = dict_get_str (dict, "volname", &volname);
+                if (!volname) {
+                        /* No volume named in the request: record that
+                         * volume locks are not to be held. */
+                        ret = dict_set_int32 (dict, "hold_vol_locks",
+                                              _gf_false);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Unable to set hold_vol_locks value "
+                                        "as _gf_false");
+                                goto out;
+                        }
+                }
+                ret = glusterd_mgmt_v3_initiate_all_phases (req, cli_op, dict);
+                break;
+        case GF_SNAP_OPTION_TYPE_DELETE:
+                ret = glusterd_handle_snapshot_remove (req, cli_op, dict,
+                                                       err_str,
+                                                       sizeof (err_str));
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot delete "
+                                "failed: %s", err_str);
+                }
+                break;
+        case GF_SNAP_OPTION_TYPE_START:
+        case GF_SNAP_OPTION_TYPE_STOP:
+        case GF_SNAP_OPTION_TYPE_STATUS:
+                /* START and STOP are routed to the status handler too. */
+                ret = glusterd_handle_snapshot_status (req, cli_op, dict,
+                                                       err_str,
+                                                       sizeof (err_str));
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "Snapshot status "
+                                "failed: %s", err_str);
+                }
+                break;
+        default:
+                gf_log (this->name, GF_LOG_ERROR, "Unknown snapshot request "
+                        "type (%d)", type);
+                ret = -1; /* Failure */
+        }
+
+out:
+        if (ret) {
+                if (err_str[0] == '\0')
+                        snprintf (err_str, sizeof (err_str),
+                                  "Operation failed");
+                ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
+                                                     dict, err_str);
+        }
+
+        return ret;
+}
+
+/* RPC entry point for snapshot requests: hands req and
+ * glusterd_handle_snapshot_fn to glusterd_big_locked_handler and returns
+ * whatever that call returns. */
+int
+glusterd_handle_snapshot (rpcsvc_request_t *req)
+{
+        int ret = glusterd_big_locked_handler (req,
+                                               glusterd_handle_snapshot_fn);
+
+        return ret;
+}
+
+/* Release a missed-snap operation object together with the strings it
+ * owns. Accepts NULL.
+ *
+ * Both brick_path and snap_vol_id are heap copies (gf_strdup) made when
+ * the op is populated, so both must be released here; freeing only
+ * brick_path leaked snap_vol_id for every op.
+ */
+static inline void
+glusterd_free_snap_op (glusterd_snap_op_t *snap_op)
+{
+        if (snap_op) {
+                if (snap_op->brick_path)
+                        GF_FREE (snap_op->brick_path);
+
+                if (snap_op->snap_vol_id)
+                        GF_FREE (snap_op->snap_vol_id);
+
+                GF_FREE (snap_op);
+        }
+}
+
+/* Release a missed-snap info object: every op linked on its snap_ops
+ * list, its node_uuid and snap_uuid strings, and the object itself.
+ * Accepts NULL. */
+static inline void
+glusterd_free_missed_snapinfo (glusterd_missed_snap_info *missed_snapinfo)
+{
+        glusterd_snap_op_t *op   = NULL;
+        glusterd_snap_op_t *next = NULL;
+
+        if (!missed_snapinfo)
+                return;
+
+        /* The _safe iterator keeps the successor in 'next', so freeing
+         * the current entry inside the loop is fine. */
+        list_for_each_entry_safe (op, next, &missed_snapinfo->snap_ops,
+                                  snap_ops_list) {
+                glusterd_free_snap_op (op);
+        }
+
+        if (missed_snapinfo->node_uuid)
+                GF_FREE (missed_snapinfo->node_uuid);
+
+        if (missed_snapinfo->snap_uuid)
+                GF_FREE (missed_snapinfo->snap_uuid);
+
+        GF_FREE (missed_snapinfo);
+}
+
+/* Merge one missed-snap op into an existing missed-snap info object,
+ * looking for duplicates and updating the list accordingly.
+ *
+ * Walks missed_snapinfo->snap_ops for entries with the same snap_vol_id:
+ *  - same brick_path and op: the existing entry is promoted from PENDING
+ *    to DONE when the new op is DONE; otherwise the new op is a duplicate.
+ *  - same brick_num, existing op CREATE and new op DELETE or RESTORE: the
+ *    existing entry is marked DONE.
+ * If nothing matches, the new op is appended to the list.
+ *
+ * Ownership: missed_snap_op is consumed on every path. It is either
+ * linked onto missed_snapinfo->snap_ops or released with
+ * glusterd_free_snap_op, so the caller must not touch it afterwards.
+ *
+ * @return 0 on every path visible in this function.
+ */
+int32_t
+glusterd_update_missed_snap_entry (glusterd_missed_snap_info *missed_snapinfo,
+ glusterd_snap_op_t *missed_snap_op)
+{
+ int32_t ret = -1;
+ glusterd_snap_op_t *snap_opinfo = NULL;
+ gf_boolean_t match = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT(missed_snapinfo);
+ GF_ASSERT(missed_snap_op);
+
+ list_for_each_entry (snap_opinfo, &missed_snapinfo->snap_ops,
+ snap_ops_list) {
+ /* If the entry is not for the same snap_vol_id
+ * then continue
+ */
+ if (strcmp (snap_opinfo->snap_vol_id,
+ missed_snap_op->snap_vol_id))
+ continue;
+
+ if ((!strcmp (snap_opinfo->brick_path,
+ missed_snap_op->brick_path)) &&
+ (snap_opinfo->op == missed_snap_op->op)) {
+ /* If two entries have conflicting status
+ * GD_MISSED_SNAP_DONE takes precedence
+ */
+ if ((snap_opinfo->status == GD_MISSED_SNAP_PENDING) &&
+ (missed_snap_op->status == GD_MISSED_SNAP_DONE)) {
+ snap_opinfo->status = GD_MISSED_SNAP_DONE;
+ gf_log (this->name, GF_LOG_INFO,
+ "Updating missed snap status "
+ "for %s:%s=%s:%d:%s:%d as DONE",
+ missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid,
+ snap_opinfo->snap_vol_id,
+ snap_opinfo->brick_num,
+ snap_opinfo->brick_path,
+ snap_opinfo->op);
+ ret = 0;
+ /* The existing entry now carries the result;
+ * the incoming op is no longer needed. */
+ glusterd_free_snap_op (missed_snap_op);
+ goto out;
+ }
+ /* Same brick and op without a PENDING->DONE transition:
+ * treated as a duplicate below. */
+ match = _gf_true;
+ break;
+ } else if ((snap_opinfo->brick_num ==
+ missed_snap_op->brick_num) &&
+ (snap_opinfo->op == GF_SNAP_OPTION_TYPE_CREATE) &&
+ ((missed_snap_op->op ==
+ GF_SNAP_OPTION_TYPE_DELETE) ||
+ (missed_snap_op->op ==
+ GF_SNAP_OPTION_TYPE_RESTORE))) {
+ /* Optimizing create and delete entries for the same
+ * brick and same node
+ */
+ gf_log (this->name, GF_LOG_INFO,
+ "Updating missed snap status "
+ "for %s:%s=%s:%d:%s:%d as DONE",
+ missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid,
+ snap_opinfo->snap_vol_id,
+ snap_opinfo->brick_num,
+ snap_opinfo->brick_path,
+ snap_opinfo->op);
+ snap_opinfo->status = GD_MISSED_SNAP_DONE;
+ ret = 0;
+ glusterd_free_snap_op (missed_snap_op);
+ goto out;
+ }
+ }
+
+ if (match == _gf_true) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Duplicate entry. Not updating");
+ glusterd_free_snap_op (missed_snap_op);
+ } else {
+ /* No existing entry covers this op: the list takes ownership. */
+ list_add_tail (&missed_snap_op->snap_ops_list,
+ &missed_snapinfo->snap_ops);
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Add a new missed snap entry to priv->missed_snaps_list.
+ *
+ * Builds a glusterd_snap_op_t from the arguments, then looks for an
+ * existing missed-snap info object whose "node_uuid:snap_uuid" equals
+ * missed_info. If one exists the op is merged into it through
+ * glusterd_update_missed_snap_entry; otherwise a new info object is
+ * created, filled from missed_info, and appended to the list.
+ *
+ * @param missed_info  "node_uuid:snap_uuid"; NOTE: tokenised in place with
+ *                     strtok_r when no existing entry matches, so the
+ *                     caller's buffer is modified
+ * @param snap_vol_id  duplicated into the new op
+ * @param brick_num    stored in the new op
+ * @param brick_path   duplicated into the new op
+ * @param snap_op      stored in the new op as its op
+ * @param snap_status  stored in the new op as its status
+ *
+ * @return 0 on success, non-zero on failure. On failure the new op is
+ *         freed, and so is the info object if it was created here.
+ */
+int32_t
+glusterd_add_new_entry_to_list (char *missed_info, char *snap_vol_id,
+ int32_t brick_num, char *brick_path,
+ int32_t snap_op, int32_t snap_status)
+{
+ char *buf = NULL;
+ char *save_ptr = NULL;
+ char node_snap_info[PATH_MAX] = "";
+ int32_t ret = -1;
+ glusterd_missed_snap_info *missed_snapinfo = NULL;
+ glusterd_snap_op_t *missed_snap_op = NULL;
+ glusterd_conf_t *priv = NULL;
+ gf_boolean_t match = _gf_false;
+ gf_boolean_t free_missed_snap_info = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT(missed_info);
+ GF_ASSERT(snap_vol_id);
+ GF_ASSERT(brick_path);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Create the snap_op object consisting of the *
+ * snap id and the op */
+ ret = glusterd_missed_snap_op_new (&missed_snap_op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create new missed snap object.");
+ ret = -1;
+ goto out;
+ }
+
+ missed_snap_op->snap_vol_id = gf_strdup(snap_vol_id);
+ if (!missed_snap_op->snap_vol_id) {
+ ret = -1;
+ goto out;
+ }
+ missed_snap_op->brick_path = gf_strdup(brick_path);
+ if (!missed_snap_op->brick_path) {
+ ret = -1;
+ goto out;
+ }
+ missed_snap_op->brick_num = brick_num;
+ missed_snap_op->op = snap_op;
+ missed_snap_op->status = snap_status;
+
+ /* Look for other entries for the same node and same snap */
+ list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list,
+ missed_snaps) {
+ snprintf (node_snap_info, sizeof(node_snap_info),
+ "%s:%s", missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid);
+ if (!strcmp (node_snap_info, missed_info)) {
+ /* Found missed snapshot info for *
+ * the same node and same snap */
+ match = _gf_true;
+ break;
+ }
+ }
+
+ if (match == _gf_false) {
+ /* First snap op missed for the brick */
+ /* missed_snapinfo is not a valid entry after an unmatched
+ * walk; it is overwritten by the allocation below. */
+ ret = glusterd_missed_snapinfo_new (&missed_snapinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create missed snapinfo");
+ goto out;
+ }
+ /* From here on the info object is ours to free on failure. */
+ free_missed_snap_info = _gf_true;
+ buf = strtok_r (missed_info, ":", &save_ptr);
+ if (!buf) {
+ ret = -1;
+ goto out;
+ }
+ missed_snapinfo->node_uuid = gf_strdup(buf);
+ if (!missed_snapinfo->node_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ buf = strtok_r (NULL, ":", &save_ptr);
+ if (!buf) {
+ ret = -1;
+ goto out;
+ }
+ missed_snapinfo->snap_uuid = gf_strdup(buf);
+ if (!missed_snapinfo->snap_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Link the op into the new info object, then publish the info
+ * object on the global list. */
+ list_add_tail (&missed_snap_op->snap_ops_list,
+ &missed_snapinfo->snap_ops);
+ list_add_tail (&missed_snapinfo->missed_snaps,
+ &priv->missed_snaps_list);
+
+ ret = 0;
+ goto out;
+ } else {
+ /* The callee consumes missed_snap_op (links or frees it). */
+ ret = glusterd_update_missed_snap_entry (missed_snapinfo,
+ missed_snap_op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to update existing missed snap entry.");
+ goto out;
+ }
+ }
+
+out:
+ if (ret) {
+ glusterd_free_snap_op (missed_snap_op);
+
+ /* Only free the info object if this call allocated it; a
+ * matched entry still belongs to the global list. */
+ if (missed_snapinfo &&
+ (free_missed_snap_info == _gf_true))
+ glusterd_free_missed_snapinfo (missed_snapinfo);
+ }
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Add missing snap entries to the in-memory conf->missed_snap_list.
+ *
+ * Reads the keys "missed_snaps_0" .. "missed_snaps_<count-1>" from dict.
+ * Each value is parsed as
+ *     <node_uuid>:<snap_uuid>=<snap_vol_id>:<brick_num>:<brick_path>:<op>:<status>
+ * and handed to glusterd_add_new_entry_to_list.
+ *
+ * All seven fields must be present. Numeric fields are converted only
+ * after every token is known to be non-NULL, so a truncated entry is
+ * rejected as invalid instead of passing NULL to atoi().
+ *
+ * @param dict               dictionary holding the missed_snaps_<n> strings
+ * @param missed_snap_count  number of entries to read
+ *
+ * @return 0 on success, non-zero on the first entry that cannot be
+ *         fetched, parsed or stored.
+ */
+int32_t
+glusterd_add_missed_snaps_to_list (dict_t *dict, int32_t missed_snap_count)
+{
+        char            *buf                   = NULL;
+        char            *tmp                   = NULL;
+        char            *save_ptr              = NULL;
+        char            *nodeid                = NULL;
+        char            *snap_uuid             = NULL;
+        char            *snap_vol_id           = NULL;
+        char            *brick_path            = NULL;
+        char            *brick_num_str         = NULL;
+        char            *snap_op_str           = NULL;
+        char            *snap_status_str       = NULL;
+        char             missed_info[PATH_MAX] = "";
+        char             name_buf[PATH_MAX]    = "";
+        int32_t          i                     = -1;
+        int32_t          ret                   = -1;
+        int32_t          brick_num             = -1;
+        int32_t          snap_op               = -1;
+        int32_t          snap_status           = -1;
+        glusterd_conf_t *priv                  = NULL;
+        xlator_t        *this                  = NULL;
+
+        this = THIS;
+        GF_ASSERT(this);
+        GF_ASSERT(dict);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        /* We can update the missed_snaps_list without acquiring *
+         * any additional locks as big lock will be held.        */
+        for (i = 0; i < missed_snap_count; i++) {
+                snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d",
+                          i);
+                ret = dict_get_str (dict, name_buf, &buf);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to fetch %s", name_buf);
+                        goto out;
+                }
+
+                gf_log (this->name, GF_LOG_DEBUG, "missed_snap_entry = %s",
+                        buf);
+
+                /* Need to make a duplicate string coz the same dictionary *
+                 * is resent to the non-originator nodes (strtok_r writes  *
+                 * into the string it splits).                             */
+                tmp = gf_strdup (buf);
+                if (!tmp) {
+                        ret = -1;
+                        goto out;
+                }
+
+                /* Fetch the node-id, snap-id, brick_num,
+                 * brick_path, snap_op and snap status
+                 */
+                nodeid          = strtok_r (tmp, ":", &save_ptr);
+                snap_uuid       = strtok_r (NULL, "=", &save_ptr);
+                snap_vol_id     = strtok_r (NULL, ":", &save_ptr);
+                brick_num_str   = strtok_r (NULL, ":", &save_ptr);
+                brick_path      = strtok_r (NULL, ":", &save_ptr);
+                snap_op_str     = strtok_r (NULL, ":", &save_ptr);
+                snap_status_str = strtok_r (NULL, ":", &save_ptr);
+
+                /* Reject entries with missing fields before converting. */
+                if (!nodeid || !snap_uuid || !snap_vol_id ||
+                    !brick_num_str || !brick_path ||
+                    !snap_op_str || !snap_status_str) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Invalid missed_snap_entry");
+                        ret = -1;
+                        goto out;
+                }
+
+                brick_num   = atoi (brick_num_str);
+                snap_op     = atoi (snap_op_str);
+                snap_status = atoi (snap_status_str);
+
+                if (brick_num < 1 || snap_op < 1 || snap_status < 1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Invalid missed_snap_entry");
+                        ret = -1;
+                        goto out;
+                }
+
+                snprintf (missed_info, sizeof(missed_info), "%s:%s",
+                          nodeid, snap_uuid);
+
+                ret = glusterd_add_new_entry_to_list (missed_info,
+                                                      snap_vol_id,
+                                                      brick_num,
+                                                      brick_path,
+                                                      snap_op,
+                                                      snap_status);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to store missed snaps_list");
+                        goto out;
+                }
+
+                GF_FREE (tmp);
+                tmp = NULL;
+        }
+
+        ret = 0;
+out:
+        if (tmp)
+                GF_FREE (tmp);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* This function will restore the origin volume to its snap.
+ * The restore operation will simply replace the Gluster origin
+ * volume with the snap volume.
+ * TODO: Multi-volume delete to be done.
+ * Cleanup in case of restore failure is pending.
+ *
+ * Sequence: stop the snap volume, duplicate its volinfo, copy identity
+ * fields (name, volume id, snap count/limit, version) from the origin,
+ * re-link the origin's snap volumes onto the new volinfo, restore the
+ * snap volinfo, remove the origin's LVM backend, delete the origin
+ * volinfo, add the new volinfo to conf->volumes, remove the snap and
+ * finally store the new volinfo.
+ *
+ * @param rsp_dict response dictionary, passed to the restore/remove helpers
+ * @param orig_vol volinfo of origin volume
+ * @param snap_vol volinfo of snapshot volume
+ *
+ * @return 0 on success and negative value on error
+ */
+int
+gd_restore_snap_volume (dict_t *rsp_dict,
+ glusterd_volinfo_t *orig_vol,
+ glusterd_volinfo_t *snap_vol)
+{
+ int ret = -1;
+ glusterd_volinfo_t *new_volinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *temp_volinfo = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GF_VALIDATE_OR_GOTO (this->name, orig_vol, out);
+ GF_VALIDATE_OR_GOTO (this->name, snap_vol, out);
+ snap = snap_vol->snapshot;
+ GF_VALIDATE_OR_GOTO (this->name, snap, out);
+
+ /* Snap volume must be stopped before performing the
+ * restore operation.
+ */
+ ret = glusterd_stop_volume (snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to stop "
+ "snap volume");
+ goto out;
+ }
+
+ /* Create a new volinfo for the restored volume */
+ ret = glusterd_volinfo_dup (snap_vol, &new_volinfo, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create volinfo");
+ goto out;
+ }
+
+ /* Following entries need to be derived from origin volume. */
+ strcpy (new_volinfo->volname, orig_vol->volname);
+ uuid_copy (new_volinfo->volume_id, orig_vol->volume_id);
+ new_volinfo->snap_count = orig_vol->snap_count;
+ new_volinfo->snap_max_hard_limit = orig_vol->snap_max_hard_limit;
+ uuid_copy (new_volinfo->restored_from_snap,
+ snap_vol->snapshot->snap_id);
+
+ /* Start from the origin volume's version. The store call at the
+ * end is passed GLUSTERD_VOLINFO_VER_AC_INCREMENT, which presumably
+ * performs the actual bump so that other nodes can sync during
+ * handshake -- TODO confirm. */
+ new_volinfo->version = orig_vol->version;
+
+ /* NOTE(review): entries are appended to the new list without a
+ * preceding list_del from orig_vol->snap_volumes; this looks safe
+ * only because orig_vol is deleted below -- confirm. */
+ list_for_each_entry_safe (voliter, temp_volinfo,
+ &orig_vol->snap_volumes, snapvol_list) {
+ list_add_tail (&voliter->snapvol_list,
+ &new_volinfo->snap_volumes);
+ }
+ /* Copy the snap vol info to the new_volinfo.*/
+ ret = glusterd_snap_volinfo_restore (rsp_dict, new_volinfo, snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to restore snap");
+ (void)glusterd_volinfo_delete (new_volinfo);
+ goto out;
+ }
+
+ ret = glusterd_lvm_snapshot_remove (rsp_dict, orig_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to remove "
+ "LVM backend");
+ (void)glusterd_volinfo_delete (new_volinfo);
+ goto out;
+ }
+
+ /* New volinfo always shows the status as created. Therefore
+ * set the status to the original volume's status. */
+ glusterd_set_volume_status (new_volinfo, orig_vol->status);
+
+ /* Once the new_volinfo is completely constructed then delete
+ * the original volinfo
+ */
+ ret = glusterd_volinfo_delete (orig_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to delete volinfo");
+ (void)glusterd_volinfo_delete (new_volinfo);
+ goto out;
+ }
+
+ /* From this point new_volinfo is published on conf->volumes, so
+ * the failure paths below no longer delete it. */
+ list_add_tail (&new_volinfo->vol_list, &conf->volumes);
+
+ /* Now delete the snap entry. As a first step delete the snap
+ * volume information stored in store. */
+ ret = glusterd_snap_remove (rsp_dict, snap, _gf_false, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to delete "
+ "snap %s", snap->snapname);
+ goto out;
+ }
+
+ ret = glusterd_store_volinfo (new_volinfo,
+ GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store volinfo");
+ goto out;
+ }
+
+ ret = 0;
+out:
+
+ return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c
index 9cfed8db9..afbc8ff35 100644
--- a/xlators/mgmt/glusterd/src/glusterd-store.c
+++ b/xlators/mgmt/glusterd/src/glusterd-store.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2007-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -35,6 +35,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "glusterd-hooks.h"
+#include "store.h"
#include "glusterd-store.h"
#include "rpc-clnt.h"
@@ -43,152 +44,10 @@
#include <sys/resource.h>
#include <inttypes.h>
#include <dirent.h>
+#include <mntent.h>
-static int32_t
-glusterd_store_mkdir (char *path)
-{
- int32_t ret = -1;
-
- ret = mkdir (path, 0777);
-
- if ((-1 == ret) && (EEXIST != errno)) {
- gf_log (THIS->name, GF_LOG_ERROR, "mkdir() failed on path %s,"
- "errno: %s", path, strerror (errno));
- } else {
- ret = 0;
- }
-
- return ret;
-}
-
-int32_t
-glusterd_store_handle_create_on_absence (glusterd_store_handle_t **shandle,
- char *path)
-{
- GF_ASSERT (shandle);
- int32_t ret = 0;
-
- if (*shandle == NULL) {
- ret = glusterd_store_handle_new (path, shandle);
-
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Unable to create "
- "store handle for path: %s", path);
- }
- }
- return ret;
-}
-
-int32_t
-glusterd_store_mkstemp (glusterd_store_handle_t *shandle)
-{
- int fd = -1;
- char tmppath[PATH_MAX] = {0,};
-
- GF_ASSERT (shandle);
- GF_ASSERT (shandle->path);
-
- snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path);
- fd = open (tmppath, O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0600);
- if (fd <= 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "Failed to open %s, "
- "error: %s", tmppath, strerror (errno));
- }
-
- return fd;
-}
-
-int
-glusterd_store_sync_direntry (char *path)
-{
- int ret = -1;
- int dirfd = -1;
- char *dir = NULL;
- char *pdir = NULL;
- xlator_t *this = NULL;
-
- this = THIS;
-
- dir = gf_strdup (path);
- if (!dir)
- goto out;
-
- pdir = dirname (dir);
- dirfd = open (pdir, O_RDONLY);
- if (dirfd == -1) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to open directory "
- "%s, due to %s", pdir, strerror (errno));
- goto out;
- }
-
- ret = fsync (dirfd);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to fsync %s, due to "
- "%s", pdir, strerror (errno));
- goto out;
- }
-
- ret = 0;
-out:
- if (dirfd >= 0) {
- ret = close (dirfd);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to close "
- "%s, due to %s", pdir, strerror (errno));
- }
- }
-
- if (dir)
- GF_FREE (dir);
-
- return ret;
-}
-
-int32_t
-glusterd_store_rename_tmppath (glusterd_store_handle_t *shandle)
-{
- int32_t ret = -1;
- char tmppath[PATH_MAX] = {0,};
-
- GF_ASSERT (shandle);
- GF_ASSERT (shandle->path);
-
- snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path);
- ret = rename (tmppath, shandle->path);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Failed to rename %s to %s, "
- "error: %s", tmppath, shandle->path, strerror (errno));
- goto out;
- }
-
- ret = glusterd_store_sync_direntry (tmppath);
-out:
- return ret;
-}
-
-int32_t
-glusterd_store_unlink_tmppath (glusterd_store_handle_t *shandle)
-{
- int32_t ret = -1;
- char tmppath[PATH_MAX] = {0,};
-
- GF_ASSERT (shandle);
- GF_ASSERT (shandle->path);
-
- snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path);
- ret = unlink (tmppath);
- if (ret && (errno != ENOENT)) {
- gf_log (THIS->name, GF_LOG_ERROR, "Failed to mv %s to %s, "
- "error: %s", tmppath, shandle->path, strerror (errno));
- } else {
- ret = 0;
- }
-
- return ret;
-}
-
-static void
-glusterd_replace_slash_with_hipen (char *str)
+void
+glusterd_replace_slash_with_hyphen (char *str)
{
char *ptr = NULL;
@@ -213,7 +72,7 @@ glusterd_store_create_brick_dir (glusterd_volinfo_t *volinfo)
GF_ASSERT (priv);
GLUSTERD_GET_BRICK_DIR (brickdirpath, volinfo, priv);
- ret = glusterd_store_mkdir (brickdirpath);
+ ret = gf_store_mkdir (brickdirpath);
return ret;
}
@@ -227,7 +86,7 @@ glusterd_store_key_vol_brick_set (glusterd_brickinfo_t *brickinfo,
GF_ASSERT (len >= PATH_MAX);
snprintf (key_vol_brick, len, "%s", brickinfo->path);
- glusterd_replace_slash_with_hipen (key_vol_brick);
+ glusterd_replace_slash_with_hyphen (key_vol_brick);
}
static void
@@ -308,7 +167,7 @@ out:
if (brickinfo)
glusterd_brickinfo_delete (brickinfo);
if (volinfo)
- glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_unref (volinfo);
return ret;
}
@@ -326,7 +185,7 @@ glusterd_store_volinfo_brick_fname_write (int vol_fd,
brick_count);
glusterd_store_brickinfofname_set (brickinfo, brickfname,
sizeof (brickfname));
- ret = glusterd_store_save_value (vol_fd, key, brickfname);
+ ret = gf_store_save_value (vol_fd, key, brickfname);
if (ret)
goto out;
@@ -345,9 +204,9 @@ glusterd_store_create_brick_shandle_on_absence (glusterd_volinfo_t *volinfo,
GF_ASSERT (brickinfo);
glusterd_store_brickinfopath_set (volinfo, brickinfo, brickpath,
- sizeof (brickpath));
- ret = glusterd_store_handle_create_on_absence (&brickinfo->shandle,
- brickpath);
+ sizeof (brickpath));
+ ret = gf_store_handle_create_on_absence (&brickinfo->shandle,
+ brickpath);
return ret;
}
@@ -360,30 +219,53 @@ glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo)
GF_ASSERT (brickinfo);
GF_ASSERT (fd > 0);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_HOSTNAME,
- brickinfo->hostname);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_HOSTNAME,
+ brickinfo->hostname);
if (ret)
goto out;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PATH,
- brickinfo->path);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PATH,
+ brickinfo->path);
if (ret)
goto out;
snprintf (value, sizeof(value), "%d", brickinfo->port);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PORT,
- value);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PORT, value);
snprintf (value, sizeof(value), "%d", brickinfo->rdma_port);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT,
- value);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT,
+ value);
snprintf (value, sizeof(value), "%d", brickinfo->decommissioned);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
- value);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
+ value);
+ if (ret)
+ goto out;
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_ID,
+ brickinfo->brick_id);
if (ret)
goto out;
+ if (strlen(brickinfo->device_path) > 0) {
+ snprintf (value, sizeof(value), "%s", brickinfo->device_path);
+ ret = gf_store_save_value (fd,
+ GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH, value);
+ if (ret)
+ goto out;
+ }
+
+ snprintf (value, sizeof(value), "%d", brickinfo->snap_status);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS,
+ value);
+ if (ret)
+ goto out;
+
+ if (!brickinfo->vg[0])
+ goto out;
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_VGNAME,
+ brickinfo->vg);
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -396,7 +278,7 @@ glusterd_store_perform_brick_store (glusterd_brickinfo_t *brickinfo)
int32_t ret = -1;
GF_ASSERT (brickinfo);
- fd = glusterd_store_mkstemp (brickinfo->shandle);
+ fd = gf_store_mkstemp (brickinfo->shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -408,7 +290,7 @@ glusterd_store_perform_brick_store (glusterd_brickinfo_t *brickinfo)
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (brickinfo->shandle);
+ gf_store_unlink_tmppath (brickinfo->shandle);
if (fd > 0)
close (fd);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -490,7 +372,7 @@ glusterd_store_delete_brick (glusterd_brickinfo_t *brickinfo, char *delete_path)
out:
if (brickinfo->shandle) {
- glusterd_store_handle_destroy (brickinfo->shandle);
+ gf_store_handle_destroy (brickinfo->shandle);
brickinfo->shandle = NULL;
}
gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
@@ -554,13 +436,13 @@ static int
_storeslaves (dict_t *this, char *key, data_t *value, void *data)
{
int32_t ret = 0;
- glusterd_store_handle_t *shandle = NULL;
- xlator_t *xl = NULL;
+ gf_store_handle_t *shandle = NULL;
+ xlator_t *xl = NULL;
xl = THIS;
GF_ASSERT (xl);
- shandle = (glusterd_store_handle_t*)data;
+ shandle = (gf_store_handle_t*)data;
GF_ASSERT (shandle);
GF_ASSERT (shandle->fd > 0);
@@ -579,7 +461,7 @@ _storeslaves (dict_t *this, char *key, data_t *value, void *data)
gf_log (xl->name, GF_LOG_DEBUG, "Storing in volinfo:key= %s, val=%s",
key, value->data);
- ret = glusterd_store_save_value (shandle->fd, key, (char*)value->data);
+ ret = gf_store_save_value (shandle->fd, key, (char*)value->data);
if (ret) {
gf_log (xl->name, GF_LOG_ERROR, "Unable to write into store"
" handle for path: %s", shandle->path);
@@ -593,13 +475,13 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data)
{
int32_t ret = 0;
int32_t exists = 0;
- glusterd_store_handle_t *shandle = NULL;
- xlator_t *xl = NULL;
+ gf_store_handle_t *shandle = NULL;
+ xlator_t *xl = NULL;
xl = THIS;
GF_ASSERT (xl);
- shandle = (glusterd_store_handle_t*)data;
+ shandle = (gf_store_handle_t*)data;
GF_ASSERT (shandle);
GF_ASSERT (shandle->fd > 0);
@@ -632,7 +514,7 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data)
return 0;
}
- ret = glusterd_store_save_value (shandle->fd, key, (char*)value->data);
+ ret = gf_store_save_value (shandle->fd, key, (char*)value->data);
if (ret) {
gf_log (xl->name, GF_LOG_ERROR, "Unable to write into store"
" handle for path: %s", shandle->path);
@@ -644,93 +526,124 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data)
int32_t
glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo)
{
- char *str = NULL;
+ char *str = NULL;
+ char buf[PATH_MAX] = "";
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (fd > 0);
GF_ASSERT (volinfo);
- char buf[PATH_MAX] = {0,};
- int32_t ret = -1;
-
snprintf (buf, sizeof (buf), "%d", volinfo->type);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TYPE, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TYPE, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->brick_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_COUNT, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_COUNT, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->status);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STATUS, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STATUS, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->sub_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_SUB_COUNT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_SUB_COUNT, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->stripe_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STRIPE_CNT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STRIPE_CNT, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->replica_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT,
+ buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->version);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->transport_type);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TRANSPORT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TRANSPORT, buf);
if (ret)
goto out;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_ID,
- uuid_utoa (volinfo->volume_id));
+ snprintf (buf, sizeof (buf), "%s", volinfo->parent_volname);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PARENT_VOLNAME, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store "
+ GLUSTERD_STORE_KEY_PARENT_VOLNAME);
+ goto out;
+ }
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_ID,
+ uuid_utoa (volinfo->volume_id));
if (ret)
goto out;
str = glusterd_auth_get_username (volinfo);
if (str) {
- ret = glusterd_store_save_value (fd,
- GLUSTERD_STORE_KEY_USERNAME,
- str);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_USERNAME,
+ str);
if (ret)
goto out;
}
str = glusterd_auth_get_password (volinfo);
if (str) {
- ret = glusterd_store_save_value (fd,
- GLUSTERD_STORE_KEY_PASSWORD,
- str);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PASSWORD,
+ str);
if (ret)
goto out;
}
- if (volinfo->backend == GD_VOL_BK_BD) {
- snprintf (buf, sizeof (buf), "%d", volinfo->backend);
- ret = glusterd_store_save_value (fd,
- GLUSTERD_STORE_KEY_VOL_BACKEND, buf);
+ snprintf (buf, sizeof (buf), "%d", volinfo->op_version);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_OP_VERSION, buf);
+ if (ret)
+ goto out;
+
+ snprintf (buf, sizeof (buf), "%d", volinfo->client_op_version);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION,
+ buf);
+ if (ret)
+ goto out;
+ if (volinfo->caps) {
+ snprintf (buf, sizeof (buf), "%d", volinfo->caps);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CAPS,
+ buf);
if (ret)
goto out;
}
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP,
+ uuid_utoa (volinfo->restored_from_snap));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to write restored_from_snap");
+ goto out;
+ }
+
+ snprintf (buf, sizeof (buf), "%"PRIu64, volinfo->snap_max_hard_limit);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+ buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to write snap-max-hard-limit");
+ goto out;
+ }
+
out:
if (ret)
- gf_log (THIS->name, GF_LOG_ERROR, "Unable to write volume "
+ gf_log (this->name, GF_LOG_ERROR, "Unable to write volume "
"values for %s", volinfo->volname);
return ret;
}
@@ -745,8 +658,7 @@ glusterd_store_voldirpath_set (glusterd_volinfo_t *volinfo, char *voldirpath,
priv = THIS->private;
GF_ASSERT (priv);
- snprintf (voldirpath, len, "%s/%s/%s", priv->workdir,
- GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname);
+ GLUSTERD_GET_VOLUME_DIR (voldirpath, volinfo, priv);
}
static int32_t
@@ -759,16 +671,38 @@ glusterd_store_create_volume_dir (glusterd_volinfo_t *volinfo)
glusterd_store_voldirpath_set (volinfo, voldirpath,
sizeof (voldirpath));
- ret = glusterd_store_mkdir (voldirpath);
+ ret = gf_store_mkdir (voldirpath);
+
gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
int32_t
+glusterd_store_create_snap_dir (glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+ char snapdirpath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snap);
+
+ GLUSTERD_GET_SNAP_DIR (snapdirpath, snap, priv);
+
+ ret = mkdir_p (snapdirpath, 0755, _gf_true);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snaps dir "
+ "%s", snapdirpath);
+ }
+ return ret;
+}
+
+int32_t
glusterd_store_volinfo_write (int fd, glusterd_volinfo_t *volinfo)
{
int32_t ret = -1;
- glusterd_store_handle_t *shandle = NULL;
+ gf_store_handle_t *shandle = NULL;
GF_ASSERT (fd > 0);
GF_ASSERT (volinfo);
GF_ASSERT (volinfo->shandle);
@@ -788,6 +722,49 @@ out:
return ret;
}
+int32_t
+glusterd_store_snapinfo_write (glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+ int fd = 0;
+ char buf[PATH_MAX] = "";
+
+ GF_ASSERT (snap);
+
+ fd = gf_store_mkstemp (snap->shandle);
+ if (fd <= 0)
+ goto out;
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_ID,
+ uuid_utoa (snap->snap_id));
+ if (ret)
+ goto out;
+
+ snprintf (buf, sizeof (buf), "%d", snap->snap_status);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_STATUS, buf);
+ if (ret)
+ goto out;
+
+ snprintf (buf, sizeof (buf), "%d", snap->snap_restored);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_RESTORED, buf);
+ if (ret)
+ goto out;
+
+ if (snap->description) {
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_DESC,
+ snap->description);
+ if (ret)
+ goto out;
+ }
+
+ snprintf (buf, sizeof (buf), "%ld", snap->time_stamp);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_TIMESTAMP, buf);
+
+out:
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
static void
glusterd_store_rbstatepath_set (glusterd_volinfo_t *volinfo, char *rbstatepath,
size_t len)
@@ -832,6 +809,51 @@ glusterd_store_node_state_path_set (glusterd_volinfo_t *volinfo,
GLUSTERD_NODE_STATE_FILE);
}
+static void
+glusterd_store_quota_conf_path_set (glusterd_volinfo_t *volinfo,
+ char *quota_conf_path, size_t len)
+{
+ char voldirpath[PATH_MAX] = {0,};
+ GF_ASSERT (volinfo);
+ GF_ASSERT (quota_conf_path);
+ GF_ASSERT (len <= PATH_MAX);
+
+ glusterd_store_voldirpath_set (volinfo, voldirpath,
+ sizeof (voldirpath));
+ snprintf (quota_conf_path, len, "%s/%s", voldirpath,
+ GLUSTERD_VOLUME_QUOTA_CONFIG);
+}
+
+static void
+glusterd_store_missed_snaps_list_path_set (char *missed_snaps_list,
+ size_t len)
+{
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (missed_snaps_list);
+ GF_ASSERT (len <= PATH_MAX);
+
+ snprintf (missed_snaps_list, len, "%s/snaps/"
+ GLUSTERD_MISSED_SNAPS_LIST_FILE, priv->workdir);
+}
+
+static void
+glusterd_store_snapfpath_set (glusterd_snap_t *snap, char *snap_fpath,
+ size_t len)
+{
+ glusterd_conf_t *priv = NULL;
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snap);
+ GF_ASSERT (snap_fpath);
+ GF_ASSERT (len <= PATH_MAX);
+
+ snprintf (snap_fpath, len, "%s/snaps/%s/%s", priv->workdir,
+ snap->snapname, GLUSTERD_SNAP_INFO_FILE);
+}
+
int32_t
glusterd_store_create_rbstate_shandle_on_absence (glusterd_volinfo_t *volinfo)
{
@@ -841,8 +863,8 @@ glusterd_store_create_rbstate_shandle_on_absence (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
glusterd_store_rbstatepath_set (volinfo, rbstatepath, sizeof (rbstatepath));
- ret = glusterd_store_handle_create_on_absence (&volinfo->rb_shandle,
- rbstatepath);
+ ret = gf_store_handle_create_on_absence (&volinfo->rb_shandle,
+ rbstatepath);
return ret;
}
@@ -855,8 +877,7 @@ glusterd_store_create_vol_shandle_on_absence (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
glusterd_store_volfpath_set (volinfo, volfpath, sizeof (volfpath));
- ret = glusterd_store_handle_create_on_absence (&volinfo->shandle,
- volfpath);
+ ret = gf_store_handle_create_on_absence (&volinfo->shandle, volfpath);
return ret;
}
@@ -871,13 +892,66 @@ glusterd_store_create_nodestate_sh_on_absence (glusterd_volinfo_t *volinfo)
glusterd_store_node_state_path_set (volinfo, node_state_path,
sizeof (node_state_path));
ret =
- glusterd_store_handle_create_on_absence (&volinfo->node_state_shandle,
- node_state_path);
+ gf_store_handle_create_on_absence (&volinfo->node_state_shandle,
+ node_state_path);
return ret;
}
int32_t
+glusterd_store_create_quota_conf_sh_on_absence (glusterd_volinfo_t *volinfo)
+{
+ char quota_conf_path[PATH_MAX] = {0};
+ int32_t ret = 0;
+
+ GF_ASSERT (volinfo);
+
+ glusterd_store_quota_conf_path_set (volinfo, quota_conf_path,
+ sizeof (quota_conf_path));
+ ret =
+ gf_store_handle_create_on_absence (&volinfo->quota_conf_shandle,
+ quota_conf_path);
+
+ return ret;
+}
+
+static int32_t
+glusterd_store_create_missed_snaps_list_shandle_on_absence ()
+{
+ char missed_snaps_list[PATH_MAX] = "";
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ glusterd_store_missed_snaps_list_path_set (missed_snaps_list,
+ sizeof(missed_snaps_list));
+
+ ret = gf_store_handle_create_on_absence
+ (&priv->missed_snaps_list_shandle,
+ missed_snaps_list);
+ return ret;
+}
+
+int32_t
+glusterd_store_create_snap_shandle_on_absence (glusterd_snap_t *snap)
+{
+ char snapfpath[PATH_MAX] = {0};
+ int32_t ret = 0;
+
+ GF_ASSERT (snap);
+
+ glusterd_store_snapfpath_set (snap, snapfpath, sizeof (snapfpath));
+ ret = gf_store_handle_create_on_absence (&snap->shandle, snapfpath);
+ return ret;
+}
+
+int32_t
glusterd_store_brickinfos (glusterd_volinfo_t *volinfo, int vol_fd)
{
int32_t ret = 0;
@@ -888,7 +962,7 @@ glusterd_store_brickinfos (glusterd_volinfo_t *volinfo, int vol_fd)
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
ret = glusterd_store_brickinfo (volinfo, brickinfo,
- brick_count, vol_fd);
+ brick_count, vol_fd);
if (ret)
goto out;
brick_count++;
@@ -909,8 +983,7 @@ glusterd_store_rbstate_write (int fd, glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
snprintf (buf, sizeof (buf), "%d", volinfo->rep_brick.rb_status);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_STATUS,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_STATUS, buf);
if (ret)
goto out;
@@ -919,16 +992,16 @@ glusterd_store_rbstate_write (int fd, glusterd_volinfo_t *volinfo)
snprintf (buf, sizeof (buf), "%s:%s",
volinfo->rep_brick.src_brick->hostname,
volinfo->rep_brick.src_brick->path);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_SRC_BRICK,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_SRC_BRICK,
+ buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%s:%s",
volinfo->rep_brick.dst_brick->hostname,
volinfo->rep_brick.dst_brick->path);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_BRICK,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_BRICK,
+ buf);
if (ret)
goto out;
@@ -944,13 +1017,12 @@ glusterd_store_rbstate_write (int fd, glusterd_volinfo_t *volinfo)
}
snprintf (buf, sizeof (buf), "%d", port);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_PORT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_PORT,
+ buf);
if (ret)
goto out;
uuid_unparse (volinfo->rep_brick.rb_id, buf);
- ret = glusterd_store_save_value (fd, GF_REPLACE_BRICK_TID_KEY,
- buf);
+ ret = gf_store_save_value (fd, GF_REPLACE_BRICK_TID_KEY, buf);
}
ret = 0;
@@ -966,7 +1038,7 @@ glusterd_store_perform_rbstate_store (glusterd_volinfo_t *volinfo)
int32_t ret = -1;
GF_ASSERT (volinfo);
- fd = glusterd_store_mkstemp (volinfo->rb_shandle);
+ fd = gf_store_mkstemp (volinfo->rb_shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -976,19 +1048,32 @@ glusterd_store_perform_rbstate_store (glusterd_volinfo_t *volinfo)
if (ret)
goto out;
- ret = glusterd_store_rename_tmppath (volinfo->rb_shandle);
+ ret = gf_store_rename_tmppath (volinfo->rb_shandle);
if (ret)
goto out;
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (volinfo->rb_shandle);
+ gf_store_unlink_tmppath (volinfo->rb_shandle);
if (fd > 0)
close (fd);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+int
+_gd_store_rebalance_dict (dict_t *dict, char *key, data_t *value, void *data)
+{
+ int ret = -1;
+ int fd = 0;
+
+ fd = *(int *)data;
+
+ ret = gf_store_save_value (fd, key, value->data);
+
+ return ret;
+}
+
int32_t
glusterd_store_node_state_write (int fd, glusterd_volinfo_t *volinfo)
{
@@ -1004,22 +1089,23 @@ glusterd_store_node_state_write (int fd, glusterd_volinfo_t *volinfo)
}
snprintf (buf, sizeof (buf), "%d", volinfo->rebal.defrag_cmd);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DEFRAG,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DEFRAG, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->rebal.op);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_DEFRAG_OP,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_DEFRAG_OP, buf);
+ if (ret)
+ goto out;
+
+ uuid_unparse (volinfo->rebal.rebalance_id, buf);
+ ret = gf_store_save_value (fd, GF_REBALANCE_TID_KEY, buf);
if (ret)
goto out;
- if (volinfo->rebal.defrag_cmd) {
- uuid_unparse (volinfo->rebal.rebalance_id, buf);
- ret = glusterd_store_save_value (fd,
- GF_REBALANCE_TID_KEY,
- buf);
+ if (volinfo->rebal.dict) {
+ dict_foreach (volinfo->rebal.dict, _gd_store_rebalance_dict,
+ &fd);
}
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -1033,7 +1119,7 @@ glusterd_store_perform_node_state_store (glusterd_volinfo_t *volinfo)
int32_t ret = -1;
GF_ASSERT (volinfo);
- fd = glusterd_store_mkstemp (volinfo->node_state_shandle);
+ fd = gf_store_mkstemp (volinfo->node_state_shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -1043,13 +1129,13 @@ glusterd_store_perform_node_state_store (glusterd_volinfo_t *volinfo)
if (ret)
goto out;
- ret = glusterd_store_rename_tmppath (volinfo->node_state_shandle);
+ ret = gf_store_rename_tmppath (volinfo->node_state_shandle);
if (ret)
goto out;
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (volinfo->node_state_shandle);
+ gf_store_unlink_tmppath (volinfo->node_state_shandle);
if (fd > 0)
close (fd);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -1063,7 +1149,7 @@ glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo)
int32_t ret = -1;
GF_ASSERT (volinfo);
- fd = glusterd_store_mkstemp (volinfo->shandle);
+ fd = gf_store_mkstemp (volinfo->shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -1079,7 +1165,7 @@ glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo)
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (volinfo->shandle);
+ gf_store_unlink_tmppath (volinfo->shandle);
if (fd > 0)
close (fd);
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
@@ -1112,7 +1198,7 @@ glusterd_store_bricks_cleanup_tmp (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- glusterd_store_unlink_tmppath (brickinfo->shandle);
+ gf_store_unlink_tmppath (brickinfo->shandle);
}
}
@@ -1123,11 +1209,11 @@ glusterd_store_volume_cleanup_tmp (glusterd_volinfo_t *volinfo)
glusterd_store_bricks_cleanup_tmp (volinfo);
- glusterd_store_unlink_tmppath (volinfo->shandle);
+ gf_store_unlink_tmppath (volinfo->shandle);
- glusterd_store_unlink_tmppath (volinfo->rb_shandle);
+ gf_store_unlink_tmppath (volinfo->rb_shandle);
- glusterd_store_unlink_tmppath (volinfo->node_state_shandle);
+ gf_store_unlink_tmppath (volinfo->node_state_shandle);
}
int32_t
@@ -1139,7 +1225,7 @@ glusterd_store_brickinfos_atomic_update (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- ret = glusterd_store_rename_tmppath (brickinfo->shandle);
+ ret = gf_store_rename_tmppath (brickinfo->shandle);
if (ret)
goto out;
}
@@ -1153,7 +1239,7 @@ glusterd_store_volinfo_atomic_update (glusterd_volinfo_t *volinfo)
int ret = -1;
GF_ASSERT (volinfo);
- ret = glusterd_store_rename_tmppath (volinfo->shandle);
+ ret = gf_store_rename_tmppath (volinfo->shandle);
if (ret)
goto out;
@@ -1181,6 +1267,60 @@ out:
}
int32_t
+glusterd_store_snap_atomic_update (glusterd_snap_t *snap)
+{
+ int ret = -1;
+ GF_ASSERT (snap);
+
+ ret = gf_store_rename_tmppath (snap->shandle);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "Couldn't rename "
+ "temporary file(s): Reason %s", strerror (errno));
+
+ return ret;
+}
+
+int32_t
+glusterd_store_snap (glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+
+ GF_ASSERT (snap);
+
+ ret = glusterd_store_create_snap_dir (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snap dir");
+ goto out;
+ }
+
+ ret = glusterd_store_create_snap_shandle_on_absence (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snap info "
+ "file");
+ goto out;
+ }
+
+ ret = glusterd_store_snapinfo_write (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to write snap info");
+ goto out;
+ }
+
+ ret = glusterd_store_snap_atomic_update (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,"Failed to do automic update");
+ goto out;
+ }
+
+out:
+ if (ret && snap->shandle)
+ gf_store_unlink_tmppath (snap->shandle);
+
+ gf_log (THIS->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t ac)
{
int32_t ret = -1;
@@ -1224,7 +1364,7 @@ glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t a
goto out;
//checksum should be computed at the end
- ret = glusterd_volume_compute_cksum (volinfo);
+ ret = glusterd_compute_cksum (volinfo, _gf_false);
if (ret)
goto out;
@@ -1237,7 +1377,6 @@ out:
return ret;
}
-
int32_t
glusterd_store_delete_volume (glusterd_volinfo_t *volinfo)
{
@@ -1264,8 +1403,8 @@ glusterd_store_delete_volume (glusterd_volinfo_t *volinfo)
GLUSTERD_GET_VOLUME_DIR (pathname, volinfo, priv);
snprintf (delete_path, sizeof (delete_path),
- "%s/"GLUSTERD_TRASH"/%s.deleted", priv->workdir,
- uuid_utoa (volinfo->volume_id));
+ "%s/"GLUSTERD_TRASH"/%s.deleted", priv->workdir,
+ uuid_utoa (volinfo->volume_id));
snprintf (trashdir, sizeof (trashdir), "%s/"GLUSTERD_TRASH,
priv->workdir);
@@ -1348,7 +1487,7 @@ stat_failed:
out:
if (volinfo->shandle) {
- glusterd_store_handle_destroy (volinfo->shandle);
+ gf_store_handle_destroy (volinfo->shandle);
volinfo->shandle = NULL;
}
ret = (rename_fail == _gf_true) ? -1: 0;
@@ -1357,269 +1496,127 @@ out:
return ret;
}
-
-int
-glusterd_store_read_and_tokenize (FILE *file, char *str,
- char **iter_key, char **iter_val,
- glusterd_store_op_errno_t *store_errno)
-{
- int32_t ret = -1;
- char *savetok = NULL;
-
- GF_ASSERT (file);
- GF_ASSERT (str);
- GF_ASSERT (iter_key);
- GF_ASSERT (iter_val);
- GF_ASSERT (store_errno);
-
- ret = fscanf (file, "%s", str);
- if (ret <= 0 || feof (file)) {
- ret = -1;
- *store_errno = GD_STORE_EOF;
- goto out;
- }
-
- *iter_key = strtok_r (str, "=", &savetok);
- if (*iter_key == NULL) {
- ret = -1;
- *store_errno = GD_STORE_KEY_NULL;
- goto out;
- }
-
- *iter_val = strtok_r (NULL, "=", &savetok);
- if (*iter_key == NULL) {
- ret = -1;
- *store_errno = GD_STORE_VALUE_NULL;
- goto out;
- }
-
- *store_errno = GD_STORE_SUCCESS;
- ret = 0;
-out:
- return ret;
-}
-
+/*TODO: cleanup the duplicate code and implement a generic function for
+ * deleting snap/volume depending on the parameter flag */
int32_t
-glusterd_store_retrieve_value (glusterd_store_handle_t *handle,
- char *key, char **value)
+glusterd_store_delete_snap (glusterd_snap_t *snap)
{
- int32_t ret = -1;
- char *scan_str = NULL;
- char *iter_key = NULL;
- char *iter_val = NULL;
- char *free_str = NULL;
- struct stat st = {0,};
- glusterd_store_op_errno_t store_errno = GD_STORE_SUCCESS;
+ char pathname[PATH_MAX] = {0,};
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ DIR *dir = NULL;
+ struct dirent *entry = NULL;
+ char path[PATH_MAX] = {0,};
+ char delete_path[PATH_MAX] = {0,};
+ char trashdir[PATH_MAX] = {0,};
+ struct stat st = {0, };
+ xlator_t *this = NULL;
+ gf_boolean_t rename_fail = _gf_false;
- GF_ASSERT (handle);
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
- handle->fd = open (handle->path, O_RDWR);
+ GF_ASSERT (snap);
+ GLUSTERD_GET_SNAP_DIR (pathname, snap, priv);
- if (handle->fd == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %s",
- handle->path, strerror (errno));
- goto out;
- }
- if (!handle->read)
- handle->read = fdopen (handle->fd, "r");
+ snprintf (delete_path, sizeof (delete_path),
+ "%s/"GLUSTERD_TRASH"/snap-%s.deleted", priv->workdir,
+ uuid_utoa (snap->snap_id));
- if (!handle->read) {
- gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %s",
- handle->path, strerror (errno));
- goto out;
- }
+ snprintf (trashdir, sizeof (trashdir), "%s/"GLUSTERD_TRASH,
+ priv->workdir);
- ret = fstat (handle->fd, &st);
- if (ret < 0) {
- gf_log ("glusterd", GF_LOG_WARNING,
- "stat on file failed");
+ ret = mkdir (trashdir, 0777);
+ if (ret && errno != EEXIST) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create trash "
+ "directory, reason : %s", strerror (errno));
ret = -1;
- store_errno = GD_STORE_STAT_FAILED;
goto out;
}
- scan_str = GF_CALLOC (1, st.st_size,
- gf_gld_mt_char);
- if (scan_str == NULL) {
- ret = -1;
- store_errno = GD_STORE_ENOMEM;
+ ret = rename (pathname, delete_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to rename snap "
+ "directory %s to %s", pathname, delete_path);
+ rename_fail = _gf_true;
goto out;
}
- free_str = scan_str;
+ dir = opendir (delete_path);
+ if (!dir) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to open directory %s."
+ " Reason : %s", delete_path, strerror (errno));
+ ret = 0;
+ goto out;
+ }
- do {
- ret = glusterd_store_read_and_tokenize (handle->read, scan_str,
- &iter_key, &iter_val,
- &store_errno);
- if (ret < 0) {
- goto out;
+ glusterd_for_each_entry (entry, dir);
+ while (entry) {
+ snprintf (path, PATH_MAX, "%s/%s", delete_path, entry->d_name);
+ ret = stat (path, &st);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to stat "
+ "entry %s : %s", path, strerror (errno));
+ goto stat_failed;
}
- gf_log ("", GF_LOG_DEBUG, "key %s read", iter_key);
+ if (S_ISDIR (st.st_mode))
+ ret = rmdir (path);
+ else
+ ret = unlink (path);
- if (!strcmp (key, iter_key)) {
- gf_log ("", GF_LOG_DEBUG, "key %s found", key);
- ret = 0;
- if (iter_val)
- *value = gf_strdup (iter_val);
- goto out;
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, " Failed to remove "
+ "%s. Reason : %s", path, strerror (errno));
}
- } while (1);
-out:
- if (handle->fd > 0) {
- close (handle->fd);
- handle->read = NULL;
- }
-
- GF_FREE (free_str);
-
- return ret;
-}
-
-int32_t
-glusterd_store_save_value (int fd, char *key, char *value)
-{
- int32_t ret = -1;
- FILE *fp = NULL;
- xlator_t *this = NULL;
- this = THIS;
- GF_ASSERT (this);
- GF_ASSERT (fd > 0);
- GF_ASSERT (key);
- GF_ASSERT (value);
-
- fp = fdopen (fd, "a+");
- if (fp == NULL) {
- gf_log (this->name, GF_LOG_WARNING, "fdopen failed.");
- ret = -1;
- goto out;
+ gf_log (this->name, GF_LOG_DEBUG, "%s %s",
+ ret ? "Failed to remove":"Removed",
+ entry->d_name);
+stat_failed:
+ memset (path, 0, sizeof(path));
+ glusterd_for_each_entry (entry, dir);
}
- ret = fprintf (fp, "%s=%s\n", key, value);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING, "Unable to store key: %s,"
- "value: %s, error: %s", key, value,
- strerror (errno));
- ret = -1;
- goto out;
+ ret = closedir (dir);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to close dir %s. "
+ "Reason : %s",delete_path, strerror (errno));
}
- ret = fflush (fp);
- if (feof (fp)) {
- gf_log (this->name, GF_LOG_WARNING,
- "fflush failed, error: %s",
- strerror (errno));
- ret = -1;
- goto out;
+ ret = rmdir (delete_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s,err: %s",
+ delete_path, strerror (errno));
}
-
- ret = 0;
-out:
-
- gf_log (this->name, GF_LOG_DEBUG, "returning: %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_handle_new (char *path, glusterd_store_handle_t **handle)
-{
- int32_t ret = -1;
- glusterd_store_handle_t *shandle = NULL;
- int fd = -1;
- char *spath = NULL;
- xlator_t *this = NULL;
-
- this = THIS;
- GF_ASSERT (this);
-
- shandle = GF_CALLOC (1, sizeof (*shandle), gf_gld_mt_store_handle_t);
- if (!shandle)
- goto out;
-
- spath = gf_strdup (path);
-
- if (!spath)
- goto out;
-
- fd = open (path, O_RDWR | O_CREAT | O_APPEND, 0600);
- if (fd <= 0) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to open file: %s, "
- "error: %s", path, strerror (errno));
- goto out;
+ ret = rmdir (trashdir);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s, Reason:"
+ " %s", trashdir, strerror (errno));
}
- ret = glusterd_store_sync_direntry (spath);
- if (ret)
- goto out;
-
- shandle->path = spath;
- *handle = shandle;
-
- ret = 0;
out:
- if (fd > 0)
- close (fd);
-
- if (ret == -1) {
- GF_FREE (spath);
- GF_FREE (shandle);
+ if (snap->shandle) {
+ gf_store_handle_destroy (snap->shandle);
+ snap->shandle = NULL;
}
+ ret = (rename_fail == _gf_true) ? -1: 0;
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int
-glusterd_store_handle_retrieve (char *path, glusterd_store_handle_t **handle)
-{
- int32_t ret = -1;
- struct stat statbuf = {0};
-
- ret = stat (path, &statbuf);
- if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to retrieve store "
- "handle for %s, error: %s", path, strerror (errno));
- goto out;
- }
- ret = glusterd_store_handle_new (path, handle);
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_handle_destroy (glusterd_store_handle_t *handle)
-{
- int32_t ret = -1;
-
- if (!handle) {
- ret = 0;
- goto out;
- }
-
- GF_FREE (handle->path);
-
- GF_FREE (handle);
-
- ret = 0;
-
-out:
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
-
- return ret;
-}
-
-int
glusterd_store_global_info (xlator_t *this)
{
int ret = -1;
glusterd_conf_t *conf = NULL;
char op_version_str[15] = {0,};
char path[PATH_MAX] = {0,};
- glusterd_store_handle_t *handle = NULL;
+ gf_store_handle_t *handle = NULL;
char *uuid_str = NULL;
+ char buf[256] = {0, };
conf = this->private;
@@ -1630,7 +1627,7 @@ glusterd_store_global_info (xlator_t *this)
if (!conf->handle) {
snprintf (path, PATH_MAX, "%s/%s", conf->workdir,
GLUSTERD_INFO_FILE);
- ret = glusterd_store_handle_new (path, &handle);
+ ret = gf_store_handle_new (path, &handle);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Unable to get store handle");
@@ -1649,14 +1646,14 @@ glusterd_store_global_info (xlator_t *this)
goto out;
}
- handle->fd = glusterd_store_mkstemp (handle);
+ handle->fd = gf_store_mkstemp (handle);
if (handle->fd <= 0) {
ret = -1;
goto out;
}
- ret = glusterd_store_save_value (handle->fd, GLUSTERD_STORE_UUID_KEY,
- uuid_str);
+ ret = gf_store_save_value (handle->fd, GLUSTERD_STORE_UUID_KEY,
+ uuid_str);
if (ret) {
gf_log (this->name, GF_LOG_CRITICAL,
"Storing uuid failed ret = %d", ret);
@@ -1664,22 +1661,42 @@ glusterd_store_global_info (xlator_t *this)
}
snprintf (op_version_str, 15, "%d", conf->op_version);
- ret = glusterd_store_save_value (handle->fd, GD_OP_VERSION_KEY,
- op_version_str);
+ ret = gf_store_save_value (handle->fd, GD_OP_VERSION_KEY,
+ op_version_str);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Storing op-version failed ret = %d", ret);
goto out;
}
- ret = glusterd_store_rename_tmppath (handle);
+ snprintf (buf, sizeof (buf), "%"PRIu64, conf->snap_max_hard_limit);
+ ret = gf_store_save_value (handle->fd,
+ GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Storing snap-max-hard-limit failed ret = %d", ret);
+ goto out;
+ }
+
+ snprintf (buf, sizeof (buf), "%"PRIu64, conf->snap_max_soft_limit);
+ ret = gf_store_save_value (handle->fd,
+ GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Storing snap-max-soft-limit failed ret = %d", ret);
+ goto out;
+ }
+
+ ret = gf_store_rename_tmppath (handle);
out:
- if (ret && (handle->fd > 0))
- glusterd_store_unlink_tmppath (handle);
+ if (handle) {
+ if (ret && (handle->fd > 0))
+ gf_store_unlink_tmppath (handle);
- if (handle->fd > 0) {
- close (handle->fd);
- handle->fd = 0;
+ if (handle->fd > 0) {
+ close (handle->fd);
+ handle->fd = 0;
+ }
}
if (uuid_str)
@@ -1701,17 +1718,17 @@ glusterd_retrieve_op_version (xlator_t *this, int *op_version)
int tmp_version = 0;
char *tmp = NULL;
char path[PATH_MAX] = {0,};
- glusterd_store_handle_t *handle = NULL;
+ gf_store_handle_t *handle = NULL;
priv = this->private;
if (!priv->handle) {
snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
GLUSTERD_INFO_FILE);
- ret = glusterd_store_handle_retrieve (path, &handle);
+ ret = gf_store_handle_retrieve (path, &handle);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get store "
+ gf_log ("", GF_LOG_DEBUG, "Unable to get store "
"handle!");
goto out;
}
@@ -1719,9 +1736,8 @@ glusterd_retrieve_op_version (xlator_t *this, int *op_version)
priv->handle = handle;
}
- ret = glusterd_store_retrieve_value (priv->handle,
- GD_OP_VERSION_KEY,
- &op_version_str);
+ ret = gf_store_retrieve_value (priv->handle, GD_OP_VERSION_KEY,
+ &op_version_str);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"No previous op_version present");
@@ -1744,21 +1760,100 @@ out:
return ret;
}
+int
+glusterd_retrieve_sys_snap_max_limit (xlator_t *this, uint64_t *limit,
+ char *key)
+{
+ char *limit_str = NULL;
+ glusterd_conf_t *priv = NULL;
+ int ret = -1;
+ uint64_t tmp_limit = 0;
+ char *tmp = NULL;
+ char path[PATH_MAX] = {0,};
+ gf_store_handle_t *handle = NULL;
+
+ GF_ASSERT (this);
+ priv = this->private;
+
+ GF_ASSERT (priv);
+ GF_ASSERT (limit);
+ GF_ASSERT (key);
+
+ if (!priv->handle) {
+ snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
+ GLUSTERD_INFO_FILE);
+ ret = gf_store_handle_retrieve (path, &handle);
+
+ if (ret) {
+ gf_log ("", GF_LOG_DEBUG, "Unable to get store "
+ "handle!");
+ goto out;
+ }
+
+ priv->handle = handle;
+ }
+
+ ret = gf_store_retrieve_value (priv->handle,
+ key,
+ &limit_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No previous %s present", key);
+ goto out;
+ }
+
+ tmp_limit = strtoul (limit_str, &tmp, 10);
+ if ((tmp_limit <= 0) || (tmp && strlen (tmp) > 1)) {
+ gf_log (this->name, GF_LOG_WARNING, "invalid version number");
+ goto out;
+ }
+
+ *limit = tmp_limit;
+
+ ret = 0;
+out:
+ if (limit_str)
+ GF_FREE (limit_str);
+
+ return ret;
+}
static int
glusterd_restore_op_version (xlator_t *this)
{
- glusterd_conf_t *conf = NULL;
- int ret = 0;
- int op_version = 0;
+ glusterd_conf_t *conf = NULL;
+ int ret = 0;
+ int op_version = 0;
conf = this->private;
+ ret = glusterd_retrieve_sys_snap_max_limit (this,
+ &conf->snap_max_hard_limit,
+ GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to retrieve system snap-max-hard-limit, "
+ "setting it to default value(%d)",
+ GLUSTERD_SNAPS_MAX_HARD_LIMIT);
+ conf->snap_max_hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+ }
+
+ ret = glusterd_retrieve_sys_snap_max_limit (this,
+ &conf->snap_max_soft_limit,
+ GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to retrieve system snap-max-soft-limit, "
+ "setting it to default value(%d)",
+ GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT);
+ conf->snap_max_soft_limit = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT;
+ }
+
ret = glusterd_retrieve_op_version (this, &op_version);
if (!ret) {
if ((op_version < GD_OP_VERSION_MIN) ||
(op_version > GD_OP_VERSION_MAX)) {
gf_log (this->name, GF_LOG_ERROR,
- "wrong op-version (%d) retreived", op_version);
+ "wrong op-version (%d) retrieved", op_version);
ret = -1;
goto out;
}
@@ -1801,7 +1896,7 @@ glusterd_retrieve_uuid ()
{
char *uuid_str = NULL;
int32_t ret = -1;
- glusterd_store_handle_t *handle = NULL;
+ gf_store_handle_t *handle = NULL;
glusterd_conf_t *priv = NULL;
char path[PATH_MAX] = {0,};
@@ -1810,10 +1905,10 @@ glusterd_retrieve_uuid ()
if (!priv->handle) {
snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
GLUSTERD_INFO_FILE);
- ret = glusterd_store_handle_retrieve (path, &handle);
+ ret = gf_store_handle_retrieve (path, &handle);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get store "
+ gf_log ("", GF_LOG_DEBUG, "Unable to get store"
"handle!");
goto out;
}
@@ -1821,12 +1916,11 @@ glusterd_retrieve_uuid ()
priv->handle = handle;
}
- ret = glusterd_store_retrieve_value (priv->handle,
- GLUSTERD_STORE_UUID_KEY,
- &uuid_str);
+ ret = gf_store_retrieve_value (priv->handle, GLUSTERD_STORE_UUID_KEY,
+ &uuid_str);
if (ret) {
- gf_log ("", GF_LOG_INFO, "No previous uuid is present");
+ gf_log ("", GF_LOG_DEBUG, "No previous uuid is present");
goto out;
}
@@ -1839,246 +1933,11 @@ out:
}
int32_t
-glusterd_store_iter_new (glusterd_store_handle_t *shandle,
- glusterd_store_iter_t **iter)
-{
- int32_t ret = -1;
- glusterd_store_iter_t *tmp_iter = NULL;
- int fd = -1;
-
- GF_ASSERT (shandle);
- GF_ASSERT (iter);
-
- tmp_iter = GF_CALLOC (1, sizeof (*tmp_iter),
- gf_gld_mt_store_iter_t);
-
- if (!tmp_iter) {
- gf_log ("", GF_LOG_ERROR, "Out of Memory");
- goto out;
- }
-
- fd = open (shandle->path, O_RDWR);
-
- if (fd < 0) {
- gf_log ("", GF_LOG_ERROR, "Unable to open %s, errno: %d",
- shandle->path, errno);
- goto out;
- }
-
- tmp_iter->fd = fd;
-
- tmp_iter->file = fdopen (tmp_iter->fd, "r");
-
- if (!tmp_iter->file) {
- gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %d",
- shandle->path, errno);
- goto out;
- }
-
- strncpy (tmp_iter->filepath, shandle->path, sizeof (tmp_iter->filepath));
- tmp_iter->filepath[sizeof (tmp_iter->filepath) - 1] = 0;
- *iter = tmp_iter;
- ret = 0;
-
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_validate_key_value (char *storepath, char *key, char*val,
- glusterd_store_op_errno_t *op_errno)
-{
- int ret = 0;
-
- GF_ASSERT (op_errno);
- GF_ASSERT (storepath);
-
- if ((key == NULL) && (val == NULL)) {
- ret = -1;
- gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be "
- "corrupted, Invalid key and value (null) in %s",
- storepath);
- *op_errno = GD_STORE_KEY_VALUE_NULL;
- } else if (key == NULL) {
- ret = -1;
- gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be "
- "corrupted, Invalid key (null) in %s", storepath);
- *op_errno = GD_STORE_KEY_NULL;
- } else if (val == NULL) {
- ret = -1;
- gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be "
- "corrupted, Invalid value (null) for key %s in %s",
- key, storepath);
- *op_errno = GD_STORE_VALUE_NULL;
- } else {
- ret = 0;
- *op_errno = GD_STORE_SUCCESS;
- }
-
- return ret;
-}
-
-int32_t
-glusterd_store_iter_get_next (glusterd_store_iter_t *iter,
- char **key, char **value,
- glusterd_store_op_errno_t *op_errno)
-{
- int32_t ret = -1;
- char *scan_str = NULL;
- char *free_str = NULL;
- char *iter_key = NULL;
- char *iter_val = NULL;
- struct stat st = {0,};
- glusterd_store_op_errno_t store_errno = GD_STORE_SUCCESS;
-
- GF_ASSERT (iter);
- GF_ASSERT (iter->file);
- GF_ASSERT (key);
- GF_ASSERT (value);
-
- ret = fstat (iter->fd, &st);
- if (ret < 0) {
- gf_log ("glusterd", GF_LOG_WARNING,
- "stat on file failed");
- ret = -1;
- store_errno = GD_STORE_STAT_FAILED;
- goto out;
- }
-
- scan_str = GF_CALLOC (1, st.st_size,
- gf_gld_mt_char);
- if (scan_str == NULL) {
- ret = -1;
- store_errno = GD_STORE_ENOMEM;
- goto out;
- }
-
- *key = NULL;
- *value = NULL;
-
- free_str = scan_str;
-
- ret = glusterd_store_read_and_tokenize (iter->file, scan_str,
- &iter_key, &iter_val,
- &store_errno);
- if (ret < 0) {
- goto out;
- }
-
-
- ret = glusterd_store_validate_key_value (iter->filepath, iter_key,
- iter_val, &store_errno);
- if (ret)
- goto out;
-
- *value = gf_strdup (iter_val);
-
- *key = gf_strdup (iter_key);
- if (!iter_key || !iter_val) {
- ret = -1;
- store_errno = GD_STORE_ENOMEM;
- goto out;
- }
-
- ret = 0;
-
-out:
- if (ret) {
- if (*key) {
- GF_FREE (*key);
- *key = NULL;
- }
- if (*value) {
- GF_FREE (*value);
- *value = NULL;
- }
- }
- GF_FREE (free_str);
- if (op_errno)
- *op_errno = store_errno;
-
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_iter_get_matching (glusterd_store_iter_t *iter,
- char *key, char **value)
-{
- int32_t ret = -1;
- char *tmp_key = NULL;
- char *tmp_value = NULL;
-
- ret = glusterd_store_iter_get_next (iter, &tmp_key, &tmp_value,
- NULL);
- while (!ret) {
- if (!strncmp (key, tmp_key, strlen (key))){
- *value = tmp_value;
- GF_FREE (tmp_key);
- goto out;
- }
- GF_FREE (tmp_key);
- GF_FREE (tmp_value);
- ret = glusterd_store_iter_get_next (iter, &tmp_key,
- &tmp_value, NULL);
- }
-out:
- return ret;
-}
-
-int32_t
-glusterd_store_iter_destroy (glusterd_store_iter_t *iter)
-{
- int32_t ret = -1;
-
- if (!iter)
- return 0;
-
- if (iter->file)
- ret = fclose (iter->file);
- else
- ret = 0;
-
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to close fd: %d, ret: %d, "
- "errno: %d" ,iter->fd, ret, errno);
- }
-
- GF_FREE (iter);
-
- return ret;
-}
-
-char*
-glusterd_store_strerror (glusterd_store_op_errno_t op_errno)
-{
- switch (op_errno) {
- case GD_STORE_SUCCESS:
- return "Success";
- case GD_STORE_KEY_NULL:
- return "Invalid Key";
- case GD_STORE_VALUE_NULL:
- return "Invalid Value";
- case GD_STORE_KEY_VALUE_NULL:
- return "Invalid Key and Value";
- case GD_STORE_EOF:
- return "No data";
- case GD_STORE_ENOMEM:
- return "No memory";
- default:
- return "Invalid errno";
- }
- return "Invalid errno";
-}
-
-int32_t
glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
{
-
int32_t ret = 0;
glusterd_brickinfo_t *brickinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
+ gf_store_iter_t *iter = NULL;
char *key = NULL;
char *value = NULL;
char brickdir[PATH_MAX] = {0,};
@@ -2086,19 +1945,20 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
glusterd_conf_t *priv = NULL;
int32_t brick_count = 0;
char tmpkey[4096] = {0,};
- glusterd_store_iter_t *tmpiter = NULL;
+ gf_store_iter_t *tmpiter = NULL;
char *tmpvalue = NULL;
- struct pmap_registry *pmap = NULL;
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ struct pmap_registry *pmap = NULL;
+ int brickid = 0;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
GF_ASSERT (volinfo);
GF_ASSERT (volinfo->volname);
priv = THIS->private;
- GLUSTERD_GET_BRICK_DIR (brickdir, volinfo, priv)
+ GLUSTERD_GET_BRICK_DIR (brickdir, volinfo, priv);
- ret = glusterd_store_iter_new (volinfo->shandle, &tmpiter);
+ ret = gf_store_iter_new (volinfo->shandle, &tmpiter);
if (ret)
goto out;
@@ -2110,30 +1970,28 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
goto out;
snprintf (tmpkey, sizeof (tmpkey), "%s-%d",
GLUSTERD_STORE_KEY_VOL_BRICK,brick_count);
- ret = glusterd_store_iter_get_matching (tmpiter, tmpkey,
- &tmpvalue);
+ ret = gf_store_iter_get_matching (tmpiter, tmpkey, &tmpvalue);
snprintf (path, sizeof (path), "%s/%s", brickdir, tmpvalue);
GF_FREE (tmpvalue);
tmpvalue = NULL;
- ret = glusterd_store_handle_retrieve (path, &brickinfo->shandle);
+ ret = gf_store_handle_retrieve (path, &brickinfo->shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (brickinfo->shandle, &iter);
+ ret = gf_store_iter_new (brickinfo->shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret) {
gf_log ("glusterd", GF_LOG_ERROR, "Unable to iterate "
"the store for brick: %s, reason: %s", path,
- glusterd_store_strerror (op_errno));
+ gf_store_strerror (op_errno));
goto out;
}
while (!ret) {
@@ -2148,7 +2006,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
strlen (GLUSTERD_STORE_KEY_BRICK_PORT))) {
gf_string2int (value, &brickinfo->port);
- if (brickinfo->port < GF_IANA_PRIV_PORTS_START){
+ if (brickinfo->port < priv->base_port) {
/* This is required to adhere to the
IANA standards */
brickinfo->port = 0;
@@ -2164,8 +2022,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
strlen (GLUSTERD_STORE_KEY_BRICK_RDMA_PORT))) {
gf_string2int (value, &brickinfo->rdma_port);
- if (brickinfo->rdma_port <
- GF_IANA_PRIV_PORTS_START){
+ if (brickinfo->rdma_port < priv->base_port) {
/* This is required to adhere to the
IANA standards */
brickinfo->rdma_port = 0;
@@ -2182,6 +2039,21 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
} else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
strlen (GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) {
gf_string2int (value, &brickinfo->decommissioned);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH,
+ strlen (GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH))) {
+ strncpy (brickinfo->device_path, value,
+ sizeof (brickinfo->device_path));
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS,
+ strlen (GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS))) {
+ gf_string2int (value, &brickinfo->snap_status);
+ } else if (!strncmp (key,
+ GLUSTERD_STORE_KEY_BRICK_VGNAME,
+ strlen (GLUSTERD_STORE_KEY_BRICK_VGNAME))) {
+ strncpy (brickinfo->vg, value,
+ sizeof (brickinfo->vg));
+ } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_ID)) {
+ strncpy (brickinfo->brick_id, value,
+ sizeof (brickinfo->brick_id));
} else {
gf_log ("", GF_LOG_ERROR, "Unknown key: %s",
key);
@@ -2192,22 +2064,31 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value,
+ &op_errno);
}
- if (op_errno != GD_STORE_EOF)
+ if (op_errno != GD_STORE_EOF) {
+ gf_log ("", GF_LOG_ERROR, "Error parsing brickinfo: "
+ "op_errno=%d", op_errno);
goto out;
- ret = glusterd_store_iter_destroy (iter);
+ }
+ ret = gf_store_iter_destroy (iter);
if (ret)
goto out;
+ if (brickinfo->brick_id[0] == '\0') {
+ /* This is an old volume upgraded to op_version 4 */
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo,
+ brickid++);
+ }
+
list_add_tail (&brickinfo->brick_list, &volinfo->bricks);
brick_count++;
}
- ret = glusterd_store_iter_destroy (tmpiter);
+ ret = gf_store_iter_destroy (tmpiter);
if (ret)
goto out;
out:
@@ -2218,42 +2099,39 @@ out:
int32_t
-glusterd_store_retrieve_rbstate (char *volname)
+glusterd_store_retrieve_rbstate (glusterd_volinfo_t *volinfo)
{
int32_t ret = -1;
- glusterd_volinfo_t *volinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
+ gf_store_iter_t *iter = NULL;
char *key = NULL;
char *value = NULL;
char volpath[PATH_MAX] = {0,};
glusterd_conf_t *priv = NULL;
char path[PATH_MAX] = {0,};
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ xlator_t *this = NULL;
- priv = THIS->private;
-
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get"
- "volinfo for %s.", volname);
- goto out;
- }
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volinfo);
GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv);
snprintf (path, sizeof (path), "%s/%s", volpath,
GLUSTERD_VOLUME_RBSTATE_FILE);
- ret = glusterd_store_handle_retrieve (path, &volinfo->rb_shandle);
+ ret = gf_store_handle_retrieve (path, &volinfo->rb_shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (volinfo->rb_shandle, &iter);
+ ret = gf_store_iter_new (volinfo->rb_shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret)
goto out;
@@ -2302,61 +2180,58 @@ glusterd_store_retrieve_rbstate (char *volname)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
}
if (op_errno != GD_STORE_EOF)
goto out;
- ret = glusterd_store_iter_destroy (iter);
+ ret = gf_store_iter_destroy (iter);
if (ret)
goto out;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
return ret;
}
int32_t
-glusterd_store_retrieve_node_state (char *volname)
+glusterd_store_retrieve_node_state (glusterd_volinfo_t *volinfo)
{
- int32_t ret = -1;
- glusterd_volinfo_t *volinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
- char *key = NULL;
- char *value = NULL;
- char volpath[PATH_MAX] = {0,};
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
-
- priv = THIS->private;
+ int32_t ret = -1;
+ gf_store_iter_t *iter = NULL;
+ char *key = NULL;
+ char *value = NULL;
+ char *dup_value = NULL;
+ char volpath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+ char path[PATH_MAX] = {0,};
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ dict_t *tmp_dict = NULL;
+ xlator_t *this = NULL;
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get"
- "volinfo for %s.", volname);
- goto out;
- }
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volinfo);
GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv);
snprintf (path, sizeof (path), "%s/%s", volpath,
GLUSTERD_NODE_STATE_FILE);
- ret = glusterd_store_handle_retrieve (path,
- &volinfo->node_state_shandle);
+ ret = gf_store_handle_retrieve (path, &volinfo->node_state_shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (volinfo->node_state_shandle, &iter);
+ ret = gf_store_iter_new (volinfo->node_state_shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret)
goto out;
@@ -2364,16 +2239,35 @@ glusterd_store_retrieve_node_state (char *volname)
if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_DEFRAG,
strlen (GLUSTERD_STORE_KEY_VOL_DEFRAG))) {
volinfo->rebal.defrag_cmd = atoi (value);
- }
-
- if (volinfo->rebal.defrag_cmd) {
- if (!strncmp (key, GF_REBALANCE_TID_KEY,
- strlen (GF_REBALANCE_TID_KEY)))
- uuid_parse (value, volinfo->rebal.rebalance_id);
-
- if (!strncmp (key, GLUSTERD_STORE_KEY_DEFRAG_OP,
- strlen (GLUSTERD_STORE_KEY_DEFRAG_OP)))
- volinfo->rebal.op = atoi (value);
+ } else if (!strncmp (key, GF_REBALANCE_TID_KEY,
+ strlen (GF_REBALANCE_TID_KEY))) {
+ uuid_parse (value, volinfo->rebal.rebalance_id);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_DEFRAG_OP,
+ strlen (GLUSTERD_STORE_KEY_DEFRAG_OP))) {
+ volinfo->rebal.op = atoi (value);
+ } else {
+ if (!tmp_dict) {
+ tmp_dict = dict_new ();
+ if (!tmp_dict) {
+ ret = -1;
+ goto out;
+ }
+ }
+ dup_value = gf_strdup (value);
+ if (!dup_value) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to strdup value string");
+ goto out;
+ }
+ ret = dict_set_str (tmp_dict, key, dup_value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting data in rebal "
+ "dict.");
+ goto out;
+ }
+ dup_value = NULL;
}
GF_FREE (key);
@@ -2381,66 +2275,81 @@ glusterd_store_retrieve_node_state (char *volname)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
}
+ if (tmp_dict)
+ volinfo->rebal.dict = dict_ref (tmp_dict);
- if (op_errno != GD_STORE_EOF)
+ if (op_errno != GD_STORE_EOF) {
+ ret = -1;
goto out;
+ }
- ret = glusterd_store_iter_destroy (iter);
+ ret = gf_store_iter_destroy (iter);
if (ret)
goto out;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ if (dup_value)
+ GF_FREE (dup_value);
+ if (ret && volinfo->rebal.dict)
+ dict_unref (volinfo->rebal.dict);
+ if (tmp_dict)
+ dict_unref (tmp_dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
return ret;
}
-int32_t
-glusterd_store_retrieve_volume (char *volname)
-{
- int32_t ret = -1;
- glusterd_volinfo_t *volinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
- char *key = NULL;
- char *value = NULL;
- char volpath[PATH_MAX] = {0,};
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
- int exists = 0;
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
-
- ret = glusterd_volinfo_new (&volinfo);
- if (ret)
- goto out;
+int
+glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ int exists = 0;
+ char *key = NULL;
+ char *value = NULL;
+ char volpath[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ gf_store_iter_t *iter = NULL;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
- strncpy (volinfo->volname, volname, GLUSTERD_MAX_VOLUME_NAME);
+ this = THIS;
+ GF_ASSERT (this);
+ conf = THIS->private;
+ GF_ASSERT (volinfo);
- priv = THIS->private;
+ GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, conf);
- GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv);
snprintf (path, sizeof (path), "%s/%s", volpath,
GLUSTERD_VOLUME_INFO_FILE);
- ret = glusterd_store_handle_retrieve (path, &volinfo->shandle);
-
- if (ret)
+ ret = gf_store_handle_retrieve (path, &volinfo->shandle);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "volinfo handle is NULL");
goto out;
+ }
- ret = glusterd_store_iter_new (volinfo->shandle, &iter);
-
- if (ret)
+ ret = gf_store_iter_new (volinfo->shandle, &iter);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get new store "
+ "iter");
goto out;
+ }
- ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno);
- if (ret)
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get next store "
+ "iter");
goto out;
+ }
while (!ret) {
+ gf_log ("", GF_LOG_DEBUG, "key = %s value = %s", key, value);
if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_TYPE,
strlen (GLUSTERD_STORE_KEY_VOL_TYPE))) {
volinfo->type = atoi (value);
@@ -2500,9 +2409,27 @@ glusterd_store_retrieve_volume (char *volname)
gf_log ("", GF_LOG_DEBUG, "Parsed as "GEOREP" "
" slave:key=%s,value:%s", key, value);
- } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_BACKEND,
- strlen (GLUSTERD_STORE_KEY_VOL_BACKEND))) {
- volinfo->backend = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_OP_VERSION,
+ strlen (GLUSTERD_STORE_KEY_VOL_OP_VERSION))) {
+ volinfo->op_version = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION,
+ strlen (GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION))) {
+ volinfo->client_op_version = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CAPS,
+ strlen (GLUSTERD_STORE_KEY_VOL_CAPS))) {
+ volinfo->caps = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+ strlen (GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT))) {
+ volinfo->snap_max_hard_limit = (uint64_t) atoll (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP,
+ strlen (GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP))) {
+ ret = uuid_parse (value, volinfo->restored_from_snap);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to parse restored snap's uuid");
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_PARENT_VOLNAME,
+ strlen (GLUSTERD_STORE_KEY_PARENT_VOLNAME))) {
+ strncpy (volinfo->parent_volname, value, sizeof(volinfo->parent_volname) - 1);
} else {
if (is_key_glusterd_hooks_friendly (key)) {
@@ -2519,11 +2446,22 @@ glusterd_store_retrieve_volume (char *volname)
goto out;
case 0:
- gf_log ("", GF_LOG_ERROR, "Unknown key: %s",
- key);
+ /*Ignore GLUSTERD_STORE_KEY_VOL_BRICK since
+ glusterd_store_retrieve_bricks gets it later*/
+ if (!strstr (key, GLUSTERD_STORE_KEY_VOL_BRICK))
+ gf_log ("", GF_LOG_WARNING,
+ "Unknown key: %s", key);
break;
case 1:
+ /*The following strcmp check is to ensure that
+ * glusterd does not restore the quota limits
+ * into volinfo->dict post upgradation from 3.3
+ * to 3.4 as the same limits will now be stored
+ * in xattrs on the respective directories.
+ */
+ if (!strcmp (key, "features.limit-usage"))
+ break;
ret = dict_set_str(volinfo->dict, key,
gf_strdup (value));
if (ret) {
@@ -2542,8 +2480,7 @@ glusterd_store_retrieve_volume (char *volname)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
}
/* backward compatibility */
@@ -2582,35 +2519,110 @@ glusterd_store_retrieve_volume (char *volname)
volinfo->subvol_count = (volinfo->brick_count /
volinfo->dist_leaf_count);
+ /* Only calculate volume op-versions if they are not found */
+ if (!volinfo->op_version && !volinfo->client_op_version)
+ gd_update_volume_op_versions (volinfo);
}
if (op_errno != GD_STORE_EOF)
goto out;
- ret = glusterd_store_iter_destroy (iter);
+ ret = gf_store_iter_destroy (iter);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to destroy store "
+ "iter");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+glusterd_volinfo_t*
+glusterd_store_retrieve_volume (char *volname, glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_volinfo_t *origin_volinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volname);
+
+ ret = glusterd_volinfo_new (&volinfo);
if (ret)
goto out;
+ strncpy (volinfo->volname, volname, GLUSTERD_MAX_VOLUME_NAME);
+ volinfo->snapshot = snap;
+ if (snap)
+ volinfo->is_snap_volume = _gf_true;
+
+ ret = glusterd_store_update_volinfo (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to update volinfo "
+ "for %s volume", volname);
+ goto out;
+ }
+
ret = glusterd_store_retrieve_bricks (volinfo);
if (ret)
goto out;
- ret = glusterd_volume_compute_cksum (volinfo);
+ ret = glusterd_compute_cksum (volinfo, _gf_false);
if (ret)
goto out;
- gd_update_volume_op_versions (volinfo);
+ ret = glusterd_store_retrieve_quota_version (volinfo);
+ if (ret)
+ goto out;
- list_add_tail (&volinfo->vol_list, &priv->volumes);
+ ret = glusterd_store_create_quota_conf_sh_on_absence (volinfo);
+ if (ret)
+ goto out;
+
+ ret = glusterd_compute_cksum (volinfo, _gf_true);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_save_quota_version_and_cksum (volinfo);
+ if (ret)
+ goto out;
+
+
+ if (!snap) {
+ list_add_order (&volinfo->vol_list, &priv->volumes,
+ glusterd_compare_volume_name);
+ } else {
+ ret = glusterd_volinfo_find (volinfo->parent_volname,
+ &origin_volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Parent volinfo "
+ "not found for %s volume", volname);
+ goto out;
+ }
+ glusterd_list_add_snapvol (origin_volinfo, volinfo);
+ }
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ if (ret) {
+ if (volinfo)
+ glusterd_volinfo_delete (volinfo);
+ volinfo = NULL;
+ }
- return ret;
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+
+ return volinfo;
}
-inline void
+static inline void
glusterd_store_set_options_path (glusterd_conf_t *conf, char *path, size_t len)
{
snprintf (path, len, "%s/options", conf->workdir);
@@ -2619,16 +2631,16 @@ glusterd_store_set_options_path (glusterd_conf_t *conf, char *path, size_t len)
int
_store_global_opts (dict_t *this, char *key, data_t *value, void *data)
{
- glusterd_store_handle_t *shandle = data;
+ gf_store_handle_t *shandle = data;
- glusterd_store_save_value (shandle->fd, key, (char*)value->data);
+ gf_store_save_value (shandle->fd, key, (char*)value->data);
return 0;
}
int32_t
glusterd_store_options (xlator_t *this, dict_t *opts)
{
- glusterd_store_handle_t *shandle = NULL;
+ gf_store_handle_t *shandle = NULL;
glusterd_conf_t *conf = NULL;
char path[PATH_MAX] = {0};
int fd = -1;
@@ -2637,11 +2649,11 @@ glusterd_store_options (xlator_t *this, dict_t *opts)
conf = this->private;
glusterd_store_set_options_path (conf, path, sizeof (path));
- ret = glusterd_store_handle_new (path, &shandle);
+ ret = gf_store_handle_new (path, &shandle);
if (ret)
goto out;
- fd = glusterd_store_mkstemp (shandle);
+ fd = gf_store_mkstemp (shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -2650,11 +2662,11 @@ glusterd_store_options (xlator_t *this, dict_t *opts)
shandle->fd = fd;
dict_foreach (opts, _store_global_opts, shandle);
shandle->fd = 0;
- ret = glusterd_store_rename_tmppath (shandle);
+ ret = gf_store_rename_tmppath (shandle);
if (ret)
goto out;
out:
- glusterd_store_handle_destroy (shandle);
+ gf_store_handle_destroy (shandle);
if (fd >=0 )
close (fd);
return ret;
@@ -2665,25 +2677,25 @@ glusterd_store_retrieve_options (xlator_t *this)
{
char path[PATH_MAX] = {0};
glusterd_conf_t *conf = NULL;
- glusterd_store_handle_t *shandle = NULL;
- glusterd_store_iter_t *iter = NULL;
+ gf_store_handle_t *shandle = NULL;
+ gf_store_iter_t *iter = NULL;
char *key = NULL;
char *value = NULL;
- glusterd_store_op_errno_t op_errno = 0;
+ gf_store_op_errno_t op_errno = 0;
int ret = -1;
conf = this->private;
glusterd_store_set_options_path (conf, path, sizeof (path));
- ret = glusterd_store_handle_retrieve (path, &shandle);
+ ret = gf_store_handle_retrieve (path, &shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (shandle, &iter);
+ ret = gf_store_iter_new (shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
while (!ret) {
ret = dict_set_dynstr (conf->opts, key, value);
if (ret) {
@@ -2695,22 +2707,21 @@ glusterd_store_retrieve_options (xlator_t *this)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
}
if (op_errno != GD_STORE_EOF)
goto out;
ret = 0;
out:
- glusterd_store_iter_destroy (iter);
- glusterd_store_handle_destroy (shandle);
+ gf_store_iter_destroy (iter);
+ gf_store_handle_destroy (shandle);
return ret;
}
int32_t
-glusterd_store_retrieve_volumes (xlator_t *this)
+glusterd_store_retrieve_volumes (xlator_t *this, glusterd_snap_t *snap)
{
- int32_t ret = 0;
+ int32_t ret = -1;
char path[PATH_MAX] = {0,};
glusterd_conf_t *priv = NULL;
DIR *dir = NULL;
@@ -2722,51 +2733,58 @@ glusterd_store_retrieve_volumes (xlator_t *this)
GF_ASSERT (priv);
- snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
- GLUSTERD_VOLUME_DIR_PREFIX);
+ if (snap)
+ snprintf (path, PATH_MAX, "%s/snaps/%s", priv->workdir,
+ snap->snapname);
+ else
+ snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
+ GLUSTERD_VOLUME_DIR_PREFIX);
dir = opendir (path);
if (!dir) {
gf_log ("", GF_LOG_ERROR, "Unable to open dir %s", path);
- ret = -1;
goto out;
}
glusterd_for_each_entry (entry, dir);
while (entry) {
- ret = glusterd_store_retrieve_volume (entry->d_name);
- if (ret) {
+ if ( entry->d_type != DT_DIR )
+ goto next;
+
+ volinfo = glusterd_store_retrieve_volume (entry->d_name, snap);
+ if (!volinfo) {
gf_log ("", GF_LOG_ERROR, "Unable to restore "
"volume: %s", entry->d_name);
+ ret = -1;
goto out;
}
- ret = glusterd_store_retrieve_rbstate (entry->d_name);
+ ret = glusterd_store_retrieve_rbstate (volinfo);
if (ret) {
/* Backward compatibility */
gf_log ("", GF_LOG_INFO, "Creating a new rbstate "
"for volume: %s.", entry->d_name);
- ret = glusterd_volinfo_find (entry->d_name, &volinfo);
ret = glusterd_store_create_rbstate_shandle_on_absence (volinfo);
ret = glusterd_store_perform_rbstate_store (volinfo);
}
- ret = glusterd_store_retrieve_node_state (entry->d_name);
+ ret = glusterd_store_retrieve_node_state (volinfo);
if (ret) {
/* Backward compatibility */
gf_log ("", GF_LOG_INFO, "Creating a new node_state "
"for volume: %s.", entry->d_name);
- ret = glusterd_volinfo_find (entry->d_name, &volinfo);
- ret =
glusterd_store_create_nodestate_sh_on_absence (volinfo);
ret = glusterd_store_perform_node_state_store (volinfo);
}
+next:
glusterd_for_each_entry (entry, dir);
}
+ ret = 0;
+
out:
if (dir)
closedir (dir);
@@ -2775,6 +2793,635 @@ out:
return ret;
}
+/* Derive the mount point of a snapshot brick from its brick path.
+ *
+ * The brick path is expected to contain the component "brick<brick_count>";
+ * the copy handed back to the caller is cut off right after the first
+ * occurrence of that component.
+ *
+ * On success 0 is returned and *brick_mount_path holds a newly allocated
+ * string that the caller releases with GF_FREE. On failure -1 is returned
+ * and *brick_mount_path is NULL.
+ */
+int32_t
+glusterd_find_brick_mount_path (char *brick_path, int32_t brick_count,
+                                char **brick_mount_path)
+{
+        char      marker[PATH_MAX] = "";
+        char     *mount_path       = NULL;
+        char     *match            = NULL;
+        int32_t   ret              = -1;
+        xlator_t *this             = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (brick_path);
+        GF_ASSERT (brick_mount_path);
+
+        mount_path = gf_strdup (brick_path);
+        if (mount_path) {
+                snprintf (marker, sizeof (marker), "brick%d", brick_count);
+
+                /* Locate "<brick_num>" inside
+                 * /var/run/gluster/snaps/<snap-uuid>/<brick_num>/...
+                 */
+                match = strstr (mount_path, marker);
+                if (match) {
+                        /* Terminate the string right behind <brick_num> */
+                        match[strlen (marker)] = '\0';
+                        ret = 0;
+                } else {
+                        /* Snapshot bricks must carry the brick number
+                         * in their path
+                         */
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Invalid brick path(%s)", brick_path);
+                        GF_FREE (mount_path);
+                        mount_path = NULL;
+                }
+        }
+
+        *brick_mount_path = mount_path;
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
+/* Make sure the snapshot device is mounted at brick_mount_path.
+ *
+ * When glusterd_get_mnt_entry_info() reports an entry for brick_mount_path
+ * nothing is done and 0 is returned. Otherwise the logical volume at
+ * device_path is activated with "lvchange -ay" and then mounted through
+ * glusterd_mount_lvm_snapshot().
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int32_t
+glusterd_mount_brick_paths (char *brick_mount_path, char *device_path)
+{
+        xlator_t        *this      = THIS;
+        glusterd_conf_t *priv      = NULL;
+        struct mntent   *mnt_entry = NULL;
+        FILE            *mtab      = NULL;
+        runner_t         runner    = {0, };
+        int32_t          ret       = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (brick_mount_path);
+        GF_ASSERT (device_path);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        /* Nothing to do when the mount point is already in use.
+         * NOTE(review): mtab is handed over by value and never assigned in
+         * this function, so the endmntent() call below cannot run here -
+         * confirm whether the helper is supposed to hand the stream back.
+         */
+        mnt_entry = glusterd_get_mnt_entry_info (brick_mount_path, mtab);
+        if (mnt_entry) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "brick_mount_path (%s) already mounted.",
+                        brick_mount_path);
+                ret = 0;
+                goto out;
+        }
+
+        /* TODO RHEL 6.5 has the logical volumes inactive by default
+         * on reboot. Hence activating the logical vol. Check behaviour
+         * on other systems
+         */
+        runinit (&runner);
+        runner_add_args (&runner, "lvchange", "-ay", device_path, NULL);
+
+        ret = runner_run (&runner);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to activate %s. Error: %s",
+                        device_path, strerror (errno));
+                goto out;
+        }
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Activating %s successful", device_path);
+
+        /* The volume is active now, mount it */
+        ret = glusterd_mount_lvm_snapshot (device_path, brick_mount_path);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to mount lvm snapshot.");
+
+out:
+        if (mtab)
+                endmntent (mtab);
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
+/* Recreate the mount points of all local snapshot bricks of a volume and
+ * mount the snapshot devices on them.
+ *
+ * Bricks are counted from 1 in list order; that number is what
+ * glusterd_find_brick_mount_path() looks for in the brick path.
+ *
+ * Returns 0 when every eligible brick was handled, non-zero on the first
+ * failure.
+ */
+static int32_t
+glusterd_store_recreate_brick_mounts (glusterd_volinfo_t *volinfo)
+{
+        xlator_t             *this       = THIS;
+        glusterd_brickinfo_t *brickinfo  = NULL;
+        char                 *mount_path = NULL;
+        struct stat           st_buf     = {0, };
+        int32_t               brick_idx  = 0;
+        int32_t               ret        = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (volinfo);
+
+        list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+                brick_idx++;
+
+                /* Skip bricks hosted on another node ... */
+                if (uuid_compare (brickinfo->uuid, MY_UUID))
+                        continue;
+                /* ... bricks whose snapshot is still pending ... */
+                if (brickinfo->snap_status == -1)
+                        continue;
+                /* ... and bricks that are not snapshotted at all */
+                if (strlen (brickinfo->device_path) == 0)
+                        continue;
+
+                /* Work out the mount point from brickinfo->path */
+                ret = glusterd_find_brick_mount_path (brickinfo->path,
+                                                      brick_idx,
+                                                      &mount_path);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to find brick_mount_path for %s",
+                                brickinfo->path);
+                        goto out;
+                }
+
+                /* A missing brick path means the mount point has to be
+                 * created; any other lstat failure is fatal */
+                ret = lstat (brickinfo->path, &st_buf);
+                if (ret && (errno != ENOENT)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Brick Path(%s) not valid. "
+                                "Error: %s", brickinfo->path,
+                                strerror (errno));
+                        goto out;
+                }
+                if (ret) {
+                        ret = mkdir_p (mount_path, 0777, _gf_true);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to create %s. "
+                                        "Error: %s", mount_path,
+                                        strerror (errno));
+                                goto out;
+                        }
+                }
+
+                /* Mount the device unless the mount point is already
+                 * in use */
+                ret = glusterd_mount_brick_paths (mount_path,
+                                                  brickinfo->device_path);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to mount brick_mount_path");
+                        goto out;
+                }
+
+                GF_FREE (mount_path);
+                mount_path = NULL;
+        }
+
+        ret = 0;
+out:
+        /* Only an aborted iteration can leave a path behind */
+        if (ret && mount_path)
+                GF_FREE (mount_path);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
+/* Run glusterd_resolve_brick() on every brick of every volume that belongs
+ * to the given snapshot.
+ *
+ * Returns 0 when all bricks resolved, the failing call's return value
+ * otherwise, and -1 when snap is NULL.
+ */
+int32_t
+glusterd_resolve_snap_bricks (xlator_t *this, glusterd_snap_t *snap)
+{
+        glusterd_volinfo_t   *snap_vol = NULL;
+        glusterd_brickinfo_t *brick    = NULL;
+        int32_t               ret      = -1;
+
+        GF_ASSERT (this);
+        GF_VALIDATE_OR_GOTO (this->name, snap, out);
+
+        list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
+                list_for_each_entry (brick, &snap_vol->bricks, brick_list) {
+                        ret = glusterd_resolve_brick (brick);
+                        if (!ret)
+                                continue;
+
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "resolve brick failed in restore");
+                        goto out;
+                }
+        }
+
+        /* Also covers a snapshot without volumes or bricks */
+        ret = 0;
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+
+        return ret;
+}
+
+/* Fill an in-memory snap object from its on-disk info file.
+ *
+ * The file <snap dir>/GLUSTERD_SNAP_INFO_FILE is opened through the store
+ * API and walked key by key; the snap id, restored flag, status,
+ * description and time stamp are copied into snap. Unknown keys are
+ * ignored and a uuid that fails to parse only produces a warning.
+ *
+ * The store iterator is released on every exit path (it used to be leaked
+ * whenever reading failed).
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_store_update_snap (glusterd_snap_t *snap)
+{
+        int                  ret               = -1;
+        int                  destroy_ret       = 0;
+        char                *key               = NULL;
+        char                *value             = NULL;
+        char                 snappath[PATH_MAX] = {0,};
+        char                 path[PATH_MAX]    = {0,};
+        xlator_t            *this              = NULL;
+        glusterd_conf_t     *conf              = NULL;
+        gf_store_iter_t     *iter              = NULL;
+        gf_store_op_errno_t  op_errno          = GD_STORE_SUCCESS;
+
+        this = THIS;
+        conf = this->private;
+        GF_ASSERT (snap);
+
+        GLUSTERD_GET_SNAP_DIR (snappath, snap, conf);
+
+        snprintf (path, sizeof (path), "%s/%s", snappath,
+                  GLUSTERD_SNAP_INFO_FILE);
+
+        ret = gf_store_handle_retrieve (path, &snap->shandle);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "snap handle is NULL");
+                goto out;
+        }
+
+        ret = gf_store_iter_new (snap->shandle, &iter);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get new store "
+                        "iter");
+                goto out;
+        }
+
+        ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get next store "
+                        "iter");
+                goto out;
+        }
+
+        while (!ret) {
+                gf_log (this->name, GF_LOG_DEBUG, "key = %s value = %s",
+                        key, value);
+
+                if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_ID,
+                              strlen (GLUSTERD_STORE_KEY_SNAP_ID))) {
+                        /* A bad uuid is tolerated; ret is overwritten by
+                         * the next gf_store_iter_get_next() call */
+                        ret = uuid_parse (value, snap->snap_id);
+                        if (ret)
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "Failed to parse uuid");
+                } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_RESTORED,
+                                     strlen (GLUSTERD_STORE_KEY_SNAP_RESTORED))) {
+                        snap->snap_restored = atoi (value);
+                } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_STATUS,
+                                     strlen (GLUSTERD_STORE_KEY_SNAP_STATUS))) {
+                        snap->snap_status = atoi (value);
+                } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_DESC,
+                                     strlen (GLUSTERD_STORE_KEY_SNAP_DESC))) {
+                        snap->description = gf_strdup (value);
+                } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_TIMESTAMP,
+                                     strlen (GLUSTERD_STORE_KEY_SNAP_TIMESTAMP))) {
+                        snap->time_stamp = atoi (value);
+                }
+
+                GF_FREE (key);
+                GF_FREE (value);
+                key = NULL;
+                value = NULL;
+
+                ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+        }
+
+        /* The loop ends with a non-zero ret; only a clean end of file
+         * counts as success */
+        if (op_errno != GD_STORE_EOF)
+                goto out;
+
+        ret = 0;
+
+out:
+        if (iter) {
+                destroy_ret = gf_store_iter_destroy (iter);
+                if (destroy_ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to destroy "
+                                "store iter");
+                        /* Do not hide an earlier failure */
+                        if (!ret)
+                                ret = destroy_ret;
+                }
+        }
+
+        return ret;
+}
+
+/* Restore one snapshot from the store and add it to priv->snapshots.
+ *
+ * A fresh snap object is created, its name is set to snapname, its info
+ * file and its volumes are read back, and the object is inserted into the
+ * snapshot list ordered by glusterd_compare_snap_time.
+ *
+ * The name is copied with a bounded, always terminated snprintf() instead
+ * of strncpy (dst, src, strlen (src)), which neither limits the copy to the
+ * destination size nor writes a terminator. A name that does not fit is
+ * rejected.
+ *
+ * Returns 0 on success, non-zero on failure.
+ *
+ * NOTE(review): the snap object is not released on the failure paths;
+ * the matching destructor is not visible here - confirm and clean up.
+ */
+int32_t
+glusterd_store_retrieve_snap (char *snapname)
+{
+        int32_t          ret  = -1;
+        int              len  = 0;
+        glusterd_snap_t *snap = NULL;
+        glusterd_conf_t *priv = NULL;
+        xlator_t        *this = NULL;
+
+        this = THIS;
+        priv = this->private;
+        GF_ASSERT (priv);
+        GF_ASSERT (snapname);
+
+        snap = glusterd_new_snap_object ();
+        if (!snap) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to create "
+                        " snap object");
+                goto out;
+        }
+
+        /* assumes snap->snapname is an inline char array - it is written
+         * to directly after construction without any allocation */
+        len = snprintf (snap->snapname, sizeof (snap->snapname), "%s",
+                        snapname);
+        if ((len < 0) || ((size_t) len >= sizeof (snap->snapname))) {
+                gf_log (this->name, GF_LOG_ERROR, "Snapshot name %s is "
+                        "too long", snapname);
+                ret = -1;
+                goto out;
+        }
+
+        ret = glusterd_store_update_snap (snap);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to update snapshot "
+                        "for %s snap", snapname);
+                goto out;
+        }
+
+        ret = glusterd_store_retrieve_volumes (this, snap);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to retrieve "
+                        "snap volumes for snap %s", snapname);
+                goto out;
+        }
+
+        /* TODO: list_add_order can do 'N-square' comparisons and
+           is not efficient. Find a better solution to store the snap
+           in order */
+        list_add_order (&snap->snap_list, &priv->snapshots,
+                        glusterd_compare_snap_time);
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
+/* Read the missed_snap_list and update the in-memory structs.
+ *
+ * Each entry's value has the form
+ *     <snap_vol_id>:<brick_num>:<brick_path>:<snap_op>:<snap_status>
+ * and is handed, together with the entry's key, to
+ * glusterd_add_new_entry_to_list().
+ *
+ * A missing file is not an error (no snaps were missed yet). An entry with
+ * a missing field, or with a number smaller than 1, is rejected as invalid;
+ * missing fields are detected before conversion so a truncated line can no
+ * longer pass NULL to atoi(). The file is closed on every exit path.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_store_retrieve_missed_snaps_list (xlator_t *this)
+{
+        char                 buf[PATH_MAX]    = "";
+        char                 path[PATH_MAX]   = "";
+        char                *snap_vol_id      = NULL;
+        char                *missed_node_info = NULL;
+        char                *brick_path       = NULL;
+        char                *brick_num_str    = NULL;
+        char                *snap_op_str      = NULL;
+        char                *snap_status_str  = NULL;
+        char                *value            = NULL;
+        char                *save_ptr         = NULL;
+        FILE                *fp               = NULL;
+        int32_t              brick_num        = -1;
+        int32_t              snap_op          = -1;
+        int32_t              snap_status      = -1;
+        int32_t              ret              = -1;
+        glusterd_conf_t     *priv             = NULL;
+        gf_store_op_errno_t  store_errno      = GD_STORE_SUCCESS;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        /* Get the path of the missed_snap_list */
+        glusterd_store_missed_snaps_list_path_set (path, sizeof(path));
+
+        fp = fopen (path, "r");
+        if (!fp) {
+                /* If errno is ENOENT then there are no missed snaps yet */
+                if (errno != ENOENT) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to open %s. "
+                                "Error: %s", path, strerror(errno));
+                } else {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "No missed snaps list.");
+                        ret = 0;
+                }
+                goto out;
+        }
+
+        do {
+                ret = gf_store_read_and_tokenize (fp, buf,
+                                                  &missed_node_info, &value,
+                                                  &store_errno);
+                if (ret) {
+                        if (store_errno == GD_STORE_EOF) {
+                                gf_log (this->name,
+                                        GF_LOG_DEBUG,
+                                        "EOF for missed_snap_list");
+                                ret = 0;
+                                break;
+                        }
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to fetch data from "
+                                "missed_snaps_list. Error: %s",
+                                gf_store_strerror (store_errno));
+                        goto out;
+                }
+
+                /* Split the value into its five fields. Each step only
+                 * runs when the previous field was found, so a short
+                 * entry leaves the remaining pointers NULL. */
+                snap_vol_id     = NULL;
+                brick_num_str   = NULL;
+                brick_path      = NULL;
+                snap_op_str     = NULL;
+                snap_status_str = NULL;
+
+                if (value)
+                        snap_vol_id = strtok_r (value, ":", &save_ptr);
+                if (snap_vol_id)
+                        brick_num_str = strtok_r (NULL, ":", &save_ptr);
+                if (brick_num_str)
+                        brick_path = strtok_r (NULL, ":", &save_ptr);
+                if (brick_path)
+                        snap_op_str = strtok_r (NULL, ":", &save_ptr);
+                if (snap_op_str)
+                        snap_status_str = strtok_r (NULL, ":", &save_ptr);
+
+                if (!missed_node_info || !snap_status_str) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Invalid missed_snap_entry");
+                        ret = -1;
+                        goto out;
+                }
+
+                brick_num   = atoi (brick_num_str);
+                snap_op     = atoi (snap_op_str);
+                snap_status = atoi (snap_status_str);
+
+                if (brick_num < 1 || snap_op < 1 || snap_status < 1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Invalid missed_snap_entry");
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = glusterd_add_new_entry_to_list (missed_node_info,
+                                                      snap_vol_id,
+                                                      brick_num,
+                                                      brick_path,
+                                                      snap_op,
+                                                      snap_status);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to store missed snaps_list");
+                        goto out;
+                }
+
+        } while (store_errno == GD_STORE_SUCCESS);
+
+        ret = 0;
+out:
+        if (fp)
+                fclose (fp);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
+/* Restore every snapshot found under <workdir>/snaps, then read the
+ * missed_snaps_list.
+ *
+ * Each directory entry of type DT_DIR is passed to
+ * glusterd_store_retrieve_snap(). A missing snaps directory is not treated
+ * as an error.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_store_retrieve_snaps (xlator_t *this)
+{
+        char             snaps_dir[PATH_MAX] = {0,};
+        struct dirent   *entry               = NULL;
+        DIR             *dir                 = NULL;
+        glusterd_conf_t *priv                = NULL;
+        int32_t          ret                 = 0;
+
+        GF_ASSERT (this);
+        priv = this->private;
+
+        GF_ASSERT (priv);
+
+        snprintf (snaps_dir, PATH_MAX, "%s/snaps", priv->workdir);
+
+        dir = opendir (snaps_dir);
+        if (!dir) {
+                /* If snaps dir doesn't exists ignore the error for
+                   backward compatibility */
+                if (errno == ENOENT)
+                        goto out;
+
+                ret = -1;
+                gf_log ("", GF_LOG_ERROR, "Unable to open dir %s", snaps_dir);
+                goto out;
+        }
+
+        glusterd_for_each_entry (entry, dir);
+
+        while (entry) {
+                if (entry->d_type == DT_DIR) {
+                        ret = glusterd_store_retrieve_snap (entry->d_name);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Unable to restore snapshot: %s",
+                                        entry->d_name);
+                                goto out;
+                        }
+                }
+
+                glusterd_for_each_entry (entry, dir);
+        }
+
+        /* All snapshots are in memory, now pick up the missed snaps */
+        ret = glusterd_store_retrieve_missed_snaps_list (this);
+        if (ret)
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Failed to retrieve missed_snaps_list");
+
+out:
+        if (dir)
+                closedir (dir);
+        gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+
+        return ret;
+}
+
+/* Write every entry of priv->missed_snaps_list to fd.
+ *
+ * One key/value pair is saved per missed snap operation:
+ *     key   = <node_uuid>:<snap_uuid>
+ *     value = <snap_vol_id>:<brick_num>:<brick_path>:<op>:<status>
+ *
+ * Returns 0 on success, or the gf_store_save_value() failure code.
+ */
+int32_t
+glusterd_store_write_missed_snapinfo (int32_t fd)
+{
+        xlator_t                  *this           = THIS;
+        glusterd_conf_t           *priv           = NULL;
+        glusterd_missed_snap_info *missed         = NULL;
+        glusterd_snap_op_t        *op             = NULL;
+        char                       name[PATH_MAX] = "";
+        char                       data[PATH_MAX] = "";
+        int32_t                    ret            = -1;
+
+        GF_ASSERT(this);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        list_for_each_entry (missed, &priv->missed_snaps_list,
+                             missed_snaps) {
+                list_for_each_entry (op, &missed->snap_ops,
+                                     snap_ops_list) {
+                        snprintf (name, sizeof (name), "%s:%s",
+                                  missed->node_uuid, missed->snap_uuid);
+                        snprintf (data, sizeof (data), "%s:%d:%s:%d:%d",
+                                  op->snap_vol_id, op->brick_num,
+                                  op->brick_path, op->op, op->status);
+
+                        ret = gf_store_save_value (fd, name, data);
+                        if (!ret)
+                                continue;
+
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to write missed snapinfo");
+                        goto out;
+                }
+        }
+
+        /* Also reached when there was nothing to write */
+        ret = 0;
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Persist the in-memory missed snaps list.
+ *
+ * The list is written to a temporary file obtained from the
+ * missed_snaps_list store handle, which is then renamed into place. When
+ * writing or renaming fails the temporary file is unlinked.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_store_update_missed_snaps ()
+{
+        xlator_t        *this   = THIS;
+        glusterd_conf_t *priv   = NULL;
+        int32_t          tmp_fd = -1;
+        int32_t          ret    = -1;
+
+        GF_ASSERT(this);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        ret = glusterd_store_create_missed_snaps_list_shandle_on_absence ();
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to obtain "
+                        "missed_snaps_list store handle.");
+                goto out;
+        }
+
+        tmp_fd = gf_store_mkstemp (priv->missed_snaps_list_shandle);
+        if (tmp_fd <= 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to create tmp file");
+                ret = -1;
+                goto out;
+        }
+
+        ret = glusterd_store_write_missed_snapinfo (tmp_fd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to write missed snaps to disk");
+                goto out;
+        }
+
+        ret = gf_store_rename_tmppath (priv->missed_snaps_list_shandle);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to rename the tmp file");
+
+out:
+        if (tmp_fd > 0) {
+                if (ret) {
+                        /* Drop the half-written temporary file; the
+                         * overall result stays a failure either way */
+                        if (gf_store_unlink_tmppath (
+                                    priv->missed_snaps_list_shandle))
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to unlink the tmp file");
+                        ret = -1;
+                }
+                close (tmp_fd);
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
int32_t
glusterd_store_delete_peerinfo (glusterd_peerinfo_t *peerinfo)
{
@@ -2823,7 +3470,7 @@ glusterd_store_delete_peerinfo (glusterd_peerinfo_t *peerinfo)
out:
if (peerinfo->shandle) {
- glusterd_store_handle_destroy (peerinfo->shandle);
+ gf_store_handle_destroy (peerinfo->shandle);
peerinfo->shandle = NULL;
}
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
@@ -2849,7 +3496,7 @@ glusterd_store_create_peer_dir ()
char path[PATH_MAX];
glusterd_store_peerinfo_dirpath_set (path, sizeof (path));
- ret = glusterd_store_mkdir (path);
+ ret = gf_store_mkdir (path);
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
@@ -2893,8 +3540,8 @@ glusterd_store_peerinfo_hostname_shandle_create (glusterd_peerinfo_t *peerinfo)
glusterd_store_hostname_peerpath_set (peerinfo, peerfpath,
sizeof (peerfpath));
- ret = glusterd_store_handle_create_on_absence (&peerinfo->shandle,
- peerfpath);
+ ret = gf_store_handle_create_on_absence (&peerinfo->shandle,
+ peerfpath);
return ret;
}
@@ -2906,8 +3553,8 @@ glusterd_store_peerinfo_uuid_shandle_create (glusterd_peerinfo_t *peerinfo)
glusterd_store_uuid_peerpath_set (peerinfo, peerfpath,
sizeof (peerfpath));
- ret = glusterd_store_handle_create_on_absence (&peerinfo->shandle,
- peerfpath);
+ ret = gf_store_handle_create_on_absence (&peerinfo->shandle,
+ peerfpath);
return ret;
}
@@ -2923,7 +3570,7 @@ glusterd_peerinfo_hostname_shandle_check_destroy (glusterd_peerinfo_t *peerinfo)
ret = stat (peerfpath, &stbuf);
if (!ret) {
if (peerinfo->shandle)
- glusterd_store_handle_destroy (peerinfo->shandle);
+ gf_store_handle_destroy (peerinfo->shandle);
peerinfo->shandle = NULL;
ret = unlink (peerfpath);
}
@@ -2952,18 +3599,18 @@ glusterd_store_peer_write (int fd, glusterd_peerinfo_t *peerinfo)
char buf[50] = {0};
int32_t ret = 0;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_UUID,
- uuid_utoa (peerinfo->uuid));
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_UUID,
+ uuid_utoa (peerinfo->uuid));
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", peerinfo->state.state);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_STATE, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_STATE, buf);
if (ret)
goto out;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_HOSTNAME "1",
- peerinfo->hostname);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_HOSTNAME "1",
+ peerinfo->hostname);
if (ret)
goto out;
@@ -2980,7 +3627,7 @@ glusterd_store_perform_peer_store (glusterd_peerinfo_t *peerinfo)
GF_ASSERT (peerinfo);
- fd = glusterd_store_mkstemp (peerinfo->shandle);
+ fd = gf_store_mkstemp (peerinfo->shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -2990,10 +3637,10 @@ glusterd_store_perform_peer_store (glusterd_peerinfo_t *peerinfo)
if (ret)
goto out;
- ret = glusterd_store_rename_tmppath (peerinfo->shandle);
+ ret = gf_store_rename_tmppath (peerinfo->shandle);
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (peerinfo->shandle);
+ gf_store_unlink_tmppath (peerinfo->shandle);
if (fd > 0)
close (fd);
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
@@ -3033,13 +3680,13 @@ glusterd_store_retrieve_peers (xlator_t *this)
uuid_t uuid = {0,};
char *hostname = NULL;
int32_t state = 0;
- glusterd_store_handle_t *shandle = NULL;
+ gf_store_handle_t *shandle = NULL;
char filepath[PATH_MAX] = {0,};
- glusterd_store_iter_t *iter = NULL;
+ gf_store_iter_t *iter = NULL;
char *key = NULL;
char *value = NULL;
glusterd_peerctx_args_t args = {0};
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
GF_ASSERT (this);
priv = this->private;
@@ -3061,16 +3708,15 @@ glusterd_store_retrieve_peers (xlator_t *this)
while (entry) {
snprintf (filepath, PATH_MAX, "%s/%s", path, entry->d_name);
- ret = glusterd_store_handle_retrieve (filepath, &shandle);
+ ret = gf_store_handle_retrieve (filepath, &shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (shandle, &iter);
+ ret = gf_store_iter_new (shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret)
goto out;
@@ -3098,13 +3744,15 @@ glusterd_store_retrieve_peers (xlator_t *this)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value,
+ &op_errno);
}
- if (op_errno != GD_STORE_EOF)
+ if (op_errno != GD_STORE_EOF) {
+ GF_FREE(hostname);
goto out;
+ }
- (void) glusterd_store_iter_destroy (iter);
+ (void) gf_store_iter_destroy (iter);
ret = glusterd_friend_add (hostname, 0, state, &uuid,
&peerinfo, 1, NULL);
@@ -3132,19 +3780,147 @@ out:
return ret;
}
+/* Recreate and remount the snapshot brick mounts of one volume.
+ *
+ * glusterd_store_recreate_brick_mounts() walks all bricks of the volume
+ * itself, so it is called exactly once. Calling it from inside another
+ * loop over the bricks repeated the complete job once per brick.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int32_t
+glusterd_recreate_vol_brick_mounts (xlator_t *this,
+                                    glusterd_volinfo_t *volinfo)
+{
+        int32_t          ret  = 0;
+        glusterd_conf_t *priv = NULL;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        ret = glusterd_store_recreate_brick_mounts (volinfo);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to recreate brick mounts "
+                        "for %s", volinfo->volname);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
+/* Bricks of snap volumes live under /var/run/gluster/snaps, and a restored
+ * volume keeps pointing at the bricks of the snap volume it was restored
+ * from. After a node restart those paths have to be recreated and mounted
+ * again, both for restored volumes and for the volumes of every snapshot.
+ *
+ * Returns 0 on success, non-zero on the first failure.
+ */
+int32_t
+glusterd_recreate_all_snap_brick_mounts (xlator_t *this)
+{
+        glusterd_conf_t    *priv = NULL;
+        glusterd_snap_t    *snap = NULL;
+        glusterd_volinfo_t *vol  = NULL;
+        int32_t             ret  = 0;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        /* First pass: regular volumes that were restored from a snap */
+        list_for_each_entry (vol, &priv->volumes, vol_list) {
+                if (!uuid_is_null (vol->restored_from_snap)) {
+                        ret = glusterd_recreate_vol_brick_mounts (this, vol);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to recreate brick mounts "
+                                        "for %s", vol->volname);
+                                goto out;
+                        }
+                }
+        }
+
+        /* Second pass: the volumes owned by the snapshots themselves */
+        list_for_each_entry (snap, &priv->snapshots, snap_list) {
+                list_for_each_entry (vol, &snap->volumes, vol_list) {
+                        ret = glusterd_recreate_vol_brick_mounts (this, vol);
+                        if (!ret)
+                                continue;
+
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to recreate brick mounts "
+                                "for %s", snap->snapname);
+                        goto out;
+                }
+        }
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
+/* When the snapshot command from cli is received, the on disk and
+ * in memory structures for the snapshot are created (with the status)
+ * being marked as GD_SNAP_STATUS_INIT. Once the backend snapshot is
+ * taken, the status is changed to GD_SNAP_STATUS_IN_USE. If glusterd
+ * dies after taking the backend snapshot, but before updating the
+ * status, then when glusterd comes up, it should treat that snapshot
+ * as a failed snapshot and clean it up.
+ *
+ * Every snapshot whose status is not GD_SNAP_STATUS_IN_USE is handed to
+ * glusterd_snap_remove(). The list is walked with the _safe iterator
+ * because the entry being visited is removed during the walk
+ * (presumably glusterd_snap_remove() unlinks it from priv->snapshots -
+ * confirm against its definition).
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int32_t
+glusterd_snap_cleanup (xlator_t *this)
+{
+        dict_t          *dict     = NULL;
+        int32_t          ret      = 0;
+        glusterd_conf_t *priv     = NULL;
+        glusterd_snap_t *snap     = NULL;
+        glusterd_snap_t *tmp_snap = NULL;
+
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        dict = dict_new();
+        if (!dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to create dict");
+                ret = -1;
+                goto out;
+        }
+
+        list_for_each_entry_safe (snap, tmp_snap, &priv->snapshots,
+                                  snap_list) {
+                if (snap->snap_status != GD_SNAP_STATUS_IN_USE) {
+                        ret = glusterd_snap_remove (dict, snap,
+                                                    _gf_true, _gf_true);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to remove the snapshot %s",
+                                        snap->snapname);
+                                goto out;
+                        }
+                }
+        }
+out:
+        if (dict)
+                dict_unref (dict);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+        return ret;
+}
+
int32_t
glusterd_resolve_all_bricks (xlator_t *this)
{
- int32_t ret = 0;
- glusterd_conf_t *priv = NULL;
- glusterd_volinfo_t *volinfo = NULL;
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_snap_t *snap = NULL;
GF_ASSERT (this);
priv = this->private;
GF_ASSERT (priv);
+ /* Resolve bricks of volumes */
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
ret = glusterd_resolve_brick (brickinfo);
@@ -3156,9 +3932,20 @@ glusterd_resolve_all_bricks (xlator_t *this)
}
}
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ /* Resolve bricks of snapshot volumes */
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ ret = glusterd_resolve_snap_bricks (this, snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "resolving the snap bricks"
+ " failed for snap: %s",
+ snap->snapname);
+ goto out;
+ }
+ }
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
return ret;
}
@@ -3177,7 +3964,11 @@ glusterd_restore ()
goto out;
}
- ret = glusterd_store_retrieve_volumes (this);
+ ret = glusterd_store_retrieve_volumes (this, NULL);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_retrieve_snaps (this);
if (ret)
goto out;
@@ -3189,7 +3980,124 @@ glusterd_restore ()
if (ret)
goto out;
+ ret = glusterd_snap_cleanup (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to perform "
+ "a cleanup of the snapshots");
+ goto out;
+ }
+
+ ret = glusterd_recreate_all_snap_brick_mounts (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to recreate "
+ "all snap brick mounts");
+ goto out;
+ }
+
out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+
+/* Read the "version" key of the volume's quota checksum file into
+ * volinfo->quota_conf_version.
+ *
+ * A missing "version" key is not an error: 0 is returned and the volinfo is
+ * left untouched. errno is cleared before strtoul() so that a stale value
+ * left by an earlier call cannot be mistaken for a conversion failure.
+ *
+ * Returns 0 on success, non-zero when the store handle cannot be created.
+ *
+ * NOTE(review): when the conversion is rejected the function still returns
+ * 0, because ret holds the successful gf_store_retrieve_value() result at
+ * that point - confirm this leniency is intended.
+ */
+int
+glusterd_store_retrieve_quota_version (glusterd_volinfo_t *volinfo)
+{
+        int                 ret                  = -1;
+        uint32_t            version              = 0;
+        char                cksum_path[PATH_MAX] = {0,};
+        char                path[PATH_MAX]       = {0,};
+        char               *version_str          = NULL;
+        char               *tmp                  = NULL;
+        xlator_t           *this                 = NULL;
+        glusterd_conf_t    *conf                 = NULL;
+        gf_store_handle_t  *handle               = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        GLUSTERD_GET_VOLUME_DIR (path, volinfo, conf);
+        snprintf (cksum_path, sizeof (cksum_path), "%s/%s", path,
+                  GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+
+        ret = gf_store_handle_new (cksum_path, &handle);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to get store handle "
+                        "for %s", cksum_path);
+                goto out;
+        }
+
+        ret = gf_store_retrieve_value (handle, "version", &version_str);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG, "Version absent");
+                ret = 0;
+                goto out;
+        }
+
+        /* strtoul() only sets errno on failure, it never resets it */
+        errno = 0;
+        version = strtoul (version_str, &tmp, 10);
+        if ((errno == ERANGE) || (errno == EINVAL)) {
+                gf_log (this->name, GF_LOG_DEBUG, "Invalid version number");
+                goto out;
+        }
+        volinfo->quota_conf_version = version;
+        ret = 0;
+
+out:
+        if (version_str)
+                GF_FREE (version_str);
+        gf_store_handle_destroy (handle);
+        return ret;
+}
+
+/* Write the quota configuration checksum and version of a volume to its
+ * quota checksum file.
+ *
+ * The file is opened with O_CREAT | O_TRUNC, so previous contents are
+ * discarded; the "cksum" key is written first, then the "version" key.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_store_save_quota_version_and_cksum (glusterd_volinfo_t *volinfo)
+{
+        xlator_t        *this                 = THIS;
+        glusterd_conf_t *conf                 = NULL;
+        char             voldir[PATH_MAX]     = {0,};
+        char             store_path[PATH_MAX] = {0,};
+        char             number[256]          = {0,};
+        int              fd                   = -1;
+        int              ret                  = -1;
+
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        GLUSTERD_GET_VOLUME_DIR (voldir, volinfo, conf);
+        snprintf (store_path, sizeof (store_path), "%s/%s", voldir,
+                  GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+
+        fd = open (store_path, O_RDWR | O_APPEND | O_CREAT| O_TRUNC, 0600);
+        if (fd == -1) {
+                /* ret still holds its initial -1 here */
+                gf_log (this->name, GF_LOG_ERROR, "Unable to open %s,"
+                        "Reason: %s", store_path, strerror (errno));
+                goto out;
+        }
+
+        snprintf (number, sizeof (number)-1, "%u", volinfo->quota_conf_cksum);
+        ret = gf_store_save_value (fd, "cksum", number);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to store cksum");
+                goto out;
+        }
+
+        /* Reuse the scratch buffer for the second value */
+        memset (number, 0, sizeof (number));
+        snprintf (number, sizeof (number)-1, "%u",
+                  volinfo->quota_conf_version);
+        ret = gf_store_save_value (fd, "version", number);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR, "Failed to store version");
+
+out:
+        if (fd != -1)
+                close (fd);
+        return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h
index 762604e23..63d510cbf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-store.h
+++ b/xlators/mgmt/glusterd/src/glusterd-store.h
@@ -35,38 +35,56 @@ typedef enum glusterd_store_ver_ac_{
} glusterd_volinfo_ver_ac_t;
-#define GLUSTERD_STORE_UUID_KEY "UUID"
-
-#define GLUSTERD_STORE_KEY_VOL_TYPE "type"
-#define GLUSTERD_STORE_KEY_VOL_COUNT "count"
-#define GLUSTERD_STORE_KEY_VOL_STATUS "status"
-#define GLUSTERD_STORE_KEY_VOL_PORT "port"
-#define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count"
-#define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count"
-#define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count"
-#define GLUSTERD_STORE_KEY_VOL_BRICK "brick"
-#define GLUSTERD_STORE_KEY_VOL_VERSION "version"
-#define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type"
-#define GLUSTERD_STORE_KEY_VOL_ID "volume-id"
-#define GLUSTERD_STORE_KEY_RB_STATUS "rb_status"
-#define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src"
-#define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst"
-#define GLUSTERD_STORE_KEY_RB_DST_PORT "rb_port"
-#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status"
-#define GLUSTERD_STORE_KEY_DEFRAG_OP "rebalance_op"
-#define GLUSTERD_STORE_KEY_USERNAME "username"
-#define GLUSTERD_STORE_KEY_PASSWORD "password"
-
-#define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname"
-#define GLUSTERD_STORE_KEY_BRICK_PATH "path"
-#define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port"
-#define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port"
+#define GLUSTERD_STORE_UUID_KEY "UUID"
+
+#define GLUSTERD_STORE_KEY_VOL_TYPE "type"
+#define GLUSTERD_STORE_KEY_VOL_COUNT "count"
+#define GLUSTERD_STORE_KEY_VOL_STATUS "status"
+#define GLUSTERD_STORE_KEY_VOL_PORT "port"
+#define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count"
+#define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count"
+#define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count"
+#define GLUSTERD_STORE_KEY_VOL_BRICK "brick"
+#define GLUSTERD_STORE_KEY_VOL_VERSION "version"
+#define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type"
+#define GLUSTERD_STORE_KEY_VOL_ID "volume-id"
+#define GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP "restored_from_snap"
+#define GLUSTERD_STORE_KEY_RB_STATUS "rb_status"
+#define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src"
+#define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst"
+#define GLUSTERD_STORE_KEY_RB_DST_PORT "rb_port"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status"
+#define GLUSTERD_STORE_KEY_DEFRAG_OP "rebalance_op"
+#define GLUSTERD_STORE_KEY_USERNAME "username"
+#define GLUSTERD_STORE_KEY_PASSWORD "password"
+#define GLUSTERD_STORE_KEY_PARENT_VOLNAME "parent_volname"
+#define GLUSTERD_STORE_KEY_VOL_OP_VERSION "op-version"
+#define GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION "client-op-version"
+
+#define GLUSTERD_STORE_KEY_SNAP_NAME "name"
+#define GLUSTERD_STORE_KEY_SNAP_ID "snap-id"
+#define GLUSTERD_STORE_KEY_SNAP_DESC "desc"
+#define GLUSTERD_STORE_KEY_SNAP_TIMESTAMP "time-stamp"
+#define GLUSTERD_STORE_KEY_SNAP_STATUS "status"
+#define GLUSTERD_STORE_KEY_SNAP_RESTORED "snap-restored"
+#define GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT "snap-max-hard-limit"
+#define GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT "snap-max-soft-limit"
+
+#define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname"
+#define GLUSTERD_STORE_KEY_BRICK_PATH "path"
+#define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port"
+#define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port"
#define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned"
+#define GLUSTERD_STORE_KEY_BRICK_VGNAME "vg"
+#define GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH "device_path"
+#define GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS "snap-status"
+#define GLUSTERD_STORE_KEY_BRICK_ID "brick-id"
-#define GLUSTERD_STORE_KEY_PEER_UUID "uuid"
-#define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname"
-#define GLUSTERD_STORE_KEY_PEER_STATE "state"
-#define GLUSTERD_STORE_KEY_VOL_BACKEND "backend"
+#define GLUSTERD_STORE_KEY_PEER_UUID "uuid"
+#define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname"
+#define GLUSTERD_STORE_KEY_PEER_STATE "state"
+
+#define GLUSTERD_STORE_KEY_VOL_CAPS "caps"
#define glusterd_for_each_entry(entry, dir) \
do {\
@@ -74,6 +92,7 @@ typedef enum glusterd_store_ver_ac_{
if (dir) {\
entry = readdir (dir);\
while (entry && (!strcmp (entry->d_name, ".") ||\
+ !fnmatch ("*.tmp", entry->d_name, 0) ||\
!strcmp (entry->d_name, ".."))) {\
entry = readdir (dir);\
}\
@@ -81,16 +100,6 @@ typedef enum glusterd_store_ver_ac_{
} while (0); \
-typedef enum {
- GD_STORE_SUCCESS,
- GD_STORE_KEY_NULL,
- GD_STORE_VALUE_NULL,
- GD_STORE_KEY_VALUE_NULL,
- GD_STORE_EOF,
- GD_STORE_ENOMEM,
- GD_STORE_STAT_FAILED
-} glusterd_store_op_errno_t;
-
int32_t
glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t ac);
@@ -98,14 +107,7 @@ int32_t
glusterd_store_delete_volume (glusterd_volinfo_t *volinfo);
int32_t
-glusterd_store_handle_new (char *path, glusterd_store_handle_t **handle);
-
-int32_t
-glusterd_store_save_value (int fd, char *key, char *value);
-
-int32_t
-glusterd_store_retrieve_value (glusterd_store_handle_t *handle,
- char *key, char **value);
+glusterd_store_delete_snap (glusterd_snap_t *snap);
int32_t
glusterd_retrieve_uuid ();
@@ -121,9 +123,6 @@ glusterd_store_delete_brick (glusterd_brickinfo_t *brickinfo,
char *delete_path);
int32_t
-glusterd_store_handle_destroy (glusterd_store_handle_t *handle);
-
-int32_t
glusterd_restore ();
void
@@ -145,5 +144,30 @@ int32_t
glusterd_store_retrieve_options (xlator_t *this);
int32_t
+glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo);
+
+int32_t
glusterd_store_options (xlator_t *this, dict_t *opts);
+
+void
+glusterd_replace_slash_with_hyphen (char *str);
+
+int32_t
+glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_create_quota_conf_sh_on_absence (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_retrieve_quota_version (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_save_quota_version_and_cksum (glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_snap (glusterd_snap_t *snap);
+
+int32_t
+glusterd_store_update_missed_snaps ();
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
index aa1391df8..b36d6f616 100644
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
@@ -17,8 +17,11 @@
#include "glusterd.h"
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
+#include "glusterd-locks.h"
-static inline void
+extern glusterd_op_info_t opinfo;
+
+void
gd_synctask_barrier_wait (struct syncargs *args, int count)
{
glusterd_conf_t *conf = THIS->private;
@@ -31,18 +34,137 @@ gd_synctask_barrier_wait (struct syncargs *args, int count)
}
+/*
+ * Fold one peer's mgmt_v3 lock/unlock reply into the shared syncargs.
+ *
+ * Does nothing when op_ret is zero.  Otherwise op_ret/op_errno are stored in
+ * args, a "Locking failed on <peer>. <detail>" or "Unlocking failed on
+ * <peer>. <detail>" line is built (selected by op_code), logged, and appended
+ * on a new line to any message already held in args->errstr, which is then
+ * replaced by a freshly gf_strdup()'ed string.
+ *
+ * peerinfo may be NULL; the peer is then identified by uuid instead.
+ */
static void
-gd_collate_errors (struct syncargs *args, int op_ret, int op_errno,
- char *op_errstr)
+gd_mgmt_v3_collate_errors (struct syncargs *args, int op_ret, int op_errno,
+ char *op_errstr, int op_code,
+ glusterd_peerinfo_t *peerinfo, u_char *uuid)
{
- if (args->op_ret)
- return;
- args->op_ret = op_ret;
- args->op_errno = op_errno;
- if (op_ret && op_errstr && strcmp (op_errstr, ""))
- args->errstr = gf_strdup (op_errstr);
+ char err_str[PATH_MAX] = "Please check log file for details.";
+ char op_err[PATH_MAX] = "";
+ char *peer_str = NULL;
+
+ if (op_ret) {
+ args->op_ret = op_ret;
+ args->op_errno = op_errno;
+
+ /* Name the peer by hostname when known, else by its uuid. */
+ if (peerinfo)
+ peer_str = peerinfo->hostname;
+ else
+ peer_str = uuid_utoa (uuid);
+
+ /* A non-empty op_errstr replaces the generic detail text. */
+ if (op_errstr && strcmp (op_errstr, ""))
+ snprintf (err_str, sizeof(err_str) - 1,
+ "Error: %s", op_errstr);
+
+ /* Any other op_code leaves op_err empty. */
+ switch (op_code) {
+ case GLUSTERD_MGMT_V3_LOCK:
+ {
+ snprintf (op_err, sizeof(op_err) - 1,
+ "Locking failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_V3_UNLOCK:
+ {
+ snprintf (op_err, sizeof(op_err) - 1,
+ "Unlocking failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ }
+
+ /* Keep earlier failures: append this one on a new line. */
+ if (args->errstr) {
+ snprintf (err_str, sizeof(err_str) - 1,
+ "%s\n%s", args->errstr,
+ op_err);
+ GF_FREE (args->errstr);
+ args->errstr = NULL;
+ } else
+ snprintf (err_str, sizeof(err_str) - 1,
+ "%s", op_err);
+
+ gf_log ("", GF_LOG_ERROR, "%s", op_err);
+ args->errstr = gf_strdup (err_str);
+ }
+
+ return;
}
+/*
+ * Fold one peer's cluster lock/unlock/stage/commit reply into the shared
+ * syncargs.
+ *
+ * Does nothing when op_ret is zero.  Otherwise op_ret/op_errno are stored in
+ * args and a "<Phase> failed on <peer>. <detail>" line is built, where the
+ * phase is selected by op_code and the detail is "Error: <op_errstr>" when
+ * op_errstr is non-empty, else a generic hint.  The line is logged and
+ * appended on a new line to any message already in args->errstr, which is
+ * replaced by a freshly gf_strdup()'ed string.
+ *
+ * peerinfo may be NULL; the peer is then identified by uuid instead.
+ */
static void
+gd_collate_errors (struct syncargs *args, int op_ret, int op_errno,
+                   char *op_errstr, int op_code,
+                   glusterd_peerinfo_t *peerinfo, u_char *uuid)
+{
+        char  err_str[PATH_MAX] = "Please check log file for details.";
+        char  op_err[PATH_MAX]  = "";
+        char *peer_str          = NULL;
+
+        if (!op_ret)
+                return;
+
+        args->op_ret = op_ret;
+        args->op_errno = op_errno;
+
+        if (peerinfo)
+                peer_str = peerinfo->hostname;
+        else
+                peer_str = uuid_utoa (uuid);
+
+        /* snprintf() always NUL-terminates within the given size, so its
+         * return value must not be used as an index: on truncation it is
+         * the would-be length (past the buffer), and it can be negative. */
+        if (op_errstr && strcmp (op_errstr, ""))
+                snprintf (err_str, sizeof(err_str) - 1,
+                          "Error: %s", op_errstr);
+
+        switch (op_code) {
+        case GLUSTERD_MGMT_CLUSTER_LOCK:
+                snprintf (op_err, sizeof(op_err) - 1,
+                          "Locking failed on %s. %s",
+                          peer_str, err_str);
+                break;
+        case GLUSTERD_MGMT_CLUSTER_UNLOCK:
+                snprintf (op_err, sizeof(op_err) - 1,
+                          "Unlocking failed on %s. %s",
+                          peer_str, err_str);
+                break;
+        case GLUSTERD_MGMT_STAGE_OP:
+                snprintf (op_err, sizeof(op_err) - 1,
+                          "Staging failed on %s. %s",
+                          peer_str, err_str);
+                break;
+        case GLUSTERD_MGMT_COMMIT_OP:
+                snprintf (op_err, sizeof(op_err) - 1,
+                          "Commit failed on %s. %s",
+                          peer_str, err_str);
+                break;
+        default:
+                /* Unknown op_code: op_err stays empty. */
+                break;
+        }
+
+        /* Keep earlier failures: append this one on a new line. */
+        if (args->errstr) {
+                snprintf (err_str, sizeof(err_str) - 1,
+                          "%s\n%s", args->errstr, op_err);
+                GF_FREE (args->errstr);
+                args->errstr = NULL;
+        } else {
+                snprintf (err_str, sizeof(err_str) - 1,
+                          "%s", op_err);
+        }
+
+        gf_log ("", GF_LOG_ERROR, "%s", op_err);
+        args->errstr = gf_strdup (err_str);
+}
+
+void
gd_syncargs_init (struct syncargs *args, dict_t *op_ctx)
{
args->dict = op_ctx;
@@ -82,9 +204,9 @@ gd_brick_op_req_free (gd1_mgmt_brick_op_req *req)
}
int
-gd_syncop_submit_request (struct rpc_clnt *rpc, void *req,
- void *cookie, rpc_clnt_prog_t *prog,
- int procnum, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+gd_syncop_submit_request (struct rpc_clnt *rpc, void *req, void *local,
+ void *cookie, rpc_clnt_prog_t *prog, int procnum,
+ fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
{
int ret = -1;
struct iobuf *iobuf = NULL;
@@ -124,7 +246,8 @@ gd_syncop_submit_request (struct rpc_clnt *rpc, void *req,
iov.iov_len = ret;
count = 1;
- frame->local = cookie;
+ frame->local = local;
+ frame->cookie = cookie;
/* Send the msg */
ret = rpc_clnt_submit (rpc, prog, procnum, cbkfn,
@@ -143,8 +266,9 @@ out:
/* Defined in glusterd-rpc-ops.c */
extern struct rpc_clnt_program gd_mgmt_prog;
extern struct rpc_clnt_program gd_brick_prog;
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
-static int
+int
glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp)
{
int ret = 0;
@@ -168,6 +292,9 @@ glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp)
goto out;
break;
+ case GD_OP_GSYNC_CREATE:
+ break;
+
case GD_OP_GSYNC_SET:
ret = glusterd_gsync_use_rsp_dict (aggr, rsp, NULL);
if (ret)
@@ -194,12 +321,28 @@ glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp)
break;
- case GD_OP_QUOTA:
case GD_OP_CLEARLOCKS_VOLUME:
ret = glusterd_use_rsp_dict (aggr, rsp);
if (ret)
goto out;
+ break;
+
+ case GD_OP_QUOTA:
+ ret = glusterd_volume_quota_copy_to_op_ctx_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+ case GD_OP_SYS_EXEC:
+ ret = glusterd_sys_exec_output_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_SNAP:
+ ret = glusterd_snap_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
break;
default:
@@ -210,20 +353,204 @@ out:
}
+/*
+ * RPC completion handler for a mgmt_v3 lock request sent to one peer.
+ *
+ * Takes the syncargs from frame->local and the peer from frame->cookie,
+ * decodes the reply, copies the replying uuid into args->uuid, and hands the
+ * outcome to gd_mgmt_v3_collate_errors().  A failed RPC (rpc_status == -1)
+ * is reported as ENOTCONN; a decode failure keeps the default op_ret of -1.
+ * The frame is destroyed and the waiting synctask barrier is woken on every
+ * path.  Always returns 0.
+ */
int32_t
+gd_syncop_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
+
+ GF_ASSERT(req);
+ GF_ASSERT(iov);
+ GF_ASSERT(myframe);
+
+ /* Detach args/peer from the frame before it is destroyed below. */
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+ if (ret < 0)
+ goto out;
+
+ uuid_copy (args->uuid, rsp.uuid);
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+out:
+ /* On the early-exit paths rsp.uuid is still all zeroes. */
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_LOCK,
+ peerinfo, rsp.uuid);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+/*
+ * RPC callback registered for mgmt_v3 lock requests: forwards the reply to
+ * gd_syncop_mgmt_v3_lock_cbk_fn() through glusterd_big_locked_cbk().
+ * NOTE(review): presumably the helper runs the handler with glusterd's big
+ * lock held -- confirm against glusterd_big_locked_cbk().
+ */
+int32_t
+gd_syncop_mgmt_v3_lock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_syncop_mgmt_v3_lock_cbk_fn);
+}
+
+/*
+ * Send a mgmt_v3 lock request for operation op to one peer.
+ *
+ * op_ctx is serialized into the request together with my_uuid and txn_id.
+ * The big lock is dropped around the submit and re-taken afterwards.  args
+ * travels as the frame's local and peerinfo as its cookie, so the reply
+ * callback can attribute the result to this peer.  recv_uuid is unused.
+ *
+ * Returns 0 when the request was submitted, non-zero otherwise.
+ */
+int
+gd_syncop_mgmt_v3_lock (glusterd_op_t op, dict_t *op_ctx,
+                        glusterd_peerinfo_t *peerinfo,
+                        struct syncargs *args, uuid_t my_uuid,
+                        uuid_t recv_uuid, uuid_t txn_id)
+{
+        int                   ret  = -1;
+        gd1_mgmt_v3_lock_req  req  = {{0},};
+        glusterd_conf_t      *conf = THIS->private;
+
+        GF_ASSERT(op_ctx);
+        GF_ASSERT(peerinfo);
+        GF_ASSERT(args);
+
+        ret = dict_allocate_and_serialize (op_ctx,
+                                           &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (ret)
+                goto out;
+
+        uuid_copy (req.uuid, my_uuid);
+        uuid_copy (req.txn_id, txn_id);
+        req.op = op;
+        synclock_unlock (&conf->big_lock);
+        ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+                                        &gd_mgmt_v3_prog,
+                                        GLUSTERD_MGMT_V3_LOCK,
+                                        gd_syncop_mgmt_v3_lock_cbk,
+                                        (xdrproc_t)
+                                        xdr_gd1_mgmt_v3_lock_req);
+        synclock_lock (&conf->big_lock);
+out:
+        /* Release the serialized dict; it was leaked on every call before.
+         * NOTE(review): assumes gd_syncop_submit_request() has finished
+         * encoding req into its own buffer before returning -- confirm. */
+        GF_FREE (req.dict.dict_val);
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * RPC completion handler for a mgmt_v3 unlock request sent to one peer.
+ *
+ * Takes the syncargs from frame->local and the peer from frame->cookie,
+ * decodes the reply, copies the replying uuid into args->uuid, and hands the
+ * outcome to gd_mgmt_v3_collate_errors().  A failed RPC (rpc_status == -1)
+ * is reported as ENOTCONN; a decode failure keeps the default op_ret of -1.
+ * The frame is destroyed and the waiting synctask barrier is woken on every
+ * path.  Always returns 0.
+ */
+int32_t
+gd_syncop_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+                                 int count, void *myframe)
+{
+        int                      ret      = -1;
+        struct syncargs         *args     = NULL;
+        glusterd_peerinfo_t     *peerinfo = NULL;
+        gd1_mgmt_v3_unlock_rsp   rsp      = {{0},};
+        call_frame_t            *frame    = NULL;
+        int                      op_ret   = -1;
+        int                      op_errno = -1;
+
+        GF_ASSERT(req);
+        GF_ASSERT(iov);
+        GF_ASSERT(myframe);
+
+        /* Detach args/peer from the frame before it is destroyed below. */
+        frame = myframe;
+        args = frame->local;
+        peerinfo = frame->cookie;
+        frame->local = NULL;
+        frame->cookie = NULL;
+
+        if (-1 == req->rpc_status) {
+                op_errno = ENOTCONN;
+                goto out;
+        }
+
+        ret = xdr_to_generic (*iov, &rsp,
+                              (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+        if (ret < 0)
+                goto out;
+
+        uuid_copy (args->uuid, rsp.uuid);
+
+        /* A successful unlock must not mark the peer as locked; the v3
+         * lock callback does not track peerinfo->locked either. */
+        op_ret = rsp.op_ret;
+        op_errno = rsp.op_errno;
+out:
+        /* On the early-exit paths rsp.uuid is still all zeroes. */
+        gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+                                   GLUSTERD_MGMT_V3_UNLOCK,
+                                   peerinfo, rsp.uuid);
+        STACK_DESTROY (frame->root);
+        synctask_barrier_wake(args);
+        return 0;
+}
+
+/*
+ * RPC callback registered for mgmt_v3 unlock requests: forwards the reply to
+ * gd_syncop_mgmt_v3_unlock_cbk_fn() through glusterd_big_locked_cbk().
+ * NOTE(review): presumably the helper runs the handler with glusterd's big
+ * lock held -- confirm against glusterd_big_locked_cbk().
+ */
+int32_t
+gd_syncop_mgmt_v3_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_syncop_mgmt_v3_unlock_cbk_fn);
+}
+
+/*
+ * Send a mgmt_v3 unlock request to one peer.
+ *
+ * op_ctx is serialized into the request together with my_uuid and txn_id.
+ * The big lock is dropped around the submit and re-taken afterwards.  args
+ * travels as the frame's local and peerinfo as its cookie, so the reply
+ * callback can attribute the result to this peer.  recv_uuid is unused.
+ *
+ * Returns 0 when the request was submitted, non-zero otherwise.
+ */
+int
+gd_syncop_mgmt_v3_unlock (dict_t *op_ctx, glusterd_peerinfo_t *peerinfo,
+                          struct syncargs *args, uuid_t my_uuid,
+                          uuid_t recv_uuid, uuid_t txn_id)
+{
+        int                     ret  = -1;
+        gd1_mgmt_v3_unlock_req  req  = {{0},};
+        glusterd_conf_t        *conf = THIS->private;
+
+        GF_ASSERT(op_ctx);
+        GF_ASSERT(peerinfo);
+        GF_ASSERT(args);
+
+        ret = dict_allocate_and_serialize (op_ctx,
+                                           &req.dict.dict_val,
+                                           &req.dict.dict_len);
+        if (ret)
+                goto out;
+
+        uuid_copy (req.uuid, my_uuid);
+        uuid_copy (req.txn_id, txn_id);
+        synclock_unlock (&conf->big_lock);
+        ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+                                        &gd_mgmt_v3_prog,
+                                        GLUSTERD_MGMT_V3_UNLOCK,
+                                        gd_syncop_mgmt_v3_unlock_cbk,
+                                        (xdrproc_t)
+                                        xdr_gd1_mgmt_v3_unlock_req);
+        synclock_lock (&conf->big_lock);
+out:
+        /* Release the serialized dict; it was leaked on every call before.
+         * NOTE(review): assumes gd_syncop_submit_request() has finished
+         * encoding req into its own buffer before returning -- confirm. */
+        GF_FREE (req.dict.dict_val);
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+int32_t
_gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
int ret = -1;
struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
gd1_mgmt_cluster_lock_rsp rsp = {{0},};
call_frame_t *frame = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
frame = myframe;
args = frame->local;
+ peerinfo = frame->cookie;
frame->local = NULL;
+ frame->cookie = NULL;
if (-1 == req->rpc_status) {
- args->op_errno = ENOTCONN;
+ op_errno = ENOTCONN;
goto out;
}
@@ -232,25 +559,31 @@ _gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov,
if (ret < 0)
goto out;
- gd_collate_errors (args, rsp.op_ret, rsp.op_errno, NULL);
uuid_copy (args->uuid, rsp.uuid);
+ /* Set peer as locked, so we unlock only the locked peers */
+ if (rsp.op_ret == 0)
+ peerinfo->locked = _gf_true;
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
out:
+ gd_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_CLUSTER_LOCK, peerinfo, rsp.uuid);
STACK_DESTROY (frame->root);
synctask_barrier_wake(args);
return 0;
}
int32_t
-gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
+gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
{
return glusterd_big_locked_cbk (req, iov, count, myframe,
_gd_syncop_mgmt_lock_cbk);
}
int
-gd_syncop_mgmt_lock (struct rpc_clnt *rpc, struct syncargs *args,
+gd_syncop_mgmt_lock (glusterd_peerinfo_t *peerinfo, struct syncargs *args,
uuid_t my_uuid, uuid_t recv_uuid)
{
int ret = -1;
@@ -259,7 +592,8 @@ gd_syncop_mgmt_lock (struct rpc_clnt *rpc, struct syncargs *args,
uuid_copy (req.uuid, my_uuid);
synclock_unlock (&conf->big_lock);
- ret = gd_syncop_submit_request (rpc, &req, args, &gd_mgmt_prog,
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_prog,
GLUSTERD_MGMT_CLUSTER_LOCK,
gd_syncop_mgmt_lock_cbk,
(xdrproc_t) xdr_gd1_mgmt_cluster_lock_req);
@@ -273,15 +607,19 @@ _gd_syncop_mgmt_unlock_cbk (struct rpc_req *req, struct iovec *iov,
{
int ret = -1;
struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
gd1_mgmt_cluster_unlock_rsp rsp = {{0},};
call_frame_t *frame = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
frame = myframe;
args = frame->local;
+ peerinfo = frame->cookie;
frame->local = NULL;
if (-1 == req->rpc_status) {
- args->op_errno = ENOTCONN;
+ op_errno = ENOTCONN;
goto out;
}
@@ -290,10 +628,14 @@ _gd_syncop_mgmt_unlock_cbk (struct rpc_req *req, struct iovec *iov,
if (ret < 0)
goto out;
- gd_collate_errors (args, rsp.op_ret, rsp.op_errno, NULL);
uuid_copy (args->uuid, rsp.uuid);
+ peerinfo->locked = _gf_false;
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
out:
+ gd_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_CLUSTER_UNLOCK, peerinfo, rsp.uuid);
STACK_DESTROY (frame->root);
synctask_barrier_wake(args);
return 0;
@@ -309,7 +651,7 @@ gd_syncop_mgmt_unlock_cbk (struct rpc_req *req, struct iovec *iov,
int
-gd_syncop_mgmt_unlock (struct rpc_clnt *rpc, struct syncargs *args,
+gd_syncop_mgmt_unlock (glusterd_peerinfo_t *peerinfo, struct syncargs *args,
uuid_t my_uuid, uuid_t recv_uuid)
{
int ret = -1;
@@ -318,7 +660,8 @@ gd_syncop_mgmt_unlock (struct rpc_clnt *rpc, struct syncargs *args,
uuid_copy (req.uuid, my_uuid);
synclock_unlock (&conf->big_lock);
- ret = gd_syncop_submit_request (rpc, &req, args, &gd_mgmt_prog,
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_prog,
GLUSTERD_MGMT_CLUSTER_UNLOCK,
gd_syncop_mgmt_unlock_cbk,
(xdrproc_t) xdr_gd1_mgmt_cluster_lock_req);
@@ -336,6 +679,9 @@ _gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
xlator_t *this = NULL;
dict_t *rsp_dict = NULL;
call_frame_t *frame = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
this = THIS;
frame = myframe;
@@ -343,8 +689,7 @@ _gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
frame->local = NULL;
if (-1 == req->rpc_status) {
- args->op_ret = -1;
- args->op_errno = ENOTCONN;
+ op_errno = ENOTCONN;
goto out;
}
@@ -368,9 +713,17 @@ _gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
}
}
- gd_collate_errors (args, rsp.op_ret, rsp.op_errno, rsp.op_errstr);
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Staging response "
+ "for 'Volume %s' received from unknown "
+ "peer: %s", gd_op_list[rsp.op],
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
+
uuid_copy (args->uuid, rsp.uuid);
- if (rsp.op == GD_OP_REPLACE_BRICK) {
+ if (rsp.op == GD_OP_REPLACE_BRICK || rsp.op == GD_OP_QUOTA) {
pthread_mutex_lock (&args->lock_dict);
{
ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict,
@@ -383,7 +736,13 @@ _gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
pthread_mutex_unlock (&args->lock_dict);
}
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+
out:
+ gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr,
+ GLUSTERD_MGMT_STAGE_OP, peerinfo, rsp.uuid);
+
if (rsp_dict)
dict_unref (rsp_dict);
@@ -423,7 +782,7 @@ gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, struct syncargs *args,
goto out;
synclock_unlock (&conf->big_lock);
- ret = gd_syncop_submit_request (rpc, req, args, &gd_mgmt_prog,
+ ret = gd_syncop_submit_request (rpc, req, args, NULL, &gd_mgmt_prog,
GLUSTERD_MGMT_STAGE_OP,
gd_syncop_stage_op_cbk,
(xdrproc_t) xdr_gd1_mgmt_stage_op_req);
@@ -514,8 +873,8 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode,
args.op_errno = ENOTCONN;
if ((pnode->type == GD_NODE_NFS) ||
- ((pnode->type == GD_NODE_SHD) &&
- (op == GD_OP_STATUS_VOLUME))) {
+ (pnode->type == GD_NODE_QUOTAD) ||
+ ((pnode->type == GD_NODE_SHD) && (op == GD_OP_STATUS_VOLUME))) {
ret = glusterd_node_op_build_payload
(op, &req, dict_out);
@@ -528,14 +887,15 @@ gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode,
if (ret)
goto out;
- GD_SYNCOP (rpc, (&args), gd_syncop_brick_op_cbk,
- req, &gd_brick_prog, req->op,
- xdr_gd1_mgmt_brick_op_req);
+ GD_SYNCOP (rpc, (&args), NULL, gd_syncop_brick_op_cbk, req,
+ &gd_brick_prog, req->op, xdr_gd1_mgmt_brick_op_req);
- if (args.errstr && errstr)
- *errstr = args.errstr;
- else
- GF_FREE (args.errstr);
+ if (args.errstr) {
+ if ((strlen(args.errstr) > 0) && errstr)
+ *errstr = args.errstr;
+ else
+ GF_FREE (args.errstr);
+ }
if (GD_OP_STATUS_VOLUME == op) {
ret = dict_set_int32 (args.dict, "index", pnode->index);
@@ -565,12 +925,16 @@ int32_t
_gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- int ret = -1;
- gd1_mgmt_commit_op_rsp rsp = {{0},};
- struct syncargs *args = NULL;
- xlator_t *this = NULL;
- dict_t *rsp_dict = NULL;
- call_frame_t *frame = NULL;
+ int ret = -1;
+ gd1_mgmt_commit_op_rsp rsp = {{0},};
+ struct syncargs *args = NULL;
+ xlator_t *this = NULL;
+ dict_t *rsp_dict = NULL;
+ call_frame_t *frame = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
+ int type = GF_QUOTA_OPTION_TYPE_NONE;
this = THIS;
frame = myframe;
@@ -578,7 +942,7 @@ _gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
frame->local = NULL;
if (-1 == req->rpc_status) {
- args->op_errno = ENOTCONN;
+ op_errno = ENOTCONN;
goto out;
}
@@ -603,19 +967,44 @@ _gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
}
}
- gd_collate_errors (args, rsp.op_ret, rsp.op_errno, rsp.op_errstr);
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Commit response "
+ "for 'Volume %s' received from unknown "
+ "peer: %s", gd_op_list[rsp.op],
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
+
uuid_copy (args->uuid, rsp.uuid);
- pthread_mutex_lock (&args->lock_dict);
- {
- ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict,
- rsp_dict);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "%s",
- "Failed to aggregate response from "
- " node/brick");
+ if (rsp.op == GD_OP_QUOTA) {
+ ret = dict_get_int32 (args->dict, "type", &type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "opcode");
+ goto out;
+ }
+ }
+
+ if ((rsp.op != GD_OP_QUOTA) || (type == GF_QUOTA_OPTION_TYPE_LIST)) {
+ pthread_mutex_lock (&args->lock_dict);
+ {
+ ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict,
+ rsp_dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ }
+ pthread_mutex_unlock (&args->lock_dict);
}
- pthread_mutex_unlock (&args->lock_dict);
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+
out:
+ gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr,
+ GLUSTERD_MGMT_COMMIT_OP, peerinfo, rsp.uuid);
if (rsp_dict)
dict_unref (rsp_dict);
@@ -656,7 +1045,7 @@ gd_syncop_mgmt_commit_op (struct rpc_clnt *rpc, struct syncargs *args,
goto out;
synclock_unlock (&conf->big_lock);
- ret = gd_syncop_submit_request (rpc, req, args, &gd_mgmt_prog,
+ ret = gd_syncop_submit_request (rpc, req, args, NULL, &gd_mgmt_prog,
GLUSTERD_MGMT_COMMIT_OP ,
gd_syncop_commit_op_cbk,
(xdrproc_t) xdr_gd1_mgmt_commit_op_req);
@@ -688,8 +1077,8 @@ gd_build_peers_list (struct list_head *peers, struct list_head *xact_peers,
}
int
-gd_lock_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
- char **op_errstr, int npeers)
+gd_lock_op_phase (glusterd_conf_t *conf, glusterd_op_t op, dict_t *op_ctx,
+ char **op_errstr, int npeers, uuid_t txn_id)
{
int ret = -1;
int peer_cnt = 0;
@@ -697,6 +1086,9 @@ gd_lock_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
xlator_t *this = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
struct syncargs args = {0};
+ struct list_head *peers = NULL;
+
+ peers = &conf->xaction_peers;
if (!npeers) {
ret = 0;
@@ -707,20 +1099,38 @@ gd_lock_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
synctask_barrier_init((&args));
peer_cnt = 0;
list_for_each_entry (peerinfo, peers, op_peers_list) {
- gd_syncop_mgmt_lock (peerinfo->rpc, &args, MY_UUID, peer_uuid);
+ if (conf->op_version < GD_OP_VERSION_4) {
+ /* Reset lock status */
+ peerinfo->locked = _gf_false;
+ gd_syncop_mgmt_lock (peerinfo, &args,
+ MY_UUID, peer_uuid);
+ } else
+ gd_syncop_mgmt_v3_lock (op, op_ctx, peerinfo, &args,
+ MY_UUID, peer_uuid, txn_id);
peer_cnt++;
}
gd_synctask_barrier_wait((&args), peer_cnt);
- ret = args.op_ret;
- if (ret) {
- gf_asprintf (op_errstr, "Another transaction could be "
- "in progress. Please try again after "
- "sometime.");
- gf_log (this->name, GF_LOG_ERROR, "Failed to acquire lock");
- goto out;
+
+ if (args.op_ret) {
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ else {
+ ret = gf_asprintf (op_errstr, "Another transaction "
+ "could be in progress. Please try "
+ "again after sometime.");
+ if (ret == -1)
+ *op_errstr = NULL;
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire lock");
+
+ }
}
- ret = 0;
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent lock op req for 'Volume %s' "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
out:
return ret;
}
@@ -750,7 +1160,7 @@ gd_stage_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
goto stage_done;
}
- if ((op == GD_OP_REPLACE_BRICK)) {
+ if ((op == GD_OP_REPLACE_BRICK || op == GD_OP_QUOTA)) {
ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx, rsp_dict);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "%s",
@@ -785,14 +1195,27 @@ stage_done:
op, req_dict, op_ctx);
peer_cnt++;
}
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent stage op req for 'Volume %s' "
+ "to %d peers", gd_op_list[op], peer_cnt);
+
gd_synctask_barrier_wait((&args), peer_cnt);
- ret = args.op_ret;
- if (dict_get_str (op_ctx, "errstr", &errstr) == 0)
+
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ else if (dict_get_str (op_ctx, "errstr", &errstr) == 0)
*op_errstr = gf_strdup (errstr);
- else if (args.errstr)
- *op_errstr = gf_strdup (args.errstr);
+
+ ret = args.op_ret;
out:
+ if ((ret == 0) && (op == GD_OP_QUOTA)) {
+ ret = glusterd_validate_and_set_gfid (op_ctx, req_dict,
+ op_errstr);
+ if (ret)
+ goto out;
+ }
+
if (rsp_dict)
dict_unref (rsp_dict);
return ret;
@@ -811,6 +1234,7 @@ gd_commit_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
uuid_t tmp_uuid = {0};
char *errstr = NULL;
struct syncargs args = {0};
+ int type = GF_QUOTA_OPTION_TYPE_NONE;
this = THIS;
rsp_dict = dict_new ();
@@ -824,15 +1248,28 @@ gd_commit_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
hostname = "localhost";
goto commit_done;
}
- if (op != GD_OP_SYNC_VOLUME) {
- ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx, rsp_dict);
+
+ if (op == GD_OP_QUOTA) {
+ ret = dict_get_int32 (op_ctx, "type", &type);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "%s",
- "Failed to aggregate response "
- "from node/brick");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "opcode");
goto out;
}
}
+
+ if (((op == GD_OP_QUOTA) && (type == GF_QUOTA_OPTION_TYPE_LIST)) ||
+ ((op != GD_OP_SYNC_VOLUME) && (op != GD_OP_QUOTA))) {
+
+ ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", "Failed to aggregate "
+ "response from node/brick");
+ goto out;
+ }
+ }
+
dict_unref (rsp_dict);
rsp_dict = NULL;
@@ -851,6 +1288,7 @@ commit_done:
ret = 0;
goto out;
}
+
gd_syncargs_init (&args, op_ctx);
synctask_barrier_init((&args));
peer_cnt = 0;
@@ -862,57 +1300,120 @@ commit_done:
}
gd_synctask_barrier_wait((&args), peer_cnt);
ret = args.op_ret;
- if (dict_get_str (op_ctx, "errstr", &errstr) == 0)
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ else if (dict_get_str (op_ctx, "errstr", &errstr) == 0)
*op_errstr = gf_strdup (errstr);
- else if (args.errstr)
- *op_errstr = gf_strdup (args.errstr);
+ gf_log (this->name, GF_LOG_DEBUG, "Sent commit op req for 'Volume %s' "
+ "to %d peers", gd_op_list[op], peer_cnt);
out:
if (!ret)
glusterd_op_modify_op_ctx (op, op_ctx);
if (rsp_dict)
dict_unref (rsp_dict);
+
+ GF_FREE (args.errstr);
+ args.errstr = NULL;
+
return ret;
}
+/*
+ * Final phase of a sync-task transaction: unlock the peers, then release the
+ * local lock.
+ *
+ * Peer unlock requests are sent only when npeers is non-zero and is_acquired
+ * is true.  Below GD_OP_VERSION_4 only peers flagged as locked are unlocked;
+ * otherwise every transaction peer gets a mgmt_v3 unlock, provided volname
+ * is set.  When is_acquired is true the op is cleared and the local cluster
+ * lock or the mgmt_v3 "vol" lock on volname is released.
+ *
+ * *op_ret is left alone if it already holds a failure; otherwise it picks up
+ * a failure from the peer unlock or from the local mgmt_v3 unlock.
+ * Always returns 0.
+ */
int
-gd_unlock_op_phase (struct list_head *peers, glusterd_op_t op, int op_ret,
+gd_unlock_op_phase (glusterd_conf_t *conf, glusterd_op_t op, int *op_ret,
rpcsvc_request_t *req, dict_t *op_ctx, char *op_errstr,
- int npeers)
+ int npeers, char *volname, gf_boolean_t is_acquired,
+ uuid_t txn_id)
{
glusterd_peerinfo_t *peerinfo = NULL;
glusterd_peerinfo_t *tmp = NULL;
uuid_t tmp_uuid = {0};
- int peer_cnt = 0;
+ int peer_cnt = 0;
int ret = -1;
xlator_t *this = NULL;
struct syncargs args = {0};
+ struct list_head *peers = NULL;
+
+ peers = &conf->xaction_peers;
+
+ /* Set before any 'goto out': the out path logs through this->name
+ * when the local mgmt_v3 unlock fails. */
+ this = THIS;
if (!npeers) {
ret = 0;
goto out;
}
+ /* If the lock has not been held during this
+ * transaction, do not send unlock requests */
+ if (!is_acquired) {
+ ret = 0;
+ goto out;
+ }
+
- this = THIS;
synctask_barrier_init((&args));
peer_cnt = 0;
- list_for_each_entry_safe (peerinfo, tmp, peers, op_peers_list) {
- gd_syncop_mgmt_unlock (peerinfo->rpc, &args, MY_UUID, tmp_uuid);
- list_del_init (&peerinfo->op_peers_list);
- peer_cnt++;
+ if (conf->op_version < GD_OP_VERSION_4) {
+ list_for_each_entry_safe (peerinfo, tmp, peers, op_peers_list) {
+ /* Only unlock peers that were locked */
+ if (peerinfo->locked) {
+ gd_syncop_mgmt_unlock (peerinfo, &args,
+ MY_UUID, tmp_uuid);
+ peer_cnt++;
+ list_del_init (&peerinfo->op_peers_list);
+ }
+ }
+ } else {
+ if (volname) {
+ list_for_each_entry_safe (peerinfo, tmp,
+ peers, op_peers_list) {
+ gd_syncop_mgmt_v3_unlock (op_ctx, peerinfo,
+ &args, MY_UUID,
+ tmp_uuid, txn_id);
+ peer_cnt++;
+ list_del_init (&peerinfo->op_peers_list);
+ }
+ }
}
gd_synctask_barrier_wait((&args), peer_cnt);
+
ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent unlock op req for 'Volume %s' "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to unlock "
"on some peer(s)");
}
out:
- glusterd_op_send_cli_response (op, op_ret, 0, req, op_ctx, op_errstr);
- glusterd_op_clear_op (op);
- glusterd_unlock (MY_UUID);
+ /* If unlock failed, and op_ret was previously set
+ * priority is given to the op_ret. If op_ret was
+ * not set, and unlock failed, then set op_ret */
+ if (!*op_ret)
+ *op_ret = ret;
+
+ if (is_acquired) {
+ /* Based on the op-version,
+ * we release the cluster or mgmt_v3 lock
+ * and clear the op */
+
+ glusterd_op_clear_op (op);
+ if (conf->op_version < GD_OP_VERSION_4)
+ glusterd_unlock (MY_UUID);
+ else {
+ if (volname) {
+ ret = glusterd_mgmt_v3_unlock (volname, MY_UUID,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ }
+ }
+ }
+
+ /* ret may now hold the local mgmt_v3 unlock result; report it too
+ * unless an earlier failure is already recorded. */
+ if (!*op_ret)
+ *op_ret = ret;
return 0;
}
@@ -929,7 +1430,8 @@ gd_get_brick_count (struct list_head *bricks)
}
int
-gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict, char **op_errstr)
+gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr)
{
glusterd_pending_node_t *pending_node = NULL;
struct list_head selected = {0,};
@@ -1001,14 +1503,20 @@ out:
void
gd_sync_task_begin (dict_t *op_ctx, rpcsvc_request_t * req)
{
- int ret = -1;
- int npeers = 0;
- dict_t *req_dict = NULL;
- glusterd_conf_t *conf = NULL;
- glusterd_op_t op = 0;
- int32_t tmp_op = 0;
- char *op_errstr = NULL;
- xlator_t *this = NULL;
+ int ret = -1;
+ int op_ret = -1;
+ int npeers = 0;
+ dict_t *req_dict = NULL;
+ glusterd_conf_t *conf = NULL;
+ glusterd_op_t op = 0;
+ int32_t tmp_op = 0;
+ char *op_errstr = NULL;
+ char *tmp = NULL;
+ char *volname = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t is_acquired = _gf_false;
+ uuid_t *txn_id = NULL;
+ glusterd_op_info_t txn_opinfo;
this = THIS;
GF_ASSERT (this);
@@ -1021,26 +1529,97 @@ gd_sync_task_begin (dict_t *op_ctx, rpcsvc_request_t * req)
"operation");
goto out;
}
-
op = tmp_op;
- ret = glusterd_lock (MY_UUID);
+
+ /* Generate a transaction-id for this operation and
+ * save it in the dict */
+ ret = glusterd_generate_txn_id (op_ctx, &txn_id);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to acquire lock");
- gf_asprintf (&op_errstr, "Another transaction is in progress. "
- "Please try again after sometime.");
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to generate transaction id");
goto out;
}
- /* storing op globally to access in synctask code paths
- * This is still acceptable, as we are performing this under
- * the 'cluster' lock*/
- glusterd_op_set_op (op);
+ /* Save opinfo for this transaction with the transaction id */
+ glusterd_txn_opinfo_init (&txn_opinfo, NULL, &op, NULL, NULL);
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_opinfo);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Transaction ID : %s", uuid_utoa (*txn_id));
+
+ opinfo = txn_opinfo;
+
+ /* Save the MY_UUID as the originator_uuid */
+ ret = glusterd_set_originator_uuid (op_ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set originator_uuid.");
+ goto out;
+ }
+
+ /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+ if (conf->op_version < GD_OP_VERSION_4) {
+ ret = glusterd_lock (MY_UUID);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock");
+ gf_asprintf (&op_errstr,
+ "Another transaction is in progress. "
+ "Please try again after sometime.");
+ goto out;
+ }
+ } else {
+
+ /* If no volname is given as a part of the command, locks will
+ * not be held */
+ ret = dict_get_str (op_ctx, "volname", &tmp);
+ if (ret) {
+ gf_log ("", GF_LOG_DEBUG, "Failed to get volume "
+ "name");
+ goto local_locking_done;
+ } else {
+ /* Use a copy of volname, as cli response will be
+ * sent before the unlock, and the volname in the
+ * dict, might be removed */
+ volname = gf_strdup (tmp);
+ if (!volname)
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_lock (volname, MY_UUID, "vol");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock for %s", volname);
+ gf_asprintf (&op_errstr,
+ "Another transaction is in progress "
+ "for %s. Please try again after sometime.",
+ volname);
+ goto out;
+ }
+ }
+
+ is_acquired = _gf_true;
+
+local_locking_done:
+
INIT_LIST_HEAD (&conf->xaction_peers);
+
npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op);
- ret = gd_lock_op_phase (&conf->xaction_peers, op, op_ctx, &op_errstr, npeers);
- if (ret)
- goto out;
+ /* If no volname is given as a part of the command, locks will
+ * not be held */
+ if (volname || (conf->op_version < GD_OP_VERSION_4)) {
+ ret = gd_lock_op_phase (conf, op, op_ctx, &op_errstr,
+ npeers, *txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Locking Peers Failed.");
+ goto out;
+ }
+ }
ret = glusterd_op_build_payload (&req_dict, &op_errstr, op_ctx);
if (ret) {
@@ -1067,14 +1646,34 @@ gd_sync_task_begin (dict_t *op_ctx, rpcsvc_request_t * req)
ret = 0;
out:
- (void) gd_unlock_op_phase (&conf->xaction_peers, op, ret, req,
- op_ctx, op_errstr, npeers);
+ op_ret = ret;
+ if (txn_id) {
+ (void) gd_unlock_op_phase (conf, op, &op_ret, req,
+ op_ctx, op_errstr,
+ npeers, volname,
+ is_acquired, *txn_id);
+
+ /* Clearing the transaction opinfo */
+ ret = glusterd_clear_txn_opinfo (txn_id);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to clear transaction's "
+ "opinfo for transaction ID : %s",
+ uuid_utoa (*txn_id));
+ }
+
+ glusterd_op_send_cli_response (op, op_ret, 0, req, op_ctx, op_errstr);
+
+ if (volname)
+ GF_FREE (volname);
if (req_dict)
dict_unref (req_dict);
- if (op_errstr)
+ if (op_errstr) {
GF_FREE (op_errstr);
+ op_errstr = NULL;
+ }
return;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
index 9318862e5..e83ea2f4c 100644
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.h
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
@@ -11,39 +11,40 @@
#define __RPC_SYNCOP_H
#include "syncop.h"
+#include "glusterd-sm.h"
+#include "glusterd.h"
#define GD_SYNC_OPCODE_KEY "sync-mgmt-operation"
/* gd_syncop_* */
-#define GD_SYNCOP(rpc, stb, cbk, req, prog, procnum, xdrproc) do { \
- int ret = 0; \
- struct synctask *task = NULL; \
- glusterd_conf_t *conf= THIS->private; \
- \
- task = synctask_get (); \
- stb->task = task; \
- \
- /*This is to ensure that the brick_op_cbk is able to \
- * take the big lock*/ \
- synclock_unlock (&conf->big_lock); \
- ret = gd_syncop_submit_request (rpc, req, stb, \
- prog, procnum, cbk, \
- (xdrproc_t)xdrproc); \
- if (!ret) \
- synctask_yield (stb->task); \
- synclock_lock (&conf->big_lock); \
+#define GD_SYNCOP(rpc, stb, cookie, cbk, req, prog, procnum, xdrproc) do { \
+ int ret = 0; \
+ struct synctask *task = NULL; \
+ glusterd_conf_t *conf= THIS->private; \
+ \
+ task = synctask_get (); \
+ stb->task = task; \
+ \
+ /*This is to ensure that the brick_op_cbk is able to \
+ * take the big lock*/ \
+ synclock_unlock (&conf->big_lock); \
+ ret = gd_syncop_submit_request (rpc, req, stb, cookie, \
+ prog, procnum, cbk, \
+ (xdrproc_t)xdrproc); \
+ if (!ret) \
+ synctask_yield (stb->task); \
+ synclock_lock (&conf->big_lock); \
} while (0)
-int gd_syncop_submit_request (struct rpc_clnt *rpc, void *req,
- void *cookie, rpc_clnt_prog_t *prog,
- int procnum, fop_cbk_fn_t cbkfn,
- xdrproc_t xdrproc);
+int gd_syncop_submit_request (struct rpc_clnt *rpc, void *req, void *local,
+ void *cookie, rpc_clnt_prog_t *prog, int procnum,
+ fop_cbk_fn_t cbkfn, xdrproc_t xdrproc);
-int gd_syncop_mgmt_lock (struct rpc_clnt *rpc, struct syncargs *arg,
+int gd_syncop_mgmt_lock (glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
uuid_t my_uuid, uuid_t recv_uuid);
-int gd_syncop_mgmt_unlock (struct rpc_clnt *rpc, struct syncargs *arg,
+int gd_syncop_mgmt_unlock (glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
uuid_t my_uuid, uuid_t recv_uuid);
int gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, struct syncargs *arg,
uuid_t my_uuid, uuid_t recv_uuid, int op,
@@ -51,4 +52,20 @@ int gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, struct syncargs *arg,
int gd_syncop_mgmt_commit_op (struct rpc_clnt *rpc, struct syncargs *arg,
uuid_t my_uuid, uuid_t recv_uuid, int op,
dict_t *dict_out, dict_t *op_ctx);
+
+void
+gd_synctask_barrier_wait (struct syncargs *args, int count);
+
+int
+gd_build_peers_list (struct list_head *peers, struct list_head *xact_peers,
+ glusterd_op_t op);
+int
+gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr);
+
+int
+glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp);
+
+void
+gd_syncargs_init (struct syncargs *args, dict_t *op_ctx);
#endif /* __RPC_SYNCOP_H */
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index ad19484a1..7883a98bf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -11,9 +11,14 @@
#define _CONFIG_H
#include "config.h"
#endif
-#include <openssl/md5.h>
#include <inttypes.h>
+#if !defined(__NetBSD__) && !defined(GF_DARWIN_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+
#include "globals.h"
#include "glusterfs.h"
#include "compat.h"
@@ -36,13 +41,15 @@
#include "glusterd-store.h"
#include "glusterd-volgen.h"
#include "glusterd-pmap.h"
+#include "glusterfs-acl.h"
+#include "glusterd-syncop.h"
+#include "glusterd-locks.h"
#include "xdr-generic.h"
#include <sys/resource.h>
#include <inttypes.h>
#include <signal.h>
#include <sys/types.h>
-#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <rpc/pmap_clnt.h>
@@ -50,9 +57,8 @@
#include <fnmatch.h>
#include <sys/statvfs.h>
#include <ifaddrs.h>
-
-#ifdef GF_LINUX_HOST_OS
-#include <mntent.h>
+#ifdef HAVE_BD_XLATOR
+#include <lvm2app.h>
#endif
#ifdef GF_SOLARIS_HOST_OS
@@ -74,17 +80,19 @@
static glusterd_lock_t lock;
-static void
-md5_wrapper(const unsigned char *data, size_t len, char *md5)
+char*
+gd_peer_uuid_str (glusterd_peerinfo_t *peerinfo)
{
- unsigned short i = 0;
- unsigned short lim = MD5_DIGEST_LENGTH*2+1;
- unsigned char scratch[MD5_DIGEST_LENGTH] = {0,};
- MD5(data, len, scratch);
- for (; i < MD5_DIGEST_LENGTH; i++)
- snprintf(md5 + i * 2, lim-i*2, "%02x", scratch[i]);
+ if ((peerinfo == NULL) || uuid_is_null (peerinfo->uuid))
+ return NULL;
+
+ if (peerinfo->uuid_str[0] == '\0')
+ uuid_utoa_r (peerinfo->uuid, peerinfo->uuid_str);
+
+ return peerinfo->uuid_str;
}
+
int32_t
glusterd_get_lock_owner (uuid_t *uuid)
{
@@ -122,189 +130,6 @@ glusterd_is_fuse_available ()
return _gf_false;
}
-gf_boolean_t
-glusterd_is_loopback_localhost (const struct sockaddr *sa, char *hostname)
-{
- GF_ASSERT (sa);
-
- gf_boolean_t is_local = _gf_false;
- const struct in_addr *addr4 = NULL;
- const struct in6_addr *addr6 = NULL;
- uint8_t *ap = NULL;
- struct in6_addr loopbackaddr6 = IN6ADDR_LOOPBACK_INIT;
-
- switch (sa->sa_family) {
- case AF_INET:
- addr4 = &(((struct sockaddr_in *)sa)->sin_addr);
- ap = (uint8_t*)&addr4->s_addr;
- if (ap[0] == 127)
- is_local = _gf_true;
- break;
-
- case AF_INET6:
- addr6 = &(((struct sockaddr_in6 *)sa)->sin6_addr);
- if (memcmp (addr6, &loopbackaddr6,
- sizeof (loopbackaddr6)) == 0)
- is_local = _gf_true;
- break;
-
- default:
- if (hostname)
- gf_log ("glusterd", GF_LOG_ERROR,
- "unknown address family %d for %s",
- sa->sa_family, hostname);
- break;
- }
-
- return is_local;
-}
-
-char *
-get_ip_from_addrinfo (struct addrinfo *addr, char **ip)
-{
- char buf[64];
- void *in_addr = NULL;
- struct sockaddr_in *s4 = NULL;
- struct sockaddr_in6 *s6 = NULL;
-
- switch (addr->ai_family)
- {
- case AF_INET:
- s4 = (struct sockaddr_in *)addr->ai_addr;
- in_addr = &s4->sin_addr;
- break;
-
- case AF_INET6:
- s6 = (struct sockaddr_in6 *)addr->ai_addr;
- in_addr = &s6->sin6_addr;
- break;
-
- default:
- gf_log ("glusterd", GF_LOG_ERROR, "Invalid family");
- return NULL;
- }
-
- if (!inet_ntop(addr->ai_family, in_addr, buf, sizeof(buf))) {
- gf_log ("glusterd", GF_LOG_ERROR, "String conversion failed");
- return NULL;
- }
-
- *ip = strdup (buf);
- return *ip;
-}
-
-gf_boolean_t
-glusterd_interface_search (char *ip)
-{
- int32_t ret = -1;
- gf_boolean_t found = _gf_false;
- struct ifaddrs *ifaddr, *ifa;
- int family;
- char host[NI_MAXHOST];
- xlator_t *this = NULL;
- char *pct = NULL;
-
- this = THIS;
-
- ret = getifaddrs (&ifaddr);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "getifaddrs() failed: %s\n",
- gai_strerror(ret));
- goto out;
- }
-
- for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
- if (!ifa->ifa_addr) {
- /*
- * This seemingly happens if an interface hasn't
- * been bound to a particular protocol (seen with
- * TUN devices).
- */
- continue;
- }
- family = ifa->ifa_addr->sa_family;
-
- if (family != AF_INET && family != AF_INET6)
- continue;
-
- ret = getnameinfo (ifa->ifa_addr,
- (family == AF_INET) ? sizeof(struct sockaddr_in) :
- sizeof(struct sockaddr_in6),
- host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "getnameinfo() failed: %s\n",
- gai_strerror(ret));
- goto out;
- }
-
- /*
- * Sometimes the address comes back as addr%eth0 or
- * similar. Since % is an invalid character, we can
- * strip it out with confidence that doing so won't
- * harm anything.
- */
- pct = index(host,'%');
- if (pct) {
- *pct = '\0';
- }
-
- if (strncmp (ip, host, NI_MAXHOST) == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s is local address at interface %s",
- ip, ifa->ifa_name);
- found = _gf_true;
- goto out;
- }
- }
-out:
- if(ifaddr)
- freeifaddrs (ifaddr);
- return found;
-}
-
-
-gf_boolean_t
-glusterd_is_local_addr (char *hostname)
-{
- int32_t ret = -1;
- struct addrinfo *result = NULL;
- struct addrinfo *res = NULL;
- gf_boolean_t found = _gf_false;
- char *ip = NULL;
- xlator_t *this = NULL;
-
- this = THIS;
- ret = getaddrinfo (hostname, NULL, NULL, &result);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "error in getaddrinfo: %s\n",
- gai_strerror(ret));
- goto out;
- }
-
- for (res = result; res != NULL; res = res->ai_next) {
- gf_log (this->name, GF_LOG_DEBUG, "%s ",
- get_ip_from_addrinfo (res, &ip));
-
- found = glusterd_is_loopback_localhost (res->ai_addr, hostname)
- || glusterd_interface_search (ip);
- if (found)
- goto out;
- }
-
-out:
- if (result)
- freeaddrinfo (result);
-
- if (!found)
- gf_log (this->name, GF_LOG_DEBUG, "%s is not local", hostname);
-
- return found;
-}
-
int32_t
glusterd_lock (uuid_t uuid)
{
@@ -402,10 +227,11 @@ glusterd_get_uuid (uuid_t *uuid)
}
int
-glusterd_submit_request (struct rpc_clnt *rpc, void *req,
- call_frame_t *frame, rpc_clnt_prog_t *prog,
- int procnum, struct iobref *iobref,
- xlator_t *this, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+glusterd_submit_request_unlocked (struct rpc_clnt *rpc, void *req,
+ call_frame_t *frame, rpc_clnt_prog_t *prog,
+ int procnum, struct iobref *iobref,
+ xlator_t *this, fop_cbk_fn_t cbkfn,
+ xdrproc_t xdrproc)
{
int ret = -1;
struct iobuf *iobuf = NULL;
@@ -476,6 +302,28 @@ out:
return ret;
}
+
+/*
+ * Wrapper around glusterd_submit_request_unlocked() that releases
+ * conf->big_lock before submitting the request and takes it again once the
+ * submit call has returned.
+ * NOTE(review): presumably the caller is expected to hold big_lock on
+ * entry -- confirm against callers.
+ *
+ * All arguments are forwarded unchanged.
+ *
+ * @return whatever glusterd_submit_request_unlocked() returns
+ */
+int
+glusterd_submit_request (struct rpc_clnt *rpc, void *req,
+                         call_frame_t *frame, rpc_clnt_prog_t *prog,
+                         int procnum, struct iobref *iobref,
+                         xlator_t *this, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+{
+        int              submit_ret = -1;
+        glusterd_conf_t *conf       = NULL;
+
+        conf = THIS->private;
+
+        synclock_unlock (&conf->big_lock);
+        submit_ret = glusterd_submit_request_unlocked (rpc, req, frame, prog,
+                                                       procnum, iobref, this,
+                                                       cbkfn, xdrproc);
+        synclock_lock (&conf->big_lock);
+
+        return submit_ret;
+}
+
+
struct iobuf *
glusterd_serialize_reply (rpcsvc_request_t *req, void *arg,
struct iovec *outmsg, xdrproc_t xdrproc)
@@ -598,6 +446,37 @@ glusterd_check_volume_exists (char *volname)
return _gf_true;
}
+/*
+ * Drop one reference on @volinfo. The counter is decremented while holding
+ * volinfo->reflock.
+ *
+ * @return NULL when the count reached zero (the volinfo has then been handed
+ *         to glusterd_volinfo_delete()), otherwise @volinfo itself
+ */
+glusterd_volinfo_t *
+glusterd_volinfo_unref (glusterd_volinfo_t *volinfo)
+{
+        int remaining = -1;
+
+        pthread_mutex_lock (&volinfo->reflock);
+        volinfo->refcnt--;
+        remaining = volinfo->refcnt;
+        pthread_mutex_unlock (&volinfo->reflock);
+
+        if (remaining != 0)
+                return volinfo;
+
+        /* Last reference is gone: tear the object down. */
+        glusterd_volinfo_delete (volinfo);
+        return NULL;
+}
+
+/*
+ * Take one additional reference on @volinfo. The counter is incremented
+ * while holding volinfo->reflock.
+ *
+ * @return @volinfo, so the call can be used inline in an assignment
+ */
+glusterd_volinfo_t *
+glusterd_volinfo_ref (glusterd_volinfo_t *volinfo)
+{
+        pthread_mutex_lock (&volinfo->reflock);
+        volinfo->refcnt += 1;
+        pthread_mutex_unlock (&volinfo->reflock);
+
+        return volinfo;
+}
+
int32_t
glusterd_volinfo_new (glusterd_volinfo_t **volinfo)
{
@@ -612,8 +491,11 @@ glusterd_volinfo_new (glusterd_volinfo_t **volinfo)
if (!new_volinfo)
goto out;
+ LOCK_INIT (&new_volinfo->lock);
INIT_LIST_HEAD (&new_volinfo->vol_list);
+ INIT_LIST_HEAD (&new_volinfo->snapvol_list);
INIT_LIST_HEAD (&new_volinfo->bricks);
+ INIT_LIST_HEAD (&new_volinfo->snap_volumes);
new_volinfo->dict = dict_new ();
if (!new_volinfo->dict) {
@@ -629,9 +511,14 @@ glusterd_volinfo_new (glusterd_volinfo_t **volinfo)
goto out;
}
+ snprintf (new_volinfo->parent_volname, GLUSTERD_MAX_VOLUME_NAME, "N/A");
+
+ new_volinfo->snap_max_hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+
new_volinfo->xl = THIS;
- *volinfo = new_volinfo;
+ pthread_mutex_init (&new_volinfo->reflock, NULL);
+ *volinfo = glusterd_volinfo_ref (new_volinfo);
ret = 0;
@@ -640,6 +527,220 @@ out:
return ret;
}
+/* Create a new volinfo and copy a subset of @volinfo into it: the volume
+ * type, the replica/stripe/dist-leaf/sub/brick counts, both transport
+ * types and the contents of the option dictionary. The brick list itself
+ * is not copied here.
+ *
+ * @param volinfo       volinfo which will be duplicated
+ * @param dup_volinfo   out-parameter; receives the new volinfo on success
+ * @param set_userauth  if true, the auth username and password are copied
+ *                      as well
+ *
+ * @return 0 on success, non-zero on failure. On failure a partially built
+ *         copy is deleted and *dup_volinfo is left untouched.
+ */
+int32_t
+glusterd_volinfo_dup (glusterd_volinfo_t *volinfo,
+                      glusterd_volinfo_t **dup_volinfo,
+                      gf_boolean_t set_userauth)
+{
+        int32_t             ret  = -1;
+        xlator_t           *this = THIS;
+        glusterd_volinfo_t *copy = NULL;
+
+        GF_ASSERT (this);
+        GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+        GF_VALIDATE_OR_GOTO (this->name, dup_volinfo, out);
+
+        ret = glusterd_volinfo_new (&copy);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "not able to create the duplicate volinfo for the "
+                        "volume %s", volinfo->volname);
+                goto out;
+        }
+
+        /* Plain scalar members. */
+        copy->type               = volinfo->type;
+        copy->transport_type     = volinfo->transport_type;
+        copy->nfs_transport_type = volinfo->nfs_transport_type;
+        copy->brick_count        = volinfo->brick_count;
+        copy->sub_count          = volinfo->sub_count;
+        copy->replica_count      = volinfo->replica_count;
+        copy->stripe_count       = volinfo->stripe_count;
+        copy->dist_leaf_count    = volinfo->dist_leaf_count;
+
+        /* Options first, then recompute the op-versions from them. */
+        dict_copy (volinfo->dict, copy->dict);
+        gd_update_volume_op_versions (copy);
+
+        if (set_userauth) {
+                glusterd_auth_set_username (copy, volinfo->auth.username);
+                glusterd_auth_set_password (copy, volinfo->auth.password);
+        }
+
+        *dup_volinfo = copy;
+        ret = 0;
+out:
+        if ((ret != 0) && (copy != NULL))
+                (void) glusterd_volinfo_delete (copy);
+
+        return ret;
+}
+
+/* Copy the contents of one brickinfo into another, already allocated one.
+ * The copied brick path is run through gf_canonicalize_path().
+ *
+ * @param brickinfo      Source brickinfo
+ * @param dup_brickinfo  Destination brickinfo (must already exist)
+ *
+ * @return 0 on success, non-zero on failure. On failure the destination
+ *         may have been partially filled in.
+ */
+int32_t
+glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo,
+                        glusterd_brickinfo_t *dup_brickinfo)
+{
+        int32_t   ret  = -1;
+        xlator_t *this = THIS;
+
+        GF_ASSERT (this);
+
+        GF_VALIDATE_OR_GOTO (this->name, brickinfo, out);
+        GF_VALIDATE_OR_GOTO (this->name, dup_brickinfo, out);
+
+        strcpy (dup_brickinfo->hostname, brickinfo->hostname);
+        strcpy (dup_brickinfo->path, brickinfo->path);
+        strcpy (dup_brickinfo->device_path, brickinfo->device_path);
+
+        ret = gf_canonicalize_path (dup_brickinfo->path);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to canonicalize brick path");
+                goto out;
+        }
+
+        uuid_copy (dup_brickinfo->uuid, brickinfo->uuid);
+        dup_brickinfo->port      = brickinfo->port;
+        dup_brickinfo->rdma_port = brickinfo->rdma_port;
+
+        /* The logfile name is optional; when set, it gets its own copy. */
+        if (brickinfo->logfile != NULL) {
+                dup_brickinfo->logfile = gf_strdup (brickinfo->logfile);
+                if (dup_brickinfo->logfile == NULL) {
+                        ret = -1;
+                        goto out;
+                }
+        }
+
+        strcpy (dup_brickinfo->brick_id, brickinfo->brick_id);
+        dup_brickinfo->status      = brickinfo->status;
+        dup_brickinfo->snap_status = brickinfo->snap_status;
+out:
+        return ret;
+}
+
+/* This function copies every brick of the snap volinfo into the
+ * passed new volinfo and then regenerates the volfiles for it via
+ * glusterd_create_volfiles_and_notify_services().
+ *
+ * For each brick of snap_volinfo a new brickinfo is allocated and filled
+ * with glusterd_brickinfo_dup(). Bricks whose uuid matches MY_UUID and
+ * whose snap_status is not -1 get their GF_XATTR_VOL_ID_KEY xattr replaced
+ * with new_volinfo->volume_id. Bricks whose snap_status is -1 are recorded
+ * in rsp_dict through glusterd_add_missed_snaps_to_dict().
+ *
+ * @param rsp_dict dict passed to glusterd_add_missed_snaps_to_dict()
+ * @param new_volinfo new volinfo; receives the duplicated bricks
+ * @param snap_volinfo volinfo of snap volume
+ *
+ * @return 0 on success; on failure the non-zero return value of the
+ * failing call is propagated (-1 for the argument checks)
+ *
+ * TODO: Duplicate all members of volinfo, e.g. geo-rep sync slaves
+ */
+int32_t
+glusterd_snap_volinfo_restore (dict_t *rsp_dict,
+ glusterd_volinfo_t *new_volinfo,
+ glusterd_volinfo_t *snap_volinfo)
+{
+ int32_t brick_count = -1;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_brickinfo_t *new_brickinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+
+ GF_VALIDATE_OR_GOTO (this->name, new_volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, snap_volinfo, out);
+
+ brick_count = 0;
+ list_for_each_entry (brickinfo, &snap_volinfo->bricks, brick_list) {
+ /* new_brickinfo stays non-NULL until it is linked into
+ * new_volinfo below, so the cleanup at 'out' can free it
+ * if any step in between fails. */
+ ret = glusterd_brickinfo_new (&new_brickinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create "
+ "new brickinfo");
+ goto out;
+ }
+
+ /* Duplicate brickinfo */
+ ret = glusterd_brickinfo_dup (brickinfo, new_brickinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to dup "
+ "brickinfo");
+ goto out;
+ }
+
+ /* If the brick is not of this peer, or snapshot is missed *
+ * for the brick do not replace the xattr for it */
+ if ((!uuid_compare (brickinfo->uuid, MY_UUID)) &&
+ (brickinfo->snap_status != -1)) {
+ /* We need to replace the volume id of all the bricks
+ * to the volume id of the origin volume. new_volinfo
+ * has the origin volume's volume id*/
+ /* XATTR_REPLACE: the call fails if the xattr is not
+ * already present on the brick path. */
+ ret = sys_lsetxattr (new_brickinfo->path,
+ GF_XATTR_VOL_ID_KEY,
+ new_volinfo->volume_id,
+ sizeof (new_volinfo->volume_id),
+ XATTR_REPLACE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "set extended attribute %s on %s. "
+ "Reason: %s, snap: %s",
+ GF_XATTR_VOL_ID_KEY,
+ new_brickinfo->path, strerror (errno),
+ new_volinfo->volname);
+ goto out;
+ }
+ }
+
+ /* If a snapshot is pending for this brick then
+ * restore should also be pending
+ */
+ if (brickinfo->snap_status == -1) {
+ /* Record this brick in rsp_dict as a missed snapshot
+ * operation tagged GF_SNAP_OPTION_TYPE_RESTORE; the
+ * brick number passed is 1-based (brick_count + 1). */
+ ret = glusterd_add_missed_snaps_to_dict
+ (rsp_dict,
+ snap_volinfo,
+ brickinfo,
+ brick_count + 1,
+ GF_SNAP_OPTION_TYPE_RESTORE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add missed snapshot info "
+ "for %s:%s in the rsp_dict",
+ brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+ }
+
+ list_add_tail (&new_brickinfo->brick_list,
+ &new_volinfo->bricks);
+ /* ownership of new_brickinfo is passed to new_volinfo */
+ new_brickinfo = NULL;
+ brick_count++;
+ }
+
+ /* Regenerate all volfiles */
+ ret = glusterd_create_volfiles_and_notify_services (new_volinfo);
+
+out:
+ /* Only a brickinfo that was allocated but not yet linked into
+ * new_volinfo is freed here; bricks already added stay on the list. */
+ if (ret && (NULL != new_brickinfo)) {
+ (void) glusterd_brickinfo_delete (new_brickinfo);
+ }
+
+ return ret;
+}
+
void
glusterd_auth_cleanup (glusterd_volinfo_t *volinfo) {
@@ -724,6 +825,14 @@ out:
return ret;
}
+/*
+ * Unlink @volinfo from the list it is chained on through vol_list and drop
+ * one reference on it.
+ *
+ * @return always 0
+ */
+int
+glusterd_volinfo_remove (glusterd_volinfo_t *volinfo)
+{
+        list_del_init (&volinfo->vol_list);
+        (void) glusterd_volinfo_unref (volinfo);
+
+        return 0;
+}
+
int32_t
glusterd_volinfo_delete (glusterd_volinfo_t *volinfo)
{
@@ -732,6 +841,7 @@ glusterd_volinfo_delete (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
list_del_init (&volinfo->vol_list);
+ list_del_init (&volinfo->snapvol_list);
ret = glusterd_volume_brickinfos_delete (volinfo);
if (ret)
@@ -741,9 +851,14 @@ glusterd_volinfo_delete (glusterd_volinfo_t *volinfo)
if (volinfo->gsync_slaves)
dict_unref (volinfo->gsync_slaves);
GF_FREE (volinfo->logdir);
+ if (volinfo->rebal.dict)
+ dict_unref (volinfo->rebal.dict);
+
+ gf_store_handle_destroy (volinfo->quota_conf_shandle);
glusterd_auth_cleanup (volinfo);
+ pthread_mutex_destroy (&volinfo->reflock);
GF_FREE (volinfo);
ret = 0;
@@ -752,7 +867,6 @@ out:
return ret;
}
-
int32_t
glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo)
{
@@ -778,6 +892,30 @@ out:
return ret;
}
+/*
+ * Compute the next free numeric brick id for @volinfo.
+ *
+ * Every brick's brick_id is expected to end in "-<number>"; the text after
+ * the last '-' is parsed with gf_string2int32() and the largest value seen
+ * is tracked.
+ *
+ * @return largest id found plus one (0 when the volume has no bricks), or a
+ *         negative value when a brick_id has no '-' separator or its suffix
+ *         cannot be parsed as an integer
+ */
+int
+glusterd_get_next_available_brickid (glusterd_volinfo_t *volinfo)
+{
+        glusterd_brickinfo_t *brickinfo   = NULL;
+        char                 *token       = NULL;
+        int                   brickid     = 0;
+        int                   max_brickid = -1;
+        int                   ret         = -1;
+
+        list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+                token = strrchr (brickinfo->brick_id, '-');
+                if (!token) {
+                        /* strrchr() returns NULL when no '-' is present;
+                         * stepping past it would dereference an invalid
+                         * pointer. */
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "Unable to generate brick ID");
+                        return -1;
+                }
+
+                ret = gf_string2int32 (token + 1, &brickid);
+                if (ret < 0) {
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "Unable to generate brick ID");
+                        return ret;
+                }
+                if (brickid > max_brickid)
+                        max_brickid = brickid;
+        }
+
+        return max_brickid + 1;
+}
+
int32_t
glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo)
{
@@ -804,6 +942,7 @@ glusterd_brickinfo_new_from_brick (char *brick,
char *path = NULL;
char *tmp_host = NULL;
char *tmp_path = NULL;
+ char *vg = NULL;
GF_ASSERT (brick);
GF_ASSERT (brickinfo);
@@ -822,6 +961,17 @@ glusterd_brickinfo_new_from_brick (char *brick,
if (ret)
goto out;
+#ifdef HAVE_BD_XLATOR
+ vg = strchr (path, '?');
+ /* ? is used as a delimiter for vg */
+ if (vg) {
+ strncpy (new_brickinfo->vg, vg + 1, PATH_MAX - 1);
+ *vg = '\0';
+ }
+ new_brickinfo->caps = CAPS_BD;
+#else
+ vg = NULL; /* Avoid compiler warnings when BD not enabled */
+#endif
ret = gf_canonicalize_path (path);
if (ret)
goto out;
@@ -925,6 +1075,62 @@ out:
return available;
}
+#ifdef HAVE_BD_XLATOR
+/*
+ * Sets the tag of the format "trusted.glusterfs.volume-id:<uuid>" in
+ * the brick VG. It is used to avoid using same VG for another brick.
+ *
+ * @volume_id - volume id used to build the tag
+ * @brick     - brick info; brick->vg names the volume group to tag
+ * @msg       - buffer that receives an error message on failure
+ * @msg_size  - size of @msg in bytes
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+glusterd_bd_set_vg_tag (unsigned char *volume_id, glusterd_brickinfo_t *brick,
+                        char *msg, int msg_size)
+{
+        lvm_t  handle = NULL;
+        vg_t   vg     = NULL;
+        char  *uuid   = NULL;
+        int    ret    = -1;
+
+        gf_asprintf (&uuid, "%s:%s", GF_XATTR_VOL_ID_KEY,
+                     uuid_utoa (volume_id));
+        if (!uuid) {
+                /* msg is a pointer, so sizeof(*msg) would be 1 and leave
+                 * room for the terminating NUL only; use the size the
+                 * caller passed in. */
+                snprintf (msg, msg_size, "Could not allocate memory "
+                          "for tag");
+                return -1;
+        }
+
+        handle = lvm_init (NULL);
+        if (!handle) {
+                snprintf (msg, msg_size, "lvm_init failed");
+                goto out;
+        }
+
+        vg = lvm_vg_open (handle, brick->vg, "w", 0);
+        if (!vg) {
+                snprintf (msg, msg_size, "Could not open VG %s",
+                          brick->vg);
+                goto out;
+        }
+
+        if (lvm_vg_add_tag (vg, uuid) < 0) {
+                snprintf (msg, msg_size, "Could not set tag %s for "
+                          "VG %s", uuid, brick->vg);
+                goto out;
+        }
+
+        /* The tag only takes effect once the VG metadata is written out,
+         * so a failed write is a failure to set the tag. */
+        if (lvm_vg_write (vg) < 0) {
+                snprintf (msg, msg_size, "Could not write tag %s to "
+                          "VG %s", uuid, brick->vg);
+                goto out;
+        }
+        ret = 0;
+out:
+        GF_FREE (uuid);
+
+        if (vg)
+                lvm_vg_close (vg);
+        if (handle)
+                lvm_quit (handle);
+
+        return ret;
+}
+#endif
+
int
glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
uuid_t volume_id, char **op_errstr,
@@ -995,7 +1201,7 @@ glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
goto out;
}
else if (parent_st.st_dev == root_st.st_dev) {
- snprintf (msg, sizeof (msg), "The brick %s:%s is "
+ snprintf (msg, sizeof (msg), "The brick %s:%s "
"is being created in the root partition. It "
"is recommended that you don't use the "
"system's root partition for storage backend."
@@ -1007,9 +1213,17 @@ glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
}
}
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0]) {
+ ret = glusterd_bd_set_vg_tag (volume_id, brickinfo, msg,
+ sizeof(msg));
+ if (ret)
+ goto out;
+ }
+#endif
ret = glusterd_check_and_set_brick_xattr (brickinfo->hostname,
brickinfo->path, volume_id,
- op_errstr);
+ op_errstr, is_force);
if (ret)
goto out;
@@ -1115,16 +1329,19 @@ glusterd_friend_cleanup (glusterd_peerinfo_t *peerinfo)
GF_ASSERT (peerinfo);
glusterd_peerctx_t *peerctx = NULL;
gf_boolean_t quorum_action = _gf_false;
+ glusterd_conf_t *priv = THIS->private;
if (peerinfo->quorum_contrib != QUORUM_NONE)
quorum_action = _gf_true;
if (peerinfo->rpc) {
/* cleanup the saved-frames before last unref */
+ synclock_unlock (&priv->big_lock);
rpc_clnt_connection_cleanup (&peerinfo->rpc->conn);
+ synclock_lock (&priv->big_lock);
peerctx = peerinfo->rpc->mydata;
peerinfo->rpc->mydata = NULL;
- peerinfo->rpc = rpc_clnt_unref (peerinfo->rpc);
+ peerinfo->rpc = glusterd_rpc_clnt_unref (priv, peerinfo->rpc);
peerinfo->rpc = NULL;
if (peerctx) {
GF_FREE (peerctx->errstr);
@@ -1138,6 +1355,68 @@ glusterd_friend_cleanup (glusterd_peerinfo_t *peerinfo)
return 0;
}
+/*
+ * Look up a volume in priv->volumes by its volume id.
+ *
+ * @param volume_id  id to search for; a NULL pointer yields -1
+ * @param volinfo    out-parameter; set to the matching volinfo on success
+ *
+ * @return 0 when a volume with that id is found, -1 otherwise
+ */
+int
+glusterd_volinfo_find_by_volume_id (uuid_t volume_id, glusterd_volinfo_t **volinfo)
+{
+        xlator_t           *this = NULL;
+        glusterd_conf_t    *conf = NULL;
+        glusterd_volinfo_t *vol  = NULL;
+
+        if (!volume_id)
+                return -1;
+
+        this = THIS;
+        conf = this->private;
+
+        list_for_each_entry (vol, &conf->volumes, vol_list) {
+                if (uuid_compare (volume_id, vol->volume_id) == 0) {
+                        *volinfo = vol;
+                        gf_log (this->name, GF_LOG_DEBUG, "Volume %s found",
+                                vol->volname);
+                        return 0;
+                }
+        }
+
+        return -1;
+}
+
+/*
+ * Look up a snapshot volume by volume id. Every snapshot in
+ * priv->snapshots is walked and each of its volumes is compared against
+ * @volume_id.
+ *
+ * @param volume_id  id to search for; a null uuid is rejected
+ * @param volinfo    out-parameter; set to the matching snap volinfo
+ *
+ * @return 0 when found, -1 otherwise
+ */
+int
+glusterd_snap_volinfo_find_by_volume_id (uuid_t volume_id,
+                                         glusterd_volinfo_t **volinfo)
+{
+        int32_t             ret      = -1;
+        xlator_t           *this     = THIS;
+        glusterd_conf_t    *conf     = this->private;
+        glusterd_snap_t    *snap     = NULL;
+        glusterd_volinfo_t *snap_vol = NULL;
+
+        GF_ASSERT (conf);
+        GF_ASSERT (volinfo);
+
+        if (uuid_is_null (volume_id)) {
+                gf_log (this->name, GF_LOG_WARNING, "Volume UUID is NULL");
+                goto out;
+        }
+
+        list_for_each_entry (snap, &conf->snapshots, snap_list) {
+                list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
+                        if (uuid_compare (volume_id,
+                                          snap_vol->volume_id) == 0) {
+                                *volinfo = snap_vol;
+                                ret = 0;
+                                goto out;
+                        }
+                }
+        }
+
+        gf_log (this->name, GF_LOG_WARNING, "Snap volume not found");
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
int32_t
glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo)
{
@@ -1147,7 +1426,6 @@ glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo)
glusterd_conf_t *priv = NULL;
GF_ASSERT (volname);
-
this = THIS;
GF_ASSERT (this);
@@ -1156,7 +1434,8 @@ glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo)
list_for_each_entry (tmp_volinfo, &priv->volumes, vol_list) {
if (!strcmp (tmp_volinfo->volname, volname)) {
- gf_log (this->name, GF_LOG_DEBUG, "Volume %s found", volname);
+ gf_log (this->name, GF_LOG_DEBUG, "Volume %s found",
+ volname);
ret = 0;
*volinfo = tmp_volinfo;
break;
@@ -1168,6 +1447,68 @@ glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo)
}
+/*
+ * Find the volume named @snap_volname among the volumes of @snap.
+ *
+ * @param snap_volname  volume name to match against volinfo->volname
+ * @param snap          snapshot whose volume list is searched
+ * @param volinfo       out-parameter; set to the matching volinfo
+ *
+ * @return 0 when found, -1 otherwise
+ */
int32_t
+glusterd_snap_volinfo_find (char *snap_volname, glusterd_snap_t *snap,
+                            glusterd_volinfo_t **volinfo)
+{
+        int32_t             ret       = -1;
+        xlator_t           *this      = THIS;
+        glusterd_conf_t    *conf      = this->private;
+        glusterd_volinfo_t *candidate = NULL;
+
+        GF_ASSERT (conf);
+        GF_ASSERT (snap);
+        GF_ASSERT (snap_volname);
+
+        list_for_each_entry (candidate, &snap->volumes, vol_list) {
+                if (strcmp (candidate->volname, snap_volname) != 0)
+                        continue;
+
+                ret = 0;
+                *volinfo = candidate;
+                goto out;
+        }
+
+        gf_log (this->name, GF_LOG_WARNING, "Snap volume %s not found",
+                snap_volname);
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/*
+ * Find, among the volumes of @snap, the one whose parent_volname equals
+ * @origin_volname.
+ *
+ * @param origin_volname  name of the origin (parent) volume
+ * @param snap            snapshot whose volume list is searched
+ * @param volinfo         out-parameter; set to the matching snap volinfo
+ *
+ * @return 0 when found, -1 otherwise
+ */
+int32_t
+glusterd_snap_volinfo_find_from_parent_volname (char *origin_volname,
+                                                glusterd_snap_t *snap,
+                                                glusterd_volinfo_t **volinfo)
+{
+        int32_t             ret      = -1;
+        xlator_t           *this     = NULL;
+        glusterd_volinfo_t *snap_vol = NULL;
+        glusterd_conf_t    *priv     = NULL;
+
+        this = THIS;
+        priv = this->private;
+        GF_ASSERT (priv);
+        GF_ASSERT (snap);
+        GF_ASSERT (origin_volname);
+
+        list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
+                if (!strcmp (snap_vol->parent_volname, origin_volname)) {
+                        ret = 0;
+                        *volinfo = snap_vol;
+                        goto out;
+                }
+        }
+
+        /* The closing parenthesis was missing from this message. */
+        gf_log (this->name, GF_LOG_DEBUG, "Snap volume not found (snap: %s, "
+                "origin-volume: %s)", snap->snapname, origin_volname);
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+int32_t
glusterd_service_stop (const char *service, char *pidfile, int sig,
gf_boolean_t force_kill)
{
@@ -1177,7 +1518,7 @@ glusterd_service_stop (const char *service, char *pidfile, int sig,
this = THIS;
GF_ASSERT (this);
- if (!glusterd_is_service_running (pidfile, &pid)) {
+ if (!gf_is_service_running (pidfile, &pid)) {
ret = 0;
gf_log (this->name, GF_LOG_INFO, "%s already stopped", service);
goto out;
@@ -1186,11 +1527,23 @@ glusterd_service_stop (const char *service, char *pidfile, int sig,
"%d", service, pid);
ret = kill (pid, sig);
+ if (ret) {
+ switch (errno) {
+ case ESRCH:
+ gf_log (this->name, GF_LOG_DEBUG, "%s is already stopped",
+ service);
+ ret = 0;
+ goto out;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Failed to kill %s: %s",
+ service, strerror (errno));
+ }
+ }
if (!force_kill)
goto out;
sleep (1);
- if (glusterd_is_service_running (pidfile, NULL)) {
+ if (gf_is_service_running (pidfile, NULL)) {
ret = kill (pid, SIGKILL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Unable to "
@@ -1247,22 +1600,20 @@ glusterd_set_brick_socket_filepath (glusterd_volinfo_t *volinfo,
*/
int32_t
glusterd_brick_connect (glusterd_volinfo_t *volinfo,
- glusterd_brickinfo_t *brickinfo)
+ glusterd_brickinfo_t *brickinfo, char *socketpath)
{
int ret = 0;
- char socketpath[PATH_MAX] = {0};
+ char volume_id_str[64];
+ char *brickid = NULL;
dict_t *options = NULL;
struct rpc_clnt *rpc = NULL;
glusterd_conf_t *priv = THIS->private;
GF_ASSERT (volinfo);
GF_ASSERT (brickinfo);
+ GF_ASSERT (socketpath);
if (brickinfo->rpc == NULL) {
- glusterd_set_brick_socket_filepath (volinfo, brickinfo,
- socketpath,
- sizeof (socketpath));
-
/* Setting frame-timeout to 10mins (600seconds).
* Unix domain sockets ensures that the connection is reliable.
* The default timeout of 30mins used for unreliable network
@@ -1272,30 +1623,30 @@ glusterd_brick_connect (glusterd_volinfo_t *volinfo,
600);
if (ret)
goto out;
+
+ uuid_utoa_r (volinfo->volume_id, volume_id_str);
+ ret = gf_asprintf (&brickid, "%s:%s:%s", volume_id_str,
+ brickinfo->hostname, brickinfo->path);
+ if (ret < 0)
+ goto out;
+
synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&rpc, options,
glusterd_brick_rpc_notify,
- brickinfo);
+ brickid);
synclock_lock (&priv->big_lock);
- if (ret)
+ if (ret) {
+ GF_FREE (brickid);
goto out;
+ }
brickinfo->rpc = rpc;
}
out:
+
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-/* Caller should ensure that brick process is not running*/
-static void
-_reap_brick_process (char *pidfile, char *brickpath)
-{
- unlink (pidfile);
- /* Brick process is not running and pmap may have an entry for it.*/
- pmap_registry_remove (THIS, 0, brickpath,
- GF_PMAP_PORT_BRICKSERVER, NULL);
-}
-
static int
_mk_rundir_p (glusterd_volinfo_t *volinfo)
{
@@ -1343,19 +1694,36 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
priv = this->private;
GF_ASSERT (priv);
+ if (brickinfo->snap_status == -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Snapshot is pending on %s:%s. "
+ "Hence not starting the brick",
+ brickinfo->hostname,
+ brickinfo->path);
+ ret = 0;
+ goto out;
+ }
+
ret = _mk_rundir_p (volinfo);
if (ret)
goto out;
+
+ glusterd_set_brick_socket_filepath (volinfo, brickinfo, socketpath,
+ sizeof (socketpath));
+
GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
- if (glusterd_is_service_running (pidfile, NULL))
+ if (gf_is_service_running (pidfile, NULL))
goto connect;
- _reap_brick_process (pidfile, brickinfo->path);
-
port = brickinfo->port;
if (!port)
port = pmap_registry_alloc (THIS);
+ /* Build the exp_path, before starting the glusterfsd even in
+ valgrind mode. Otherwise all the glusterfsd processes start
+ writing the valgrind log to the same file.
+ */
+ GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path);
runinit (&runner);
if (priv->valgrind) {
@@ -1378,9 +1746,15 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
runner_argprintf (&runner, "--log-file=%s", valgrind_logfile);
}
- GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path);
- snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname,
- brickinfo->hostname, exp_path);
+ if (volinfo->is_snap_volume) {
+ snprintf (volfile, PATH_MAX,"/%s/%s/%s.%s.%s",
+ GLUSTERD_VOL_SNAP_DIR_PREFIX,
+ volinfo->snapshot->snapname, volinfo->volname,
+ brickinfo->hostname, exp_path);
+ } else {
+ snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname,
+ brickinfo->hostname, exp_path);
+ }
if (volinfo->logdir) {
snprintf (logfile, PATH_MAX, "%s/%s.log",
@@ -1392,9 +1766,6 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
if (!brickinfo->logfile)
brickinfo->logfile = gf_strdup (logfile);
- glusterd_set_brick_socket_filepath (volinfo, brickinfo, socketpath,
- sizeof (socketpath));
-
(void) snprintf (glusterd_uuid, 1024, "*-posix.glusterd-uuid=%s",
uuid_utoa (MY_UUID));
runner_add_args (&runner, SBIN_DIR"/glusterfsd",
@@ -1442,9 +1813,13 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
brickinfo->rdma_port = rdma_port;
connect:
- ret = glusterd_brick_connect (volinfo, brickinfo);
- if (ret)
+ ret = glusterd_brick_connect (volinfo, brickinfo, socketpath);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to connect to brick %s:%s on %s",
+ brickinfo->hostname, brickinfo->path, socketpath);
goto out;
+ }
out:
return ret;
}
@@ -1483,15 +1858,23 @@ glusterd_brick_unlink_socket_file (glusterd_volinfo_t *volinfo,
int32_t
glusterd_brick_disconnect (glusterd_brickinfo_t *brickinfo)
{
+ rpc_clnt_t *rpc = NULL;
+ glusterd_conf_t *priv = THIS->private;
+
GF_ASSERT (brickinfo);
- if (brickinfo->rpc) {
- /* cleanup the saved-frames before last unref */
- rpc_clnt_connection_cleanup (&brickinfo->rpc->conn);
+ if (!brickinfo) { /* NULL brickinfo: log a warning and fail with -1 */
+ gf_log_callingfn ("glusterd", GF_LOG_WARNING, "!brickinfo");
+ return -1;
+ }
+
+ rpc = brickinfo->rpc; /* detach the client from brickinfo before releasing it */
+ brickinfo->rpc = NULL;
- rpc_clnt_unref (brickinfo->rpc);
- brickinfo->rpc = NULL;
+ if (rpc) { /* release through glusterd_rpc_clnt_unref; a NULL rpc means nothing to release */
+ glusterd_rpc_clnt_unref (priv, rpc);
}
+
return 0;
}
@@ -1500,10 +1883,10 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo,
gf_boolean_t del_brick)
{
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
- char pidfile[PATH_MAX] = {0,};
- int ret = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ char pidfile[PATH_MAX] = {0,};
+ int ret = 0;
GF_ASSERT (volinfo);
GF_ASSERT (brickinfo);
@@ -1522,6 +1905,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
if (ret == 0) {
glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
(void) glusterd_brick_unlink_socket_file (volinfo, brickinfo);
+ GLUSTERD_GET_BRICK_RECON_PIDFILE (pidfile, volinfo, brickinfo, priv);
+ ret = glusterd_service_stop ("recon", pidfile, SIGTERM, _gf_false);
}
}
@@ -1666,89 +2051,85 @@ glusterd_sort_and_redirect (const char *src_filepath, int dest_fd)
}
int
-glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo)
-{
- int32_t ret = -1;
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
- char cksum_path[PATH_MAX] = {0,};
- char filepath[PATH_MAX] = {0,};
- int fd = -1;
- uint32_t cksum = 0;
- char buf[4096] = {0,};
+glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo, char *cksum_path,
+ char *filepath, gf_boolean_t is_quota_conf,
+ uint32_t *cs)
+{
+ int32_t ret = -1;
+ uint32_t cksum = 0;
+ int fd = -1;
+ int sort_fd = 0;
char sort_filepath[PATH_MAX] = {0};
- gf_boolean_t unlink_sortfile = _gf_false;
- int sort_fd = 0;
- xlator_t *this = NULL;
+ char *cksum_path_final = NULL;
+ char buf[4096] = {0,};
+ gf_boolean_t unlink_sortfile = _gf_false;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (volinfo);
this = THIS;
priv = THIS->private;
GF_ASSERT (priv);
- GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
-
- snprintf (cksum_path, sizeof (cksum_path), "%s/%s",
- path, GLUSTERD_CKSUM_FILE);
-
fd = open (cksum_path, O_RDWR | O_APPEND | O_CREAT| O_TRUNC, 0600);
if (-1 == fd) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to open %s, errno: %d",
- cksum_path, errno);
+ gf_log (this->name, GF_LOG_ERROR, "Unable to open %s,"
+ " errno: %d", cksum_path, errno);
ret = -1;
goto out;
}
- snprintf (filepath, sizeof (filepath), "%s/%s", path,
- GLUSTERD_VOLUME_INFO_FILE);
- snprintf (sort_filepath, sizeof (sort_filepath), "/tmp/%s.XXXXXX",
- volinfo->volname);
+ if (!is_quota_conf) {
+ snprintf (sort_filepath, sizeof (sort_filepath),
+ "/tmp/%s.XXXXXX", volinfo->volname);
- sort_fd = mkstemp (sort_filepath);
- if (sort_fd < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Could not generate temp "
- "file, reason: %s for volume: %s", strerror (errno),
- volinfo->volname);
- goto out;
- } else {
- unlink_sortfile = _gf_true;
- }
+ sort_fd = mkstemp (sort_filepath);
+ if (sort_fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not generate "
+ "temp file, reason: %s for volume: %s",
+ strerror (errno), volinfo->volname);
+ goto out;
+ } else {
+ unlink_sortfile = _gf_true;
+ }
- /* sort the info file, result in sort_filepath */
+ /* sort the info file, result in sort_filepath */
- ret = glusterd_sort_and_redirect (filepath, sort_fd);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "sorting info file failed");
- goto out;
- }
+ ret = glusterd_sort_and_redirect (filepath, sort_fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "sorting info file "
+ "failed");
+ goto out;
+ }
- ret = close (sort_fd);
- if (ret)
- goto out;
+ ret = close (sort_fd);
+ if (ret)
+ goto out;
+ }
- ret = get_checksum_for_path (sort_filepath, &cksum);
+ cksum_path_final = is_quota_conf ? filepath : sort_filepath;
+ ret = get_checksum_for_path (cksum_path_final, &cksum);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get checksum"
- " for path: %s", sort_filepath);
+ gf_log (this->name, GF_LOG_ERROR, "unable to get "
+ "checksum for path: %s", cksum_path_final);
goto out;
}
-
- snprintf (buf, sizeof (buf), "%s=%u\n", "info", cksum);
- ret = write (fd, buf, strlen (buf));
-
- if (ret <= 0) {
- ret = -1;
- goto out;
+ if (!is_quota_conf) {
+ snprintf (buf, sizeof (buf), "%s=%u\n", "info", cksum);
+ ret = write (fd, buf, strlen (buf));
+ if (ret <= 0) {
+ ret = -1;
+ goto out;
+ }
}
ret = get_checksum_for_file (fd, &cksum);
-
if (ret)
goto out;
- volinfo->cksum = cksum;
+ *cs = cksum;
out:
if (fd > 0)
@@ -1760,6 +2141,54 @@ out:
return ret;
}
+/* Compute and cache the checksum of one of a volume's on-disk files.
+ *
+ * Depending on @is_quota_conf this covers either the quota configuration
+ * (GLUSTERD_VOLUME_QUOTA_CONFIG, checksum kept in
+ * GLUSTERD_VOL_QUOTA_CKSUM_FILE) or the volume info file
+ * (GLUSTERD_VOLUME_INFO_FILE, checksum kept in GLUSTERD_CKSUM_FILE); both
+ * paths are built below the directory GLUSTERD_GET_VOLUME_DIR yields for
+ * @volinfo.  The computation itself is delegated to
+ * glusterd_volume_compute_cksum ().
+ *
+ * On success the result is stored in volinfo->quota_conf_cksum or
+ * volinfo->cksum respectively and 0 is returned.  On failure the helper's
+ * non-zero return value is passed through and neither field is written.
+ */
+int glusterd_compute_cksum (glusterd_volinfo_t *volinfo,
+                            gf_boolean_t is_quota_conf)
+{
+        xlator_t        *this                 = NULL;
+        glusterd_conf_t *priv                 = NULL;
+        const char      *cksum_name           = NULL;
+        const char      *data_name            = NULL;
+        uint32_t         computed             = 0;
+        int              ret                  = -1;
+        char             voldir[PATH_MAX]     = {0,};
+        char             cksum_file[PATH_MAX] = {0,};
+        char             data_file[PATH_MAX]  = {0,};
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        GLUSTERD_GET_VOLUME_DIR (voldir, volinfo, priv);
+
+        /* Pick the (checksum file, checksummed file) pair once ... */
+        if (is_quota_conf) {
+                cksum_name = GLUSTERD_VOL_QUOTA_CKSUM_FILE;
+                data_name  = GLUSTERD_VOLUME_QUOTA_CONFIG;
+        } else {
+                cksum_name = GLUSTERD_CKSUM_FILE;
+                data_name  = GLUSTERD_VOLUME_INFO_FILE;
+        }
+
+        /* ... then build both absolute paths the same way. */
+        snprintf (cksum_file, sizeof (cksum_file), "%s/%s", voldir,
+                  cksum_name);
+        snprintf (data_file, sizeof (data_file), "%s/%s", voldir, data_name);
+
+        ret = glusterd_volume_compute_cksum (volinfo, cksum_file, data_file,
+                                             is_quota_conf, &computed);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to compute checksum "
+                        "for volume %s", volinfo->volname);
+                return ret;
+        }
+
+        if (is_quota_conf)
+                volinfo->quota_conf_cksum = computed;
+        else
+                volinfo->cksum = computed;
+
+        return 0;
+}
+
int
_add_dict_to_prdict (dict_t *this, char *key, data_t *value, void *data)
{
@@ -1812,12 +2241,17 @@ out:
return ret;
}
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
- dict_t *dict, int32_t count)
+ dict_t *dict, int32_t count,
+ char *prefix)
{
int32_t ret = -1;
- char prefix[512] = {0,};
+ char pfx[512] = {0,};
char key[512] = {0,};
glusterd_brickinfo_t *brickinfo = NULL;
int32_t i = 1;
@@ -1828,89 +2262,127 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
glusterd_dict_ctx_t ctx = {0};
char *rebalance_id_str = NULL;
char *rb_id_str = NULL;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (dict);
GF_ASSERT (volinfo);
+ GF_ASSERT (prefix);
- snprintf (key, sizeof (key), "volume%d.name", count);
+ snprintf (key, sizeof (key), "%s%d.name", prefix, count);
ret = dict_set_str (dict, key, volinfo->volname);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.type", count);
+ snprintf (key, sizeof (key), "%s%d.type", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->type);
if (ret)
goto out;
+        /* Build the key from the caller-supplied prefix, like the
+         * neighbouring "<prefix><count>.type" and
+         * "<prefix><count>.parent_volname" keys.  With a hard-coded
+         * "volume" a snapshot volume (prefix "snapN") would store its value
+         * under the key of regular volume <count> in the same dict. */
+        memset (key, 0, sizeof (key));
+        snprintf (key, sizeof (key), "%s%d.restored_from_snap", prefix,
+                  count);
+        ret = dict_set_dynstr_with_alloc
+                        (dict, key,
+                         uuid_utoa (volinfo->restored_from_snap));
+        if (ret)
+                goto out;
+
+ if (strlen (volinfo->parent_volname) > 0) {
+ snprintf (key, sizeof (key), "%s%d.parent_volname",
+ prefix, count);
+ ret = dict_set_dynstr_with_alloc (dict, key,
+ volinfo->parent_volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set parent_volname for %s",
+ volinfo->volname);
+ goto out;
+ }
+ }
+
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick_count", count);
+ snprintf (key, sizeof (key), "%s%d.brick_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->brick_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.version", count);
+ snprintf (key, sizeof (key), "%s%d.version", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->version);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.status", count);
+ snprintf (key, sizeof (key), "%s%d.status", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->status);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.sub_count", count);
+ snprintf (key, sizeof (key), "%s%d.sub_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->sub_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.stripe_count", count);
+ snprintf (key, sizeof (key), "%s%d.stripe_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->stripe_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.replica_count", count);
+ snprintf (key, sizeof (key), "%s%d.replica_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->replica_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.dist_count", count);
+ snprintf (key, sizeof (key), "%s%d.dist_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->dist_leaf_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.ckusm", count);
+ snprintf (key, sizeof (key), "%s%d.ckusm", prefix, count);
ret = dict_set_int64 (dict, key, volinfo->cksum);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.transport_type", count);
+ snprintf (key, sizeof (key), "%s%d.transport_type", prefix, count);
ret = dict_set_uint32 (dict, key, volinfo->transport_type);
if (ret)
goto out;
+ snprintf (key, sizeof (key), "%s%d.is_snap_volume", prefix, count);
+ ret = dict_set_uint32 (dict, key, volinfo->is_snap_volume);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Unable to set %s", key);
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.snap-max-hard-limit", prefix, count);
+ ret = dict_set_uint64 (dict, key, volinfo->snap_max_hard_limit);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Unable to set %s", key);
+ goto out;
+ }
+
volume_id_str = gf_strdup (uuid_utoa (volinfo->volume_id));
if (!volume_id_str) {
ret = -1;
goto out;
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.volume_id", count);
+ snprintf (key, sizeof (key), "%s%d.volume_id", prefix, count);
ret = dict_set_dynstr (dict, key, volume_id_str);
if (ret)
goto out;
volume_id_str = NULL;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.username", count);
+ snprintf (key, sizeof (key), "%s%d.username", prefix, count);
str = glusterd_auth_get_username (volinfo);
if (str) {
ret = dict_set_dynstr (dict, key, gf_strdup (str));
@@ -1919,7 +2391,7 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.password", count);
+ snprintf (key, sizeof (key), "%s%d.password", prefix, count);
str = glusterd_auth_get_password (volinfo);
if (str) {
ret = dict_set_dynstr (dict, key, gf_strdup (str));
@@ -1928,34 +2400,49 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
}
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d.rebalance", count);
+ snprintf (key, 256, "%s%d.rebalance", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->rebal.defrag_cmd);
if (ret)
goto out;
- if (volinfo->rebal.defrag_cmd) {
- rebalance_id_str = gf_strdup (uuid_utoa
- (volinfo->rebal.rebalance_id));
- if (!rebalance_id_str) {
- ret = -1;
- goto out;
- }
- memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d.rebalance-id", count);
- ret = dict_set_dynstr (dict, key, rebalance_id_str);
- if (ret)
- goto out;
- rebalance_id_str = NULL;
+ rebalance_id_str = gf_strdup (uuid_utoa
+ (volinfo->rebal.rebalance_id));
+ if (!rebalance_id_str) {
+ ret = -1;
+ goto out;
}
+ memset (key, 0, sizeof (key));
+ snprintf (key, 256, "%s%d.rebalance-id", prefix, count);
+ ret = dict_set_dynstr (dict, key, rebalance_id_str);
+ if (ret)
+ goto out;
+ rebalance_id_str = NULL;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.rebalance-op", count);
+ snprintf (key, sizeof (key), "%s%d.rebalance-op", prefix, count);
ret = dict_set_uint32 (dict, key, volinfo->rebal.op);
if (ret)
goto out;
+        /* Export the rebalance dict, when there is one, as numbered
+         * "<prefix><count>.rebal-dict-key<N>" / "rebal-dict-value<N>" pairs
+         * plus their count. */
+        if (volinfo->rebal.dict) {
+                snprintf (pfx, sizeof (pfx), "%s%d", prefix, count);
+                ctx.dict = dict;
+                ctx.prefix = pfx;
+                ctx.opt_count = 1;
+                ctx.key_name = "rebal-dict-key";
+                ctx.val_name = "rebal-dict-value";
+
+                dict_foreach (volinfo->rebal.dict, _add_dict_to_prdict, &ctx);
+                /* opt_count started at 1, so one less is the entry count */
+                ctx.opt_count--;
+                /* Store the count under the same prefix as the entries it
+                 * describes.  For prefix "volume" the key is unchanged;
+                 * for snapshot prefixes it no longer lands on a regular
+                 * volume's "volume<count>.rebal-dict-count". */
+                memset (key, 0, sizeof (key));
+                snprintf (key, sizeof (key), "%s.rebal-dict-count", pfx);
+                ret = dict_set_int32 (dict, key, ctx.opt_count);
+                if (ret)
+                        goto out;
+        }
+
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_STATUS, count);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_STATUS, prefix, count);
ret = dict_set_int32 (dict, key, volinfo->rep_brick.rb_status);
if (ret)
goto out;
@@ -1963,8 +2450,8 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
if (volinfo->rep_brick.rb_status > GF_RB_STATUS_NONE) {
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
- count);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
+ prefix, count);
gf_asprintf (&src_brick, "%s:%s",
volinfo->rep_brick.src_brick->hostname,
volinfo->rep_brick.src_brick->path);
@@ -1973,8 +2460,8 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
- count);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
+ prefix, count);
gf_asprintf (&dst_brick, "%s:%s",
volinfo->rep_brick.dst_brick->hostname,
volinfo->rep_brick.dst_brick->path);
@@ -1989,16 +2476,16 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.rb_id", count);
+ snprintf (key, sizeof (key), "%s%d.rb_id", prefix, count);
ret = dict_set_dynstr (dict, key, rb_id_str);
if (ret)
goto out;
rb_id_str = NULL;
}
- snprintf (prefix, sizeof (prefix), "volume%d", count);
+ snprintf (pfx, sizeof (pfx), "%s%d", prefix, count);
ctx.dict = dict;
- ctx.prefix = prefix;
+ ctx.prefix = pfx;
ctx.opt_count = 1;
ctx.key_name = "key";
ctx.val_name = "value";
@@ -2007,13 +2494,13 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
dict_foreach (volinfo->dict, _add_dict_to_prdict, &ctx);
ctx.opt_count--;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.opt-count", count);
+ snprintf (key, sizeof (key), "%s%d.opt-count", prefix, count);
ret = dict_set_int32 (dict, key, ctx.opt_count);
if (ret)
goto out;
ctx.dict = dict;
- ctx.prefix = prefix;
+ ctx.prefix = pfx;
ctx.opt_count = 1;
ctx.key_name = "slave-num";
ctx.val_name = "slave-val";
@@ -2023,42 +2510,417 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
ctx.opt_count--;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.gsync-count", count);
+ snprintf (key, sizeof (key), "%s%d.gsync-count", prefix, count);
ret = dict_set_int32 (dict, key, ctx.opt_count);
if (ret)
goto out;
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.hostname",
- count, i);
+ snprintf (key, sizeof (key), "%s%d.brick%d.hostname",
+ prefix, count, i);
ret = dict_set_str (dict, key, brickinfo->hostname);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.path",
- count, i);
+ snprintf (key, sizeof (key), "%s%d.brick%d.path",
+ prefix, count, i);
ret = dict_set_str (dict, key, brickinfo->path);
if (ret)
goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.decommissioned",
+ prefix, count, i);
+ ret = dict_set_int32 (dict, key, brickinfo->decommissioned);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.brick_id",
+ prefix, count, i);
+ ret = dict_set_str (dict, key, brickinfo->brick_id);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.snap_status",
+ prefix, count, i);
+ ret = dict_set_int32 (dict, key, brickinfo->snap_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set snap_status for %s:%s",
+ brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.device_path",
+ prefix, count, i);
+ ret = dict_set_str (dict, key, brickinfo->device_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set snap_device for %s:%s",
+ brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+
i++;
}
+ /* Add volume op-versions to dict. This prevents volume inconsistencies
+ * in the cluster
+ */
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.op-version", prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->op_version);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.client-op-version", prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->client_op_version);
+ if (ret)
+ goto out;
+
+ /*Add volume Capability (BD Xlator) to dict*/
+ memset (key, 0 ,sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.caps", prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->caps);
out:
GF_FREE (volume_id_str);
GF_FREE (rebalance_id_str);
GF_FREE (rb_id_str);
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+/* Export a volume's quota configuration into the dict @load.
+ *
+ * Makes sure the quota.conf store handle exists, opens the file it points
+ * to, skips the header and then adds every 16-byte gfid record as a string
+ * under "<prefix><vol_idx>.gfid<N>" (N counting from 0).  Afterwards
+ * "<prefix><vol_idx>.gfid-count", "<prefix><vol_idx>.quota-cksum" and
+ * "<prefix><vol_idx>.quota-version" are set from the record count and from
+ * volinfo.
+ *
+ * The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ *
+ * Returns 0 on success and non-zero on failure.  A read error or a
+ * trailing partial record is reported as -1.
+ */
+int
+glusterd_vol_add_quota_conf_to_dict (glusterd_volinfo_t *volinfo, dict_t* load,
+                                     int vol_idx, char *prefix)
+{
+        int             fd            = -1;
+        char           *gfid_str      = NULL;
+        unsigned char   buf[16]       = {0};
+        char            key[PATH_MAX] = {0};
+        int             gfid_idx      = 0;
+        int             ret           = -1;
+        xlator_t       *this          = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (prefix);
+
+        ret = glusterd_store_create_quota_conf_sh_on_absence (volinfo);
+        if (ret)
+                goto out;
+
+        fd = open (volinfo->quota_conf_shandle->path, O_RDONLY);
+        if (fd == -1) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = glusterd_store_quota_conf_skip_header (this, fd);
+        if (ret)
+                goto out;
+
+        for (gfid_idx=0; ; gfid_idx++) {
+
+                ret = read (fd, (void*)&buf, sizeof (buf));
+                if (ret == 0) {
+                        //Finished reading all entries in the conf file
+                        break;
+                }
+                if (ret < 0) {
+                        //A failed read is not end-of-file: do not export a
+                        //silently truncated gfid list as if it were complete.
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to read "
+                                "quota configuration of volume %s: %s",
+                                volinfo->volname, strerror (errno));
+                        ret = -1;
+                        goto out;
+                }
+                if ((size_t) ret != sizeof (buf)) {
+                        //This should never happen. We must have a multiple of
+                        //entry_sz bytes in our configuration file.
+                        gf_log (this->name, GF_LOG_CRITICAL, "Quota "
+                                "configuration store may be corrupt.");
+                        ret = -1;
+                        goto out;
+                }
+
+                gfid_str = gf_strdup (uuid_utoa (buf));
+                if (!gfid_str) {
+                        ret = -1;
+                        goto out;
+                }
+
+                snprintf (key, sizeof (key), "%s%d.gfid%d", prefix,
+                          vol_idx, gfid_idx);
+                ret = dict_set_dynstr (load, key, gfid_str);
+                if (ret) {
+                        //gfid_str is still set here and is freed at out:
+                        goto out;
+                }
+
+                //The dict owns the string from here on.
+                gfid_str = NULL;
+        }
+
+        snprintf (key, sizeof (key), "%s%d.gfid-count", prefix, vol_idx);
+        ret = dict_set_int32 (load, key, gfid_idx);
+        if (ret)
+                goto out;
+
+        snprintf (key, sizeof (key), "%s%d.quota-cksum", prefix, vol_idx);
+        ret = dict_set_uint32 (load, key, volinfo->quota_conf_cksum);
+        if (ret)
+                goto out;
+
+        snprintf (key, sizeof (key), "%s%d.quota-version", prefix, vol_idx);
+        ret = dict_set_uint32 (load, key, volinfo->quota_conf_version);
+        if (ret)
+                goto out;
+
+        ret = 0;
+out:
+        if (fd != -1)
+                close (fd);
+        GF_FREE (gfid_str);
+        return ret;
+}
+
+/* Export every recorded missed snapshot operation into @peer_data.
+ *
+ * Walks priv->missed_snaps_list and, for each snap op of each entry, stores
+ * one string under "missed_snaps_<n>" (n counting from 0) of the form
+ *   <node_uuid>:<snap_uuid>=<snap_vol_id>:<brick_num>:<brick_path>:<op>:<status>
+ * and finally the number of strings under "missed_snap_count".
+ *
+ * Returns 0 on success, otherwise the non-zero result of the dict call
+ * that failed.
+ */
+int32_t
+glusterd_add_missed_snaps_to_export_dict (dict_t *peer_data)
+{
+        xlator_t                  *this            = NULL;
+        glusterd_conf_t           *conf            = NULL;
+        glusterd_missed_snap_info *missed          = NULL;
+        glusterd_snap_op_t        *snap_op         = NULL;
+        int32_t                    nr_exported     = 0;
+        int32_t                    ret             = -1;
+        char                       key[PATH_MAX]   = "";
+        char                       entry[PATH_MAX] = "";
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (peer_data);
+
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        /* One dict entry per (missed snap, snap op) pair */
+        list_for_each_entry (missed, &conf->missed_snaps_list,
+                             missed_snaps) {
+                list_for_each_entry (snap_op, &missed->snap_ops,
+                                     snap_ops_list) {
+                        snprintf (key, sizeof (key), "missed_snaps_%d",
+                                  nr_exported);
+                        snprintf (entry, sizeof (entry),
+                                  "%s:%s=%s:%d:%s:%d:%d",
+                                  missed->node_uuid,
+                                  missed->snap_uuid,
+                                  snap_op->snap_vol_id,
+                                  snap_op->brick_num,
+                                  snap_op->brick_path,
+                                  snap_op->op,
+                                  snap_op->status);
+
+                        ret = dict_set_dynstr_with_alloc (peer_data, key,
+                                                          entry);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Unable to set %s", key);
+                                goto out;
+                        }
+                        nr_exported++;
+                }
+        }
+
+        ret = dict_set_int32 (peer_data, "missed_snap_count", nr_exported);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set missed_snap_count");
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Export one snapshot into @peer_data for the peer handshake.
+ *
+ * All keys are built from the prefix "snap<snap_count>".  Each volume of
+ * the snapshot is added through glusterd_add_volume_to_dict () with that
+ * prefix and a 1-based volume index; volumes with quota enabled also get
+ * their quota configuration added.  Afterwards the snapshot's own
+ * attributes are stored: host_bricks (whether any brick of any of its
+ * volumes carries this node's uuid), volcount, snapname, snap_id, the
+ * description when present, time_stamp, snap_restored and snap_status.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the first call
+ * that failed.
+ */
+int32_t
+glusterd_add_snap_to_dict (glusterd_snap_t *snap, dict_t *peer_data,
+                           int32_t snap_count)
+{
+        char                    buf[NAME_MAX]    = "";
+        char                    prefix[NAME_MAX] = "";
+        int32_t                 ret              = -1;
+        int32_t                 volcount         = 0;
+        glusterd_volinfo_t     *volinfo          = NULL;
+        glusterd_brickinfo_t   *brickinfo        = NULL;
+        gf_boolean_t            host_bricks      = _gf_false;
+        xlator_t               *this             = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (snap);
+        GF_ASSERT (peer_data);
+
+        snprintf (prefix, sizeof(prefix), "snap%d", snap_count);
+
+        list_for_each_entry (volinfo, &snap->volumes, vol_list) {
+                volcount++;
+                ret = glusterd_add_volume_to_dict (volinfo, peer_data,
+                                                   volcount, prefix);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to add snap:%s volume:%s "
+                                "to peer_data dict for handshake",
+                                snap->snapname, volinfo->volname);
+                        goto out;
+                }
+
+                /* Only volumes with quota enabled have a quota
+                 * configuration to export.  The helper opens the quota
+                 * configuration file and fails when it cannot, so calling
+                 * it unconditionally could fail the export of a snapshot
+                 * of a volume without quota.  This is the same guard
+                 * glusterd_add_volumes_to_export_dict () applies. */
+                if (glusterd_is_volume_quota_enabled (volinfo)) {
+                        ret = glusterd_vol_add_quota_conf_to_dict (volinfo,
+                                                                   peer_data,
+                                                                   volcount,
+                                                                   prefix);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to add quota conf for "
+                                        "snap:%s volume:%s to peer_data "
+                                        "dict for handshake", snap->snapname,
+                                        volinfo->volname);
+                                goto out;
+                        }
+                }
+
+                /* The break only leaves the brick loop; host_bricks is
+                 * never reset, so one local brick in any volume is enough. */
+                list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+                        if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+                                host_bricks = _gf_true;
+                                break;
+                        }
+                }
+        }
+
+        snprintf (buf, sizeof(buf), "%s.host_bricks", prefix);
+        ret = dict_set_int8 (peer_data, buf, (int8_t) host_bricks);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set host_bricks for snap %s",
+                        snap->snapname);
+                goto out;
+        }
+
+        snprintf (buf, sizeof(buf), "%s.volcount", prefix);
+        ret = dict_set_int32 (peer_data, buf, volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set volcount for snap %s",
+                        snap->snapname);
+                goto out;
+        }
+
+        snprintf (buf, sizeof(buf), "%s.snapname", prefix);
+        ret = dict_set_dynstr_with_alloc (peer_data, buf, snap->snapname);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set snapname for snap %s",
+                        snap->snapname);
+                goto out;
+        }
+
+        snprintf (buf, sizeof(buf), "%s.snap_id", prefix);
+        ret = dict_set_dynstr_with_alloc (peer_data, buf,
+                                          uuid_utoa (snap->snap_id));
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set snap_id for snap %s",
+                        snap->snapname);
+                goto out;
+        }
+
+        if (snap->description) {
+                /* NOTE(review): the description is stored under
+                 * "<prefix>.snapid", which differs from the "snap_id" key
+                 * above only by an underscore.  This looks like a
+                 * copy/paste slip, but the key is part of the handshake
+                 * format - confirm what the importing side reads before
+                 * renaming it. */
+                snprintf (buf, sizeof(buf), "%s.snapid", prefix);
+                ret = dict_set_dynstr_with_alloc (peer_data, buf,
+                                                  snap->description);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to set description for snap %s",
+                                snap->snapname);
+                        goto out;
+                }
+        }
+
+        snprintf (buf, sizeof(buf), "%s.time_stamp", prefix);
+        ret = dict_set_int64 (peer_data, buf, (int64_t)snap->time_stamp);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set time_stamp for snap %s",
+                        snap->snapname);
+                goto out;
+        }
+
+        snprintf (buf, sizeof(buf), "%s.snap_restored", prefix);
+        ret = dict_set_int8 (peer_data, buf, snap->snap_restored);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set snap_restored for snap %s",
+                        snap->snapname);
+                goto out;
+        }
+
+        snprintf (buf, sizeof(buf), "%s.snap_status", prefix);
+        ret = dict_set_int32 (peer_data, buf, snap->snap_status);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set snap_status for snap %s",
+                        snap->snapname);
+                goto out;
+        }
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
int32_t
-glusterd_build_volume_dict (dict_t **vols)
+glusterd_add_snapshots_to_export_dict (dict_t *peer_data)
+{
+        /* Export every snapshot on priv->snapshots into @peer_data through
+         * glusterd_add_snap_to_dict (), numbering them from 1, and then
+         * store how many were exported under "snap_count".  Returns 0 on
+         * success, otherwise the non-zero result of the call that failed.
+         */
+        xlator_t        *this     = NULL;
+        glusterd_conf_t *conf     = NULL;
+        glusterd_snap_t *cur      = NULL;
+        int32_t          nr_snaps = 0;
+        int32_t          ret      = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+        GF_ASSERT (peer_data);
+
+        list_for_each_entry (cur, &conf->snapshots, snap_list) {
+                /* pre-increment: the first snapshot is exported as index 1 */
+                ret = glusterd_add_snap_to_dict (cur, peer_data, ++nr_snaps);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to add snap(%s) to the  peer_data "
+                                "dict for handshake", cur->snapname);
+                        goto out;
+                }
+        }
+
+        ret = dict_set_int32 (peer_data, "snap_count", nr_snaps);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set snap_count");
+
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+int32_t
+glusterd_add_volumes_to_export_dict (dict_t **peer_data)
{
int32_t ret = -1;
dict_t *dict = NULL;
@@ -2066,22 +2928,31 @@ glusterd_build_volume_dict (dict_t **vols)
glusterd_volinfo_t *volinfo = NULL;
int32_t count = 0;
glusterd_dict_ctx_t ctx = {0};
+ xlator_t *this = NULL;
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
dict = dict_new ();
-
if (!dict)
goto out;
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
count++;
- ret = glusterd_add_volume_to_dict (volinfo, dict, count);
+ ret = glusterd_add_volume_to_dict (volinfo, dict, count,
+ "volume");
+ if (ret)
+ goto out;
+ if (!glusterd_is_volume_quota_enabled (volinfo))
+ continue;
+ ret = glusterd_vol_add_quota_conf_to_dict (volinfo, dict,
+ count, "volume");
if (ret)
goto out;
}
-
ret = dict_set_int32 (dict, "count", count);
if (ret)
goto out;
@@ -2097,18 +2968,18 @@ glusterd_build_volume_dict (dict_t **vols)
if (ret)
goto out;
- *vols = dict;
+ *peer_data = dict;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
if (ret)
dict_unref (dict);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
return ret;
}
int32_t
-glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status,
- char *hostname)
+glusterd_compare_friend_volume (dict_t *peer_data, int32_t count,
+ int32_t *status, char *hostname)
{
int32_t ret = -1;
@@ -2116,13 +2987,19 @@ glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status,
glusterd_volinfo_t *volinfo = NULL;
char *volname = NULL;
uint32_t cksum = 0;
+ uint32_t quota_cksum = 0;
+ uint32_t quota_version = 0;
int32_t version = 0;
+ xlator_t *this = NULL;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (status);
+ this = THIS;
+ GF_ASSERT (this);
+
snprintf (key, sizeof (key), "volume%d.name", count);
- ret = dict_get_str (vols, key, &volname);
+ ret = dict_get_str (peer_data, key, &volname);
if (ret)
goto out;
@@ -2136,14 +3013,14 @@ glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.version", count);
- ret = dict_get_int32 (vols, key, &version);
+ ret = dict_get_int32 (peer_data, key, &version);
if (ret)
goto out;
if (version > volinfo->version) {
//Mismatch detected
ret = 0;
- gf_log ("", GF_LOG_ERROR, "Version of volume %s differ."
+ gf_log (this->name, GF_LOG_ERROR, "Version of volume %s differ."
"local version = %d, remote version = %d on peer %s",
volinfo->volname, volinfo->version, version, hostname);
*status = GLUSTERD_VOL_COMP_UPDATE_REQ;
@@ -2157,29 +3034,78 @@ glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status,
//
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.ckusm", count);
- ret = dict_get_uint32 (vols, key, &cksum);
+ ret = dict_get_uint32 (peer_data, key, &cksum);
if (ret)
goto out;
if (cksum != volinfo->cksum) {
ret = 0;
- gf_log ("", GF_LOG_ERROR, "Cksums of volume %s differ."
+ gf_log (this->name, GF_LOG_ERROR, "Cksums of volume %s differ."
" local cksum = %u, remote cksum = %u on peer %s",
volinfo->volname, volinfo->cksum, cksum, hostname);
*status = GLUSTERD_VOL_COMP_RJT;
goto out;
}
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.quota-version", count);
+ ret = dict_get_uint32 (peer_data, key, &quota_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota-version key absent for"
+ " volume %s in peer %s's response", volinfo->volname,
+ hostname);
+ ret = 0;
+ } else {
+ if (quota_version > volinfo->quota_conf_version) {
+ //Mismatch detected
+ ret = 0;
+ gf_log (this->name, GF_LOG_ERROR, "Quota configuration "
+ "versions of volume %s differ. "
+ "local version = %d, remote version = %d "
+ "on peer %s", volinfo->volname,
+ volinfo->quota_conf_version, quota_version,
+ hostname);
+ *status = GLUSTERD_VOL_COMP_UPDATE_REQ;
+ goto out;
+ } else if (quota_version < volinfo->quota_conf_version) {
+ *status = GLUSTERD_VOL_COMP_SCS;
+ goto out;
+ }
+ }
+
+ //Now, versions are same, compare cksums.
+ //
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.quota-cksum", count);
+ ret = dict_get_uint32 (peer_data, key, &quota_cksum);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota checksum absent for "
+ "volume %s in peer %s's response", volinfo->volname,
+ hostname);
+ ret = 0;
+ } else {
+ if (quota_cksum != volinfo->quota_conf_cksum) {
+ ret = 0;
+ gf_log (this->name, GF_LOG_ERROR, "Cksums of quota "
+ "configurations of volume %s differ. "
+ "local cksum = %u, remote cksum = %u on "
+ "peer %s", volinfo->volname,
+ volinfo->quota_conf_cksum, quota_cksum,
+ hostname);
+ *status = GLUSTERD_VOL_COMP_RJT;
+ goto out;
+ }
+ }
*status = GLUSTERD_VOL_COMP_SCS;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with ret: %d, status: %d",
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with ret: %d, status: %d",
ret, *status);
return ret;
}
static int32_t
-import_prdict_dict (dict_t *vols, dict_t *dst_dict, char *key_prefix,
+import_prdict_dict (dict_t *peer_data, dict_t *dst_dict, char *key_prefix,
char *value_prefix, int opt_count, char *prefix)
{
char key[512] = {0,};
@@ -2194,7 +3120,7 @@ import_prdict_dict (dict_t *vols, dict_t *dst_dict, char *key_prefix,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.%s%d",
prefix, key_prefix, i);
- ret = dict_get_str (vols, key, &opt_key);
+ ret = dict_get_str (peer_data, key, &opt_key);
if (ret) {
snprintf (msg, sizeof (msg), "Volume dict key not "
"specified");
@@ -2204,7 +3130,7 @@ import_prdict_dict (dict_t *vols, dict_t *dst_dict, char *key_prefix,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.%s%d",
prefix, value_prefix, i);
- ret = dict_get_str (vols, key, &opt_val);
+ ret = dict_get_str (peer_data, key, &opt_val);
if (ret) {
snprintf (msg, sizeof (msg), "Volume dict value not "
"specified");
@@ -2432,7 +3358,7 @@ glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo,
gf_boolean_t meets_quorum)
{
glusterd_brickinfo_t *brickinfo = NULL;
- glusterd_conf_t *conf = NULL;
+ glusterd_conf_t *conf = NULL;
conf = this->private;
if (volinfo->status != GLUSTERD_STATUS_STARTED)
@@ -2492,7 +3418,7 @@ out:
}
int32_t
-glusterd_import_friend_volume_opts (dict_t *vols, int count,
+glusterd_import_friend_volume_opts (dict_t *peer_data, int count,
glusterd_volinfo_t *volinfo)
{
char key[512] = {0,};
@@ -2501,9 +3427,12 @@ glusterd_import_friend_volume_opts (dict_t *vols, int count,
char msg[2048] = {0};
char volume_prefix[1024] = {0};
+ GF_ASSERT (peer_data);
+ GF_ASSERT (volinfo);
+
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.opt-count", count);
- ret = dict_get_int32 (vols, key, &opt_count);
+ ret = dict_get_int32 (peer_data, key, &opt_count);
if (ret) {
snprintf (msg, sizeof (msg), "Volume option count not "
"specified for %s", volinfo->volname);
@@ -2511,7 +3440,7 @@ glusterd_import_friend_volume_opts (dict_t *vols, int count,
}
snprintf (volume_prefix, sizeof (volume_prefix), "volume%d", count);
- ret = import_prdict_dict (vols, volinfo->dict, "key", "value",
+ ret = import_prdict_dict (peer_data, volinfo->dict, "key", "value",
opt_count, volume_prefix);
if (ret) {
snprintf (msg, sizeof (msg), "Unable to import options dict "
@@ -2521,14 +3450,14 @@ glusterd_import_friend_volume_opts (dict_t *vols, int count,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.gsync-count", count);
- ret = dict_get_int32 (vols, key, &opt_count);
+ ret = dict_get_int32 (peer_data, key, &opt_count);
if (ret) {
snprintf (msg, sizeof (msg), "Gsync count not "
"specified for %s", volinfo->volname);
goto out;
}
- ret = import_prdict_dict (vols, volinfo->gsync_slaves, "slave-num",
+ ret = import_prdict_dict (peer_data, volinfo->gsync_slaves, "slave-num",
"slave-val", opt_count, volume_prefix);
if (ret) {
snprintf (msg, sizeof (msg), "Unable to import gsync sessions "
@@ -2543,35 +3472,75 @@ out:
return ret;
}
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
-glusterd_import_new_brick (dict_t *vols, int32_t vol_count,
+glusterd_import_new_brick (dict_t *peer_data, int32_t vol_count,
int32_t brick_count,
- glusterd_brickinfo_t **brickinfo)
+ glusterd_brickinfo_t **brickinfo,
+ char *prefix)
{
char key[512] = {0,};
int ret = -1;
+ int32_t snap_status = 0;
+ char *snap_device = NULL;
char *hostname = NULL;
char *path = NULL;
+ char *brick_id = NULL;
+ int decommissioned = 0;
glusterd_brickinfo_t *new_brickinfo = NULL;
char msg[2048] = {0};
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (vol_count >= 0);
GF_ASSERT (brickinfo);
+ GF_ASSERT (prefix);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.hostname",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &hostname);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload", key);
+ goto out;
+ }
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.hostname",
- vol_count, brick_count);
- ret = dict_get_str (vols, key, &hostname);
+ snprintf (key, sizeof (key), "%s%d.brick%d.path",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &path);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload", key);
goto out;
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.path",
- vol_count, brick_count);
- ret = dict_get_str (vols, key, &path);
+ snprintf (key, sizeof (key), "%s%d.brick%d.brick_id",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &brick_id);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.decommissioned",
+ prefix, vol_count, brick_count);
+ ret = dict_get_int32 (peer_data, key, &decommissioned);
+ if (ret) {
+ /* For backward compatibility */
+ ret = 0;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.snap_status",
+ prefix, vol_count, brick_count);
+ ret = dict_get_int32 (peer_data, key, &snap_status);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload", key);
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.device_path",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &snap_device);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload", key);
goto out;
@@ -2583,6 +3552,11 @@ glusterd_import_new_brick (dict_t *vols, int32_t vol_count,
strcpy (new_brickinfo->path, path);
strcpy (new_brickinfo->hostname, hostname);
+ strcpy (new_brickinfo->device_path, snap_device);
+ new_brickinfo->snap_status = snap_status;
+ new_brickinfo->decommissioned = decommissioned;
+ if (brick_id)
+ strcpy (new_brickinfo->brick_id, brick_id);
//peerinfo might not be added yet
(void) glusterd_resolve_brick (new_brickinfo);
ret = 0;
@@ -2594,23 +3568,36 @@ out:
return ret;
}
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
-glusterd_import_bricks (dict_t *vols, int32_t vol_count,
- glusterd_volinfo_t *new_volinfo)
+glusterd_import_bricks (dict_t *peer_data, int32_t vol_count,
+ glusterd_volinfo_t *new_volinfo, char *prefix)
{
int ret = -1;
int brick_count = 1;
+ int brickid = 0;
glusterd_brickinfo_t *new_brickinfo = NULL;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (vol_count >= 0);
GF_ASSERT (new_volinfo);
+ GF_ASSERT (prefix);
while (brick_count <= new_volinfo->brick_count) {
- ret = glusterd_import_new_brick (vols, vol_count, brick_count,
- &new_brickinfo);
+ ret = glusterd_import_new_brick (peer_data, vol_count,
+ brick_count,
+ &new_brickinfo, prefix);
if (ret)
goto out;
+ if (new_brickinfo->brick_id[0] == '\0')
+ /*We were probed from a peer having op-version
+ less than GD_OP_VER_PERSISTENT_AFR_XATTRS*/
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (new_brickinfo,
+ new_volinfo,
+ brickid++);
list_add_tail (&new_brickinfo->brick_list, &new_volinfo->bricks);
brick_count++;
}
@@ -2620,15 +3607,174 @@ out:
return ret;
}
+/* Import a volume's quota configuration from a peer's payload and write it
+ * into the local quota.conf store of new_volinfo.
+ *
+ * The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ *
+ * peer_data   - dict received from the peer; the keys read are
+ *               <prefix><vol_idx>.quota-cksum, .quota-version, .gfid-count
+ *               and .gfid<N> for N in [0, gfid-count)
+ * vol_idx     - index of the volume inside peer_data
+ * new_volinfo - volinfo being imported; quota_conf_cksum, quota_conf_version
+ *               and quota_conf_shandle are updated
+ * prefix      - key prefix, see above
+ *
+ * Returns 0 on success, including the case where quota is disabled for the
+ * volume (the quota store is then only cleaned up). Returns non-zero on
+ * failure; if the temporary file had been opened by then, it is unlinked and
+ * the quota.conf store handle is destroyed.
+ *
+ * NOTE(review): the return value of uuid_parse () is not checked, so a
+ * malformed gfid string in the payload is not rejected here.
+ */
+static int
+glusterd_import_quota_conf (dict_t *peer_data, int vol_idx,
+                            glusterd_volinfo_t *new_volinfo,
+                            char *prefix)
+{
+        int       gfid_idx      = 0;
+        int       gfid_count    = 0;
+        int       ret           = -1;
+        int       fd            = -1;
+        char      key[PATH_MAX] = {0};
+        char     *gfid_str      = NULL;
+        uuid_t    gfid          = {0,};
+        xlator_t *this          = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (peer_data);
+        GF_ASSERT (prefix);
+
+        if (!glusterd_is_volume_quota_enabled (new_volinfo)) {
+                (void) glusterd_clean_up_quota_store (new_volinfo);
+                return 0;
+        }
+
+        ret = glusterd_store_create_quota_conf_sh_on_absence (new_volinfo);
+        if (ret)
+                goto out;
+
+        fd = gf_store_mkstemp (new_volinfo->quota_conf_shandle);
+        if (fd < 0) {
+                ret = -1;
+                goto out;
+        }
+
+        /* cksum and version are optional in the payload: a missing key is
+         * only logged at debug level and the import continues. */
+        snprintf (key, sizeof (key), "%s%d.quota-cksum", prefix, vol_idx);
+        ret = dict_get_uint32 (peer_data, key, &new_volinfo->quota_conf_cksum);
+        if (ret)
+                gf_log (this->name, GF_LOG_DEBUG, "Failed to get quota cksum");
+
+        snprintf (key, sizeof (key), "%s%d.quota-version", prefix, vol_idx);
+        ret = dict_get_uint32 (peer_data, key,
+                               &new_volinfo->quota_conf_version);
+        if (ret)
+                gf_log (this->name, GF_LOG_DEBUG, "Failed to get quota "
+                        "version");
+
+        snprintf (key, sizeof (key), "%s%d.gfid-count", prefix, vol_idx);
+        ret = dict_get_int32 (peer_data, key, &gfid_count);
+        if (ret)
+                goto out;
+
+        ret = glusterd_store_quota_conf_stamp_header (this, fd);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to add header to tmp "
+                        "file");
+                goto out;
+        }
+
+        for (gfid_idx = 0; gfid_idx < gfid_count; gfid_idx++) {
+
+                snprintf (key, sizeof (key), "%s%d.gfid%d",
+                          prefix, vol_idx, gfid_idx);
+                ret = dict_get_str (peer_data, key, &gfid_str);
+                if (ret)
+                        goto out;
+
+                uuid_parse (gfid_str, gfid);
+                ret = write (fd, (void*)gfid, 16);
+                if (ret != 16) {
+                        gf_log (this->name, GF_LOG_CRITICAL, "Unable to write "
+                                "gfid %s into quota.conf for %s", gfid_str,
+                                new_volinfo->volname);
+                        ret = -1;
+                        goto out;
+                }
+
+        }
+
+        /* A failed rename is propagated as an error instead of being
+         * overwritten with 0, so the checksum below is never computed over
+         * a quota.conf that was not replaced. */
+        ret = gf_store_rename_tmppath (new_volinfo->quota_conf_shandle);
+
+out:
+        /* Close the descriptor exactly once. The value of fd is kept after
+         * the close only to remember that the temporary file was opened. */
+        if (fd >= 0)
+                close (fd);
+
+        if (!ret) {
+                /* No jump back to "out" from here: that would close fd a
+                 * second time. Failures simply fall through to the cleanup. */
+                ret = glusterd_compute_cksum (new_volinfo, _gf_true);
+                if (!ret)
+                        ret = glusterd_store_save_quota_version_and_cksum
+                                                                (new_volinfo);
+        }
+
+        if (ret && (fd >= 0)) {
+                gf_store_unlink_tmppath (new_volinfo->quota_conf_shandle);
+                (void) gf_store_handle_destroy
+                                        (new_volinfo->quota_conf_shandle);
+                new_volinfo->quota_conf_shandle = NULL;
+        }
+
+        return ret;
+}
+
+/* Import the rebalance dict of volume number 'count' from a peer's payload
+ * into volinfo->rebal.dict.
+ *
+ * dict    - payload received from the peer
+ * count   - index of the volume inside the payload ("volume<count>." keys)
+ * volinfo - volinfo whose rebal.dict is populated
+ *
+ * Returns 0 on success, and also when the payload carries no
+ * "volume<count>.rebal-dict-count" key (older peers); rebal.dict is left
+ * untouched in that case. Returns non-zero on failure, in which case the
+ * dict created here is released and volinfo->rebal.dict is reset to NULL so
+ * that no stale pointer is left behind.
+ */
+int
+gd_import_friend_volume_rebal_dict (dict_t *dict, int count,
+                                    glusterd_volinfo_t *volinfo)
+{
+        int  ret        = -1;
+        char key[256]   = {0,};
+        int  dict_count = 0;
+        char prefix[64] = {0};
+
+        GF_ASSERT (dict);
+        GF_ASSERT (volinfo);
+
+        snprintf (key, sizeof (key), "volume%d.rebal-dict-count", count);
+        ret = dict_get_int32 (dict, key, &dict_count);
+        if (ret) {
+                /* Older peers will not have this dict */
+                ret = 0;
+                goto out;
+        }
+
+        volinfo->rebal.dict = dict_new ();
+        if (!volinfo->rebal.dict) {
+                ret = -1;
+                goto out;
+        }
+
+        snprintf (prefix, sizeof (prefix), "volume%d", count);
+        ret = import_prdict_dict (dict, volinfo->rebal.dict, "rebal-dict-key",
+                                  "rebal-dict-value", dict_count, prefix);
+out:
+        if (ret && volinfo->rebal.dict) {
+                dict_unref (volinfo->rebal.dict);
+                /* Do not leave a pointer to the released dict in volinfo */
+                volinfo->rebal.dict = NULL;
+        }
+        gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
+        return ret;
+}
+
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
-glusterd_import_volinfo (dict_t *vols, int count,
- glusterd_volinfo_t **volinfo)
+glusterd_import_volinfo (dict_t *peer_data, int count,
+ glusterd_volinfo_t **volinfo,
+ char *prefix)
{
int ret = -1;
char key[256] = {0};
+ char *parent_volname = NULL;
char *volname = NULL;
glusterd_volinfo_t *new_volinfo = NULL;
char *volume_id_str = NULL;
+ char *restored_snap = NULL;
char msg[2048] = {0};
char *src_brick = NULL;
char *dst_brick = NULL;
@@ -2636,35 +3782,53 @@ glusterd_import_volinfo (dict_t *vols, int count,
int rb_status = 0;
char *rebalance_id_str = NULL;
char *rb_id_str = NULL;
+ int op_version = 0;
+ int client_op_version = 0;
+ uint32_t is_snap_volume = 0;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (volinfo);
+ GF_ASSERT (prefix);
- snprintf (key, sizeof (key), "volume%d.name", count);
- ret = dict_get_str (vols, key, &volname);
+ snprintf (key, sizeof (key), "%s%d.name", prefix, count);
+ ret = dict_get_str (peer_data, key, &volname);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload", key);
goto out;
}
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.is_snap_volume", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &is_snap_volume);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload for %s",
+ key, volname);
+ goto out;
+ }
+
ret = glusterd_volinfo_new (&new_volinfo);
if (ret)
goto out;
strncpy (new_volinfo->volname, volname, sizeof (new_volinfo->volname));
-
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.type", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->type);
+ snprintf (key, sizeof (key), "%s%d.type", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->type);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
goto out;
}
+ snprintf (key, sizeof (key), "%s%d.parent_volname", prefix, count);
+ ret = dict_get_str (peer_data, key, &parent_volname);
+ if (!ret)
+ strncpy (new_volinfo->parent_volname, parent_volname,
+ sizeof(new_volinfo->parent_volname));
+
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->brick_count);
+ snprintf (key, sizeof (key), "%s%d.brick_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->brick_count);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2672,8 +3836,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.version", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->version);
+ snprintf (key, sizeof (key), "%s%d.version", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->version);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2681,8 +3845,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.status", count);
- ret = dict_get_int32 (vols, key, (int32_t *)&new_volinfo->status);
+ snprintf (key, sizeof (key), "%s%d.status", prefix, count);
+ ret = dict_get_int32 (peer_data, key, (int32_t *)&new_volinfo->status);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2690,8 +3854,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.sub_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->sub_count);
+ snprintf (key, sizeof (key), "%s%d.sub_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->sub_count);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2701,8 +3865,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
/* not having a 'stripe_count' key is not a error
(as peer may be of old version) */
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.stripe_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->stripe_count);
+ snprintf (key, sizeof (key), "%s%d.stripe_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->stripe_count);
if (ret)
gf_log (THIS->name, GF_LOG_INFO,
"peer is possibly old version");
@@ -2710,8 +3874,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
/* not having a 'replica_count' key is not a error
(as peer may be of old version) */
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.replica_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->replica_count);
+ snprintf (key, sizeof (key), "%s%d.replica_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->replica_count);
if (ret)
gf_log (THIS->name, GF_LOG_INFO,
"peer is possibly old version");
@@ -2719,15 +3883,16 @@ glusterd_import_volinfo (dict_t *vols, int count,
/* not having a 'dist_count' key is not a error
(as peer may be of old version) */
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.dist_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->dist_leaf_count);
+ snprintf (key, sizeof (key), "%s%d.dist_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->dist_leaf_count);
if (ret)
gf_log (THIS->name, GF_LOG_INFO,
"peer is possibly old version");
-
+ new_volinfo->subvol_count = new_volinfo->brick_count/
+ glusterd_get_dist_leaf_count (new_volinfo);
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.ckusm", count);
- ret = dict_get_uint32 (vols, key, &new_volinfo->cksum);
+ snprintf (key, sizeof (key), "%s%d.ckusm", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &new_volinfo->cksum);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2735,8 +3900,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.volume_id", count);
- ret = dict_get_str (vols, key, &volume_id_str);
+ snprintf (key, sizeof (key), "%s%d.volume_id", prefix, count);
+ ret = dict_get_str (peer_data, key, &volume_id_str);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2746,8 +3911,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
uuid_parse (volume_id_str, new_volinfo->volume_id);
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.username", count);
- ret = dict_get_str (vols, key, &str);
+ snprintf (key, sizeof (key), "%s%d.username", prefix, count);
+ ret = dict_get_str (peer_data, key, &str);
if (!ret) {
ret = glusterd_auth_set_username (new_volinfo, str);
if (ret)
@@ -2755,8 +3920,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.password", count);
- ret = dict_get_str (vols, key, &str);
+ snprintf (key, sizeof (key), "%s%d.password", prefix, count);
+ ret = dict_get_str (peer_data, key, &str);
if (!ret) {
ret = glusterd_auth_set_password (new_volinfo, str);
if (ret)
@@ -2764,8 +3929,29 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.transport_type", count);
- ret = dict_get_uint32 (vols, key, &new_volinfo->transport_type);
+ snprintf (key, sizeof (key), "%s%d.transport_type", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &new_volinfo->transport_type);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload for %s",
+ key, volname);
+ goto out;
+ }
+
+ new_volinfo->is_snap_volume = is_snap_volume;
+
+ snprintf (key, sizeof (key), "%s%d.restored_from_snap", prefix, count);
+ ret = dict_get_str (peer_data, key, &restored_snap);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload for %s",
+ key, volname);
+ goto out;
+ }
+
+ uuid_parse (restored_snap, new_volinfo->restored_from_snap);
+
+ snprintf (key, sizeof (key), "%s%d.snap-max-hard-limit", prefix, count);
+ ret = dict_get_uint64 (peer_data, key,
+ &new_volinfo->snap_max_hard_limit);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2773,42 +3959,47 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.rebalance", count);
- ret = dict_get_uint32 (vols, key, &new_volinfo->rebal.defrag_cmd);
+ snprintf (key, sizeof (key), "%s%d.rebalance", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &new_volinfo->rebal.defrag_cmd);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
goto out;
}
- if (new_volinfo->rebal.defrag_cmd) {
- memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.rebalance-id", count);
- ret = dict_get_str (vols, key, &rebalance_id_str);
- if (ret) {
- /* This is not present in older glusterfs versions,
- * so don't error out
- */
- ret = 0;
- } else {
- uuid_parse (rebalance_id_str,
- new_volinfo->rebal.rebalance_id);
- }
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.rebalance-id", prefix, count);
+ ret = dict_get_str (peer_data, key, &rebalance_id_str);
+ if (ret) {
+ /* This is not present in older glusterfs versions,
+ * so don't error out
+ */
+ ret = 0;
+ } else {
+ uuid_parse (rebalance_id_str, new_volinfo->rebal.rebalance_id);
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.rebalance-op", count);
- ret = dict_get_uint32 (vols, key,(uint32_t *) &new_volinfo->rebal.op);
+ snprintf (key, sizeof (key), "%s%d.rebalance-op", prefix, count);
+ ret = dict_get_uint32 (peer_data, key,
+ (uint32_t *) &new_volinfo->rebal.op);
if (ret) {
/* This is not present in older glusterfs versions,
* so don't error out
*/
ret = 0;
}
+ ret = gd_import_friend_volume_rebal_dict (peer_data, count,
+ new_volinfo);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Failed to import rebalance dict "
+ "for volume.");
+ goto out;
+ }
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_STATUS, count);
- ret = dict_get_int32 (vols, key, &rb_status);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_STATUS, prefix, count);
+ ret = dict_get_int32 (peer_data, key, &rb_status);
if (ret)
goto out;
new_volinfo->rep_brick.rb_status = rb_status;
@@ -2816,9 +4007,9 @@ glusterd_import_volinfo (dict_t *vols, int count,
if (new_volinfo->rep_brick.rb_status > GF_RB_STATUS_NONE) {
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
- count);
- ret = dict_get_str (vols, key, &src_brick);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
+ prefix, count);
+ ret = dict_get_str (peer_data, key, &src_brick);
if (ret)
goto out;
@@ -2831,9 +4022,9 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
- count);
- ret = dict_get_str (vols, key, &dst_brick);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
+ prefix, count);
+ ret = dict_get_str (peer_data, key, &dst_brick);
if (ret)
goto out;
@@ -2846,8 +4037,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.rb_id", count);
- ret = dict_get_str (vols, key, &rb_id_str);
+ snprintf (key, sizeof (key), "%s%d.rb_id", prefix, count);
+ ret = dict_get_str (peer_data, key, &rb_id_str);
if (ret) {
/* This is not present in older glusterfs versions,
* so don't error out
@@ -2859,12 +4050,53 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
- ret = glusterd_import_friend_volume_opts (vols, count, new_volinfo);
+ ret = glusterd_import_friend_volume_opts (peer_data, count,
+ new_volinfo);
+ if (ret)
+ goto out;
+
+ /* Import the volume's op-versions if available else set it to 1.
+ * Not having op-versions implies this information was obtained from a
+ * op-version 1 friend (gluster-3.3), ergo the cluster is at op-version
+ * 1 and all volumes are at op-versions 1.
+ *
+ * Either both the volume op-versions should be absent or both should be
+ * present. Only one being present is a failure
+ */
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.op-version", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &op_version);
+ if (ret)
+ ret = 0;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.client-op-version", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &client_op_version);
if (ret)
+ ret = 0;
+
+ if (op_version && client_op_version) {
+ new_volinfo->op_version = op_version;
+ new_volinfo->client_op_version = client_op_version;
+ } else if (((op_version == 0) && (client_op_version != 0)) ||
+ ((op_version != 0) && (client_op_version == 0))) {
+ ret = -1;
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "Only one volume op-version found");
goto out;
- ret = glusterd_import_bricks (vols, count, new_volinfo);
+ } else {
+ new_volinfo->op_version = 1;
+ new_volinfo->client_op_version = 1;
+ }
+
+ memset (key, 0 ,sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.caps", prefix, count);
+ /*This is not present in older glusterfs versions, so ignore ret value*/
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->caps);
+
+ ret = glusterd_import_bricks (peer_data, count, new_volinfo, prefix);
if (ret)
goto out;
+
*volinfo = new_volinfo;
out:
if (msg[0])
@@ -2938,7 +4170,11 @@ glusterd_volinfo_stop_stale_bricks (glusterd_volinfo_t *new_volinfo,
old_brickinfo->hostname,
old_brickinfo->path,
new_volinfo, &new_brickinfo);
- if (ret) {
+ /* If the brick is stale, i.e it's not a part of the new volume
+ * or if it's part of the new volume and is pending a snap,
+ * then stop the brick process
+ */
+ if (ret || (new_brickinfo->snap_status == -1)) {
/*TODO: may need to switch to 'atomic' flavour of
* brick_stop, once we make peer rpc program also
* synctask enabled*/
@@ -2960,9 +4196,34 @@ int32_t
glusterd_delete_stale_volume (glusterd_volinfo_t *stale_volinfo,
glusterd_volinfo_t *valid_volinfo)
{
+ int32_t ret = -1;
+ glusterd_volinfo_t *temp_volinfo = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+ xlator_t *this = NULL;
+
GF_ASSERT (stale_volinfo);
GF_ASSERT (valid_volinfo);
+ /* Copy snap_volumes list from stale_volinfo to valid_volinfo */
+ valid_volinfo->snap_count = 0;
+ list_for_each_entry_safe (voliter, temp_volinfo,
+ &stale_volinfo->snap_volumes, snapvol_list) {
+ list_add_tail (&voliter->snapvol_list,
+ &valid_volinfo->snap_volumes);
+ valid_volinfo->snap_count++;
+ }
+
+ if ((!uuid_is_null (stale_volinfo->restored_from_snap)) &&
+ (uuid_compare (stale_volinfo->restored_from_snap,
+ valid_volinfo->restored_from_snap))) {
+ ret = glusterd_lvm_snapshot_remove (NULL, stale_volinfo);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Failed to remove lvm snapshot for "
+ "restored volume %s", stale_volinfo->volname);
+ }
+ }
+
/* If stale volume is in started state, copy the port numbers of the
* local bricks if they exist in the valid volume information.
* stop stale bricks. Stale volume information is going to be deleted.
@@ -2993,15 +4254,63 @@ glusterd_delete_stale_volume (glusterd_volinfo_t *stale_volinfo,
(void) glusterd_delete_all_bricks (stale_volinfo);
if (stale_volinfo->shandle) {
unlink (stale_volinfo->shandle->path);
- (void) glusterd_store_handle_destroy (stale_volinfo->shandle);
+ (void) gf_store_handle_destroy (stale_volinfo->shandle);
stale_volinfo->shandle = NULL;
}
- (void) glusterd_volinfo_delete (stale_volinfo);
+ (void) glusterd_volinfo_remove (stale_volinfo);
return 0;
}
+/* This function updates the rebalance information of the new volinfo using the
+ * information from the old volinfo.
+ *
+ * The connection to the old rebalance process, if any, is disconnected first.
+ * If the old volinfo has a non-null rebalance id that differs from the new
+ * one, the old rebalance process is stopped and nothing is copied. Otherwise
+ * the status and counters of the rebalance are copied from old to new.
+ *
+ * Returns 0. No step in this function reports a failure; previously the
+ * function returned -1 on every path because ret was never updated.
+ *
+ * NOTE(review): new->dict is overwritten without being unreferenced. It may
+ * already hold a dict created by gd_import_friend_volume_rebal_dict () during
+ * volume import - confirm whether that reference needs to be dropped here.
+ */
+int
+gd_check_and_update_rebalance_info (glusterd_volinfo_t *old_volinfo,
+                                    glusterd_volinfo_t *new_volinfo)
+{
+        int                   ret = 0;
+        glusterd_rebalance_t *old = NULL;
+        glusterd_rebalance_t *new = NULL;
+
+        GF_ASSERT (old_volinfo);
+        GF_ASSERT (new_volinfo);
+
+        old = &(old_volinfo->rebal);
+        new = &(new_volinfo->rebal);
+
+        //Disconnect from rebalance process
+        if (old->defrag && old->defrag->rpc) {
+                rpc_transport_disconnect (old->defrag->rpc->conn.trans);
+        }
+
+        if (!uuid_is_null (old->rebalance_id) &&
+            uuid_compare (old->rebalance_id, new->rebalance_id)) {
+                /* The tasks differ: stop the old one, copy nothing */
+                (void)gd_stop_rebalance_process (old_volinfo);
+                goto out;
+        }
+
+        /* If the tasks match, copy the status and other information of the
+         * rebalance process from old_volinfo to new_volinfo
+         */
+        new->defrag_status      = old->defrag_status;
+        new->rebalance_files    = old->rebalance_files;
+        new->rebalance_data     = old->rebalance_data;
+        new->lookedup_files     = old->lookedup_files;
+        new->skipped_files      = old->skipped_files;
+        new->rebalance_failures = old->rebalance_failures;
+        new->rebalance_time     = old->rebalance_time;
+        new->dict               = (old->dict ? dict_ref (old->dict) : NULL);
+
+        /* glusterd_rebalance_t.{op, id, defrag_cmd} are copied during volume
+         * import
+         * a new defrag object should come to life with rebalance being restarted
+         */
+out:
+        return ret;
+}
+
int32_t
-glusterd_import_friend_volume (dict_t *vols, size_t count)
+glusterd_import_friend_volume (dict_t *peer_data, size_t count)
{
int32_t ret = -1;
@@ -3010,18 +4319,27 @@ glusterd_import_friend_volume (dict_t *vols, size_t count)
glusterd_volinfo_t *old_volinfo = NULL;
glusterd_volinfo_t *new_volinfo = NULL;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
this = THIS;
GF_ASSERT (this);
priv = this->private;
GF_ASSERT (priv);
- ret = glusterd_import_volinfo (vols, count, &new_volinfo);
+ ret = glusterd_import_volinfo (peer_data, count,
+ &new_volinfo, "volume");
if (ret)
goto out;
+ if (!new_volinfo) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not importing snap volume");
+ goto out;
+ }
+
ret = glusterd_volinfo_find (new_volinfo->volname, &old_volinfo);
if (0 == ret) {
+ (void) gd_check_and_update_rebalance_info (old_volinfo,
+ new_volinfo);
(void) glusterd_delete_stale_volume (old_volinfo, new_volinfo);
}
@@ -3034,29 +4352,33 @@ glusterd_import_friend_volume (dict_t *vols, size_t count)
if (ret)
goto out;
- gd_update_volume_op_versions (new_volinfo);
+ ret = glusterd_import_quota_conf (peer_data, count,
+ new_volinfo, "volume");
+ if (ret)
+ goto out;
- list_add_tail (&new_volinfo->vol_list, &priv->volumes);
+ list_add_order (&new_volinfo->vol_list, &priv->volumes,
+ glusterd_compare_volume_name);
out:
gf_log ("", GF_LOG_DEBUG, "Returning with ret: %d", ret);
return ret;
}
int32_t
-glusterd_import_friend_volumes (dict_t *vols)
+glusterd_import_friend_volumes (dict_t *peer_data)
{
int32_t ret = -1;
int32_t count = 0;
int i = 1;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
- ret = dict_get_int32 (vols, "count", &count);
+ ret = dict_get_int32 (peer_data, "count", &count);
if (ret)
goto out;
while (i <= count) {
- ret = glusterd_import_friend_volume (vols, i);
+ ret = glusterd_import_friend_volume (peer_data, i);
if (ret)
goto out;
i++;
@@ -3157,24 +4479,840 @@ out:
}
+/* Perform a single missed snapshot operation on this node.
+ *
+ * snap : snapshot the operation applies to
+ * op   : GF_SNAP_OPTION_TYPE_DELETE or GF_SNAP_OPTION_TYPE_RESTORE; any
+ *        other value is logged as an invalid missed snap entry
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
int32_t
-glusterd_compare_friend_data (dict_t *vols, int32_t *status, char *hostname)
+glusterd_perform_missed_op (glusterd_snap_t *snap, int32_t op)
{
- int32_t ret = -1;
- int32_t count = 0;
- int i = 1;
- gf_boolean_t update = _gf_false;
- gf_boolean_t stale_nfs = _gf_false;
- gf_boolean_t stale_shd = _gf_false;
+        dict_t              *dict         = NULL;
+        int32_t              ret          = -1;
+        glusterd_conf_t     *priv         = NULL;
+        glusterd_volinfo_t  *snap_volinfo = NULL;
+        glusterd_volinfo_t  *volinfo      = NULL;
+        xlator_t            *this         = NULL;
+        uuid_t               null_uuid    = {0};
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+        GF_ASSERT (snap);
+
+        /* Scratch dict handed to the remove/restore helpers */
+        dict = dict_new();
+        if (!dict) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to create dict");
+                ret = -1;
+                goto out;
+        }
+
+        switch (op) {
+        case GF_SNAP_OPTION_TYPE_DELETE:
+                ret = glusterd_snap_remove (dict, snap, _gf_true, _gf_false);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to remove snap");
+                        goto out;
+                }
+
+                break;
+        case GF_SNAP_OPTION_TYPE_RESTORE:
+                /* TODO : As of now there is only one volume in a snapshot.
+                 * Change this when multiple volume snapshot is introduced
+                 */
+                snap_volinfo = list_entry (snap->volumes.next,
+                                           glusterd_volinfo_t, vol_list);
+
+                /* Find the parent volinfo */
+                ret = glusterd_volinfo_find (snap_volinfo->parent_volname,
+                                             &volinfo);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Could not get volinfo of %s",
+                                snap_volinfo->parent_volname);
+                        goto out;
+                }
+
+                /* Bump down the original volinfo's version, because it
+                 * would have been incremented already due to the volume
+                 * handshake
+                 */
+                volinfo->version--;
+                uuid_copy (volinfo->restored_from_snap, null_uuid);
+
+                /* Perform the restore */
+                ret = gd_restore_snap_volume (dict, volinfo, snap_volinfo);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to restore "
+                                "snap for %s", snap->snapname);
+                        /* Undo the version decrement made above */
+                        volinfo->version++;
+                        goto out;
+                }
+
+                break;
+        default:
+                /* Only delete and restore entries are performed here;
+                 * anything else is reported as invalid
+                 */
+                gf_log (this->name, GF_LOG_ERROR, "Invalid missed snap entry");
+                ret = -1;
+                goto out;
+        }
+
+out:
+        /* dict is still NULL when dict_new () failed; only drop a
+         * reference that was actually taken
+         */
+        if (dict)
+                dict_unref (dict);
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Perform the missed deletes and restores that are pending on this node.
+ *
+ * Walks priv->missed_snaps_list. For every entry whose node_uuid matches
+ * this node and whose snapshot is still present locally, the first op that
+ * is neither a create nor already GD_MISSED_SNAP_DONE is performed; that op
+ * and every later such op of the same entry are then marked
+ * GD_MISSED_SNAP_DONE.
+ *
+ * Returns 0 on success, or the non-zero result of the first op that fails.
+ */
+int32_t
+glusterd_perform_missed_snap_ops ()
+{
+        int32_t                      ret             = -1;
+        int32_t                      op_status       = -1;
+        glusterd_conf_t             *priv            = NULL;
+        glusterd_missed_snap_info   *missed_snapinfo = NULL;
+        glusterd_snap_op_t          *snap_opinfo     = NULL;
+        glusterd_snap_t             *snap            = NULL;
+        uuid_t                       snap_uuid       = {0,};
+        xlator_t                    *this            = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        priv = this->private;
+        GF_ASSERT (priv);
+
+        list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list,
+                             missed_snaps) {
+                /* If the pending snap_op is not for this node then continue */
+                if (strcmp (missed_snapinfo->node_uuid, uuid_utoa (MY_UUID)))
+                        continue;
+
+                /* Find the snap id. snap_uuid is shared by all iterations,
+                 * so an id that fails to parse must be skipped instead of
+                 * being looked up with the previous entry's value.
+                 */
+                if (uuid_parse (missed_snapinfo->snap_uuid, snap_uuid)) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "Invalid snap_uuid %s in missed snap entry",
+                                missed_snapinfo->snap_uuid);
+                        continue;
+                }
+
+                snap = glusterd_find_snap_by_id (snap_uuid);
+                if (!snap) {
+                        /* If the snap is not found, then a delete or a
+                         * restore can't be pending on that snap_uuid.
+                         */
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "Not a pending delete or restore op");
+                        continue;
+                }
+
+                op_status = GD_MISSED_SNAP_PENDING;
+                list_for_each_entry (snap_opinfo, &missed_snapinfo->snap_ops,
+                                     snap_ops_list) {
+                        /* If the snap_op is create or its status is
+                         * GD_MISSED_SNAP_DONE then continue
+                         */
+                        if ((snap_opinfo->status == GD_MISSED_SNAP_DONE) ||
+                            (snap_opinfo->op == GF_SNAP_OPTION_TYPE_CREATE))
+                                continue;
+
+                        /* Perform the actual op for the first time for
+                         * this snap, and mark the snap_status as
+                         * GD_MISSED_SNAP_DONE. For other entries for the same
+                         * snap, just mark the entry as done.
+                         */
+                        if (op_status == GD_MISSED_SNAP_PENDING) {
+                                ret = glusterd_perform_missed_op
+                                                        (snap,
+                                                         snap_opinfo->op);
+                                if (ret) {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "Failed to perform missed snap op");
+                                        goto out;
+                                }
+                                op_status = GD_MISSED_SNAP_DONE;
+                        }
+
+                        snap_opinfo->status = GD_MISSED_SNAP_DONE;
+                }
+        }
+
+        ret = 0;
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Merge the peer's missed snapshot entries into the local list.
+ *
+ * Reads "missed_snap_count" from peer_data; when that lookup fails there
+ * is nothing to import and 0 is returned. Otherwise the peer's entries are
+ * added to the in-memory list, the operations pending on this node are
+ * attempted, and the list is persisted. A failure while performing the
+ * operations is only logged, because the list still has to be stored.
+ *
+ * Returns 0 on success, non-zero if adding or storing the entries fails.
+ */
+int32_t
+glusterd_import_friend_missed_snap_list (dict_t *peer_data)
+{
+        xlator_t         *this  = THIS;
+        glusterd_conf_t  *conf  = NULL;
+        int32_t           count = -1;
+        int32_t           ret   = -1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (peer_data);
+
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        /* Add the friend's missed_snaps entries to the in-memory list */
+        ret = dict_get_int32 (peer_data, "missed_snap_count", &count);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "No missed snaps");
+                ret = 0;
+                goto done;
+        }
+
+        ret = glusterd_add_missed_snaps_to_list (peer_data, count);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to add missed snaps to list");
+                goto done;
+        }
+
+        /* Deliberately not bailing out on failure here: some of the
+         * missed ops may already have been performed, and the current
+         * state of the list has to be persisted either way.
+         */
+        if (glusterd_perform_missed_snap_ops () != 0)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to perform snap operations");
+
+        ret = glusterd_store_update_missed_snaps ();
+        if (ret != 0)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to update missed_snaps_list");
+
+done:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Look up peer_snap_name among the local snapshots and classify the result.
+ *
+ *   local snap found, snap_id differs : *conflict = _gf_true,
+ *                                       *snap = the conflicting local snap
+ *   local snap found, snap_id matches : *conflict = _gf_false,
+ *                                       *snap = the existing local snap
+ *   no local snap with that name      : *conflict = _gf_false,
+ *                                       *snap = NULL
+ *
+ * hostname is only used in the log messages.
+ */
+void
+glusterd_is_peer_snap_conflicting (char *peer_snap_name, char *peer_snap_id,
+                                   gf_boolean_t *conflict,
+                                   glusterd_snap_t **snap, char *hostname)
+{
+        uuid_t     remote_id = {0,};
+        xlator_t  *this      = THIS;
+
+        GF_ASSERT (this);
+        GF_ASSERT (peer_snap_name);
+        GF_ASSERT (peer_snap_id);
+        GF_ASSERT (conflict);
+        GF_ASSERT (snap);
+        GF_ASSERT (hostname);
+
+        *snap = glusterd_find_snap_by_name (peer_snap_name);
+        if (*snap == NULL) {
+                /* Peer contains a snapshot missing on the current node */
+                gf_log (this->name, GF_LOG_INFO,
+                        "Snapshot %s from peer %s missing on localhost",
+                        peer_snap_name, hostname);
+                *conflict = _gf_false;
+                return;
+        }
+
+        uuid_parse (peer_snap_id, remote_id);
+        if (uuid_compare (remote_id, (*snap)->snap_id) != 0) {
+                /* Same snapname locally, but a different snap_id */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Snapshot %s from peer %s conflicts with "
+                        "snapshot in localhost", peer_snap_name,
+                        hostname);
+                *conflict = _gf_true;
+                return;
+        }
+
+        /* Same snapname and same snap_id: the snap is already here */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "Snapshot %s from peer %s present in "
+                "localhost", peer_snap_name, hostname);
+        *conflict = _gf_false;
+}
+
+/* Report whether any brick of any volume in the given snapshot carries
+ * this node's uuid (MY_UUID).
+ *
+ * Returns _gf_true as soon as one such brick is found, _gf_false otherwise.
+ */
+gf_boolean_t
+glusterd_are_snap_bricks_local (glusterd_snap_t *snap)
+{
+        xlator_t              *this        = THIS;
+        glusterd_volinfo_t    *snap_vol    = NULL;
+        glusterd_brickinfo_t  *brick       = NULL;
+        gf_boolean_t           hosts_brick = _gf_false;
+
+        GF_ASSERT (this);
+        GF_ASSERT (snap);
+
+        list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
+                list_for_each_entry (brick, &snap_vol->bricks, brick_list) {
+                        if (uuid_compare (brick->uuid, MY_UUID) == 0) {
+                                hosts_brick = _gf_true;
+                                break;
+                        }
+                }
+
+                /* One local brick is enough; stop scanning the volumes */
+                if (hosts_brick == _gf_true)
+                        break;
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", hosts_brick);
+        return hosts_brick;
+}
+
+/* Report whether the missed snaps list holds a delete for peer_snap_id
+ * that is still pending on the given peer.
+ *
+ * An entry matches when its node_uuid equals the peer's uuid string and
+ * its snap_uuid equals peer_snap_id; among its ops, one with op
+ * GF_SNAP_OPTION_TYPE_DELETE and status GD_MISSED_SNAP_PENDING is enough.
+ *
+ * Returns _gf_true if such an op exists, _gf_false otherwise.
+ */
+gf_boolean_t
+glusterd_peer_has_missed_snap_delete (glusterd_peerinfo_t *peerinfo,
+                                      char *peer_snap_id)
+{
+        xlator_t                   *this      = THIS;
+        glusterd_conf_t            *conf      = NULL;
+        glusterd_missed_snap_info  *entry     = NULL;
+        glusterd_snap_op_t         *pending   = NULL;
+        char                       *peer_uuid = NULL;
+        gf_boolean_t                found     = _gf_false;
+
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+        GF_ASSERT (peerinfo);
+        GF_ASSERT (peer_snap_id);
+
+        peer_uuid = uuid_utoa (peerinfo->uuid);
+
+        list_for_each_entry (entry, &conf->missed_snaps_list, missed_snaps) {
+                /* Only entries for this peer and this snap_id matter */
+                if (strcmp (peer_uuid, entry->node_uuid) != 0)
+                        continue;
+                if (strcmp (peer_snap_id, entry->snap_uuid) != 0)
+                        continue;
+
+                /* Look for a delete op that is still pending */
+                list_for_each_entry (pending, &entry->snap_ops,
+                                     snap_ops_list) {
+                        if (pending->op != GF_SNAP_OPTION_TYPE_DELETE)
+                                continue;
+                        if (pending->status != GD_MISSED_SNAP_PENDING)
+                                continue;
+
+                        found = _gf_true;
+                        goto done;
+                }
+        }
+
+done:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", found);
+        return found;
+}
+
+/* Generate and store the volfiles for an imported snap volume, then bring
+ * its local bricks up.
+ *
+ * Steps, in order: store the volinfo, generate the brick volfiles and the
+ * trusted and regular client volfiles, attach the snap volume to its parent
+ * volume, start every brick that belongs to this node (bricks whose
+ * snap_status is -1 are left alone), mark the volume
+ * GLUSTERD_STATUS_STARTED and store the volinfo again.
+ *
+ * snap_vol       : imported snap volume
+ * peer_snap_name : snapshot name, used in log messages only
+ *
+ * Returns 0 on success, non-zero on the first step that fails.
+ */
+int32_t
+glusterd_gen_snap_volfiles (glusterd_volinfo_t *snap_vol, char *peer_snap_name)
+{
+        int32_t                ret            = -1;
+        xlator_t              *this           = NULL;
+        glusterd_volinfo_t    *parent_volinfo = NULL;
+        glusterd_brickinfo_t  *brickinfo      = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (snap_vol);
+        GF_ASSERT (peer_snap_name);
+
+        ret = glusterd_store_volinfo (snap_vol, GLUSTERD_VOLINFO_VER_AC_NONE);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to store snapshot "
+                        "volinfo (%s) for snap %s", snap_vol->volname,
+                        peer_snap_name);
+                goto out;
+        }
+
+        ret = generate_brick_volfiles (snap_vol);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "generating the brick volfiles for the "
+                        "snap %s failed", peer_snap_name);
+                goto out;
+        }
+
+        ret = generate_client_volfiles (snap_vol, GF_CLIENT_TRUSTED);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "generating the trusted client volfiles for "
+                        "the snap %s failed", peer_snap_name);
+                goto out;
+        }
+
+        ret = generate_client_volfiles (snap_vol, GF_CLIENT_OTHER);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "generating the client volfiles for the "
+                        "snap %s failed", peer_snap_name);
+                goto out;
+        }
- GF_ASSERT (vols);
+        ret = glusterd_volinfo_find (snap_vol->parent_volname,
+                                     &parent_volinfo);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Parent volinfo "
+                        "not found for %s volume of snap %s",
+                        snap_vol->volname, peer_snap_name);
+                goto out;
+        }
+
+        glusterd_list_add_snapvol (parent_volinfo, snap_vol);
+
+        list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
+                /* Only bricks carrying this node's uuid are started here */
+                if (uuid_compare (brickinfo->uuid, MY_UUID))
+                        continue;
+
+                if (brickinfo->snap_status == -1) {
+                        gf_log (this->name, GF_LOG_INFO,
+                                "not starting snap brick %s:%s "
+                                "for the snap %s (volume: %s)",
+                                brickinfo->hostname, brickinfo->path,
+                                peer_snap_name, parent_volinfo->volname);
+                        continue;
+                }
+
+                ret = glusterd_brick_start (snap_vol, brickinfo, _gf_true);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_WARNING, "starting the "
+                                "brick %s:%s for the snap %s (volume: %s) "
+                                "failed", brickinfo->hostname, brickinfo->path,
+                                peer_snap_name, parent_volinfo->volname);
+                        goto out;
+                }
+        }
+
+        snap_vol->status = GLUSTERD_STATUS_STARTED;
+
+        /* Persist the status change made above */
+        ret = glusterd_store_volinfo (snap_vol, GLUSTERD_VOLINFO_VER_AC_NONE);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to store snap volinfo");
+                goto out;
+        }
+out:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Import one snapshot from peer_data and add it to priv->snapshots.
+ *
+ * peer_data      : dict received from the peer; the snapshot's keys are
+ *                  read under the prefix "snap<snap_count>"
+ * snap_count     : index of the snapshot inside peer_data
+ * peer_snap_name : name given to the new snap object
+ * peer_snap_id   : uuid string parsed into the new snap object's snap_id
+ *
+ * For every volume of the snapshot the volinfo is imported, its volfiles
+ * are generated and its quota conf is imported; finally the snap object is
+ * stored. If any step fails after the snap object was created, the
+ * partially imported snapshot is removed again.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int32_t
+glusterd_import_friend_snap (dict_t *peer_data, int32_t snap_count,
+                             char *peer_snap_name, char *peer_snap_id)
+{
+        char                 buf[NAME_MAX]    = "";
+        char                 prefix[NAME_MAX] = "";
+        dict_t              *dict             = NULL;
+        glusterd_snap_t     *snap             = NULL;
+        glusterd_volinfo_t  *snap_vol         = NULL;
+        glusterd_conf_t     *priv             = NULL;
+        int32_t              ret              = -1;
+        int32_t              volcount         = -1;
+        int32_t              i                = -1;
+        int                  len              = 0;
+        int8_t               snap_restored    = 0;
+        xlator_t            *this             = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        priv = this->private;
+        GF_ASSERT (priv);
+        GF_ASSERT (peer_data);
+        GF_ASSERT (peer_snap_name);
+        GF_ASSERT (peer_snap_id);
+
+        snprintf (prefix, sizeof(prefix), "snap%d", snap_count);
+
+        /* Scratch dict for glusterd_snap_remove () in the rollback path.
+         * It is created before the snap object so that a failure here
+         * leaves nothing to roll back.
+         */
+        dict = dict_new ();
+        if (!dict) {
+                gf_log (this->name, GF_LOG_ERROR, "Unable to create dict");
+                ret = -1;
+                goto out;
+        }
+
+        snap = glusterd_new_snap_object ();
+        if (!snap) {
+                gf_log (this->name, GF_LOG_ERROR, "Could not create "
+                        "the snap object for snap %s", peer_snap_name);
+                goto out;
+        }
+
+        /* Bounded copy of the peer supplied name.
+         * NOTE(review): sizeof assumes snapname is a fixed-size char array
+         * member of glusterd_snap_t - confirm against its declaration.
+         */
+        len = snprintf (snap->snapname, sizeof (snap->snapname), "%s",
+                        peer_snap_name);
+        if ((len < 0) || ((size_t) len >= sizeof (snap->snapname))) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Snap name %s is too long", peer_snap_name);
+                ret = -1;
+                goto out;
+        }
+
+        if (uuid_parse (peer_snap_id, snap->snap_id)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Invalid snap_id %s for snap %s",
+                        peer_snap_id, peer_snap_name);
+                ret = -1;
+                goto out;
+        }
+
+        /* The result of this lookup is overwritten by the next one, so a
+         * missing key does not abort the import.
+         */
+        snprintf (buf, sizeof(buf), "%s.snapid", prefix);
+        ret = dict_get_str (peer_data, buf, &snap->description);
+
+        snprintf (buf, sizeof(buf), "%s.time_stamp", prefix);
+        ret = dict_get_int64 (peer_data, buf, &snap->time_stamp);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to get time_stamp for snap %s",
+                        peer_snap_name);
+                goto out;
+        }
+
+        /* Read into a real int8_t and assign, instead of writing one byte
+         * through a cast pointer to the destination field.
+         */
+        snprintf (buf, sizeof(buf), "%s.snap_restored", prefix);
+        ret = dict_get_int8 (peer_data, buf, &snap_restored);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to get snap_restored for snap %s",
+                        peer_snap_name);
+                goto out;
+        }
+        snap->snap_restored = snap_restored;
+
+        snprintf (buf, sizeof(buf), "%s.snap_status", prefix);
+        ret = dict_get_int32 (peer_data, buf, (int32_t *) &snap->snap_status);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to get snap_status for snap %s",
+                        peer_snap_name);
+                goto out;
+        }
+
+        snprintf (buf, sizeof(buf), "%s.volcount", prefix);
+        ret = dict_get_int32 (peer_data, buf, &volcount);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to get volcount for snap %s",
+                        peer_snap_name);
+                goto out;
+        }
+
+        ret = glusterd_store_create_snap_dir (snap);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to create snap dir");
+                goto out;
+        }
+
+        list_add_order (&snap->snap_list, &priv->snapshots,
+                        glusterd_compare_snap_time);
+
+        for (i = 1; i <= volcount; i++) {
+                ret = glusterd_import_volinfo (peer_data, i,
+                                               &snap_vol, prefix);
+                /* The import can succeed without handing back a volinfo;
+                 * treat that as a failure instead of dereferencing NULL.
+                 */
+                if (ret || !snap_vol) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to import snap volinfo for "
+                                "snap %s", peer_snap_name);
+                        ret = -1;
+                        goto out;
+                }
+
+                snap_vol->snapshot = snap;
+
+                ret = glusterd_gen_snap_volfiles (snap_vol, peer_snap_name);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to generate snap vol files "
+                                "for snap %s", peer_snap_name);
+                        goto out;
+                }
+
+                ret = glusterd_import_quota_conf (peer_data, i,
+                                                  snap_vol, prefix);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to import quota conf "
+                                "for snap %s", peer_snap_name);
+                        goto out;
+                }
+
+                snap_vol = NULL;
+        }
+
+        ret = glusterd_store_snap (snap);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "Could not store snap "
+                        "object %s", peer_snap_name);
+                goto out;
+        }
+
+out:
+        /* Roll back a partially imported snapshot; there is nothing to
+         * remove when the snap object was never created.
+         */
+        if (ret && snap)
+                glusterd_snap_remove (dict, snap,
+                                      _gf_true, _gf_true);
+
+        if (dict)
+                dict_unref (dict);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* During a peer handshake, once the volumes and the list of missed
+ * snapshots have been synced, the node performs the deletes and restores
+ * pending on that list, so its own snapshot list is up to date when the
+ * snapshots are compared. If a conflict shows up during the snapshot
+ * handshake, the peer hosting bricks for the snapshot is given precedence.
+ * If both peers are in the same state, i.e. both host bricks or neither
+ * does, no decision can be taken and the peer is rejected.
+ *
+ * glusterd_compare_and_update_snap() implements the following algorithm
+ * for the snapshot stored under "snap<snap_count>" in peer_data:
+ * Step  1: Start.
+ * Step  2: Check if the peer is missing a delete on the said snap.
+ *          If yes, goto step 6.
+ * Step  3: Check if there is a conflict between the peer's data and the
+ *          local snap. If no, goto step 5.
+ * Step  4: As there is a conflict, check if both the peer and the local
+ *          node are hosting bricks. Based on the results perform the
+ *          following:
+ *          Peer Hosts Bricks    Local Node Hosts Bricks    Action
+ *                Yes                     Yes               Goto Step 7
+ *                No                      No                Goto Step 7
+ *                Yes                     No                Goto Step 8
+ *                No                      Yes               Goto Step 6
+ * Step  5: Check if the local node is missing the peer's data.
+ *          If yes, goto step 9.
+ * Step  6: It's a no-op. Goto step 10
+ * Step  7: Peer Reject. Goto step 10
+ * Step  8: Delete local node's data.
+ * Step  9: Accept Peer Data.
+ * Step 10: Stop
+ *
+ * Returns 0 for a no-op or a successful import, non-zero for a peer
+ * reject or any failure.
+ */
+int32_t
+glusterd_compare_and_update_snap (dict_t *peer_data, int32_t snap_count,
+                                  glusterd_peerinfo_t *peerinfo)
+{
+        char              buf[NAME_MAX]    = "";
+        char              prefix[NAME_MAX] = "";
+        char             *peer_snap_name   = NULL;
+        char             *peer_snap_id     = NULL;
+        dict_t           *dict             = NULL;
+        glusterd_snap_t  *snap             = NULL;
+        gf_boolean_t      conflict         = _gf_false;
+        gf_boolean_t      is_local         = _gf_false;
+        gf_boolean_t      is_hosted        = _gf_false;
+        gf_boolean_t      missed_delete    = _gf_false;
+        int8_t            host_bricks      = 0;
+        int32_t           ret              = -1;
+        xlator_t         *this             = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (peer_data);
+        GF_ASSERT (peerinfo);
+
+        snprintf (prefix, sizeof(prefix), "snap%d", snap_count);
+
+        /* Fetch the peer's snapname */
+        snprintf (buf, sizeof(buf), "%s.snapname", prefix);
+        ret = dict_get_str (peer_data, buf, &peer_snap_name);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to fetch snapname from peer: %s",
+                        peerinfo->hostname);
+                goto out;
+        }
+
+        /* Fetch the peer's snap_id */
+        snprintf (buf, sizeof(buf), "%s.snap_id", prefix);
+        ret = dict_get_str (peer_data, buf, &peer_snap_id);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to fetch snap_id from peer: %s",
+                        peerinfo->hostname);
+                goto out;
+        }
+
+        /* Check if the peer has missed a snap delete for the
+         * snap in question
+         */
+        missed_delete = glusterd_peer_has_missed_snap_delete (peerinfo,
+                                                              peer_snap_id);
+        if (missed_delete == _gf_true) {
+                /* Peer has missed delete on the missing/conflicting snap_id */
+                gf_log (this->name, GF_LOG_INFO, "Peer %s has missed a delete "
+                        "on snap %s", peerinfo->hostname, peer_snap_name);
+                ret = 0;
+                goto out;
+        }
+
+        /* Check if there is a conflict, and if the
+         * peer data is already present
+         */
+        glusterd_is_peer_snap_conflicting (peer_snap_name, peer_snap_id,
+                                           &conflict, &snap,
+                                           peerinfo->hostname);
+        if (conflict == _gf_false) {
+                if (snap) {
+                        /* Peer has snap with the same snapname
+                         * and snap_id. No need to accept peer data
+                         */
+                        ret = 0;
+                        goto out;
+                } else {
+                        /* Peer has a snap which the local node
+                         * doesn't have.
+                         */
+                        goto accept_peer_data;
+                }
+        }
+
+        /* There is a conflict. Check if the current node is
+         * hosting bricks for the conflicted snap.
+         */
+        is_local = glusterd_are_snap_bricks_local (snap);
+
+        /* Check if the peer is hosting any bricks for the
+         * conflicting snap. The value is read into a real int8_t and
+         * normalised, instead of writing a single byte through a cast
+         * pointer into the wider gf_boolean_t, so that the comparison
+         * with is_local below does not depend on byte order.
+         */
+        snprintf (buf, sizeof(buf), "%s.host_bricks", prefix);
+        ret = dict_get_int8 (peer_data, buf, &host_bricks);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to fetch host_bricks from peer: %s "
+                        "for %s", peerinfo->hostname, peer_snap_name);
+                goto out;
+        }
+        is_hosted = host_bricks ? _gf_true : _gf_false;
+
+        /* As there is a conflict at this point of time, the data of the
+         * node that hosts a brick takes precedence. If both the local
+         * node and the peer are in the same state, i.e if both of them
+         * are either hosting or not hosting the bricks, for the snap,
+         * then it's a peer reject
+         */
+        if (is_hosted == is_local) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Conflict in snapshot %s with peer %s",
+                        peer_snap_name, peerinfo->hostname);
+                ret = -1;
+                goto out;
+        }
+
+        if (is_hosted == _gf_false) {
+                /* If there was a conflict, and the peer is not hosting
+                 * any brick, then don't accept peer data
+                 */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Peer doesn't hosts bricks for conflicting "
+                        "snap(%s). Not accepting peer data.",
+                        peer_snap_name);
+                ret = 0;
+                goto out;
+        }
+
+        /* The peer is hosting a brick in case of conflict
+         * And local node isn't. Hence remove local node's
+         * data and accept peer data
+         */
+
+        gf_log (this->name, GF_LOG_DEBUG, "Peer hosts bricks for conflicting "
+                "snap(%s). Removing local data. Accepting peer data.",
+                peer_snap_name);
+
+        dict = dict_new();
+        if (!dict) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to create dict");
+                ret = -1;
+                goto out;
+        }
+
+        ret = glusterd_snap_remove (dict, snap, _gf_true, _gf_false);
+        if (ret) {
+                /* The local snap was looked up by peer_snap_name, so log
+                 * that name rather than reading the snap object after a
+                 * failed removal.
+                 */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to remove snap %s", peer_snap_name);
+                goto out;
+        }
+
+accept_peer_data:
+
+        /* Accept Peer Data */
+        ret = glusterd_import_friend_snap (peer_data, snap_count,
+                                           peer_snap_name, peer_snap_id);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to import snap %s from peer %s",
+                        peer_snap_name, peerinfo->hostname);
+                goto out;
+        }
+
+out:
+        if (dict)
+                dict_unref (dict);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Compare every snapshot listed in peer_data with the snapshots of the
+ * current node.
+ *
+ * Reads "snap_count" from peer_data and runs
+ * glusterd_compare_and_update_snap () for indices 1..snap_count, stopping
+ * at the first failure.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int32_t
+glusterd_compare_friend_snapshots (dict_t *peer_data,
+                                   glusterd_peerinfo_t *peerinfo)
+{
+        xlator_t  *this       = THIS;
+        int32_t    ret        = -1;
+        int32_t    snap_count = 0;
+        int        idx        = 1;
+
+        GF_ASSERT (this);
+        GF_ASSERT (peer_data);
+        GF_ASSERT (peerinfo);
+
+        ret = dict_get_int32 (peer_data, "snap_count", &snap_count);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to fetch snap_count");
+                goto done;
+        }
+
+        /* Compare one snapshot from peer_data at a time */
+        while (idx <= snap_count) {
+                ret = glusterd_compare_and_update_snap (peer_data, idx,
+                                                        peerinfo);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to compare snapshots with peer %s",
+                                peerinfo->hostname);
+                        break;
+                }
+                idx++;
+        }
+
+done:
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+int32_t
+glusterd_compare_friend_data (dict_t *peer_data, int32_t *status,
+ char *hostname)
+{
+ int32_t ret = -1;
+ int32_t count = 0;
+ int i = 1;
+ gf_boolean_t update = _gf_false;
+ gf_boolean_t stale_nfs = _gf_false;
+ gf_boolean_t stale_shd = _gf_false;
+ gf_boolean_t stale_qd = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_data);
GF_ASSERT (status);
- ret = dict_get_int32 (vols, "count", &count);
+ ret = dict_get_int32 (peer_data, "count", &count);
if (ret)
goto out;
while (i <= count) {
- ret = glusterd_compare_friend_volume (vols, i, status,
+ ret = glusterd_compare_friend_volume (peer_data, i, status,
hostname);
if (ret)
goto out;
@@ -3194,10 +5332,12 @@ glusterd_compare_friend_data (dict_t *vols, int32_t *status, char *hostname)
stale_nfs = _gf_true;
if (glusterd_is_nodesvc_running ("glustershd"))
stale_shd = _gf_true;
- ret = glusterd_import_global_opts (vols);
+ if (glusterd_is_nodesvc_running ("quotad"))
+ stale_qd = _gf_true;
+ ret = glusterd_import_global_opts (peer_data);
if (ret)
goto out;
- ret = glusterd_import_friend_volumes (vols);
+ ret = glusterd_import_friend_volumes (peer_data);
if (ret)
goto out;
if (_gf_false == glusterd_are_all_volumes_stopped ()) {
@@ -3207,50 +5347,17 @@ glusterd_compare_friend_data (dict_t *vols, int32_t *status, char *hostname)
glusterd_nfs_server_stop ();
if (stale_shd)
glusterd_shd_stop ();
+ if (stale_qd)
+ glusterd_quotad_stop ();
}
}
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with ret: %d, status: %d",
- ret, *status);
-
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Returning with ret: %d, status: %d", ret, *status);
return ret;
}
-/* Valid only in if service is 'local' to glusterd.
- * pid can be -1, if reading pidfile failed */
-gf_boolean_t
-glusterd_is_service_running (char *pidfile, int *pid)
-{
- FILE *file = NULL;
- gf_boolean_t running = _gf_false;
- int ret = 0;
- int fno = 0;
-
- file = fopen (pidfile, "r+");
- if (!file)
- goto out;
-
- fno = fileno (file);
- ret = lockf (fno, F_TEST, 0);
- if (ret == -1)
- running = _gf_true;
- if (!pid)
- goto out;
-
- ret = fscanf (file, "%d", pid);
- if (ret <= 0) {
- gf_log ("", GF_LOG_ERROR, "Unable to read pidfile: %s, %s",
- pidfile, strerror (errno));
- *pid = -1;
- }
-
-out:
- if (file)
- fclose (file);
- return running;
-}
-
void
glusterd_get_nodesvc_dir (char *server, char *workdir,
char *path, size_t len)
@@ -3289,7 +5396,10 @@ glusterd_get_nodesvc_volfile (char *server, char *workdir,
GF_ASSERT (len == PATH_MAX);
glusterd_get_nodesvc_dir (server, workdir, dir, sizeof (dir));
- snprintf (volfile, len, "%s/%s-server.vol", dir, server);
+ if (strcmp ("quotad", server) != 0)
+ snprintf (volfile, len, "%s/%s-server.vol", dir, server);
+ else
+ snprintf (volfile, len, "%s/%s.vol", dir, server);
}
void
@@ -3302,11 +5412,14 @@ glusterd_nodesvc_set_online_status (char *server, gf_boolean_t status)
GF_ASSERT (priv);
GF_ASSERT (priv->shd);
GF_ASSERT (priv->nfs);
+ GF_ASSERT (priv->quotad);
if (!strcmp("glustershd", server))
priv->shd->online = status;
else if (!strcmp ("nfs", server))
priv->nfs->online = status;
+ else if (!strcmp ("quotad", server))
+ priv->quotad->online = status;
}
gf_boolean_t
@@ -3320,11 +5433,14 @@ glusterd_is_nodesvc_online (char *server)
GF_ASSERT (conf);
GF_ASSERT (conf->shd);
GF_ASSERT (conf->nfs);
+ GF_ASSERT (conf->quotad);
if (!strcmp (server, "glustershd"))
online = conf->shd->online;
else if (!strcmp (server, "nfs"))
online = conf->nfs->online;
+ else if (!strcmp (server, "quotad"))
+ online = conf->quotad->online;
return online;
}
@@ -3350,6 +5466,7 @@ glusterd_pending_node_get_rpc (glusterd_pending_node_t *pending_node)
nodesrv_t *shd = NULL;
glusterd_volinfo_t *volinfo = NULL;
nodesrv_t *nfs = NULL;
+ nodesrv_t *quotad = NULL;
GF_VALIDATE_OR_GOTO (THIS->name, pending_node, out);
GF_VALIDATE_OR_GOTO (THIS->name, pending_node->node, out);
@@ -3371,6 +5488,10 @@ glusterd_pending_node_get_rpc (glusterd_pending_node_t *pending_node)
nfs = pending_node->node;
rpc = nfs->rpc;
+ } else if (pending_node->type == GD_NODE_QUOTAD) {
+ quotad = pending_node->node;
+ rpc = quotad->rpc;
+
} else {
GF_ASSERT (0);
}
@@ -3390,11 +5511,14 @@ glusterd_nodesvc_get_rpc (char *server)
GF_ASSERT (priv);
GF_ASSERT (priv->shd);
GF_ASSERT (priv->nfs);
+ GF_ASSERT (priv->quotad);
if (!strcmp (server, "glustershd"))
rpc = priv->shd->rpc;
else if (!strcmp (server, "nfs"))
rpc = priv->nfs->rpc;
+ else if (!strcmp (server, "quotad"))
+ rpc = priv->quotad->rpc;
return rpc;
}
@@ -3412,11 +5536,14 @@ glusterd_nodesvc_set_rpc (char *server, struct rpc_clnt *rpc)
GF_ASSERT (priv);
GF_ASSERT (priv->shd);
GF_ASSERT (priv->nfs);
+ GF_ASSERT (priv->quotad);
if (!strcmp ("glustershd", server))
priv->shd->rpc = rpc;
else if (!strcmp ("nfs", server))
priv->nfs->rpc = rpc;
+ else if (!strcmp ("quotad", server))
+ priv->quotad->rpc = rpc;
return ret;
}
@@ -3457,20 +5584,19 @@ int32_t
glusterd_nodesvc_disconnect (char *server)
{
struct rpc_clnt *rpc = NULL;
+ glusterd_conf_t *priv = THIS->private;
rpc = glusterd_nodesvc_get_rpc (server);
+ (void)glusterd_nodesvc_set_rpc (server, NULL);
- if (rpc) {
- rpc_clnt_connection_cleanup (&rpc->conn);
- rpc_clnt_unref (rpc);
- (void)glusterd_nodesvc_set_rpc (server, NULL);
- }
+ if (rpc)
+ glusterd_rpc_clnt_unref (priv, rpc);
return 0;
}
int32_t
-glusterd_nodesvc_start (char *server)
+glusterd_nodesvc_start (char *server, gf_boolean_t wait)
{
int32_t ret = -1;
xlator_t *this = NULL;
@@ -3530,14 +5656,16 @@ glusterd_nodesvc_start (char *server)
"--trace-children=yes", "--track-origins=yes",
NULL);
runner_argprintf (&runner, "--log-file=%s", valgrind_logfile);
- }
+ }
runner_add_args (&runner, SBIN_DIR"/glusterfs",
"-s", "localhost",
"--volfile-id", volfileid,
"-p", pidfile,
"-l", logfile,
- "-S", sockfpath, NULL);
+ "-S", sockfpath,
+ "-L", "DEBUG",
+ NULL);
if (!strcmp (server, "glustershd")) {
snprintf (glusterd_uuid_option, sizeof (glusterd_uuid_option),
@@ -3545,10 +5673,27 @@ glusterd_nodesvc_start (char *server)
runner_add_args (&runner, "--xlator-option",
glusterd_uuid_option, NULL);
}
+ if (!strcmp (server, "quotad")) {
+ runner_add_args (&runner, "--xlator-option",
+ "*replicate*.data-self-heal=off",
+ "--xlator-option",
+ "*replicate*.metadata-self-heal=off",
+ "--xlator-option",
+ "*replicate*.entry-self-heal=off", NULL);
+ }
runner_log (&runner, "", GF_LOG_DEBUG,
"Starting the nfs/glustershd services");
- ret = runner_run_nowait (&runner);
+ if (!wait) {
+ ret = runner_run_nowait (&runner);
+ } else {
+ synclock_unlock (&priv->big_lock);
+ {
+ ret = runner_run (&runner);
+ }
+ synclock_lock (&priv->big_lock);
+ }
+
if (ret == 0) {
glusterd_nodesvc_connect (server, sockfpath);
}
@@ -3559,13 +5704,19 @@ out:
int
glusterd_nfs_server_start ()
{
- return glusterd_nodesvc_start ("nfs");
+ return glusterd_nodesvc_start ("nfs", _gf_false);
}
int
glusterd_shd_start ()
{
- return glusterd_nodesvc_start ("glustershd");
+ return glusterd_nodesvc_start ("glustershd", _gf_false);
+}
+
+int
+glusterd_quotad_start ()
+{
+ return glusterd_nodesvc_start ("quotad", _gf_true);
}
gf_boolean_t
@@ -3576,7 +5727,7 @@ glusterd_is_nodesvc_running (char *server)
glusterd_get_nodesvc_pidfile (server, priv->workdir,
pidfile, sizeof (pidfile));
- return glusterd_is_service_running (pidfile, NULL);
+ return gf_is_service_running (pidfile, NULL);
}
int32_t
@@ -3656,6 +5807,10 @@ glusterd_nfs_pmap_deregister ()
else
gf_log ("", GF_LOG_ERROR, "De-registration of NLM v1 failed");
+ if (pmap_unset (ACL_PROGRAM, ACLV3_VERSION))
+ gf_log ("", GF_LOG_INFO, "De-registered ACL v3 successfully");
+ else
+ gf_log ("", GF_LOG_ERROR, "De-registration of ACL v3 failed");
}
int
@@ -3682,6 +5837,12 @@ glusterd_shd_stop ()
}
int
+glusterd_quotad_stop ()
+{
+ return glusterd_nodesvc_stop ("quotad", SIGTERM);
+}
+
+int
glusterd_add_node_to_dict (char *server, dict_t *dict, int count,
dict_t *vol_opts)
{
@@ -3697,7 +5858,7 @@ glusterd_add_node_to_dict (char *server, dict_t *dict, int count,
sizeof (pidfile));
//Consider service to be running only when glusterd sees it Online
if (glusterd_is_nodesvc_online (server))
- running = glusterd_is_service_running (pidfile, &pid);
+ running = gf_is_service_running (pidfile, &pid);
/* For nfs-servers/self-heal-daemon setting
* brick<n>.hostname = "NFS Server" / "Self-heal Daemon"
@@ -3713,6 +5874,8 @@ glusterd_add_node_to_dict (char *server, dict_t *dict, int count,
ret = dict_set_str (dict, key, "NFS Server");
else if (!strcmp (server, "glustershd"))
ret = dict_set_str (dict, key, "Self-heal Daemon");
+ else if (!strcmp (server, "quotad"))
+ ret = dict_set_str (dict, key, "Quota Daemon");
if (ret)
goto out;
@@ -3831,11 +5994,22 @@ glusterd_reconfigure_shd ()
}
int
+glusterd_reconfigure_quotad ()
+{
+ return glusterd_reconfigure_nodesvc (glusterd_create_quotad_volfile);
+}
+
+int
glusterd_reconfigure_nfs ()
{
int ret = -1;
gf_boolean_t identical = _gf_false;
+ /*
+ * Check both OLD and NEW volfiles, if they are SAME by size
+ * and cksum i.e. "character-by-character". If YES, then
+ * NOTHING has been changed, just return.
+ */
ret = glusterd_check_nfs_volfile_identical (&identical);
if (ret)
goto out;
@@ -3845,6 +6019,31 @@ glusterd_reconfigure_nfs ()
goto out;
}
+ /*
+ * They are not identical. Find out if the topology is changed
+ * OR just the volume options. If just the options which got
+ * changed, then inform the xlator to reconfigure the options.
+ */
+ identical = _gf_false; /* RESET the FLAG */
+ ret = glusterd_check_nfs_topology_identical (&identical);
+ if (ret)
+ goto out;
+
+ /* Topology is not changed, but just the options. But write the
+ * options to NFS volfile, so that NFS will be reconfigured.
+ */
+ if (identical) {
+ ret = glusterd_create_nfs_volfile();
+ if (ret == 0) {/* Only if above PASSES */
+ ret = glusterd_fetchspec_notify (THIS);
+ }
+ goto out;
+ }
+
+ /*
+ * NFS volfile's topology has been changed. NFS server needs
+ * to be RESTARTED to ACT on the changed volfile.
+ */
ret = glusterd_check_generate_start_nfs ();
out:
@@ -3877,21 +6076,52 @@ glusterd_check_generate_start_shd ()
}
int
-glusterd_nodesvcs_batch_op (glusterd_volinfo_t *volinfo,
- int (*nfs_op) (), int (*shd_op) ())
+glusterd_check_generate_start_quotad ()
{
+ int ret = 0;
+
+ ret = glusterd_check_generate_start_service (glusterd_create_quotad_volfile,
+ glusterd_quotad_stop,
+ glusterd_quotad_start);
+ if (ret == -EINVAL)
+ ret = 0;
+ return ret;
+}
+
+int
+glusterd_nodesvcs_batch_op (glusterd_volinfo_t *volinfo, int (*nfs_op) (),
+ int (*shd_op) (), int (*qd_op) ())
+ {
int ret = 0;
+ xlator_t *this = THIS;
+ glusterd_conf_t *conf = NULL;
+
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
ret = nfs_op ();
if (ret)
goto out;
- if (volinfo && !glusterd_is_volume_replicate (volinfo))
+ if (volinfo && !glusterd_is_volume_replicate (volinfo)) {
+ ; //do nothing
+ } else {
+ ret = shd_op ();
+ if (ret)
+ goto out;
+ }
+
+ if (conf->op_version == GD_OP_VERSION_MIN)
goto out;
- ret = shd_op ();
+ if (volinfo && !glusterd_is_volume_quota_enabled (volinfo))
+ goto out;
+
+ ret = qd_op ();
if (ret)
goto out;
+
out:
return ret;
}
@@ -3901,7 +6131,8 @@ glusterd_nodesvcs_start (glusterd_volinfo_t *volinfo)
{
return glusterd_nodesvcs_batch_op (volinfo,
glusterd_nfs_server_start,
- glusterd_shd_start);
+ glusterd_shd_start,
+ glusterd_quotad_start);
}
int
@@ -3909,7 +6140,8 @@ glusterd_nodesvcs_stop (glusterd_volinfo_t *volinfo)
{
return glusterd_nodesvcs_batch_op (volinfo,
glusterd_nfs_server_stop,
- glusterd_shd_stop);
+ glusterd_shd_stop,
+ glusterd_quotad_stop);
}
gf_boolean_t
@@ -3955,21 +6187,53 @@ glusterd_all_replicate_volumes_stopped ()
return _gf_true;
}
+gf_boolean_t
+glusterd_all_volumes_with_quota_stopped ()
+{
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ list_for_each_entry (voliter, &priv->volumes, vol_list) {
+ if (!glusterd_is_volume_quota_enabled (voliter))
+ continue;
+ if (voliter->status == GLUSTERD_STATUS_STARTED)
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+
int
glusterd_nodesvcs_handle_graph_change (glusterd_volinfo_t *volinfo)
{
int (*shd_op) () = NULL;
int (*nfs_op) () = NULL;
+ int (*qd_op) () = NULL;
shd_op = glusterd_check_generate_start_shd;
nfs_op = glusterd_check_generate_start_nfs;
+ qd_op = glusterd_check_generate_start_quotad;
if (glusterd_are_all_volumes_stopped ()) {
shd_op = glusterd_shd_stop;
nfs_op = glusterd_nfs_server_stop;
- } else if (glusterd_all_replicate_volumes_stopped()) {
- shd_op = glusterd_shd_stop;
+ qd_op = glusterd_quotad_stop;
+ } else {
+ if (glusterd_all_replicate_volumes_stopped()) {
+ shd_op = glusterd_shd_stop;
+ }
+ if (glusterd_all_volumes_with_quota_stopped ()) {
+ qd_op = glusterd_quotad_stop;
+ }
}
- return glusterd_nodesvcs_batch_op (volinfo, nfs_op, shd_op);
+
+ return glusterd_nodesvcs_batch_op (volinfo, nfs_op, shd_op, qd_op);
}
int
@@ -3977,7 +6241,8 @@ glusterd_nodesvcs_handle_reconfigure (glusterd_volinfo_t *volinfo)
{
return glusterd_nodesvcs_batch_op (volinfo,
glusterd_reconfigure_nfs,
- glusterd_reconfigure_shd);
+ glusterd_reconfigure_shd,
+ glusterd_reconfigure_quotad);
}
int
@@ -4076,20 +6341,42 @@ out:
int
glusterd_restart_bricks (glusterd_conf_t *conf)
{
+ int ret = 0;
glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_snap_t *snap = NULL;
gf_boolean_t start_nodesvcs = _gf_false;
- int ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
list_for_each_entry (volinfo, &conf->volumes, vol_list) {
if (volinfo->status != GLUSTERD_STATUS_STARTED)
continue;
start_nodesvcs = _gf_true;
+ gf_log (this->name, GF_LOG_DEBUG, "starting the volume %s",
+ volinfo->volname);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
glusterd_brick_start (volinfo, brickinfo, _gf_false);
}
}
+ list_for_each_entry (snap, &conf->snapshots, snap_list) {
+ list_for_each_entry (volinfo, &snap->volumes, vol_list) {
+ if (volinfo->status != GLUSTERD_STATUS_STARTED)
+ continue;
+ start_nodesvcs = _gf_true;
+ gf_log (this->name, GF_LOG_DEBUG, "starting the snap "
+ "volume %s", volinfo->volname);
+ list_for_each_entry (brickinfo, &volinfo->bricks,
+ brick_list) {
+ glusterd_brick_start (volinfo, brickinfo,
+ _gf_false);
+ }
+ }
+ }
+
if (start_nodesvcs)
glusterd_nodesvcs_handle_graph_change (NULL);
@@ -4099,13 +6386,26 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
int
_local_gsyncd_start (dict_t *this, char *key, data_t *value, void *data)
{
+ char *path_list = NULL;
char *slave = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *statefile = NULL;
+ char buf[1024] = "faulty";
int uuid_len = 0;
+ int ret = 0;
char uuid_str[64] = {0};
- glusterd_volinfo_t *volinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char confpath[PATH_MAX] = "";
+ char *op_errstr = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ GF_ASSERT (THIS);
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (data);
volinfo = data;
- GF_ASSERT (volinfo);
slave = strchr(value->data, ':');
if (slave)
slave ++;
@@ -4114,9 +6414,65 @@ _local_gsyncd_start (dict_t *this, char *key, data_t *value, void *data)
uuid_len = (slave - value->data - 1);
strncpy (uuid_str, (char*)value->data, uuid_len);
- glusterd_start_gsync (volinfo, slave, uuid_str, NULL);
- return 0;
+ /* Getting Local Brickpaths */
+ ret = glusterd_get_local_brickpaths (volinfo, &path_list);
+
+ /*Generating the conf file path needed by gsyncd */
+ ret = glusterd_get_slave_info (slave, &slave_ip,
+ &slave_vol, &op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf (confpath, sizeof(confpath) - 1,
+ "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+ priv->workdir, volinfo->volname,
+ slave_ip, slave_vol);
+ confpath[ret] = '\0';
+
+ /* Fetching the last status of the node */
+ ret = glusterd_get_statefile_name (volinfo, slave,
+ confpath, &statefile);
+ if (ret) {
+ if (!strstr(slave, "::"))
+ gf_log ("", GF_LOG_INFO,
+ "%s is not a valid slave url.", slave);
+ else
+ gf_log ("", GF_LOG_INFO, "Unable to get"
+ " statefile's name");
+ goto out;
+ }
+
+ ret = glusterd_gsync_read_frm_status (statefile, buf, sizeof (buf));
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read the status");
+ goto out;
+ }
+
+ /* Looks for the last status, to find if the session was running
+ * when the node went down. If the session was not started or
+ * was stopped, do not restart the geo-rep session */
+ if ((!strcmp (buf, "Not Started")) ||
+ (!strcmp (buf, "Stopped"))) {
+ gf_log ("", GF_LOG_INFO,
+ "Geo-Rep Session was not started between "
+ "%s and %s::%s. Not Restarting", volinfo->volname,
+ slave_ip, slave_vol);
+ goto out;
+ }
+
+ glusterd_start_gsync (volinfo, slave, path_list, confpath,
+ uuid_str, NULL);
+
+out:
+ GF_FREE (path_list);
+ GF_FREE (op_errstr);
+
+ return ret;
}
int
@@ -4165,7 +6521,7 @@ glusterd_get_brickinfo (xlator_t *this, const char *brickname, int port,
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
list_for_each_entry (tmpbrkinfo, &volinfo->bricks,
brick_list) {
- if (localhost && !glusterd_is_local_addr (tmpbrkinfo->hostname))
+ if (localhost && !gf_is_local_addr (tmpbrkinfo->hostname))
continue;
if (!strcmp(tmpbrkinfo->path, brickname) &&
(tmpbrkinfo->port == port)) {
@@ -4236,8 +6592,7 @@ out:
return -1;
}
-#ifdef GF_LINUX_HOST_OS
-static int
+int
glusterd_get_brick_root (char *path, char **mount_point)
{
char *ptr = NULL;
@@ -4361,6 +6716,17 @@ glusterd_add_inode_size_to_dict (dict_t *dict, int count)
"size for %s : %s package missing", fs_name,
((strcmp (fs_name, "xfs")) ?
"e2fsprogs" : "xfsprogs"));
+ /*
+ * Runner_start might return an error after the child has
+ * been forked, e.g. if the program isn't there. In that
+ * case, we still need to call runner_end to reap the
+ * child and free resources. Fortunately, that seems to
+ * be harmless for other kinds of failures.
+ */
+ if (runner_end(&runner)) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "double failure calling runner_end");
+ }
goto out;
}
@@ -4405,6 +6771,31 @@ glusterd_add_inode_size_to_dict (dict_t *dict, int count)
return ret;
}
+struct mntent *
+glusterd_get_mnt_entry_info (char *mnt_pt, FILE *mtab)
+{
+ struct mntent *entry = NULL;
+
+ mtab = setmntent (_PATH_MOUNTED, "r");
+ if (!mtab)
+ goto out;
+
+ entry = getmntent (mtab);
+
+ while (1) {
+ if (!entry)
+ goto out;
+
+ if (!strcmp (entry->mnt_dir, mnt_pt) &&
+ strcmp (entry->mnt_type, "rootfs"))
+ break;
+ entry = getmntent (mtab);
+ }
+
+out:
+ return entry;
+}
+
static int
glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
dict_t *dict, int count)
@@ -4413,9 +6804,6 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
char key[1024] = {0};
char base_key[1024] = {0};
char *mnt_pt = NULL;
- char *fs_name = NULL;
- char *mnt_options = NULL;
- char *device = NULL;
FILE *mtab = NULL;
struct mntent *entry = NULL;
@@ -4425,31 +6813,17 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
if (ret)
goto out;
- mtab = setmntent (_PATH_MOUNTED, "r");
- if (!mtab) {
+ entry = glusterd_get_mnt_entry_info (mnt_pt, mtab);
+ if (!entry) {
ret = -1;
goto out;
}
- entry = getmntent (mtab);
-
- while (1) {
- if (!entry) {
- ret = -1;
- goto out;
- }
- if (!strcmp (entry->mnt_dir, mnt_pt) &&
- strcmp (entry->mnt_type, "rootfs"))
- break;
- entry = getmntent (mtab);
- }
-
/* get device file */
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.device", base_key);
- device = gf_strdup (entry->mnt_fsname);
- ret = dict_set_dynstr (dict, key, device);
+ ret = dict_set_dynstr_with_alloc (dict, key, entry->mnt_fsname);
if (ret)
goto out;
@@ -4457,8 +6831,7 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.fs_name", base_key);
- fs_name = gf_strdup (entry->mnt_type);
- ret = dict_set_dynstr (dict, key, fs_name);
+ ret = dict_set_dynstr_with_alloc (dict, key, entry->mnt_type);
if (ret)
goto out;
@@ -4466,8 +6839,7 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.mnt_options", base_key);
- mnt_options = gf_strdup (entry->mnt_opts);
- ret = dict_set_dynstr (dict, key, mnt_options);
+ ret = dict_set_dynstr_with_alloc (dict, key, entry->mnt_opts);
out:
GF_FREE (mnt_pt);
@@ -4476,7 +6848,45 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
return ret;
}
-#endif
+
+char*
+glusterd_get_brick_mount_details (glusterd_brickinfo_t *brickinfo)
+{
+ int ret = -1;
+ char *mnt_pt = NULL;
+ char *device = NULL;
+ FILE *mtab = NULL;
+ struct mntent *entry = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (brickinfo);
+
+ ret = glusterd_get_brick_root (brickinfo->path, &mnt_pt);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get mount point "
+ "for %s brick", brickinfo->path);
+ goto out;
+ }
+
+ entry = glusterd_get_mnt_entry_info (mnt_pt, mtab);
+ if (NULL == entry) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get mnt entry "
+ "for %s mount path", mnt_pt);
+ goto out;
+ }
+
+ /* get the fs_name/device */
+ device = gf_strdup (entry->mnt_fsname);
+
+out:
+ if (NULL != mtab) {
+ endmntent (mtab);
+ }
+
+ return device;
+}
int
glusterd_add_brick_detail_to_dict (glusterd_volinfo_t *volinfo,
@@ -4550,13 +6960,12 @@ glusterd_add_brick_detail_to_dict (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
}
-#ifdef GF_LINUX_HOST_OS
+
ret = glusterd_add_brick_mount_details (brickinfo, dict, count);
if (ret)
goto out;
ret = glusterd_add_inode_size_to_dict (dict, count);
-#endif
out:
if (ret)
gf_log (this->name, GF_LOG_DEBUG, "Error adding brick"
@@ -4602,6 +7011,15 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
+ /* add peer uuid */
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.peerid", base_key);
+ ret = dict_set_dynstr_with_alloc (dict, key,
+ uuid_utoa (brickinfo->uuid));
+ if (ret) {
+ goto out;
+ }
+
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.port", base_key);
ret = dict_set_int32 (dict, key, brickinfo->port);
@@ -4610,7 +7028,7 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
- brick_online = glusterd_is_service_running (pidfile, &pid);
+ brick_online = gf_is_service_running (pidfile, &pid);
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.pid", base_key);
@@ -4832,7 +7250,7 @@ glusterd_hostname_to_uuid (char *hostname, uuid_t uuid)
ret = glusterd_friend_find_by_hostname (hostname, &peerinfo);
if (ret) {
- if (glusterd_is_local_addr (hostname)) {
+ if (gf_is_local_addr (hostname)) {
uuid_copy (uuid, MY_UUID);
ret = 0;
} else {
@@ -5170,11 +7588,12 @@ out:
int
glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid,
- char **op_errstr)
+ char **op_errstr, gf_boolean_t is_force)
{
int ret = -1;
char msg[2048] = {0,};
gf_boolean_t in_use = _gf_false;
+ int flags = 0;
/* Check for xattr support in backend fs */
ret = sys_lsetxattr (path, "trusted.glusterfs.test",
@@ -5194,13 +7613,17 @@ glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid,
if (ret)
goto out;
- if (in_use) {
+ if (in_use && !is_force) {
ret = -1;
goto out;
}
+
+ if (!is_force)
+ flags = XATTR_CREATE;
+
ret = sys_lsetxattr (path, GF_XATTR_VOL_ID_KEY, uuid, 16,
- XATTR_CREATE);
+ flags);
if (ret) {
snprintf (msg, sizeof (msg), "Failed to set extended "
"attributes %s, reason: %s",
@@ -5255,8 +7678,7 @@ glusterd_sm_tr_log_transition_add_to_dict (dict_t *dict,
snprintf (key, sizeof (key), "log%d-time", count);
gf_time_fmt (timestr, sizeof timestr, log->transitions[i].time,
gf_timefmt_FT);
- str = gf_strdup (timestr);
- ret = dict_set_dynstr (dict, key, str);
+ ret = dict_set_dynstr_with_alloc (dict, key, timestr);
if (ret)
goto out;
@@ -5514,7 +7936,7 @@ glusterd_delete_volume (glusterd_volinfo_t *volinfo)
if (ret)
goto out;
- ret = glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_remove (volinfo);
out:
gf_log (THIS->name, GF_LOG_DEBUG, "returning %d", ret);
return ret;
@@ -5555,12 +7977,92 @@ glusterd_delete_all_bricks (glusterd_volinfo_t* volinfo)
}
int
+glusterd_get_local_brickpaths (glusterd_volinfo_t *volinfo, char **pathlist)
+{
+ char **path_tokens = NULL;
+ char *tmp_path_list = NULL;
+ char path[PATH_MAX] = "";
+ int32_t count = 0;
+ int32_t pathlen = 0;
+ int32_t total_len = 0;
+ int32_t ret = 0;
+ int i = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ if ((!volinfo) || (!pathlist))
+ goto out;
+
+ path_tokens = GF_CALLOC (sizeof(char*), volinfo->brick_count,
+ gf_gld_mt_charptr);
+ if (!path_tokens) {
+ gf_log ("", GF_LOG_DEBUG, "Could not allocate memory.");
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ pathlen = snprintf (path, sizeof(path),
+ "--path=%s ", brickinfo->path);
+ if (pathlen < sizeof(path))
+ path[pathlen] = '\0';
+ else
+ path[sizeof(path)-1] = '\0';
+ path_tokens[count] = gf_strdup (path);
+ if (!path_tokens[count]) {
+ gf_log ("", GF_LOG_DEBUG,
+ "Could not allocate memory.");
+ ret = -1;
+ goto out;
+ }
+ count++;
+ total_len += pathlen;
+ }
+
+ tmp_path_list = GF_CALLOC (sizeof(char), total_len + 1,
+ gf_gld_mt_char);
+ if (!tmp_path_list) {
+ gf_log ("", GF_LOG_DEBUG, "Could not allocate memory.");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < count; i++)
+ strcat (tmp_path_list, path_tokens[i]);
+
+ if (count)
+ *pathlist = tmp_path_list;
+
+ ret = count;
+out:
+ for (i = 0; i < count; i++) {
+ GF_FREE (path_tokens[i]);
+ path_tokens[i] = NULL;
+ }
+
+ GF_FREE (path_tokens);
+ path_tokens = NULL;
+
+ if (ret == 0) {
+ gf_log ("", GF_LOG_DEBUG, "No Local Bricks Present.");
+ GF_FREE (tmp_path_list);
+ tmp_path_list = NULL;
+ }
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
- char *glusterd_uuid_str, char **op_errstr)
+ char *path_list, char *conf_path,
+ char *glusterd_uuid_str,
+ char **op_errstr)
{
int32_t ret = 0;
int32_t status = 0;
- char buf[PATH_MAX] = {0,};
char uuid_str [64] = {0};
runner_t runner = {0,};
xlator_t *this = NULL;
@@ -5573,32 +8075,23 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
GF_ASSERT (priv);
uuid_utoa_r (MY_UUID, uuid_str);
- if (strcmp (uuid_str, glusterd_uuid_str))
- goto out;
- ret = gsync_status (master_vol->volname, slave, &status);
- if (status == 0)
- goto out;
-
- snprintf (buf, PATH_MAX, "%s/"GEOREP"/%s", priv->workdir, master_vol->volname);
- ret = mkdir_p (buf, 0777, _gf_true);
- if (ret) {
- errcode = -1;
+ if (!path_list) {
+ ret = 0;
+ gf_log ("", GF_LOG_DEBUG, "No Bricks in this node."
+ " Not starting gsyncd.");
goto out;
}
- snprintf (buf, PATH_MAX, DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/%s",
- master_vol->volname);
- ret = mkdir_p (buf, 0777, _gf_true);
- if (ret) {
- errcode = -1;
+ ret = gsync_status (master_vol->volname, slave, conf_path, &status);
+ if (status == 0)
goto out;
- }
uuid_utoa_r (master_vol->volume_id, uuid_str);
runinit (&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ path_list, "-c", NULL);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master_vol->volname);
runner_add_args (&runner, slave, "--config-set", "session-owner",
uuid_str, NULL);
@@ -5611,9 +8104,12 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
}
runinit (&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--monitor", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ path_list, "--monitor", "-c", NULL);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master_vol->volname);
+ runner_argprintf (&runner, "--glusterd-uuid=%s",
+ uuid_utoa (priv->uuid));
runner_add_arg (&runner, slave);
synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
@@ -5629,7 +8125,7 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
out:
if ((ret != 0) && errcode == -1) {
if (op_errstr)
- *op_errstr = gf_strdup ("internal error, cannot start"
+ *op_errstr = gf_strdup ("internal error, cannot start "
"the " GEOREP " session");
}
@@ -5638,17 +8134,38 @@ out:
}
int32_t
-glusterd_recreate_bricks (glusterd_conf_t *conf)
+glusterd_recreate_volfiles (glusterd_conf_t *conf)
{
glusterd_volinfo_t *volinfo = NULL;
int ret = 0;
+ int op_ret = 0;
GF_ASSERT (conf);
list_for_each_entry (volinfo, &conf->volumes, vol_list) {
ret = generate_brick_volfiles (volinfo);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR, "Failed to "
+ "regenerate brick volfiles for %s",
+ volinfo->volname);
+ op_ret = ret;
+ }
+ ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR, "Failed to "
+ "regenerate trusted client volfiles for %s",
+ volinfo->volname);
+ op_ret = ret;
+ }
+ ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR, "Failed to "
+ "regenerate client volfiles for %s",
+ volinfo->volname);
+ op_ret = ret;
+ }
}
- return ret;
+ return op_ret;
}
int32_t
@@ -5658,7 +8175,7 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf)
char *type = NULL;
gf_boolean_t upgrade = _gf_false;
gf_boolean_t downgrade = _gf_false;
- gf_boolean_t regenerate_brick_volfiles = _gf_false;
+ gf_boolean_t regenerate_volfiles = _gf_false;
gf_boolean_t terminate = _gf_false;
ret = dict_get_str (options, "upgrade", &type);
@@ -5671,7 +8188,7 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf)
goto out;
}
if (_gf_true == upgrade)
- regenerate_brick_volfiles = _gf_true;
+ regenerate_volfiles = _gf_true;
}
ret = dict_get_str (options, "downgrade", &type);
@@ -5696,8 +8213,8 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf)
ret = 0;
else
terminate = _gf_true;
- if (regenerate_brick_volfiles) {
- ret = glusterd_recreate_bricks (conf);
+ if (regenerate_volfiles) {
+ ret = glusterd_recreate_volfiles (conf);
}
out:
if (terminate && (ret == 0))
@@ -5912,29 +8429,117 @@ out:
return ret;
}
-/* Checks if the given peer contains all the bricks belonging to the
- * given volume. Returns true if it does else returns false
+int
+glusterd_quotad_statedump (char *options, int option_cnt, char **op_errstr)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char pidfile_path[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ FILE *pidfile = NULL;
+ pid_t pid = -1;
+ char dumpoptions_path[PATH_MAX] = {0,};
+ char *option = NULL;
+ char *tmpptr = NULL;
+ char *dup_options = NULL;
+ char msg[256] = {0,};
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ dup_options = gf_strdup (options);
+ option = strtok_r (dup_options, " ", &tmpptr);
+ if (strcmp (option, "quotad")) {
+ snprintf (msg, sizeof (msg), "for quotad statedump, options "
+ "should be after the key 'quotad'");
+ *op_errstr = gf_strdup (msg);
+ ret = -1;
+ goto out;
+ }
+
+ GLUSTERD_GET_QUOTAD_DIR (path, conf);
+ GLUSTERD_GET_QUOTAD_PIDFILE (pidfile_path, path);
+
+ pidfile = fopen (pidfile_path, "r");
+ if (!pidfile) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to open pidfile: %s",
+ pidfile_path);
+ ret = -1;
+ goto out;
+ }
+
+ ret = fscanf (pidfile, "%d", &pid);
+ if (ret <= 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get pid of quotad "
+ "process");
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (dumpoptions_path, sizeof (dumpoptions_path),
+ DEFAULT_VAR_RUN_DIRECTORY"/glusterdump.%d.options", pid);
+ ret = glusterd_set_dump_options (dumpoptions_path, options, option_cnt);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "error while parsing "
+ "statedump options");
+ ret = -1;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_INFO, "Performing statedump on quotad with "
+ "pid %d", pid);
+
+ kill (pid, SIGUSR1);
+
+ sleep (1);
+
+ ret = 0;
+out:
+ if (pidfile)
+ fclose (pidfile);
+ unlink (dumpoptions_path);
+ GF_FREE (dup_options);
+ return ret;
+}
+
+/* Checks if the given peer contains bricks belonging to the given volume.
+ * Returns,
+ * 2 - if peer contains all the bricks
+ * 1 - if peer contains at least 1 brick
+ * 0 - if peer contains no bricks
*/
-gf_boolean_t
+int
glusterd_friend_contains_vol_bricks (glusterd_volinfo_t *volinfo,
uuid_t friend_uuid)
{
- gf_boolean_t ret = _gf_true;
+ int ret = 0;
glusterd_brickinfo_t *brickinfo = NULL;
+ int count = 0;
GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- if (uuid_compare (friend_uuid, brickinfo->uuid)) {
- ret = _gf_false;
- break;
+ if (!uuid_compare (brickinfo->uuid, friend_uuid)) {
+ count++;
}
}
+
+ if (count) {
+ if (count == volinfo->brick_count)
+ ret = 2;
+ else
+ ret = 1;
+ }
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-/* Remove all volumes which completely belong to given friend
+/* Cleanup the stale volumes left behind in the cluster. The volumes which are
+ * contained completely within the detached peer are stale with respect to the
+ * cluster.
*/
int
glusterd_friend_remove_cleanup_vols (uuid_t uuid)
@@ -5949,7 +8554,7 @@ glusterd_friend_remove_cleanup_vols (uuid_t uuid)
list_for_each_entry_safe (volinfo, tmp_volinfo,
&priv->volumes, vol_list) {
- if (glusterd_friend_contains_vol_bricks (volinfo, uuid)) {
+ if (glusterd_friend_contains_vol_bricks (volinfo, uuid) == 2) {
gf_log (THIS->name, GF_LOG_INFO,
"Deleting stale volume %s", volinfo->volname);
ret = glusterd_delete_volume (volinfo);
@@ -6043,22 +8648,44 @@ int
glusterd_volume_defrag_restart (glusterd_volinfo_t *volinfo, char *op_errstr,
size_t len, int cmd, defrag_cbk_fn_t cbk)
{
- glusterd_conf_t *priv = NULL;
- char pidfile[PATH_MAX];
- int ret = -1;
- pid_t pid;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ char pidfile[PATH_MAX] = {0,};
+ int ret = -1;
+ pid_t pid = 0;
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
if (!priv)
return ret;
- GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
-
- if (!glusterd_is_service_running (pidfile, &pid)) {
+ /* Don't start the rebalance process if the status is already
+ * completed, stopped or failed. If the status is started, check if
+ * there is an existing process already and connect to it. If not, then
+ * start the rebalance process
+ */
+ switch (volinfo->rebal.defrag_status) {
+ case GF_DEFRAG_STATUS_COMPLETE:
+ case GF_DEFRAG_STATUS_STOPPED:
+ case GF_DEFRAG_STATUS_FAILED:
+ break;
+ case GF_DEFRAG_STATUS_STARTED:
+ GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+ if (gf_is_service_running (pidfile, &pid)) {
+ glusterd_rebalance_rpc_create (volinfo, _gf_true);
+ break;
+ }
+ case GF_DEFRAG_STATUS_NOT_STARTED:
glusterd_handle_defrag_start (volinfo, op_errstr, len, cmd,
cbk, volinfo->rebal.op);
- } else {
- glusterd_rebalance_rpc_create (volinfo, priv, cmd);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Unknown defrag status (%d)."
+ "Not starting rebalance process for %s.",
+ volinfo->rebal.defrag_status, volinfo->volname);
+ break;
}
return ret;
@@ -6074,6 +8701,8 @@ glusterd_restart_rebalance (glusterd_conf_t *conf)
list_for_each_entry (volinfo, &conf->volumes, vol_list) {
if (!volinfo->rebal.defrag_cmd)
continue;
+ if (!gd_should_i_start_rebalance (volinfo))
+ continue;
glusterd_volume_defrag_restart (volinfo, op_errstr, 256,
volinfo->rebal.defrag_cmd, NULL);
}
@@ -6092,6 +8721,7 @@ glusterd_volinfo_reset_defrag_stats (glusterd_volinfo_t *volinfo)
rebal->lookedup_files = 0;
rebal->rebalance_failures = 0;
rebal->rebalance_time = 0;
+ rebal->skipped_files = 0;
}
@@ -6188,6 +8818,7 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
uint64_t lookup = 0;
gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED;
uint64_t failures = 0;
+ uint64_t skipped = 0;
xlator_t *this = NULL;
double run_time = 0;
@@ -6218,6 +8849,11 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
gf_log (this->name, GF_LOG_TRACE,
"failed to get failure count");
+ ret = dict_get_uint64 (rsp_dict, "skipped", &skipped);
+ if (ret)
+ gf_log (this->name, GF_LOG_TRACE,
+ "failed to get skipped count");
+
ret = dict_get_double (rsp_dict, "run-time", &run_time);
if (ret)
gf_log (this->name, GF_LOG_TRACE,
@@ -6233,6 +8869,8 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
volinfo->rebal.defrag_status = status;
if (failures)
volinfo->rebal.rebalance_failures = failures;
+ if (skipped)
+ volinfo->rebal.skipped_files = skipped;
if (run_time)
volinfo->rebal.rebalance_time = run_time;
@@ -6240,6 +8878,70 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
}
int
+glusterd_check_topology_identical (const char *filename1,
+ const char *filename2,
+ gf_boolean_t *identical)
+{
+ int ret = -1; /* FAILURE */
+ xlator_t *this = THIS;
+ FILE *fp1 = NULL;
+ FILE *fp2 = NULL;
+ glusterfs_graph_t *grph1 = NULL;
+ glusterfs_graph_t *grph2 = NULL;
+
+ /* Invalid xlator, Nothing to do */
+ if (!this)
+ return (-1);
+
+ /* Sanitize the inputs */
+ GF_VALIDATE_OR_GOTO (this->name, filename1, out);
+ GF_VALIDATE_OR_GOTO (this->name, filename2, out);
+ GF_VALIDATE_OR_GOTO (this->name, identical, out);
+
+ /* fopen() the volfile1 to create the graph */
+ fp1 = fopen (filename1, "r");
+ if (fp1 == NULL) {
+ gf_log (this->name, GF_LOG_ERROR, "fopen() on file: %s failed "
+ "(%s)", filename1, strerror (errno));
+ goto out;
+ }
+
+ /* fopen() the volfile2 to create the graph */
+ fp2 = fopen (filename2, "r");
+ if (fp2 == NULL) {
+ gf_log (this->name, GF_LOG_ERROR, "fopen() on file: %s failed "
+ "(%s)", filename2, strerror (errno));
+ goto out;
+ }
+
+ /* create the graph for filename1 */
+ grph1 = glusterfs_graph_construct(fp1);
+ if (grph1 == NULL)
+ goto out;
+
+ /* create the graph for filename2 */
+ grph2 = glusterfs_graph_construct(fp2);
+ if (grph2 == NULL)
+ goto out;
+
+ /* compare the graph topology */
+ *identical = is_graph_topology_equal(grph1, grph2);
+ ret = 0; /* SUCCESS */
+out:
+ if (fp1)
+ fclose(fp1);
+ if (fp2)
+ fclose(fp2);
+ if (grph1)
+ glusterfs_graph_destroy(grph1);
+ if (grph2)
+ glusterfs_graph_destroy(grph2);
+
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+int
glusterd_check_files_identical (char *filename1, char *filename2,
gf_boolean_t *identical)
{
@@ -6395,7 +9097,7 @@ glusterd_append_gsync_status (dict_t *dst, dict_t *src)
goto out;
}
- ret = dict_set_dynstr (dst, "gsync-status", gf_strdup (stop_msg));
+ ret = dict_set_dynstr_with_alloc (dst, "gsync-status", stop_msg);
if (ret) {
gf_log ("glusterd", GF_LOG_WARNING, "Unable to set the stop"
"message in the ctx dictionary");
@@ -6409,21 +9111,16 @@ glusterd_append_gsync_status (dict_t *dst, dict_t *src)
}
-static int32_t
+int32_t
glusterd_append_status_dicts (dict_t *dst, dict_t *src)
{
- int dst_count = 0;
- int src_count = 0;
- int i = 0;
- int ret = 0;
- char mst[PATH_MAX] = {0,};
- char slv[PATH_MAX] = {0, };
- char sts[PATH_MAX] = {0, };
- char nds[PATH_MAX] = {0, };
- char *mst_val = NULL;
- char *slv_val = NULL;
- char *sts_val = NULL;
- char *nds_val = NULL;
+ char sts_val_name[PATH_MAX] = {0, };
+ int dst_count = 0;
+ int src_count = 0;
+ int i = 0;
+ int ret = 0;
+ gf_gsync_status_t *sts_val = NULL;
+ gf_gsync_status_t *dst_sts_val = NULL;
GF_ASSERT (dst);
@@ -6441,49 +9138,29 @@ glusterd_append_status_dicts (dict_t *dst, dict_t *src)
goto out;
}
- for (i = 1; i <= src_count; i++) {
- snprintf (nds, sizeof(nds), "node%d", i);
- snprintf (mst, sizeof(mst), "master%d", i);
- snprintf (slv, sizeof(slv), "slave%d", i);
- snprintf (sts, sizeof(sts), "status%d", i);
-
- ret = dict_get_str (src, nds, &nds_val);
- if (ret)
- goto out;
-
- ret = dict_get_str (src, mst, &mst_val);
- if (ret)
- goto out;
+ for (i = 0; i < src_count; i++) {
+ memset (sts_val_name, '\0', sizeof(sts_val_name));
+ snprintf (sts_val_name, sizeof(sts_val_name), "status_value%d", i);
- ret = dict_get_str (src, slv, &slv_val);
+ ret = dict_get_bin (src, sts_val_name, (void **) &sts_val);
if (ret)
goto out;
- ret = dict_get_str (src, sts, &sts_val);
- if (ret)
+ dst_sts_val = GF_CALLOC (1, sizeof(gf_gsync_status_t),
+ gf_common_mt_gsync_status_t);
+ if (!dst_sts_val) {
+ gf_log ("", GF_LOG_ERROR, "Out Of Memory");
goto out;
+ }
- snprintf (nds, sizeof(nds), "node%d", i+dst_count);
- snprintf (mst, sizeof(mst), "master%d", i+dst_count);
- snprintf (slv, sizeof(slv), "slave%d", i+dst_count);
- snprintf (sts, sizeof(sts), "status%d", i+dst_count);
-
- ret = dict_set_dynstr (dst, nds, gf_strdup (nds_val));
- if (ret)
- goto out;
+ memcpy (dst_sts_val, sts_val, sizeof(gf_gsync_status_t));
- ret = dict_set_dynstr (dst, mst, gf_strdup (mst_val));
- if (ret)
- goto out;
+ memset (sts_val_name, '\0', sizeof(sts_val_name));
+ snprintf (sts_val_name, sizeof(sts_val_name), "status_value%d", i + dst_count);
- ret = dict_set_dynstr (dst, slv, gf_strdup (slv_val));
+ ret = dict_set_bin (dst, sts_val_name, dst_sts_val, sizeof(gf_gsync_status_t));
if (ret)
goto out;
-
- ret = dict_set_dynstr (dst, sts, gf_strdup (sts_val));
- if (ret)
- goto out;
-
}
ret = dict_set_int32 (dst, "gsync-count", dst_count+src_count);
@@ -6499,6 +9176,7 @@ glusterd_gsync_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict, char *op_errstr)
{
dict_t *ctx = NULL;
int ret = 0;
+ char *conf_path = NULL;
if (aggr) {
ctx = aggr;
@@ -6520,9 +9198,21 @@ glusterd_gsync_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict, char *op_errstr)
ret = glusterd_append_gsync_status (ctx, rsp_dict);
if (ret)
goto out;
+
+ ret = dict_get_str (rsp_dict, "conf_path", &conf_path);
+ if (!ret && conf_path) {
+ ret = dict_set_dynstr_with_alloc (ctx, "conf_path",
+ conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store conf path.");
+ goto out;
+ }
+ }
}
if ((op_errstr) && (strcmp ("", op_errstr))) {
- ret = dict_set_dynstr (ctx, "errstr", gf_strdup(op_errstr));
+ ret = dict_set_dynstr_with_alloc (ctx, "errstr",
+ op_errstr);
if (ret)
goto out;
}
@@ -6683,8 +9373,12 @@ glusterd_volume_status_add_peer_rsp (dict_t *this, char *key, data_t *value,
int32_t ret = 0;
/* Skip the following keys, they are already present in the ctx_dict */
+ /* Also, skip all the task related pairs. They will be added to the
+ * ctx_dict later
+ */
if (!strcmp (key, "count") || !strcmp (key, "cmd") ||
- !strcmp (key, "brick-index-max") || !strcmp (key, "other-count"))
+ !strcmp (key, "brick-index-max") || !strcmp (key, "other-count") ||
+ !strncmp (key, "task", 4))
return 0;
rsp_ctx = data;
@@ -6709,6 +9403,194 @@ glusterd_volume_status_add_peer_rsp (dict_t *this, char *key, data_t *value,
return 0;
}
+/* dict_foreach() callback: copies every pair whose key starts with "task"
+ * from the dict being walked into the dict handed in through @data. All
+ * other keys are ignored. Returns the result of dict_set() for copied pairs
+ * and 0 for ignored ones.
+ */
+static int
+glusterd_volume_status_copy_tasks_to_ctx_dict (dict_t *this, char *key,
+                                               data_t *value, void *data)
+{
+        dict_t  *op_ctx = data;
+        data_t  *dup    = NULL;
+
+        /* Not a task related pair, nothing to do */
+        if (strncmp (key, "task", 4) != 0)
+                return 0;
+
+        GF_ASSERT (op_ctx);
+
+        dup = data_copy (value);
+        GF_ASSERT (dup);
+
+        return dict_set (op_ctx, key, dup);
+}
+
+/* Fold the task statuses reported by a peer (rsp_dict) into the aggregate
+ * kept in ctx_dict.
+ *
+ * If ctx_dict has no "tasks" count yet, all "task*" pairs of rsp_dict are
+ * copied into it. Otherwise both dicts must carry the same number of tasks;
+ * every remote task is matched to a local task by its id and the local
+ * status is overwritten when the remote status has equal or higher
+ * precedence. Replace-brick tasks are left untouched.
+ *
+ * Returns 0 on success and non-zero on failure (missing keys, mismatching
+ * task counts, or a remote task without a local counterpart).
+ */
+int
+glusterd_volume_status_aggregate_tasks_status (dict_t *ctx_dict,
+                                               dict_t *rsp_dict)
+{
+        /* Rebalance has 5 states,
+         * NOT_STARTED, STARTED, STOPPED, COMPLETE, FAILED
+         * The precedence used to determine the aggregate status
+         * is as below (the smaller rank wins),
+         * STARTED > FAILED > STOPPED > COMPLETE > NOT_STARTED
+         */
+        /* TODO: Move this to a common place utilities that both
+         * CLI and glusterd need.
+         * Till then if the below algorithm is changed, change
+         * it in cli_xml_output_vol_rebalance_status in
+         * cli-xml-output.c
+         */
+        static const int rank[] = {
+                [GF_DEFRAG_STATUS_STARTED]     = 1,
+                [GF_DEFRAG_STATUS_FAILED]      = 2,
+                [GF_DEFRAG_STATUS_STOPPED]     = 3,
+                [GF_DEFRAG_STATUS_COMPLETE]    = 4,
+                [GF_DEFRAG_STATUS_NOT_STARTED] = 5
+        };
+        const int        rank_count     = sizeof (rank) / sizeof (rank[0]);
+        int              ret            = -1;
+        xlator_t        *this           = NULL;
+        int              local_count    = 0;
+        int              remote_count   = 0;
+        int              i              = 0;
+        int              j              = 0;
+        char             key[128]       = {0,};
+        char            *task_type      = NULL;
+        int              local_status   = 0;
+        int              remote_status  = 0;
+        char            *local_task_id  = NULL;
+        char            *remote_task_id = NULL;
+
+        GF_ASSERT (ctx_dict);
+        GF_ASSERT (rsp_dict);
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        ret = dict_get_int32 (rsp_dict, "tasks", &remote_count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Failed to get remote task count");
+                goto out;
+        }
+        /* Local count will not be present when this is called for the first
+         * time with the origins rsp_dict
+         */
+        ret = dict_get_int32 (ctx_dict, "tasks", &local_count);
+        if (ret) {
+                ret = dict_foreach (rsp_dict,
+                                glusterd_volume_status_copy_tasks_to_ctx_dict,
+                                ctx_dict);
+                if (ret)
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to copy "
+                                "tasks to ctx_dict.");
+                goto out;
+        }
+
+        if (local_count != remote_count) {
+                gf_log (this->name, GF_LOG_ERROR, "Local tasks count (%d) and "
+                        "remote tasks count (%d) do not match. Not aggregating "
+                        "tasks status.", local_count, remote_count);
+                ret = -1;
+                goto out;
+        }
+
+        /* Update the tasks statuses. For every remote tasks, search for the
+         * local task, and update the local task status based on the remote
+         * status.
+         */
+        for (i = 0; i < remote_count; i++) {
+
+                snprintf (key, sizeof (key), "task%d.type", i);
+                ret = dict_get_str (rsp_dict, key, &task_type);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get task type from rsp dict");
+                        goto out;
+                }
+
+                /* Skip replace-brick status as it is going to be the same on
+                 * all peers. rb_status is set by the replace brick commit
+                 * function on all peers based on the replace brick command.
+                 * We return the value of rb_status as the status for a
+                 * replace-brick task in a 'volume status' command.
+                 */
+                if (!strcmp (task_type, "Replace brick"))
+                        continue;
+
+                snprintf (key, sizeof (key), "task%d.status", i);
+                ret = dict_get_int32 (rsp_dict, key, &remote_status);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get task status from rsp dict");
+                        goto out;
+                }
+                snprintf (key, sizeof (key), "task%d.id", i);
+                ret = dict_get_str (rsp_dict, key, &remote_task_id);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to get task id from rsp dict");
+                        goto out;
+                }
+                for (j = 0; j < local_count; j++) {
+                        snprintf (key, sizeof (key), "task%d.id", j);
+                        ret = dict_get_str (ctx_dict, key, &local_task_id);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to get local task-id");
+                                goto out;
+                        }
+
+                        if (strncmp (remote_task_id, local_task_id,
+                                     strlen (remote_task_id))) {
+                                /* Quit if a matching local task is not found */
+                                if (j == (local_count - 1)) {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "Could not find matching local "
+                                                "task for task %s",
+                                                remote_task_id);
+                                        /* ret is still 0 from the lookup
+                                         * above; report the failure.
+                                         */
+                                        ret = -1;
+                                        goto out;
+                                }
+                                continue;
+                        }
+
+                        snprintf (key, sizeof (key), "task%d.status", j);
+                        ret = dict_get_int32 (ctx_dict, key, &local_status);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "Failed to get local task status");
+                                goto out;
+                        }
+
+                        /* Never index the rank table with a status it does
+                         * not cover; leave the local status as it is.
+                         */
+                        if ((remote_status < 0) ||
+                            (remote_status >= rank_count) ||
+                            (local_status < 0) ||
+                            (local_status >= rank_count)) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "Unexpected task status (local %d, "
+                                        "remote %d) for task %s. Not "
+                                        "aggregating its status.",
+                                        local_status, remote_status,
+                                        remote_task_id);
+                                break;
+                        }
+
+                        if (rank[remote_status] <= rank[local_status]) {
+                                ret = dict_set_int32 (ctx_dict, key,
+                                                      remote_status);
+                                if (ret) {
+                                        gf_log (this->name, GF_LOG_ERROR,
+                                                "Failed to update task status");
+                                        goto out;
+                                }
+                        }
+                        break;
+                }
+        }
+
+out:
+        return ret;
+}
+
+/* Returns _gf_true when the bits of @cmd selected by GF_CLI_STATUS_MASK equal
+ * GF_CLI_STATUS_NONE and the GF_CLI_STATUS_VOL bit is set; _gf_false
+ * otherwise.
+ */
+gf_boolean_t
+glusterd_status_has_tasks (int cmd) {
+        if ((cmd & GF_CLI_STATUS_MASK) != GF_CLI_STATUS_NONE)
+                return _gf_false;
+
+        if (!(cmd & GF_CLI_STATUS_VOL))
+                return _gf_false;
+
+        return _gf_true;
+}
+
int
glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict)
{
@@ -6740,7 +9622,7 @@ glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict)
if (ret)
goto out;
- if (cmd & GF_CLI_STATUS_ALL && is_origin_glusterd ()) {
+ if (cmd & GF_CLI_STATUS_ALL && is_origin_glusterd (ctx_dict)) {
ret = dict_get_int32 (rsp_dict, "vol_count", &vol_count);
if (ret == 0) {
ret = dict_set_int32 (ctx_dict, "vol_count",
@@ -6762,6 +9644,9 @@ glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict)
}
}
+ if ((cmd & GF_CLI_STATUS_TASKS) != 0)
+ goto aggregate_tasks;
+
ret = dict_get_int32 (rsp_dict, "count", &rsp_node_count);
if (ret) {
ret = 0; //no bricks in the rsp
@@ -6805,9 +9690,22 @@ glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict)
ret = dict_set_int32 (ctx_dict, "other-count",
(other_count + rsp_other_count));
- if (ret)
+ if (ret) {
gf_log (THIS->name, GF_LOG_ERROR,
"Failed to update other-count");
+ goto out;
+ }
+
+aggregate_tasks:
+ /* Tasks are only present for a normal status command for a volume or
+ * for an explicit tasks status command for a volume
+ */
+ if (!(cmd & GF_CLI_STATUS_ALL) &&
+ (((cmd & GF_CLI_STATUS_TASKS) != 0) ||
+ glusterd_status_has_tasks (cmd)))
+ ret = glusterd_volume_status_aggregate_tasks_status (ctx_dict,
+ rsp_dict);
+
out:
return ret;
}
@@ -6831,6 +9729,7 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
int32_t current_index = 2;
int32_t value32 = 0;
uint64_t value = 0;
+ char *peer_uuid_str = NULL;
GF_ASSERT (rsp_dict);
conf = THIS->private;
@@ -6873,9 +9772,10 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
/* Finding the index of the node-uuid in the peer-list */
list_for_each_entry (peerinfo, &conf->peers, uuid_list) {
- if (!strcmp(peerinfo->uuid_str, node_uuid_str)){
+ peer_uuid_str = gd_peer_uuid_str (peerinfo);
+ if (strcmp (peer_uuid_str, node_uuid_str) == 0)
break;
- }
+
current_index++;
}
@@ -6962,6 +9862,18 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
}
memset (key, 0, 256);
+ snprintf (key, 256, "skipped-%d", index);
+ ret = dict_get_uint64 (rsp_dict, key, &value);
+ if (!ret) {
+ memset (key, 0, 256);
+ snprintf (key, 256, "skipped-%d", current_index);
+ ret = dict_set_uint64 (ctx_dict, key, value);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "failed to set skipped count");
+ }
+ }
+ memset (key, 0, 256);
snprintf (key, 256, "run-time-%d", index);
ret = dict_get_double (rsp_dict, key, &elapsed_time);
if (!ret) {
@@ -6981,6 +9893,334 @@ out:
}
+/* Merge a peer's snapshot-config response (src) into the op context (dst).
+ *
+ * Only the GF_SNAP_CONFIG_DISPLAY command (read from "config-command" in dst)
+ * carries data to merge: the global hard/soft limits, "voldisplaycount" and,
+ * for each displayed volume, its name and three uint64 limits. A response
+ * without "snap-max-hard-limit" is treated as a dummy response and ignored.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
int
+glusterd_snap_config_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+        /* Per-volume uint64 values, stored as "volume<i>-<suffix>" */
+        static const char *vol_limits[] = {
+                "snap-max-hard-limit",
+                "active-hard-limit",
+                "snap-max-soft-limit",
+        };
+        const size_t    vol_limit_count = sizeof (vol_limits) /
+                                          sizeof (vol_limits[0]);
+        char            buf[PATH_MAX]   = "";
+        char           *volname         = NULL;
+        int             ret             = -1;
+        int             config_command  = 0;
+        size_t          k               = 0;
+        uint64_t        i               = 0;
+        uint64_t        value           = 0;
+        uint64_t        voldisplaycount = 0;
+
+        if (!dst || !src) {
+                gf_log ("", GF_LOG_ERROR, "Source or Destination "
+                        "dict is empty.");
+                goto out;
+        }
+
+        ret = dict_get_int32 (dst, "config-command", &config_command);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "failed to get config-command type");
+                goto out;
+        }
+
+        /* Nothing to aggregate for the other config commands */
+        if (config_command != GF_SNAP_CONFIG_DISPLAY) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = dict_get_uint64 (src, "snap-max-hard-limit", &value);
+        if (ret) {
+                /* Received dummy response from other nodes */
+                ret = 0;
+                goto out;
+        }
+
+        ret = dict_set_uint64 (dst, "snap-max-hard-limit", value);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to set snap_max_hard_limit");
+                goto out;
+        }
+
+        ret = dict_get_uint64 (src, "snap-max-soft-limit", &value);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to get snap_max_soft_limit");
+                goto out;
+        }
+
+        ret = dict_set_uint64 (dst, "snap-max-soft-limit", value);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to set snap_max_soft_limit");
+                goto out;
+        }
+
+        ret = dict_get_uint64 (src, "voldisplaycount",
+                               &voldisplaycount);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to get voldisplaycount");
+                goto out;
+        }
+
+        ret = dict_set_uint64 (dst, "voldisplaycount",
+                               voldisplaycount);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Unable to set voldisplaycount");
+                goto out;
+        }
+
+        for (i = 0; i < voldisplaycount; i++) {
+                snprintf (buf, sizeof(buf), "volume%"PRIu64"-volname", i);
+                ret = dict_get_str (src, buf, &volname);
+                if (ret) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Unable to get %s", buf);
+                        goto out;
+                }
+                /* volname points into memory owned by src. Store a private
+                 * copy so that dst does not reference it after src has been
+                 * released.
+                 */
+                ret = dict_set_dynstr_with_alloc (dst, buf, volname);
+                if (ret) {
+                        gf_log ("", GF_LOG_ERROR,
+                                "Unable to set %s", buf);
+                        goto out;
+                }
+
+                for (k = 0; k < vol_limit_count; k++) {
+                        snprintf (buf, sizeof(buf), "volume%"PRIu64"-%s",
+                                  i, vol_limits[k]);
+                        ret = dict_get_uint64 (src, buf, &value);
+                        if (ret) {
+                                gf_log ("", GF_LOG_ERROR,
+                                        "Unable to get %s", buf);
+                                goto out;
+                        }
+                        ret = dict_set_uint64 (dst, buf, value);
+                        if (ret) {
+                                gf_log ("", GF_LOG_ERROR,
+                                        "Unable to set %s", buf);
+                                goto out;
+                        }
+                }
+        }
+
+        ret = 0;
+out:
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/* Append the "missed_snaps_<n>" entries reported by a node (src) to the ones
+ * already collected in dst, and store the updated "missed_snap_count" in dst.
+ * A src without "missed_snap_count" is not an error.
+ */
+int
+glusterd_snap_create_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+        char            key[PATH_MAX]   = "";
+        char           *entry           = NULL;
+        char           *entry_dup       = NULL;
+        int32_t         src_idx         = -1;
+        int32_t         src_total       = -1;
+        int32_t         dst_total       = -1;
+        int32_t         ret             = -1;
+        xlator_t       *this            = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        if (!dst || !src) {
+                gf_log (this->name, GF_LOG_ERROR, "Source or Destination "
+                        "dict is empty.");
+                goto out;
+        }
+
+        ret = dict_get_int32 (src, "missed_snap_count", &src_total);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG, "No missed snaps");
+                ret = 0;
+                goto out;
+        }
+
+        /* dst has no count until the first response has been merged */
+        if (dict_get_int32 (dst, "missed_snap_count", &dst_total) != 0)
+                dst_total = 0;
+
+        for (src_idx = 0; src_idx < src_total; src_idx++, dst_total++) {
+                snprintf (key, sizeof(key), "missed_snaps_%d", src_idx);
+                ret = dict_get_str (src, key, &entry);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to fetch %s", key);
+                        goto out;
+                }
+
+                /* Re-key the entry after the ones dst already holds */
+                snprintf (key, sizeof(key), "missed_snaps_%d", dst_total);
+
+                entry_dup = gf_strdup (entry);
+                if (!entry_dup) {
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_set_dynstr (dst, key, entry_dup);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Unable to set %s", key);
+                        goto out;
+                }
+
+                entry_dup = NULL;
+        }
+
+        ret = dict_set_int32 (dst, "missed_snap_count", dst_total);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Unable to set dst_missed_snap_count");
+
+out:
+        if (ret && entry_dup)
+                GF_FREE(entry_dup);
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Dispatch the aggregation of a snapshot response (src) into dst based on
+ * the snapshot command stored under "type" in dst. Create and delete share
+ * one handler, config has its own, and for every other command the contents
+ * of src are copied into dst.
+ */
+int
+glusterd_snap_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+        int32_t         snap_command    = 0;
+        int             ret             = -1;
+
+        if (!dst || !src) {
+                gf_log ("", GF_LOG_ERROR, "Source or Destination "
+                        "dict is empty.");
+                goto out;
+        }
+
+        ret = dict_get_int32 (dst, "type", &snap_command);
+        if (ret) {
+                gf_log ("", GF_LOG_ERROR, "unable to get the type of "
+                        "the snapshot command");
+                goto out;
+        }
+
+        if ((snap_command == GF_SNAP_OPTION_TYPE_CREATE) ||
+            (snap_command == GF_SNAP_OPTION_TYPE_DELETE)) {
+                ret = glusterd_snap_create_use_rsp_dict (dst, src);
+        } else if (snap_command == GF_SNAP_OPTION_TYPE_CONFIG) {
+                ret = glusterd_snap_config_use_rsp_dict (dst, src);
+        } else {
+                /* copy the response dictionary's contents to the dict to be
+                 * sent back to the cli
+                 */
+                dict_copy (src, dst);
+                ret = 0;
+        }
+
+        if (ret)
+                gf_log ("", GF_LOG_ERROR, "Unable to use rsp dict");
+
+out:
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+/* Append the "output_<n>" strings of a peer response (src) to those already
+ * collected in dst, renumbering them after dst's existing entries, and update
+ * "output_count" in dst.
+ *
+ * Returns 0 on success (including when src carries no "output_count") and
+ * non-zero on failure.
+ */
+int
+glusterd_sys_exec_output_rsp_dict (dict_t *dst, dict_t *src)
+{
+        char           output_name[PATH_MAX] = "";
+        char          *output                = NULL;
+        int            ret                   = -1;
+        int            i                     = 0;
+        int            src_output_count      = 0;
+        int            dst_output_count      = 0;
+
+        if (!dst || !src) {
+                gf_log ("", GF_LOG_ERROR, "Source or Destination "
+                        "dict is empty.");
+                goto out;
+        }
+
+        /* dst has no "output_count" until the first response has been
+         * merged; the count is then taken to be 0.
+         */
+        ret = dict_get_int32 (dst, "output_count", &dst_output_count);
+        if (ret)
+                dst_output_count = 0;
+
+        ret = dict_get_int32 (src, "output_count", &src_output_count);
+        if (ret) {
+                gf_log ("", GF_LOG_DEBUG, "No output from source");
+                ret = 0;
+                goto out;
+        }
+
+        for (i = 1; i <= src_output_count; i++) {
+                snprintf (output_name, sizeof(output_name), "output_%d", i);
+                ret = dict_get_str (src, output_name, &output);
+                if (ret) {
+                        gf_log ("", GF_LOG_ERROR, "Unable to fetch %s",
+                                output_name);
+                        goto out;
+                }
+
+                snprintf (output_name, sizeof(output_name), "output_%d",
+                          i + dst_output_count);
+                /* Stores a private copy of the string in dst and fails
+                 * cleanly when the copy cannot be allocated.
+                 */
+                ret = dict_set_dynstr_with_alloc (dst, output_name, output);
+                if (ret) {
+                        gf_log ("", GF_LOG_ERROR, "Unable to set %s",
+                                output_name);
+                        goto out;
+                }
+        }
+
+        ret = dict_set_int32 (dst, "output_count",
+                              dst_output_count+src_output_count);
+out:
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+        return ret;
+}
+
+int
glusterd_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
{
int ret = 0;
@@ -7040,6 +10280,90 @@ _profile_volume_add_brick_rsp (dict_t *this, char *key, data_t *value,
}
+/* Merge a quota response (rsp_dict) into the op context (dict).
+ *
+ * For every quota command other than limit-usage and remove, rsp_dict is
+ * copied into dict as is. For those two commands the "gfid<n>" strings of
+ * rsp_dict are appended after the gfids already present in dict and the
+ * aggregated "count" is stored in dict.
+ */
int
+glusterd_volume_quota_copy_to_op_ctx_dict (dict_t *dict, dict_t *rsp_dict)
+{
+        char            key[256]        = {0,};
+        char           *gfid            = NULL;
+        char           *gfid_copy       = NULL;
+        xlator_t       *this            = NULL;
+        int             type            = GF_QUOTA_OPTION_TYPE_NONE;
+        int             have            = 0;
+        int             incoming        = 0;
+        int             idx             = 0;
+        int             ret             = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        ret = dict_get_int32 (dict, "type", &type);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get quota opcode");
+                goto out;
+        }
+
+        if (!((type == GF_QUOTA_OPTION_TYPE_LIMIT_USAGE) ||
+              (type == GF_QUOTA_OPTION_TYPE_REMOVE))) {
+                dict_copy (rsp_dict, dict);
+                ret = 0;
+                goto out;
+        }
+
+        ret = dict_get_int32 (rsp_dict, "count", &incoming);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get the count of "
+                        "gfids from the rsp dict");
+                goto out;
+        }
+
+        /* The key "count" is absent in op_ctx when this function is
+         * called after self-staging on the originator. This must not
+         * be treated as error.
+         */
+        ret = dict_get_int32 (dict, "count", &have);
+        if (ret)
+                gf_log (this->name, GF_LOG_DEBUG, "Failed to get count of gfids"
+                        " from req dict. This could be because count is not yet"
+                        " copied from rsp_dict into op_ctx");
+
+        for (idx = 0; idx < incoming; idx++) {
+                snprintf (key, sizeof(key)-1, "gfid%d", idx);
+
+                ret = dict_get_str (rsp_dict, key, &gfid);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to get gfid "
+                                "from rsp dict");
+                        goto out;
+                }
+
+                /* Re-key the gfid after the ones dict already holds */
+                snprintf (key, sizeof (key)-1, "gfid%d", idx + have);
+
+                gfid_copy = gf_strdup (gfid);
+                if (!gfid_copy) {
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_set_dynstr (dict, key, gfid_copy);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set gfid "
+                                "from rsp dict into req dict");
+                        GF_FREE (gfid_copy);
+                        goto out;
+                }
+        }
+
+        ret = dict_set_int32 (dict, "count", incoming + have);
+        if (ret)
+                gf_log (this->name, GF_LOG_ERROR, "Failed to set aggregated "
+                        "count in req dict");
+
+out:
+        return ret;
+}
+
+int
glusterd_profile_volume_brick_rsp (void *pending_entry,
dict_t *rsp_dict, dict_t *op_ctx,
char **op_errstr, gd_node_type type)
@@ -7153,6 +10477,77 @@ out:
}
+/* dict_foreach() callback used for heal statistics responses.
+ *
+ * Keys are expected in the form "<prefix>-<rxl_id>-<rxl_child_id><rest>",
+ * where <rest> starts with '-'. The two ids are combined into a brick
+ * position (rxl_id * replica_count + rxl_child_id); when the brick at that
+ * position is local, a copy of @value is stored in rsp_ctx->dict under
+ * "<prefix>-<brick_id><rest>". Keys that do not match the layout, ids that
+ * do not parse or do not fit the local buffers, and non-local bricks are
+ * skipped silently.
+ *
+ * Always returns 0.
+ */
int
+_heal_volume_add_shd_rsp_of_statistics (dict_t *this, char *key, data_t
+                                             *value, void *data)
+{
+        char                      new_key[256]          = {0,};
+        char                      int_str[16]           = {0,};
+        char                      key_begin_string[128] = {0,};
+        data_t                   *new_value             = NULL;
+        char                     *rxl_end               = NULL;
+        char                     *rxl_child_end         = NULL;
+        char                     *key_begin_str         = NULL;
+        glusterd_volinfo_t       *volinfo               = NULL;
+        glusterd_heal_rsp_conv_t *rsp_ctx               = NULL;
+        glusterd_brickinfo_t     *brickinfo             = NULL;
+        size_t                    len                   = 0;
+        int                       rxl_id                = 0;
+        int                       rxl_child_id          = 0;
+        int                       brick_id              = 0;
+        int                       ret                   = 0;
+
+        rsp_ctx = data;
+        key_begin_str = strchr (key, '-');
+        if (!key_begin_str)
+                goto out;
+
+        /* "<prefix>": everything before the first '-' */
+        len = (size_t) (key_begin_str - key);
+        if (len >= sizeof (key_begin_string))
+                goto out;
+        memcpy (key_begin_string, key, len);
+        key_begin_string[len] = '\0';
+
+        rxl_end = strchr (key_begin_str + 1, '-');
+        if (!rxl_end)
+                goto out;
+
+        /* "<rxl_id>": between the first and the second '-' */
+        len = (size_t) (rxl_end - (key_begin_str + 1));
+        if (len >= sizeof (int_str))
+                goto out;
+        memcpy (int_str, key_begin_str + 1, len);
+        int_str[len] = '\0';
+        ret = gf_string2int (int_str, &rxl_id);
+        if (ret)
+                goto out;
+
+        rxl_child_end = strchr (rxl_end + 1, '-');
+        if (!rxl_child_end)
+                goto out;
+
+        /* "<rxl_child_id>": between the second and the third '-' */
+        len = (size_t) (rxl_child_end - (rxl_end + 1));
+        if (len >= sizeof (int_str))
+                goto out;
+        memcpy (int_str, rxl_end + 1, len);
+        int_str[len] = '\0';
+        ret = gf_string2int (int_str, &rxl_child_id);
+        if (ret)
+                goto out;
+
+        volinfo = rsp_ctx->volinfo;
+        brick_id = rxl_id * volinfo->replica_count + rxl_child_id;
+
+        brickinfo = glusterd_get_brickinfo_by_position (volinfo, brick_id);
+        if (!brickinfo)
+                goto out;
+        if (!glusterd_is_local_brick (rsp_ctx->this, volinfo, brickinfo))
+                goto out;
+
+        new_value = data_copy (value);
+        if (!new_value)
+                goto out;
+
+        /* rxl_child_end still carries its leading '-' */
+        snprintf (new_key, sizeof (new_key), "%s-%d%s", key_begin_string,
+                  brick_id, rxl_child_end);
+        if (dict_set (rsp_ctx->dict, new_key, new_value))
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "Failed to set %s", new_key);
+
+out:
+        return 0;
+
+}
+
+int
glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict,
dict_t *op_ctx, char **op_errstr)
{
@@ -7160,6 +10555,7 @@ glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict,
glusterd_heal_rsp_conv_t rsp_ctx = {0};
char *volname = NULL;
glusterd_volinfo_t *volinfo = NULL;
+ int heal_op = -1;
GF_ASSERT (rsp_dict);
GF_ASSERT (op_ctx);
@@ -7171,6 +10567,13 @@ glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict,
goto out;
}
+ ret = dict_get_int32 (req_dict, "heal-op", &heal_op);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get heal_op");
+ goto out;
+ }
+
+
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret)
@@ -7179,7 +10582,12 @@ glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict,
rsp_ctx.dict = op_ctx;
rsp_ctx.volinfo = volinfo;
rsp_ctx.this = THIS;
- dict_foreach (rsp_dict, _heal_volume_add_shd_rsp, &rsp_ctx);
+ if (heal_op == GF_AFR_OP_STATISTICS)
+ dict_foreach (rsp_dict, _heal_volume_add_shd_rsp_of_statistics,
+ &rsp_ctx);
+ else
+ dict_foreach (rsp_dict, _heal_volume_add_shd_rsp, &rsp_ctx);
+
out:
return ret;
@@ -7324,6 +10732,13 @@ glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict,
gf_log (THIS->name, GF_LOG_ERROR,
"failed to set failure count");
+ memset (key, 0 , 256);
+ snprintf (key, 256, "skipped-%d", i);
+ ret = dict_set_uint64 (op_ctx, key, volinfo->rebal.skipped_files);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set skipped count");
+
memset (key, 0, 256);
snprintf (key, 256, "run-time-%d", i);
ret = dict_set_double (op_ctx, key, volinfo->rebal.rebalance_time);
@@ -7371,20 +10786,66 @@ glusterd_handle_node_rsp (dict_t *req_dict, void *pending_entry,
return ret;
}
+/* Stores a heap-allocated copy of MY_UUID in @dict under the key
+ * "originator_uuid". Returns 0 on success and non-zero when the allocation
+ * or the dict update fails; the copy is freed on failure.
+ */
+int32_t
+glusterd_set_originator_uuid (dict_t *dict)
+{
+        uuid_t  *my_uuid = NULL;
+        int      ret     = -1;
+
+        GF_ASSERT (dict);
+
+        my_uuid = GF_CALLOC (1, sizeof(uuid_t),
+                             gf_common_mt_uuid_t);
+        if (my_uuid == NULL)
+                return -1;
+
+        uuid_copy (*my_uuid, MY_UUID);
+
+        ret = dict_set_bin (dict, "originator_uuid",
+                            my_uuid, sizeof (uuid_t));
+        if (ret != 0) {
+                gf_log ("", GF_LOG_ERROR,
+                        "Failed to set originator_uuid.");
+                GF_FREE (my_uuid);
+        }
+
+        return ret;
+}
+
/* Should be used only when an operation is in progress, as that is the only
* time a lock_owner is set
*/
gf_boolean_t
-is_origin_glusterd ()
+is_origin_glusterd (dict_t *dict)
{
- int ret = 0;
- uuid_t lock_owner = {0,};
+ gf_boolean_t ret = _gf_false;
+ uuid_t lock_owner = {0,};
+ uuid_t *originator_uuid = NULL;
- ret = glusterd_get_lock_owner (&lock_owner);
- if (ret)
- return _gf_false;
+ GF_ASSERT (dict);
+
+ ret = dict_get_bin (dict, "originator_uuid",
+ (void **) &originator_uuid);
+ if (ret) {
+ /* If not originator_uuid has been set, then the command
+ * has been originated from a glusterd running on older version
+ * Hence fetching the lock owner */
+ ret = glusterd_get_lock_owner (&lock_owner);
+ if (ret) {
+ ret = _gf_false;
+ goto out;
+ }
+ ret = !uuid_compare (MY_UUID, lock_owner);
+ } else
+ ret = !uuid_compare (MY_UUID, *originator_uuid);
- return (uuid_compare (MY_UUID, lock_owner) == 0);
+out:
+ return ret;
}
int
@@ -7447,65 +10908,28 @@ glusterd_copy_uuid_to_dict (uuid_t uuid, dict_t *dict, char *key)
return 0;
}
-gf_boolean_t
-glusterd_is_same_address (char *name1, char *name2)
-{
- struct addrinfo *addr1 = NULL;
- struct addrinfo *addr2 = NULL;
- struct addrinfo *p = NULL;
- struct addrinfo *q = NULL;
- gf_boolean_t ret = _gf_false;
- int gai_err = 0;
-
- gai_err = getaddrinfo(name1,NULL,NULL,&addr1);
- if (gai_err != 0) {
- gf_log (name1, GF_LOG_WARNING,
- "error in getaddrinfo: %s\n", gai_strerror(gai_err));
- goto out;
- }
-
- gai_err = getaddrinfo(name2,NULL,NULL,&addr2);
- if (gai_err != 0) {
- gf_log (name2, GF_LOG_WARNING,
- "error in getaddrinfo: %s\n", gai_strerror(gai_err));
- goto out;
- }
-
- for (p = addr1; p; p = p->ai_next) {
- for (q = addr2; q; q = q->ai_next) {
- if (p->ai_addrlen != q->ai_addrlen) {
- continue;
- }
- if (memcmp(p->ai_addr,q->ai_addr,p->ai_addrlen)) {
- continue;
- }
- ret = _gf_true;
- goto out;
- }
- }
-
-out:
- if (addr1) {
- freeaddrinfo(addr1);
- }
- if (addr2) {
- freeaddrinfo(addr2);
- }
- return ret;
-
-}
-
int
_update_volume_op_versions (dict_t *this, char *key, data_t *value, void *data)
{
int op_version = 0;
glusterd_volinfo_t *ctx = NULL;
+ gf_boolean_t enabled = _gf_true;
+ int ret = -1;
GF_ASSERT (data);
ctx = data;
op_version = glusterd_get_op_version_for_key (key);
+ if (gd_is_xlator_option (key) || gd_is_boolean_option (key)) {
+ ret = gf_string2boolean (value->data, &enabled);
+ if (ret)
+ return 0;
+
+ if (!enabled)
+ return 0;
+ }
+
if (op_version > ctx->op_version)
ctx->op_version = op_version;
@@ -7519,7 +10943,8 @@ _update_volume_op_versions (dict_t *this, char *key, data_t *value, void *data)
void
gd_update_volume_op_versions (glusterd_volinfo_t *volinfo)
{
- glusterd_conf_t *conf = NULL;
+ glusterd_conf_t *conf = NULL;
+ gf_boolean_t ob_enabled = _gf_false;
GF_ASSERT (volinfo);
@@ -7532,5 +10957,588 @@ gd_update_volume_op_versions (glusterd_volinfo_t *volinfo)
dict_foreach (volinfo->dict, _update_volume_op_versions, volinfo);
+ /* Special case for open-behind
+ * If cluster op-version >= 2 and open-behind hasn't been explicitly
+ * disabled, volume op-versions must be updated to account for it
+ */
+
+ /* TODO: Remove once we have a general way to update automatically
+ * enabled features
+ */
+ if (conf->op_version >= 2) {
+ ob_enabled = dict_get_str_boolean (volinfo->dict,
+ "performance.open-behind",
+ _gf_true);
+ if (ob_enabled) {
+
+ if (volinfo->op_version < 2)
+ volinfo->op_version = 2;
+ if (volinfo->client_op_version < 2)
+ volinfo->client_op_version = 2;
+ }
+ }
+
return;
}
+
+/* Checks the op-version stored in this->private against @min_op_version.
+ * Returns 0 when it is high enough. Otherwise an explanation is written to
+ * @msg (at most @msglen bytes), logged, and -1 is returned.
+ */
+int
+op_version_check (xlator_t *this, int min_op_version, char *msg, int msglen)
+{
+        glusterd_conf_t *conf = NULL;
+
+        GF_ASSERT (this);
+        GF_ASSERT (msg);
+
+        conf = this->private;
+        if (conf->op_version >= min_op_version)
+                return 0;
+
+        snprintf (msg, msglen, "One or more nodes do not support "
+                  "the required op-version. Cluster op-version must "
+                  "atleast be %d.", min_op_version);
+        gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+
+        return -1;
+}
+
+
+/* A task is committed/completed once the task-id for it is cleared.
+ * Returns _gf_false only while the volume's rebalance op is a remove-brick
+ * whose rebalance id is still set.
+ */
+gf_boolean_t
+gd_is_remove_brick_committed (glusterd_volinfo_t *volinfo)
+{
+        GF_ASSERT (volinfo);
+
+        if (volinfo->rebal.op != GD_OP_REMOVE_BRICK)
+                return _gf_true;
+
+        if (uuid_is_null (volinfo->rebal.rebalance_id))
+                return _gf_true;
+
+        return _gf_false;
+}
+
+/* Walks the bricks of @volinfo that do not carry MY_UUID and looks up the
+ * peer in @peers with the same uuid. Returns _gf_false, with a copy of the
+ * peer's hostname in *down_peerstr, as soon as such a peer is found that is
+ * not connected or not in the befriended state; returns _gf_true otherwise.
+ */
+gf_boolean_t
+glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo,
+                               struct list_head *peers,
+                               char **down_peerstr)
+{
+        glusterd_brickinfo_t *brick  = NULL;
+        glusterd_peerinfo_t  *peer   = NULL;
+        gf_boolean_t          all_up = _gf_true;
+
+        list_for_each_entry (brick, &volinfo->bricks, brick_list) {
+                /* Bricks of this node need no peer lookup */
+                if (uuid_compare (brick->uuid, MY_UUID) == 0)
+                        continue;
+
+                list_for_each_entry (peer, peers, uuid_list) {
+                        if (uuid_compare (peer->uuid, brick->uuid) != 0)
+                                continue;
+
+                        /* This peer owns the brick; it is fine when it is
+                         * both connected and a friend */
+                        if ((peer->connected) &&
+                            (peer->state.state ==
+                             GD_FRIEND_STATE_BEFRIENDED))
+                                continue;
+
+                        *down_peerstr = gf_strdup (peer->hostname);
+                        gf_log ("", GF_LOG_DEBUG, "Peer %s is down. ",
+                                peer->hostname);
+                        all_up = _gf_false;
+                        goto out;
+                }
+        }
+
+out:
+        gf_log ("", GF_LOG_DEBUG, "Returning %d", all_up);
+        return all_up;
+}
+
+/* Returns _gf_true when @op is GD_OP_STATUS_VOLUME and the "cmd" value in
+ * @dict has the GF_CLI_STATUS_TASKS bit set; _gf_false otherwise, including
+ * when "cmd" cannot be read.
+ */
+gf_boolean_t
+glusterd_is_status_tasks_op (glusterd_op_t op, dict_t *dict)
+{
+        uint32_t        cmd     = GF_CLI_STATUS_NONE;
+
+        if (op != GD_OP_STATUS_VOLUME)
+                return _gf_false;
+
+        if (dict_get_uint32 (dict, "cmd", &cmd) != 0) {
+                gf_log (THIS->name, GF_LOG_ERROR, "Failed to get opcode");
+                return _gf_false;
+        }
+
+        return (cmd & GF_CLI_STATUS_TASKS) ? _gf_true : _gf_false;
+}
+
+/* Compares the time stamps of the two snapshots that embed @list1 and @list2
+ * as their snap_list member. Returns difftime() of the first and the second
+ * time stamp, converted to int.
+ */
+int
+glusterd_compare_snap_time(struct list_head *list1, struct list_head *list2)
+{
+        glusterd_snap_t *first  = NULL;
+        glusterd_snap_t *second = NULL;
+
+        GF_ASSERT (list1);
+        GF_ASSERT (list2);
+
+        first  = list_entry(list1, glusterd_snap_t, snap_list);
+        second = list_entry(list2, glusterd_snap_t, snap_list);
+
+        return (int) difftime(first->time_stamp, second->time_stamp);
+}
+
+/* Compares the snapshot time stamps of the two volumes that embed @list1 and
+ * @list2 as their snapvol_list member. Returns difftime() of the first and
+ * the second time stamp, converted to int.
+ */
+int
+glusterd_compare_snap_vol_time(struct list_head *list1, struct list_head *list2)
+{
+        glusterd_volinfo_t *first  = NULL;
+        glusterd_volinfo_t *second = NULL;
+
+        GF_ASSERT (list1);
+        GF_ASSERT (list2);
+
+        first  = list_entry(list1, glusterd_volinfo_t, snapvol_list);
+        second = list_entry(list2, glusterd_volinfo_t, snapvol_list);
+
+        return (int) difftime(first->snapshot->time_stamp,
+                              second->snapshot->time_stamp);
+}
+
+/* Allocates a zeroed glusterd_missed_snap_info, initialises its two list
+ * heads and hands it back through @missed_snapinfo. Returns 0 on success and
+ * -1 when the allocation fails.
+ */
+int32_t
+glusterd_missed_snapinfo_new (glusterd_missed_snap_info **missed_snapinfo)
+{
+        glusterd_missed_snap_info *info = NULL;
+        xlator_t                  *this = NULL;
+        int32_t                    ret  = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (missed_snapinfo);
+
+        info = GF_CALLOC (1, sizeof(*info),
+                          gf_gld_mt_missed_snapinfo_t);
+        if (info) {
+                INIT_LIST_HEAD (&info->missed_snaps);
+                INIT_LIST_HEAD (&info->snap_ops);
+
+                *missed_snapinfo = info;
+                ret = 0;
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Allocates a zeroed glusterd_snap_op_t, sets brick_num, op and status to -1,
+ * initialises its list head and hands it back through @snap_op. Returns 0 on
+ * success and -1 when the allocation fails.
+ */
+int32_t
+glusterd_missed_snap_op_new (glusterd_snap_op_t **snap_op)
+{
+        glusterd_snap_op_t *op   = NULL;
+        xlator_t           *this = NULL;
+        int32_t             ret  = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (snap_op);
+
+        op = GF_CALLOC (1, sizeof(*op),
+                        gf_gld_mt_missed_snapinfo_t);
+        if (op) {
+                op->brick_num = -1;
+                op->op = -1;
+                op->status = -1;
+                INIT_LIST_HEAD (&op->snap_ops_list);
+
+                *snap_op = op;
+                ret = 0;
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+        return ret;
+}
+
+/* Tells if rebalance needs to be started for the given volume on this node.
+ *
+ * GD_OP_REBALANCE: true when any brick of the volume carries MY_UUID.
+ *
+ * GD_OP_REMOVE_BRICK: true when one of the bricks listed in the rebalance
+ * dict ("count", "brick1" .. "brick<count>") carries MY_UUID. A failed
+ * lookup ends the search with false.
+ *
+ * Any other op: false.
+ */
+gf_boolean_t
+gd_should_i_start_rebalance (glusterd_volinfo_t *volinfo) {
+        glusterd_brickinfo_t *brickinfo = NULL;
+        char                 *brickpath = NULL;
+        char                  key[1023] = {0,};
+        int                   nbricks   = 0;
+        int                   idx       = 0;
+
+        if (volinfo->rebal.op == GD_OP_REBALANCE) {
+                list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+                        if (uuid_compare (MY_UUID, brickinfo->uuid) == 0)
+                                return _gf_true;
+                }
+                return _gf_false;
+        }
+
+        if (volinfo->rebal.op != GD_OP_REMOVE_BRICK)
+                return _gf_false;
+
+        if (dict_get_int32 (volinfo->rebal.dict, "count", &nbricks))
+                return _gf_false;
+
+        for (idx = 1; idx <= nbricks; idx++) {
+                snprintf (key, sizeof (key), "brick%d", idx);
+
+                if (dict_get_str (volinfo->rebal.dict, key, &brickpath))
+                        return _gf_false;
+
+                if (glusterd_volume_brickinfo_get_by_brick (brickpath,
+                                                            volinfo,
+                                                            &brickinfo))
+                        return _gf_false;
+
+                if (uuid_compare (MY_UUID, brickinfo->uuid) == 0)
+                        return _gf_true;
+        }
+
+        return _gf_false;
+}
+
+/* Returns the boolean value of the VKEY_FEATURES_QUOTA option of @volinfo,
+ * as reported by glusterd_volinfo_get_boolean().
+ */
+int
+glusterd_is_volume_quota_enabled (glusterd_volinfo_t *volinfo)
+{
+        int     enabled = 0;
+
+        enabled = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA);
+
+        return enabled;
+}
+
+/* Validate the gfids gathered in op_ctx for a quota limit-usage or remove
+ * operation and publish the agreed gfid in req_dict under the key "gfid".
+ *
+ * op_ctx is expected to carry "type" (quota opcode), "path", "count" and the
+ * strings "gfid0" .. "gfid<count-1>".  For any other quota opcode nothing is
+ * validated and 0 is returned.
+ *
+ *  - count == 0: fail, reporting ENOENT for the path through op_errstr.
+ *  - count >= 1: every gfid must parse as a uuid and be equal to gfid0;
+ *    a mismatch is reported through op_errstr.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+glusterd_validate_and_set_gfid (dict_t *op_ctx, dict_t *req_dict,
+                                char **op_errstr)
+{
+        int       ret           = -1;
+        int       count         = 0;
+        int       i             = 0;
+        int       op_code       = GF_QUOTA_OPTION_TYPE_NONE;
+        uuid_t    uuid1         = {0};
+        uuid_t    uuid2         = {0,};
+        char     *path          = NULL;
+        char      key[256]      = {0,};
+        char     *uuid1_str     = NULL;
+        char     *uuid1_str_dup = NULL;
+        char     *uuid2_str     = NULL;
+        xlator_t *this          = NULL;
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        ret = dict_get_int32 (op_ctx, "type", &op_code);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get quota opcode");
+                goto out;
+        }
+
+        if ((op_code != GF_QUOTA_OPTION_TYPE_LIMIT_USAGE) &&
+            (op_code != GF_QUOTA_OPTION_TYPE_REMOVE)) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = dict_get_str (op_ctx, "path", &path);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get path");
+                goto out;
+        }
+
+        ret = dict_get_int32 (op_ctx, "count", &count);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get count");
+                goto out;
+        }
+
+        /* If count is 0, fail the command with ENOENT.
+         *
+         * If count is 1, treat gfid0 as the gfid on which the operation
+         * is to be performed and resume the command.
+         *
+         * if count > 1, get the 0th gfid from the op_ctx and,
+         * compare it with the remaining 'count -1' gfids.
+         * If they are found to be the same, set gfid0 in the op_ctx and
+         * resume the operation, else error out.
+         */
+
+        if (count == 0) {
+                gf_asprintf (op_errstr, "Failed to get trusted.gfid attribute "
+                             "on path %s. Reason : %s", path,
+                             strerror (ENOENT));
+                ret = -1;
+                goto out;
+        }
+
+        snprintf (key, sizeof (key) - 1, "gfid%d", 0);
+
+        ret = dict_get_str (op_ctx, key, &uuid1_str);
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to get key '%s'",
+                        key);
+                goto out;
+        }
+
+        /* A string that does not parse would leave uuid1 zeroed and make the
+         * comparisons below meaningless, so reject it up front.
+         */
+        if (uuid_parse (uuid1_str, uuid1)) {
+                gf_log (this->name, GF_LOG_ERROR, "Invalid gfid '%s' for key "
+                        "'%s'", uuid1_str, key);
+                ret = -1;
+                goto out;
+        }
+
+        for (i = 1; i < count; i++) {
+                snprintf (key, sizeof (key)-1, "gfid%d", i);
+
+                ret = dict_get_str (op_ctx, key, &uuid2_str);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to get key "
+                                "'%s'", key);
+                        goto out;
+                }
+
+                /* On a parse failure uuid2 would still hold the value of the
+                 * previous iteration and could wrongly compare as equal.
+                 */
+                if (uuid_parse (uuid2_str, uuid2)) {
+                        gf_log (this->name, GF_LOG_ERROR, "Invalid gfid '%s' "
+                                "for key '%s'", uuid2_str, key);
+                        ret = -1;
+                        goto out;
+                }
+
+                if (uuid_compare (uuid1, uuid2)) {
+                        gf_asprintf (op_errstr, "gfid mismatch between %s and "
+                                     "%s for path %s", uuid1_str, uuid2_str,
+                                     path);
+                        ret = -1;
+                        goto out;
+                }
+        }
+
+        /* i != count can only happen for a negative count */
+        if (i == count) {
+                uuid1_str_dup = gf_strdup (uuid1_str);
+                if (!uuid1_str_dup) {
+                        ret = -1;
+                        goto out;
+                }
+
+                ret = dict_set_dynstr (req_dict, "gfid", uuid1_str_dup);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set gfid");
+                        GF_FREE (uuid1_str_dup);
+                        goto out;
+                }
+        } else {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to iterate through %d"
+                        " entries in the req dict", count);
+                ret = -1;
+                goto out;
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/* Drop the on-disk and in-memory quota configuration of a volume.
+ *
+ * The quota config file and its checksum file are unlinked from the volume
+ * directory (unlink results are ignored), the quota store handle is destroyed
+ * and the quota configuration version is reset to 0.
+ */
+void
+glusterd_clean_up_quota_store (glusterd_volinfo_t *volinfo)
+{
+        xlator_t        *this                     = THIS;
+        glusterd_conf_t *conf                     = NULL;
+        char             voldir[PATH_MAX]         = {0,};
+        char             quota_conf_file[PATH_MAX] = {0,};
+        char             quota_cksum_file[PATH_MAX] = {0,};
+
+        GF_ASSERT (this);
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        GLUSTERD_GET_VOLUME_DIR (voldir, volinfo, conf);
+
+        /* remove the quota configuration ... */
+        snprintf (quota_conf_file, sizeof (quota_conf_file), "%s/%s", voldir,
+                  GLUSTERD_VOLUME_QUOTA_CONFIG);
+        unlink (quota_conf_file);
+
+        /* ... and the checksum that belongs to it */
+        snprintf (quota_cksum_file, sizeof (quota_cksum_file), "%s/%s", voldir,
+                  GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+        unlink (quota_cksum_file);
+
+        gf_store_handle_destroy (volinfo->quota_conf_shandle);
+        volinfo->quota_conf_shandle = NULL;
+        volinfo->quota_conf_version = 0;
+}
+
+#define QUOTA_CONF_HEADER                                               \
+        "GlusterFS Quota conf | version: v%d.%d\n"
+
+/* Skip the quota.conf header on fd.
+ *
+ * The header is rendered for version 1.1 only to learn its length, which is
+ * then handed to gf_skip_header_section(); that call's result is returned.
+ * The 'this' argument is not used.
+ */
+int
+glusterd_store_quota_conf_skip_header (xlator_t *this, int fd)
+{
+        char   header[PATH_MAX] = {0,};
+        size_t header_len       = 0;
+
+        snprintf (header, sizeof (header) - 1, QUOTA_CONF_HEADER, 1, 1);
+        header_len = strlen (header);
+
+        return gf_skip_header_section (fd, header_len);
+}
+
+/* Write the quota.conf header (version 1.1) to fd.
+ *
+ * Short writes are continued until the whole header is on its way and a
+ * write interrupted by a signal (EINTR) is retried.  A write that makes no
+ * progress is treated as a failure instead of being retried forever.
+ * The 'this' argument is not used.
+ *
+ * Returns 0 on success and -1 on failure (errno is left as set by write()).
+ */
+int
+glusterd_store_quota_conf_stamp_header (xlator_t *this, int fd)
+{
+        char    buf[PATH_MAX] = {0,};
+        size_t  buf_len       = 0;
+        size_t  written       = 0;
+        ssize_t nbytes        = 0;
+
+        snprintf (buf, sizeof (buf) - 1, QUOTA_CONF_HEADER, 1, 1);
+        buf_len = strlen (buf);
+
+        while (written < buf_len) {
+                nbytes = write (fd, buf + written, buf_len - written);
+                if (nbytes == -1) {
+                        if (errno == EINTR)
+                                continue;
+                        return -1;
+                }
+
+                /* no progress at all: bail out rather than spin */
+                if (nbytes == 0)
+                        return -1;
+
+                written += nbytes;
+        }
+
+        return 0;
+}
+
+/* Unmount and remove the auxiliary quota mount of a volume.
+ *
+ * Nothing is done (and 0 is returned) when the service identified by the aux
+ * mount pidfile is not running.  Otherwise "umount" is run on the mount
+ * directory (with "-l" on Linux hosts), the directory is removed with rmdir()
+ * regardless of the umount outcome, and the umount result is returned.
+ */
+int
+glusterd_remove_auxiliary_mount (char *volname)
+{
+        xlator_t *this               = THIS;
+        char      pidfile[PATH_MAX]  = {0,};
+        char      mountdir[PATH_MAX] = {0,};
+        runner_t  umount_cmd         = {0,};
+        int       status             = -1;
+
+        GF_ASSERT (this);
+
+        GLUSTERFS_GET_AUX_MOUNT_PIDFILE (pidfile, volname);
+
+        if (!gf_is_service_running (pidfile, NULL)) {
+                gf_log (this->name, GF_LOG_DEBUG, "Aux mount of volume %s "
+                        "absent, hence returning", volname);
+                return 0;
+        }
+
+        GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (mountdir, volname, "/");
+
+        runinit (&umount_cmd);
+#if GF_LINUX_HOST_OS
+        runner_add_args (&umount_cmd, "umount", "-l", mountdir, NULL);
+#else
+        runner_add_args (&umount_cmd, "umount", mountdir, NULL);
+#endif
+
+        status = runner_run_reuse (&umount_cmd);
+        if (status) {
+                gf_log (this->name, GF_LOG_ERROR, "umount on %s failed, "
+                        "reason : %s", mountdir, strerror (errno));
+        }
+        runner_end (&umount_cmd);
+
+        rmdir (mountdir);
+
+        return status;
+}
+
+/* Stop the rebalance process of the given volume.
+ *
+ * The process is located through the volume's defrag pidfile and stopped via
+ * glusterd_service_stop() with SIGTERM and force_kill set; the result of that
+ * call is returned.
+ */
+int
+gd_stop_rebalance_process (glusterd_volinfo_t *volinfo)
+{
+        char             pidfile[PATH_MAX] = {0,};
+        glusterd_conf_t *conf              = NULL;
+        xlator_t        *this              = NULL;
+
+        GF_ASSERT (volinfo);
+
+        this = THIS;
+        GF_ASSERT (this);
+
+        conf = this->private;
+        GF_ASSERT (conf);
+
+        GLUSTERD_GET_DEFRAG_PID_FILE (pidfile, volinfo, conf);
+
+        return glusterd_service_stop ("rebalance", pidfile, SIGTERM, _gf_true);
+}
+
+/* Drop a reference on an rpc client with conf->big_lock released.
+ *
+ * The lock is unlocked before rpc_clnt_unref() is called and taken again
+ * afterwards; the value returned by rpc_clnt_unref() is handed back.
+ */
+rpc_clnt_t *
+glusterd_rpc_clnt_unref (glusterd_conf_t *conf, rpc_clnt_t *rpc)
+{
+        rpc_clnt_t *after_unref = NULL;
+
+        GF_ASSERT (conf);
+        GF_ASSERT (rpc);
+
+        synclock_unlock (&conf->big_lock);
+        after_unref = rpc_clnt_unref (rpc);
+        synclock_lock (&conf->big_lock);
+
+        return after_unref;
+}
+
+/* List comparator: orders two glusterd_volinfo_t entries (linked through
+ * vol_list) by strcmp() of their volume names.
+ */
+int32_t
+glusterd_compare_volume_name(struct list_head *list1, struct list_head *list2)
+{
+        glusterd_volinfo_t *lhs = list_entry (list1, glusterd_volinfo_t,
+                                              vol_list);
+        glusterd_volinfo_t *rhs = list_entry (list2, glusterd_volinfo_t,
+                                              vol_list);
+
+        return strcmp (lhs->volname, rhs->volname);
+}
+
+/* Mount a snapshot logical device on a brick mount point.
+ *
+ * Runs "mount -o nouuid <device_path> <brick_mount_path>" through the runner
+ * framework, logs the outcome and returns the result of runner_run().
+ */
+int32_t
+glusterd_mount_lvm_snapshot (char *device_path, char *brick_mount_path)
+{
+        xlator_t *this              = NULL;
+        runner_t  mount_cmd         = {0, };
+        char      cmd_str[NAME_MAX] = "";
+        int32_t   status            = -1;
+
+        this = THIS;
+        GF_ASSERT (this);
+        GF_ASSERT (brick_mount_path);
+        GF_ASSERT (device_path);
+
+        /* human readable copy of the command, used for logging only */
+        snprintf (cmd_str, sizeof (cmd_str), "mount -o nouuid %s %s",
+                  device_path, brick_mount_path);
+
+        runinit (&mount_cmd);
+        runner_add_args (&mount_cmd, "mount", "-o", "nouuid", device_path,
+                         brick_mount_path, NULL);
+        runner_log (&mount_cmd, this->name, GF_LOG_DEBUG, cmd_str);
+
+        status = runner_run (&mount_cmd);
+        if (status) {
+                gf_log (this->name, GF_LOG_ERROR, "mounting the snapshot "
+                        "logical device %s failed (error: %s)", device_path,
+                        strerror (errno));
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG, "mounting the snapshot "
+                        "logical device %s successful", device_path);
+        }
+
+        gf_log (this->name, GF_LOG_TRACE, "Returning with %d", status);
+        return status;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
index 6196b50e4..84fa89b0e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -8,7 +8,7 @@
cases as published by the Free Software Foundation.
*/
#ifndef _GLUSTERD_UTILS_H
-#define _GLUSTERD_UTILS_H_
+#define _GLUSTERD_UTILS_H
#ifndef _CONFIG_H
#define _CONFIG_H
@@ -29,6 +29,10 @@
#include "protocol-common.h"
#define GLUSTERD_SOCK_DIR "/var/run"
+/* Store the brick id "<volname>-client-<brickid>" in brickinfo->brick_id.
+ *
+ * NOTE(review): sprintf() is unbounded here; presumably brick_id is sized for
+ * the longest volume name plus the suffix -- confirm against the struct.
+ */
+#define GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid) do {\
+ sprintf (brickinfo->brick_id, "%s-client-%d",\
+ volinfo->volname, brickid);\
+} while (0)
struct glusterd_lock_ {
uuid_t owner;
@@ -81,6 +85,11 @@ glusterd_submit_request (struct rpc_clnt *rpc, void *req,
int32_t
glusterd_volinfo_new (glusterd_volinfo_t **volinfo);
+int32_t
+glusterd_volinfo_dup (glusterd_volinfo_t *volinfo,
+ glusterd_volinfo_t **dup_volinfo,
+ gf_boolean_t set_userauth);
+
char *
glusterd_auth_get_username (glusterd_volinfo_t *volinfo);
@@ -115,12 +124,30 @@ int32_t
glusterd_peer_hostname_new (char *hostname, glusterd_peer_hostname_t **name);
int32_t
+glusterd_snap_volinfo_find (char *volname, glusterd_snap_t *snap,
+ glusterd_volinfo_t **volinfo);
+int32_t
+glusterd_snap_volinfo_find_from_parent_volname (char *origin_volname,
+ glusterd_snap_t *snap,
+ glusterd_volinfo_t **volinfo);
+
+int32_t
glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo);
+int
+glusterd_volinfo_find_by_volume_id (uuid_t volume_id, glusterd_volinfo_t **volinfo);
+
+int
+glusterd_snap_volinfo_find_by_volume_id (uuid_t volume_id,
+ glusterd_volinfo_t **volinfo);
+
int32_t
glusterd_service_stop(const char *service, char *pidfile, int sig,
gf_boolean_t force_kill);
+int
+glusterd_get_next_available_brickid (glusterd_volinfo_t *volinfo);
+
int32_t
glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo);
@@ -134,6 +161,12 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo,
gf_boolean_t del_brick);
+glusterd_volinfo_t *
+glusterd_volinfo_ref (glusterd_volinfo_t *volinfo);
+
+glusterd_volinfo_t *
+glusterd_volinfo_unref (glusterd_volinfo_t *volinfo);
+
int32_t
glusterd_volinfo_delete (glusterd_volinfo_t *volinfo);
@@ -148,26 +181,22 @@ glusterd_volume_brickinfo_get_by_brick (char *brick,
glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t **brickinfo);
-gf_boolean_t
-glusterd_is_local_addr (char *hostname);
-
int32_t
-glusterd_build_volume_dict (dict_t **vols);
+glusterd_add_volumes_to_export_dict (dict_t **peer_data);
int32_t
-glusterd_compare_friend_data (dict_t *vols, int32_t *status, char *hostname);
+glusterd_compare_friend_data (dict_t *peer_data, int32_t *status,
+ char *hostname);
int
-glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo);
+glusterd_compute_cksum (glusterd_volinfo_t *volinfo,
+ gf_boolean_t is_quota_conf);
void
glusterd_get_nodesvc_volfile (char *server, char *workdir,
char *volfile, size_t len);
gf_boolean_t
-glusterd_is_service_running (char *pidfile, int *pid);
-
-gf_boolean_t
glusterd_is_nodesvc_running ();
gf_boolean_t
@@ -188,6 +217,12 @@ glusterd_shd_start ();
int32_t
glusterd_shd_stop ();
+int32_t
+glusterd_quotad_start ();
+
+int32_t
+glusterd_quotad_stop ();
+
void
glusterd_set_socket_filepath (char *sock_filepath, char *sockpath, size_t len);
@@ -217,7 +252,7 @@ int
glusterd_remote_hostname_get (rpcsvc_request_t *req,
char *remote_host, int len);
int32_t
-glusterd_import_friend_volumes (dict_t *vols);
+glusterd_import_friend_volumes (dict_t *peer_data);
void
glusterd_set_volume_status (glusterd_volinfo_t *volinfo,
glusterd_volume_status status);
@@ -228,6 +263,9 @@ int
glusterd_check_generate_start_shd (void);
int
+glusterd_check_generate_start_quotad (void);
+
+int
glusterd_nodesvcs_handle_graph_change (glusterd_volinfo_t *volinfo);
int
@@ -243,7 +281,8 @@ int32_t
glusterd_volume_count_get (void);
int32_t
glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
- dict_t *dict, int32_t count);
+ dict_t *dict, int32_t count,
+ char *prefix);
int
glusterd_get_brickinfo (xlator_t *this, const char *brickname,
int port, gf_boolean_t localhost,
@@ -282,6 +321,7 @@ glusterd_is_defrag_on (glusterd_volinfo_t *volinfo);
int32_t
glusterd_volinfo_bricks_delete (glusterd_volinfo_t *volinfo);
+
int
glusterd_friend_find_by_uuid (uuid_t uuid,
glusterd_peerinfo_t **peerinfo);
@@ -318,7 +358,7 @@ glusterd_rb_check_bricks (glusterd_volinfo_t *volinfo,
int
glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid,
- char **op_errstr);
+ char **op_errstr, gf_boolean_t is_force);
int
glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
@@ -351,7 +391,7 @@ gf_boolean_t
glusterd_peerinfo_is_uuid_unknown (glusterd_peerinfo_t *peerinfo);
int32_t
glusterd_brick_connect (glusterd_volinfo_t *volinfo,
- glusterd_brickinfo_t *brickinfo);
+ glusterd_brickinfo_t *brickinfo, char *socketpath);
int32_t
glusterd_brick_disconnect (glusterd_brickinfo_t *brickinfo);
int32_t
@@ -359,15 +399,25 @@ glusterd_delete_volume (glusterd_volinfo_t *volinfo);
int32_t
glusterd_delete_brick (glusterd_volinfo_t* volinfo,
glusterd_brickinfo_t *brickinfo);
+
int32_t
glusterd_delete_all_bricks (glusterd_volinfo_t* volinfo);
+
int
glusterd_spawn_daemons (void *opaque);
+
int
glusterd_restart_gsyncds (glusterd_conf_t *conf);
+
int
glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
- char *glusterd_uuid_str, char **op_errstr);
+ char *path_list, char *conf_path,
+ char *glusterd_uuid_str,
+ char **op_errstr);
+int
+glusterd_get_local_brickpaths (glusterd_volinfo_t *volinfo,
+ char **pathlist);
+
int32_t
glusterd_recreate_bricks (glusterd_conf_t *conf);
int32_t
@@ -395,12 +445,17 @@ glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
char *options, int option_cnt, char **op_errstr);
int
glusterd_nfs_statedump (char *options, int option_cnt, char **op_errstr);
+
+int
+glusterd_quotad_statedump (char *options, int option_cnt, char **op_errstr);
+
gf_boolean_t
glusterd_is_volume_replicate (glusterd_volinfo_t *volinfo);
+
gf_boolean_t
glusterd_is_brick_decommissioned (glusterd_volinfo_t *volinfo, char *hostname,
char *path);
-gf_boolean_t
+int
glusterd_friend_contains_vol_bricks (glusterd_volinfo_t *volinfo,
uuid_t friend_uuid);
int
@@ -450,6 +505,12 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
int
glusterd_check_files_identical (char *filename1, char *filename2,
gf_boolean_t *identical);
+
+int
+glusterd_check_topology_identical (const char *filename1,
+ const char *filename2,
+ gf_boolean_t *identical);
+
void
glusterd_volinfo_reset_defrag_stats (glusterd_volinfo_t *volinfo);
int
@@ -471,6 +532,10 @@ int
glusterd_volume_heal_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
int
glusterd_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_sys_exec_output_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_snap_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
int32_t
glusterd_handle_node_rsp (dict_t *req_ctx, void *pending_entry,
glusterd_op_t op, dict_t *rsp_dict, dict_t *op_ctx,
@@ -479,6 +544,11 @@ int
glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
int
glusterd_volume_heal_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+
+int32_t
+glusterd_check_if_quota_trans_enabled (glusterd_volinfo_t *volinfo);
+int
+glusterd_volume_quota_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp);
int
_profile_volume_add_brick_rsp (dict_t *this, char *key, data_t *value,
void *data);
@@ -487,11 +557,19 @@ glusterd_profile_volume_brick_rsp (void *pending_entry,
dict_t *rsp_dict, dict_t *op_ctx,
char **op_errstr, gd_node_type type);
+gf_boolean_t
+glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo,
+ struct list_head *peers,
+ char **down_peerstr);
+
+int32_t
+glusterd_set_originator_uuid (dict_t *dict);
+
/* Should be used only when an operation is in progress, as that is the only
* time a lock_owner is set
*/
gf_boolean_t
-is_origin_glusterd ();
+is_origin_glusterd (dict_t *dict);
gf_boolean_t
glusterd_is_quorum_changed (dict_t *options, char *option, char *value);
@@ -518,6 +596,10 @@ int
glusterd_generate_and_set_task_id (dict_t *dict, char *key);
int
+glusterd_validate_and_set_gfid (dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr);
+
+int
glusterd_copy_uuid_to_dict (uuid_t uuid, dict_t *dict, char *key);
gf_boolean_t
@@ -525,4 +607,152 @@ glusterd_is_same_address (char *name1, char *name2);
void
gd_update_volume_op_versions (glusterd_volinfo_t *volinfo);
+
+int
+op_version_check (xlator_t *this, int min_op_version, char *msg, int msglen);
+
+char*
+gd_peer_uuid_str (glusterd_peerinfo_t *peerinfo);
+
+gf_boolean_t
+gd_is_remove_brick_committed (glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo,
+ struct list_head *peers,
+ char **down_peerstr);
+
+int
+glusterd_get_slave_details_confpath (glusterd_volinfo_t *volinfo, dict_t *dict,
+ char **slave_ip, char **slave_vol,
+ char **conf_path, char **op_errstr);
+
+int
+glusterd_get_slave_info (char *slave, char **slave_ip,
+ char **slave_vol, char **op_errstr);
+
+int
+glusterd_get_statefile_name (glusterd_volinfo_t *volinfo, char *slave,
+ char *conf_path, char **statefile);
+
+int
+glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen);
+
+int
+glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
+ dict_t *resp_dict, char *path_list,
+ char *conf_path, gf_boolean_t is_force);
+
+int
+glusterd_check_gsync_running_local (char *master, char *slave,
+ char *conf_path,
+ gf_boolean_t *is_run);
+
+gf_boolean_t
+glusterd_is_status_tasks_op (glusterd_op_t op, dict_t *dict);
+
+gf_boolean_t
+gd_should_i_start_rebalance (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_is_volume_quota_enabled (glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_all_volumes_with_quota_stopped ();
+
+int
+glusterd_reconfigure_quotad ();
+
+void
+glusterd_clean_up_quota_store (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_quota_conf_skip_header (xlator_t *this, int fd);
+
+int
+glusterd_store_quota_conf_stamp_header (xlator_t *this, int fd);
+
+int
+glusterd_remove_auxiliary_mount (char *volname);
+
+gf_boolean_t
+glusterd_status_has_tasks (int cmd);
+
+int
+gd_stop_rebalance_process (glusterd_volinfo_t *volinfo);
+
+rpc_clnt_t *
+glusterd_rpc_clnt_unref (glusterd_conf_t *conf, rpc_clnt_t *rpc);
+
+int32_t
+glusterd_compare_volume_name(struct list_head *, struct list_head *);
+
+char*
+glusterd_get_brick_mount_details (glusterd_brickinfo_t *brickinfo);
+
+struct mntent *
+glusterd_get_mnt_entry_info (char *mnt_pt, FILE *mtab);
+
+int
+glusterd_get_brick_root (char *path, char **mount_point);
+
+
+int
+glusterd_compare_snap_time(struct list_head *, struct list_head *);
+
+int
+glusterd_compare_snap_vol_time(struct list_head *, struct list_head *);
+
+int32_t
+glusterd_snap_volinfo_restore (dict_t *rsp_dict,
+ glusterd_volinfo_t *new_volinfo,
+ glusterd_volinfo_t *snap_volinfo);
+int32_t
+glusterd_lvm_snapshot_remove (dict_t *rsp_dict, glusterd_volinfo_t *snap_vol);
+
+int32_t
+glusterd_missed_snapinfo_new (glusterd_missed_snap_info **missed_snapinfo);
+
+int32_t
+glusterd_missed_snap_op_new (glusterd_snap_op_t **snap_op);
+
+int32_t
+glusterd_add_missed_snaps_to_dict (dict_t *rsp_dict,
+ glusterd_volinfo_t *snap_vol,
+ glusterd_brickinfo_t *brickinfo,
+ int32_t brick_number, int32_t op);
+
+int32_t
+glusterd_add_missed_snaps_to_export_dict (dict_t *peer_data);
+
+int32_t
+glusterd_import_friend_missed_snap_list (dict_t *peer_data);
+
+int32_t
+gd_restore_snap_volume (dict_t *rsp_dict,
+ glusterd_volinfo_t *orig_vol,
+ glusterd_volinfo_t *snap_vol);
+
+int32_t
+glusterd_mount_lvm_snapshot (char *device_path, char *brick_mount_path);
+
+int32_t
+glusterd_add_snapshots_to_export_dict (dict_t *peer_data);
+
+int32_t
+glusterd_compare_friend_snapshots (dict_t *peer_data,
+ glusterd_peerinfo_t *peerinfo);
+
+int32_t
+glusterd_snapobject_delete (glusterd_snap_t *snap);
+
+int32_t
+glusterd_snap_volume_remove (dict_t *rsp_dict,
+ glusterd_volinfo_t *snap_vol,
+ gf_boolean_t remove_lvm,
+ gf_boolean_t force);
+
+int32_t
+glusterd_store_create_snap_dir (glusterd_snap_t *snap);
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 56ce6b3a1..f42d596ba 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -15,6 +15,7 @@
#include <fnmatch.h>
#include <sys/wait.h>
+#include <dlfcn.h>
#if (HAVE_LIB_XML)
#include <libxml/encoding.h>
@@ -27,6 +28,8 @@
#include "logging.h"
#include "dict.h"
#include "graph-utils.h"
+#include "glusterd-store.h"
+#include "glusterd-hooks.h"
#include "trie.h"
#include "glusterd-mem-types.h"
#include "cli1-xdr.h"
@@ -34,6 +37,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "run.h"
+#include "options.h"
extern struct volopt_map_entry glusterd_volopt_map[];
@@ -99,7 +103,6 @@ xlator_instantiate_va (const char *type, const char *format, va_list arg)
return NULL;
}
-#ifdef __not_used_as_of_now_
static xlator_t *
xlator_instantiate (const char *type, const char *format, ...)
{
@@ -112,7 +115,6 @@ xlator_instantiate (const char *type, const char *format, ...)
return xl;
}
-#endif
static int
volgen_xlator_link (xlator_t *pxl, xlator_t *cxl)
@@ -361,10 +363,9 @@ volopt_trie_section (int lvl, char **patt, char *word, char **hint, int hints)
GF_ASSERT (hints <= 2);
nodevec.cnt = hints;
ret = trie_measure_vec (trie, word, &nodevec);
- if (ret || !nodevec.nodes[0])
- trie_destroy (trie);
+ if (!ret && nodevec.nodes[0])
+ ret = process_nodevec (&nodevec, hint);
- ret = process_nodevec (&nodevec, hint);
trie_destroy (trie);
return ret;
@@ -460,6 +461,8 @@ process_option (char *key, data_t *value, void *param)
vme.key = key;
vme.voltype = odt->vme->voltype;
vme.option = odt->vme->option;
+ vme.op_version = odt->vme->op_version;
+
if (!vme.option) {
vme.option = strrchr (key, '.');
if (vme.option)
@@ -590,6 +593,8 @@ get_server_xlator (char *xlator)
subvol = GF_XLATOR_MARKER;
if (strcmp (xlator, "io-stats") == 0)
subvol = GF_XLATOR_IO_STATS;
+ if (strcmp (xlator, "bd") == 0)
+ subvol = GF_XLATOR_BD;
return subvol;
}
@@ -734,7 +739,7 @@ int
glusterd_volinfo_get_boolean (glusterd_volinfo_t *volinfo, char *key)
{
char *val = NULL;
- gf_boolean_t boo = _gf_false;
+ gf_boolean_t enabled = _gf_false;
int ret = 0;
ret = glusterd_volinfo_get (volinfo, key, &val);
@@ -742,14 +747,14 @@ glusterd_volinfo_get_boolean (glusterd_volinfo_t *volinfo, char *key)
return -1;
if (val)
- ret = gf_string2boolean (val, &boo);
+ ret = gf_string2boolean (val, &enabled);
if (ret) {
gf_log ("", GF_LOG_ERROR, "value for %s option is not valid", key);
return -1;
}
- return boo;
+ return enabled;
}
gf_boolean_t
@@ -1251,8 +1256,8 @@ static int
server_check_marker_off (volgen_graph_t *graph, struct volopt_map_entry *vme,
glusterd_volinfo_t *volinfo)
{
- gf_boolean_t bool = _gf_false;
- int ret = 0;
+ gf_boolean_t enabled = _gf_false;
+ int ret = 0;
GF_ASSERT (volinfo);
GF_ASSERT (vme);
@@ -1260,8 +1265,8 @@ server_check_marker_off (volgen_graph_t *graph, struct volopt_map_entry *vme,
if (strcmp (vme->option, "!xtime") != 0)
return 0;
- ret = gf_string2boolean (vme->value, &bool);
- if (ret || bool)
+ ret = gf_string2boolean (vme->value, &enabled);
+ if (ret || enabled)
goto out;
ret = glusterd_volinfo_get_boolean (volinfo, VKEY_MARKER_XTIME);
@@ -1272,10 +1277,10 @@ server_check_marker_off (volgen_graph_t *graph, struct volopt_map_entry *vme,
}
if (ret) {
- bool = _gf_false;
- ret = glusterd_check_gsync_running (volinfo, &bool);
+ enabled = _gf_false;
+ ret = glusterd_check_gsync_running (volinfo, &enabled);
- if (bool) {
+ if (enabled) {
gf_log ("", GF_LOG_WARNING, GEOREP" sessions active"
"for the volume %s, cannot disable marker "
,volinfo->volname);
@@ -1321,6 +1326,44 @@ sys_loglevel_option_handler (volgen_graph_t *graph,
}
static int
+logger_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
+                       void *param)
+{
+        /* Handles only "!logger" entries whose key mentions the role passed
+         * in param: the entry is copied, its option renamed to "logger" and
+         * the copy handed to basic_option_handler().  Every other entry is
+         * ignored with a return value of 0.
+         */
+        struct volopt_map_entry  logger_vme = {0,};
+        char                    *role       = param;
+
+        if (strcmp (vme->option, "!logger") != 0)
+                return 0;
+
+        if (!strstr (vme->key, role))
+                return 0;
+
+        logger_vme        = *vme;
+        logger_vme.option = "logger";
+
+        return basic_option_handler (graph, &logger_vme, NULL);
+}
+
+static int
+log_format_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
+                           void *param)
+{
+        /* Handles only "!log-format" entries whose key mentions the role
+         * passed in param: the entry is copied, its option renamed to
+         * "log-format" and the copy handed to basic_option_handler().  Every
+         * other entry is ignored with a return value of 0.
+         */
+        struct volopt_map_entry  format_vme = {0,};
+        char                    *role       = param;
+
+        if (strcmp (vme->option, "!log-format") != 0)
+                return 0;
+
+        if (!strstr (vme->key, role))
+                return 0;
+
+        format_vme        = *vme;
+        format_vme.option = "log-format";
+
+        return basic_option_handler (graph, &format_vme, NULL);
+}
+
+static int
volgen_graph_set_xl_options (volgen_graph_t *graph, dict_t *dict)
{
int32_t ret = -1;
@@ -1372,6 +1415,12 @@ server_spec_option_handler (volgen_graph_t *graph,
if (!ret)
ret = sys_loglevel_option_handler (graph, vme, "brick");
+ if (!ret)
+ ret = logger_option_handler (graph, vme, "brick");
+
+ if (!ret)
+ ret = log_format_option_handler (graph, vme, "brick");
+
return ret;
}
@@ -1394,30 +1443,338 @@ server_spec_extended_option_handler (volgen_graph_t *graph,
static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo);
+/* Add an unlinked protocol/client xlator for one peer brick to the graph.
+ *
+ * The xlator is named "<volname>-client-<index>" and gets its
+ * "transport-type" ("socket"), "remote-host" and "remote-subvolume" options
+ * from the peer brick.
+ *
+ * Returns the new xlator, or NULL if it could not be added or configured.
+ */
+xlator_t *
+add_one_peer (volgen_graph_t *graph, glusterd_brickinfo_t *peer,
+              char *volname, uint16_t index)
+{
+        xlator_t *kid = NULL;
+
+        /* index is passed by value: callers advance their own counter */
+        kid = volgen_graph_add_nolink (graph, "protocol/client",
+                                       "%s-client-%u", volname,
+                                       index);
+        if (!kid) {
+                return NULL;
+        }
+
+        /* TBD: figure out where to get the proper transport list */
+        if (xlator_set_option (kid, "transport-type", "socket")) {
+                return NULL;
+        }
+        if (xlator_set_option (kid, "remote-host", peer->hostname)) {
+                return NULL;
+        }
+        if (xlator_set_option (kid, "remote-subvolume", peer->path)) {
+                return NULL;
+        }
+        /* TBD: deal with RDMA, SSL */
+
+        return kid;
+}
+
+/* Partition the bricks of a volume into consecutive replica groups.
+ *
+ * Bricks are visited in list order; every run of volinfo->replica_count
+ * bricks receives the same group number (starting at 0) and shares one
+ * freshly generated nsr_uuid.
+ */
+void
+assign_groups (glusterd_volinfo_t *volinfo)
+{
+        glusterd_brickinfo_t *brick      = NULL;
+        uuid_t                group_uuid;
+        uint16_t              group_id   = 0;
+        int                   members    = 0;
+
+        list_for_each_entry (brick, &volinfo->bricks, brick_list) {
+                /* first brick of a group: mint the uuid the group shares */
+                if (members == 0)
+                        uuid_generate (group_uuid);
+
+                brick->group = group_id;
+                uuid_copy (brick->nsr_uuid, group_uuid);
+
+                members++;
+                if (members >= volinfo->replica_count) {
+                        members = 0;
+                        group_id++;
+                }
+        }
+}
+
+/* Put an NSR translator on top of a brick graph and, when reconciliation is
+ * enabled, write the auxiliary recon volfiles.
+ *
+ * A "cluster/nsr" xlator named "<volname>-nsr" is instantiated above the
+ * current first xlator of the graph, given one protocol/client child per
+ * other brick of the same replica group (brickinfo->group) and finally made
+ * the first xlator of the graph.
+ *
+ * If the volume option "cluster.nsr.recon" is on, three more volfiles are
+ * written under <volume dir>/GLUSTERD_BRICK_INFO_DIR, where <path> is the
+ * brick path with every '/' replaced by '-':
+ *   "<host>:<path>-nsr-recon.vol", "con:<host>:<path>", "data:<host>:<path>"
+ *
+ * Returns 0 on success and -1 on any failure.
+ *
+ * NOTE(review): FILL_REMOTE_NAMES still appends with strcat(); this relies on
+ * remote_names/hosts being sized for NSR_MAX_REPLICA_GROUP_SIZE members --
+ * confirm that the replica count is bounded accordingly.
+ */
+int
+add_nsr_stuff (volgen_graph_t *graph, char *volname,
+               glusterd_brickinfo_t *brickinfo, glusterd_volinfo_t *volinfo,
+               char *changelog_basepath)
+{
+        static uint32_t       nsr_port           = 27000;
+        xlator_t             *me                 = NULL;
+        xlator_t             *kid                = NULL;
+        xlator_t             *xl                 = NULL;
+        glusterd_brickinfo_t *peer               = NULL;
+        glusterd_conf_t      *priv               = NULL;
+        uint16_t              index              = 0;
+        uint32_t              replica_group_size = 1;
+        gf_boolean_t          enable_recon       = _gf_false;
+        char                 *leader_opt         = NULL;
+        char                 *ptr                = NULL;
+        char                 *this               = NULL;
+        char                 *that               = NULL;
+        char                 *username           = NULL;
+        char                 *password           = NULL;
+        volgen_graph_t        ng                 = {0,};
+        char                  dst[NSR_MAX_PATH_SIZE];
+        char                  c_d[NSR_MAX_PATH_SIZE];
+        char                  local_name[NSR_MAX_PATH_SIZE];
+        char                  hosts[NSR_MAX_PATH_SIZE *
+                                    NSR_MAX_REPLICA_GROUP_SIZE];
+        char                  remote_names[NSR_MAX_REPLICA_GROUP_SIZE *
+                                           NSR_MAX_PATH_SIZE];
+        char                  filepath[PATH_MAX] = {0,};
+        char                  lp[PATH_MAX]       = {0,};
+        char                  path[PATH_MAX]     = {0,};
+        char                  auth[PATH_MAX + 32];
+        char                  s[256];
+        char                  transt[16]         = {0,};
+        int                   len                = 0;
+        int                   ret                = -1;
+
+        if (glusterd_volinfo_get_boolean (volinfo, "cluster.nsr.recon") > 0) {
+                enable_recon = _gf_true;
+        }
+
+        priv = THIS->private;
+        remote_names[0] = '\0';
+
+        /* both copies are released at 'out' on every path */
+        that = gf_strdup (brickinfo->hostname);
+        this = gf_strdup (brickinfo->path);
+        if (!that || !this)
+                goto out;
+
+        /* flatten the brick path so that it can be part of a file name */
+        for (ptr = strchr (this, '/'); ptr; ptr = strchr (ptr, '/'))
+                *ptr = '-';
+
+        GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
+        len = snprintf (dst, sizeof (dst),
+                        "%s/%s/%s:%s",
+                        path,
+                        GLUSTERD_BRICK_INFO_DIR,
+                        that,
+                        this);
+        if ((len < 0) || ((size_t) len >= sizeof (dst))) {
+                gf_log ("glusterd", GF_LOG_ERROR,
+                        "volfile path for %s:%s is too long",
+                        brickinfo->hostname, brickinfo->path);
+                goto out;
+        }
+
+        /* Create the NSR xlator, but defer linkage for now. */
+        me = xlator_instantiate ("cluster/nsr", "%s-nsr", volname);
+        if (!me || volgen_xlator_link (me, first_of (graph))) {
+                goto out;
+        }
+
+        snprintf (local_name, sizeof (local_name), "%s:%s",
+                  brickinfo->hostname, brickinfo->path);
+        snprintf (hosts, sizeof (hosts), "%s", brickinfo->hostname);
+
+        peer = list_prev (brickinfo, &volinfo->bricks,
+                          glusterd_brickinfo_t, brick_list);
+        /* Check leader status while we have this pointer in hand. */
+        leader_opt = (!peer || (peer->group != brickinfo->group)) ? "yes"
+                                                                  : "no";
+        if (xlator_set_option (me, "vol-name", volname))
+                goto out;
+        if (xlator_set_option (me, "my-name", local_name))
+                goto out;
+        if (xlator_set_option (me, "leader", leader_opt))
+                goto out;
+        if (xlator_set_option (me, "subvol-uuid",
+                               uuid_utoa (brickinfo->nsr_uuid))) {
+                goto out;
+        }
+
+#define FILL_REMOTE_NAMES {                             \
+        strcat(remote_names,                            \
+               peer->hostname);                         \
+        strcat(remote_names,                            \
+               ":");                                    \
+        strcat(remote_names,                            \
+               peer->path);                             \
+        strcat(remote_names,                            \
+               ",");                                    \
+        strcat(hosts, ",");                             \
+        strcat(hosts,                                   \
+               peer->hostname);                         \
+        replica_group_size++;                           \
+}
+
+        /* Now get on with the show: group members listed before us ... */
+        while (peer) {
+                if (peer->group != brickinfo->group) {
+                        break;
+                }
+                gf_log ("glusterd", GF_LOG_INFO,
+                        "%s:%s needs client for %s:%s",
+                        brickinfo->hostname, brickinfo->path,
+                        peer->hostname, peer->path);
+                kid = add_one_peer (graph, peer, volname, index++);
+                if (!kid || volgen_xlator_link (me, kid)) {
+                        goto out;
+                }
+                FILL_REMOTE_NAMES;
+                peer = list_prev (peer, &volinfo->bricks,
+                                  glusterd_brickinfo_t, brick_list);
+        }
+
+        /* ... and the ones listed after us */
+        peer = list_next (brickinfo, &volinfo->bricks,
+                          glusterd_brickinfo_t, brick_list);
+        while (peer) {
+                if (peer->group != brickinfo->group) {
+                        break;
+                }
+                gf_log ("glusterd", GF_LOG_INFO,
+                        "%s:%s needs client for %s:%s",
+                        brickinfo->hostname, brickinfo->path,
+                        peer->hostname, peer->path);
+                kid = add_one_peer (graph, peer, volname, index++);
+                if (!kid || volgen_xlator_link (me, kid)) {
+                        goto out;
+                }
+                FILL_REMOTE_NAMES;
+                peer = list_next (peer, &volinfo->bricks,
+                                  glusterd_brickinfo_t, brick_list);
+        }
+
+        /* to remove the final "," */
+        if (strlen (remote_names)) {
+                remote_names[strlen (remote_names) - 1] = '\0';
+        }
+        if (xlator_set_option (me, "etcd-servers", hosts))
+                goto out;
+
+        /* Finish linkage to client file */
+        glusterfs_graph_set_first (&graph->graph, me);
+
+        if (enable_recon == _gf_false) {
+                ret = 0;
+                goto out;
+        }
+
+        /* Now fill in the various files required for reconciliation */
+        snprintf (filepath, sizeof (filepath),
+                  "%s-nsr-recon.vol",
+                  dst);
+        gf_log ("glusterd", GF_LOG_INFO,
+                "writing nsr recon volfile in %s\n",
+                filepath);
+
+        snprintf (lp, sizeof (lp), "%s/recon", brickinfo->path);
+
+        /* 1. the recon server volfile */
+        memset (&ng, 0, sizeof (ng));
+        xl = volgen_graph_add_as (&ng, "cluster/nsr_recon", lp);
+        if (!xl)
+                goto out;
+        snprintf (s, sizeof (s), "%u", replica_group_size);
+        if (xlator_set_option (xl, "replica-group-size", s) == -1)
+                goto out;
+        if (xlator_set_option (xl, "local-member", local_name) == -1)
+                goto out;
+        if (xlator_set_option (xl, "replica-group-members", remote_names) == -1)
+                goto out;
+        if (xlator_set_option (xl, "vol-name", volname))
+                goto out;
+        if (xlator_set_option (xl, "changelog-dir", changelog_basepath))
+                goto out;
+        if (xlator_set_option (xl, "base-dir", brickinfo->path))
+                goto out;
+
+        xl = volgen_graph_add (&ng, "protocol/server", lp);
+        if (!xl)
+                goto out;
+        get_vol_transport_type (volinfo, transt);
+        if (xlator_set_option (xl, "transport-type", transt) == -1)
+                goto out;
+        snprintf (s, sizeof (s), "%u", nsr_port);
+        if (xlator_set_option (xl, "transport.socket.listen-port", s) == -1)
+                goto out;
+        snprintf (auth, sizeof (auth), "auth.addr.%s.allow", lp);
+        if (xlator_set_option (xl, auth, "*") == -1)
+                goto out;
+        if (xlator_set_option (xl, "rpc-auth.auth-null", "off") == -1)
+                goto out;
+        if (xlator_set_option (xl, "rpc-auth.auth-unix", "off") == -1)
+                goto out;
+        if (xlator_set_option (xl, "rpc-auth.auth-glusterfs", "off") == -1)
+                goto out;
+        if (volgen_write_volfile (&ng, filepath) == -1)
+                goto out;
+
+        /* 2. the client volfile talking to the recon server above */
+        memset (&ng, 0, sizeof (ng));
+        kid = volgen_graph_add_nolink (&ng, "protocol/client",
+                                       "%s-client-%u", lp, 0);
+        if (!kid)
+                goto out;
+        if (xlator_set_option (kid, "remote-host", brickinfo->hostname))
+                goto out;
+        if (xlator_set_option (kid, "remote-subvolume", lp))
+                goto out;
+        if (xlator_set_option (kid, "transport-type", transt) == -1)
+                goto out;
+        /* the port is consumed here: the next brick gets the next one */
+        snprintf (s, sizeof (s), "%u", nsr_port++);
+        if (xlator_set_option (kid, "remote-port", s) == -1)
+                goto out;
+        len = snprintf (c_d, sizeof (c_d),
+                        "%s/%s/con:%s:%s",
+                        path,
+                        GLUSTERD_BRICK_INFO_DIR,
+                        that, this);
+        if ((len < 0) || ((size_t) len >= sizeof (c_d))) {
+                gf_log ("glusterd", GF_LOG_ERROR,
+                        "recon control volfile path for %s:%s is too long",
+                        brickinfo->hostname, brickinfo->path);
+                goto out;
+        }
+        if (volgen_write_volfile (&ng, c_d))
+                goto out;
+
+        /* 3. the client volfile talking to the brick itself */
+        memset (&ng, 0, sizeof (ng));
+        kid = volgen_graph_add_nolink (&ng, "protocol/client",
+                                       "%s-client-%u", lp, 0);
+        if (!kid)
+                goto out;
+        if (xlator_set_option (kid, "remote-host", brickinfo->hostname))
+                goto out;
+        if (xlator_set_option (kid, "remote-subvolume", brickinfo->path))
+                goto out;
+        if (xlator_set_option (kid, "transport-type", transt) == -1)
+                goto out;
+        username = glusterd_auth_get_username (volinfo);
+        password = glusterd_auth_get_password (volinfo);
+        if (xlator_set_option (kid, "username", username) == -1)
+                goto out;
+        if (xlator_set_option (kid, "password", password) == -1)
+                goto out;
+        len = snprintf (c_d, sizeof (c_d),
+                        "%s/%s/data:%s:%s",
+                        path,
+                        GLUSTERD_BRICK_INFO_DIR, that,
+                        this);
+        if ((len < 0) || ((size_t) len >= sizeof (c_d))) {
+                gf_log ("glusterd", GF_LOG_ERROR,
+                        "recon data volfile path for %s:%s is too long",
+                        brickinfo->hostname, brickinfo->path);
+                goto out;
+        }
+        if (volgen_write_volfile (&ng, c_d))
+                goto out;
+
+        ret = 0;
+out:
+        GF_FREE (this);
+        GF_FREE (that);
+        return ret;
+}
+
static int
server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict, void *param)
{
- char *volname = NULL;
- char *path = NULL;
- int pump = 0;
- xlator_t *xl = NULL;
- xlator_t *txl = NULL;
- xlator_t *rbxl = NULL;
- char transt[16] = {0,};
- char *ptranst = NULL;
- char volume_id[64] = {0,};
- char tstamp_file[PATH_MAX] = {0,};
- int ret = 0;
- char *xlator = NULL;
- char *loglevel = NULL;
- char *username = NULL;
- char *password = NULL;
- char index_basepath[PATH_MAX] = {0};
- char key[1024] = {0};
- char *vgname = NULL;
- char *vg = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
+ char *volname = NULL;
+ char *path = NULL;
+ int pump = 0;
+ xlator_t *xl = NULL;
+ xlator_t *txl = NULL;
+ xlator_t *rbxl = NULL;
+ char transt[16] = {0,};
+ char *ptranst = NULL;
+ char volume_id[64] = {0,};
+ char tstamp_file[PATH_MAX] = {0,};
+ int ret = 0;
+ char *xlator = NULL;
+ char *loglevel = NULL;
+ char *username = NULL;
+ char *password = NULL;
+ char index_basepath[PATH_MAX] = {0};
+ char key[1024] = {0};
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char changelog_basepath[PATH_MAX] = {0,};
+ gf_boolean_t quota_enabled = _gf_true;
+ gf_boolean_t pgfid_feat = _gf_false;
+ char *value = NULL;
brickinfo = param;
path = brickinfo->path;
@@ -1436,47 +1793,89 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
- if (volinfo->backend == GD_VOL_BK_BD) {
- xl = volgen_graph_add (graph, "storage/bd_map", volname);
+ ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_QUOTA, &value);
+ if (value) {
+ ret = gf_string2boolean (value, &quota_enabled);
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_volinfo_get (volinfo,
+ "update-link-count-parent",
+ &value);
+ if (value) {
+ ret = gf_string2boolean (value, &pgfid_feat);
+ if (ret)
+ goto out;
+ }
+
+ xl = volgen_graph_add (graph, "storage/posix", volname);
+ if (!xl)
+ return -1;
+
+ ret = xlator_set_option (xl, "directory", path);
+ if (ret)
+ return -1;
+
+ ret = xlator_set_option (xl, "volume-id",
+ uuid_utoa (volinfo->volume_id));
+ if (ret)
+ return -1;
+
+ if (quota_enabled || pgfid_feat)
+ xlator_set_option (xl, "update-link-count-parent",
+ "on");
+
+ ret = check_and_add_debug_xl (graph, set_dict, volname,
+ "posix");
+ if (ret)
+ return -1;
+#ifdef HAVE_BD_XLATOR
+ if (*brickinfo->vg != '\0') {
+ /* Now add BD v2 xlator if volume is BD type */
+ xl = volgen_graph_add (graph, "storage/bd", volname);
if (!xl)
return -1;
ret = xlator_set_option (xl, "device", "vg");
if (ret)
return -1;
-
- vg = gf_strdup (path);
- vgname = strrchr (vg, '/');
- if (strchr(vg, '/') != vgname) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "invalid vg specified %s", path);
- GF_FREE (vg);
- goto out;
- }
- vgname++;
- ret = xlator_set_option (xl, "export", vgname);
- GF_FREE (vg);
+ ret = xlator_set_option (xl, "export", brickinfo->vg);
if (ret)
return -1;
- } else {
- xl = volgen_graph_add (graph, "storage/posix", volname);
- if (!xl)
- return -1;
- ret = xlator_set_option (xl, "directory", path);
+ ret = check_and_add_debug_xl (graph, set_dict, volname, "bd");
if (ret)
return -1;
- ret = xlator_set_option (xl, "volume-id",
- uuid_utoa (volinfo->volume_id));
- if (ret)
- return -1;
+ }
+#endif
+
+ xl = volgen_graph_add (graph, "features/changelog", volname);
+ if (!xl)
+ return -1;
+
+ ret = xlator_set_option (xl, "changelog-brick", path);
+ if (ret)
+ return -1;
+
+ snprintf (changelog_basepath, sizeof (changelog_basepath),
+ "%s/%s", path, ".glusterfs/changelogs");
+ ret = xlator_set_option (xl, "changelog-dir", changelog_basepath);
+ if (ret)
+ return -1;
- ret = check_and_add_debug_xl (graph, set_dict, volname,
- "posix");
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ ret = xlator_set_option (xl, "encoding", "ascii");
if (ret)
return -1;
}
+
+ ret = check_and_add_debug_xl (graph, set_dict, volname, "changelog");
+ if (ret)
+ return -1;
+
+
xl = volgen_graph_add (graph, "features/access-control", volname);
if (!xl)
return -1;
@@ -1501,6 +1900,10 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret)
return -1;
+ xl = volgen_graph_add (graph, "features/barrier", volname);
+ if (!xl)
+ return -1;
+
ret = dict_get_int32 (volinfo->dict, "enable-pump", &pump);
if (ret == -ENOENT)
ret = pump = 0;
@@ -1551,9 +1954,19 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
return -1;
}
- xl = volgen_graph_add (graph, "features/index", volname);
- if (!xl)
- return -1;
+ /* TBD: conditionalize on NSR being enabled */
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ ret = add_nsr_stuff (graph, volname, brickinfo, volinfo,
+ changelog_basepath);
+ if (ret) {
+ return -1;
+ }
+ }
+ else {
+ xl = volgen_graph_add (graph, "features/index", volname);
+ if (!xl)
+ return -1;
+ }
snprintf (index_basepath, sizeof (index_basepath), "%s/%s",
path, ".glusterfs/indices");
@@ -1583,7 +1996,21 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret)
return -1;
- if (dict_get_str_boolean (set_dict, "features.read-only", 0) &&
+ xl = volgen_graph_add (graph, "features/quota", volname);
+ if (!xl)
+ return -1;
+ ret = xlator_set_option (xl, "volume-uuid", volname);
+ if (ret)
+ return -1;
+
+ ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_QUOTA, &value);
+ if (value) {
+ ret = xlator_set_option (xl, "server-quota", value);
+ if (ret)
+ return -1;
+ }
+
+ if (dict_get_str_boolean (set_dict, "features.read-only", 0) &&
dict_get_str_boolean (set_dict, "features.worm",0)) {
gf_log (THIS->name, GF_LOG_ERROR,
"read-only and worm cannot be set together");
@@ -1592,7 +2019,8 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
/* Check for read-only volume option, and add it to the graph */
- if (dict_get_str_boolean (set_dict, "features.read-only", 0)) {
+ if (dict_get_str_boolean (set_dict, "features.read-only", 0)
+ || volinfo -> is_snap_volume) {
xl = volgen_graph_add (graph, "features/read-only", volname);
if (!xl) {
ret = -1;
@@ -1609,6 +2037,21 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
+ /* Check for compress volume option, and add it to the graph on server side */
+ ret = dict_get_str_boolean (set_dict, "network.compression", 0);
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/cdc", volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ ret = xlator_set_option (xl, "mode", "server");
+ if (ret)
+ goto out;
+ }
+
xl = volgen_graph_add_as (graph, "debug/io-stats", path);
if (!xl)
return -1;
@@ -1672,10 +2115,11 @@ static int
perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
void *param)
{
- char *volname = NULL;
gf_boolean_t enabled = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
- volname = param;
+ GF_ASSERT (param);
+ volinfo = param;
if (strcmp (vme->option, "!perf") != 0)
return 0;
@@ -1685,7 +2129,13 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
if (!enabled)
return 0;
- if (volgen_graph_add (graph, vme->voltype, volname))
+ /* Check op-version before adding the 'open-behind' xlator in the graph
+ */
+ if (!strcmp (vme->key, "performance.open-behind") &&
+ (vme->op_version > volinfo->client_op_version))
+ return 0;
+
+ if (volgen_graph_add (graph, vme->voltype, volinfo->volname))
return 0;
else
return -1;
@@ -1850,7 +2300,7 @@ xml_add_volset_element (xmlTextWriterPtr writer, const char *name,
#endif
static int
-get_key_from_volopt ( struct volopt_map_entry *vme, char **key)
+_get_xlator_opt_key_from_vme ( struct volopt_map_entry *vme, char **key)
{
int ret = 0;
@@ -1893,11 +2343,22 @@ get_key_from_volopt ( struct volopt_map_entry *vme, char **key)
return ret;
}
+static void
+_free_xlator_opt_key (char *key)
+{
+ GF_ASSERT (key);
+
+ if (!strcmp (key, AUTH_ALLOW_OPT_KEY) ||
+ !strcmp (key, AUTH_REJECT_OPT_KEY) ||
+ !strcmp (key, NFS_DISABLE_OPT_KEY))
+ GF_FREE (key);
+
+ return;
+}
+
int
glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
{
-
- char *xlator_type = NULL;
void *dl_handle = NULL;
volume_opt_list_t vol_opt_handle = {{0},};
char *key = NULL;
@@ -1905,7 +2366,7 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
int ret = -1;
char *def_val = NULL;
char *descr = NULL;
- char output_string[25600] = {0, };
+ char output_string[51200] = {0, };
char *output = NULL;
char tmp_str[2048] = {0, };
#if (HAVE_LIB_XML)
@@ -1923,36 +2384,48 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
for (vme = &glusterd_volopt_map[0]; vme->key; vme++) {
- if ( ( vme->type == NO_DOC) || (vme->type == GLOBAL_NO_DOC) )
+ if ((vme->type == NO_DOC) || (vme->type == GLOBAL_NO_DOC))
continue;
- if (get_key_from_volopt (vme, &key))
- goto out; /*Some error while getin key*/
-
if (vme->description) {
descr = vme->description;
def_val = vme->value;
} else {
- if (!xlator_type || strcmp (vme->voltype, xlator_type)){
- ret = xlator_volopt_dynload (vme->voltype,
- &dl_handle,
- &vol_opt_handle);
- if (ret) {
- dl_handle = NULL;
- continue;
- }
+ if (_get_xlator_opt_key_from_vme (vme, &key)) {
+ gf_log ("glusterd", GF_LOG_DEBUG, "Failed to "
+ "get %s key from volume option entry",
+ vme->key);
+ goto out; /*Some error while geting key*/
}
+
+ ret = xlator_volopt_dynload (vme->voltype,
+ &dl_handle,
+ &vol_opt_handle);
+
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_DEBUG,
+ "xlator_volopt_dynload error(%d)", ret);
+ ret = 0;
+ goto cont;
+ }
+
ret = xlator_option_info_list (&vol_opt_handle, key,
&def_val, &descr);
- if (ret) /*Swallow Error i.e if option not found*/
- continue;
+ if (ret) { /*Swallow Error i.e if option not found*/
+ gf_log ("glusterd", GF_LOG_DEBUG,
+ "Failed to get option for %s key", key);
+ ret = 0;
+ goto cont;
+ }
}
if (xml_out) {
#if (HAVE_LIB_XML)
if (xml_add_volset_element (writer,vme->key,
- def_val, descr))
- goto out;
+ def_val, descr)) {
+ ret = -1;
+ goto cont;
+ }
#else
gf_log ("glusterd", GF_LOG_ERROR, "Libxml not present");
#endif
@@ -1962,11 +2435,18 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
vme->key, def_val, descr);
strcat (output_string, tmp_str);
}
-
- if (!strcmp (key, AUTH_ALLOW_OPT_KEY) ||
- !strcmp (key, AUTH_REJECT_OPT_KEY) ||
- !strcmp (key, NFS_DISABLE_OPT_KEY))
- GF_FREE (key);
+cont:
+ if (dl_handle) {
+ dlclose (dl_handle);
+ dl_handle = NULL;
+ vol_opt_handle.given_opt = NULL;
+ }
+ if (key) {
+ _free_xlator_opt_key (key);
+ key = NULL;
+ }
+ if (ret)
+ goto out;
}
#if (HAVE_LIB_XML)
@@ -1993,7 +2473,7 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
}
ret = dict_set_dynstr (ctx, "help-str", output);
- out:
+out:
gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -2043,7 +2523,7 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
list_for_each_entry (brick, &volinfo->bricks, brick_list) {
ret = -1;
xl = volgen_graph_add_nolink (graph, "protocol/client",
- "%s-client-%d", volname, i);
+ "%s", brick->brick_id);
if (!xl)
goto out;
ret = xlator_set_option (xl, "remote-host", brick->hostname);
@@ -2249,28 +2729,46 @@ out:
static int
volgen_graph_build_dht_cluster (volgen_graph_t *graph,
- glusterd_volinfo_t *volinfo, size_t child_count)
+ glusterd_volinfo_t *volinfo, size_t child_count,
+ gf_boolean_t is_quotad)
{
int32_t clusters = 0;
int ret = -1;
char *decommissioned_children = NULL;
xlator_t *dht = NULL;
- char *optstr = NULL;
- gf_boolean_t use_nufa = _gf_false;
+ char *voltype = "cluster/distribute";
+ char *name_fmt = NULL;
- if (dict_get_str(volinfo->dict,"cluster.nufa",&optstr) == 0) {
- /* Keep static analyzers quiet by "using" the value. */
- ret = gf_string2boolean(optstr,&use_nufa);
+ /* NUFA and Switch section */
+ if (dict_get_str_boolean (volinfo->dict, "cluster.nufa", 0) &&
+ dict_get_str_boolean (volinfo->dict, "cluster.switch", 0)) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "nufa and switch cannot be set together");
+ ret = -1;
+ goto out;
}
+ /* Check for NUFA volume option, and change the voltype */
+ if (dict_get_str_boolean (volinfo->dict, "cluster.nufa", 0))
+ voltype = "cluster/nufa";
+
+ /* Check for switch volume option, and change the voltype */
+ if (dict_get_str_boolean (volinfo->dict, "cluster.switch", 0))
+ voltype = "cluster/switch";
+
+ if (is_quotad)
+ name_fmt = "%s";
+ else
+ name_fmt = "%s-dht";
+
clusters = volgen_graph_build_clusters (graph, volinfo,
- use_nufa
- ? "cluster/nufa"
- : "cluster/distribute",
- "%s-dht",
- child_count, child_count);
+ voltype,
+ name_fmt,
+ child_count,
+ child_count);
if (clusters < 0)
goto out;
+
dht = first_of (graph);
ret = _graph_get_decommissioned_children (dht, volinfo,
&decommissioned_children);
@@ -2290,10 +2788,11 @@ out:
static int
volume_volgen_graph_build_clusters (volgen_graph_t *graph,
- glusterd_volinfo_t *volinfo)
+ glusterd_volinfo_t *volinfo,
+ gf_boolean_t is_quotad)
{
- char *replicate_args[] = {"cluster/replicate",
- "%s-replicate-%d"};
+ char *replicate_type = "cluster/replicate";
+ char *replicate_fmt = "%s-replicate-%d";
char *stripe_args[] = {"cluster/stripe",
"%s-stripe-%d"};
int rclusters = 0;
@@ -2307,12 +2806,16 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
if (volinfo->dist_leaf_count == 1)
goto build_distribute;
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ replicate_type = "cluster/nsrc";
+ }
+
/* All other cases, it will have one or the other cluster type */
switch (volinfo->type) {
case GF_CLUSTER_TYPE_REPLICATE:
clusters = volgen_graph_build_clusters (graph, volinfo,
- replicate_args[0],
- replicate_args[1],
+ replicate_type,
+ replicate_fmt,
volinfo->brick_count,
volinfo->replica_count);
if (clusters < 0)
@@ -2332,8 +2835,8 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
if (volinfo->replica_count == 0)
goto out;
clusters = volgen_graph_build_clusters (graph, volinfo,
- replicate_args[0],
- replicate_args[1],
+ replicate_type,
+ replicate_fmt,
volinfo->brick_count,
volinfo->replica_count);
if (clusters < 0)
@@ -2363,7 +2866,7 @@ build_distribute:
}
ret = volgen_graph_build_dht_cluster (graph, volinfo,
- dist_count);
+ dist_count, is_quotad);
if (ret)
goto out;
@@ -2372,29 +2875,74 @@ out:
return ret;
}
+static int client_graph_set_perf_options(volgen_graph_t *graph,
+ glusterd_volinfo_t *volinfo,
+ dict_t *set_dict)
+{
+ data_t *tmp_data = NULL;
+ char *volname = NULL;
+
+ /*
+ * Logic to make sure NFS doesn't have performance translators by
+ * default for a volume
+ */
+ volname = volinfo->volname;
+ tmp_data = dict_get (set_dict, "nfs-volume-file");
+ if (!tmp_data)
+ return volgen_graph_set_options_generic(graph, set_dict,
+ volinfo,
+ &perfxl_option_handler);
+ else
+ return volgen_graph_set_options_generic(graph, set_dict,
+ volname,
+ &nfsperfxl_option_handler);
+}
+
static int
client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict, void *param)
{
- int ret = 0;
- xlator_t *xl = NULL;
- char *volname = NULL;
- data_t *tmp_data = NULL;
+ int ret = 0;
+ xlator_t *xl = NULL;
+ char *volname = NULL;
+ glusterd_conf_t *conf = THIS->private;
+ char *tmp = NULL;
+ gf_boolean_t var = _gf_false;
+ gf_boolean_t ob = _gf_false;
+ xlator_t *this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
volname = volinfo->volname;
ret = volgen_graph_build_clients (graph, volinfo, set_dict, param);
if (ret)
goto out;
- ret = volume_volgen_graph_build_clusters (graph, volinfo);
- if (ret)
+ ret = volume_volgen_graph_build_clusters (graph, volinfo, _gf_false);
+ if (ret == -1)
goto out;
- ret = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA);
+ /* Check for compress volume option, and add it to the graph on client side */
+ ret = dict_get_str_boolean (set_dict, "network.compression", 0);
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/cdc", volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ ret = xlator_set_option (xl, "mode", "client");
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_volinfo_get_boolean (volinfo, "features.encryption");
if (ret == -1)
goto out;
if (ret) {
- xl = volgen_graph_add (graph, "features/quota", volname);
+ xl = volgen_graph_add (graph, "encryption/crypt", volname);
if (!xl) {
ret = -1;
@@ -2402,16 +2950,121 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
- /* Logic to make sure NFS doesn't have performance translators by
- default for a volume */
- tmp_data = dict_get (set_dict, "nfs-volume-file");
- if (!tmp_data)
- ret = volgen_graph_set_options_generic (graph, set_dict, volname,
- &perfxl_option_handler);
- else
- ret = volgen_graph_set_options_generic (graph, set_dict, volname,
- &nfsperfxl_option_handler);
+ if (conf->op_version == GD_OP_VERSION_MIN) {
+ ret = glusterd_volinfo_get_boolean (volinfo,
+ VKEY_FEATURES_QUOTA);
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/quota",
+ volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+
+ ret = glusterd_volinfo_get_boolean (volinfo, "features.file-snapshot");
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/qemu-block", volname);
+
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* Do not allow changing read-after-open option if root-squash is
+ enabled.
+ */
+ ret = dict_get_str (set_dict, "performance.read-after-open", &tmp);
+ if (!ret) {
+ ret = dict_get_str (volinfo->dict, "server.root-squash", &tmp);
+ if (!ret) {
+ ob = _gf_false;
+ ret = gf_string2boolean (tmp, &ob);
+ if (!ret && ob) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "root-squash is enabled. Please turn it"
+ " off to change read-after-open "
+ "option");
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ /* open behind causes problems when root-squash is enabled
+ (by allowing reads to happen even though the squashed user
+ does not have permissions to do so) as it fakes open to be
+ successful and later sends reads on anonymous fds. So when
+ root-squash is enabled, open-behind's option to read after
+ open is done is also enabled.
+ */
+ ret = dict_get_str (set_dict, "server.root-squash", &tmp);
+ if (!ret) {
+ ret = gf_string2boolean (tmp, &var);
+ if (ret)
+ goto out;
+
+ if (var) {
+ ret = dict_get_str (volinfo->dict,
+ "performance.read-after-open",
+ &tmp);
+ if (!ret) {
+ ret = gf_string2boolean (tmp, &ob);
+ /* go ahead with turning read-after-open on
+ even if string2boolean conversion fails,
+ OR if read-after-open option is turned off
+ */
+ if (ret || !ob)
+ ret = dict_set_str (set_dict,
+ "performance.read-after-open",
+ "yes");
+ } else {
+ ret = dict_set_str (set_dict,
+ "performance.read-after-open",
+ "yes");
+ }
+ } else {
+ /* When root-squash has to be turned off, open-behind's
+ read-after-open option should be reset to what was
+ there before root-squash was turned on. If the option
+ cannot be found in volinfo's dict, it means that
+ option was not set before turning on root-squash.
+ */
+ ob = _gf_false;
+ ret = dict_get_str (volinfo->dict,
+ "performance.read-after-open",
+ &tmp);
+ if (!ret) {
+ ret = gf_string2boolean (tmp, &ob);
+
+ if (!ret && ob) {
+ ret = dict_set_str (set_dict,
+ "performance.read-after-open",
+ "yes");
+ }
+ }
+ /* consider operation is failure only if read-after-open
+ option is enabled and could not set into set_dict
+ */
+ if (!ob)
+ ret = 0;
+ }
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "setting "
+ "open behind option as part of root "
+ "squash failed");
+ goto out;
+ }
+ }
+ ret = client_graph_set_perf_options(graph, volinfo, set_dict);
if (ret)
goto out;
@@ -2430,14 +3083,28 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
&loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing client log level"
+ gf_log (this->name, GF_LOG_WARNING, "changing client log level"
" failed");
ret = volgen_graph_set_options_generic (graph, set_dict, "client",
&sys_loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing client syslog "
+ gf_log (this->name, GF_LOG_WARNING, "changing client syslog "
"level failed");
+
+ ret = volgen_graph_set_options_generic (graph, set_dict, "client",
+ &logger_option_handler);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing client logger"
+ " failed");
+
+ ret = volgen_graph_set_options_generic (graph, set_dict, "client",
+ &log_format_option_handler);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing client log format"
+ " failed");
+
out:
return ret;
}
@@ -2774,16 +3441,31 @@ build_shd_graph (volgen_graph_t *graph, dict_t *mod_dict)
&loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing loglevel "
+ gf_log (this->name, GF_LOG_WARNING, "changing loglevel "
"of self-heal daemon failed");
ret = volgen_graph_set_options_generic (graph, set_dict,
"client",
&sys_loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing syslog "
+ gf_log (this->name, GF_LOG_WARNING, "changing syslog "
"level of self-heal daemon failed");
+ ret = volgen_graph_set_options_generic (graph, set_dict,
+ "client",
+ &logger_option_handler);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing logger "
+ "of self-heal daemon failed");
+
+ ret = volgen_graph_set_options_generic (graph, set_dict,
+ "client",
+ &log_format_option_handler);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing log "
+ "format level of self-heal daemon failed");
+
ret = dict_reset (set_dict);
if (ret)
goto out;
@@ -2836,6 +3518,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
if (ret)
goto out;
+ ret = xlator_set_option (nfsxl, "nfs.drc", "on");
+ if (ret)
+ goto out;
+
list_for_each_entry (voliter, &priv->volumes, vol_list) {
if (voliter->status != GLUSTERD_STATUS_STARTED)
continue;
@@ -2948,9 +3634,6 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
return ret;
}
-
-
-
/****************************
*
* Volume generation interface
@@ -3011,7 +3694,7 @@ out:
if (brickinfo)
glusterd_brickinfo_delete (brickinfo);
if (volinfo)
- glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_unref (volinfo);
return ret;
}
@@ -3037,7 +3720,100 @@ glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo,
return ret;
}
+static int
+build_quotad_graph (volgen_graph_t *graph, dict_t *mod_dict)
+{
+ volgen_graph_t cgraph = {0};
+ glusterd_volinfo_t *voliter = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ dict_t *set_dict = NULL;
+ int ret = 0;
+ xlator_t *quotad_xl = NULL;
+ char *skey = NULL;
+ this = THIS;
+ priv = this->private;
+
+ set_dict = dict_new ();
+ if (!set_dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ quotad_xl = volgen_graph_add_as (graph, "features/quotad", "quotad");
+ if (!quotad_xl) {
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry (voliter, &priv->volumes, vol_list) {
+ if (voliter->status != GLUSTERD_STATUS_STARTED)
+ continue;
+
+ if (1 != glusterd_is_volume_quota_enabled (voliter))
+ continue;
+
+ ret = dict_set_uint32 (set_dict, "trusted-client",
+ GF_CLIENT_TRUSTED);
+ if (ret)
+ goto out;
+
+ dict_copy (voliter->dict, set_dict);
+ if (mod_dict)
+ dict_copy (mod_dict, set_dict);
+
+ ret = gf_asprintf(&skey, "%s.volume-id", voliter->volname);
+ if (ret == -1) {
+ gf_log("", GF_LOG_ERROR, "Out of memory");
+ goto out;
+ }
+ ret = xlator_set_option(quotad_xl, skey, voliter->volname);
+ GF_FREE(skey);
+ if (ret)
+ goto out;
+
+ memset (&cgraph, 0, sizeof (cgraph));
+ ret = volgen_graph_build_clients (&cgraph, voliter, set_dict,
+ NULL);
+ if (ret)
+ goto out;
+
+ ret = volume_volgen_graph_build_clusters (&cgraph, voliter,
+ _gf_true);
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+
+ if (mod_dict) {
+ dict_copy (mod_dict, set_dict);
+ ret = volgen_graph_set_options_generic (&cgraph, set_dict,
+ voliter,
+ basic_option_handler);
+ } else {
+ ret = volgen_graph_set_options_generic (&cgraph,
+ voliter->dict,
+ voliter,
+ basic_option_handler);
+ }
+ if (ret)
+ goto out;
+
+ ret = volgen_graph_merge_sub (graph, &cgraph, 1);
+ if (ret)
+ goto out;
+
+ ret = dict_reset (set_dict);
+ if (ret)
+ goto out;
+ }
+
+out:
+ if (set_dict)
+ dict_unref (set_dict);
+ return ret;
+}
static void
get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo)
@@ -3088,6 +3864,10 @@ generate_brick_volfiles (glusterd_volinfo_t *volinfo)
}
}
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ assign_groups(volinfo);
+ }
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
gf_log ("", GF_LOG_DEBUG,
"Found a brick - %s:%s", brickinfo->hostname,
@@ -3139,7 +3919,7 @@ enumerate_transport_reqs (gf_transport_type type, char **types)
}
}
-static int
+int
generate_client_volfiles (glusterd_volinfo_t *volinfo,
glusterd_client_type_t client_type)
{
@@ -3184,6 +3964,8 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo,
out:
if (dict)
dict_unref (dict);
+
+ gf_log ("", GF_LOG_TRACE, "Returning %d", ret);
return ret;
}
@@ -3319,7 +4101,6 @@ glusterd_check_nfs_volfile_identical (gf_boolean_t *identical)
GF_ASSERT (this);
GF_ASSERT (identical);
-
conf = this->private;
glusterd_get_nodesvc_volfile ("nfs", conf->workdir,
@@ -3357,6 +4138,67 @@ out:
}
int
+glusterd_check_nfs_topology_identical (gf_boolean_t *identical)
+{
+ char nfsvol[PATH_MAX] = {0,};
+ char tmpnfsvol[PATH_MAX] = {0,};
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = THIS;
+ int ret = -1;
+ int tmpclean = 0;
+ int tmpfd = -1;
+
+ if ((!identical) || (!this) || (!this->private))
+ goto out;
+
+ conf = (glusterd_conf_t *) this->private;
+
+ /* Fetch the original NFS volfile */
+ glusterd_get_nodesvc_volfile ("nfs", conf->workdir,
+ nfsvol, sizeof (nfsvol));
+
+ /* Create the temporary NFS volfile */
+ snprintf (tmpnfsvol, sizeof (tmpnfsvol), "/tmp/gnfs-XXXXXX");
+ tmpfd = mkstemp (tmpnfsvol);
+ if (tmpfd < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to create temp file %s: (%s)",
+ tmpnfsvol, strerror (errno));
+ goto out;
+ }
+
+ tmpclean = 1; /* SET the flag to unlink() tmpfile */
+
+ ret = glusterd_create_global_volfile (build_nfs_graph,
+ tmpnfsvol, NULL);
+ if (ret)
+ goto out;
+
+ /* Compare the topology of volfiles */
+ ret = glusterd_check_topology_identical (nfsvol, tmpnfsvol,
+ identical);
+out:
+ if (tmpfd >= 0)
+ close (tmpfd);
+ if (tmpclean)
+ unlink (tmpnfsvol);
+ return ret;
+}
+
+int
+glusterd_create_quotad_volfile ()
+{
+ char filepath[PATH_MAX] = {0,};
+ glusterd_conf_t *conf = THIS->private;
+
+ glusterd_get_nodesvc_volfile ("quotad", conf->workdir,
+ filepath, sizeof (filepath));
+ return glusterd_create_global_volfile (build_quotad_graph,
+ filepath, NULL);
+}
+
+
+int
glusterd_delete_volfile (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo)
{
@@ -3645,36 +4487,111 @@ out:
return ret;
}
-uint32_t
-glusterd_get_op_version_for_key (char *key)
-{
+static struct volopt_map_entry *
+_gd_get_vmep (char *key) {
char *completion = NULL;
struct volopt_map_entry *vmep = NULL;
- int ret = 0;
+ int ret = 0;
- COMPLETE_OPTION(key, completion, ret);
+ COMPLETE_OPTION ((char *)key, completion, ret);
for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
- if (strcmp (vmep->key, key) == 0) {
- return vmep->op_version;
- }
+ if (strcmp (vmep->key, key) == 0)
+ return vmep;
}
+ return NULL;
+}
+
+uint32_t
+glusterd_get_op_version_for_key (char *key)
+{
+ struct volopt_map_entry *vmep = NULL;
+
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+ if (vmep)
+ return vmep->op_version;
+
return 0;
}
gf_boolean_t
gd_is_client_option (char *key)
{
- char *completion = NULL;
struct volopt_map_entry *vmep = NULL;
- int ret = 0;
- COMPLETE_OPTION(key, completion, ret);
- for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
- if (strcmp (vmep->key, key) == 0) {
- return vmep->client_option;
- }
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+ if (vmep && (vmep->flags & OPT_FLAG_CLIENT_OPT))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+gf_boolean_t
+gd_is_xlator_option (char *key)
+{
+ struct volopt_map_entry *vmep = NULL;
+
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+ if (vmep && (vmep->flags & OPT_FLAG_XLATOR_OPT))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+volume_option_type_t
+_gd_get_option_type (char *key)
+{
+ struct volopt_map_entry *vmep = NULL;
+ void *dl_handle = NULL;
+ volume_opt_list_t vol_opt_list = {{0},};
+ int ret = -1;
+ volume_option_t *opt = NULL;
+ char *xlopt_key = NULL;
+ volume_option_type_t opt_type = GF_OPTION_TYPE_MAX;
+
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+
+ if (vmep) {
+ INIT_LIST_HEAD (&vol_opt_list.list);
+ ret = xlator_volopt_dynload (vmep->voltype, &dl_handle,
+ &vol_opt_list);
+ if (ret)
+ goto out;
+
+ if (_get_xlator_opt_key_from_vme (vmep, &xlopt_key))
+ goto out;
+
+ opt = xlator_volume_option_get_list (&vol_opt_list, xlopt_key);
+ _free_xlator_opt_key (xlopt_key);
+
+ if (opt)
+ opt_type = opt->type;
+ }
+
+out:
+ if (dl_handle) {
+ dlclose (dl_handle);
+ dl_handle = NULL;
}
+ return opt_type;
+}
+
+gf_boolean_t
+gd_is_boolean_option (char *key)
+{
+ GF_ASSERT (key);
+
+ if (GF_OPTION_TYPE_BOOL == _gd_get_option_type (key))
+ return _gf_true;
+
return _gf_false;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
index 746c6e92b..f4703c288 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.h
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -22,7 +22,10 @@
#define VKEY_DIAG_CNT_FOP_HITS "diagnostics.count-fop-hits"
#define VKEY_DIAG_LAT_MEASUREMENT "diagnostics.latency-measurement"
#define VKEY_FEATURES_LIMIT_USAGE "features.limit-usage"
+#define VKEY_FEATURES_SOFT_LIMIT "features.soft-limit"
#define VKEY_MARKER_XTIME GEOREP".indexing"
+#define VKEY_MARKER_XTIME_FORCE GEOREP".ignore-pid-check"
+#define VKEY_CHANGELOG "changelog.changelog"
#define VKEY_FEATURES_QUOTA "features.quota"
#define AUTH_ALLOW_MAP_KEY "auth.allow"
@@ -32,6 +35,10 @@
#define AUTH_REJECT_OPT_KEY "auth.addr.*.reject"
#define NFS_DISABLE_OPT_KEY "nfs.*.disable"
+// TBD - bring this from a common conf file
+#define NSR_MAX_REPLICA_GROUP_SIZE 8
+#define NSR_MAX_PATH_SIZE (1024 + PATH_MAX)
+#define NSR_CONF_PATH "/var/lib/glusterd/nsr/"
typedef enum {
GF_CLIENT_TRUSTED,
@@ -60,7 +67,9 @@ typedef enum {
typedef enum gd_volopt_flags_ {
OPT_FLAG_NONE,
- OPT_FLAG_FORCE = 1,
+ OPT_FLAG_FORCE = 0x01, // option needs force to be reset
+ OPT_FLAG_XLATOR_OPT = 0x02, // option enables/disables xlators
+ OPT_FLAG_CLIENT_OPT = 0x04, // option affects clients
} gd_volopt_flags_t;
typedef enum {
@@ -71,6 +80,7 @@ typedef enum {
GF_XLATOR_INDEX,
GF_XLATOR_MARKER,
GF_XLATOR_IO_STATS,
+ GF_XLATOR_BD,
GF_XLATOR_NONE,
} glusterd_server_xlator_t;
@@ -102,7 +112,7 @@ struct volopt_map_entry {
/* If client_option is true, the option affects clients.
* this is used to calculate client-op-version of volumes
*/
- gf_boolean_t client_option;
+ //gf_boolean_t client_option;
};
int glusterd_create_rb_volfiles (glusterd_volinfo_t *volinfo,
@@ -116,9 +126,14 @@ void glusterd_get_shd_filepath (char *filename);
int glusterd_create_nfs_volfile ();
int glusterd_create_shd_volfile ();
+int glusterd_create_quotad_volfile ();
int glusterd_delete_volfile (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo);
+int
+glusterd_delete_snap_volfile (glusterd_volinfo_t *volinfo,
+ glusterd_volinfo_t *snap_volinfo,
+ glusterd_brickinfo_t *brickinfo);
int glusterd_volinfo_get (glusterd_volinfo_t *volinfo, char *key, char **value);
int glusterd_volinfo_get_boolean (glusterd_volinfo_t *volinfo, char *key);
@@ -132,15 +147,33 @@ glusterd_check_voloption_flags (char *key, int32_t flags);
gf_boolean_t
glusterd_is_valid_volfpath (char *volname, char *brick);
int generate_brick_volfiles (glusterd_volinfo_t *volinfo);
+int generate_snap_brick_volfiles (glusterd_volinfo_t *volinfo,
+ glusterd_volinfo_t *snap_volinfo);
+int generate_client_volfiles (glusterd_volinfo_t *volinfo,
+ glusterd_client_type_t client_type);
+int
+generate_snap_client_volfiles (glusterd_volinfo_t *actual_volinfo,
+ glusterd_volinfo_t *snap_volinfo,
+ glusterd_client_type_t client_type,
+ gf_boolean_t vol_restore);
int glusterd_get_volopt_content (dict_t *dict, gf_boolean_t xml_out);
char*
glusterd_get_trans_type_rb (gf_transport_type ttype);
int
glusterd_check_nfs_volfile_identical (gf_boolean_t *identical);
+int
+glusterd_check_nfs_topology_identical (gf_boolean_t *identical);
uint32_t
glusterd_get_op_version_for_key (char *key);
gf_boolean_t
gd_is_client_option (char *key);
+
+gf_boolean_t
+gd_is_xlator_option (char *key);
+
+gf_boolean_t
+gd_is_boolean_option (char *key);
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index 0cae8d58b..504aeb839 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -30,6 +30,9 @@
#define glusterd_op_start_volume_args_get(dict, volname, flags) \
glusterd_op_stop_volume_args_get (dict, volname, flags)
+extern int
+_get_slave_status (dict_t *this, char *key, data_t *value, void *data);
+
int
__glusterd_handle_create_volume (rpcsvc_request_t *req)
{
@@ -526,9 +529,12 @@ __glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
glusterd_op_t cli_op = GD_OP_STATEDUMP_VOLUME;
char err_str[2048] = {0,};
xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
this = THIS;
GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
@@ -577,6 +583,14 @@ __glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
goto out;
}
+ if (priv->op_version == GD_OP_VERSION_MIN &&
+ strstr (options, "quotad")) {
+ snprintf (err_str, sizeof (err_str), "The cluster is operating "
+ "at op-version 1. Taking quotad's statedump is "
+ "disallowed in this state");
+ ret = -1;
+ goto out;
+ }
gf_log (this->name, GF_LOG_INFO, "Received statedump request for "
"volume %s with options %s", volname, options);
@@ -604,31 +618,96 @@ glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
}
#ifdef HAVE_BD_XLATOR
+/*
+ * Validates that the volume group (VG) backing the given brick can be
+ * opened. When @check_tag is non-zero, also rejects a VG that already
+ * carries a tag beginning with GF_XATTR_VOL_ID_KEY, to avoid using the same
+ * VG for multiple bricks.
+ * Tag is checked only during glusterd_op_stage_create_volume. Tag is set during
+ * glusterd_validate_and_create_brickpath().
+ *
+ * The VG name is brick->vg or, when brick->vg is an empty string, the first
+ * "/"-separated component of a copy of brick->path.
+ *
+ * Once the VG (and, if requested, the tag check) has passed, brick->caps is
+ * set to CAPS_BD | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT, and CAPS_THIN
+ * is added when any logical-volume segment reports segtype "thin-pool".
+ *
+ * @brick - brick info (its caps field is updated)
+ * @check_tag - check for VG tag or not
+ * @msg - Error message to return to caller. It is written with sprintf and
+ *        no size is passed in (NOTE(review): confirm every caller's buffer
+ *        is large enough for the VG name plus the fixed text).
+ *
+ * Returns 0 on success, -1 when lvm_init fails, the VG cannot be opened, or
+ * the tag check finds a GF_XATTR_VOL_ID_KEY tag.
+ */
int
-glusterd_is_valid_vg (const char *name)
+glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg)
{
- lvm_t handle = NULL;
- vg_t vg = NULL;
- char *vg_name = NULL;
- int retval = -1;
+ lvm_t handle = NULL;
+ vg_t vg = NULL;
+ char *vg_name = NULL;
+ int retval = 0;
+ char *p = NULL; /* heap copy of brick->path, freed at out: */
+ char *ptr = NULL; /* strtok_r save pointer */
+ struct dm_list *dm_lvlist = NULL;
+ struct dm_list *dm_seglist = NULL;
+ struct lvm_lv_list *lv_list = NULL;
+ struct lvm_property_value prop = {0, };
+ struct lvm_lvseg_list *seglist = NULL;
+ struct dm_list *taglist = NULL;
+ struct lvm_str_list *strl = NULL;
handle = lvm_init (NULL);
if (!handle) {
- gf_log ("", GF_LOG_ERROR, "lvm_init failed");
+ sprintf (msg, "lvm_init failed, could not validate vg");
return -1; /* direct return: no handle or VG to release yet */
}
- vg_name = gf_strdup (name);
- vg = lvm_vg_open (handle, basename (vg_name), "r", 0);
+ if (*brick->vg == '\0') { /* BD xlator has vg in brick->path */
+ p = gf_strdup (brick->path); /* NOTE(review): result is not NULL-checked before strtok_r */
+ vg_name = strtok_r (p, "/", &ptr); /* first "/"-separated component */
+ } else
+ vg_name = brick->vg;
+
+ vg = lvm_vg_open (handle, vg_name, "r", 0);
if (!vg) {
- gf_log ("", GF_LOG_ERROR, "no such vg: %s", vg_name);
- goto out;
+ sprintf (msg, "no such vg: %s", vg_name);
+ retval = -1;
+ goto out;
+ }
+ if (!check_tag)
+ goto next; /* caller did not ask for the tag check */
+
+ taglist = lvm_vg_get_tags (vg);
+ if (!taglist)
+ goto next; /* no tag list returned: nothing to compare */
+
+ dm_list_iterate_items (strl, taglist) {
+ if (!strncmp(strl->str, GF_XATTR_VOL_ID_KEY, /* prefix match on the tag */
+ strlen (GF_XATTR_VOL_ID_KEY))) {
+ sprintf (msg, "VG %s is already part of"
+ " a brick", vg_name);
+ retval = -1;
+ goto out;
+ }
+ }
+next:
+
+ brick->caps = CAPS_BD | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+
+ dm_lvlist = lvm_vg_list_lvs (vg);
+ if (!dm_lvlist)
+ goto out; /* retval is still its initial 0, so this path reports success */
+
+ dm_list_iterate_items (lv_list, dm_lvlist) {
+ dm_seglist = lvm_lv_list_lvsegs (lv_list->lv);
+ dm_list_iterate_items (seglist, dm_seglist) {
+ prop = lvm_lvseg_get_property (seglist->lvseg,
+ "segtype");
+ if (!prop.is_valid || !prop.value.string)
+ continue;
+ if (!strcmp (prop.value.string, "thin-pool")) {
+ brick->caps |= CAPS_THIN;
+ gf_log (THIS->name, GF_LOG_INFO, "Thin Pool "
+ "\"%s\" will be used for thin LVs",
+ lvm_lv_get_name (lv_list->lv));
+ break; /* leaves only the inner segment loop; remaining LVs are still scanned */
+ }
+ }
}
+
retval = 0;
out:
if (vg)
lvm_vg_close (vg);
lvm_quit (handle);
- GF_FREE (vg_name);
+ if (p)
+ GF_FREE (p);
return retval;
}
#endif
@@ -653,9 +732,6 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
char msg[2048] = {0};
uuid_t volume_uuid;
char *volume_uuid_str;
-#ifdef HAVE_BD_XLATOR
- char *dev_type = NULL;
-#endif
gf_boolean_t is_force = _gf_false;
this = THIS;
@@ -700,10 +776,6 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
goto out;
}
-#ifdef HAVE_BD_XLATOR
- ret = dict_get_str (dict, "device", &dev_type);
-#endif
-
ret = dict_get_str (dict, "bricks", &bricks);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Unable to get bricks for "
@@ -752,19 +824,14 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
goto out;
}
+ if (!uuid_compare (brick_info->uuid, MY_UUID)) {
#ifdef HAVE_BD_XLATOR
- if (dev_type) {
- ret = glusterd_is_valid_vg (brick_info->path);
- if (ret) {
- snprintf (msg, sizeof(msg), "invalid vg %s",
- brick_info->path);
- goto out;
+ if (brick_info->vg[0]) {
+ ret = glusterd_is_valid_vg (brick_info, 1, msg);
+ if (ret)
+ goto out;
}
-
- break;
- } else
#endif
- if (!uuid_compare (brick_info->uuid, MY_UUID)) {
ret = glusterd_validate_and_create_brickpath (brick_info,
volume_uuid, op_errstr,
is_force);
@@ -856,12 +923,13 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
- char msg[2048];
+ char msg[2048] = {0,};
glusterd_conf_t *priv = NULL;
xlator_t *this = NULL;
uuid_t volume_id = {0,};
char volid[50] = {0,};
char xattr_volid[50] = {0,};
+ int caps = 0;
this = THIS;
GF_ASSERT (this);
@@ -911,11 +979,10 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
if (uuid_compare (brickinfo->uuid, MY_UUID))
continue;
- if (volinfo->backend == GD_VOL_BK_BD)
- continue;
-
ret = gf_lstat_dir (brickinfo->path, NULL);
- if (ret) {
+ if (ret && (flags & GF_CLI_FLAG_OP_FORCE)) {
+ continue;
+ } else if (ret) {
snprintf (msg, sizeof (msg), "Failed to find "
"brick directory %s for volume %s. "
"Reason : %s", brickinfo->path,
@@ -924,13 +991,27 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
}
ret = sys_lgetxattr (brickinfo->path, GF_XATTR_VOL_ID_KEY,
volume_id, 16);
- if (ret < 0) {
+ if (ret < 0 && (!(flags & GF_CLI_FLAG_OP_FORCE))) {
snprintf (msg, sizeof (msg), "Failed to get "
"extended attribute %s for brick dir %s. "
"Reason : %s", GF_XATTR_VOL_ID_KEY,
brickinfo->path, strerror (errno));
ret = -1;
goto out;
+ } else if (ret < 0) {
+ ret = sys_lsetxattr (brickinfo->path,
+ GF_XATTR_VOL_ID_KEY,
+ volinfo->volume_id, 16,
+ XATTR_CREATE);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Failed to set "
+ "extended attribute %s on %s. Reason: "
+ "%s", GF_XATTR_VOL_ID_KEY,
+ brickinfo->path, strerror (errno));
+ goto out;
+ } else {
+ continue;
+ }
}
if (uuid_compare (volinfo->volume_id, volume_id)) {
snprintf (msg, sizeof (msg), "Volume id mismatch for "
@@ -942,8 +1023,24 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
ret = -1;
goto out;
}
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0])
+ caps = CAPS_BD | CAPS_THIN |
+ CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+ /* Check for VG/thin pool if its BD volume */
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+ if (ret)
+ goto out;
+ /* if anyone of the brick does not have thin support,
+ disable it for entire volume */
+ caps &= brickinfo->caps;
+ } else
+ caps = 0;
+#endif
}
+ volinfo->caps = caps;
ret = 0;
out:
if (ret && (msg[0] != '\0')) {
@@ -964,6 +1061,7 @@ glusterd_op_stage_stop_volume (dict_t *dict, char **op_errstr)
glusterd_volinfo_t *volinfo = NULL;
char msg[2048] = {0};
xlator_t *this = NULL;
+ gsync_status_param_t param = {0,};
this = THIS;
GF_ASSERT (this);
@@ -1007,7 +1105,22 @@ glusterd_op_stage_stop_volume (dict_t *dict, char **op_errstr)
if (ret && (is_run == _gf_false))
gf_log (this->name, GF_LOG_WARNING, "Unable to get the status"
" of active "GEOREP" session");
- if (is_run) {
+
+ param.volinfo = volinfo;
+ ret = dict_foreach (volinfo->gsync_slaves, _get_slave_status, &param);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "_get_slave_satus failed");
+ snprintf (msg, sizeof(msg), GEOREP" Unable to get the status "
+ "of active "GEOREP" session for the volume '%s'.\n"
+ "Please check the log file for more info. Use "
+ "'force' option to ignore and stop the volume.",
+ volname);
+ ret = -1;
+ goto out;
+ }
+
+ if (is_run && param.is_active) {
gf_log (this->name, GF_LOG_WARNING, GEOREP" sessions active"
"for the volume %s ", volname);
snprintf (msg, sizeof(msg), GEOREP" sessions are active "
@@ -1098,6 +1211,16 @@ glusterd_op_stage_delete_volume (dict_t *dict, char **op_errstr)
goto out;
}
+ if (volinfo->snap_count > 0 || !list_empty(&volinfo->snap_volumes)) {
+ snprintf (msg, sizeof (msg), "Cannot delete Volume %s ,"
+ "as it has %"PRIu64" snapshots. "
+ "To delete the volume, "
+ "first delete all the snapshots under it.",
+ volname, volinfo->snap_count);
+ ret = -1;
+ goto out;
+ }
+
ret = 0;
out:
@@ -1195,14 +1318,22 @@ glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr)
goto out;
}
- if ((heal_op != GF_AFR_OP_INDEX_SUMMARY) &&
- !glusterd_is_nodesvc_online ("glustershd")) {
- ret = -1;
- *op_errstr = gf_strdup ("Self-heal daemon is not running."
- " Check self-heal daemon log file.");
- gf_log (this->name, GF_LOG_WARNING, "%s", "Self-heal daemon is "
- "not running. Check self-heal daemon log file.");
- goto out;
+ switch (heal_op) {
+ case GF_AFR_OP_INDEX_SUMMARY:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ break;
+ default:
+ if (!glusterd_is_nodesvc_online("glustershd")){
+ ret = -1;
+ *op_errstr = gf_strdup ("Self-heal daemon is "
+ "not running. Check self-heal "
+ "daemon log file.");
+ gf_log (this->name, GF_LOG_WARNING, "%s",
+ "Self-heal daemon is not running."
+ "Check self-heal daemon log file.");
+ goto out;
+ }
}
ret = 0;
@@ -1222,6 +1353,13 @@ glusterd_op_stage_statedump_volume (dict_t *dict, char **op_errstr)
gf_boolean_t is_running = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
char msg[2408] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
ret = glusterd_op_statedump_volume_args_get (dict, &volname, &options,
&option_cnt);
@@ -1230,10 +1368,7 @@ glusterd_op_stage_statedump_volume (dict_t *dict, char **op_errstr)
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- snprintf (msg, sizeof(msg), "Volume %s does not exist",
- volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ snprintf (msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
goto out;
}
@@ -1243,16 +1378,31 @@ glusterd_op_stage_statedump_volume (dict_t *dict, char **op_errstr)
is_running = glusterd_is_volume_started (volinfo);
if (!is_running) {
- snprintf (msg, sizeof(msg), "Volume %s is not in a started"
+ snprintf (msg, sizeof(msg), "Volume %s is not in the started"
" state", volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
ret = -1;
goto out;
}
+ if (priv->op_version == GD_OP_VERSION_MIN &&
+ strstr (options, "quotad")) {
+ snprintf (msg, sizeof (msg), "The cluster is operating "
+ "at op-version 1. Taking quotad's statedump is "
+ "disallowed in this state");
+ ret = -1;
+ goto out;
+ }
+ if ((strstr (options, "quotad")) &&
+ (!glusterd_is_volume_quota_enabled (volinfo))) {
+ snprintf (msg, sizeof (msg), "Quota is not enabled on "
+ "volume %s", volname);
+ ret = -1;
+ goto out;
+ }
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ if (ret && msg[0] != '\0')
+ *op_errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1326,109 +1476,6 @@ out:
return ret;
}
-#ifdef HAVE_BD_XLATOR
-int
-glusterd_op_stage_bd (dict_t *dict, char **op_errstr)
-{
- int ret = -1;
- char *volname = NULL;
- char *path = NULL;
- char *size = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char msg[2048] = {0,};
- gf_xl_bd_op_t bd_op = GF_BD_OP_INVALID;
- uint64_t bytes = 0;
-
- ret = dict_get_str (dict, "volname", &volname);
- if (ret) {
- snprintf (msg, sizeof(msg), "Failed to get volume name");
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
- ret = dict_get_int32 (dict, "bd-op", (int32_t *)&bd_op);
- if (ret) {
- snprintf (msg, sizeof(msg), "Failed to get bd-op");
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
- ret = dict_get_str (dict, "path", &path);
- if (ret) {
- snprintf (msg, sizeof(msg), "Failed to get path");
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
- if (bd_op == GF_BD_OP_NEW_BD) {
- ret = dict_get_str (dict, "size", &size);
- if (ret) {
- snprintf (msg, sizeof(msg), "Failed to get size");
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
- if (gf_string2bytesize (size, &bytes) < 0) {
- snprintf (msg, sizeof(msg),
- "Invalid size %s, suffix with KB, MB etc",
- size);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- ret = -1;
- goto out;
- }
- } else if (bd_op == GF_BD_OP_SNAPSHOT_BD) {
- ret = dict_get_str (dict, "size", &size);
- if (ret) {
- snprintf (msg, sizeof(msg), "Failed to get size");
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
- if (gf_string2bytesize (size, &bytes) < 0) {
- ret = -1;
- snprintf (msg, sizeof(msg),
- "Invalid size %s, suffix with KB, MB etc",
- size);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
- }
-
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret) {
- snprintf (msg, sizeof(msg), "Volume %s does not exist",
- volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
- ret = glusterd_validate_volume_id (dict, volinfo);
- if (ret)
- goto out;
-
- if (!glusterd_is_volume_started (volinfo)) {
- snprintf (msg, sizeof(msg), "Volume %s is not started",
- volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- ret = -1;
- goto out;
- }
-
- ret = 0;
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-#endif
-
int
glusterd_op_create_volume (dict_t *dict, char **op_errstr)
{
@@ -1450,9 +1497,9 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
char *str = NULL;
char *username = NULL;
char *password = NULL;
-#ifdef HAVE_BD_XLATOR
- char *device = NULL;
-#endif
+ int caps = 0;
+ int brickid = 0;
+ char msg[1024] __attribute__((unused)) = {0, };
this = THIS;
GF_ASSERT (this);
@@ -1507,12 +1554,6 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
goto out;
}
-#ifdef HAVE_BD_XLATOR
- ret = dict_get_str (dict, "device", &device);
- if (!ret)
- volinfo->backend = GD_VOL_BK_BD;
-#endif
-
/* replica-count 1 means, no replication, file is in one brick only */
volinfo->replica_count = 1;
/* stripe-count 1 means, no striping, file is present as a whole */
@@ -1621,23 +1662,55 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
if (count)
brick = strtok_r (brick_list+1, " \n", &saveptr);
+ caps = CAPS_BD | CAPS_THIN | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+ brickid = glusterd_get_next_available_brickid (volinfo);
+ if (brickid < 0)
+ goto out;
while ( i <= count) {
ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
if (ret)
goto out;
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo,
+ brickid++);
+
ret = glusterd_resolve_brick (brickinfo);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
brickinfo->hostname, brickinfo->path);
goto out;
}
+
+#ifdef HAVE_BD_XLATOR
+ if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ msg);
+ goto out;
+ }
+
+ /* if anyone of the brick does not have thin
+ support, disable it for entire volume */
+ caps &= brickinfo->caps;
+
+
+ } else
+ caps = 0;
+ }
+#endif
+
list_add_tail (&brickinfo->brick_list, &volinfo->bricks);
brick = strtok_r (NULL, " \n", &saveptr);
i++;
}
+ gd_update_volume_op_versions (volinfo);
+
+ volinfo->caps = caps;
+
ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret) {
glusterd_store_delete_volume (volinfo);
@@ -1652,14 +1725,14 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
}
volinfo->rebal.defrag_status = 0;
- list_add_tail (&volinfo->vol_list, &priv->volumes);
+ list_add_order (&volinfo->vol_list, &priv->volumes,
+ glusterd_compare_volume_name);
vol_added = _gf_true;
- gd_update_volume_op_versions (volinfo);
out:
GF_FREE(free_ptr);
if (!vol_added && volinfo)
- glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_unref (volinfo);
return ret;
}
@@ -1689,7 +1762,10 @@ glusterd_op_start_volume (dict_t *dict, char **op_errstr)
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
ret = glusterd_brick_start (volinfo, brickinfo, _gf_true);
- if (ret)
+ /* If 'force' try to start all bricks regardless of success or
+ * failure
+ */
+ if (!(flags & GF_CLI_FLAG_OP_FORCE) && ret)
goto out;
}
@@ -1706,6 +1782,74 @@ out:
return ret;
}
+int
+glusterd_stop_volume (glusterd_volinfo_t *volinfo)
+{ /*
+ * Stops each brick in volinfo->bricks, sets the volume status to
+ * GLUSTERD_STATUS_STOPPED, stores the volinfo, runs umount on the quota
+ * auxiliary mount when its pidfile reports a running service, and finally
+ * calls glusterd_nodesvcs_handle_graph_change().
+ *
+ * Returns 0 when the graph-change call succeeds; otherwise the non-zero
+ * result of the first failing brick stop, store, or graph-change step.
+ * A failed umount is only logged, because ret is reassigned afterwards.
+ * NOTE(review): for a NULL volinfo, GF_VALIDATE_OR_GOTO presumably jumps
+ * to out with ret still -1; confirm against the macro definition.
+ */
+ int ret = -1;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char mountdir[PATH_MAX] = {0,};
+ runner_t runner = {0,};
+ char pidfile[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ ret = glusterd_brick_stop (volinfo, brickinfo, _gf_false);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to stop "
+ "brick (%s)", brickinfo->path);
+ goto out; /* give up at the first brick that fails to stop */
+ }
+ }
+
+ glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STOPPED);
+
+ ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store volinfo of "
+ "%s volume", volinfo->volname);
+ goto out;
+ }
+
+ /* If quota auxiliary mount is present, unmount it */
+ GLUSTERFS_GET_AUX_MOUNT_PIDFILE (pidfile, volinfo->volname);
+
+ if (!gf_is_service_running (pidfile, NULL)) {
+ gf_log (this->name, GF_LOG_DEBUG, "Aux mount of volume %s "
+ "absent", volinfo->volname);
+ } else {
+ GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (mountdir, volinfo->volname,
+ "/");
+
+ runinit (&runner);
+ runner_add_args (&runner, "umount",
+
+ #if GF_LINUX_HOST_OS
+ "-l", /* lazy unmount; only passed when GF_LINUX_HOST_OS is set */
+ #endif
+ mountdir, NULL);
+ ret = runner_run_reuse (&runner);
+ if (ret) /* logged only: ret is overwritten by the graph-change call below */
+ gf_log (this->name, GF_LOG_ERROR, "umount on %s failed, "
+ "reason : %s", mountdir, strerror (errno)); /* NOTE(review): confirm errno is meaningful after runner_run_reuse */
+
+ runner_end (&runner);
+ }
+
+ ret = glusterd_nodesvcs_handle_graph_change (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to notify graph "
+ "change for %s volume", volinfo->volname);
+ goto out; /* redundant: the out label follows immediately */
+ }
+
+out:
+ return ret;
+}
int
glusterd_op_stop_volume (dict_t *dict)
@@ -1714,7 +1858,6 @@ glusterd_op_stop_volume (dict_t *dict)
int flags = 0;
char *volname = NULL;
glusterd_volinfo_t *volinfo = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
xlator_t *this = NULL;
this = THIS;
@@ -1731,19 +1874,12 @@ glusterd_op_stop_volume (dict_t *dict)
goto out;
}
- list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- ret = glusterd_brick_stop (volinfo, brickinfo, _gf_false);
- if (ret)
- goto out;
- }
-
- glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STOPPED);
-
- ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
- if (ret)
+ ret = glusterd_stop_volume (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to stop %s volume",
+ volname);
goto out;
-
- ret = glusterd_nodesvcs_handle_graph_change (volinfo);
+ }
out:
return ret;
}
@@ -1775,6 +1911,10 @@ glusterd_op_delete_volume (dict_t *dict)
goto out;
}
+ ret = glusterd_remove_auxiliary_mount (volname);
+ if (ret)
+ goto out;
+
ret = glusterd_delete_volume (volinfo);
out:
gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
@@ -1813,6 +1953,12 @@ glusterd_op_statedump_volume (dict_t *dict, char **op_errstr)
ret = glusterd_nfs_statedump (options, option_cnt, op_errstr);
if (ret)
goto out;
+
+ } else if (strstr (options, "quotad")) {
+ ret = glusterd_quotad_statedump (options, option_cnt,
+ op_errstr);
+ if (ret)
+ goto out;
} else {
list_for_each_entry (brickinfo, &volinfo->bricks,
brick_list) {
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index b04afdf95..1374e82cd 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -116,8 +116,8 @@ validate_cache_max_min_size (dict_t *dict, char *key, char *value,
"performance.cache-max-file-size",
&current_max_value);
if (current_max_value) {
- gf_string2bytesize (current_max_value, &max_value);
- gf_string2bytesize (value, &min_value);
+ gf_string2bytesize_uint64 (current_max_value, &max_value);
+ gf_string2bytesize_uint64 (value, &min_value);
current_min_value = value;
}
} else if ((!strcmp (key, "performance.cache-max-file-size")) ||
@@ -126,8 +126,8 @@ validate_cache_max_min_size (dict_t *dict, char *key, char *value,
"performance.cache-min-file-size",
&current_min_value);
if (current_min_value) {
- gf_string2bytesize (current_min_value, &min_value);
- gf_string2bytesize (value, &max_value);
+ gf_string2bytesize_uint64 (current_min_value, &min_value);
+ gf_string2bytesize_uint64 (value, &max_value);
current_max_value = value;
}
}
@@ -347,197 +347,227 @@ out:
struct volopt_map_entry glusterd_volopt_map[] = {
/* DHT xlator options */
- { .key = "cluster.lookup-unhashed",
- .voltype = "cluster/distribute",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.min-free-disk",
- .voltype = "cluster/distribute",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.min-free-inodes",
- .voltype = "cluster/distribute",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.rebalance-stats",
- .voltype = "cluster/distribute",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.subvols-per-directory",
- .voltype = "cluster/distribute",
- .option = "directory-layout-spread",
- .op_version = 2,
- .validate_fn = validate_subvols_per_directory,
- .client_option = _gf_true
- },
- { .key = "cluster.readdir-optimize",
- .voltype = "cluster/distribute",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.nufa",
- .voltype = "cluster/distribute",
- .option = "!nufa",
- .type = NO_DOC,
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.rsync-hash-regex",
- .voltype = "cluster/distribute",
- .type = NO_DOC,
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.extra-hash-regex",
- .voltype = "cluster/distribute",
- .type = NO_DOC,
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.dht-xattr-name",
- .voltype = "cluster/distribute",
- .option = "xattr-name",
- .type = NO_DOC,
- .op_version = 2,
- .client_option = _gf_true
+ { .key = "cluster.lookup-unhashed",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.min-free-disk",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.min-free-inodes",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.rebalance-stats",
+ .voltype = "cluster/distribute",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.subvols-per-directory",
+ .voltype = "cluster/distribute",
+ .option = "directory-layout-spread",
+ .op_version = 2,
+ .validate_fn = validate_subvols_per_directory,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.readdir-optimize",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.rsync-hash-regex",
+ .voltype = "cluster/distribute",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.extra-hash-regex",
+ .voltype = "cluster/distribute",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.dht-xattr-name",
+ .voltype = "cluster/distribute",
+ .option = "xattr-name",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* NUFA xlator options (Distribute special case) */
+ { .key = "cluster.nufa",
+ .voltype = "cluster/distribute",
+ .option = "!nufa",
+ .type = NO_DOC,
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.local-volume-name",
+ .voltype = "cluster/nufa",
+ .option = "local-volume-name",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Switch xlator options (Distribute special case) */
+ { .key = "cluster.switch",
+ .voltype = "cluster/distribute",
+ .option = "!switch",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.switch-pattern",
+ .voltype = "cluster/switch",
+ .option = "pattern.switch.case",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
},
/* AFR xlator options */
- { .key = "cluster.entry-change-log",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.read-subvolume",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.read-subvolume-index",
- .voltype = "cluster/replicate",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.read-hash-mode",
- .voltype = "cluster/replicate",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.background-self-heal-count",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.metadata-self-heal",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.data-self-heal",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.entry-self-heal",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
+ { .key = "cluster.entry-change-log",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.read-subvolume",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.read-subvolume-index",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.read-hash-mode",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.background-self-heal-count",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.metadata-self-heal",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.data-self-heal",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.entry-self-heal",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
},
{ .key = "cluster.self-heal-daemon",
.voltype = "cluster/replicate",
.option = "!self-heal-daemon",
.op_version = 1
},
- { .key = "cluster.heal-timeout",
- .voltype = "cluster/replicate",
- .option = "!heal-timeout",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.strict-readdir",
- .voltype = "cluster/replicate",
- .type = NO_DOC,
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.self-heal-window-size",
- .voltype = "cluster/replicate",
- .option = "data-self-heal-window-size",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.data-change-log",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.metadata-change-log",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.data-self-heal-algorithm",
- .voltype = "cluster/replicate",
- .option = "data-self-heal-algorithm",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.eager-lock",
- .voltype = "cluster/replicate",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.quorum-type",
- .voltype = "cluster/replicate",
- .option = "quorum-type",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.quorum-count",
- .voltype = "cluster/replicate",
- .option = "quorum-count",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "cluster.choose-local",
- .voltype = "cluster/replicate",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.self-heal-readdir-size",
- .voltype = "cluster/replicate",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.post-op-delay-secs",
- .voltype = "cluster/replicate",
- .type = NO_DOC,
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "cluster.readdir-failover",
- .voltype = "cluster/replicate",
- .op_version = 2,
- .client_option = _gf_true
+ { .key = "cluster.heal-timeout",
+ .voltype = "cluster/replicate",
+ .option = "!heal-timeout",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.strict-readdir",
+ .voltype = "cluster/replicate",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.self-heal-window-size",
+ .voltype = "cluster/replicate",
+ .option = "data-self-heal-window-size",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.data-change-log",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.metadata-change-log",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.data-self-heal-algorithm",
+ .voltype = "cluster/replicate",
+ .option = "data-self-heal-algorithm",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.eager-lock",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.quorum-type",
+ .voltype = "cluster/replicate",
+ .option = "quorum-type",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.quorum-count",
+ .voltype = "cluster/replicate",
+ .option = "quorum-count",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.choose-local",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.self-heal-readdir-size",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.post-op-delay-secs",
+ .voltype = "cluster/replicate",
+ .type = NO_DOC,
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.readdir-failover",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.ensure-durability",
+ .voltype = "cluster/replicate",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
},
/* Stripe xlator options */
- { .key = "cluster.stripe-block-size",
- .voltype = "cluster/stripe",
- .option = "block-size",
- .op_version = 1,
+ { .key = "cluster.stripe-block-size",
+ .voltype = "cluster/stripe",
+ .option = "block-size",
+ .op_version = 1,
.validate_fn = validate_stripe,
- .client_option = _gf_true
+ .flags = OPT_FLAG_CLIENT_OPT
},
- { .key = "cluster.stripe-coalesce",
- .voltype = "cluster/stripe",
- .option = "coalesce",
- .op_version = 2,
- .client_option = _gf_true
+ { .key = "cluster.stripe-coalesce",
+ .voltype = "cluster/stripe",
+ .option = "coalesce",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
},
/* IO-stats xlator options */
@@ -563,55 +593,77 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.option = "!brick-log-level",
.op_version = 1
},
- { .key = "diagnostics.client-log-level",
- .voltype = "debug/io-stats",
- .option = "!client-log-level",
- .op_version = 1,
- .client_option = _gf_true
+ { .key = "diagnostics.client-log-level",
+ .voltype = "debug/io-stats",
+ .option = "!client-log-level",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
},
{ .key = "diagnostics.brick-sys-log-level",
.voltype = "debug/io-stats",
.option = "!sys-log-level",
.op_version = 1
},
- { .key = "diagnostics.client-sys-log-level",
- .voltype = "debug/io-stats",
- .option = "!sys-log-level",
- .op_version = 1,
- .client_option = _gf_true
+ { .key = "diagnostics.client-sys-log-level",
+ .voltype = "debug/io-stats",
+ .option = "!sys-log-level",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "diagnostics.brick-logger",
+ .voltype = "debug/io-stats",
+ .option = "!logger",
+ .op_version = 4
+ },
+ { .key = "diagnostics.client-logger",
+ .voltype = "debug/io-stats",
+ .option = "!logger",
+ .op_version = 4,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "diagnostics.brick-log-format",
+ .voltype = "debug/io-stats",
+ .option = "!log-format",
+ .op_version = 4
+ },
+ { .key = "diagnostics.client-log-format",
+ .voltype = "debug/io-stats",
+ .option = "!log-format",
+ .op_version = 4,
+ .flags = OPT_FLAG_CLIENT_OPT
},
/* IO-cache xlator options */
- { .key = "performance.cache-max-file-size",
- .voltype = "performance/io-cache",
- .option = "max-file-size",
- .op_version = 1,
- .validate_fn = validate_cache_max_min_size,
- .client_option = _gf_true
+ { .key = "performance.cache-max-file-size",
+ .voltype = "performance/io-cache",
+ .option = "max-file-size",
+ .op_version = 1,
+ .validate_fn = validate_cache_max_min_size,
+ .flags = OPT_FLAG_CLIENT_OPT
},
- { .key = "performance.cache-min-file-size",
- .voltype = "performance/io-cache",
- .option = "min-file-size",
- .op_version = 1,
- .validate_fn = validate_cache_max_min_size,
- .client_option = _gf_true
+ { .key = "performance.cache-min-file-size",
+ .voltype = "performance/io-cache",
+ .option = "min-file-size",
+ .op_version = 1,
+ .validate_fn = validate_cache_max_min_size,
+ .flags = OPT_FLAG_CLIENT_OPT
},
- { .key = "performance.cache-refresh-timeout",
- .voltype = "performance/io-cache",
- .option = "cache-timeout",
- .op_version = 1,
- .client_option = _gf_true
+ { .key = "performance.cache-refresh-timeout",
+ .voltype = "performance/io-cache",
+ .option = "cache-timeout",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
},
- { .key = "performance.cache-priority",
- .voltype = "performance/io-cache",
- .option = "priority",
- .op_version = 1,
- .client_option = _gf_true
+ { .key = "performance.cache-priority",
+ .voltype = "performance/io-cache",
+ .option = "priority",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
},
- { .key = "performance.cache-size",
- .voltype = "performance/io-cache",
- .op_version = 1,
- .client_option = _gf_true
+ { .key = "performance.cache-size",
+ .voltype = "performance/io-cache",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
},
/* IO-threads xlator options */
@@ -638,102 +690,136 @@ struct volopt_map_entry glusterd_volopt_map[] = {
},
{ .key = "performance.enable-least-priority",
.voltype = "performance/io-threads",
- .op_version = 2
+ .op_version = 1
},
{ .key = "performance.least-rate-limit",
.voltype = "performance/io-threads",
- .op_version = 1
+ .op_version = 2
},
/* Other perf xlators' options */
- { .key = "performance.cache-size",
- .voltype = "performance/quick-read",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "performance.flush-behind",
- .voltype = "performance/write-behind",
- .option = "flush-behind",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "performance.write-behind-window-size",
- .voltype = "performance/write-behind",
- .option = "cache-size",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "performance.strict-o-direct",
- .voltype = "performance/write-behind",
- .option = "strict-O_DIRECT",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "performance.strict-write-ordering",
- .voltype = "performance/write-behind",
- .option = "strict-write-ordering",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "performance.lazy-open",
- .voltype = "performance/open-behind",
- .option = "lazy-open",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "performance.read-ahead-page-count",
- .voltype = "performance/read-ahead",
- .option = "page-count",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "performance.md-cache-timeout",
- .voltype = "performance/md-cache",
- .option = "md-cache-timeout",
- .op_version = 2,
- .client_option = _gf_true
+ { .key = "performance.cache-size",
+ .voltype = "performance/quick-read",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.flush-behind",
+ .voltype = "performance/write-behind",
+ .option = "flush-behind",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.write-behind-window-size",
+ .voltype = "performance/write-behind",
+ .option = "cache-size",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.strict-o-direct",
+ .voltype = "performance/write-behind",
+ .option = "strict-O_DIRECT",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.strict-write-ordering",
+ .voltype = "performance/write-behind",
+ .option = "strict-write-ordering",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.lazy-open",
+ .voltype = "performance/open-behind",
+ .option = "lazy-open",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.read-after-open",
+ .voltype = "performance/open-behind",
+ .option = "read-after-open",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.read-ahead-page-count",
+ .voltype = "performance/read-ahead",
+ .option = "page-count",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.md-cache-timeout",
+ .voltype = "performance/md-cache",
+ .option = "md-cache-timeout",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Crypt xlator options */
+
+ { .key = "features.encryption",
+ .voltype = "encryption/crypt",
+ .option = "!feat",
+ .value = "off",
+ .op_version = 3,
+ .description = "enable/disable client-side encryption for "
+ "the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+
+ { .key = "encryption.master-key",
+ .voltype = "encryption/crypt",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "encryption.data-key-size",
+ .voltype = "encryption/crypt",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "encryption.block-size",
+ .voltype = "encryption/crypt",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
},
/* Client xlator options */
- { .key = "network.frame-timeout",
- .voltype = "protocol/client",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "network.ping-timeout",
- .voltype = "protocol/client",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "network.tcp-window-size",
- .voltype = "protocol/client",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "features.lock-heal",
- .voltype = "protocol/client",
- .option = "lk-heal",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "features.grace-timeout",
- .voltype = "protocol/client",
- .option = "grace-timeout",
- .op_version = 1,
- .client_option = _gf_true
- },
- { .key = "client.ssl",
- .voltype = "protocol/client",
- .option = "transport.socket.ssl-enabled",
- .type = NO_DOC,
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "network.remote-dio",
- .voltype = "protocol/client",
- .option = "filter-O_DIRECT",
- .op_version = 1,
- .client_option = _gf_true
+ { .key = "network.frame-timeout",
+ .voltype = "protocol/client",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "network.ping-timeout",
+ .voltype = "protocol/client",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "network.tcp-window-size",
+ .voltype = "protocol/client",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "features.lock-heal",
+ .voltype = "protocol/client",
+ .option = "lk-heal",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "features.grace-timeout",
+ .voltype = "protocol/client",
+ .option = "grace-timeout",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "client.ssl",
+ .voltype = "protocol/client",
+ .option = "transport.socket.ssl-enabled",
+ .type = NO_DOC,
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "network.remote-dio",
+ .voltype = "protocol/client",
+ .option = "filter-O_DIRECT",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
},
/* Server xlator options */
@@ -773,11 +859,27 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.option = "root-squash",
.op_version = 2
},
+ { .key = "server.anonuid",
+ .voltype = "protocol/server",
+ .option = "anonuid",
+ .op_version = 3
+ },
+ { .key = "server.anongid",
+ .voltype = "protocol/server",
+ .option = "anongid",
+ .op_version = 3
+ },
{ .key = "server.statedump-path",
.voltype = "protocol/server",
.option = "statedump-path",
.op_version = 1
},
+ { .key = "server.outstanding-rpc-limit",
+ .voltype = "protocol/server",
+ .option = "rpc.outstanding-rpc-limit",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
{ .key = "features.lock-heal",
.voltype = "protocol/server",
.option = "lk-heal",
@@ -796,118 +898,202 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = NO_DOC,
.op_version = 2
},
-
- /* Performance xlators enable/disbable options */
- { .key = "performance.write-behind",
- .voltype = "performance/write-behind",
- .option = "!perf",
- .value = "on",
- .op_version = 1,
- .description = "enable/disable write-behind translator in the "
- "volume.",
- .client_option = _gf_true
- },
- { .key = "performance.read-ahead",
- .voltype = "performance/read-ahead",
- .option = "!perf",
- .value = "on",
- .op_version = 1,
- .description = "enable/disable read-ahead translator in the "
- "volume.",
- .client_option = _gf_true
- },
- { .key = "performance.io-cache",
- .voltype = "performance/io-cache",
- .option = "!perf",
- .value = "on",
- .op_version = 1,
- .description = "enable/disable io-cache translator in the volume.",
- .client_option = _gf_true
+ { .key = "cluster.nsr",
+ .voltype = "cluster/nsr",
+ .option = "!nsr",
+ .op_version = 3,
+ .description = "enable NSR instead of AFR for replication",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
},
- { .key = "performance.quick-read",
- .voltype = "performance/quick-read",
- .option = "!perf",
- .value = "on",
- .op_version = 1,
- .description = "enable/disable quick-read translator in the "
- "volume.",
- .client_option = _gf_true
- },
- { .key = "performance.open-behind",
- .voltype = "performance/open-behind",
- .option = "!perf",
- .value = "on",
- .op_version = 2,
- .description = "enable/disable open-behind translator in the "
- "volume.",
- .client_option = _gf_true
- },
- { .key = "performance.stat-prefetch",
- .voltype = "performance/md-cache",
- .option = "!perf",
- .value = "on",
- .op_version = 1,
- .description = "enable/disable meta-data caching translator in the "
- "volume.",
- .client_option = _gf_true
+ { .key = "cluster.nsr.recon",
+ .voltype = "cluster/nsr",
+ .op_version = 3,
+ .description = "enable NSR reconciliation",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
},
- { .key = "performance.client-io-threads",
- .voltype = "performance/io-threads",
- .option = "!perf",
- .value = "off",
- .op_version = 1,
- .description = "enable/disable io-threads translator in the client "
- "graph of volume.",
- .client_option = _gf_true
+ { .key = "cluster.nsr.quorum-percent",
+ .voltype = "cluster/nsr",
+ .option = "quorum-percent",
+ .op_version = 3,
+ .description = "percent of rep_count-1 bricks that must be up"
},
- { .key = "performance.nfs.write-behind",
+
+ /* Performance xlators enable/disable options */
+ { .key = "performance.write-behind",
.voltype = "performance/write-behind",
- .option = "!nfsperf",
+ .option = "!perf",
.value = "on",
- .type = NO_DOC,
- .op_version = 1
+ .op_version = 1,
+ .description = "enable/disable write-behind translator in the "
+ "volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
},
- { .key = "performance.nfs.read-ahead",
+ { .key = "performance.read-ahead",
.voltype = "performance/read-ahead",
- .option = "!nfsperf",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable read-ahead translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.readdir-ahead",
+ .voltype = "performance/readdir-ahead",
+ .option = "!perf",
.value = "off",
- .type = NO_DOC,
- .op_version = 1
+ .op_version = 3,
+ .description = "enable/disable readdir-ahead translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
},
- { .key = "performance.nfs.io-cache",
+
+ { .key = "performance.io-cache",
.voltype = "performance/io-cache",
- .option = "!nfsperf",
- .value = "off",
- .type = NO_DOC,
- .op_version = 1
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable io-cache translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT
},
- { .key = "performance.nfs.quick-read",
+ { .key = "performance.quick-read",
.voltype = "performance/quick-read",
- .option = "!nfsperf",
- .value = "off",
- .type = NO_DOC,
- .op_version = 1
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable quick-read translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+
+ },
+ { .key = "performance.open-behind",
+ .voltype = "performance/open-behind",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 2,
+ .description = "enable/disable open-behind translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+
},
- { .key = "performance.nfs.stat-prefetch",
+ { .key = "performance.stat-prefetch",
.voltype = "performance/md-cache",
- .option = "!nfsperf",
- .value = "off",
- .type = NO_DOC,
- .op_version = 1
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable meta-data caching translator in the "
+ "volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
},
- { .key = "performance.nfs.io-threads",
+ { .key = "performance.client-io-threads",
.voltype = "performance/io-threads",
- .option = "!nfsperf",
+ .option = "!perf",
+ .value = "off",
+ .op_version = 1,
+ .description = "enable/disable io-threads translator in the client "
+ "graph of volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.write-behind",
+ .voltype = "performance/write-behind",
+ .option = "!nfsperf",
+ .value = "on",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.read-ahead",
+ .voltype = "performance/read-ahead",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.io-cache",
+ .voltype = "performance/io-cache",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.quick-read",
+ .voltype = "performance/quick-read",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.stat-prefetch",
+ .voltype = "performance/md-cache",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.io-threads",
+ .voltype = "performance/io-threads",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.force-readdirp",
+ .voltype = "performance/md-cache",
+ .option = "force-readdirp",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Feature translators */
+ { .key = "features.file-snapshot",
+ .voltype = "features/qemu-block",
+ .option = "!feat",
.value = "off",
+ .op_version = 3,
+ .description = "enable/disable file-snapshot feature in the "
+ "volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+
+#ifdef HAVE_LIB_Z
+ /* Compressor-decompressor xlator options
+ * defaults used from xlators/features/compress/src/cdc.h
+ */
+ { .key = "network.compression",
+ .voltype = "features/cdc",
+ .option = "!feat",
+ .value = "off",
+ .op_version = 3,
+ .description = "enable/disable network compression translator",
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "network.compression.window-size",
+ .voltype = "features/cdc",
+ .option = "window-size",
+ .op_version = 3
+ },
+ { .key = "network.compression.mem-level",
+ .voltype = "features/cdc",
+ .option = "mem-level",
+ .op_version = 3
+ },
+ { .key = "network.compression.min-size",
+ .voltype = "features/cdc",
+ .option = "min-size",
+ .op_version = 3
+ },
+ { .key = "network.compression.compression-level",
+ .voltype = "features/cdc",
+ .option = "compression-level",
+ .op_version = 3
+ },
+ { .key = "network.compression.debug",
+ .voltype = "features/cdc",
+ .option = "debug",
.type = NO_DOC,
- .op_version = 1
- },
- { .key = "performance.force-readdirp",
- .voltype = "performance/md-cache",
- .option = "force-readdirp",
- .op_version = 2,
- .client_option = _gf_true
+ .op_version = 3
},
+#endif
/* Quota xlator options */
{ .key = VKEY_FEATURES_LIMIT_USAGE,
@@ -915,15 +1101,38 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.option = "limit-set",
.type = NO_DOC,
.op_version = 1,
- .client_option = _gf_true
},
- { .key = "features.quota-timeout",
+ {
+ .key = "features.quota-timeout",
.voltype = "features/quota",
.option = "timeout",
.value = "0",
.op_version = 1,
.validate_fn = validate_quota,
- .client_option = _gf_true
+ },
+ { .key = "features.default-soft-limit",
+ .voltype = "features/quota",
+ .option = "default-soft-limit",
+ .type = NO_DOC,
+ .op_version = 3,
+ },
+ { .key = "features.soft-timeout",
+ .voltype = "features/quota",
+ .option = "soft-timeout",
+ .type = NO_DOC,
+ .op_version = 3,
+ },
+ { .key = "features.hard-timeout",
+ .voltype = "features/quota",
+ .option = "hard-timeout",
+ .type = NO_DOC,
+ .op_version = 3,
+ },
+ { .key = "features.alert-time",
+ .voltype = "features/quota",
+ .option = "alert-time",
+ .type = NO_DOC,
+ .op_version = 3,
},
{ .key = "features.quota-deem-statfs",
.voltype = "features/quota",
@@ -932,7 +1141,6 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = DOC,
.op_version = 2,
.validate_fn = validate_quota,
- .client_option = _gf_true
},
/* Marker xlator options */
@@ -952,6 +1160,22 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.flags = OPT_FLAG_FORCE,
.op_version = 1
},
+ { .key = VKEY_MARKER_XTIME_FORCE,
+ .voltype = "features/marker",
+ .option = "gsync-force-xtime",
+ .value = "off",
+ .type = NO_DOC,
+ .flags = OPT_FLAG_FORCE,
+ .op_version = 2
+ },
+ { .key = VKEY_MARKER_XTIME_FORCE,
+ .voltype = "features/marker",
+ .option = "!gsync-force-xtime",
+ .value = "off",
+ .type = NO_DOC,
+ .flags = OPT_FLAG_FORCE,
+ .op_version = 2
+ },
{ .key = VKEY_FEATURES_QUOTA,
.voltype = "features/marker",
.option = "quota",
@@ -962,12 +1186,13 @@ struct volopt_map_entry glusterd_volopt_map[] = {
},
/* Debug xlators options */
- { .key = "debug.trace",
- .voltype = "debug/trace",
- .option = "!debug",
- .value = "off",
- .type = NO_DOC,
- .op_version = 1
+ { .key = "debug.trace",
+ .voltype = "debug/trace",
+ .option = "!debug",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
},
{ .key = "debug.log-history",
.voltype = "debug/trace",
@@ -993,36 +1218,37 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = NO_DOC,
.op_version = 2
},
- { .key = "debug.error-gen",
- .voltype = "debug/error-gen",
- .option = "!debug",
- .value = "off",
- .type = NO_DOC,
- .op_version = 1
+ { .key = "debug.error-gen",
+ .voltype = "debug/error-gen",
+ .option = "!debug",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
},
{ .key = "debug.error-failure",
.voltype = "debug/error-gen",
.option = "failure",
.type = NO_DOC,
- .op_version = 2
+ .op_version = 3
},
{ .key = "debug.error-number",
.voltype = "debug/error-gen",
.option = "error-no",
.type = NO_DOC,
- .op_version = 2
+ .op_version = 3
},
{ .key = "debug.random-failure",
.voltype = "debug/error-gen",
.option = "random-failure",
.type = NO_DOC,
- .op_version = 2
+ .op_version = 3
},
{ .key = "debug.error-fops",
.voltype = "debug/error-gen",
.option = "enable",
.type = NO_DOC,
- .op_version = 2
+ .op_version = 3
},
@@ -1069,6 +1295,12 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = GLOBAL_DOC,
.op_version = 1
},
+ { .key = "nfs.outstanding-rpc-limit",
+ .voltype = "nfs/server",
+ .option = "rpc.outstanding-rpc-limit",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
{ .key = "nfs.port",
.voltype = "nfs/server",
.option = "nfs.port",
@@ -1139,57 +1371,136 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.type = GLOBAL_DOC,
.op_version = 1
},
+ { .key = "nfs.acl",
+ .voltype = "nfs/server",
+ .option = "nfs.acl",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
{ .key = "nfs.mount-udp",
.voltype = "nfs/server",
.option = "nfs.mount-udp",
.type = GLOBAL_DOC,
.op_version = 1
},
+ { .key = "nfs.mount-rmtab",
+ .voltype = "nfs/server",
+ .option = "nfs.mount-rmtab",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.rpc-statd",
+ .voltype = "nfs/server",
+ .option = "nfs.rpc-statd",
+ .type = NO_DOC,
+ .op_version = 4,
+ },
+ { .key = "nfs.log-level",
+ .voltype = "nfs/server",
+ .option = "nfs.log-level",
+ .type = NO_DOC,
+ .op_version = 4,
+ },
{ .key = "nfs.server-aux-gids",
.voltype = "nfs/server",
.option = "nfs.server-aux-gids",
.type = NO_DOC,
.op_version = 2
},
+ { .key = "nfs.drc",
+ .voltype = "nfs/server",
+ .option = "nfs.drc",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.drc-size",
+ .voltype = "nfs/server",
+ .option = "nfs.drc-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.read-size",
+ .voltype = "nfs/server",
+ .option = "nfs3.read-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.write-size",
+ .voltype = "nfs/server",
+ .option = "nfs3.write-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.readdir-size",
+ .voltype = "nfs/server",
+ .option = "nfs3.readdir-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
/* Other options which don't fit any place above */
- { .key = "features.read-only",
- .voltype = "features/read-only",
- .option = "!read-only",
- .value = "off",
- .op_version = 2,
- .client_option = _gf_true
- },
- { .key = "features.worm",
- .voltype = "features/worm",
- .option = "!worm",
- .value = "off",
- .op_version = 2,
- .client_option = _gf_true
+ { .key = "features.read-only",
+ .voltype = "features/read-only",
+ .option = "!read-only",
+ .value = "off",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "features.worm",
+ .voltype = "features/worm",
+ .option = "!worm",
+ .value = "off",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
},
{ .key = "storage.linux-aio",
.voltype = "storage/posix",
- .op_version = 2
+ .op_version = 1
+ },
+ { .key = "storage.batch-fsync-mode",
+ .voltype = "storage/posix",
+ .op_version = 3
+ },
+ { .key = "storage.batch-fsync-delay-usec",
+ .voltype = "storage/posix",
+ .op_version = 3
+ },
+ { .key = "storage.xattr-user-namespace-mode",
+ .voltype = "storage/posix",
+ .op_version = 4
},
{ .key = "storage.owner-uid",
.voltype = "storage/posix",
.option = "brick-uid",
- .op_version = 2
+ .op_version = 1
},
{ .key = "storage.owner-gid",
.voltype = "storage/posix",
.option = "brick-gid",
- .op_version = 2
+ .op_version = 1
},
{ .key = "storage.node-uuid-pathinfo",
.voltype = "storage/posix",
- .op_version = 2
+ .op_version = 3
},
- { .key = "config.memory-accounting",
- .voltype = "configuration",
- .option = "!config",
- .op_version = 2,
- .client_option = _gf_true
+ { .key = "storage.health-check-interval",
+ .voltype = "storage/posix",
+ .op_version = 3
+ },
+ { .option = "update-link-count-parent",
+ .key = "storage.build-pgfid",
+ .voltype = "storage/posix",
+ .op_version = 4
+ },
+ { .key = "storage.bd-aio",
+ .voltype = "storage/bd",
+ .op_version = 3
+ },
+ { .key = "config.memory-accounting",
+ .voltype = "configuration",
+ .option = "!config",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
},
{ .key = "config.transport",
.voltype = "configuration",
@@ -1206,6 +1517,42 @@ struct volopt_map_entry glusterd_volopt_map[] = {
.value = "0",
.op_version = 2
},
+ /* changelog translator - global tunables */
+ { .key = "changelog.changelog",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.changelog-dir",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.encoding",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.rollover-time",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.fsync-interval",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "features.barrier",
+ .voltype = "features/barrier",
+ .value = "disable",
+ .op_version = 4
+ },
+ { .key = "features.barrier-timeout",
+ .voltype = "features/barrier",
+ .value = "120",
+ .op_version = 4
+ },
{ .key = NULL
}
};
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
index 3b419d2e0..4d09d7fd9 100644
--- a/xlators/mgmt/glusterd/src/glusterd.c
+++ b/xlators/mgmt/glusterd/src/glusterd.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -30,12 +30,14 @@
#include "dict.h"
#include "compat.h"
#include "compat-errno.h"
+#include "syscall.h"
#include "statedump.h"
#include "glusterd-sm.h"
#include "glusterd-op-sm.h"
#include "glusterd-store.h"
#include "glusterd-hooks.h"
#include "glusterd-utils.h"
+#include "glusterd-locks.h"
#include "common-utils.h"
#include "run.h"
@@ -44,29 +46,43 @@
#include "glusterd-mountbroker.h"
extern struct rpcsvc_program gluster_handshake_prog;
+extern struct rpcsvc_program gluster_cli_getspec_prog;
extern struct rpcsvc_program gluster_pmap_prog;
extern glusterd_op_info_t opinfo;
extern struct rpcsvc_program gd_svc_mgmt_prog;
+extern struct rpcsvc_program gd_svc_mgmt_v3_prog;
extern struct rpcsvc_program gd_svc_peer_prog;
extern struct rpcsvc_program gd_svc_cli_prog;
+extern struct rpcsvc_program gd_svc_cli_prog_ro;
extern struct rpc_clnt_program gd_brick_prog;
extern struct rpcsvc_program glusterd_mgmt_hndsk_prog;
+extern char snap_mount_folder[PATH_MAX];
+
rpcsvc_cbk_program_t glusterd_cbk_prog = {
.progname = "Gluster Callback",
.prognum = GLUSTER_CBK_PROGRAM,
.progver = GLUSTER_CBK_VERSION,
};
-struct rpcsvc_program *all_programs[] = {
+struct rpcsvc_program *gd_inet_programs[] = {
&gd_svc_peer_prog,
- &gd_svc_cli_prog,
+ &gd_svc_cli_prog_ro,
&gd_svc_mgmt_prog,
+ &gd_svc_mgmt_v3_prog,
&gluster_pmap_prog,
&gluster_handshake_prog,
&glusterd_mgmt_hndsk_prog,
};
-int rpcsvc_programs_count = (sizeof (all_programs) / sizeof (all_programs[0]));
+int gd_inet_programs_count = (sizeof (gd_inet_programs) /
+ sizeof (gd_inet_programs[0]));
+
+struct rpcsvc_program *gd_uds_programs[] = {
+ &gd_svc_cli_prog,
+ &gluster_cli_getspec_prog,
+};
+int gd_uds_programs_count = (sizeof (gd_uds_programs) /
+ sizeof (gd_uds_programs[0]));
const char *gd_op_list[GD_OP_MAX + 1] = {
[GD_OP_NONE] = "Invalid op",
@@ -94,6 +110,10 @@ const char *gd_op_list[GD_OP_MAX + 1] = {
[GD_OP_LIST_VOLUME] = "Lists",
[GD_OP_CLEARLOCKS_VOLUME] = "Clear locks",
[GD_OP_DEFRAG_BRICK_VOLUME] = "Rebalance",
+ [GD_OP_COPY_FILE] = "Copy File",
+ [GD_OP_SYS_EXEC] = "Execute system commands",
+ [GD_OP_GSYNC_CREATE] = "Geo-replication Create",
+ [GD_OP_SNAP] = "Snapshot",
[GD_OP_MAX] = "Invalid op"
};
@@ -119,12 +139,12 @@ glusterd_uuid_init ()
GF_ASSERT (this);
priv = this->private;
- ret = glusterd_retrieve_uuid ();
- if (ret == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "retrieved UUID: %s", uuid_utoa (priv->uuid));
- return 0;
- }
+ ret = glusterd_retrieve_uuid ();
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "retrieved UUID: %s", uuid_utoa (priv->uuid));
+ return 0;
+ }
ret = glusterd_uuid_generate_save ();
@@ -292,7 +312,7 @@ out:
}
-inline int32_t
+static inline int32_t
glusterd_program_register (xlator_t *this, rpcsvc_t *svc,
rpcsvc_program_t *prog)
{
@@ -519,7 +539,7 @@ runinit_gsyncd_setrx (runner_t *runner, glusterd_conf_t *conf)
{
runinit (runner);
runner_add_args (runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (runner, "%s/"GSYNC_CONF,conf->workdir);
+ runner_argprintf (runner, "%s/"GSYNC_CONF_TEMPLATE, conf->workdir);
runner_add_arg (runner, "--config-set-rx");
}
@@ -581,7 +601,7 @@ configure_syncdaemon (glusterd_conf_t *conf)
/* gluster-params */
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner, "gluster-params",
- "xlator-option=*-dht.assert-no-child-down=true",
+ "aux-gfid-mount",
".", ".", NULL);
RUN_GSYNCD_CMD;
@@ -595,17 +615,50 @@ configure_syncdaemon (glusterd_conf_t *conf)
runner_add_args (&runner, ".", ".", NULL);
RUN_GSYNCD_CMD;
+ /* ssh-command tar */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "ssh-command-tar");
+ runner_argprintf (&runner,
+ "ssh -oPasswordAuthentication=no "
+ "-oStrictHostKeyChecking=no "
+ "-i %s/tar_ssh.pem", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
/* pid-file */
runinit_gsyncd_setrx (&runner, conf);
runner_add_arg (&runner, "pid-file");
- runner_argprintf (&runner, "%s/${mastervol}/${eSlave}.pid", georepdir);
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.pid", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* geo-rep working dir */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "georep-session-working-dir");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/", georepdir);
runner_add_args (&runner, ".", ".", NULL);
RUN_GSYNCD_CMD;
/* state-file */
runinit_gsyncd_setrx (&runner, conf);
runner_add_arg (&runner, "state-file");
- runner_argprintf (&runner, "%s/${mastervol}/${eSlave}.status", georepdir);
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.status", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* state-detail-file */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "state-detail-file");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status",
+ georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* state-detail-file */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "state-detail-file");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status",
+ georepdir);
runner_add_args (&runner, ".", ".", NULL);
RUN_GSYNCD_CMD;
@@ -633,10 +686,32 @@ configure_syncdaemon (glusterd_conf_t *conf)
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner,
"gluster-log-file",
- DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}.gluster.log",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}${local_id}.gluster.log",
".", ".", NULL);
RUN_GSYNCD_CMD;
+ /* ignore-deletes */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args (&runner, "ignore-deletes", "true", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* special-sync-mode */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args (&runner, "special-sync-mode", "partial", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* change-detector == changelog */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg(&runner, "working-dir");
+ runner_argprintf(&runner, "%s/${mastervol}/${eSlave}",
+ DEFAULT_VAR_RUN_DIRECTORY);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
/************
* slave pre-configuration
************/
@@ -650,7 +725,7 @@ configure_syncdaemon (glusterd_conf_t *conf)
/* gluster-params */
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner, "gluster-params",
- "xlator-option=*-dht.assert-no-child-down=true",
+ "aux-gfid-mount",
".", NULL);
RUN_GSYNCD_CMD;
@@ -730,7 +805,7 @@ check_prepare_mountbroker_root (char *mountbroker_root)
dfd0 = dup (dfd);
for (;;) {
- ret = openat (dfd, "..", O_RDONLY);
+ ret = sys_openat (dfd, "..", O_RDONLY);
if (ret != -1) {
dfd2 = ret;
ret = fstat (dfd2, &st2);
@@ -765,11 +840,11 @@ check_prepare_mountbroker_root (char *mountbroker_root)
st = st2;
}
- ret = mkdirat (dfd0, MB_HIVE, 0711);
+ ret = sys_mkdirat (dfd0, MB_HIVE, 0711);
if (ret == -1 && errno == EEXIST)
ret = 0;
if (ret != -1)
- ret = fstatat (dfd0, MB_HIVE, &st, AT_SYMLINK_NOFOLLOW);
+ ret = sys_fstatat (dfd0, MB_HIVE, &st, AT_SYMLINK_NOFOLLOW);
if (ret == -1 || st.st_mode != (S_IFDIR|0711)) {
gf_log ("", GF_LOG_ERROR,
"failed to set up mountbroker-root directory %s",
@@ -866,6 +941,14 @@ _install_mount_spec (dict_t *opts, char *key, data_t *value, void *data)
"adding %smount spec failed: label: %s desc: %s",
georep ? GEOREP" " : "", label, pdesc);
+ if (mspec) {
+ if (mspec->patterns) {
+ GF_FREE (mspec->patterns->components);
+ GF_FREE (mspec->patterns);
+ }
+ GF_FREE (mspec);
+ }
+
return -1;
}
@@ -896,6 +979,188 @@ glusterd_launch_synctask (synctask_fn_t fn, void *opaque)
" and other volume related services");
}
+/*
+ * Notification callback installed on the unix domain socket rpcsvc.
+ *
+ * It intentionally does nothing: calls arriving over the unix domain socket
+ * do not need the work glusterd_rpcsvc_notify() performs.  The function is
+ * kept as a hook in case uds-specific handling is required later.
+ *
+ * All four arguments are ignored; the return value is always 0.
+ */
+int
+glusterd_uds_rpcsvc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
+                            void *data)
+{
+        int     status = 0;
+
+        return status;
+}
+
+/*
+ * glusterd_init_uds_listener - start the unix domain socket listener.
+ *
+ * The glusterd unix domain socket listener only listens for cli requests.
+ * The socket path comes from the "glusterd-sockfile" option of @this, or
+ * DEFAULT_GLUSTERD_SOCKFILE when that option is absent.  An rpcsvc is
+ * created on that path, glusterd_uds_rpcsvc_notify() is registered on it,
+ * and every entry of gd_uds_programs[] is registered as a program.
+ *
+ * Returns the new rpcsvc on success, or NULL on any failure (an error is
+ * logged and the partially built rpcsvc is freed).
+ */
+rpcsvc_t *
+glusterd_init_uds_listener (xlator_t *this)
+{
+        int             ret = -1;
+        dict_t          *options = NULL;
+        rpcsvc_t        *rpc = NULL;
+        data_t          *sock_data = NULL;
+        char            sockfile[PATH_MAX+1] = {0,};
+        int             i = 0;
+
+        GF_ASSERT (this);
+
+        /* sockfile is zero-filled and one byte larger than the copy limit,
+         * so it stays NUL-terminated even when strncpy truncates. */
+        sock_data = dict_get (this->options, "glusterd-sockfile");
+        if (!sock_data) {
+                strncpy (sockfile, DEFAULT_GLUSTERD_SOCKFILE, PATH_MAX);
+        } else {
+                strncpy (sockfile, sock_data->data, PATH_MAX);
+        }
+
+        options = dict_new ();
+        if (!options)
+                goto out;
+
+        ret = rpcsvc_transport_unix_options_build (&options, sockfile);
+        if (ret)
+                goto out;
+
+        rpc = rpcsvc_init (this, this->ctx, options, 8);
+        if (rpc == NULL) {
+                ret = -1;
+                goto out;
+        }
+
+        ret = rpcsvc_register_notify (rpc, glusterd_uds_rpcsvc_notify,
+                                      this);
+        if (ret) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Failed to register notify function");
+                goto out;
+        }
+
+        ret = rpcsvc_create_listeners (rpc, options, this->name);
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_DEBUG, "Failed to create listener");
+                /* Anything other than exactly one listener is a failure,
+                 * including 0.  Force a non-zero ret so the error path at
+                 * out: runs; otherwise a 0 here would be reported as
+                 * success and an rpcsvc without a listener returned. */
+                ret = -1;
+                goto out;
+        }
+        ret = 0;
+
+        for (i = 0; i < gd_uds_programs_count; i++) {
+                ret = glusterd_program_register (this, rpc, gd_uds_programs[i]);
+                if (ret) {
+                        /* Roll back the programs registered so far. */
+                        i--;
+                        for (; i >= 0; i--)
+                                rpcsvc_program_unregister (rpc,
+                                                           gd_uds_programs[i]);
+
+                        goto out;
+                }
+        }
+
+out:
+        if (options)
+                dict_unref (options);
+
+        if (ret) {
+                gf_log (this->name, GF_LOG_ERROR, "Failed to start glusterd "
+                        "unix domain socket listener.");
+                if (rpc) {
+                        GF_FREE (rpc);
+                        rpc = NULL;
+                }
+        }
+        return rpc;
+}
+
+/*
+ * glusterd_stop_uds_listener - tear down the unix domain socket listener.
+ *
+ * Unregisters the cli and handshake programs from conf->uds_rpc, destroys
+ * every listener on it, unregisters the notify callback, removes the socket
+ * file and finally frees the rpcsvc.  conf is taken from this->private.
+ */
+void
+glusterd_stop_uds_listener (xlator_t *this)
+{
+        glusterd_conf_t         *conf = NULL;
+        rpcsvc_listener_t       *listener = NULL;
+        rpcsvc_listener_t       *next = NULL;
+        data_t                  *sock_data = NULL;
+        char                    sockfile[PATH_MAX+1] = {0,};
+
+        GF_ASSERT (this);
+        conf = this->private;
+
+        (void) rpcsvc_program_unregister (conf->uds_rpc, &gd_svc_cli_prog);
+        (void) rpcsvc_program_unregister (conf->uds_rpc, &gluster_handshake_prog);
+
+        list_for_each_entry_safe (listener, next, &conf->uds_rpc->listeners,
+                                  list) {
+                rpcsvc_listener_destroy (listener);
+        }
+
+        /* Must name the callback that glusterd_init_uds_listener()
+         * registered, i.e. glusterd_uds_rpcsvc_notify. */
+        (void) rpcsvc_unregister_notify (conf->uds_rpc,
+                                         glusterd_uds_rpcsvc_notify,
+                                         this);
+
+        /* Remove the socket file the listener was actually created on:
+         * resolve the path the same way glusterd_init_uds_listener() does
+         * instead of assuming the default location. */
+        sock_data = dict_get (this->options, "glusterd-sockfile");
+        if (!sock_data) {
+                strncpy (sockfile, DEFAULT_GLUSTERD_SOCKFILE, PATH_MAX);
+        } else {
+                strncpy (sockfile, sock_data->data, PATH_MAX);
+        }
+        unlink (sockfile);
+
+        GF_FREE (conf->uds_rpc);
+        conf->uds_rpc = NULL;
+
+        return;
+}
+
+/*
+ * glusterd_init_snap_folder - pick and prepare the snapshot mount folder.
+ *
+ * Fills the snap_mount_folder buffer with
+ * <run dir> + GLUSTERD_DEFAULT_SNAPS_BRICK_DIR and makes sure that path
+ * exists as a directory, creating it (and missing parents) when absent.
+ *
+ * Returns 0 on success and non-zero on failure; failures are logged.
+ */
+static int
+glusterd_init_snap_folder (xlator_t *this)
+{
+        int             ret      = -1;
+        struct stat     stbuf    = {0,};
+        const char      *run_dir = NULL;
+
+        GF_ASSERT (this);
+
+        /* Snapshot volumes get mounted below /var/run/gluster/snaps, but
+         * /var/run is commonly a symlink to /run.  The mtab entry would
+         * then differ from the path glusterd records, so work out the real
+         * run folder first and build the snap path from that.
+         */
+        ret = lstat (GLUSTERD_VAR_RUN_DIR, &stbuf);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "stat fails on %s, exiting. (errno = %d)",
+                        GLUSTERD_VAR_RUN_DIR, errno);
+                goto out;
+        }
+
+        /* A symlinked /var/run means /run is the real location. */
+        run_dir = S_ISLNK (stbuf.st_mode) ? GLUSTERD_RUN_DIR
+                                          : GLUSTERD_VAR_RUN_DIR;
+        strcpy (snap_mount_folder, run_dir);
+        strcat (snap_mount_folder, GLUSTERD_DEFAULT_SNAPS_BRICK_DIR);
+
+        ret = stat (snap_mount_folder, &stbuf);
+        if (ret == 0) {
+                /* Path exists: it is only usable if it is a directory. */
+                if (!S_ISDIR (stbuf.st_mode)) {
+                        gf_log (this->name, GF_LOG_CRITICAL,
+                                "Provided snap path %s is not a directory,"
+                                "exiting", snap_mount_folder);
+                        ret = -1;
+                }
+                goto out;
+        }
+
+        if (errno != ENOENT) {
+                /* stat failed for a reason other than "does not exist". */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "stat fails on %s, exiting. (errno = %d)",
+                        snap_mount_folder, errno);
+                ret = -1;
+                goto out;
+        }
+
+        /* Path is missing: create it together with any missing parents. */
+        ret = mkdir_p (snap_mount_folder, 0777, _gf_false);
+        if (-1 == ret) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Unable to create directory %s"
+                        " ,errno = %d", snap_mount_folder, errno);
+        }
+
+out:
+        return ret;
+}
+
+
/*
* init - called during glusterd initialization
*
@@ -907,6 +1172,7 @@ init (xlator_t *this)
{
int32_t ret = -1;
rpcsvc_t *rpc = NULL;
+ rpcsvc_t *uds_rpc = NULL;
glusterd_conf_t *conf = NULL;
data_t *dir_data = NULL;
struct stat buf = {0,};
@@ -932,7 +1198,7 @@ init (xlator_t *this)
if ((ret != 0) && (ENOENT != errno)) {
gf_log (this->name, GF_LOG_ERROR,
"stat fails on %s, exiting. (errno = %d)",
- workdir, errno);
+ workdir, errno);
exit (1);
}
@@ -957,9 +1223,18 @@ init (xlator_t *this)
first_time = 1;
}
+ setenv ("GLUSTERD_WORKING_DIR", workdir, 1);
gf_log (this->name, GF_LOG_INFO, "Using %s as working directory",
workdir);
+ ret = glusterd_init_snap_folder (this);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Unable to create "
+ "snap backend folder");
+ exit (1);
+ }
+
snprintf (cmd_log_filename, PATH_MAX,"%s/.cmd_log_history",
DEFAULT_LOG_FILE_DIRECTORY);
ret = gf_cmd_log_init (cmd_log_filename);
@@ -981,6 +1256,17 @@ init (xlator_t *this)
exit (1);
}
+ snprintf (storedir, PATH_MAX, "%s/snaps", workdir);
+
+ ret = mkdir (storedir, 0777);
+
+ if ((-1 == ret) && (errno != EEXIST)) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Unable to create snaps directory %s"
+ " ,errno = %d", storedir, errno);
+ exit (1);
+ }
+
snprintf (storedir, PATH_MAX, "%s/peers", workdir);
ret = mkdir (storedir, 0777);
@@ -1019,6 +1305,15 @@ init (xlator_t *this)
exit (1);
}
+ snprintf (storedir, PATH_MAX, "%s/quotad", workdir);
+ ret = mkdir (storedir, 0777);
+ if ((-1 == ret) && (errno != EEXIST)) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Unable to create quotad directory %s"
+ " ,errno = %d", storedir, errno);
+ exit (1);
+ }
+
snprintf (storedir, PATH_MAX, "%s/groups", workdir);
ret = mkdir (storedir, 0777);
if ((-1 == ret) && (errno != EEXIST)) {
@@ -1057,32 +1352,49 @@ init (xlator_t *this)
goto out;
}
- for (i = 0; i < rpcsvc_programs_count; i++) {
- ret = glusterd_program_register (this, rpc, all_programs[i]);
+ for (i = 0; i < gd_inet_programs_count; i++) {
+ ret = glusterd_program_register (this, rpc,
+ gd_inet_programs[i]);
if (ret) {
i--;
for (; i >= 0; i--)
rpcsvc_program_unregister (rpc,
- all_programs[i]);
+ gd_inet_programs[i]);
goto out;
}
}
+ /* Start a unix domain socket listener just for cli commands.
+ * This should prevent ports from being wasted by sitting in TIME_WAIT
+ * when cli commands are run continuously.
+ */
+ uds_rpc = glusterd_init_uds_listener (this);
+ if (uds_rpc == NULL) {
+ ret = -1;
+ goto out;
+ }
+
conf = GF_CALLOC (1, sizeof (glusterd_conf_t),
gf_gld_mt_glusterd_conf_t);
GF_VALIDATE_OR_GOTO(this->name, conf, out);
- conf->shd = GF_CALLOC (1, sizeof (nodesrv_t),
- gf_gld_mt_nodesrv_t);
+
+ conf->shd = GF_CALLOC (1, sizeof (nodesrv_t), gf_gld_mt_nodesrv_t);
GF_VALIDATE_OR_GOTO(this->name, conf->shd, out);
- conf->nfs = GF_CALLOC (1, sizeof (nodesrv_t),
- gf_gld_mt_nodesrv_t);
+ conf->nfs = GF_CALLOC (1, sizeof (nodesrv_t), gf_gld_mt_nodesrv_t);
GF_VALIDATE_OR_GOTO(this->name, conf->nfs, out);
+ conf->quotad = GF_CALLOC (1, sizeof (nodesrv_t),
+ gf_gld_mt_nodesrv_t);
+ GF_VALIDATE_OR_GOTO(this->name, conf->quotad, out);
INIT_LIST_HEAD (&conf->peers);
INIT_LIST_HEAD (&conf->volumes);
+ INIT_LIST_HEAD (&conf->snapshots);
+ INIT_LIST_HEAD (&conf->missed_snaps_list);
+
pthread_mutex_init (&conf->mutex, NULL);
conf->rpc = rpc;
+ conf->uds_rpc = uds_rpc;
conf->gfs_mgmt = &gd_brick_prog;
strncpy (conf->workdir, workdir, PATH_MAX);
@@ -1100,6 +1412,12 @@ init (xlator_t *this)
if (ret)
goto out;
+ conf->base_port = GF_IANA_PRIV_PORTS_START;
+ if (dict_get_uint32(this->options, "base-port", &conf->base_port) == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "base-port override: %d", conf->base_port);
+ }
+
/* Set option to run bricks on valgrind if enabled in glusterd.vol */
conf->valgrind = _gf_false;
ret = dict_get_str (this->options, "run-with-valgrind", &valgrind_str);
@@ -1115,6 +1433,8 @@ init (xlator_t *this)
}
this->private = conf;
+ glusterd_mgmt_v3_lock_init ();
+ glusterd_txn_opinfo_dict_init ();
(void) glusterd_nodesvc_set_online_status ("glustershd", _gf_false);
GLUSTERD_GET_HOOKS_DIR (hooks_dir, GLUSTERD_HOOK_VER, conf);
@@ -1155,7 +1475,21 @@ init (xlator_t *this)
if (list_empty (&conf->peers)) {
glusterd_launch_synctask (glusterd_spawn_daemons, NULL);
+ gf_log (this->name, GF_LOG_INFO,
+ "no peers, should start FRESH etcd");
+ /*
+ * We might not have any peers now, but if we did once before
+ * then we don't want to start up with a config that still has
+ * references to them.
+ */
+ nuke_etcd_dir();
+ }
+ else {
+ gf_log (this->name, GF_LOG_INFO,
+ "have peers, should start etcd with old config");
}
+ conf->etcd_pid = start_etcd(uuid_utoa(MY_UUID),NULL);
+
ret = glusterd_options_init (this);
if (ret < 0)
goto out;
@@ -1199,11 +1533,19 @@ fini (xlator_t *this)
goto out;
conf = this->private;
+
+ glusterd_stop_uds_listener (this);
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+
FREE (conf->pmap);
if (conf->handle)
- glusterd_store_handle_destroy (conf->handle);
+ gf_store_handle_destroy (conf->handle);
glusterd_sm_tr_log_delete (&conf->op_sm_log);
+ glusterd_mgmt_v3_lock_fini ();
+ glusterd_txn_opinfo_dict_fini ();
GF_FREE (conf);
+
this->private = NULL;
out:
return;
@@ -1296,13 +1638,31 @@ struct volume_options options[] = {
{ .key = {"server-quorum-type"},
.type = GF_OPTION_TYPE_STR,
.value = { "none", "server"},
- .description = "If set to server, enables the specified "
- "volume to participate in quorum."
+ .description = "This feature is on the server-side i.e. in glusterd."
+ " Whenever the glusterd on a machine observes that "
+ "the quorum is not met, it brings down the bricks to "
+ "prevent data split-brains. When the network "
+ "connections are brought back up and the quorum is "
+ "restored the bricks in the volume are brought back "
+ "up."
},
{ .key = {"server-quorum-ratio"},
.type = GF_OPTION_TYPE_PERCENT,
.description = "Sets the quorum percentage for the trusted "
"storage pool."
},
+ { .key = {"glusterd-sockfile"},
+ .type = GF_OPTION_TYPE_PATH,
+ .description = "The socket file on which glusterd should listen for "
+ "cli requests. Default is "DEFAULT_GLUSTERD_SOCKFILE "."
+ },
+ { .key = {"base-port"},
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Sets the base port for portmap query"
+ },
+ { .key = {"snap-brick-path"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "directory where the bricks for the snapshots will be created"
+ },
{ .key = {NULL} },
};
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index b466ac20c..7157bee64 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -37,16 +37,25 @@
#include "glusterd-pmap.h"
#include "cli1-xdr.h"
#include "syncop.h"
+#include "store.h"
+#include "glusterd-etcd.h"
#define GLUSTERD_MAX_VOLUME_NAME 1000
-#define DEFAULT_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
#define GLUSTERD_TR_LOG_SIZE 50
#define GLUSTERD_NAME "glusterd"
#define GLUSTERD_SOCKET_LISTEN_BACKLOG 128
#define GLUSTERD_QUORUM_TYPE_KEY "cluster.server-quorum-type"
#define GLUSTERD_QUORUM_RATIO_KEY "cluster.server-quorum-ratio"
#define GLUSTERD_GLOBAL_OPT_VERSION "global-option-version"
+#define GLUSTERD_COMMON_PEM_PUB_FILE "/geo-replication/common_secret.pem.pub"
+#define GEO_CONF_MAX_OPT_VALS 6
+#define GLUSTERD_CREATE_HOOK_SCRIPT "/hooks/1/gsync-create/post/" \
+ "S56glusterd-geo-rep-create-post.sh"
+
+#define GLUSTERD_SNAPS_MAX_HARD_LIMIT 256
+#define GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT 90
+#define GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT 100
#define GLUSTERD_SERVER_QUORUM "server"
#define FMTSTR_CHECK_VOL_EXISTS "Volume %s does not exist"
@@ -67,6 +76,9 @@
struct glusterd_volinfo_;
typedef struct glusterd_volinfo_ glusterd_volinfo_t;
+struct glusterd_snap_;
+typedef struct glusterd_snap_ glusterd_snap_t;
+
typedef enum glusterd_op_ {
GD_OP_NONE = 0,
GD_OP_CREATE_VOLUME,
@@ -93,26 +105,22 @@ typedef enum glusterd_op_ {
GD_OP_LIST_VOLUME,
GD_OP_CLEARLOCKS_VOLUME,
GD_OP_DEFRAG_BRICK_VOLUME,
- GD_OP_BD_OP,
+ GD_OP_COPY_FILE,
+ GD_OP_SYS_EXEC,
+ GD_OP_GSYNC_CREATE,
+ GD_OP_SNAP,
GD_OP_MAX,
} glusterd_op_t;
extern const char * gd_op_list[];
-struct glusterd_store_iter_ {
- int fd;
- FILE *file;
- char filepath[PATH_MAX];
-};
-
-typedef struct glusterd_store_iter_ glusterd_store_iter_t;
struct glusterd_volgen {
dict_t *dict;
};
typedef struct {
- struct rpc_clnt *rpc;
- gf_boolean_t online;
+ struct rpc_clnt *rpc;
+ gf_boolean_t online;
} nodesrv_t;
typedef struct {
@@ -122,37 +130,56 @@ typedef struct {
} gd_global_opts_t;
typedef struct {
- struct _volfile_ctx *volfile;
- pthread_mutex_t mutex;
- struct list_head peers;
- struct list_head xaction_peers;
- gf_boolean_t verify_volfile_checksum;
- gf_boolean_t trace;
- uuid_t uuid;
- char workdir[PATH_MAX];
- rpcsvc_t *rpc;
- nodesrv_t *shd;
- nodesrv_t *nfs;
- struct pmap_registry *pmap;
- struct list_head volumes;
- pthread_mutex_t xprt_lock;
- struct list_head xprt_list;
- glusterd_store_handle_t *handle;
- gf_timer_t *timer;
- glusterd_sm_tr_log_t op_sm_log;
+ struct _volfile_ctx *volfile;
+ pthread_mutex_t mutex;
+ struct list_head peers;
+ struct list_head xaction_peers;
+ gf_boolean_t verify_volfile_checksum;
+ gf_boolean_t trace;
+ uuid_t uuid;
+ char workdir[PATH_MAX];
+ rpcsvc_t *rpc;
+ nodesrv_t *shd;
+ nodesrv_t *nfs;
+ nodesrv_t *quotad;
+ struct pmap_registry *pmap;
+ struct list_head volumes;
+ struct list_head snapshots; /*List of snap volumes */
+ pthread_mutex_t xprt_lock;
+ struct list_head xprt_list;
+ gf_store_handle_t *handle;
+ gf_timer_t *timer;
+ glusterd_sm_tr_log_t op_sm_log;
struct rpc_clnt_program *gfs_mgmt;
+ dict_t *mgmt_v3_lock; /* Dict for saving
+ * mgmt_v3 locks */
+ dict_t *glusterd_txn_opinfo; /* Dict for saving
+ * transaction opinfos */
+ uuid_t global_txn_id; /* To be used in
+ * heterogeneous
+ * cluster with no
+ * transaction ids */
+
+ struct list_head mount_specs;
+ gf_boolean_t valgrind;
+ pthread_t brick_thread;
+ void *hooks_priv;
- struct list_head mount_specs;
- gf_boolean_t valgrind;
- pthread_t brick_thread;
- void *hooks_priv;
/* need for proper handshake_t */
- int op_version; /* Starts with 1 for 3.3.0 */
- xlator_t *xl; /* Should be set to 'THIS' before creating thread */
- gf_boolean_t pending_quorum_action;
- dict_t *opts;
- synclock_t big_lock;
- gf_boolean_t restart_done;
+ int op_version; /* Starts with 1 for 3.3.0 */
+ xlator_t *xl; /* Should be set to 'THIS' before creating thread */
+ gf_boolean_t pending_quorum_action;
+ dict_t *opts;
+ synclock_t big_lock;
+ gf_boolean_t restart_done;
+ rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
+ uint32_t base_port;
+ uint64_t snap_max_hard_limit;
+ uint64_t snap_max_soft_limit;
+ char *snap_bricks_directory;
+ gf_store_handle_t *missed_snaps_list_shandle;
+ struct list_head missed_snaps_list;
+ pid_t etcd_pid;
} glusterd_conf_t;
@@ -162,18 +189,33 @@ typedef enum gf_brick_status {
} gf_brick_status_t;
struct glusterd_brickinfo {
- char hostname[1024];
- char path[PATH_MAX];
- struct list_head brick_list;
- uuid_t uuid;
- int port;
- int rdma_port;
- char *logfile;
- gf_boolean_t signed_in;
- glusterd_store_handle_t *shandle;
- gf_brick_status_t status;
- struct rpc_clnt *rpc;
- int decommissioned;
+ char hostname[1024];
+ char path[PATH_MAX];
+ char device_path[PATH_MAX];
+ char brick_id[1024];/*Client xlator name, AFR changelog name*/
+ struct list_head brick_list;
+ uuid_t uuid;
+ int port;
+ int rdma_port;
+ char *logfile;
+ gf_boolean_t signed_in;
+ gf_store_handle_t *shandle;
+ gf_brick_status_t status;
+ struct rpc_clnt *rpc;
+ int decommissioned;
+ char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */
+ int caps; /* Capability */
+ int32_t snap_status;
+ /*
+ * The group is used to identify which bricks are part of the same
+ * replica set during brick-volfile generation, so that NSR volfiles
+ * can "cross-connect" the bricks to one another. This same approach
+ * could be used to make client-volfile generation much simpler and
+ * more efficient too, though it would require some further adaptation
+ * to support more than one layer of hierarchy.
+ */
+ uint16_t group;
+ uuid_t nsr_uuid;
};
typedef struct glusterd_brickinfo glusterd_brickinfo_t;
@@ -196,7 +238,7 @@ struct glusterd_defrag_info_ {
int cmd;
pthread_t th;
gf_defrag_status_t defrag_status;
- struct rpc_clnt * rpc;
+ struct rpc_clnt *rpc;
uint32_t connected;
char mount[1024];
char databuf[131072];
@@ -229,91 +271,170 @@ struct _auth {
typedef struct _auth auth_t;
-typedef enum glusterd_vol_backend_ {
- GD_VOL_BK_DEFAULT = 0, /* POSIX */
- GD_VOL_BK_BD = 1,
-} glusterd_vol_backend_t;
+/* Capabilities of xlator */
+#define CAPS_BD 0x00000001
+#define CAPS_THIN 0x00000002
+#define CAPS_OFFLOAD_COPY 0x00000004
+#define CAPS_OFFLOAD_SNAPSHOT 0x00000008
+#define CAPS_OFFLOAD_ZERO 0x00000020
struct glusterd_rebalance_ {
- gf_defrag_status_t defrag_status;
- uint64_t rebalance_files;
- uint64_t rebalance_data;
- uint64_t lookedup_files;
+ gf_defrag_status_t defrag_status;
+ uint64_t rebalance_files;
+ uint64_t rebalance_data;
+ uint64_t lookedup_files;
+ uint64_t skipped_files;
glusterd_defrag_info_t *defrag;
- gf_cli_defrag_type defrag_cmd;
- uint64_t rebalance_failures;
- uuid_t rebalance_id;
- double rebalance_time;
- glusterd_op_t op;
+ gf_cli_defrag_type defrag_cmd;
+ uint64_t rebalance_failures;
+ uuid_t rebalance_id;
+ double rebalance_time;
+ glusterd_op_t op;
+ dict_t *dict; /* Dict to store misc information
+ * like list of bricks being removed */
};
typedef struct glusterd_rebalance_ glusterd_rebalance_t;
struct glusterd_replace_brick_ {
gf_rb_status_t rb_status;
- glusterd_brickinfo_t *src_brick;
- glusterd_brickinfo_t *dst_brick;
+ glusterd_brickinfo_t *src_brick;
+ glusterd_brickinfo_t *dst_brick;
uuid_t rb_id;
};
typedef struct glusterd_replace_brick_ glusterd_replace_brick_t;
struct glusterd_volinfo_ {
- char volname[GLUSTERD_MAX_VOLUME_NAME];
- int type;
- int brick_count;
- struct list_head vol_list;
- struct list_head bricks;
- glusterd_volume_status status;
- int sub_count; /* backward compatibility */
- int stripe_count;
- int replica_count;
- int subvol_count; /* Number of subvolumes in a
+ gf_lock_t lock;
+ char volname[GLUSTERD_MAX_VOLUME_NAME];
+ gf_boolean_t is_snap_volume;
+ glusterd_snap_t *snapshot;
+ uuid_t restored_from_snap;
+ char parent_volname[GLUSTERD_MAX_VOLUME_NAME];
+ /* In case of a snap volume
+ i.e (is_snap_volume == TRUE) this
+ field will contain the name of
+ the volume which is snapped. In
+ case of a non-snap volume, this
+ field will be initialized as N/A */
+ int type;
+ int brick_count;
+ uint64_t snap_count;
+ uint64_t snap_max_hard_limit;
+ struct list_head vol_list;
+ /* In case of a snap volume
+ i.e (is_snap_volume == TRUE) this
+ is linked to glusterd_snap_t->volumes.
+ In case of a non-snap volume, this is
+ linked to glusterd_conf_t->volumes */
+ struct list_head snapvol_list;
+ /* This is a current pointer for
+ glusterd_volinfo_t->snap_volumes */
+ struct list_head bricks;
+ struct list_head snap_volumes;
+ /* TODO : Need to remove this, as this
+ * is already part of snapshot object.
+ */
+ glusterd_volume_status status;
+ int sub_count; /* backward compatibility */
+ int stripe_count;
+ int replica_count;
+ int subvol_count; /* Number of subvolumes in a
distribute volume */
- int dist_leaf_count; /* Number of bricks in one
+ int dist_leaf_count; /* Number of bricks in one
distribute subvolume */
- int port;
- glusterd_store_handle_t *shandle;
- glusterd_store_handle_t *rb_shandle;
- glusterd_store_handle_t *node_state_shandle;
+ int port;
+ gf_store_handle_t *shandle;
+ gf_store_handle_t *rb_shandle;
+ gf_store_handle_t *node_state_shandle;
+ gf_store_handle_t *quota_conf_shandle;
/* Defrag/rebalance related */
- glusterd_rebalance_t rebal;
+ glusterd_rebalance_t rebal;
/* Replace brick status */
- glusterd_replace_brick_t rep_brick;
+ glusterd_replace_brick_t rep_brick;
- int version;
- uint32_t cksum;
- gf_transport_type transport_type;
- gf_transport_type nfs_transport_type;
+ int version;
+ uint32_t quota_conf_version;
+ uint32_t cksum;
+ uint32_t quota_conf_cksum;
+ gf_transport_type transport_type;
+ gf_transport_type nfs_transport_type;
- dict_t *dict;
+ dict_t *dict;
- uuid_t volume_id;
- auth_t auth;
- char *logdir;
+ uuid_t volume_id;
+ auth_t auth;
+ char *logdir;
- dict_t *gsync_slaves;
+ dict_t *gsync_slaves;
- int decommission_in_progress;
- xlator_t *xl;
+ int decommission_in_progress;
+ xlator_t *xl;
- gf_boolean_t memory_accounting;
- glusterd_vol_backend_t backend;
+ gf_boolean_t memory_accounting;
+ int caps; /* Capability */
- int op_version;
- int client_op_version;
+ int op_version;
+ int client_op_version;
+ pthread_mutex_t reflock;
+ int refcnt;
};
+typedef enum gd_snap_status_ {
+ GD_SNAP_STATUS_NONE,
+ GD_SNAP_STATUS_INIT,
+ GD_SNAP_STATUS_IN_USE,
+ GD_SNAP_STATUS_DECOMMISSION,
+ GD_SNAP_STATUS_RESTORED,
+} gd_snap_status_t;
+
+/*
+ * In-memory description of one snapshot (typedef'd as glusterd_snap_t).
+ */
+struct glusterd_snap_ {
+ gf_lock_t lock;
+ /* Head of the list that snap volumes join through their
+  * glusterd_volinfo_t vol_list member. */
+ struct list_head volumes;
+ /* NOTE(review): presumably the linkage into glusterd_conf_t->snapshots
+  * -- confirm against the code that inserts snaps. */
+ struct list_head snap_list;
+ /* Also used as a path component by GLUSTERD_GET_SNAP_DIR. */
+ char snapname[GLUSTERD_MAX_SNAP_NAME];
+ uuid_t snap_id;
+ char *description;
+ time_t time_stamp;
+ gf_boolean_t snap_restored;
+ gd_snap_status_t snap_status;
+ gf_store_handle_t *shandle;
+};
+
+typedef struct glusterd_snap_op_ {
+ char *snap_vol_id;
+ int32_t brick_num;
+ char *brick_path;
+ int32_t op;
+ int32_t status;
+ struct list_head snap_ops_list;
+} glusterd_snap_op_t;
+
+typedef struct glusterd_missed_snap_ {
+ char *node_uuid;
+ char *snap_uuid;
+ struct list_head missed_snaps;
+ struct list_head snap_ops;
+} glusterd_missed_snap_info;
+
typedef enum gd_node_type_ {
GD_NODE_NONE,
GD_NODE_BRICK,
GD_NODE_SHD,
GD_NODE_REBALANCE,
GD_NODE_NFS,
+ GD_NODE_QUOTAD,
} gd_node_type;
+typedef enum missed_snap_stat {
+ GD_MISSED_SNAP_NONE,
+ GD_MISSED_SNAP_PENDING,
+ GD_MISSED_SNAP_DONE,
+} missed_snap_stat;
+
typedef struct glusterd_pending_node_ {
struct list_head list;
void *node;
@@ -321,6 +442,13 @@ typedef struct glusterd_pending_node_ {
int32_t index;
} glusterd_pending_node_t;
+/*
+ * One geo-replication config option together with the values it may take.
+ */
+struct gsync_config_opt_vals_ {
+ char *op_name;
+ /* NOTE(review): presumably the number of populated entries in
+  * values[] -- confirm against the table that initializes this. */
+ int no_of_pos_vals;
+ gf_boolean_t case_sensitive;
+ /* Holds at most GEO_CONF_MAX_OPT_VALS entries. */
+ char *values[GEO_CONF_MAX_OPT_VALS];
+};
+
enum glusterd_op_ret {
GLUSTERD_CONNECTION_AWAITED = 100,
};
@@ -335,14 +463,23 @@ enum glusterd_vol_comp_status_ {
#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
#define GLUSTERD_DEFAULT_PORT GF_DEFAULT_BASE_PORT
#define GLUSTERD_INFO_FILE "glusterd.info"
+#define GLUSTERD_VOLUME_QUOTA_CONFIG "quota.conf"
#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
#define GLUSTERD_PEER_DIR_PREFIX "peers"
#define GLUSTERD_VOLUME_INFO_FILE "info"
+#define GLUSTERD_SNAP_INFO_FILE "info"
#define GLUSTERD_VOLUME_RBSTATE_FILE "rbstate"
#define GLUSTERD_BRICK_INFO_DIR "bricks"
#define GLUSTERD_CKSUM_FILE "cksum"
+#define GLUSTERD_VOL_QUOTA_CKSUM_FILE "quota.cksum"
#define GLUSTERD_TRASH "trash"
#define GLUSTERD_NODE_STATE_FILE "node_state.info"
+#define GLUSTERD_MISSED_SNAPS_LIST_FILE "missed_snaps_list"
+#define GLUSTERD_VOL_SNAP_DIR_PREFIX "snaps"
+
+#define GLUSTERD_DEFAULT_SNAPS_BRICK_DIR "/gluster/snaps"
+#define GLUSTERD_VAR_RUN_DIR "/var/run"
+#define GLUSTERD_RUN_DIR "/run"
/* definitions related to replace brick */
#define RB_CLIENT_MOUNTPOINT "rb_mount"
@@ -355,18 +492,40 @@ enum glusterd_vol_comp_status_ {
typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
-#define GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv) \
- snprintf (path, PATH_MAX, "%s/vols/%s", priv->workdir,\
- volinfo->volname);
+/*
+ * Format the working directory of a volume into `path` (at most PATH_MAX
+ * bytes): "<workdir>/snaps/<snapname>/<volname>" when
+ * volinfo->is_snap_volume is set, "<workdir>/vols/<volname>" otherwise.
+ * The snap branch dereferences volinfo->snapshot.
+ *
+ * NOTE(review): this expands to a bare if/else block, not a
+ * do { } while (0) wrapper, and the arguments are not parenthesized.  Use
+ * it only as a standalone statement with plain identifier arguments; as
+ * the unbraced body of an outer `if` that has its own `else` it would not
+ * parse as intended.
+ */
+#define GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv) \
+ if (volinfo->is_snap_volume) { \
+ snprintf (path, PATH_MAX, "%s/snaps/%s/%s", priv->workdir, \
+ volinfo->snapshot->snapname, volinfo->volname); \
+ } else { \
+ snprintf (path, PATH_MAX, "%s/vols/%s", priv->workdir, \
+ volinfo->volname); \
+ }
-#define GLUSTERD_GET_BRICK_DIR(path, volinfo, priv) \
- snprintf (path, PATH_MAX, "%s/%s/%s/%s", priv->workdir,\
- GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname, \
- GLUSTERD_BRICK_INFO_DIR);
+/*
+ * Format "<workdir>/snaps/<snapname>" for `snap` into `path` (at most
+ * PATH_MAX bytes).  The expansion already ends with a semicolon.
+ */
+#define GLUSTERD_GET_SNAP_DIR(path, snap, priv) \
+ snprintf (path, PATH_MAX, "%s/snaps/%s", priv->workdir, \
+ snap->snapname);
+
+/*
+ * Format the brick-info directory of a volume into `path` (at most
+ * PATH_MAX bytes).  For a snap volume (volinfo->is_snap_volume set) the
+ * result is "<workdir>/snaps/<snapname>/<volname>/" followed by
+ * GLUSTERD_BRICK_INFO_DIR; otherwise it is
+ * "<workdir>/" GLUSTERD_VOLUME_DIR_PREFIX "/<volname>/" followed by
+ * GLUSTERD_BRICK_INFO_DIR.
+ *
+ * NOTE(review): expands to a bare if/else block with unparenthesized
+ * arguments; use only as a standalone statement.
+ */
+#define GLUSTERD_GET_BRICK_DIR(path, volinfo, priv) \
+ if (volinfo->is_snap_volume) { \
+ snprintf (path, PATH_MAX, "%s/snaps/%s/%s/%s", priv->workdir, \
+ volinfo->snapshot->snapname, volinfo->volname, \
+ GLUSTERD_BRICK_INFO_DIR); \
+ } else { \
+ snprintf (path, PATH_MAX, "%s/%s/%s/%s", priv->workdir, \
+ GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname, \
+ GLUSTERD_BRICK_INFO_DIR); \
+ }
#define GLUSTERD_GET_NFS_DIR(path, priv) \
snprintf (path, PATH_MAX, "%s/nfs", priv->workdir);
+#define GLUSTERD_GET_QUOTAD_DIR(path, priv) \
+ snprintf (path, PATH_MAX, "%s/quotad", priv->workdir);
+
+/*
+ * Format DEFAULT_VAR_RUN_DIRECTORY "/<volname><path>" into `abspath`.
+ * The size bound is sizeof (abspath) - 1, so `abspath` must be a real
+ * char array at the expansion site; with a pointer argument sizeof would
+ * give the pointer size instead of the buffer size.
+ */
+#define GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH(abspath, volname, path) \
+ snprintf (abspath, sizeof (abspath)-1, \
+ DEFAULT_VAR_RUN_DIRECTORY"/%s%s", volname, path);
+
#define GLUSTERD_REMOVE_SLASH_FROM_PATH(path,string) do { \
int i = 0; \
for (i = 1; i < strlen (path); i++) { \
@@ -382,6 +541,15 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); \
GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); \
snprintf (pidfile, PATH_MAX, "%s/run/%s-%s.pid", \
+ volpath, brickinfo->hostname, exp_path); \
+ } while (0)
+
+#define GLUSTERD_GET_BRICK_RECON_PIDFILE(pidfile,volinfo,brickinfo, priv) do { \
+ char exp_path[PATH_MAX] = {0,}; \
+ char volpath[PATH_MAX] = {0,}; \
+ GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); \
+ GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); \
+ snprintf (pidfile, PATH_MAX, "%s/run/%s:-%s-recon.pid", \
volpath, brickinfo->hostname, exp_path); \
} while (0)
@@ -390,10 +558,15 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
nfspath); \
}
+#define GLUSTERD_GET_QUOTAD_PIDFILE(pidfile,quotadpath) { \
+ snprintf (pidfile, PATH_MAX, "%s/run/quotad.pid", \
+ quotadpath); \
+ }
+
#define GLUSTERD_STACK_DESTROY(frame) do {\
- frame->local = NULL; \
- STACK_DESTROY (frame->root);\
- } while (0)
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ } while (0)
#define GLUSTERD_GET_DEFRAG_DIR(path, volinfo, priv) do { \
char vol_path[PATH_MAX]; \
@@ -401,13 +574,19 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
snprintf (path, PATH_MAX, "%s/rebalance",vol_path); \
} while (0)
-#define GLUSTERD_GET_DEFRAG_SOCK_FILE(path, volinfo, priv) do { \
+#define GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD(path, volinfo, priv) do { \
char defrag_path[PATH_MAX]; \
GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv); \
snprintf (path, PATH_MAX, "%s/%s.sock", defrag_path, \
uuid_utoa(MY_UUID)); \
} while (0)
+/*
+ * Format the rebalance socket path of a volume into `path`:
+ * DEFAULT_VAR_RUN_DIRECTORY "/gluster-rebalance-<volume id>.sock", where
+ * the id is uuid_utoa (volinfo->volume_id).  Output is bounded by
+ * UNIX_PATH_MAX rather than PATH_MAX.
+ */
+#define GLUSTERD_GET_DEFRAG_SOCK_FILE(path, volinfo) do { \
+ snprintf (path, UNIX_PATH_MAX, DEFAULT_VAR_RUN_DIRECTORY \
+ "/gluster-rebalance-%s.sock", \
+ uuid_utoa(volinfo->volume_id)); \
+ } while (0)
+
#define GLUSTERD_GET_DEFRAG_PID_FILE(path, volinfo, priv) do { \
char defrag_path[PATH_MAX]; \
GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv); \
@@ -415,6 +594,24 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
uuid_utoa(MY_UUID)); \
} while (0)
+#define GLUSTERFS_GET_AUX_MOUNT_PIDFILE(pidfile, volname) { \
+ snprintf (pidfile, PATH_MAX-1, \
+ DEFAULT_VAR_RUN_DIRECTORY"/%s.pid", volname); \
+ }
+
+/*
+ * Write the textual form of `uuid` (as produced by uuid_utoa) into
+ * `ret_string` with every '-' character dropped, then NUL-terminate it.
+ * `ret_string` must be large enough for the result; no bound is checked.
+ */
+#define GLUSTERD_GET_UUID_NOHYPHEN(ret_string, uuid) do {               \
+                char *snap_volname_ptr = ret_string;                    \
+                char *snap_volid_ptr = uuid_utoa(uuid);                 \
+                for (; *snap_volid_ptr; snap_volid_ptr++) {             \
+                        if (*snap_volid_ptr != '-')                     \
+                                *snap_volname_ptr++ = *snap_volid_ptr;  \
+                }                                                       \
+                *snap_volname_ptr = '\0';                               \
+        } while (0)
int glusterd_uuid_init();
@@ -425,11 +622,11 @@ int glusterd_uuid_generate_save ();
static inline unsigned char *
__glusterd_uuid()
{
- glusterd_conf_t *priv = THIS->private;
+ glusterd_conf_t *priv = THIS->private;
- if (uuid_is_null (priv->uuid))
- glusterd_uuid_init();
- return &priv->uuid[0];
+ if (uuid_is_null (priv->uuid))
+ glusterd_uuid_init();
+ return &priv->uuid[0];
}
int glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata,
@@ -446,7 +643,8 @@ int32_t
glusterd_brick_from_brickinfo (glusterd_brickinfo_t *brickinfo,
char **new_brick);
int
-glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port);
+glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
+ dict_t *dict);
int
glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *myhostname,
@@ -476,6 +674,14 @@ int
glusterd_op_unlock_send_resp (rpcsvc_request_t *req, int32_t status);
int
+glusterd_op_mgmt_v3_lock_send_resp (rpcsvc_request_t *req,
+ uuid_t *txn_id, int32_t status);
+
+int
+glusterd_op_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req,
+ uuid_t *txn_id, int32_t status);
+
+int
glusterd_op_stage_send_resp (rpcsvc_request_t *req,
int32_t op, int32_t status,
char *op_errstr, dict_t *rsp_dict);
@@ -520,7 +726,7 @@ glusterd_handle_defrag_volume_v2 (rpcsvc_request_t *req);
int
glusterd_xfer_cli_probe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr, char *hostname,
- int port);
+ int port, dict_t *dict);
int
glusterd_op_commit_send_resp (rpcsvc_request_t *req,
@@ -532,7 +738,7 @@ glusterd_xfer_friend_remove_resp (rpcsvc_request_t *req, char *hostname, int por
int
glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
- uuid_t uuid);
+ uuid_t uuid, dict_t *dict);
int
glusterd_handle_cli_deprobe (rpcsvc_request_t *req);
@@ -607,6 +813,12 @@ int
glusterd_handle_reset_volume (rpcsvc_request_t *req);
int
+glusterd_handle_copy_file (rpcsvc_request_t *req);
+
+int
+glusterd_handle_sys_exec (rpcsvc_request_t *req);
+
+int
glusterd_handle_gsync_set (rpcsvc_request_t *req);
int
@@ -618,7 +830,7 @@ glusterd_handle_fsm_log (rpcsvc_request_t *req);
int
glusterd_xfer_cli_deprobe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr,
- char *hostname);
+ char *hostname, dict_t *dict);
int
glusterd_fetchspec_notify (xlator_t *this);
@@ -679,19 +891,28 @@ int glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
glusterd_op_t op);
int
glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
- glusterd_conf_t *priv, int cmd);
+ gf_boolean_t reconnect);
int glusterd_handle_cli_heal_volume (rpcsvc_request_t *req);
int glusterd_handle_cli_list_volume (rpcsvc_request_t *req);
+int
+glusterd_handle_snapshot (rpcsvc_request_t *req);
+
/* op-sm functions */
int glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr);
int glusterd_op_heal_volume (dict_t *dict, char **op_errstr);
int glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr);
int glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int glusterd_op_stage_copy_file (dict_t *dict, char **op_errstr);
+int glusterd_op_copy_file (dict_t *dict, char **op_errstr);
+int glusterd_op_stage_sys_exec (dict_t *dict, char **op_errstr);
+int glusterd_op_sys_exec (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int glusterd_op_stage_gsync_create (dict_t *dict, char **op_errstr);
+int glusterd_op_gsync_create (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
int glusterd_op_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
-int glusterd_op_stage_quota (dict_t *dict, char **op_errstr);
+int glusterd_op_stage_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
int glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
dict_t *rsp_dict);
int glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict);
@@ -720,7 +941,6 @@ int glusterd_op_statedump_volume (dict_t *dict, char **op_errstr);
int glusterd_op_stage_clearlocks_volume (dict_t *dict, char **op_errstr);
int glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr,
dict_t *rsp_dict);
-int glusterd_op_stage_bd (dict_t *dict, char **op_errstr);
/* misc */
void glusterd_do_replace_brick (void *data);
@@ -731,10 +951,69 @@ int glusterd_op_statedump_volume_args_get (dict_t *dict, char **volname,
char **options, int *option_cnt);
int glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr,
- char **master, char **slave);
+ char **master, char **slave, char **host_uuid);
+
+int glusterd_stop_volume (glusterd_volinfo_t *volinfo);
+
/* Synctask part */
int32_t glusterd_op_begin_synctask (rpcsvc_request_t *req, glusterd_op_t op,
void *dict);
int32_t
glusterd_defrag_event_notify_handle (dict_t *dict);
+
+int32_t
+glusterd_txn_opinfo_dict_init ();
+
+void
+glusterd_txn_opinfo_dict_fini ();
+
+void
+glusterd_txn_opinfo_init ();
+
+/* snapshot */
+glusterd_snap_t*
+glusterd_new_snap_object();
+
+int32_t
+glusterd_list_add_snapvol (glusterd_volinfo_t *origin_vol,
+ glusterd_volinfo_t *snap_vol);
+
+glusterd_snap_t*
+glusterd_remove_snap_by_id (uuid_t snap_id);
+
+glusterd_snap_t*
+glusterd_remove_snap_by_name (char *snap_name);
+
+glusterd_snap_t*
+glusterd_find_snap_by_name (char *snap_name);
+
+glusterd_snap_t*
+glusterd_find_snap_by_id (uuid_t snap_id);
+
+int
+glusterd_snapshot_prevalidate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict);
+int
+glusterd_snapshot_brickop (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr,
+ dict_t *rsp_dict);
+char *
+glusterd_build_snap_device_path (char *device, char *snapname);
+int32_t
+glusterd_snap_remove (dict_t *rsp_dict, glusterd_snap_t *snap,
+ gf_boolean_t remove_lvm, gf_boolean_t force);
+int32_t
+glusterd_snapshot_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+glusterd_add_missed_snaps_to_list (dict_t *dict, int32_t missed_snap_count);
+
+int32_t
+glusterd_add_new_entry_to_list (char *missed_info, char *snap_vol_id,
+ int32_t brick_num, char *brick_path,
+ int32_t snap_op, int32_t snap_status);
+
#endif
diff --git a/xlators/mount/fuse/src/Makefile.am b/xlators/mount/fuse/src/Makefile.am
index 653121d18..7d1f93447 100644
--- a/xlators/mount/fuse/src/Makefile.am
+++ b/xlators/mount/fuse/src/Makefile.am
@@ -1,7 +1,9 @@
noinst_HEADERS_linux = $(CONTRIBDIR)/fuse-include/fuse_kernel.h\
$(CONTRIBDIR)/fuse-include/mount_util.h\
$(CONTRIBDIR)/fuse-lib/mount-gluster-compat.h
-noinst_HEADERS_darwin = $(CONTRIBDIR)/fuse-include/fuse_kernel_macfuse.h
+noinst_HEADERS_darwin = $(CONTRIBDIR)/fuse-include/fuse_kernel_macfuse.h\
+ $(CONTRIBDIR)/macfuse/fuse_param.h\
+ $(CONTRIBDIR)/macfuse/fuse_ioctl.h
noinst_HEADERS_common = $(CONTRIBDIR)/fuse-include/fuse-mount.h\
$(CONTRIBDIR)/fuse-include/fuse-misc.h fuse-mem-types.h \
fuse-bridge.h
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
index 43c98a23f..d5ca4d146 100644
--- a/xlators/mount/fuse/src/fuse-bridge.c
+++ b/xlators/mount/fuse/src/fuse-bridge.c
@@ -11,6 +11,8 @@
#include <sys/wait.h>
#include "fuse-bridge.h"
#include "mount-gluster-compat.h"
+#include "glusterfs.h"
+#include "glusterfs-acl.h"
#ifdef __NetBSD__
#undef open /* in perfuse.h, pulled from mount-gluster-compat.h */
@@ -27,6 +29,7 @@ static void fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino);
* Send an invalidate notification up to fuse to purge the file from local
* page cache.
*/
+
static int32_t
fuse_invalidate(xlator_t *this, inode_t *inode)
{
@@ -43,14 +46,54 @@ fuse_invalidate(xlator_t *this, inode_t *inode)
return 0;
nodeid = inode_to_fuse_nodeid(inode);
- gf_log(this->name, GF_LOG_DEBUG, "Invalidate inode id %lu.", nodeid);
- fuse_log_eh (this, "Sending invalidate inode id: %lu gfid: %s", nodeid,
+ gf_log(this->name, GF_LOG_DEBUG, "Invalidate inode id %"GF_PRI_INODE"." , nodeid);
+ fuse_log_eh (this, "Sending invalidate inode id: %"GF_PRI_INODE" gfid: %s", nodeid,
uuid_utoa (inode->gfid));
fuse_invalidate_inode(this, nodeid);
return 0;
}
+/*
+ * Placeholder forget hook for the fuse xlator: nothing kept in the inode ctx
+ * needs to be freed, so this does no work.
+ *
+ * Neither argument is used.  Always returns 0.
+ */
+static int32_t
+fuse_forget_cbk (xlator_t *this, inode_t *inode)
+{
+ //Nothing to free in inode ctx, hence return.
+ return 0;
+}
+
+/*
+ * Mark 'inode' as needing a lookup by storing the value 1 in the inode ctx
+ * slot belonging to 'this'.
+ *
+ * Does nothing when either argument is NULL.  The result of inode_ctx_set()
+ * is not checked.
+ */
+void
+fuse_inode_set_need_lookup (inode_t *inode, xlator_t *this)
+{
+        uint64_t flag = 1;
+
+        if (inode && this)
+                inode_ctx_set (inode, this, &flag);
+}
+
+
+/*
+ * Test-and-clear the "needs lookup" flag kept in the inode ctx slot of
+ * 'this' for 'inode'.
+ *
+ * Returns _gf_true when the stored value was non-zero, _gf_false otherwise
+ * (including when either argument is NULL).  Whenever both arguments are
+ * non-NULL the slot is written back as 0, whatever value was read.
+ *
+ * NOTE(review): the read and the reset are two separate inode_ctx_*() calls;
+ * nothing visible here makes the pair atomic -- confirm whether callers
+ * need that.
+ */
+gf_boolean_t
+fuse_inode_needs_lookup (inode_t *inode, xlator_t *this)
+{
+        uint64_t     flag    = 0;
+        gf_boolean_t pending = _gf_false;
+
+        if (!inode || !this)
+                return _gf_false;
+
+        inode_ctx_get (inode, this, &flag);
+        pending = flag ? _gf_true : _gf_false;
+
+        /* Always reset the stored flag, even if it was already clear. */
+        flag = 0;
+        inode_ctx_set (inode, this, &flag);
+
+        return pending;
+}
+
+
fuse_fd_ctx_t *
__fuse_fd_ctx_check_n_create (xlator_t *this, fd_t *fd)
{
@@ -144,6 +187,8 @@ send_fuse_iov (xlator_t *this, fuse_in_header_t *finh, struct iovec *iov_out,
fouh->unique = finh->unique;
res = writev (priv->fd, iov_out, count);
+ gf_log ("glusterfs-fuse", GF_LOG_TRACE, "writev() result %d/%d %s",
+ res, fouh->len, res == -1 ? strerror (errno) : "");
if (res == -1)
return errno;
@@ -173,19 +218,26 @@ send_fuse_data (xlator_t *this, fuse_in_header_t *finh, void *data, size_t size)
{
struct fuse_out_header fouh = {0, };
struct iovec iov_out[2];
+ int ret = 0;
fouh.error = 0;
iov_out[0].iov_base = &fouh;
iov_out[1].iov_base = data;
iov_out[1].iov_len = size;
- return send_fuse_iov (this, finh, iov_out, 2);
+ ret = send_fuse_iov (this, finh, iov_out, 2);
+ if (ret != 0)
+ gf_log ("glusterfs-fuse", GF_LOG_ERROR, "send_fuse_iov() "
+ "failed: %s", strerror (ret));
+
+ return ret;
}
#define send_fuse_obj(this, finh, obj) \
send_fuse_data (this, finh, obj, sizeof (*(obj)))
+#if FUSE_KERNEL_MINOR_VERSION >= 11
static void
fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
{
@@ -243,6 +295,7 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
if (inode)
inode_unref (inode);
}
+#endif
/*
* Send an inval inode notification to fuse. This causes an invalidation of the
@@ -251,6 +304,7 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
static void
fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
{
+#if FUSE_KERNEL_MINOR_VERSION >= 11
struct fuse_out_header *fouh = NULL;
struct fuse_notify_inval_inode_out *fniio = NULL;
fuse_private_t *priv = NULL;
@@ -296,8 +350,13 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
if (inode)
inode_unref (inode);
+#else
+ gf_log ("glusterfs-fuse", GF_LOG_WARNING,
+ "fuse_invalidate_inode not implemented on OS X due to missing FUSE notification");
+#endif
}
+
int
send_fuse_err (xlator_t *this, fuse_in_header_t *finh, int error)
{
@@ -318,7 +377,7 @@ send_fuse_err (xlator_t *this, fuse_in_header_t *finh, int error)
uuid_utoa (inode->gfid));
} else {
fuse_log_eh (this, "Sending %s for operation %d on "
- "inode %ld", strerror (error),
+ "inode %" GF_PRI_INODE, strerror (error),
finh->opcode, finh->nodeid);
}
}
@@ -371,7 +430,7 @@ fuse_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64,
+ "%"PRIu64": %s() %s => %"PRIu64,
frame->root->unique, gf_fop_list[frame->root->op],
state->loc.path, buf->ia_ino);
@@ -463,6 +522,13 @@ fuse_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1 && state->is_revalidate == 1) {
itable = state->itable;
+ /*
+ * A stale mapping might exist for a dentry/inode that has been
+ * removed from another client.
+ */
+ if (op_errno == ENOENT)
+ inode_unlink(state->loc.inode, state->loc.parent,
+ state->loc.name);
inode_unref (state->loc.inode);
state->loc.inode = inode_new (itable);
state->is_revalidate = 2;
@@ -522,8 +588,8 @@ fuse_lookup_resume (fuse_state_t *state)
static void
fuse_lookup (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
- char *name = msg;
- fuse_state_t *state = NULL;
+ char *name = msg;
+ fuse_state_t *state = NULL;
GET_STATE (this, finh, state);
@@ -531,15 +597,27 @@ fuse_lookup (xlator_t *this, fuse_in_header_t *finh, void *msg)
finh->nodeid, name);
fuse_resolve_and_resume (state, fuse_lookup_resume);
+
+ return;
+}
+
+/*
+ * Apply one FORGET to the inode identified by fuse 'nodeid': record the
+ * event in the event history, call inode_forget() with 'nlookup', then
+ * inode_unref() the inode (presumably balancing a reference handed out by
+ * fuse_ino_to_inode() -- confirm against that helper).
+ *
+ * 'unique' is only used for logging.  If fuse_ino_to_inode() yields NULL the
+ * request is logged and ignored instead of dereferencing a NULL inode.
+ */
+static inline void
+do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup)
+{
+        inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this);
+
+        if (!fuse_inode) {
+                gf_log("glusterfs-fuse", GF_LOG_WARNING,
+                       "%"PRIu64": FORGET %"PRIu64"/%"PRIu64
+                       ": no inode for nodeid, ignoring",
+                       unique, nodeid, nlookup);
+                return;
+        }
+
+        fuse_log_eh(this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)",
+                    unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
+
+        inode_forget(fuse_inode, nlookup);
+        inode_unref(fuse_inode);
}
static void
fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
- struct fuse_forget_in *ffi = msg;
-
- inode_t *fuse_inode;
+ struct fuse_forget_in *ffi = msg;
if (finh->nodeid == 1) {
GF_FREE (finh);
@@ -550,17 +628,31 @@ fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg)
"%"PRIu64": FORGET %"PRIu64"/%"PRIu64,
finh->unique, finh->nodeid, ffi->nlookup);
- fuse_inode = fuse_ino_to_inode (finh->nodeid, this);
+ do_forget(this, finh->unique, finh->nodeid, ffi->nlookup);
+
+ GF_FREE (finh);
+}
- fuse_log_eh (this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)",
- finh->unique, finh->nodeid, ffi->nlookup,
- uuid_utoa (fuse_inode->gfid));
+#if FUSE_KERNEL_MINOR_VERSION >= 16
+/*
+ * Handle a batched forget request.  'msg' starts with a
+ * struct fuse_batch_forget_in header, followed directly by fbfi->count
+ * struct fuse_forget_one records.  Every record is handed to do_forget(),
+ * except those with nodeid 1, which are skipped (fuse_forget() likewise
+ * ignores nodeid 1).
+ *
+ * No reply is sent from here; 'finh' is freed once all records are handled.
+ */
+static void
+fuse_batch_forget(xlator_t *this, fuse_in_header_t *finh, void *msg)
+{
+        struct fuse_batch_forget_in *fbfi = msg;
+        struct fuse_forget_one *ffo = (struct fuse_forget_one *) (fbfi + 1);
+        /* same width/signedness as fbfi->count (logged with PRIu32 below) */
+        uint32_t i;
- inode_forget (fuse_inode, ffi->nlookup);
- inode_unref (fuse_inode);
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%"PRIu64": BATCH_FORGET %"PRIu64"/%"PRIu32,
+               finh->unique, finh->nodeid, fbfi->count);
- GF_FREE (finh);
+        for (i = 0; i < fbfi->count; i++) {
+                if (ffo[i].nodeid == 1)
+                        continue;
+                do_forget(this, finh->unique, ffo[i].nodeid, ffo[i].nlookup);
+        }
+        GF_FREE(finh);
}
+#endif
static int
fuse_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -580,7 +672,7 @@ fuse_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64, frame->root->unique,
+ "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique,
gf_fop_list[frame->root->op],
state->loc.path ? state->loc.path : "ERR",
prebuf->ia_ino);
@@ -635,7 +727,7 @@ fuse_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state->loc.inode ? uuid_utoa (state->loc.inode->gfid) : "");
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64, frame->root->unique,
+ "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique,
gf_fop_list[frame->root->op],
state->loc.path ? state->loc.path : "ERR",
buf->ia_ino);
@@ -795,6 +887,16 @@ out:
return ret;
}
+
+/*
+ * Report whether 'xdata' carries a "direct-io-mode" key.
+ *
+ * Only the presence of the key is tested; its value is not examined.
+ * Returns _gf_false when 'xdata' is NULL or the key is absent.
+ */
+gf_boolean_t
+direct_io_mode (dict_t *xdata)
+{
+        if (!xdata || !dict_get (xdata, "direct-io-mode"))
+                return _gf_false;
+
+        return _gf_true;
+}
+
+
static int
fuse_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
@@ -818,7 +920,8 @@ fuse_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!IA_ISDIR (fd->inode->ia_type)) {
if (((priv->direct_io_mode == 2)
&& ((state->flags & O_ACCMODE) != O_RDONLY))
- || (priv->direct_io_mode == 1))
+ || (priv->direct_io_mode == 1)
+ || (direct_io_mode (xdata)))
foo.open_flags |= FOPEN_DIRECT_IO;
#ifdef GF_DARWIN_HOST_OS
/* In Linux: by default, buffer cache
@@ -920,7 +1023,7 @@ fuse_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64, frame->root->unique,
+ "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique,
gf_fop_list[frame->root->op],
state->loc.path ? state->loc.path : "ERR",
statpost->ia_ino);
@@ -1049,7 +1152,9 @@ fuse_setattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
struct fuse_setattr_in *fsi = msg;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
fuse_private_t *priv = NULL;
+#endif
fuse_state_t *state = NULL;
GET_STATE (this, finh, state);
@@ -1077,8 +1182,8 @@ fuse_setattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
* http://git.kernel.org/?p=linux/kernel/git/torvalds/
* linux-2.6.git;a=commit;h=v2.6.23-5896-gf333211
*/
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv = this->private;
if (priv->proto_minor >= 9 && fsi->valid & FATTR_LOCKOWNER)
state->lk_owner = fsi->lock_owner;
#endif
@@ -1318,7 +1423,7 @@ fuse_mknod_resume (fuse_state_t *state)
{
if (!state->loc.parent) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "MKNOD %"PRId64"/%s (%s/%s) resolution failed",
+ "MKNOD %"PRIu64"/%s (%s/%s) resolution failed",
state->finh->nodeid, state->resolve.bname,
uuid_utoa (state->resolve.gfid), state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
@@ -1355,11 +1460,11 @@ fuse_mknod (xlator_t *this, fuse_in_header_t *finh, void *msg)
char *name = (char *)(fmi + 1);
fuse_state_t *state = NULL;
+#if FUSE_KERNEL_MINOR_VERSION >= 12
fuse_private_t *priv = NULL;
- int32_t ret = -1;
+ int32_t ret = -1;
priv = this->private;
-#if FUSE_KERNEL_MINOR_VERSION >= 12
if (priv->proto_minor < 12)
name = (char *)msg + FUSE_COMPAT_MKNOD_IN_SIZE;
#endif
@@ -1373,8 +1478,8 @@ fuse_mknod (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->mode = fmi->mode;
state->rdev = fmi->rdev;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >=12
+ priv = this->private;
FUSE_ENTRY_CREATE(this, priv, finh, state, fmi, "MKNOD");
#endif
@@ -1388,7 +1493,7 @@ fuse_mkdir_resume (fuse_state_t *state)
{
if (!state->loc.parent) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "MKDIR %"PRId64" (%s/%s) resolution failed",
+ "MKDIR %"PRIu64" (%s/%s) resolution failed",
state->finh->nodeid, uuid_utoa (state->resolve.gfid),
state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
@@ -1422,10 +1527,12 @@ fuse_mkdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
struct fuse_mkdir_in *fmi = msg;
char *name = (char *)(fmi + 1);
+#if FUSE_KERNEL_MINOR_VERSION >=12
fuse_private_t *priv = NULL;
+ int32_t ret = -1;
+#endif
fuse_state_t *state;
- int32_t ret = -1;
GET_STATE (this, finh, state);
@@ -1435,8 +1542,8 @@ fuse_mkdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->mode = fmi->mode;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >=12
+ priv = this->private;
FUSE_ENTRY_CREATE(this, priv, finh, state, fmi, "MKDIR");
#endif
@@ -1450,7 +1557,7 @@ fuse_unlink_resume (fuse_state_t *state)
{
if (!state->loc.parent || !state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "UNLINK %"PRId64" (%s/%s) resolution failed",
+ "UNLINK %"PRIu64" (%s/%s) resolution failed",
state->finh->nodeid, uuid_utoa (state->resolve.gfid),
state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
@@ -1486,7 +1593,7 @@ fuse_rmdir_resume (fuse_state_t *state)
{
if (!state->loc.parent || !state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "RMDIR %"PRId64" (%s/%s) resolution failed",
+ "RMDIR %"PRIu64" (%s/%s) resolution failed",
state->finh->nodeid, uuid_utoa (state->resolve.gfid),
state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
@@ -1522,7 +1629,7 @@ fuse_symlink_resume (fuse_state_t *state)
{
if (!state->loc.parent) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "SYMLINK %"PRId64" (%s/%s) -> %s resolution failed",
+ "SYMLINK %"PRIu64" (%s/%s) -> %s resolution failed",
state->finh->nodeid, uuid_utoa (state->resolve.gfid),
state->resolve.bname, state->name);
send_fuse_err (state->this, state->finh, ENOENT);
@@ -1595,7 +1702,7 @@ fuse_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s -> %s => 0 (buf->ia_ino=%"PRId64")",
+ "%"PRIu64": %s -> %s => 0 (buf->ia_ino=%"PRIu64")",
frame->root->unique, state->loc.path, state->loc2.path,
buf->ia_ino);
@@ -1766,11 +1873,12 @@ fuse_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (((priv->direct_io_mode == 2)
&& ((state->flags & O_ACCMODE) != O_RDONLY))
- || (priv->direct_io_mode == 1))
+ || (priv->direct_io_mode == 1)
+ || direct_io_mode (xdata))
foo.open_flags |= FOPEN_DIRECT_IO;
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %p (ino=%"PRId64")",
+ "%"PRIu64": %s() %s => %p (ino=%"PRIu64")",
frame->root->unique, gf_fop_list[frame->root->op],
state->loc.path, fd, buf->ia_ino);
@@ -1910,17 +2018,17 @@ fuse_create (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
#if FUSE_KERNEL_MINOR_VERSION >= 12
struct fuse_create_in *fci = msg;
+ fuse_private_t *priv = NULL;
+ int32_t ret = -1;
#else
struct fuse_open_in *fci = msg;
#endif
char *name = (char *)(fci + 1);
- fuse_private_t *priv = NULL;
fuse_state_t *state = NULL;
- int32_t ret = -1;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 12
+ priv = this->private;
if (priv->proto_minor < 12)
name = (char *)((struct fuse_open_in *)msg + 1);
#endif
@@ -1934,8 +2042,8 @@ fuse_create (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->mode = fci->mode;
state->flags = fci->flags;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >=12
+ priv = this->private;
FUSE_ENTRY_CREATE(this, priv, finh, state, fci, "CREATE");
#endif
fuse_resolve_and_resume (state, fuse_create_resume);
@@ -2029,7 +2137,7 @@ fuse_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret >= 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64,
+ "%"PRIu64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRIu64,
frame->root->unique,
op_ret, state->size, state->off, stbuf->ia_size);
@@ -2073,7 +2181,9 @@ fuse_readv (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
struct fuse_read_in *fri = msg;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
fuse_private_t *priv = NULL;
+#endif
fuse_state_t *state = NULL;
fd_t *fd = NULL;
@@ -2085,8 +2195,8 @@ fuse_readv (xlator_t *this, fuse_in_header_t *finh, void *msg)
fuse_resolve_fd_init (state, &state->resolve, fd);
/* See comment by similar code in fuse_settatr */
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv = this->private;
if (priv->proto_minor >= 9 && fri->read_flags & FUSE_READ_LOCKOWNER)
state->lk_owner = fri->lock_owner;
#endif
@@ -2094,8 +2204,9 @@ fuse_readv (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->size = fri->size;
state->off = fri->offset;
/* lets ignore 'fri->read_flags', but just consider 'fri->flags' */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
state->io_flags = fri->flags;
-
+#endif
fuse_resolve_and_resume (state, fuse_readv_resume);
}
@@ -2115,7 +2226,7 @@ fuse_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret >= 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64,
+ "%"PRIu64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRIu64,
frame->root->unique,
op_ret, state->size, state->off, stbuf->ia_size);
@@ -2157,7 +2268,7 @@ fuse_write_resume (fuse_state_t *state)
iobref_add (iobref, iobuf);
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": WRITE (%p, size=%"PRId64", offset=%"PRId64")",
+ "%"PRIu64": WRITE (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
state->finh->unique, state->fd, state->size, state->off);
FUSE_FOP (state, fuse_writev_cbk, GF_FOP_WRITE, writev, state->fd,
@@ -2176,11 +2287,12 @@ fuse_write (xlator_t *this, fuse_in_header_t *finh, void *msg)
struct fuse_write_in *fwi = (struct fuse_write_in *)
(finh + 1);
- fuse_private_t *priv = NULL;
fuse_state_t *state = NULL;
fd_t *fd = NULL;
-
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+ fuse_private_t *priv = NULL;
priv = this->private;
+#endif
GET_STATE (this, finh, state);
fd = FH_TO_FD (fwi->fh);
@@ -2189,7 +2301,11 @@ fuse_write (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->off = fwi->offset;
/* lets ignore 'fwi->write_flags', but just consider 'fwi->flags' */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
state->io_flags = fwi->flags;
+#else
+ state->io_flags = fwi->write_flags;
+#endif
/* TODO: may need to handle below flag
(fwi->write_flags & FUSE_WRITE_CACHE);
*/
@@ -2198,8 +2314,8 @@ fuse_write (xlator_t *this, fuse_in_header_t *finh, void *msg)
fuse_resolve_fd_init (state, &state->resolve, fd);
/* See comment by similar code in fuse_settatr */
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv = this->private;
if (priv->proto_minor >= 9 && fwi->write_flags & FUSE_WRITE_LOCKOWNER)
state->lk_owner = fwi->lock_owner;
#endif
@@ -2243,6 +2359,16 @@ fuse_flush (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
+/*
+ * No-op release hook.  It exists only so that the "xlator does not implement
+ * release_cbk" warning is not logged; the real release work is done by
+ * fuse_release() when the kernel sends the request.
+ *
+ * Neither argument is used.  Always returns 0.
+ */
+int
+fuse_internal_release (xlator_t *this, fd_t *fd)
+{
+ //This is a place holder function to prevent "xlator does not implement
+ //release_cbk" Warning log.
+ //Actual release happens as part of fuse_release which gets executed
+ //when kernel fuse sends it.
+ return 0;
+}
+
static void
fuse_release (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -2427,7 +2553,8 @@ fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
fuse_state_t *state = NULL;
fuse_in_header_t *finh = NULL;
- int size = 0;
+ size_t size = 0;
+ size_t max_size = 0;
char *buf = NULL;
gf_dirent_t *entry = NULL;
struct fuse_dirent *fde = NULL;
@@ -2453,16 +2580,23 @@ fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
frame->root->unique, op_ret, state->size, state->off);
list_for_each_entry (entry, &entries->list, list) {
- size += FUSE_DIRENT_ALIGN (FUSE_NAME_OFFSET +
- strlen (entry->d_name));
+ size_t fde_size = FUSE_DIRENT_ALIGN (FUSE_NAME_OFFSET +
+ strlen (entry->d_name));
+ max_size += fde_size;
+
+ if (max_size > state->size) {
+ /* we received too many entries to fit in the reply */
+ max_size -= fde_size;
+ break;
+ }
}
- if (size <= 0) {
- send_fuse_data (this, finh, 0, 0);
- goto out;
- }
+ if (max_size == 0) {
+ send_fuse_data (this, finh, 0, 0);
+ goto out;
+ }
- buf = GF_CALLOC (1, size, gf_fuse_mt_char);
+ buf = GF_CALLOC (1, max_size, gf_fuse_mt_char);
if (!buf) {
gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
"%"PRIu64": READDIR => -1 (%s)", frame->root->unique,
@@ -2476,6 +2610,9 @@ fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
fde = (struct fuse_dirent *)(buf + size);
gf_fuse_fill_dirent (entry, fde, priv->enable_ino32);
size += FUSE_DIRENT_SIZE (fde);
+
+ if (size == max_size)
+ break;
}
send_fuse_data (this, finh, buf, size);
@@ -2495,7 +2632,7 @@ void
fuse_readdir_resume (fuse_state_t *state)
{
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": READDIR (%p, size=%zu, offset=%"PRId64")",
+ "%"PRIu64": READDIR (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
state->finh->unique, state->fd, state->size, state->off);
FUSE_FOP (state, fuse_readdir_cbk, GF_FOP_READDIR,
@@ -2521,7 +2658,7 @@ fuse_readdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
fuse_resolve_and_resume (state, fuse_readdir_resume);
}
-
+#if FUSE_KERNEL_MINOR_VERSION >= 20
static int
fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
@@ -2529,7 +2666,8 @@ fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
fuse_state_t *state = NULL;
fuse_in_header_t *finh = NULL;
- int size = 0;
+ size_t max_size = 0;
+ size_t size = 0;
char *buf = NULL;
gf_dirent_t *entry = NULL;
struct fuse_direntplus *fde = NULL;
@@ -2554,16 +2692,23 @@ fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
frame->root->unique, op_ret, state->size, state->off);
list_for_each_entry (entry, &entries->list, list) {
- size += FUSE_DIRENT_ALIGN (FUSE_NAME_OFFSET_DIRENTPLUS +
- strlen (entry->d_name));
+ size_t fdes = FUSE_DIRENT_ALIGN (FUSE_NAME_OFFSET_DIRENTPLUS +
+ strlen (entry->d_name));
+ max_size += fdes;
+
+ if (max_size > state->size) {
+ /* we received too many entries to fit in the reply */
+ max_size -= fdes;
+ break;
+ }
}
- if (size <= 0) {
+ if (max_size == 0) {
send_fuse_data (this, finh, 0, 0);
goto out;
}
- buf = GF_CALLOC (1, size, gf_fuse_mt_char);
+ buf = GF_CALLOC (1, max_size, gf_fuse_mt_char);
if (!buf) {
gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
"%"PRIu64": READDIRP => -1 (%s)", frame->root->unique,
@@ -2586,7 +2731,7 @@ fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
size += FUSE_DIRENTPLUS_SIZE (fde);
if (!entry->inode)
- continue;
+ goto next_entry;
entry->d_stat.ia_blksize = this->ctx->page_size;
gf_fuse_stat2attr (&entry->d_stat, &feo->attr, priv->enable_ino32);
@@ -2594,12 +2739,14 @@ fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
linked_inode = inode_link (entry->inode, state->fd->inode,
entry->d_name, &entry->d_stat);
if (!linked_inode)
- continue;
+ goto next_entry;
inode_lookup (linked_inode);
feo->nodeid = inode_to_fuse_nodeid (linked_inode);
+ fuse_inode_set_need_lookup (linked_inode, this);
+
inode_unref (linked_inode);
feo->entry_valid =
@@ -2610,6 +2757,10 @@ fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
calc_timeout_sec (priv->attribute_timeout);
feo->attr_valid_nsec =
calc_timeout_nsec (priv->attribute_timeout);
+
+next_entry:
+ if (size == max_size)
+ break;
}
send_fuse_data (this, finh, buf, size);
@@ -2621,12 +2772,11 @@ out:
}
-
void
fuse_readdirp_resume (fuse_state_t *state)
{
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": READDIRP (%p, size=%zu, offset=%"PRId64")",
+ "%"PRIu64": READDIRP (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
state->finh->unique, state->fd, state->size, state->off);
FUSE_FOP (state, fuse_readdirp_cbk, GF_FOP_READDIRP,
@@ -2652,7 +2802,52 @@ fuse_readdirp (xlator_t *this, fuse_in_header_t *finh, void *msg)
fuse_resolve_and_resume (state, fuse_readdirp_resume);
}
+#endif
+#if FUSE_KERNEL_MINOR_VERSION >= 19
+#ifdef FALLOC_FL_KEEP_SIZE
+/*
+ * Completion callback for the fops wound by fuse_fallocate_resume().
+ *
+ * 'prebuf' and 'postbuf' are ignored; frame, cookie, this, op_ret, op_errno
+ * and xdata are passed straight to fuse_err_cbk(), whose return value is
+ * returned unchanged.
+ */
+static int
+fuse_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return fuse_err_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+}
+
+/*
+ * Resume handler for FALLOCATE once the fd has been resolved.
+ *
+ * Logs the request, then winds one of two fops with fuse_fallocate_cbk as
+ * the callback:
+ *   - GF_FOP_DISCARD   when FALLOC_FL_PUNCH_HOLE is set in state->flags;
+ *   - GF_FOP_FALLOCATE otherwise, passing only the FALLOC_FL_KEEP_SIZE bit
+ *     of state->flags as the mode argument.
+ *
+ * state->size is logged with GF_PRI_SIZET, matching the other state->size
+ * log statements in this file.
+ */
+static void
+fuse_fallocate_resume(fuse_state_t *state)
+{
+        gf_log("glusterfs-fuse", GF_LOG_TRACE,
+               "%"PRIu64": FALLOCATE (%p, flags=%d, size=%"GF_PRI_SIZET", offset=%"PRId64")",
+               state->finh->unique, state->fd, state->flags, state->size,
+               state->off);
+
+        if (state->flags & FALLOC_FL_PUNCH_HOLE)
+                FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_DISCARD, discard,
+                         state->fd, state->off, state->size, state->xdata);
+        else
+                FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_FALLOCATE, fallocate,
+                         state->fd, (state->flags & FALLOC_FL_KEEP_SIZE),
+                         state->off, state->size, state->xdata);
+}
+
+/*
+ * Entry point for a FALLOCATE request.
+ *
+ * Copies offset, length, mode and the fd derived from the file handle out of
+ * the struct fuse_fallocate_in in 'msg' into a fresh fuse state, then
+ * resolves the fd and continues in fuse_fallocate_resume().
+ */
+static void
+fuse_fallocate(xlator_t *this, fuse_in_header_t *finh, void *msg)
+{
+        struct fuse_fallocate_in *req   = msg;
+        fuse_state_t             *state = NULL;
+
+        GET_STATE(this, finh, state);
+
+        state->flags = req->mode;
+        state->size  = req->length;
+        state->off   = req->offset;
+        state->fd    = FH_TO_FD(req->fh);
+
+        fuse_resolve_fd_init(state, &state->resolve, state->fd);
+        fuse_resolve_and_resume(state, fuse_fallocate_resume);
+}
+#endif /* FALLOC_FL_KEEP_SIZE */
+#endif /* FUSE minor version >= 19 */
static void
fuse_releasedir (xlator_t *this, fuse_in_header_t *finh, void *msg)
@@ -2895,8 +3090,8 @@ fuse_setxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
}
if (!priv->acl) {
- if ((strcmp (name, "system.posix_acl_access") == 0) ||
- (strcmp (name, "system.posix_acl_default") == 0)) {
+ if ((strcmp (name, POSIX_ACL_ACCESS_XATTR) == 0) ||
+ (strcmp (name, POSIX_ACL_DEFAULT_XATTR) == 0)) {
send_fuse_err (this, finh, EOPNOTSUPP);
GF_FREE (finh);
return;
@@ -2924,12 +3119,14 @@ fuse_setxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
gf_log ("fuse", GF_LOG_TRACE,
"got request to invalidate %"PRIu64, finh->nodeid);
send_fuse_err (this, finh, 0);
+#if FUSE_KERNEL_MINOR_VERSION >= 11
fuse_invalidate_entry (this, finh->nodeid);
+#endif
GF_FREE (finh);
return;
}
- if (!strcmp (GFID_XATTR_KEY, name)) {
+ if (!strcmp (GFID_XATTR_KEY, name) || !strcmp (GF_XATTR_VOL_ID_KEY, name)) {
send_fuse_err (this, finh, EPERM);
GF_FREE (finh);
return;
@@ -3120,6 +3317,8 @@ out:
void
fuse_getxattr_resume (fuse_state_t *state)
{
+ char *value = NULL;
+
if (!state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"%"PRIu64": GETXATTR %s/%"PRIu64" (%s) "
@@ -3137,6 +3336,46 @@ fuse_getxattr_resume (fuse_state_t *state)
state->fd = fd_lookup (state->loc.inode, state->finh->pid);
#endif /* GF_TEST_FFOP */
+ if (state->name &&
+ (strcmp (state->name, VIRTUAL_GFID_XATTR_KEY) == 0)) {
+ /* send glusterfs gfid in binary form */
+
+ value = GF_CALLOC (16 + 1, sizeof(char),
+ gf_common_mt_char);
+ if (!value) {
+ send_fuse_err (state->this, state->finh, ENOMEM);
+ goto internal_out;
+ }
+ memcpy (value, state->loc.inode->gfid, 16);
+
+ send_fuse_xattr (THIS, state->finh, value, 16, state->size);
+ GF_FREE (value);
+ internal_out:
+ free_fuse_state (state);
+ return;
+ }
+
+ if (state->name &&
+ (strcmp (state->name, VIRTUAL_GFID_XATTR_KEY_STR) == 0)) {
+ /* transform binary gfid to canonical form */
+
+ value = GF_CALLOC (UUID_CANONICAL_FORM_LEN + 1, sizeof(char),
+ gf_common_mt_char);
+ if (!value) {
+ send_fuse_err (state->this, state->finh, ENOMEM);
+ goto internal_out1;
+ }
+ uuid_utoa_r (state->loc.inode->gfid, value);
+
+ send_fuse_xattr (THIS, state->finh, value,
+ UUID_CANONICAL_FORM_LEN, state->size);
+ GF_FREE (value);
+ internal_out1:
+ free_fuse_state (state);
+ return;
+ }
+
+
if (state->fd) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": GETXATTR %p/%"PRIu64" (%s)", state->finh->unique,
@@ -3158,15 +3397,16 @@ fuse_getxattr_resume (fuse_state_t *state)
static void
fuse_getxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
- struct fuse_getxattr_in *fgxi = msg;
- char *name = (char *)(fgxi + 1);
-
- fuse_state_t *state = NULL;
- struct fuse_private *priv = NULL;
- int rv = 0;
- char *newkey = NULL;
+ struct fuse_getxattr_in *fgxi = msg;
+ char *name = (char *)(fgxi + 1);
+ fuse_state_t *state = NULL;
+ struct fuse_private *priv = NULL;
+ int rv = 0;
+ int op_errno = EINVAL;
+ char *newkey = NULL;
priv = this->private;
+ GET_STATE (this, finh, state);
#ifdef GF_DARWIN_HOST_OS
if (fgxi->position) {
@@ -3182,45 +3422,43 @@ fuse_getxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
"%"PRIu64": GETXATTR %s/%"PRIu64" (%s):"
"refusing positioned getxattr",
finh->unique, state->loc.path, finh->nodeid, name);
- send_fuse_err (this, finh, EINVAL);
- FREE (finh);
- return;
+ op_errno = EINVAL;
+ goto err;
}
#endif
if (!priv->acl) {
- if ((strcmp (name, "system.posix_acl_access") == 0) ||
- (strcmp (name, "system.posix_acl_default") == 0)) {
- send_fuse_err (this, finh, ENOTSUP);
- GF_FREE (finh);
- return;
+ if ((strcmp (name, POSIX_ACL_ACCESS_XATTR) == 0) ||
+ (strcmp (name, POSIX_ACL_DEFAULT_XATTR) == 0)) {
+ op_errno = ENOTSUP;
+ goto err;
}
}
if (!priv->selinux) {
if (strncmp (name, "security.", 9) == 0) {
- send_fuse_err (this, finh, ENODATA);
- GF_FREE (finh);
- return;
+ op_errno = ENODATA;
+ goto err;
}
}
- GET_STATE (this, finh, state);
-
fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
rv = fuse_flip_xattr_ns (priv, name, &newkey);
if (rv) {
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- goto out;
+ op_errno = ENOMEM;
+ goto err;
}
state->size = fgxi->size;
state->name = newkey;
fuse_resolve_and_resume (state, fuse_getxattr_resume);
- out:
+
+ return;
+ err:
+ send_fuse_err (this, finh, op_errno);
+ free_fuse_state (state);
return;
}
@@ -3326,7 +3564,7 @@ fuse_removexattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
int32_t ret = -1;
char *newkey = NULL;
- if (!strcmp (GFID_XATTR_KEY, name)) {
+ if (!strcmp (GFID_XATTR_KEY, name) || !strcmp (GF_XATTR_VOL_ID_KEY, name)) {
send_fuse_err (this, finh, EPERM);
GF_FREE (finh);
return;
@@ -3532,7 +3770,7 @@ fuse_setlk (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
+#if FUSE_KERNEL_MINOR_VERSION >= 11
static void *
notify_kernel_loop (void *data)
{
@@ -3568,7 +3806,7 @@ notify_kernel_loop (void *data)
return NULL;
}
-
+#endif
static void
fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
@@ -3577,8 +3815,10 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
struct fuse_init_out fino = {0,};
fuse_private_t *priv = NULL;
int ret = 0;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
int pfd[2] = {0,};
pthread_t messenger;
+#endif
priv = this->private;
@@ -3607,6 +3847,10 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
fino.max_readahead = 1 << 17;
fino.max_write = 1 << 17;
fino.flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
+#if FUSE_KERNEL_MINOR_VERSION >= 17
+ if (fini->minor >= 17)
+ fino.flags |= FUSE_FLOCK_LOCKS;
+#endif
#if FUSE_KERNEL_MINOR_VERSION >= 12
if (fini->minor >= 12) {
/* let fuse leave the umask processing to us, so that it does not
@@ -3635,8 +3879,8 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
}
priv->revchan_in = pfd[0];
priv->revchan_out = pfd[1];
- ret = pthread_create (&messenger, NULL, notify_kernel_loop,
- this);
+ ret = gf_thread_create (&messenger, NULL, notify_kernel_loop,
+ this);
if (ret != 0) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
"failed to start messenger daemon (%s)",
@@ -3667,13 +3911,57 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
}
if (fini->minor < 9)
*priv->msg0_len_p = sizeof(*finh) + FUSE_COMPAT_WRITE_IN_SIZE;
+
+ if (priv->use_readdirp) {
+ if (fini->flags & FUSE_DO_READDIRPLUS)
+ fino.flags |= FUSE_DO_READDIRPLUS;
+ }
+#endif
+ if (priv->fopen_keep_cache == 2) {
+ /* If user did not explicitly set --fopen-keep-cache[=off],
+ then check if the kernel supports FUSE_AUTO_INVAL_DATA and ...
+ */
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+ if (fini->flags & FUSE_AUTO_INVAL_DATA) {
+ /* ... enable fopen_keep_cache mode if supported.
+ */
+ gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "Detected "
+ "support for FUSE_AUTO_INVAL_DATA. Enabling "
+ "fopen_keep_cache automatically.");
+ fino.flags |= FUSE_AUTO_INVAL_DATA;
+ priv->fopen_keep_cache = 1;
+ } else
#endif
- if (fini->flags & FUSE_DO_READDIRPLUS)
- fino.flags |= FUSE_DO_READDIRPLUS;
+ {
+
+ gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "No support "
+ "for FUSE_AUTO_INVAL_DATA. Disabling "
+ "fopen_keep_cache.");
+ /* ... else disable. */
+ priv->fopen_keep_cache = 0;
+ }
+ } else if (priv->fopen_keep_cache == 1) {
+ /* If user explicitly set --fopen-keep-cache[=on],
+ then enable FUSE_AUTO_INVAL_DATA if possible.
+ */
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+ if (fini->flags & FUSE_AUTO_INVAL_DATA) {
+ gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "fopen_keep_cache "
+ "is explicitly set. Enabling FUSE_AUTO_INVAL_DATA");
+ fino.flags |= FUSE_AUTO_INVAL_DATA;
+ } else
+#endif
+ {
+ gf_log ("glusterfs-fuse", GF_LOG_WARNING, "fopen_keep_cache "
+ "is explicitly set. Support for "
+ "FUSE_AUTO_INVAL_DATA is missing");
+ }
+ }
+#if FUSE_KERNEL_MINOR_VERSION >= 22
if (fini->flags & FUSE_ASYNC_DIO)
fino.flags |= FUSE_ASYNC_DIO;
-
+#endif
ret = send_fuse_obj (this, finh, &fino);
if (ret == 0)
gf_log ("glusterfs-fuse", GF_LOG_INFO,
@@ -3816,12 +4104,14 @@ fuse_nameless_lookup (xlator_t *xl, uuid_t gfid, loc_t *loc)
inode_t *linked_inode = NULL;
if ((loc == NULL) || (xl == NULL)) {
+ ret = -EINVAL;
goto out;
}
if (loc->inode == NULL) {
loc->inode = inode_new (xl->itable);
if (loc->inode == NULL) {
+ ret = -ENOMEM;
goto out;
}
}
@@ -3830,13 +4120,13 @@ fuse_nameless_lookup (xlator_t *xl, uuid_t gfid, loc_t *loc)
xattr_req = dict_new ();
if (xattr_req == NULL) {
+ ret = -ENOMEM;
goto out;
}
ret = syncop_lookup (xl, loc, xattr_req, &iatt, NULL, NULL);
- if (ret < 0) {
+ if (ret < 0)
goto out;
- }
linked_inode = inode_link (loc->inode, NULL, NULL, &iatt);
inode_unref (loc->inode);
@@ -3885,9 +4175,10 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd,
"name-less lookup of gfid (%s) failed (%s)"
"(old-subvolume:%s-%d new-subvolume:%s-%d)",
uuid_utoa (basefd->inode->gfid),
- strerror (errno),
+ strerror (-ret),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
@@ -3905,9 +4196,11 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd,
uuid_utoa (loc.inode->gfid),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
+ newfd->flags = basefd->flags;
if (newfd->lk_ctx)
fd_lk_ctx_unref (newfd->lk_ctx);
@@ -3927,9 +4220,10 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd,
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"open on basefd (ptr:%p inode-gfid:%s) failed (%s)"
"(old-subvolume:%s-%d new-subvolume:%s-%d)", basefd,
- uuid_utoa (basefd->inode->gfid), strerror (errno),
+ uuid_utoa (basefd->inode->gfid), strerror (-ret),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
@@ -3997,6 +4291,7 @@ fuse_migrate_locks (xlator_t *this, fd_t *basefd, fd_t *oldfd,
oldfd, newfd, uuid_utoa (newfd->inode->gfid),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
@@ -4022,6 +4317,7 @@ fuse_migrate_locks (xlator_t *this, fd_t *basefd, fd_t *oldfd,
oldfd, newfd, uuid_utoa (newfd->inode->gfid),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
@@ -4089,10 +4385,11 @@ fuse_migrate_fd (xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
"syncop_fsync failed (%s) on fd (%p)"
"(basefd:%p basefd-inode.gfid:%s) "
"(old-subvolume:%s-%d new-subvolume:%s-%d)",
- strerror (errno), oldfd, basefd,
+ strerror (-ret), oldfd, basefd,
uuid_utoa (basefd->inode->gfid),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
}
} else {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
@@ -4644,6 +4941,7 @@ fuse_priv_dump (xlator_t *this)
(int)private->strict_volfile_check);
gf_proc_dump_write("reverse_thread_started", "%d",
(int)private->reverse_fuse_thread_started);
+ gf_proc_dump_write("use_readdirp", "%d", private->use_readdirp);
return 0;
}
@@ -4692,9 +4990,10 @@ dump_history_fuse (circular_buffer_t *cb, void *data)
int
fuse_graph_setup (xlator_t *this, glusterfs_graph_t *graph)
{
- inode_table_t *itable = NULL;
- int ret = 0;
- fuse_private_t *priv = NULL;
+ inode_table_t *itable = NULL;
+ int ret = 0, winds = 0;
+ fuse_private_t *priv = NULL;
+ glusterfs_graph_t *prev_graph = NULL;
priv = this->private;
@@ -4715,13 +5014,30 @@ fuse_graph_setup (xlator_t *this, glusterfs_graph_t *graph)
pthread_mutex_lock (&priv->sync_mutex);
{
- priv->next_graph = graph;
- priv->event_recvd = 0;
+ prev_graph = priv->next_graph;
+
+ if ((prev_graph != NULL) && (prev_graph->id > graph->id)) {
+ /* there was a race and an old graph was initialised
+ * before new one.
+ */
+ prev_graph = graph;
+ } else {
+ priv->next_graph = graph;
+ priv->event_recvd = 0;
+
+ pthread_cond_signal (&priv->sync_cond);
+ }
- pthread_cond_signal (&priv->sync_cond);
+ if (prev_graph != NULL)
+ winds = ((xlator_t *)prev_graph->top)->winds;
}
pthread_mutex_unlock (&priv->sync_mutex);
+ if ((prev_graph != NULL) && (winds == 0)) {
+ xlator_notify (prev_graph->top, GF_EVENT_PARENT_DOWN,
+ prev_graph->top, NULL);
+ }
+
gf_log ("fuse", GF_LOG_INFO, "switched to graph %d",
((graph) ? graph->id : 0));
@@ -4772,8 +5088,8 @@ notify (xlator_t *this, int32_t event, void *data, ...)
if (!private->fuse_thread_started) {
private->fuse_thread_started = 1;
- ret = pthread_create (&private->fuse_thread, NULL,
- fuse_thread_proc, this);
+ ret = gf_thread_create (&private->fuse_thread, NULL,
+ fuse_thread_proc, this);
if (ret != 0) {
gf_log (this->name, GF_LOG_DEBUG,
"pthread_create() failed (%s)",
@@ -4860,9 +5176,20 @@ static fuse_handler_t *fuse_std_ops[FUSE_OP_HIGH] = {
/* [FUSE_IOCTL] */
/* [FUSE_POLL] */
/* [FUSE_NOTIFY_REPLY] */
- /* [FUSE_BATCH_FORGET] */
- /* [FUSE_FALLOCATE] */
+
+#if FUSE_KERNEL_MINOR_VERSION >= 16
+ [FUSE_BATCH_FORGET]= fuse_batch_forget,
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 19
+#ifdef FALLOC_FL_KEEP_SIZE
+ [FUSE_FALLOCATE] = fuse_fallocate,
+#endif /* FALLOC_FL_KEEP_SIZE */
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 21
[FUSE_READDIRPLUS] = fuse_readdirp,
+#endif
};
@@ -4913,6 +5240,7 @@ init (xlator_t *this_xl)
int fsname_allocated = 0;
glusterfs_ctx_t *ctx = NULL;
gf_boolean_t sync_to_mount = _gf_false;
+ gf_boolean_t fopen_keep_cache = _gf_false;
unsigned long mntflags = 0;
char *mnt_args = NULL;
eh_t *event = NULL;
@@ -4994,13 +5322,13 @@ init (xlator_t *this_xl)
goto cleanup_exit;
}
- GF_OPTION_INIT ("attribute-timeout", priv->attribute_timeout, double,
+ GF_OPTION_INIT (ZR_ATTR_TIMEOUT_OPT, priv->attribute_timeout, double,
cleanup_exit);
- GF_OPTION_INIT ("entry-timeout", priv->entry_timeout, double,
+ GF_OPTION_INIT (ZR_ENTRY_TIMEOUT_OPT, priv->entry_timeout, double,
cleanup_exit);
- GF_OPTION_INIT ("negative-timeout", priv->negative_timeout, double,
+ GF_OPTION_INIT (ZR_NEGATIVE_TIMEOUT_OPT, priv->negative_timeout, double,
cleanup_exit);
GF_OPTION_INIT ("client-pid", priv->client_pid, int32, cleanup_exit);
@@ -5031,6 +5359,8 @@ init (xlator_t *this_xl)
GF_OPTION_INIT ("enable-ino32", priv->enable_ino32, bool, cleanup_exit);
+ GF_OPTION_INIT ("use-readdirp", priv->use_readdirp, bool, cleanup_exit);
+
priv->fuse_dump_fd = -1;
ret = dict_get_str (options, "dump-fuse", &value_string);
if (ret == 0) {
@@ -5056,8 +5386,12 @@ init (xlator_t *this_xl)
GF_ASSERT (ret == 0);
}
- GF_OPTION_INIT("fopen-keep-cache", priv->fopen_keep_cache, bool,
- cleanup_exit);
+ priv->fopen_keep_cache = 2;
+ if (dict_get (options, "fopen-keep-cache")) {
+ GF_OPTION_INIT("fopen-keep-cache", fopen_keep_cache, bool,
+ cleanup_exit);
+ priv->fopen_keep_cache = fopen_keep_cache;
+ }
GF_OPTION_INIT("gid-timeout", priv->gid_cache_timeout, int32,
cleanup_exit);
@@ -5076,6 +5410,18 @@ init (xlator_t *this_xl)
GF_OPTION_INIT ("congestion-threshold", priv->congestion_threshold,
int32, cleanup_exit);
+ GF_OPTION_INIT("no-root-squash", priv->no_root_squash, bool,
+ cleanup_exit);
+ /* Change the client_pid to the no-root-squash pid only if the
+ client is not a defrag process, hadoop access or gsyncd process.
+ */
+ if (!priv->client_pid_set) {
+ if (priv->no_root_squash == _gf_true) {
+ priv->client_pid_set = _gf_true;
+ priv->client_pid = GF_CLIENT_PID_NO_ROOT_SQUASH;
+ }
+ }
+
/* user has set only background-qlen, not congestion-threshold,
use the fuse kernel driver formula to set congestion. ie, 75% */
if (dict_get (this_xl->options, "background-qlen") &&
@@ -5146,7 +5492,7 @@ init (xlator_t *this_xl)
if (priv->fd == -1)
goto cleanup_exit;
- event = eh_new (FUSE_EVENT_HISTORY_SIZE, _gf_false);
+ event = eh_new (FUSE_EVENT_HISTORY_SIZE, _gf_false, NULL);
if (!event) {
gf_log (this_xl->name, GF_LOG_ERROR,
"could not create a new event history");
@@ -5228,6 +5574,8 @@ struct xlator_fops fops;
struct xlator_cbks cbks = {
.invalidate = fuse_invalidate,
+ .forget = fuse_forget_cbk,
+ .release = fuse_internal_release
};
@@ -5310,5 +5658,19 @@ struct volume_options options[] = {
{ .key = {"fuse-mountopts"},
.type = GF_OPTION_TYPE_STR
},
+ { .key = {"use-readdirp"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes"
+ },
+ { .key = {"no-root-squash"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "This is the mount option for disabling the "
+ "root squash for the client irrespective of whether the root-squash "
+ "option for the volume is set or not. But this option is honoured "
+ "only for the trusted clients. For non trusted clients this value "
+ "does not have any affect and the volume option for root-squash is "
+ "honoured.",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
index d90b85e72..f1c4cb3f0 100644
--- a/xlators/mount/fuse/src/fuse-bridge.h
+++ b/xlators/mount/fuse/src/fuse-bridge.h
@@ -101,9 +101,17 @@ struct fuse_private {
gf_boolean_t acl;
gf_boolean_t selinux;
gf_boolean_t read_only;
- gf_boolean_t fopen_keep_cache;
+ int32_t fopen_keep_cache;
int32_t gid_cache_timeout;
gf_boolean_t enable_ino32;
+ /* This is the mount option for disabling the root-squash for the
+ mount irrespective of whether the root-squash option for the
+ volume is set or not. But this option is honoured only for
+ the trusted clients. For non-trusted clients this value does
+ not have any effect and the volume option for root-squash is
+ honoured.
+ */
+ gf_boolean_t no_root_squash;
fdtable_t *fdtable;
gid_cache_t gid_cache;
char *fuse_mountopts;
@@ -119,6 +127,9 @@ struct fuse_private {
/* for fuse queue length and congestion threshold */
int background_qlen;
int congestion_threshold;
+
+ /* for using fuse-kernel readdirp*/
+ gf_boolean_t use_readdirp;
};
typedef struct fuse_private fuse_private_t;
@@ -142,9 +153,10 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
#define FUSE_FOP(state, ret, op_num, fop, args ...) \
do { \
- call_frame_t *frame = NULL; \
- xlator_t *xl = NULL; \
- int32_t op_ret = 0, op_errno = 0; \
+ call_frame_t *frame = NULL; \
+ xlator_t *xl = NULL; \
+ int32_t op_ret = 0, op_errno = 0; \
+ fuse_resolve_t *resolve = NULL; \
\
frame = get_call_frame_for_req (state); \
if (!frame) { \
@@ -171,14 +183,20 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
frame->root->op = op_num; \
frame->op = op_num; \
\
+ if ( state->resolve_now ) { \
+ resolve = state->resolve_now; \
+ } else { \
+ resolve = &(state->resolve); \
+ } \
+ \
xl = state->active_subvol; \
if (!xl) { \
gf_log_callingfn ("glusterfs-fuse", GF_LOG_ERROR, \
"xl is NULL"); \
op_errno = ENOENT; \
op_ret = -1; \
- } else if (state->resolve.op_ret < 0) { \
- op_errno = state->resolve.op_errno; \
+ } else if (resolve->op_ret < 0) { \
+ op_errno = resolve->op_errno; \
op_ret = -1; \
if (op_num == GF_FOP_LOOKUP) { \
gf_log ("glusterfs-fuse", \
@@ -187,7 +205,7 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
"%"PRIu64": %s() %s => -1 (%s)", \
frame->root->unique, \
gf_fop_list[frame->root->op], \
- state->resolve.resolve_loc.path, \
+ resolve->resolve_loc.path, \
strerror (op_errno)); \
} else { \
gf_log ("glusterfs-fuse", \
@@ -196,7 +214,7 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
"migration of %s failed (%s)", \
frame->root->unique, \
gf_fop_list[frame->root->op], \
- state->resolve.resolve_loc.path, \
+ resolve->resolve_loc.path, \
strerror (op_errno)); \
} \
} else if (state->resolve2.op_ret < 0) { \
diff --git a/xlators/mount/fuse/src/fuse-helpers.c b/xlators/mount/fuse/src/fuse-helpers.c
index d4dcc2e61..0936d6311 100644
--- a/xlators/mount/fuse/src/fuse-helpers.c
+++ b/xlators/mount/fuse/src/fuse-helpers.c
@@ -7,6 +7,10 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
+#ifdef __NetBSD__
+#define _KMEMUSER
+#endif
+
#include "fuse-bridge.h"
#if defined(GF_SOLARIS_HOST_OS)
#include <sys/procfs.h>
@@ -14,9 +18,6 @@
#include <sys/sysctl.h>
#endif
-#ifndef GF_REQUEST_MAXGROUPS
-#define GF_REQUEST_MAXGROUPS 16
-#endif /* GF_REQUEST_MAXGROUPS */
static void
fuse_resolve_wipe (fuse_resolve_t *resolve)
@@ -138,6 +139,7 @@ get_fuse_state (xlator_t *this, fuse_in_header_t *finh)
}
+#define FUSE_MAX_AUX_GROUPS 32 /* We can get only up to 32 aux groups from /proc */
void
frame_fill_groups (call_frame_t *frame)
{
@@ -160,6 +162,9 @@ frame_fill_groups (call_frame_t *frame)
if (!fp)
goto out;
+ if (call_stack_alloc_groups (frame->root, FUSE_MAX_AUX_GROUPS) != 0)
+ goto out;
+
while ((ptr = fgets (line, sizeof line, fp))) {
if (strncmp (ptr, "Groups:", 7) != 0)
continue;
@@ -176,7 +181,7 @@ frame_fill_groups (call_frame_t *frame)
if (!endptr || *endptr)
break;
frame->root->groups[idx++] = id;
- if (idx == GF_MAX_AUX_GROUPS)
+ if (idx == FUSE_MAX_AUX_GROUPS)
break;
}
@@ -192,6 +197,7 @@ out:
prcred_t *prcred = (prcred_t *) scratch;
FILE *fp = NULL;
int ret = 0;
+ int ngrps;
ret = snprintf (filename, sizeof filename,
"/proc/%d/cred", frame->root->pid);
@@ -200,8 +206,11 @@ out:
fp = fopen (filename, "r");
if (fp != NULL) {
if (fgets (scratch, sizeof scratch, fp) != NULL) {
- frame->root->ngrps = MIN(prcred->pr_ngroups,
- GF_REQUEST_MAXGROUPS);
+ ngrps = MIN(prcred->pr_ngroups,
+ GF_MAX_AUX_GROUPS);
+ if (call_stack_alloc_groups (frame->root,
+ ngrps) != 0)
+ return;
}
fclose (fp);
}
@@ -226,7 +235,9 @@ out:
if (sysctl(name, namelen, &kp, &kplen, NULL, 0) != 0)
return;
- ngroups = MIN(kp.kp_eproc.e_ucred.cr_ngroups, GF_REQUEST_MAXGROUPS);
+ ngroups = MIN(kp.kp_eproc.e_ucred.cr_ngroups, NGROUPS_MAX);
+ if (call_stack_alloc_groups (frame->root, ngroups) != 0)
+ return;
for (i = 0; i < ngroups; i++)
frame->root->groups[i] = kp.kp_eproc.e_ucred.cr_groups[i];
frame->root->ngrps = ngroups;
@@ -255,8 +266,11 @@ static void get_groups(fuse_private_t *priv, call_frame_t *frame)
return;
}
- gl = gid_cache_lookup(&priv->gid_cache, frame->root->pid);
+ gl = gid_cache_lookup(&priv->gid_cache, frame->root->pid,
+ frame->root->uid, frame->root->gid);
if (gl) {
+ if (call_stack_alloc_groups (frame->root, gl->gl_count) != 0)
+ return;
frame->root->ngrps = gl->gl_count;
for (i = 0; i < gl->gl_count; i++)
frame->root->groups[i] = gl->gl_list[i];
@@ -267,6 +281,8 @@ static void get_groups(fuse_private_t *priv, call_frame_t *frame)
frame_fill_groups (frame);
agl.gl_id = frame->root->pid;
+ agl.gl_uid = frame->root->uid;
+ agl.gl_gid = frame->root->gid;
agl.gl_count = frame->root->ngrps;
agl.gl_list = GF_CALLOC(frame->root->ngrps, sizeof(gid_t),
gf_fuse_mt_gids_t);
@@ -580,6 +596,8 @@ fuse_ignore_xattr_set (fuse_private_t *priv, char *key)
|| (fnmatch ("*.glusterfs.volume-mark",
key, FNM_PERIOD) == 0)
|| (fnmatch ("*.glusterfs.volume-mark.*",
+ key, FNM_PERIOD) == 0)
+ || (fnmatch ("glusterfs.gfid.newfile",
key, FNM_PERIOD) == 0)))
ret = -1;
diff --git a/xlators/mount/fuse/src/fuse-resolve.c b/xlators/mount/fuse/src/fuse-resolve.c
index 88ce32ab9..76b1d9a72 100644
--- a/xlators/mount/fuse/src/fuse-resolve.c
+++ b/xlators/mount/fuse/src/fuse-resolve.c
@@ -26,6 +26,8 @@ int fuse_migrate_fd (xlator_t *this, fd_t *fd, xlator_t *old_subvol,
fuse_fd_ctx_t *
fuse_fd_ctx_get (xlator_t *this, fd_t *fd);
+gf_boolean_t fuse_inode_needs_lookup (inode_t *inode, xlator_t *this);
+
static int
fuse_resolve_loc_touchup (fuse_state_t *state)
{
@@ -161,10 +163,10 @@ fuse_resolve_gfid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- loc_wipe (&resolve->resolve_loc);
-
link_inode = inode_link (inode, NULL, NULL, buf);
+ loc_wipe (&resolve->resolve_loc);
+
if (!link_inode)
goto out;
@@ -201,7 +203,11 @@ fuse_resolve_gfid (fuse_state_t *state)
uuid_copy (resolve_loc->gfid, resolve->gfid);
}
- resolve_loc->inode = inode_new (state->itable);
+ /* inode may already exist in case we are looking up an inode which was
+ linked through readdirplus */
+ resolve_loc->inode = inode_find (state->itable, resolve_loc->gfid);
+ if (!resolve_loc->inode)
+ resolve_loc->inode = inode_new (state->itable);
ret = loc_path (resolve_loc, NULL);
if (ret <= 0) {
@@ -239,6 +245,9 @@ fuse_resolve_parent_simple (fuse_state_t *state)
parent = resolve->parhint;
if (parent->table == state->itable) {
+ if (fuse_inode_needs_lookup (parent, THIS))
+ return 1;
+
/* no graph switches since */
loc->parent = inode_ref (parent);
uuid_copy (loc->pargfid, parent->gfid);
@@ -265,6 +274,10 @@ fuse_resolve_parent_simple (fuse_state_t *state)
/* non decisive result - parent missing */
return 1;
}
+ if (fuse_inode_needs_lookup (parent, THIS)) {
+ inode_unref (parent);
+ return 1;
+ }
loc->parent = parent;
uuid_copy (loc->pargfid, resolve->pargfid);
@@ -314,15 +327,18 @@ fuse_resolve_inode_simple (fuse_state_t *state)
loc = state->loc_now;
inode = resolve->hint;
- if (inode->table == state->itable) {
+ if (inode->table == state->itable)
inode_ref (inode);
- goto found;
+ else
+ inode = inode_find (state->itable, resolve->gfid);
+
+ if (inode) {
+ if (!fuse_inode_needs_lookup (inode, THIS))
+ goto found;
+ /* inode was linked through readdirplus */
+ inode_unref (inode);
}
- inode = inode_find (state->itable, resolve->gfid);
- if (inode)
- goto found;
-
return 1;
found:
loc->inode = inode;
@@ -365,6 +381,8 @@ fuse_migrate_fd_task (void *data)
basefd = state->fd;
basefd_ctx = fuse_fd_ctx_get (state->this, basefd);
+ if (!basefd_ctx)
+ goto out;
LOCK (&basefd->lock);
{
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index 3fedb8ce3..71ea66c3c 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -1,20 +1,17 @@
-#!/bin/sh
-# (C) 2006, 2007, 2008 Gluster Inc. <http://www.gluster.com>
+#!/bin/bash
#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License as
-# published by the Free Software Foundation; either version 2 of
-# the License, or (at your option) any later version.
+# Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+# This file is part of GlusterFS.
#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public
-# License along with this program; if not, write to the Free
-# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-# Boston, MA 02110-1301 USA
+# This file is licensed to you under your choice of the GNU Lesser
+# General Public License, version 3 or any later version (LGPLv3 or
+# later), or the GNU General Public License, version 2 (GPLv2), in all
+# cases as published by the Free Software Foundation.
+
+warn ()
+{
+ echo "$@" >/dev/stderr
+}
_init ()
{
@@ -27,79 +24,129 @@ _init ()
LOG_DEBUG=DEBUG;
LOG_TRACE=TRACE;
+ HOST_NAME_MAX=64;
+
prefix="@prefix@";
exec_prefix=@exec_prefix@;
cmd_line=$(echo "@sbindir@/glusterfs");
- case `uname -s` in
- NetBSD)
- getinode="stat -f %i"
- getdev="stat -f %d"
- lgetinode="${getinode} -L"
- lgetdev="${getdev} -L"
-
- mounttab=/proc/mounts
- ;;
- Linux)
- getinode="stat -c %i $i"
- getdev="stat -c %d $d"
- lgetinode="${getinode} -L"
- lgetdev="${getdev} -L"
-
- mounttab=/etc/mtab
- ;;
+ # check whether getfattr exists
+ export PATH
+ getfattr=$(which getfattr 2>/dev/null);
+ if [ $? -ne 0 ]; then
+ warn "WARNING: getfattr not found, certain checks will be skipped.."
+ fi
+
+ alias lsL='ls -L'
+ mounttab=/proc/mounts
+ uname_s=`uname -s`
+ case ${uname_s} in
+ NetBSD)
+ getinode="stat -f %i"
+ getdev="stat -f %d"
+ lgetinode="${getinode} -L"
+ lgetdev="${getdev} -L"
+ ;;
+ Linux)
+ getinode="stat -c %i"
+ getdev="stat -c %d"
+ lgetinode="${getinode} -L"
+ lgetdev="${getdev} -L"
+ ;;
esac
UPDATEDBCONF=/etc/updatedb.conf
- LD_LIBRARY_PATH=@libdir@:${LD_LIBRARY_PATH}
- export LD_LIBRARY_PATH
}
-start_glusterfs ()
+is_valid_hostname ()
{
- # lets the comparsion be case insensitive for all strings
+ local server=$1
+ length=$(echo $server | wc -c)
+ if [ ${length} -gt ${HOST_NAME_MAX} ]; then
+ return 1
+ fi
+}
+
+parse_backup_volfile_servers ()
+{
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/\:/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+
+ echo ${new_servers}
+}
+
+parse_volfile_servers ()
+{
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/,/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+
+ echo ${new_servers}
+}
+
+start_glusterfs ()
+{
if [ -n "$log_level_str" ]; then
- case "$( echo $log_level_str | tr '[a-z]' '[A-Z]')" in
- "ERROR")
- log_level=$LOG_ERROR;
- ;;
+ case "$( echo $log_level_str | awk '{print toupper($0)}')" in
+ "ERROR")
+ log_level=$LOG_ERROR;
+ ;;
"INFO")
- log_level=$LOG_INFO
+ log_level=$LOG_INFO;
+ ;;
+ "DEBUG")
+ log_level=$LOG_DEBUG;
+ ;;
+ "CRITICAL")
+ log_level=$LOG_CRITICAL;
+ ;;
+ "WARNING")
+ log_level=$LOG_WARNING;
+ ;;
+ "TRACE")
+ log_level=$LOG_TRACE;
+ ;;
+ "NONE")
+ log_level=$LOG_NONE;
;;
- "DEBUG")
- log_level=$LOG_DEBUG;
- ;;
- "CRITICAL")
- log_level=$LOG_CRITICAL;
- ;;
- "WARNING")
- log_level=$LOG_WARNING;
- ;;
- "TRACE")
- log_level=$LOG_TRACE;
- ;;
- "NONE")
- log_level=$LOG_NONE;
- ;;
- *)
- echo "invalid log level $log_level_str, using INFO";
- log_level=$LOG_INFO;
- ;;
- esac
- fi
-
-#options without values start here
+ *)
+ warn "invalid log level $log_level_str, using INFO";
+ log_level=$LOG_INFO;
+ ;;
+ esac
+ fi
+
+ # options without values start here
if [ -n "$read_only" ]; then
- cmd_line=$(echo "$cmd_line --read-only");
+ cmd_line=$(echo "$cmd_line --read-only");
fi
if [ -n "$acl" ]; then
- cmd_line=$(echo "$cmd_line --acl");
+ cmd_line=$(echo "$cmd_line --acl");
fi
if [ -n "$selinux" ]; then
- cmd_line=$(echo "$cmd_line --selinux");
+ cmd_line=$(echo "$cmd_line --selinux");
fi
if [ -n "$enable_ino32" ]; then
@@ -109,30 +156,45 @@ start_glusterfs ()
if [ -n "$worm" ]; then
cmd_line=$(echo "$cmd_line --worm");
fi
+ if [ -n "$volfile_max_fetch_attempts" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts")
+ fi
if [ -n "$fopen_keep_cache" ]; then
- cmd_line=$(echo "$cmd_line --fopen-keep-cache");
+ cmd_line=$(echo "$cmd_line --fopen-keep-cache");
fi
if [ -n "$volfile_check" ]; then
- cmd_line=$(echo "$cmd_line --volfile-check");
+ cmd_line=$(echo "$cmd_line --volfile-check");
fi
if [ -n "$mem_accounting" ]; then
cmd_line=$(echo "$cmd_line --mem-accounting");
fi
+ if [ -n "$aux_gfid_mount" ]; then
+ cmd_line=$(echo "$cmd_line --aux-gfid-mount");
+ fi
+
+ if [ -n "$no_root_squash" ]; then
+ cmd_line=$(echo "$cmd_line --no-root-squash");
+ fi
+
#options with values start here
if [ -n "$log_level" ]; then
cmd_line=$(echo "$cmd_line --log-level=$log_level");
fi
if [ -n "$log_file" ]; then
- cmd_line=$(echo "$cmd_line --log-file=$log_file");
+ cmd_line=$(echo "$cmd_line --log-file=$log_file");
fi
if [ -n "$direct_io_mode" ]; then
- cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
+ cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
+ fi
+
+ if [ -n "$use_readdirp" ]; then
+ cmd_line=$(echo "$cmd_line --use-readdirp=$use_readdirp");
fi
if [ -n "$volume_name" ]; then
@@ -152,24 +214,23 @@ start_glusterfs ()
fi
if [ -n "$gid_timeout" ]; then
- cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
+ cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
fi
if [ -n "$bg_qlen" ]; then
- cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
+ cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
fi
if [ -n "$cong_threshold" ]; then
- cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
+ cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
fi
if [ -n "$fuse_mountopts" ]; then
- cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
fi
if [ -n "$xlator_option" ]; then
- xlator_option=$(echo $xlator_option | sed s/"xlator-option="/"--xlator-option "/g)
- cmd_line=$(echo "$cmd_line $xlator_option");
+ cmd_line=$(echo "$cmd_line --xlator-option=$xlator_option");
fi
# for rdma volume, we have to fetch volfile with '.rdma' added
@@ -178,272 +239,394 @@ start_glusterfs ()
if [ -z "$volfile_loc" ]; then
if [ -n "$server_ip" ]; then
+
+ servers=$(parse_volfile_servers ${server_ip});
+ if [ -n "$servers" ]; then
+ for i in $(echo ${servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ else
+ warn "ERROR: No valid servers found on command line.. exiting"
+ print_usage
+ exit 1
+ fi
+
+ if [ -n "$backupvolfile_server" ]; then
+ if [ -z "$backup_volfile_servers" ]; then
+ is_valid_hostname ${backupvolfile_server};
+ if [ $? -eq 1 ]; then
+ warn "ERROR: Invalid backup server specified.. exiting"
+ exit 1
+ fi
+ cmd_line=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
+ fi
+ fi
+
+ if [ -n "$backup_volfile_servers" ]; then
+ backup_servers=$(parse_backup_volfile_servers ${backup_volfile_servers})
+ for i in $(echo ${backup_servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ fi
+
if [ -n "$server_port" ]; then
cmd_line=$(echo "$cmd_line --volfile-server-port=$server_port");
fi
- if [ -n "$transport" ]; then
+
+ if [ -n "$transport" ]; then
cmd_line=$(echo "$cmd_line --volfile-server-transport=$transport");
if [ "$transport" = "rdma" ]; then
volume_id_rdma=".rdma";
fi
fi
+
if [ -n "$volume_id" ]; then
if [ -n "$volume_id_rdma" ]; then
volume_id="$volume_id$volume_id_rdma";
fi
cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
fi
-
- if [ -n "$backupvolfile_server" ]; then
- cmd_line1=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
- fi
- if [ -n "$volfile_max_fetch_attempts" ]; then
- cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts");
- fi
- cmd_line=$(echo "$cmd_line --volfile-server=$server_ip");
fi
else
cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
fi
if [ -n "$fuse_mountopts" ]; then
- cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
fi
cmd_line=$(echo "$cmd_line $mount_point");
- err=0;
- $cmd_line;
+ $cmd_line;
inode=$( ${getinode} $mount_point 2>/dev/null);
-
# this is required if the stat returns error
- if [ -z "$inode" ]; then
- inode="0";
- fi
-
- # retry the failover
- # if [ $? != "0" ]; then # <--- TODO: Once glusterfs returns proper error code, change it.
- if [ $inode -ne 1 ]; then
- err=1;
- if [ -n "$cmd_line1" ]; then
- cmd_line1=$(echo "$cmd_line1 $mount_point");
- $cmd_line1;
- err=0;
-
- inode=$( ${getinode} $mount_point 2>/dev/null);
- # this is required if the stat returns error
- if [ -z "$inode" ]; then
- inode="0";
- fi
- if [ $inode -ne 1 ]; then
- err=1;
- fi
- fi
- fi
-
- if [ $err -eq "1" ]; then
- echo "Mount failed. Please check the log file for more details."
- umount $mount_point > /dev/null 2>&1;
- exit 1;
+ if [ $? -ne 0 ]; then
+ warn "Mount failed. Please check the log file for more details."
+ umount $mount_point > /dev/null 2>&1;
+ exit 1;
fi
}
-usage ()
+print_usage ()
{
-
-echo "Usage: mount.glusterfs <volumeserver>:<volumeid/volumeport> -o <options> <mountpoint>
+cat << EOF
+Usage: $0 <volumeserver>:<volumeid/volumeport> -o<options> <mountpoint>
Options:
-man 8 mount.glusterfs
-
-To display the version number of the mount helper:
-mount.glusterfs --version"
-
+man 8 $0
+To display the version number of the mount helper: $0 -V
+EOF
}
# check for recursive mounts. i.e, mounting over an existing brick
check_recursive_mount ()
{
- if [ $2 = "/" ]; then
- echo Cannot mount over root;
+ if [ $1 = "/" ]; then
+ warn "Cannot mount over root";
exit 2;
fi
+
# GFID check first
# remove trailing / from mount point
- mnt_dir=${2%/};
-
- export PATH;
- # check whether getfattr exists
- which getfattr > /dev/null 2>&1;
- if [ $? -ne 0 ]; then
- return;
- fi
+ mnt_dir=${1%/};
- getfattr -n trusted.gfid $mnt_dir 2>/dev/null | grep -iq "trusted.gfid=";
- if [ $? -eq 0 ]; then
- echo "ERROR: $mnt_dir is in use as a brick of a gluster volume";
- exit 2;
+ if [ -n ${getfattr} ]; then
+ ${getfattr} -n trusted.gfid $mnt_dir 2>/dev/null | grep -iq "trusted.gfid=";
+ if [ $? -eq 0 ]; then
+ warn "ERROR: $mnt_dir is in use as a brick of a gluster volume";
+ exit 2;
+ fi
fi
# check if the mount point is a brick's parent directory
GLUSTERD_WORKDIR="/var/lib/glusterd";
- ls -L "$GLUSTERD_WORKDIR"/vols/*/bricks/* > /dev/null 2>&1;
+ lsL "$GLUSTERD_WORKDIR"/vols/*/bricks/* > /dev/null 2>&1;
if [ $? -ne 0 ]; then
return;
fi
- brick_path=`grep ^path "$GLUSTERD_WORKDIR"/vols/*/bricks/* | cut -d "=" -f 2`;
+ brick_path=`grep ^path "$GLUSTERD_WORKDIR"/vols/*/bricks/* 2>/dev/null | cut -d "=" -f 2`;
root_inode=`${lgetinode} /`;
root_dev=`${lgetdev} /`;
mnt_inode=`${lgetinode} $mnt_dir`;
mnt_dev=`${lgetdev} $mnt_dir`;
- for brick in "$brick_path";
- do
+ for brick in "$brick_path"; do
# evaluate brick path to see if this is local, if non-local, skip iteration
ls $brick > /dev/null 2>&1;
if [ $? -ne 0 ]; then
continue;
fi
- getfattr -n trusted.gfid "$brick" 2>/dev/null | grep -iq "trusted.gfid=";
- if [ $? -ne 0 ]; then
- continue;
+
+ if [ -n ${getfattr} ]; then
+ ${getfattr} -n trusted.gfid "$brick" 2>/dev/null | grep -iq "trusted.gfid=";
+ if [ $? -eq 0 ]; then
+ # brick is local
+ while [ 1 ]; do
+ tmp_brick="$brick";
+ brick="$brick"/..;
+ brick_dev=`${lgetdev} $brick`;
+ brick_inode=`${lgetinode} $brick`;
+ if [ "$mnt_inode" -eq "$brick_inode" \
+ -a "$mnt_dev" -eq "$brick_dev" ]; then
+ warn "ERROR: ${mnt_dir} is a parent of the brick ${tmp_brick}";
+ exit 2;
+ fi
+ [ "$root_inode" -ne "$brick_inode" \
+ -o "$root_dev" -ne "$brick_dev" ] || break;
+ done;
+ else
+ continue;
+ fi
else
- # brick is local
- while [ 1 ];
- do
- tmp_brick="$brick";
- brick="$brick"/..;
- brick_dev=`${lgetdev} $brick`;
- brick_inode=`${lgetinode} $brick`;
- if [ "$mnt_inode" -eq "$brick_inode" -a "$mnt_dev" -eq "$brick_dev" ]; then
- echo ERROR: $mnt_dir is a parent of the brick $tmp_brick;
- exit 2;
- fi
- [ "$root_inode" -ne "$brick_inode" -o "$root_dev" -ne "$brick_dev" ] || break;
- done;
+ continue;
fi
done;
}
-main ()
+with_options()
{
- helper=$(echo "$@" | sed -n 's/.*\--[ ]*\([^ ]*\).*/\1/p');
-
- in_opt="no"
- pos_args=0
- for opt in "$@"; do
- if [ "$in_opt" = "yes" ]; then
- for pair in $(echo "$opt" | tr "," " "); do
- # Handle options without values.
- case "$pair" in
- "ro") read_only=1 ;;
- "acl") acl=1 ;;
- "selinux") selinux=1 ;;
- "worm") worm=1 ;;
- "fopen-keep-cache") fopen_keep_cache=1 ;;
- "enable-ino32") enable_ino32=1 ;;
- "mem-accounting") mem_accounting=1;;
- # "mount -t glusterfs" sends this, but it's useless.
- "rw") ;;
- # these ones are interpreted during system initialization
- "noauto") ;;
- "_netdev") ;;
- *)
- key=$(echo "$pair" | cut -f1 -d'=');
- value=$(echo "$pair" | cut -f2- -d'=');
-
- # Handle options with values.
- case "$key" in
- "log-level") log_level_str=$value ;;
- "log-file") log_file=$value ;;
- "transport") transport=$value ;;
- "direct-io-mode") direct_io_mode=$value ;;
- "volume-name") volume_name=$value ;;
- "volume-id") volume_id=$value ;;
- "volfile-check") volfile_check=$value ;;
- "server-port") server_port=$value ;;
- "fetch-attempts")
- volfile_max_fetch_attempts=$value ;;
- "backupvolfile-server")
- backupvolfile_server=$value ;;
- "attribute-timeout")
- attribute_timeout=$value ;;
- "entry-timeout") entry_timeout=$value ;;
- "negative-timeout") negative_timeout=$value ;;
- "gid-timeout") gid_timeout=$value ;;
- "background-qlen") bg_qlen=$value ;;
- "congestion-threshold") cong_threshold=$value ;;
- "xlator-option") xlator_option=$xlator_option" "$pair ;;
- "fuse-mountopts") fuse_mountopts=$value ;;
- *)
- # Passthru
- [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts,"
- fuse_mountopts="$fuse_mountopts$pair"
- ;;
- esac
- esac
- done
- in_opt="no"
- elif [ "$opt" = "-o" ]; then
- in_opt="yes"
+ local key=$1
+ local value=$2
+
+ # Handle options with values.
+ case "$key" in
+ "log-level")
+ log_level_str=$value
+ ;;
+ "log-file")
+ log_file=$value
+ ;;
+ "transport")
+ transport=$value
+ ;;
+ "direct-io-mode")
+ direct_io_mode=$value
+ ;;
+ "volume-name")
+ volume_name=$value
+ ;;
+ "volume-id")
+ volume_id=$value
+ ;;
+ "volfile-check")
+ volfile_check=$value
+ ;;
+ "server-port")
+ server_port=$value
+ ;;
+ "attribute-timeout")
+ attribute_timeout=$value
+ ;;
+ "entry-timeout")
+ entry_timeout=$value
+ ;;
+ "negative-timeout")
+ negative_timeout=$value
+ ;;
+ "gid-timeout")
+ gid_timeout=$value
+ ;;
+ "background-qlen")
+ bg_qlen=$value
+ ;;
+ "backup-volfile-servers")
+ backup_volfile_servers=$value
+ ;;
+ "backupvolfile-server")
+ backupvolfile_server=$value
+ ;;
+ "fetch-attempts")
+ volfile_max_fetch_attempts=$value
+ ;;
+ "congestion-threshold")
+ cong_threshold=$value
+ ;;
+ "xlator-option")
+ xlator_option=$value
+ ;;
+ "fuse-mountopts")
+ fuse_mountopts=$value
+ ;;
+ "use-readdirp")
+ use_readdirp=$value
+ ;;
+ "no-root-squash")
+ if [ $value == "yes" ] ||
+ [ $value == "on" ] ||
+ [ $value == "enable" ] ||
+ [ $value == "true" ] ; then
+ no_root_squash=1;
+ fi ;;
+ "root-squash")
+ if [ $value == "no" ] ||
+ [ $value == "off" ] ||
+ [ $value == "disable" ] ||
+ [ $value == "false" ] ; then
+ no_root_squash=1;
+ fi ;;
+ *)
+ warn "Invalid option: $key"
+ exit 1
+ ;;
+ esac
+}
+
+without_options()
+{
+ local option=$1
+ # Handle options without values.
+ case "$option" in
+ "ro")
+ read_only=1
+ ;;
+ "acl")
+ acl=1
+ ;;
+ "selinux")
+ selinux=1
+ ;;
+ "worm")
+ worm=1
+ ;;
+ "fopen-keep-cache")
+ fopen_keep_cache=1
+ ;;
+ "enable-ino32")
+ enable_ino32=1
+ ;;
+ "mem-accounting")
+ mem_accounting=1
+ ;;
+ "aux-gfid-mount")
+ if [ ${uname_s} = "Linux" ]; then
+ aux_gfid_mount=1
+ fi
+ ;;
+ # "mount -t glusterfs" sends this, but it's useless.
+ "rw")
+ ;;
+ # these ones are interpreted during system initialization
+ "noauto")
+ ;;
+ "_netdev")
+ ;;
+ *)
+ warn "Invalid option $option";
+ exit 1
+ ;;
+ esac
+}
+
+parse_options()
+{
+ local optarg=${1}
+ for pair in $(echo ${optarg//,/ }); do
+ key=$(echo "$pair" | cut -f1 -d'=');
+ value=$(echo "$pair" | cut -f2- -d'=');
+ if [ "$key" = "$value" ]; then
+ without_options $pair;
else
- case $pos_args in
- 0) volfile_loc=$opt ;;
- 1) mount_point=$opt ;;
- *) echo "extra arguments at end (ignored)" ;;
- esac
- pos_args=$((pos_args+1))
+ with_options $key $value;
fi
done
- if [ $in_opt = "yes" -o $pos_args -lt 2 ]; then
- usage
- exit 1
- fi
+}
+
+update_updatedb()
+{
+ # Append fuse.glusterfs to PRUNEFS variable in updatedb.conf(5).
+ # updatedb(8) should not index files under GlusterFS, indexing
+ # GlusterFS is not necessary and should be avoided.
+ # Following code disables updatedb crawl on 'glusterfs'
+ test -f $UPDATEDBCONF && {
+ if ! grep -q 'glusterfs' $UPDATEDBCONF; then
+ sed 's/\(PRUNEFS.*\)"/\1 fuse.glusterfs"/' $UPDATEDBCONF \
+ > ${UPDATEDBCONF}.bak
+ mv -f ${UPDATEDBCONF}.bak $UPDATEDBCONF
+ fi
+ }
+}
+
+main ()
+{
+
+ volfile_loc=$1
+ mount_point=$2
+
+ ## `mount` specifies options as a last argument
+ shift 2;
+ while getopts "Vo:h" opt; do
+ case "${opt}" in
+ o)
+ parse_options ${OPTARG};
+ ;;
+ V)
+ ${cmd_line} -V;
+ exit 0;
+ ;;
+ h)
+ print_usage;
+ exit 0;
+ ;;
+ ?)
+ print_usage;
+ exit 0;
+ ;;
+ esac
+ done
[ -r "$volfile_loc" ] || {
server_ip=$(echo "$volfile_loc" | sed -n 's/\([a-zA-Z0-9:.\-]*\):.*/\1/p');
- test_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
- [ -n "$test_str" ] && {
- volume_id="$test_str";
+ volume_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
+ [ -n "$volume_str" ] && {
+ volume_id="$volume_str";
}
- volfile_loc="";
+ volfile_loc="";
+ }
+
+ [ -z "$volume_id" -o -z "$server_ip" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: Server name/volume name unspecified cannot proceed further..
+Please specify correct format
+Usage:
+man 8 $0
+EOF
+ exit 1;
}
- #
- [ -n "$helper" ] && {
- cmd_line=$(echo "$cmd_line --$helper");
- exec $cmd_line;
- exit 0;
+ grep_ret=$(echo ${mount_point} | grep '^\-o');
+ [ "x" != "x${grep_ret}" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: -o options cannot be specified in either first two arguments..
+Please specify correct style
+Usage:
+man 8 $0
+EOF
+ exit 1;
}
# No need to do a ! -d test, it is taken care while initializing the
# variable mount_point
[ -z "$mount_point" -o ! -d "$mount_point" ] && {
- echo "ERROR: Mount point does not exist."
- usage;
- exit 0;
+ cat <<EOF >/dev/stderr
+ERROR: Mount point does not exist
+Please specify a mount point
+Usage:
+man 8 $0
+EOF
+ exit 1;
}
# Simple check to avoid multiple identical mounts
if grep -q "[[:space:]+]${mount_point}[[:space:]+]fuse" $mounttab; then
- echo -n "$0: according to mtab, GlusterFS is already mounted on "
- echo "$mount_point"
- exit 0;
+ warn "$0: according to mtab, GlusterFS is already mounted on" \
+ "$mount_point"
+ exit 32;
fi
- check_recursive_mount "$@";
+ check_recursive_mount "$mount_point";
- # Append fuse.glusterfs to PRUNEFS variable in updatedb.conf(5). updatedb(8)
- # should not index files under GlusterFS, indexing will slow down GlusteFS
- # if the filesystem is several TB in size.
- test -f $UPDATEDBCONF && {
- if ! grep -q 'glusterfs' $UPDATEDBCONF; then
- sed 's/\(PRUNEFS.*\)"/\1 fuse.glusterfs"/' $UPDATEDBCONF \
- > ${UPDATEDBCONF}.bak
- mv -f ${UPDATEDBCONF}.bak $UPDATEDBCONF
- fi
- }
+ update_updatedb;
start_glusterfs;
}
diff --git a/xlators/mount/fuse/utils/mount_glusterfs.in b/xlators/mount/fuse/utils/mount_glusterfs.in
index b12b4e04e..539b0f558 100755
--- a/xlators/mount/fuse/utils/mount_glusterfs.in
+++ b/xlators/mount/fuse/utils/mount_glusterfs.in
@@ -1,188 +1,538 @@
#!/bin/sh
-# (C) 2008 Gluster Inc. <http://www.gluster.com>
-#
+# (C) 2014 Red Hat Inc. <http://www.redhat.com>
+#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of
# the License, or (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA
+warn ()
+{
+ echo "$@" >/dev/stderr
+}
_init ()
{
+
# log level definitions
LOG_NONE=NONE;
LOG_CRITICAL=CRITICAL;
LOG_ERROR=ERROR;
LOG_WARNING=WARNING;
- LOG_INFO=INFO;
+ LOG_INFO=INFO
LOG_DEBUG=DEBUG;
+ LOG_TRACE=TRACE;
- # set default log level to ERROR
- log_level=$LOG_INFO;
-}
+ HOST_NAME_MAX=64;
-start_glusterfs ()
-{
prefix="@prefix@";
exec_prefix=@exec_prefix@;
cmd_line=$(echo "@sbindir@/glusterfs");
-
+
+ alias lsL='ls -L'
+ uname_s=`uname -s`
+ case ${uname_s} in
+ Darwin)
+ getinode="stat -f %i"
+ getdev="stat -f %d"
+ ;;
+ esac
+}
+
+is_valid_hostname ()
+{
+ local server=$1
+
+ length=$(echo $server | wc -c)
+ if [ ${length} -gt ${HOST_NAME_MAX} ]; then
+ return 1
+ fi
+}
+
+parse_backup_volfile_servers ()
+{
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/\:/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+
+ echo ${new_servers}
+}
+
+parse_volfile_servers ()
+{
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/,/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+
+ echo ${new_servers}
+}
+
+start_glusterfs ()
+{
if [ -n "$log_level_str" ]; then
- case "$log_level_str" in
- "ERROR")
- log_level=$LOG_ERROR;
- ;;
+ case "$( echo $log_level_str | awk '{print toupper($0)}')" in
+ "ERROR")
+ log_level=$LOG_ERROR;
+ ;;
"INFO")
log_level=$LOG_INFO;
;;
- "DEBUG")
- log_level=$LOG_DEBUG;
- ;;
- "CRITICAL")
- log_level=$LOG_CRITICAL;
- ;;
- "WARNING")
- log_level=$LOG_WARNING;
- ;;
- "NONE")
- log_level=$LOG_NONE;
- ;;
- *)
- echo "invalid log level $log_level_str, using INFO";
- log_level=$LOG_INFO;
- ;;
- esac
- fi
- cmd_line=$(echo "$cmd_line --log-level=$log_level");
-
- if [ -n "$log_file" ]; then
- cmd_line=$(echo "$cmd_line --log-file=$log_file");
+ "DEBUG")
+ log_level=$LOG_DEBUG;
+ ;;
+ "CRITICAL")
+ log_level=$LOG_CRITICAL;
+ ;;
+ "WARNING")
+ log_level=$LOG_WARNING;
+ ;;
+ "TRACE")
+ log_level=$LOG_TRACE;
+ ;;
+ "NONE")
+ log_level=$LOG_NONE;
+ ;;
+ *)
+ warn "invalid log level $log_level_str, using INFO";
+ log_level=$LOG_INFO;
+ ;;
+ esac
+ fi
+
+ # options without values start here
+ if [ -n "$read_only" ]; then
+ cmd_line=$(echo "$cmd_line --read-only");
+ fi
+
+ if [ -n "$acl" ]; then
+ cmd_line=$(echo "$cmd_line --acl");
+ fi
+
+ if [ -n "$selinux" ]; then
+ cmd_line=$(echo "$cmd_line --selinux");
+ fi
+
+ if [ -n "$enable_ino32" ]; then
+ cmd_line=$(echo "$cmd_line --enable-ino32");
+ fi
+
+ if [ -n "$worm" ]; then
+ cmd_line=$(echo "$cmd_line --worm");
+ fi
+ if [ -n "$volfile_max_fetch_attempts" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts")
+ fi
+
+ if [ -n "$fopen_keep_cache" ]; then
+ cmd_line=$(echo "$cmd_line --fopen-keep-cache");
fi
if [ -n "$volfile_check" ]; then
- cmd_line=$(echo "$cmd_line --volfile-check");
+ cmd_line=$(echo "$cmd_line --volfile-check");
+ fi
+
+ if [ -n "$mem_accounting" ]; then
+ cmd_line=$(echo "$cmd_line --mem-accounting");
+ fi
+
+ if [ -n "$aux_gfid_mount" ]; then
+ cmd_line=$(echo "$cmd_line --aux-gfid-mount");
+ fi
+
+ if [ -n "$no_root_squash" ]; then
+ cmd_line=$(echo "$cmd_line --no-root-squash");
+ fi
+
+#options with values start here
+ if [ -n "$log_level" ]; then
+ cmd_line=$(echo "$cmd_line --log-level=$log_level");
+ fi
+
+ if [ -n "$log_file" ]; then
+ cmd_line=$(echo "$cmd_line --log-file=$log_file");
fi
if [ -n "$direct_io_mode" ]; then
- cmd_line=$(echo "$cmd_line --disable-direct-io-mode");
+ cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
fi
-
- if [ -z "$volfile_loc" ]; then
- if [ -n "$transport" ]; then
- cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip \
---volfile-server-transport=$transport");
- else
- cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip");
- fi
- else
- cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+
+ if [ -n "$mac_compat" ]; then
+ cmd_line=$(echo "$cmd_line --mac-compat=$mac_compat");
+ fi
+
+ if [ -n "$use_readdirp" ]; then
+ cmd_line=$(echo "$cmd_line --use-readdirp=$use_readdirp");
fi
if [ -n "$volume_name" ]; then
cmd_line=$(echo "$cmd_line --volume-name=$volume_name");
fi
-
- if [ -n "$volume_id" ]; then
- cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+
+ if [ -n "$attribute_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --attribute-timeout=$attribute_timeout");
+ fi
+
+ if [ -n "$entry_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --entry-timeout=$entry_timeout");
+ fi
+
+ if [ -n "$negative_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --negative-timeout=$negative_timeout");
+ fi
+
+ if [ -n "$gid_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
+ fi
+
+ if [ -n "$bg_qlen" ]; then
+ cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
+ fi
+
+ if [ -n "$cong_threshold" ]; then
+ cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
+ fi
+
+ if [ -n "$fuse_mountopts" ]; then
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+ fi
+
+ if [ -n "$xlator_option" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option=$xlator_option");
+ fi
+
+ # for rdma volume, we have to fetch volfile with '.rdma' added
+ # to volume name, so that it fetches the right client vol file
+ volume_id_rdma="";
+
+ if [ -z "$volfile_loc" ]; then
+ if [ -n "$server_ip" ]; then
+
+ servers=$(parse_volfile_servers ${server_ip});
+ if [ -n "$servers" ]; then
+ for i in $(echo ${servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ else
+ warn "ERROR: No valid servers found on command line.. exiting"
+ print_usage
+ exit 1
+ fi
+
+ if [ -n "$backupvolfile_server" ]; then
+ if [ -z "$backup_volfile_servers" ]; then
+ is_valid_hostname ${backupvolfile_server};
+ if [ $? -eq 1 ]; then
+ warn "ERROR: Invalid backup server specified.. exiting"
+ exit 1
+ fi
+ cmd_line=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
+ fi
+ fi
+
+ if [ -n "$backup_volfile_servers" ]; then
+ backup_servers=$(parse_backup_volfile_servers ${backup_volfile_servers})
+ for i in $(echo ${backup_servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ fi
+
+ if [ -n "$server_port" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-server-port=$server_port");
+ fi
+
+ if [ -n "$transport" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-server-transport=$transport");
+ if [ "$transport" = "rdma" ]; then
+ volume_id_rdma=".rdma";
+ fi
+ fi
+
+ if [ -n "$volume_id" ]; then
+ if [ -n "$volume_id_rdma" ]; then
+ volume_id="$volume_id$volume_id_rdma";
+ fi
+ cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+ fi
+ fi
+ else
+ cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+ fi
+
+ if [ -n "$fuse_mountopts" ]; then
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
fi
cmd_line=$(echo "$cmd_line $mount_point");
- exec $cmd_line;
+ $cmd_line;
+
+ if [ $? -ne 0 ]; then
+ exit 1;
+ fi
}
+print_usage ()
+{
+cat << EOF >/dev/stderr
+Usage: $0 <volumeserver>:<volumeid/volumeport> -o<options> <mountpoint>
+Options:
+man 8 $0
+To display the version number of the mount helper: $0 -V
+EOF
+}
-main ()
+with_options()
{
-
- new_log_level=""
- log_file=""
- transport=""
- direct_io_mode=""
- volume_name=""
- new_fs_options=""
- volfile_check=""
-
- while getopts o: opt; do
- case "$opt" in
- o)
- options=$(echo $OPTARG | sed -n 's/.*\-o[ ]*\([^ ]*\).*/\1/p');
- [ -z $new_log_level ] && {
- new_log_level=$(echo "$options" | sed -n 's/.*log-level=\([^,]*\).*/\1/p');
- }
-
- [ -z $log_file ] && {
- log_file=$(echo "$options" | sed -n 's/.*log-file=\([^,]*\).*/\1/p');
- }
-
- [ -z $transport ] && {
- transport=$(echo "$options" | sed -n 's/.*transport=\([^,]*\).*/\1/p');
- }
-
- [ -z $direct_io_mode ] && {
- direct_io_mode=$(echo "$options" | sed -n 's/.*direct-io-mode=\([^,]*\).*/\1/p');
- }
-
- [ -z $volfile_check ] && {
- volfile_check=$(echo "$options" | sed -n 's/.*volfile-check=\([^,]*\).*/\1/p');
- }
-
- [ -z $volume_name ] && {
- volume_name=$(echo "$options" | sed -n 's/.*volume-name=\([^,]*\).*/\1/p');
- }
-
- [ -z $volume_id ] && {
- volume_id=$(echo "$options" | sed -n 's/.*volume-id=\([^,]*\).*/\1/p');
- }
-
- this_option=$(echo "$options" | sed -e 's/[,]*log-file=[^,]*//' \
- -e 's/[,]*log-level=[^,]*//' \
- -e 's/[,]*volume-name=[^,]*//' \
- -e 's/[,]*volfile-check=[^,]*//' \
- -e 's/[,]*direct-io-mode=[^,]*//' \
- -e 's/[,]*transport=[^,]*//' \
- -e 's/[,]*volume-id=[^,]*//');
- new_fs_options="$new_fs_options $this_option";
- ;;
- esac
+ local key=$1
+ local value=$2
+
+ # Handle options with values.
+ case "$key" in
+ "log-level")
+ log_level_str=$value
+ ;;
+ "log-file")
+ log_file=$value
+ ;;
+ "transport")
+ transport=$value
+ ;;
+ "direct-io-mode")
+ direct_io_mode=$value
+ ;;
+ "mac-compat")
+ mac_compat=$value
+ ;;
+ "volume-name")
+ volume_name=$value
+ ;;
+ "volume-id")
+ volume_id=$value
+ ;;
+ "volfile-check")
+ volfile_check=$value
+ ;;
+ "server-port")
+ server_port=$value
+ ;;
+ "attribute-timeout")
+ attribute_timeout=$value
+ ;;
+ "entry-timeout")
+ entry_timeout=$value
+ ;;
+ "negative-timeout")
+ negative_timeout=$value
+ ;;
+ "gid-timeout")
+ gid_timeout=$value
+ ;;
+ "background-qlen")
+ bg_qlen=$value
+ ;;
+ "backup-volfile-servers")
+ backup_volfile_servers=$value
+ ;;
+ "backupvolfile-server")
+ backupvolfile_server=$value
+ ;;
+ "fetch-attempts")
+ volfile_max_fetch_attempts=$value
+ ;;
+ "congestion-threshold")
+ cong_threshold=$value
+ ;;
+ "xlator-option")
+ xlator_option=$value
+ ;;
+ "fuse-mountopts")
+ fuse_mountopts=$value
+ ;;
+ "use-readdirp")
+ use_readdirp=$value
+ ;;
+ "no-root-squash")
+ if [ $value == "yes" ] ||
+ [ $value == "on" ] ||
+ [ $value == "enable" ] ||
+ [ $value == "true" ] ; then
+ no_root_squash=1;
+ fi ;;
+ "root-squash")
+ if [ $value == "no" ] ||
+ [ $value == "off" ] ||
+ [ $value == "disable" ] ||
+ [ $value == "false" ] ; then
+ no_root_squash=1;
+ fi ;;
+ *)
+ warn "Invalid option: $key"
+ exit 1
+ ;;
+ esac
+}
+
+without_options()
+{
+ local option=$1
+ # Handle options without values.
+ case "$option" in
+ "ro")
+ read_only=1
+ ;;
+ "acl")
+ acl=1
+ ;;
+ "selinux")
+ selinux=1
+ ;;
+ "worm")
+ worm=1
+ ;;
+ "fopen-keep-cache")
+ fopen_keep_cache=1
+ ;;
+ "enable-ino32")
+ enable_ino32=1
+ ;;
+ "mem-accounting")
+ mem_accounting=1
+ ;;
+ "aux-gfid-mount")
+ if [ ${uname_s} = "Linux" ]; then
+ aux_gfid_mount=1
+ fi
+ ;;
+ # "mount -t glusterfs" sends this, but it's useless.
+ "rw")
+ ;;
+ # these ones are interpreted during system initialization
+ "noauto")
+ ;;
+ "_netdev")
+ ;;
+ *)
+ warn "Invalid option $option";
+ exit 1
+ ;;
+ esac
+}
+
+parse_options()
+{
+ local optarg=${1}
+ for pair in $(echo ${optarg//,/ }); do
+ key=$(echo "$pair" | cut -f1 -d'=');
+ value=$(echo "$pair" | cut -f2- -d'=');
+ if [ "$key" = "$value" ]; then
+ without_options $pair;
+ else
+ with_options $key $value;
+ fi
done
+}
- [ -n "$new_log_level" ] && {
- log_level_str="$new_log_level";
- }
+main ()
+{
+ ## `mount` on OSX specifies options as first argument
+ if [[ $1 =~ "-o" ]]; then
+ volfile_loc=$3
+ mount_point=$4
+ else
+ volfile_loc=$1
+ mount_point=$2
+ fi
- # TODO: use getopt. This is very much darwin specific
- volfile_loc="$1";
- while [ "$volfile_loc" = "-o" ] ; do
- shift ;
- shift ;
- volfile_loc="$1";
+ while getopts "Vo:h" opt; do
+ case "${opt}" in
+ o)
+ parse_options ${OPTARG};
+ ;;
+ V)
+ ${cmd_line} -V;
+ exit 0;
+ ;;
+ h)
+ print_usage;
+ exit 0;
+ ;;
+ ?)
+ print_usage;
+ exit 0;
+ ;;
+ esac
done
-
+
[ -r "$volfile_loc" ] || {
server_ip=$(echo "$volfile_loc" | sed -n 's/\([a-zA-Z0-9:.\-]*\):.*/\1/p');
- volume_id=$(echo "$volfile_loc" | sed -n 's/[a-zA-Z0-9:.\-]*:\(.*\)/\1/p');
- volfile_loc="";
+ volume_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
+ [ -n "$volume_str" ] && {
+ volume_id="$volume_str";
+ }
+ volfile_loc="";
+ }
+
+ [ -z "$volume_id" -o -z "$server_ip" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: Server name/volume name unspecified cannot proceed further..
+Please specify correct format
+Usage:
+man 8 $0
+EOF
+ exit 1;
+ }
+
+ grep_ret=$(echo ${mount_point} | grep '^\-o');
+ [ "x" != "x${grep_ret}" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: -o options cannot be specified in either first two arguments..
+Please specify correct style
+Usage:
+man 8 $0
+EOF
+ exit 1;
+ }
+
+ # No need to do a ! -d test, it is taken care while initializing the
+ # variable mount_point
+ [ -z "$mount_point" -o ! -d "$mount_point" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: Mount point does not exist
+Please specify a mount point
+Usage:
+man 8 $0
+EOF
+ exit 1;
}
- # following line is product of love towards sed
- # $2=$(echo "$@" | sed -n 's/[^ ]* \([^ ]*\).*/\1/p');
-
- mount_point="$2";
- fs_options=$(echo "$fs_options,$new_fs_options");
-
start_glusterfs;
}
diff --git a/xlators/nfs/server/src/Makefile.am b/xlators/nfs/server/src/Makefile.am
index 2795a935d..62fbf6535 100644
--- a/xlators/nfs/server/src/Makefile.am
+++ b/xlators/nfs/server/src/Makefile.am
@@ -14,7 +14,7 @@ noinst_HEADERS = nfs.h nfs-common.h nfs-fops.h nfs-inodes.h nfs-generics.h \
AM_CPPFLAGS = $(GF_CPPFLAGS) \
-DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \
-I$(top_srcdir)/libglusterfs/src \
- -I$(nfsrpclibdir) -I$(CONTRIBDIR)/rbtree\
+ -I$(nfsrpclibdir) -I$(CONTRIBDIR)/rbtree \
-I$(top_srcdir)/rpc/xdr/src/
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/nfs/server/src/acl3.c b/xlators/nfs/server/src/acl3.c
index ed60775ab..43156eb44 100644
--- a/xlators/nfs/server/src/acl3.c
+++ b/xlators/nfs/server/src/acl3.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ * Copyright (c) 2012-2013 Red Hat, Inc. <http://www.redhat.com>
* This file is part of GlusterFS.
*
* This file is licensed to you under your choice of the GNU Lesser
@@ -28,7 +28,15 @@
#include "nfs3-fh.h"
#include "nfs-generics.h"
#include "acl3.h"
+#include "byte-order.h"
+static int
+acl3_nfs_acl_to_xattr (aclentry *ace, void *xattrbuf,
+ int aclcount, int defacl);
+
+static int
+acl3_nfs_acl_from_xattr (aclentry *ace, void *xattrbuf,
+ int bufsize, int defacl);
typedef ssize_t (*acl3_serializer) (struct iovec outmsg, void *args);
@@ -58,7 +66,8 @@ nfs3_stat_to_fattr3 (struct iatt *buf);
#define acl3_validate_gluster_fh(handle, status, errlabel) \
do { \
if (!nfs3_fh_validate (handle)) { \
- status = NFS3ERR_SERVERFAULT; \
+ gf_log (GF_ACL, GF_LOG_ERROR, "Bad Handle"); \
+ status = NFS3ERR_BADHANDLE; \
goto errlabel; \
} \
} while (0) \
@@ -113,8 +122,10 @@ nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh);
xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
&cst->resolvefh); \
uuid_unparse (cst->resolvefh.gfid, gfid); \
- sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\
- xlatorp ? xlatorp->name : "ERR", gfid); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid); \
gf_log (GF_ACL, GF_LOG_ERROR, "Unable to resolve FH"\
": %s", buf); \
nfstat = nfs3_errno_to_nfsstat3 (cst->resolve_errno);\
@@ -138,10 +149,11 @@ nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh);
int
acl3svc_submit_reply (rpcsvc_request_t *req, void *arg, acl3_serializer sfunc)
{
- struct iovec outmsg = {0, };
- struct iobuf *iob = NULL;
- struct nfs3_state *nfs3 = NULL;
- int ret = -1;
+ struct iovec outmsg = {0, };
+ struct iobuf *iob = NULL;
+ struct nfs3_state *nfs3 = NULL;
+ int ret = -1;
+ ssize_t msglen = 0;
struct iobref *iobref = NULL;
if (!req)
@@ -166,7 +178,12 @@ acl3svc_submit_reply (rpcsvc_request_t *req, void *arg, acl3_serializer sfunc)
/* Use the given serializer to translate the give C structure in arg
* to XDR format which will be written into the buffer in outmsg.
*/
- outmsg.iov_len = sfunc (outmsg, arg);
+ msglen = sfunc (outmsg, arg);
+ if (msglen < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to encode message");
+ goto ret;
+ }
+ outmsg.iov_len = msglen;
iobref = iobref_new ();
if (iobref == NULL) {
@@ -174,7 +191,11 @@ acl3svc_submit_reply (rpcsvc_request_t *req, void *arg, acl3_serializer sfunc)
goto ret;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
@@ -208,74 +229,93 @@ acl3svc_null (rpcsvc_request_t *req)
}
int
-acl3_getacl_reply (nfs3_call_state_t *cs, getaclreply *reply)
+acl3_getacl_reply (rpcsvc_request_t *req, getaclreply *reply)
{
- acl3svc_submit_reply (cs->req, (void *)reply,
+ acl3svc_submit_reply (req, (void *)reply,
(acl3_serializer)xdr_serialize_getaclreply);
return 0;
}
+/* Serialize the SETACL reply with xdr_serialize_setaclreply and hand it to
+ * acl3svc_submit_reply() for the given request.
+ *
+ * The result of the submission is not propagated; this always returns 0.
+ */
+int
+acl3_setacl_reply (rpcsvc_request_t *req, setaclreply *reply)
+{
+        acl3_serializer encode = (acl3_serializer)xdr_serialize_setaclreply;
+
+        acl3svc_submit_reply (req, (void *)reply, encode);
+
+        return 0;
+}
+
int
acl3_getacl_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- nfsstat3 stat = NFS3ERR_SERVERFAULT;
- nfs3_call_state_t *cs = NULL;
- data_t *data = NULL;
- int *p = NULL;
- int i = 0;
- getaclreply *getaclreply = NULL;
-
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ nfs3_call_state_t *cs = NULL;
+ data_t *data = NULL;
+ getaclreply *getaclreply = NULL;
+ int aclcount = 0;
+ int defacl = 1; /* DEFAULT ACL */
+
+ if (!frame->local) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Invalid argument,"
+ " frame->local NULL");
+ return EINVAL;
+ }
cs = frame->local;
- if (cs)
- getaclreply = &cs->args.getaclreply;
-
- if (op_ret == -1) {
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ getaclreply = &cs->args.getaclreply;
+ if (op_ret < 0) {
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
getaclreply->aclentry.aclentry_val = cs->aclentry;
getaclreply->daclentry.daclentry_val = cs->daclentry;
- /* FIXME: use posix_acl_from_xattr() */
- data = dict_get (dict, "system.posix_acl_access");
- if (data && (p = data_to_bin (data))) {
- /* POSIX_ACL_XATTR_VERSION */
- p++;
- while ((char *)p < (data->data + data->len)) {
- getaclreply->aclentry.aclentry_val[i].type = *(*(short **)&p)++;
- getaclreply->aclentry.aclentry_val[i].perm = *(*(short **)&p)++;
- getaclreply->aclentry.aclentry_val[i].uid = *(*(int **)&p)++;
- i++;
+ /* getfacl: NFS USER ACL */
+ data = dict_get (dict, POSIX_ACL_ACCESS_XATTR);
+ if (data && data->data) {
+ aclcount = acl3_nfs_acl_from_xattr (cs->aclentry,
+ data->data,
+ data->len,
+ !defacl);
+ if (aclcount < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR,
+ "Failed to get USER ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclcount);
+ goto err;
}
- getaclreply->aclcount = getaclreply->aclentry.aclentry_len = i;
- }
- i = 0;
-
- data = dict_get (dict, "system.posix_acl_default");
- if (data && (p = data_to_bin (data))) {
- /* POSIX_ACL_XATTR_VERSION */
- p++;
- while ((char *)p < (data->data + data->len)) {
- getaclreply->daclentry.daclentry_val[i].type = *(*(short **)&p)++;
- getaclreply->daclentry.daclentry_val[i].perm = *(*(short **)&p)++;
- getaclreply->daclentry.daclentry_val[i].uid = *(*(int **)&p)++;
- i++;
+
+ getaclreply->aclcount = aclcount;
+ getaclreply->aclentry.aclentry_len = aclcount;
+ }
+
+ /* getfacl: NFS DEFAULT ACL */
+ data = dict_get (dict, POSIX_ACL_DEFAULT_XATTR);
+ if (data && data->data) {
+ aclcount = acl3_nfs_acl_from_xattr (cs->daclentry,
+ data->data,
+ data->len,
+ defacl);
+ if (aclcount < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR,
+ "Failed to get DEFAULT ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclcount);
+ goto err;
}
- getaclreply->daclcount = getaclreply->daclentry.daclentry_len = i;
+
+ getaclreply->daclcount = aclcount;
+ getaclreply->daclentry.daclentry_len = aclcount;
}
- acl3_getacl_reply (cs, getaclreply);
+ acl3_getacl_reply (cs->req, getaclreply);
nfs3_call_state_wipe (cs);
return 0;
err:
if (getaclreply)
getaclreply->status = stat;
- acl3_getacl_reply (cs, getaclreply);
+ acl3_getacl_reply (cs->req, getaclreply);
nfs3_call_state_wipe (cs);
return 0;
}
@@ -290,30 +330,39 @@ acl3_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
getaclreply *getaclreply = NULL;
int ret = -1;
nfs_user_t nfu = {0, };
+ uint64_t deviceid = 0;
+
+ if (!frame->local) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Invalid argument,"
+ " frame->local NULL");
+ return EINVAL;
+ }
cs = frame->local;
- if (cs)
- getaclreply = &cs->args.getaclreply;
+ getaclreply = &cs->args.getaclreply;
if (op_ret == -1) {
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
- getaclreply->attr_follows = 1;
+ /* Fill the attrs before xattrs */
+ getaclreply->attr_follows = TRUE;
+ deviceid = nfs3_request_xlator_deviceid (cs->req);
+ nfs3_map_deviceid_to_statdev (buf, deviceid);
getaclreply->attr = nfs3_stat_to_fattr3 (buf);
- getaclreply->mask = 0xf;
+
nfs_request_user_init (&nfu, cs->req);
- ret = nfs_getxattr (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, NULL, NULL,
- acl3_getacl_cbk, cs);
- if (ret == -1) {
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ ret = nfs_getxattr (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+ NULL, NULL, acl3_getacl_cbk, cs);
+ if (ret < 0) {
+ stat = nfs3_errno_to_nfsstat3 (-ret);
goto err;
}
return 0;
err:
getaclreply->status = stat;
- acl3_getacl_reply (cs, getaclreply);
+ acl3_getacl_reply (cs->req, getaclreply);
nfs3_call_state_wipe (cs);
return 0;
}
@@ -341,7 +390,7 @@ acl3err:
if (ret < 0) {
gf_log (GF_ACL, GF_LOG_ERROR, "unable to open_and_resume");
cs->args.getaclreply.status = nfs3_errno_to_nfsstat3 (stat);
- acl3_getacl_reply (cs, &cs->args.getaclreply);
+ acl3_getacl_reply (cs->req, &cs->args.getaclreply);
nfs3_call_state_wipe (cs);
}
@@ -360,6 +409,7 @@ acl3svc_getacl (rpcsvc_request_t *req)
nfsstat3 stat = NFS3ERR_SERVERFAULT;
struct nfs3_fh fh, *fhp = NULL;
getaclargs getaclargs;
+ getaclreply getaclreply;
if (!req)
return ret;
@@ -367,33 +417,39 @@ acl3svc_getacl (rpcsvc_request_t *req)
acl3_validate_nfs3_state (req, nfs3, stat, rpcerr, ret);
nfs = nfs_state (nfs3->nfsx);
memset (&getaclargs, 0, sizeof (getaclargs));
+ memset (&getaclreply, 0, sizeof (getaclreply));
getaclargs.fh.n_bytes = (char *)&fh;
if (xdr_to_getaclargs(req->msg[0], &getaclargs) <= 0) {
gf_log (GF_ACL, GF_LOG_ERROR, "Error decoding args");
rpcsvc_request_seterr (req, GARBAGE_ARGS);
goto rpcerr;
}
+
+ /* Validate ACL mask */
+ if (getaclargs.mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) {
+ stat = NFS3ERR_INVAL;
+ goto acl3err;
+ }
+
fhp = &fh;
acl3_validate_gluster_fh (&fh, stat, acl3err);
- acl3_map_fh_to_volume (nfs->nfs3state, fhp, req,
- vol, stat, acl3err);
+ acl3_map_fh_to_volume (nfs->nfs3state, fhp, req, vol, stat, acl3err);
+ acl3_volume_started_check (nfs3, vol, ret, rpcerr);
acl3_handle_call_state_init (nfs->nfs3state, cs, req,
- vol, stat, rpcerr);
+ vol, stat, acl3err);
cs->vol = vol;
- acl3_volume_started_check (nfs3, vol, ret, acl3err);
+ cs->args.getaclreply.mask = getaclargs.mask;
- ret = nfs3_fh_resolve_and_resume (cs, fhp,
- NULL, acl3_getacl_resume);
+ ret = nfs3_fh_resolve_and_resume (cs, fhp, NULL, acl3_getacl_resume);
+ stat = nfs3_errno_to_nfsstat3 (-ret);
acl3err:
if (ret < 0) {
gf_log (GF_ACL, GF_LOG_ERROR, "unable to resolve and resume");
- if (cs) {
- cs->args.getaclreply.status = stat;
- acl3_getacl_reply (cs, &cs->args.getaclreply);
- nfs3_call_state_wipe (cs);
- }
+ getaclreply.status = stat;
+ acl3_getacl_reply (req, &getaclreply);
+ nfs3_call_state_wipe (cs);
return 0;
}
@@ -409,11 +465,12 @@ acl3_setacl_cbk (call_frame_t *frame, void *cookie,
nfs3_call_state_t *cs = NULL;
cs = frame->local;
if (op_ret < 0) {
- cs->args.setaclreply.status = nfs3_errno_to_nfsstat3 (op_errno);
+ nfsstat3 status = nfs3_cbk_errno_status (op_ret, op_errno);
+ cs->args.setaclreply.status = status;
}
- acl3svc_submit_reply (cs->req, (void *)&cs->args.setaclreply,
- (acl3_serializer)xdr_serialize_setaclreply);
+ acl3_setacl_reply (cs->req, &cs->args.setaclreply);
+
return 0;
}
@@ -428,17 +485,18 @@ acl3_setacl_resume (void *carg)
if (!carg)
return ret;
-
cs = (nfs3_call_state_t *)carg;
acl3_check_fh_resolve_status (cs, stat, acl3err);
nfs_request_user_init (&nfu, cs->req);
xattr = dict_new();
if (cs->aclcount)
- ret = dict_set_static_bin (xattr, "system.posix_acl_access", cs->aclxattr,
- cs->aclcount * 8 + 4);
+ ret = dict_set_static_bin (xattr, POSIX_ACL_ACCESS_XATTR,
+ cs->aclxattr,
+ posix_acl_xattr_size (cs->aclcount));
if (cs->daclcount)
- ret = dict_set_static_bin (xattr, "system.posix_acl_default", cs->daclxattr,
- cs->daclcount * 8 + 4);
+ ret = dict_set_static_bin (xattr, POSIX_ACL_DEFAULT_XATTR,
+ cs->daclxattr,
+ posix_acl_xattr_size (cs->daclcount));
ret = nfs_setxattr (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, xattr,
0, NULL, acl3_setacl_cbk, cs);
@@ -449,8 +507,7 @@ acl3err:
stat = -ret;
gf_log (GF_ACL, GF_LOG_ERROR, "unable to open_and_resume");
cs->args.setaclreply.status = nfs3_errno_to_nfsstat3 (stat);
- acl3svc_submit_reply (cs->req, (void *)&cs->args.setaclreply,
- (acl3_serializer)xdr_serialize_setaclreply);
+ acl3_setacl_reply (cs->req, &cs->args.setaclreply);
nfs3_call_state_wipe (cs);
}
@@ -461,25 +518,38 @@ acl3err:
int
acl3svc_setacl (rpcsvc_request_t *req)
{
- xlator_t *vol = NULL;
+ xlator_t *vol = NULL;
struct nfs_state *nfs = NULL;
nfs3_state_t *nfs3 = NULL;
- nfs3_call_state_t *cs = NULL;
- int ret = RPCSVC_ACTOR_ERROR;
- nfsstat3 stat = NFS3ERR_SERVERFAULT;
- struct nfs3_fh fh;
+ nfs3_call_state_t *cs = NULL;
+ int ret = RPCSVC_ACTOR_ERROR;
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ struct nfs3_fh fh;
struct nfs3_fh *fhp = NULL;
- setaclargs setaclargs;
- aclentry aclentry[NFS_ACL_MAX_ENTRIES];
- struct aclentry daclentry[NFS_ACL_MAX_ENTRIES];
- int *p = NULL, i = 0;
+ setaclargs setaclargs;
+ setaclreply setaclreply;
+ aclentry *daclentry = NULL;
+ aclentry *aclentry = NULL;
+ int aclerrno = 0;
+ int defacl = 1;
if (!req)
return ret;
+ aclentry = GF_CALLOC (NFS_ACL_MAX_ENTRIES, sizeof(*aclentry),
+ gf_nfs_mt_arr);
+ if (!aclentry) {
+ goto rpcerr;
+ }
+ daclentry = GF_CALLOC (NFS_ACL_MAX_ENTRIES, sizeof(*daclentry),
+ gf_nfs_mt_arr);
+ if (!daclentry) {
+ goto rpcerr;
+ }
acl3_validate_nfs3_state (req, nfs3, stat, rpcerr, ret);
nfs = nfs_state (nfs3->nfsx);
memset (&setaclargs, 0, sizeof (setaclargs));
+ memset (&setaclreply, 0, sizeof (setaclreply));
memset (&fh, 0, sizeof (fh));
setaclargs.fh.n_bytes = (char *)&fh;
setaclargs.aclentry.aclentry_val = aclentry;
@@ -489,62 +559,72 @@ acl3svc_setacl (rpcsvc_request_t *req)
rpcsvc_request_seterr (req, GARBAGE_ARGS);
goto rpcerr;
}
+
+ /* Validate ACL mask */
+ if (setaclargs.mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) {
+ stat = NFS3ERR_INVAL;
+ goto acl3err;
+ }
+
fhp = &fh;
acl3_validate_gluster_fh (fhp, stat, acl3err);
- acl3_map_fh_to_volume (nfs->nfs3state, fhp, req,
- vol, stat, acl3err);
+ acl3_map_fh_to_volume (nfs->nfs3state, fhp, req, vol, stat, acl3err);
+ acl3_volume_started_check (nfs3, vol, ret, rpcerr);
acl3_handle_call_state_init (nfs->nfs3state, cs, req,
- vol, stat, rpcerr);
+ vol, stat, acl3err);
cs->vol = vol;
- acl3_volume_started_check (nfs3, vol, ret, rpcerr);
-
cs->aclcount = setaclargs.aclcount;
cs->daclcount = setaclargs.daclcount;
- if ((cs->aclcount > NFS_ACL_MAX_ENTRIES) ||
- (cs->daclcount > NFS_ACL_MAX_ENTRIES))
+ /* setfacl: NFS USER ACL */
+ aclerrno = acl3_nfs_acl_to_xattr (aclentry,
+ cs->aclxattr,
+ cs->aclcount,
+ !defacl);
+ if (aclerrno < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to set USER ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclerrno);
goto acl3err;
- /* FIXME: use posix_acl_to_xattr() */
- p = (int *)cs->aclxattr;
- *(*(int **)&p)++ = POSIX_ACL_XATTR_VERSION;
- for (i = 0; i < cs->aclcount; i++) {
- *(*(short **)&p)++ = aclentry[i].type;
- *(*(short **)&p)++ = aclentry[i].perm;
- *(*(int **)&p)++ = aclentry[i].uid;
- }
- p = (int *)cs->daclxattr;
- *(*(int **)&p)++ = POSIX_ACL_XATTR_VERSION;
- for (i = 0; i < cs->daclcount; i++) {
- *(*(short **)&p)++ = daclentry[i].type;
- *(*(short **)&p)++ = daclentry[i].perm;
- *(*(int **)&p)++ = daclentry[i].uid;
}
+ /* setfacl: NFS DEFAULT ACL */
+ aclerrno = acl3_nfs_acl_to_xattr (daclentry,
+ cs->daclxattr,
+ cs->daclcount,
+ defacl);
+ if (aclerrno < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to set DEFAULT ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclerrno);
+ goto acl3err;
+ }
- ret = nfs3_fh_resolve_and_resume (cs, fhp,
- NULL, acl3_setacl_resume);
+ ret = nfs3_fh_resolve_and_resume (cs, fhp, NULL, acl3_setacl_resume);
+ stat = nfs3_errno_to_nfsstat3 (-ret);
acl3err:
if (ret < 0) {
gf_log (GF_ACL, GF_LOG_ERROR, "unable to resolve and resume");
- cs->args.setaclreply.status = stat;
- acl3svc_submit_reply (cs->req, (void *)&cs->args.setaclreply,
- (acl3_serializer)xdr_serialize_setaclreply);
+ setaclreply.status = stat;
+ acl3_setacl_reply (req, &setaclreply);
nfs3_call_state_wipe (cs);
+ GF_FREE(aclentry);
+ GF_FREE(daclentry);
return 0;
}
rpcerr:
if (ret < 0)
nfs3_call_state_wipe (cs);
-
+ if (aclentry)
+ GF_FREE (aclentry);
+ if (daclentry)
+ GF_FREE (daclentry);
return ret;
}
-
rpcsvc_actor_t acl3svc_actors[ACL3_PROC_COUNT] = {
{"NULL", ACL3_NULL, acl3svc_null, NULL, 0},
{"GETACL", ACL3_GETACL, acl3svc_getacl, NULL, 0},
@@ -554,7 +634,7 @@ rpcsvc_actor_t acl3svc_actors[ACL3_PROC_COUNT] = {
rpcsvc_program_t acl3prog = {
.progname = "ACL3",
.prognum = ACL_PROGRAM,
- .progver = ACL_V3,
+ .progver = ACLV3_VERSION,
.progport = GF_NFS3_PORT,
.actors = acl3svc_actors,
.numactors = ACL3_PROC_COUNT,
@@ -569,6 +649,11 @@ acl3svc_init(xlator_t *nfsx)
dict_t *options = NULL;
int ret = -1;
char *portstr = NULL;
+ static gf_boolean_t acl3_inited = _gf_false;
+
+ /* Already inited */
+ if (acl3_inited)
+ return &acl3prog;
nfs = (struct nfs_state*)nfsx->private;
@@ -614,14 +699,145 @@ acl3svc_init(xlator_t *nfsx)
goto err;
}
- rpcsvc_create_listeners (nfs->rpcsvc, options, "ACL");
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, options, "ACL");
if (ret == -1) {
gf_log (GF_ACL, GF_LOG_ERROR, "Unable to create listeners");
dict_unref (options);
goto err;
}
+ acl3_inited = _gf_true;
return &acl3prog;
err:
return NULL;
}
+
+/* Convert NFSv3 ACL entries into the POSIX ACL xattr layout.
+ *
+ * ace      - array of NFS ACL entries to read
+ * xattrbuf - buffer that receives a posix_acl_xattr_header followed by one
+ *            posix_acl_xattr_entry per ACL entry; its size is not checked
+ *            here, so the caller has to provide enough room
+ * aclcount - number of entries in ace
+ * defacl   - non-zero when ace holds a DEFAULT ACL
+ *
+ * Returns 0 on success (including aclcount == 0, in which case xattrbuf is
+ * left untouched) and -EINVAL for NULL arguments, an aclcount outside
+ * 0..NFS_ACL_MAX_ENTRIES, an unknown tag, or permission bits outside S_IRWXO
+ * on anything but the MASK entry. On failure the entries converted so far
+ * (and the tag/perm of the offending one) have already been written.
+ */
+static int
+acl3_nfs_acl_to_xattr (aclentry *ace, void *xattrbuf, int aclcount, int defacl)
+{
+        posix_acl_xattr_header  *hdr     = NULL;
+        posix_acl_xattr_entry   *entries = NULL;
+        int                      i       = 0;
+
+        if (!ace || !xattrbuf)
+                return -EINVAL;
+
+        /* Nothing to convert */
+        if (aclcount == 0)
+                return 0;
+
+        if (aclcount < 0 || aclcount > NFS_ACL_MAX_ENTRIES)
+                return -EINVAL;
+
+        hdr = (posix_acl_xattr_header *) xattrbuf;
+        entries = (posix_acl_xattr_entry *) (hdr + 1);
+
+        hdr->version = POSIX_ACL_XATTR_VERSION;
+
+        for (i = 0; i < aclcount; i++) {
+                aclentry               *src = &ace[i];
+                posix_acl_xattr_entry  *dst = &entries[i];
+
+                /*
+                 * NFSv3 marks "default ACL" entries by OR'ing NFS_ACL_DEFAULT
+                 * into 'type'. The backend file system does not understand
+                 * that flag, so it is masked off here.
+                 */
+                dst->tag = src->type;
+                if (defacl)
+                        dst->tag &= ~NFS_ACL_DEFAULT;
+                dst->perm = src->perm;
+
+                switch (dst->tag) {
+                case POSIX_ACL_MASK:
+                        /* Solaris sometimes sets additional bits in
+                         * the mask, drop them instead of failing.
+                         */
+                        dst->perm &= S_IRWXO;
+                        dst->id = POSIX_ACL_UNDEFINED_ID;
+                        break;
+                case POSIX_ACL_USER:
+                case POSIX_ACL_GROUP:
+                        if (dst->perm & ~S_IRWXO)
+                                return -EINVAL;
+                        dst->id = src->uid;
+                        break;
+                case POSIX_ACL_USER_OBJ:
+                case POSIX_ACL_GROUP_OBJ:
+                case POSIX_ACL_OTHER:
+                        if (dst->perm & ~S_IRWXO)
+                                return -EINVAL;
+                        dst->id = POSIX_ACL_UNDEFINED_ID;
+                        break;
+                default:
+                        return -EINVAL;
+                }
+        }
+
+        return 0;
+}
+
+/* Convert a POSIX ACL xattr into NFSv3 ACL entries.
+ *
+ * ace      - array that receives the NFS ACL entries
+ * xattrbuf - xattr value: a posix_acl_xattr_header followed by entries
+ * bufsize  - size of xattrbuf in bytes; the entry count is derived from it
+ *            with posix_acl_xattr_count()
+ * defacl   - non-zero when the xattr holds a DEFAULT ACL
+ *
+ * Returns the number of converted entries, -EINVAL for NULL arguments, an
+ * entry count outside 0..NFS_ACL_MAX_ENTRIES or an unknown tag, and -ENOSYS
+ * when the header version is not POSIX_ACL_XATTR_VERSION. On an unknown tag
+ * the entries before it (and type/perm of the offending one) have already
+ * been written to ace.
+ */
+static int
+acl3_nfs_acl_from_xattr (aclentry *ace, void *xattrbuf, int bufsize, int defacl)
+{
+        posix_acl_xattr_header  *hdr     = NULL;
+        posix_acl_xattr_entry   *entries = NULL;
+        ssize_t                  count   = 0;
+        int                      i       = 0;
+
+        if (!xattrbuf || !ace)
+                return -EINVAL;
+
+        count = posix_acl_xattr_count (bufsize);
+        if (count < 0 || count > NFS_ACL_MAX_ENTRIES)
+                return -EINVAL;
+
+        hdr = (posix_acl_xattr_header *) xattrbuf;
+        entries = (posix_acl_xattr_entry *) (hdr + 1);
+
+        /* Only the known POSIX ACL xattr version is supported */
+        if (hdr->version != POSIX_ACL_XATTR_VERSION)
+                return -ENOSYS;
+
+        for (i = 0; i < (int) count; i++) {
+                posix_acl_xattr_entry  *src = &entries[i];
+                aclentry               *dst = &ace[i];
+
+                dst->type = src->tag;
+                if (defacl) {
+                        /*
+                         * Put back the NFS_ACL_DEFAULT flag that is masked
+                         * off when a default ACL is stored.
+                         */
+                        dst->type |= NFS_ACL_DEFAULT;
+                }
+                dst->perm = (src->perm & S_IRWXO);
+
+                switch (src->tag) {
+                case POSIX_ACL_USER:
+                case POSIX_ACL_GROUP:
+                        dst->uid = src->id;
+                        break;
+                case POSIX_ACL_USER_OBJ:
+                case POSIX_ACL_GROUP_OBJ:
+                case POSIX_ACL_MASK:
+                case POSIX_ACL_OTHER:
+                        dst->uid = POSIX_ACL_UNDEFINED_ID;
+                        break;
+                default:
+                        return -EINVAL;
+                }
+        }
+
+        /* SUCCESS: ACL count */
+        return count;
+}
diff --git a/xlators/nfs/server/src/acl3.h b/xlators/nfs/server/src/acl3.h
index b668723c8..220bc9e78 100644
--- a/xlators/nfs/server/src/acl3.h
+++ b/xlators/nfs/server/src/acl3.h
@@ -11,18 +11,30 @@
#ifndef _ACL3_H
#define _ACL3_H
+#include "glusterfs-acl.h"
+
+#define ACL3_NULL 0
+#define ACL3_GETACL 1
+#define ACL3_SETACL 2
+#define ACL3_PROC_COUNT 3
+
#define GF_ACL3_PORT 38469
#define GF_ACL GF_NFS"-ACL"
-#define ACL_PROGRAM 100227
-#define ACL_V3 3
+/* Flags for the getacl/setacl mode */
+#define NFS_ACL 0x0001
+#define NFS_ACLCNT 0x0002
+#define NFS_DFACL 0x0004
+#define NFS_DFACLCNT 0x0008
-#define ACL_USER_OBJ 0x1
-#define ACL_GROUP_OBJ 0x4
-#define ACL_OTHER_OBJ 0x20
+/*
+ * NFSv3 identifies the default ACL by NFS_ACL_DEFAULT. Gluster
+ * NFS needs to mask it OFF before sending it up to the POSIX layer
+ * or file system layer.
+ */
+#define NFS_ACL_DEFAULT 0x1000
-#define POSIX_ACL_XATTR_VERSION 0x0002
-#define NFS_ACL_MAX_ENTRIES 1024
+#define NFS_ACL_MAX_ENTRIES 1024
rpcsvc_program_t *
acl3svc_init(xlator_t *nfsx);
diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c
index 39d736e7d..47ff3845e 100644
--- a/xlators/nfs/server/src/mount3.c
+++ b/xlators/nfs/server/src/mount3.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -39,26 +30,71 @@
#include "nfs-mem-types.h"
#include "nfs.h"
#include "common-utils.h"
-
+#include "store.h"
#include <errno.h>
#include <sys/socket.h>
#include <sys/uio.h>
+
+#define IPv4_ADDR_SIZE 32
+
+/* Cast the parameter to a struct sockaddr_in pointer.
+ */
+#define SA(addr) ((struct sockaddr_in*)(addr))
+
+/* Apply netmask to the IPv4 address stored in the sockaddr_in that ipv4addr
+ * points to. The address is converted with ntohl() before masking.
+ */
+#define MASKED_IP(ipv4addr, netmask)                                    \
+        (ntohl(SA(ipv4addr)->sin_addr.s_addr) & (netmask))
+
+/* Compare two IPv4 addresses after applying the same netmask to both.
+ */
+#define COMPARE_IPv4_ADDRS(ip1, ip2, netmask)                           \
+        ((MASKED_IP(ip1, netmask)) == (MASKED_IP(ip2, netmask)))
+
+/* Free the entire linked list of host_auth_spec structures hanging off
+ * exp->hostspec (each node and its host_addr), then reset exp->hostspec to
+ * NULL.
+ *
+ * 'exp' is parenthesized so that any pointer expression can be passed; it is
+ * evaluated twice, so it must not have side effects.
+ */
+#define FREE_HOSTSPEC(exp)  do {                                        \
+                struct host_auth_spec *host = (exp)->hostspec;          \
+                while (NULL != host) {                                  \
+                        struct host_auth_spec *temp = host;             \
+                        host = host->next;                              \
+                        if (NULL != temp->host_addr) {                  \
+                                GF_FREE (temp->host_addr);              \
+                        }                                               \
+                        GF_FREE (temp);                                 \
+                }                                                       \
+                (exp)->hostspec = NULL;                                 \
+        } while (0)
+
+
typedef ssize_t (*mnt3_serializer) (struct iovec outmsg, void *args);
extern void *
mount3udp_thread (void *argv);
+/* Release an export entry: the host spec list (only for exports of type
+ * MNT3_EXPTYPE_DIR), the export name and the entry itself.
+ * Passing NULL is a no-op.
+ */
+static inline void
+mnt3_export_free (struct mnt3_export *exp)
+{
+        if (exp != NULL) {
+                if (MNT3_EXPTYPE_DIR == exp->exptype)
+                        FREE_HOSTSPEC (exp);
+
+                GF_FREE (exp->expname);
+                GF_FREE (exp);
+        }
+}
/* Generic reply function for MOUNTv3 specific replies. */
int
mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
{
- struct iovec outmsg = {0, };
- struct iobuf *iob = NULL;
- struct mount3_state *ms = NULL;
- int ret = -1;
+ struct iovec outmsg = {0, };
+ struct iobuf *iob = NULL;
+ struct mount3_state *ms = NULL;
+ int ret = -1;
+ ssize_t msglen = 0;
struct iobref *iobref = NULL;
if (!req)
@@ -84,7 +120,12 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
/* Use the given serializer to translate the give C structure in arg
* to XDR format which will be written into the buffer in outmsg.
*/
- outmsg.iov_len = sfunc (outmsg, arg);
+ msglen = sfunc (outmsg, arg);
+ if (msglen < 0) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to encode message");
+ goto ret;
+ }
+ outmsg.iov_len = msglen;
iobref = iobref_new ();
if (iobref == NULL) {
@@ -92,12 +133,14 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
goto ret;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
- iobuf_unref (iob);
- iobref_unref (iobref);
if (ret == -1) {
gf_log (GF_MNT, GF_LOG_ERROR, "Reply submission failed");
goto ret;
@@ -105,6 +148,11 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
ret = 0;
ret:
+ if (NULL != iob)
+ iobuf_unref (iob);
+ if (NULL != iobref)
+ iobref_unref (iobref);
+
return ret;
}
@@ -188,14 +236,278 @@ mnt3svc_set_mountres3 (mountstat3 stat, struct nfs3_fh *fh, int *authflavor,
return res;
}
+/* Read the rmtab from the store_handle and append (or not) the entries to the
+ * mountlist.
+ *
+ * sh        - store handle of the rmtab. It must already be locked by the
+ *             caller, otherwise nothing is read and -1 is returned.
+ * mountlist - list that receives the struct mountentry items read.
+ * append    - when false, mountlist is emptied (its entries are freed) before
+ *             reading; when true, entries are added unless an entry with the
+ *             same hostname and export name is already on the list.
+ *
+ * Entries are stored as "hostname-<idx>" / "mountpoint-<idx>" pairs with
+ * <idx> counting up from 0. Reading stops at the first key that cannot be
+ * retrieved.
+ *
+ * NOTE(review): the loop only ends through a failed retrieval or a failed
+ * allocation, so the return value is non-zero even after the whole table was
+ * read cleanly. Confirm that no caller treats that as an error.
+ */
+static int
+__mount_read_rmtab (gf_store_handle_t *sh, struct list_head *mountlist,
+                    gf_boolean_t append)
+{
+        int                     ret = 0;
+        unsigned int            idx = 0;
+        struct mountentry       *me = NULL, *tmp = NULL;
+        /* me->hostname is a char[MNTPATHLEN] */
+        char                    key[MNTPATHLEN + 11];
+
+        GF_ASSERT (sh && mountlist);
+
+        if (!gf_store_locked_local (sh)) {
+                gf_log (GF_MNT, GF_LOG_WARNING, "Not reading unlocked %s",
+                        sh->path);
+                return -1;
+        }
+
+        if (!append) {
+                list_for_each_entry_safe (me, tmp, mountlist, mlist) {
+                        list_del (&me->mlist);
+                        GF_FREE (me);
+                }
+                me = NULL;
+        }
+
+        for (;;) {
+                char *value = NULL;
+
+                /* 'me' is the entry completed by the previous iteration */
+                if (me && append) {
+                        /* do not add duplicates */
+                        list_for_each_entry (tmp, mountlist, mlist) {
+                                if (!strcmp(tmp->hostname, me->hostname) &&
+                                    !strcmp(tmp->exname, me->exname)) {
+                                        GF_FREE (me);
+                                        goto dont_add;
+                                }
+                        }
+                        list_add_tail (&me->mlist, mountlist);
+                } else if (me) {
+                        list_add_tail (&me->mlist, mountlist);
+                }
+
+dont_add:
+                me = GF_CALLOC (1, sizeof (*me), gf_nfs_mt_mountentry);
+                if (!me) {
+                        gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory");
+                        ret = -1;
+                        goto out;
+                }
+
+                INIT_LIST_HEAD (&me->mlist);
+
+                /* idx is unsigned, hence %u; bound by the real buffer size */
+                snprintf (key, sizeof (key), "hostname-%u", idx);
+                ret = gf_store_retrieve_value (sh, key, &value);
+                if (ret)
+                        break;
+                /* snprintf() always NUL-terminates, strncpy() does not when
+                 * the stored value is MNTPATHLEN bytes or longer.
+                 */
+                snprintf (me->hostname, MNTPATHLEN, "%s", value);
+                GF_FREE (value);
+
+                snprintf (key, sizeof (key), "mountpoint-%u", idx);
+                ret = gf_store_retrieve_value (sh, key, &value);
+                if (ret)
+                        break;
+                snprintf (me->exname, MNTPATHLEN, "%s", value);
+                GF_FREE (value);
+
+                idx++;
+                gf_log (GF_MNT, GF_LOG_TRACE, "Read entries %s:%s", me->hostname, me->exname);
+        }
+        gf_log (GF_MNT, GF_LOG_DEBUG, "Read %u entries from '%s'", idx, sh->path);
+        /* the entry allocated for the failed (last) lookup was never linked */
+        GF_FREE (me);
+out:
+        return ret;
+}
+
+/* Overwrite the contents of the rmtab with the in-memory client list.
+ * Fail gracefully if the store_handle is not locked.
+ *
+ * ms - mount state whose ms->mountlist is written out
+ * sh - store handle of the rmtab to replace; must be locked by the caller
+ *
+ * Every entry is saved as a "hostname-<idx>" / "mountpoint-<idx>" pair into a
+ * temporary file obtained from gf_store_mkstemp(). The temporary file is
+ * renamed over the rmtab only when all entries were saved; on any save
+ * failure it is unlinked instead. Errors are logged, nothing is returned.
+ */
+static void
+__mount_rewrite_rmtab(struct mount3_state *ms, gf_store_handle_t *sh)
+{
+        struct mountentry       *me = NULL;
+        /* "mountpoint-" (11) + up to 10 digits of an unsigned int + NUL does
+         * not fit in 16 bytes; a truncated key would collide with the key of
+         * an earlier entry.
+         */
+        char                    key[32];
+        int                     fd, ret;
+        unsigned int            idx = 0;
+
+        if (!gf_store_locked_local (sh)) {
+                gf_log (GF_MNT, GF_LOG_WARNING, "Not modifying unlocked %s",
+                        sh->path);
+                return;
+        }
+
+        fd = gf_store_mkstemp (sh);
+        if (fd == -1) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Failed to open %s", sh->path);
+                return;
+        }
+
+        list_for_each_entry (me, &ms->mountlist, mlist) {
+                snprintf (key, sizeof (key), "hostname-%u", idx);
+                ret = gf_store_save_value (fd, key, me->hostname);
+                if (ret)
+                        goto fail;
+
+                snprintf (key, sizeof (key), "mountpoint-%u", idx);
+                ret = gf_store_save_value (fd, key, me->exname);
+                if (ret)
+                        goto fail;
+
+                idx++;
+        }
+
+        gf_log (GF_MNT, GF_LOG_DEBUG, "Updated rmtab with %u entries", idx);
+
+        close (fd);
+        if (gf_store_rename_tmppath (sh))
+                gf_log (GF_MNT, GF_LOG_ERROR, "Failed to overwrite rwtab %s",
+                        sh->path);
+
+        return;
+
+fail:
+        gf_log (GF_MNT, GF_LOG_ERROR, "Failed to update %s", sh->path);
+        close (fd);
+        gf_store_unlink_tmppath (sh);
+}
+
+/* Replace the contents of ms->mountlist with the entries found in the rmtab
+ * (nfs->rmtab).
+ *
+ * The rmtab is opened through a temporary store handle and locked while it is
+ * read. If it cannot be opened or locked, a warning is logged and
+ * ms->mountlist is left as it is.
+ */
+static void
+mount_read_rmtab (struct mount3_state *ms)
+{
+        struct nfs_state        *nfs    = (struct nfs_state *)ms->nfsx->private;
+        gf_store_handle_t       *handle = NULL;
+
+        if (gf_store_handle_new (nfs->rmtab, &handle)) {
+                gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+                        nfs->rmtab);
+                return;
+        }
+
+        if (gf_store_lock (handle) == 0) {
+                /* append == false: start from a clean list */
+                __mount_read_rmtab (handle, &ms->mountlist, _gf_false);
+                gf_store_unlock (handle);
+        } else {
+                gf_log (GF_MNT, GF_LOG_WARNING, "Failed to lock '%s'",
+                        nfs->rmtab);
+        }
+
+        gf_store_handle_destroy (handle);
+}
+
+/* Write the ms->mountlist to the rmtab.
+ *
+ * The rmtab could be empty, or it can exist and may have been updated by a
+ * different storage server without our knowing.
+ *
+ * 1. takes the store_handle lock on the current rmtab
+ *    - blocks if another storage server rewrites the rmtab at the same time
+ * 2. [if new_rmtab] takes the store_handle lock on the new rmtab
+ * 3. reads/merges the entries from the current rmtab
+ * 4. [if new_rmtab] reads/merges the entries from the new rmtab
+ * 5. [if new_rmtab] writes the new rmtab
+ * 6. [if not new_rmtab] writes the current rmtab
+ * 7. [if new_rmtab] replaces nfs->rmtab to point to the new location
+ * 8. [if new_rmtab] releases the store_handle lock of the new rmtab
+ * 9. releases the store_handle lock of the old rmtab
+ *
+ * ms        - mount state holding the in-memory mountlist
+ * new_rmtab - path of a new rmtab location, or NULL to keep the current one
+ *
+ * Failures to open or lock either file are logged and abort the rewrite.
+ */
+void
+mount_rewrite_rmtab (struct mount3_state *ms, char *new_rmtab)
+{
+        struct nfs_state        *nfs      = NULL;
+        gf_store_handle_t       *cur_sh   = NULL;
+        gf_store_handle_t       *new_sh   = NULL;
+        char                    *new_path = NULL;
+
+        nfs = (struct nfs_state *)ms->nfsx->private;
+
+        if (gf_store_handle_new (nfs->rmtab, &cur_sh)) {
+                gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+                        nfs->rmtab);
+                return;
+        }
+
+        if (gf_store_lock (cur_sh)) {
+                gf_log (GF_MNT, GF_LOG_WARNING, "Not rewriting '%s'",
+                        nfs->rmtab);
+                goto destroy_cur;
+        }
+
+        if (new_rmtab) {
+                if (gf_store_handle_new (new_rmtab, &new_sh)) {
+                        gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+                                new_rmtab);
+                        goto unlock_cur;
+                }
+
+                if (gf_store_lock (new_sh)) {
+                        gf_log (GF_MNT, GF_LOG_WARNING, "Not rewriting '%s'",
+                                new_rmtab);
+                        goto destroy_new;
+                }
+        }
+
+        /* always read the currently used rmtab */
+        __mount_read_rmtab (cur_sh, &ms->mountlist, _gf_true);
+
+        if (!new_rmtab) {
+                /* rewrite the current (unchanged location) rmtab */
+                __mount_rewrite_rmtab (ms, cur_sh);
+        } else {
+                /* read the new rmtab and write changes to the new location */
+                __mount_read_rmtab (new_sh, &ms->mountlist, _gf_true);
+                __mount_rewrite_rmtab (ms, new_sh);
+
+                /* replace the nfs->rmtab reference to the new rmtab */
+                new_path = gf_strdup (new_rmtab);
+                if (new_path != NULL) {
+                        GF_FREE (nfs->rmtab);
+                        nfs->rmtab = new_path;
+                } else {
+                        gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory, keeping "
+                                "%s as rmtab", nfs->rmtab);
+                }
+
+                gf_store_unlock (new_sh);
+        }
+
+destroy_new:
+        if (new_rmtab)
+                gf_store_handle_destroy (new_sh);
+unlock_cur:
+        gf_store_unlock (cur_sh);
+destroy_cur:
+        gf_store_handle_destroy (cur_sh);
+}
+/* Add a new NFS-client to the ms->mountlist and update the rmtab if we can.
+ *
+ * An NFS-client will only be removed from the ms->mountlist in case the
+ * NFS-client sends an unmount request. It is possible that an NFS-client
+ * crashed/rebooted, had network loss, or something else prevented the
+ * NFS-client from unmounting cleanly. In this case, a duplicate entry would
+ * be added to the ms->mountlist, which is wrong and which we should prevent.
+ *
+ * It is fully acceptable that the ms->mountlist is not 100% correct, this is a
+ * common issue for all(?) NFS-servers.
+ */
int
mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req,
char *expname)
{
struct mountentry *me = NULL;
+ struct mountentry *cur = NULL;
int ret = -1;
char *colon = NULL;
+ struct nfs_state *nfs = NULL;
+ gf_store_handle_t *sh = NULL;
if ((!ms) || (!req) || (!expname))
return -1;
@@ -205,14 +517,24 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req,
if (!me)
return -1;
- strcpy (me->exname, expname);
+ nfs = (struct nfs_state *)ms->nfsx->private;
+
+ ret = gf_store_handle_new (nfs->rmtab, &sh);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+ nfs->rmtab);
+ goto free_err;
+ }
+
+ strncpy (me->exname, expname, MNTPATHLEN);
+
INIT_LIST_HEAD (&me->mlist);
/* Must get the IP or hostname of the client so we
* can map it into the mount entry.
*/
ret = rpcsvc_transport_peername (req->trans, me->hostname, MNTPATHLEN);
if (ret == -1)
- goto free_err;
+ goto free_err2;
colon = strrchr (me->hostname, ':');
if (colon) {
@@ -220,10 +542,37 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req,
}
LOCK (&ms->mountlock);
{
+ /* in case locking fails, we just don't write the rmtab */
+ if (gf_store_lock (sh)) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to lock '%s'"
+ ", changes will not be written", nfs->rmtab);
+ } else {
+ __mount_read_rmtab (sh, &ms->mountlist, _gf_false);
+ }
+
+ /* do not add duplicates */
+ list_for_each_entry (cur, &ms->mountlist, mlist) {
+ if (!strcmp(cur->hostname, me->hostname) &&
+ !strcmp(cur->exname, me->exname)) {
+ GF_FREE (me);
+ goto dont_add;
+ }
+ }
list_add_tail (&me->mlist, &ms->mountlist);
+
+ /* only write the rmtab in case it was locked */
+ if (gf_store_locked_local (sh))
+ __mount_rewrite_rmtab (ms, sh);
}
+dont_add:
+ if (gf_store_locked_local (sh))
+ gf_store_unlock (sh);
+
UNLOCK (&ms->mountlock);
+free_err2:
+ gf_store_handle_destroy (sh);
+
free_err:
if (ret == -1)
GF_FREE (me);
@@ -242,6 +591,7 @@ __mnt3_get_volume_id (struct mount3_state *ms, xlator_t *mntxl,
if ((!ms) || (!mntxl))
return ret;
+ LOCK (&ms->mountlock);
list_for_each_entry (exp, &ms->exportlist, explist) {
if (exp->vol == mntxl) {
uuid_copy (volumeid, exp->volumeid);
@@ -251,6 +601,7 @@ __mnt3_get_volume_id (struct mount3_state *ms, xlator_t *mntxl,
}
out:
+ UNLOCK (&ms->mountlock);
return ret;
}
@@ -271,7 +622,7 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie,
rpcsvc_t *svc = NULL;
xlator_t *mntxl = NULL;
uuid_t volumeid = {0, };
- char fhstr[1024];
+ char fhstr[1024], *path = NULL;
req = (rpcsvc_request_t *)frame->local;
@@ -293,7 +644,15 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie,
if (status != MNT3_OK)
goto xmit_res;
- mnt3svc_update_mountlist (ms, req, mntxl->name);
+ path = GF_CALLOC (PATH_MAX, sizeof (char), gf_nfs_mt_char);
+ if (!path) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory");
+ goto xmit_res;
+ }
+
+ snprintf (path, PATH_MAX, "/%s", mntxl->name);
+ mnt3svc_update_mountlist (ms, req, path);
+ GF_FREE (path);
if (gf_nfs_dvm_off (nfs_state (ms->nfsx))) {
fh = nfs3_fh_build_indexed_root_fh (ms->nfsx->children, mntxl);
goto xmit_res;
@@ -303,7 +662,7 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie,
fh = nfs3_fh_build_uuid_root_fh (volumeid);
xmit_res:
- nfs3_fh_to_str (&fh, fhstr);
+ nfs3_fh_to_str (&fh, fhstr, sizeof (fhstr));
gf_log (GF_MNT, GF_LOG_DEBUG, "MNT reply: fh %s, status: %d", fhstr,
status);
if (op_ret == 0) {
@@ -467,8 +826,8 @@ mnt3_resolve_state_wipe (mnt3_resolve_t *mres)
/* Sets up the component argument to contain the next component in the path and
* sets up path as an absolute path starting from the next component.
*/
-char *
-__setup_next_component (char *path, char *component)
+static char *
+setup_next_component (char *path, size_t plen, char *component, size_t clen)
{
char *comp = NULL;
char *nextcomp = NULL;
@@ -476,7 +835,7 @@ __setup_next_component (char *path, char *component)
if ((!path) || (!component))
return NULL;
- strcpy (component, path);
+ strncpy (component, path, clen);
comp = index (component, (int)'/');
if (!comp)
goto err;
@@ -484,7 +843,7 @@ __setup_next_component (char *path, char *component)
comp++;
nextcomp = index (comp, (int)'/');
if (nextcomp) {
- strcpy (path, nextcomp);
+ strncpy (path, nextcomp, plen);
*nextcomp = '\0';
} else
path[0] = '\0';
@@ -514,7 +873,9 @@ __mnt3_resolve_export_subdir_comp (mnt3_resolve_t *mres)
if (!mres)
return ret;
- nextcomp = __setup_next_component (mres->remainingdir, dupsubdir);
+ nextcomp = setup_next_component (mres->remainingdir,
+ sizeof (mres->remainingdir),
+ dupsubdir, sizeof (dupsubdir));
if (!nextcomp)
goto err;
@@ -526,7 +887,7 @@ __mnt3_resolve_export_subdir_comp (mnt3_resolve_t *mres)
if ((ret < 0) && (ret != -2)) {
gf_log (GF_MNT, GF_LOG_ERROR, "Failed to resolve and create "
"inode: parent gfid %s, entry %s",
- uuid_utoa (mres->resolveloc.inode->gfid), nextcomp);
+ uuid_utoa (gfid), nextcomp);
ret = -EFAULT;
goto err;
}
@@ -554,6 +915,7 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
rpcsvc_t *svc = NULL;
mountres3 res = {0, };
xlator_t *mntxl = NULL;
+ char *path = NULL;
mres = frame->local;
mntxl = (xlator_t *)cookie;
@@ -571,15 +933,23 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (strlen (mres->remainingdir) <= 0) {
op_ret = -1;
mntstat = MNT3_OK;
+ path = GF_CALLOC (PATH_MAX, sizeof (char), gf_nfs_mt_char);
+ if (!path) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation "
+ "failed");
+ goto err;
+ }
+ snprintf (path, PATH_MAX, "/%s%s", mres->exp->vol->name,
+ mres->resolveloc.path);
mnt3svc_update_mountlist (mres->mstate, mres->req,
- mres->exp->expname);
- goto err;
+ path);
+ GF_FREE (path);
+ } else {
+ mres->parentfh = fh;
+ op_ret = __mnt3_resolve_export_subdir_comp (mres);
+ if (op_ret < 0)
+ mntstat = mnt3svc_errno_to_mnterr (-op_ret);
}
-
- mres->parentfh = fh;
- op_ret = __mnt3_resolve_export_subdir_comp (mres);
- if (op_ret < 0)
- mntstat = mnt3svc_errno_to_mnterr (-op_ret);
err:
if (op_ret == -1) {
gf_log (GF_MNT, GF_LOG_DEBUG, "Mount reply status: %d",
@@ -622,7 +992,9 @@ __mnt3_resolve_subdir (mnt3_resolve_t *mres)
if (!mres)
return ret;
- firstcomp = __setup_next_component (mres->remainingdir, dupsubdir);
+ firstcomp = setup_next_component (mres->remainingdir,
+ sizeof (mres->remainingdir),
+ dupsubdir, sizeof (dupsubdir));
if (!firstcomp)
goto err;
@@ -645,6 +1017,132 @@ err:
}
+/**
+ * Verify whether the client that issued an RPC request is allowed to mount
+ * an export. The client's address is compared against every entry of the
+ * export's AUTH list (export->hostspec); each entry is resolved with
+ * getaddrinfo() and, for IPv4 results, compared under the netmask derived
+ * from the entry's routing prefix.
+ *
+ * Entries that cannot be resolved, or that carry a routing prefix outside
+ * the range [-1, IPv4_ADDR_SIZE], are logged and skipped. Entries that do
+ * not resolve to AF_INET are never matched.
+ *
+ * @param req - RPC request. This structure contains client's IP address.
+ * @param export - mnt3_export structure. Contains allowed IP list/range.
+ *
+ * @return 0 - on Success and -EACCES on failure.
+ */
+int
+mnt3_verify_auth (rpcsvc_request_t *req, struct mnt3_export *export)
+{
+        int                     retvalue         = -EACCES;
+        int                     ret              = 0;
+        int                     shiftbits        = 0;
+        uint32_t                ipv4netmask      = 0;
+        uint32_t                routingprefix    = 0;
+        struct host_auth_spec  *host             = NULL;
+        struct sockaddr_in     *client_addr      = NULL;
+        struct sockaddr_in     *allowed_addr     = NULL;
+        struct addrinfo        *allowed_addrinfo = NULL;
+
+        /* Sanity check */
+        if ((NULL == req) ||
+            (NULL == req->trans) ||
+            (NULL == export) ||
+            (NULL == export->hostspec)) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Invalid argument");
+                return retvalue;
+        }
+
+        host = export->hostspec;
+
+        /* Client's IP address.
+         * NOTE(review): the peer address is read as a sockaddr_in without
+         * checking its family - confirm how non-IPv4 peers should be handled.
+         */
+        client_addr = (struct sockaddr_in *)(&(req->trans->peerinfo.sockaddr));
+
+        /* Try to see if the client IP matches the allowed IP list.*/
+        while (NULL != host) {
+                GF_ASSERT (host->host_addr);
+
+                /* Release the lookup result of the previous iteration. */
+                if (NULL != allowed_addrinfo) {
+                        freeaddrinfo (allowed_addrinfo);
+                        allowed_addrinfo = NULL;
+                }
+
+                /* Get the addrinfo for the allowed host (host_addr). */
+                ret = getaddrinfo (host->host_addr,
+                                   NULL,
+                                   NULL,
+                                   &allowed_addrinfo);
+                if (0 != ret) {
+                        gf_log (GF_MNT, GF_LOG_ERROR, "getaddrinfo: %s\n",
+                                gai_strerror (ret));
+                        host = host->next;
+
+                        /* Failed to get IP addrinfo. Continue to check other
+                         * allowed IPs in the list.
+                         */
+                        continue;
+                }
+
+                allowed_addr = (struct sockaddr_in *)(allowed_addrinfo->ai_addr);
+
+                if (NULL == allowed_addr) {
+                        gf_log (GF_MNT, GF_LOG_ERROR, "Invalid structure");
+                        break;
+                }
+
+                if (AF_INET == allowed_addr->sin_family) {
+                        /* Only -1 (no prefix given) and 0..IPv4_ADDR_SIZE are
+                         * meaningful. Any other negative value would be
+                         * converted to a huge unsigned prefix below and yield
+                         * a bogus mask, so such entries are rejected as well.
+                         */
+                        if ((IPv4_ADDR_SIZE < host->routeprefix) ||
+                            (-1 > host->routeprefix)) {
+                                gf_log (GF_MNT, GF_LOG_ERROR, "invalid IP "
+                                        "configured for export-dir AUTH");
+                                host = host->next;
+                                continue;
+                        }
+
+                        /* -1 means no route prefix is provided. In this case
+                         * the IP should be an exact match. Which is same as
+                         * providing a route prefix of IPv4_ADDR_SIZE.
+                         */
+                        if (-1 == host->routeprefix) {
+                                routingprefix = IPv4_ADDR_SIZE;
+                        } else {
+                                routingprefix = host->routeprefix;
+                        }
+
+                        /* Create a mask from the routing prefix. User provided
+                         * CIDR address is split into IP address (host_addr) and
+                         * routing prefix (routeprefix). This CIDR address may
+                         * denote a single, distinct interface address or the
+                         * beginning address of an entire network.
+                         *
+                         * e.g. the IPv4 block 192.168.100.0/24 represents the
+                         * 256 IPv4 addresses from 192.168.100.0 to
+                         * 192.168.100.255.
+                         * Therefore to check if an IP matches 192.168.100.0/24
+                         * we should mask the IP with FFFFFF00 and compare it
+                         * with host address part of CIDR.
+                         *
+                         * A prefix of 0 would require shifting by the full
+                         * address width, which is undefined when the shifted
+                         * type is that wide; the resulting mask (all zeroes,
+                         * i.e. match any address) is set explicitly instead.
+                         */
+                        shiftbits = IPv4_ADDR_SIZE - routingprefix;
+                        if (0 == routingprefix) {
+                                ipv4netmask = 0;
+                        } else {
+                                ipv4netmask = (uint32_t)(0xFFFFFFFFUL
+                                                         << shiftbits);
+                        }
+
+                        /* Mask both the IPs and then check if they match
+                         * or not. */
+                        if (COMPARE_IPv4_ADDRS (allowed_addr,
+                                                client_addr,
+                                                ipv4netmask)) {
+                                retvalue = 0;
+                                break;
+                        }
+                }
+
+                /* Client IP didn't match the allowed IP.
+                 * Check with the next allowed IP.*/
+                host = host->next;
+        }
+
+        if (NULL != allowed_addrinfo) {
+                freeaddrinfo (allowed_addrinfo);
+        }
+
+        return retvalue;
+}
+
int
mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms,
struct mnt3_export *exp, char *subdir)
@@ -656,6 +1154,16 @@ mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms,
if ((!req) || (!ms) || (!exp) || (!subdir))
return ret;
+ /* Need to check AUTH */
+ if (NULL != exp->hostspec) {
+ ret = mnt3_verify_auth (req, exp);
+ if (0 != ret) {
+ gf_log (GF_MNT,GF_LOG_ERROR,
+ "AUTH verification failed");
+ return ret;
+ }
+ }
+
mres = GF_CALLOC (1, sizeof (mnt3_resolve_t), gf_nfs_mt_mnt3_resolve);
if (!mres) {
gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
@@ -665,7 +1173,7 @@ mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms,
mres->exp = exp;
mres->mstate = ms;
mres->req = req;
- strcpy (mres->remainingdir, subdir);
+ strncpy (mres->remainingdir, subdir, MNTPATHLEN);
if (gf_nfs_dvm_off (nfs_state (ms->nfsx)))
pfh = nfs3_fh_build_indexed_root_fh (mres->mstate->nfsx->children, mres->exp->vol);
else
@@ -740,6 +1248,7 @@ mnt3_mntpath_to_export (struct mount3_state *ms, char *dirpath)
if ((!ms) || (!dirpath))
return NULL;
+ LOCK (&ms->mountlock);
list_for_each_entry (exp, &ms->exportlist, explist) {
/* Search for the an exact match with the volume */
@@ -753,6 +1262,7 @@ mnt3_mntpath_to_export (struct mount3_state *ms, char *dirpath)
gf_log (GF_MNT, GF_LOG_DEBUG, "Export not found");
foundexp:
+ UNLOCK (&ms->mountlock);
return found;
}
@@ -781,7 +1291,7 @@ mnt3_check_client_net (struct mount3_state *ms, rpcsvc_request_t *req,
gai_strerror (ret));
}
- ret = rpcsvc_auth_check (svc->options, targetxl->name, trans);
+ ret = rpcsvc_auth_check (svc, targetxl->name, trans);
if (ret == RPCSVC_AUTH_REJECT) {
gf_log (GF_MNT, GF_LOG_INFO, "Peer %s not allowed", peer);
goto err;
@@ -808,7 +1318,8 @@ mnt3_parse_dir_exports (rpcsvc_request_t *req, struct mount3_state *ms,
char volname[1024];
struct mnt3_export *exp = NULL;
char *volname_ptr = NULL;
- int ret = -1;
+ int ret = -ENOENT;
+ struct nfs_state *nfs = NULL;
if ((!ms) || (!subdir))
return -1;
@@ -822,10 +1333,26 @@ mnt3_parse_dir_exports (rpcsvc_request_t *req, struct mount3_state *ms,
if (!exp)
goto err;
+ nfs = (struct nfs_state *)ms->nfsx->private;
+ if (!nfs)
+ goto err;
+
+ if (!nfs_subvolume_started (nfs, exp->vol)) {
+ gf_log (GF_MNT, GF_LOG_DEBUG,
+ "Volume %s not started", exp->vol->name);
+ goto err;
+ }
+
+ if (mnt3_check_client_net (ms, req, exp->vol) == RPCSVC_AUTH_REJECT) {
+ gf_log (GF_MNT, GF_LOG_DEBUG, "Client mount not allowed");
+ ret = -EACCES;
+ goto err;
+ }
+
ret = mnt3_resolve_subdir (req, ms, exp, subdir);
if (ret < 0) {
- gf_log (GF_MNT, GF_LOG_ERROR, "Failed to resolve export dir: %s"
- , subdir);
+ gf_log (GF_MNT, GF_LOG_ERROR,
+ "Failed to resolve export dir: %s", subdir);
goto err;
}
@@ -865,10 +1392,6 @@ mnt3_find_export (rpcsvc_request_t *req, char *path, struct mnt3_export **e)
}
ret = mnt3_parse_dir_exports (req, ms, path);
- if (ret == 0) {
- ret = -2;
- goto err;
- }
err:
return ret;
@@ -906,17 +1429,26 @@ mnt3svc_mnt (rpcsvc_request_t *req)
goto rpcerr;
}
- ret = 0;
nfs = (struct nfs_state *)ms->nfsx->private;
gf_log (GF_MNT, GF_LOG_DEBUG, "dirpath: %s", path);
ret = mnt3_find_export (req, path, &exp);
- if (ret == -2) {
- ret = 0;
- goto rpcerr;
- } else if (ret < 0) {
- ret = -1;
- mntstat = MNT3ERR_NOENT;
+ if (ret < 0) {
+ mntstat = mnt3svc_errno_to_mnterr (-ret);
goto mnterr;
+ } else if (!exp) {
+ /*
+ * SPECIAL CASE: exp is NULL if "path" is subdir in
+ * call to mnt3_find_export().
+ *
+ * This is subdir mount, we are already DONE!
+ * nfs_subvolume_started() and mnt3_check_client_net()
+ * validation are done in mnt3_parse_dir_exports()
+ * which is invoked through mnt3_find_export().
+ *
+ * TODO: All mount should happen through mnt3svc_mount()
+ * It needs more clean up.
+ */
+ return (0);
}
if (!nfs_subvolume_started (nfs, exp->vol)) {
@@ -976,6 +1508,9 @@ __build_mountlist (struct mount3_state *ms, int *count)
if ((!ms) || (!count))
return NULL;
+ /* read rmtab, other peers might have updated it */
+ mount_read_rmtab(ms);
+
*count = 0;
gf_log (GF_MNT, GF_LOG_DEBUG, "Building mount list:");
list_for_each_entry (me, &ms->mountlist, mlist) {
@@ -986,6 +1521,8 @@ __build_mountlist (struct mount3_state *ms, int *count)
" failed");
goto free_list;
}
+ if (!first)
+ first = mlist;
mlist->ml_directory = GF_CALLOC (namelen + 2, sizeof (char),
gf_nfs_mt_char);
@@ -995,8 +1532,7 @@ __build_mountlist (struct mount3_state *ms, int *count)
goto free_list;
}
- strcpy (mlist->ml_directory, "/");
- strcat (mlist->ml_directory, me->exname);
+ strcpy (mlist->ml_directory, me->exname);
namelen = strlen (me->hostname);
mlist->ml_hostname = GF_CALLOC (namelen + 2, sizeof (char),
@@ -1017,9 +1553,6 @@ __build_mountlist (struct mount3_state *ms, int *count)
} else
prev = mlist;
- if (!first)
- first = mlist;
-
(*count)++;
}
@@ -1096,67 +1629,71 @@ rpcerr:
int
-__mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname)
+mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname)
{
struct mountentry *me = NULL;
- char *exname = NULL;
int ret = -1;
+ gf_store_handle_t *sh = NULL;
+ struct nfs_state *nfs = NULL;
if ((!ms) || (!dirpath) || (!hostname))
return -1;
- if (list_empty (&ms->mountlist))
- return 0;
-
- if (dirpath[0] == '/')
- exname = dirpath+1;
- else
- exname = dirpath;
+ nfs = (struct nfs_state *)ms->nfsx->private;
- list_for_each_entry (me, &ms->mountlist, mlist) {
- if ((strcmp (me->exname, exname) == 0) &&
- (strcmp (me->hostname, hostname) == 0)) {
- ret = 0;
- break;
- }
+ ret = gf_store_handle_new (nfs->rmtab, &sh);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+ nfs->rmtab);
+ return 0;
}
- /* Need this check here because at the end of the search me might still
- * be pointing to the last entry, which may not be the one we're
- * looking for.
- */
- if (ret == -1) {/* Not found in list. */
- gf_log (GF_MNT, GF_LOG_DEBUG, "Export not found");
- goto ret;
+ ret = gf_store_lock (sh);
+ if (ret) {
+ goto out_free;
}
- if (!me)
- goto ret;
+ LOCK (&ms->mountlock);
+ {
+ __mount_read_rmtab (sh, &ms->mountlist, _gf_false);
+ if (list_empty (&ms->mountlist)) {
+ ret = 0;
+ goto out_unlock;
+ }
- gf_log (GF_MNT, GF_LOG_DEBUG, "Unmounting: dir %s, host: %s",
- me->exname, me->hostname);
- list_del (&me->mlist);
- GF_FREE (me);
- ret = 0;
-ret:
- return ret;
-}
+ ret = -1;
+ list_for_each_entry (me, &ms->mountlist, mlist) {
+ if ((strcmp (me->exname, dirpath) == 0) &&
+ (strcmp (me->hostname, hostname) == 0)) {
+ ret = 0;
+ break;
+ }
+ }
+ /* Need this check here because at the end of the search me
+ * might still be pointing to the last entry, which may not be
+ * the one we're looking for.
+ */
+ if (ret == -1) {/* Not found in list. */
+ gf_log (GF_MNT, GF_LOG_TRACE, "Export not found");
+ goto out_unlock;
+ }
+ if (!me)
+ goto out_unlock;
-int
-mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname)
-{
- int ret = -1;
- if ((!ms) || (!dirpath) || (!hostname))
- return -1;
+ gf_log (GF_MNT, GF_LOG_DEBUG, "Unmounting: dir %s, host: %s",
+ me->exname, me->hostname);
- LOCK (&ms->mountlock);
- {
- ret = __mnt3svc_umount (ms, dirpath, hostname);
+ list_del (&me->mlist);
+ GF_FREE (me);
+ __mount_rewrite_rmtab (ms, sh);
}
+out_unlock:
UNLOCK (&ms->mountlock);
-
+ gf_store_unlock (sh);
+out_free:
+ gf_store_handle_destroy (sh);
return ret;
}
@@ -1304,6 +1841,10 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
return NULL;
nfs = (struct nfs_state *)ms->nfsx->private;
+ if (!nfs)
+ return NULL;
+
+ LOCK (&ms->mountlock);
list_for_each_entry(ent, &ms->exportlist, explist) {
/* If volume is not started yet, do not list it for tools like
@@ -1319,7 +1860,8 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
" failed");
goto free_list;
}
-
+ if (!first)
+ first = elist;
elist->ex_dir = GF_CALLOC (namelen + 2, sizeof (char),
gf_nfs_mt_char);
if (!elist->ex_dir) {
@@ -1327,16 +1869,10 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
" failed");
goto free_list;
}
-
strcpy (elist->ex_dir, ent->expname);
addrstr = rpcsvc_volume_allowed (svc->options,
ent->vol->name);
- if (addrstr)
- addrstr = gf_strdup (addrstr);
- else
- addrstr = gf_strdup ("No Access");
-
elist->ex_groups = GF_CALLOC (1, sizeof (struct groupnode),
gf_nfs_mt_groupnode);
if (!elist->ex_groups) {
@@ -1344,21 +1880,29 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
" failed");
goto free_list;
}
+ /*This check has to be done after checking
+ * elist->ex_groups allocation check to avoid resource leak;
+ */
+ if (addrstr)
+ addrstr = gf_strdup (addrstr);
+ else
+ addrstr = gf_strdup ("No Access");
+ if (!addrstr) {
+ goto free_list;
+ }
elist->ex_groups->gr_name = addrstr;
if (prev) {
prev->ex_next = elist;
prev = elist;
} else
prev = elist;
-
- if (!first)
- first = elist;
}
ret = 0;
free_list:
+ UNLOCK (&ms->mountlock);
if (ret == -1) {
xdr_free_exports_list (first);
first = NULL;
@@ -1464,12 +2008,13 @@ mount3udp_add_mountlist (char *host, dirpath *expname)
while (*export == '/')
export++;
- strcpy (me->exname, export);
- strcpy (me->hostname, host);
+ strncpy (me->exname, export, MNTPATHLEN);
+ strncpy (me->hostname, host, MNTPATHLEN);
INIT_LIST_HEAD (&me->mlist);
LOCK (&ms->mountlock);
{
list_add_tail (&me->mlist, &ms->mountlist);
+ mount_rewrite_rmtab(ms, NULL);
}
UNLOCK (&ms->mountlock);
return 0;
@@ -1485,11 +2030,155 @@ mount3udp_delete_mountlist (char *hostname, dirpath *expname)
export = (char *)expname;
while (*export == '/')
export++;
- __mnt3svc_umount (ms, export, hostname);
+ mnt3svc_umount (ms, export, hostname);
return 0;
}
+/**
+ * This function will parse the hostip (IP address, IP range, or hostname)
+ * and fill the host_auth_spec structure.
+ *
+ * The text before the first '/' becomes hostspec->host_addr (a newly
+ * allocated copy). If a second '/'-separated token is present it must be a
+ * non-negative decimal number and is stored in hostspec->routeprefix;
+ * otherwise routeprefix is set to -1 ("no prefix given").
+ *
+ * hostspec is only modified once the whole string has been validated.
+ *
+ * @param hostspec - struct host_auth_spec
+ * @param hostip - IP address, IP range (CIDR format) or hostname
+ *
+ * @return 0 - on success and -1 on failure
+ */
+int
+mnt3_export_fill_hostspec (struct host_auth_spec* hostspec, const char* hostip)
+{
+        char    *ipdupstr = NULL;
+        char    *savptr   = NULL;
+        char    *ip       = NULL;
+        char    *token    = NULL;
+        char    *endptr   = NULL;
+        long     prefix   = -1;
+        int      ret      = -1;
+
+        if ((NULL == hostspec) || (NULL == hostip)) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Invalid argument");
+                goto err;
+        }
+
+        /* Create copy of the string so that the source won't change
+         */
+        ipdupstr = gf_strdup (hostip);
+        if (NULL == ipdupstr) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
+                goto err;
+        }
+
+        /* strtok_r() returns NULL when the string is empty or consists of
+         * '/' only; there is no host to authorise in that case.
+         */
+        ip = strtok_r (ipdupstr, "/", &savptr);
+        if (NULL == ip) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Empty host in AUTH parameter");
+                goto err;
+        }
+
+        /* Check if the IP is in <IP address> / <Range> format.
+         * If yes, then strip the range and store it separately.
+         */
+        token = strtok_r (NULL, "/", &savptr);
+        if (NULL != token) {
+                /* A prefix that is not a plain number must not silently
+                 * turn into 0, which would mean "match every address".
+                 * Negative values are refused because -1 is reserved for
+                 * "no prefix given".
+                 */
+                prefix = strtol (token, &endptr, 10);
+                if ((endptr == token) || ('\0' != *endptr) ||
+                    (0 > prefix) || ((long)(int)prefix != prefix)) {
+                        gf_log (GF_MNT, GF_LOG_ERROR,
+                                "Invalid routing prefix: %s", token);
+                        goto err;
+                }
+        }
+
+        hostspec->host_addr = gf_strdup (ip);
+        if (NULL == hostspec->host_addr) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
+                goto err;
+        }
+        hostspec->routeprefix = (int)prefix;
+
+        // success
+        ret = 0;
+err:
+        if (NULL != ipdupstr) {
+                GF_FREE (ipdupstr);
+        }
+        return ret;
+}
+
+
+/**
+ * This function will parse the AUTH parameter passed along with
+ * "export-dir" option. If AUTH parameter is present then it will be
+ * stripped from exportpath and stored in mnt3_export (exp) structure.
+ *
+ * exportpath is modified in place: strtok_r() overwrites the '(' that
+ * starts the AUTH parameter, so on return exportpath holds the path only.
+ *
+ * @param exp - mnt3_export structure. Holds information needed for mount.
+ * @param exportpath - Value of "export-dir" key. Holds both export path
+ *                     and AUTH parameter for the path.
+ *                     exportpath format: <abspath>[(hostdesc[|hostspec|...])]
+ *
+ * @return This function will return 0 on success and -1 on failure.
+ *         On failure exp->hostspec is released via FREE_HOSTSPEC().
+ */
+int
+mnt3_export_parse_auth_param (struct mnt3_export* exp, char* exportpath)
+{
+        char                    *token  = NULL;
+        char                    *savPtr = NULL;
+        char                    *hostip = NULL;
+        struct host_auth_spec   *host   = NULL;
+        int                      ret    = 0;
+
+        if ((NULL == exp) || (NULL == exportpath)) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Invalid argument");
+                return -1;
+        }
+
+        /* Using exportpath directly in strtok_r because we want
+         * to strip off AUTH parameter from exportpath. */
+        token = strtok_r (exportpath, "(", &savPtr);
+
+        /* Get the next token, which will be the AUTH parameter. */
+        token = strtok_r (NULL, ")", &savPtr);
+
+        if (NULL == token) {
+                /* If AUTH is not present then we should return success. */
+                return 0;
+        }
+
+        /* Free any previously allocated hostspec structure. The whole
+         * chain (and each entry's host_addr) has to go, not just the head,
+         * so use the same macro as the error path below.
+         */
+        if (NULL != exp->hostspec) {
+                FREE_HOSTSPEC (exp);
+                exp->hostspec = NULL;
+        }
+
+        exp->hostspec = GF_CALLOC (1,
+                                   sizeof (*(exp->hostspec)),
+                                   gf_nfs_mt_auth_spec);
+        if (NULL == exp->hostspec) {
+                gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
+                return -1;
+        }
+
+        /* AUTH parameter can have multiple entries. For each entry
+         * a host_auth_spec structure is created. */
+        host = exp->hostspec;
+
+        hostip = strtok_r (token, "|", &savPtr);
+        if (NULL == hostip) {
+                /* Only separators between the parentheses: an entry without
+                 * a host address must not be left in the list.
+                 */
+                gf_log (GF_MNT, GF_LOG_WARNING,
+                        "Empty AUTH parameter: %s", token);
+                goto err;
+        }
+
+        /* Parse all AUTH parameters separated by '|' */
+        while (NULL != hostip) {
+                ret = mnt3_export_fill_hostspec (host, hostip);
+                if (0 != ret) {
+                        gf_log(GF_MNT, GF_LOG_WARNING,
+                               "Failed to parse hostspec: %s", hostip);
+                        goto err;
+                }
+
+                hostip = strtok_r (NULL, "|", &savPtr);
+                if (NULL == hostip) {
+                        break;
+                }
+
+                /* Another entry follows: append a fresh node for it. */
+                host->next = GF_CALLOC (1, sizeof (*(host)),
+                                        gf_nfs_mt_auth_spec);
+                if (NULL == host->next) {
+                        gf_log (GF_MNT,GF_LOG_ERROR,
+                                "Memory allocation failed");
+                        goto err;
+                }
+                host = host->next;
+        }
+
+        /* In case of success return from here */
+        return 0;
+err:
+        /* In case of failure free up hostspec structure. */
+        FREE_HOSTSPEC (exp);
+        return -1;
+}
+
+/**
+ * exportpath will also have AUTH options (ip address, subnet address or
+ * hostname) mentioned.
+ * exportpath format: <abspath>[(hostdesc[|hostspec|...])]
+ */
struct mnt3_export *
mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
uuid_t volumeid)
@@ -1507,6 +2196,20 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
return NULL;
}
+ if (NULL != exportpath) {
+ /* If exportpath is not NULL then we should check if AUTH
+ * parameter is present or not. If AUTH parameter is present
+ * then it will be stripped and stored in mnt3_export (exp)
+ * structure.
+ */
+ if (0 != mnt3_export_parse_auth_param (exp, exportpath)){
+ gf_log (GF_MNT, GF_LOG_ERROR,
+ "Failed to parse auth param");
+ goto err;
+ }
+ }
+
+
INIT_LIST_HEAD (&exp->explist);
if (exportpath)
alloclen = strlen (xl->name) + 2 + strlen (exportpath);
@@ -1516,8 +2219,6 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
exp->expname = GF_CALLOC (alloclen, sizeof (char), gf_nfs_mt_char);
if (!exp->expname) {
gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
- GF_FREE (exp);
- exp = NULL;
goto err;
}
@@ -1534,8 +2235,9 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
ret = snprintf (exp->expname, alloclen, "/%s", xl->name);
}
if (ret < 0) {
- gf_log (xl->name, GF_LOG_WARNING,
- "failed to get the export name");
+ gf_log (xl->name, GF_LOG_ERROR,
+ "Failed to set the export name");
+ goto err;
}
/* Just copy without discrimination, we'll determine whether to
* actually use it when a mount request comes in and a file handle
@@ -1543,7 +2245,16 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
*/
uuid_copy (exp->volumeid, volumeid);
exp->vol = xl;
+
+ /* On success we should return from here*/
+ return exp;
err:
+ /* On failure free exp and its members.*/
+ if (NULL != exp) {
+ mnt3_export_free (exp);
+ exp = NULL;
+ }
+
return exp;
}
@@ -1704,8 +2415,11 @@ __mnt3_init_volume_export (struct mount3_state *ms, dict_t *opts)
goto err;
}
- gf_string2boolean (optstr, &boolt);
- ret = 0;
+ ret = gf_string2boolean (optstr, &boolt);
+ if (ret < 0) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to convert"
+ " string to boolean");
+ }
err:
if (boolt == _gf_false) {
@@ -1744,8 +2458,11 @@ __mnt3_init_dir_export (struct mount3_state *ms, dict_t *opts)
goto err;
}
- gf_string2boolean (optstr, &boolt);
- ret = 0;
+ ret = gf_string2boolean (optstr, &boolt);
+ if (ret < 0) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to convert"
+ " string to boolean");
+ }
err:
if (boolt == _gf_false) {
@@ -1843,12 +2560,12 @@ out:
}
rpcsvc_actor_t mnt3svc_actors[MOUNT3_PROC_COUNT] = {
- {"NULL", MOUNT3_NULL, mnt3svc_null, NULL, 0},
- {"MNT", MOUNT3_MNT, mnt3svc_mnt, NULL, 0},
- {"DUMP", MOUNT3_DUMP, mnt3svc_dump, NULL, 0},
- {"UMNT", MOUNT3_UMNT, mnt3svc_umnt, NULL, 0},
- {"UMNTALL", MOUNT3_UMNTALL, mnt3svc_umntall, NULL, 0},
- {"EXPORT", MOUNT3_EXPORT, mnt3svc_export, NULL, 0}
+ {"NULL", MOUNT3_NULL, mnt3svc_null, NULL, 0, DRC_NA},
+ {"MNT", MOUNT3_MNT, mnt3svc_mnt, NULL, 0, DRC_NA},
+ {"DUMP", MOUNT3_DUMP, mnt3svc_dump, NULL, 0, DRC_NA},
+ {"UMNT", MOUNT3_UMNT, mnt3svc_umnt, NULL, 0, DRC_NA},
+ {"UMNTALL", MOUNT3_UMNTALL, mnt3svc_umntall, NULL, 0, DRC_NA},
+ {"EXPORT", MOUNT3_EXPORT, mnt3svc_export, NULL, 0, DRC_NA}
};
@@ -1922,7 +2639,7 @@ mnt3svc_init (xlator_t *nfsx)
}
}
- rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
+ ret= rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
if (ret == -1) {
gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners");
dict_unref (options);
@@ -1939,12 +2656,12 @@ err:
rpcsvc_actor_t mnt1svc_actors[MOUNT1_PROC_COUNT] = {
- {"NULL", MOUNT1_NULL, mnt3svc_null, NULL, 0},
- {{0, 0}, },
- {"DUMP", MOUNT1_DUMP, mnt3svc_dump, NULL, 0},
- {"UMNT", MOUNT1_UMNT, mnt3svc_umnt, NULL, 0},
- {{0, 0}, },
- {"EXPORT", MOUNT1_EXPORT, mnt3svc_export, NULL, 0}
+ {"NULL", MOUNT1_NULL, mnt3svc_null, NULL, 0, DRC_NA},
+ {"MNT", MOUNT1_MNT, NULL, NULL, 0, DRC_NA },
+ {"DUMP", MOUNT1_DUMP, mnt3svc_dump, NULL, 0, DRC_NA},
+ {"UMNT", MOUNT1_UMNT, mnt3svc_umnt, NULL, 0, DRC_NA},
+ {"UMNTALL", MOUNT1_UMNTALL, NULL, NULL, 0, DRC_NA},
+ {"EXPORT", MOUNT1_EXPORT, mnt3svc_export, NULL, 0, DRC_NA}
};
rpcsvc_program_t mnt1prog = {
@@ -2010,7 +2727,7 @@ mnt1svc_init (xlator_t *nfsx)
}
}
- rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
if (ret == -1) {
gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners");
dict_unref (options);
@@ -2021,3 +2738,44 @@ mnt1svc_init (xlator_t *nfsx)
err:
return NULL;
}
+
+/**
+ * Rebuild the mount export list from a new set of options.
+ *
+ * Every entry currently on ms->exportlist is unlinked and freed, then
+ * mnt3_init_options() is called to repopulate the list. Both steps run
+ * under ms->mountlock.
+ *
+ * @param nfsx - NFS xlator whose mount state is to be reconfigured
+ * @param options - option dictionary handed to mnt3_init_options()
+ *
+ * @return 0 on success; -1 on a missing argument, missing NFS or mount
+ *         state, or when mnt3_init_options() fails.
+ */
+int
+mount_reconfigure_state (xlator_t *nfsx, dict_t *options)
+{
+        struct nfs_state        *nfs_priv = NULL;
+        struct mount3_state     *mstate   = NULL;
+        struct mnt3_export      *cur      = NULL;
+        struct mnt3_export      *next     = NULL;
+        int                      status   = -1;
+
+        if (!nfsx || !options)
+                return (-1);
+
+        nfs_priv = (struct nfs_state *)nfs_state (nfsx);
+        mstate = nfs_priv ? nfs_priv->mstate : NULL;
+        if (!mstate)
+                return (-1);
+
+        /*
+         * Drop the stale exports and let mnt3_init_options() build the
+         * list again from scratch; the lock is held across both steps.
+         */
+        LOCK (&mstate->mountlock);
+        list_for_each_entry_safe (cur, next, &mstate->exportlist, explist) {
+                list_del (&cur->explist);
+                mnt3_export_free (cur);
+        }
+        status = mnt3_init_options (mstate, options);
+        UNLOCK (&mstate->mountlock);
+
+        if (status >= 0)
+                return (0);
+
+        gf_log (GF_MNT, GF_LOG_ERROR, "Options reconfigure failed");
+        return (-1);
+}
diff --git a/xlators/nfs/server/src/mount3.h b/xlators/nfs/server/src/mount3.h
index c0eae3644..7fc16ed57 100644
--- a/xlators/nfs/server/src/mount3.h
+++ b/xlators/nfs/server/src/mount3.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _MOUNT3_H_
@@ -53,6 +44,12 @@ mnt1svc_init (xlator_t *nfsx);
extern int
mount_init_state (xlator_t *nfsx);
+extern int
+mount_reconfigure_state (xlator_t *nfsx, dict_t *options);
+
+void
+mount_rewrite_rmtab (struct mount3_state *ms, char *new_rmtab);
+
/* Data structure used to store the list of mounts points currently
* in use by NFS clients.
*/
@@ -68,6 +65,13 @@ struct mountentry {
#define MNT3_EXPTYPE_VOLUME 1
#define MNT3_EXPTYPE_DIR 2
+/* Structure to hold export-dir AUTH parameter.
+ * One node describes one allowed host entry; nodes are chained through
+ * 'next' to form a singly linked list whose end is marked by NULL.
+ * A routeprefix of -1 means that no routing prefix was given for the entry.
+ */
+struct host_auth_spec {
+ char *host_addr; /* Allowed IP or host name */
+ int routeprefix; /* Routing prefix */
+ struct host_auth_spec *next; /* Pointer to next AUTH struct */
+};
+
struct mnt3_export {
struct list_head explist;
@@ -75,6 +79,11 @@ struct mnt3_export {
* is exported or the subdirectory in the volume.
*/
char *expname;
+ /*
+ * IP address, hostname or subnets who are allowed to connect to expname
+ * subvolume or subdirectory
+ */
+ struct host_auth_spec* hostspec;
xlator_t *vol;
int exptype;
@@ -101,8 +110,8 @@ struct mount3_state {
gf_lock_t mountlock;
/* Set to 0 if exporting full volumes is disabled. On by default. */
- int export_volumes;
- int export_dirs;
+ gf_boolean_t export_volumes;
+ gf_boolean_t export_dirs;
};
#define gf_mnt3_export_dirs(mst) ((mst)->export_dirs)
diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c
index aa38b1cc4..fb59e282c 100644
--- a/xlators/nfs/server/src/mount3udp_svc.c
+++ b/xlators/nfs/server/src/mount3udp_svc.c
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c
index b3853ded5..f74396ee8 100644
--- a/xlators/nfs/server/src/nfs-common.c
+++ b/xlators/nfs/server/src/nfs-common.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -94,7 +85,7 @@ nfs_mntpath_to_xlator (xlator_list_t *cl, char *path)
if ((!cl) || (!path))
return NULL;
- strcpy (volname, path);
+ strncpy (volname, path, MNTPATHLEN);
pathlen = strlen (volname);
gf_log (GF_NFS, GF_LOG_TRACE, "Subvolume search: %s", path);
if (volname[0] == '/')
diff --git a/xlators/nfs/server/src/nfs-common.h b/xlators/nfs/server/src/nfs-common.h
index f74bb3187..2e97f1563 100644
--- a/xlators/nfs/server/src/nfs-common.h
+++ b/xlators/nfs/server/src/nfs-common.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_COMMON_H_
diff --git a/xlators/nfs/server/src/nfs-fops.c b/xlators/nfs/server/src/nfs-fops.c
index b6edc99c7..56d4cba47 100644
--- a/xlators/nfs/server/src/nfs-fops.c
+++ b/xlators/nfs/server/src/nfs-fops.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -39,27 +30,47 @@
#include <libgen.h>
#include <semaphore.h>
+static int gf_auth_max_groups_nfs_log = 0;
+
void
nfs_fix_groups (xlator_t *this, call_stack_t *root)
{
struct passwd mypw;
char mystrs[1024];
struct passwd *result;
+#ifdef GF_DARWIN_HOST_OS
+ /* BSD/DARWIN does not correctly uses gid_t in getgrouplist */
+ int mygroups[GF_MAX_AUX_GROUPS];
+#else
gid_t mygroups[GF_MAX_AUX_GROUPS];
+#endif
int ngroups;
int i;
+ int max_groups;
struct nfs_state *priv = this->private;
const gid_list_t *agl;
- gid_list_t gl;
+ gid_list_t gl;
if (!priv->server_aux_gids) {
return;
}
- agl = gid_cache_lookup(&priv->gid_cache, root->uid);
+ /* RPC enforces the GF_AUTH_GLUSTERFS_MAX_GROUPS limit */
+ max_groups = GF_AUTH_GLUSTERFS_MAX_GROUPS(root->lk_owner.len);
+
+ agl = gid_cache_lookup(&priv->gid_cache, root->uid, 0, 0);
if (agl) {
- for (ngroups = 0; ngroups < agl->gl_count; ngroups++)
+ if (agl->gl_count > max_groups) {
+ GF_LOG_OCCASIONALLY (gf_auth_max_groups_nfs_log,
+ this->name, GF_LOG_WARNING,
+ "too many groups, reducing %d -> %d",
+ agl->gl_count, max_groups);
+ }
+
+ for (ngroups = 0; ngroups < agl->gl_count
+ && ngroups <= max_groups; ngroups++) {
root->groups[ngroups] = agl->gl_list[ngroups];
+ }
root->ngrps = ngroups;
gid_cache_release(&priv->gid_cache, agl);
return;
@@ -93,12 +104,24 @@ nfs_fix_groups (xlator_t *this, call_stack_t *root)
if (gl.gl_list) {
/* It's not fatal if the alloc failed. */
gl.gl_id = root->uid;
+ gl.gl_uid = 0;
+ gl.gl_gid = 0;
gl.gl_count = ngroups;
memcpy(gl.gl_list, mygroups, sizeof(gid_t) * ngroups);
if (gid_cache_add(&priv->gid_cache, &gl) != 1)
GF_FREE(gl.gl_list);
}
+ /* RPC enforces the GF_AUTH_GLUSTERFS_MAX_GROUPS limit */
+ if (ngroups > max_groups) {
+ GF_LOG_OCCASIONALLY (gf_auth_max_groups_nfs_log,
+ this->name, GF_LOG_WARNING,
+ "too many groups, reducing %d -> %d",
+ ngroups, max_groups);
+
+ ngroups = max_groups;
+ }
+
/* Copy data to the frame. */
for (i = 0; i < ngroups; ++i) {
gf_log (this->name, GF_LOG_TRACE,
@@ -191,6 +214,12 @@ nfs_create_frame (xlator_t *xl, nfs_user_t *nfu)
frame = create_frame (xl, (call_pool_t *)xl->ctx->pool);
if (!frame)
goto err;
+ if (call_stack_alloc_groups (frame->root, nfu->ngrps) != 0) {
+ STACK_DESTROY (frame->root);
+ frame = NULL;
+ goto err;
+ }
+
frame->root->pid = NFS_PID;
frame->root->uid = nfu->uid;
frame->root->gid = nfu->gids[NFS_PRIMGID_IDX];
@@ -318,6 +347,9 @@ nfs_gfid_dict (inode_t *inode)
uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
dyngfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
+ if (dyngfid == NULL)
+ return (NULL);
+
uuid_generate (newgfid);
if (uuid_compare (inode->gfid, rootgfid) == 0)
@@ -328,16 +360,17 @@ nfs_gfid_dict (inode_t *inode)
dictgfid = dict_new ();
if (!dictgfid) {
gf_log (GF_NFS, GF_LOG_ERROR, "Failed to create gfid dict");
- goto out;
+ GF_FREE (dyngfid);
+ return (NULL);
}
ret = dict_set_bin (dictgfid, "gfid-req", dyngfid, sizeof (uuid_t));
if (ret < 0) {
+ GF_FREE (dyngfid);
dict_unref (dictgfid);
- dictgfid = NULL;
+ return (NULL);
}
-out:
return dictgfid;
}
@@ -385,19 +418,6 @@ nfs_fop_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
struct nfs_fop_local *local = NULL;
fop_lookup_cbk_t progcbk;
- int32_t spb = 0;
-
- /*
- * With native protocol, self-heal failures would be detected during
- * open. NFS doesn't issue that open when revalidating cache, so we
- * have to check for failures here instead.
- */
- if (dict_get_int32(xattr, "split-brain", &spb) == 0) {
- if (spb) {
- op_ret = -1;
- op_errno = EIO;
- }
- }
if (op_ret == 0) {
nfs_fix_generation(this,inode);
diff --git a/xlators/nfs/server/src/nfs-fops.h b/xlators/nfs/server/src/nfs-fops.h
index d846e14de..44e99c66b 100644
--- a/xlators/nfs/server/src/nfs-fops.h
+++ b/xlators/nfs/server/src/nfs-fops.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_FOPS_H_
diff --git a/xlators/nfs/server/src/nfs-generics.c b/xlators/nfs/server/src/nfs-generics.c
index 7f79bba9b..cb32b7f1b 100644
--- a/xlators/nfs/server/src/nfs-generics.c
+++ b/xlators/nfs/server/src/nfs-generics.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
diff --git a/xlators/nfs/server/src/nfs-generics.h b/xlators/nfs/server/src/nfs-generics.h
index 34e203a62..01876d68e 100644
--- a/xlators/nfs/server/src/nfs-generics.h
+++ b/xlators/nfs/server/src/nfs-generics.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_GENERICS_H_
diff --git a/xlators/nfs/server/src/nfs-inodes.c b/xlators/nfs/server/src/nfs-inodes.c
index 291152f85..63d5e8a19 100644
--- a/xlators/nfs/server/src/nfs-inodes.c
+++ b/xlators/nfs/server/src/nfs-inodes.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -57,10 +48,10 @@ nfl_inodes_init (struct nfs_fop_local *nfl, inode_t *inode, inode_t *parent,
nfl->newparent = inode_ref (newparent);
if (name)
- strcpy (nfl->path, name);
+ strncpy (nfl->path, name, NFS_NAME_MAX);
if (newname)
- strcpy (nfl->newpath, newname);
+ strncpy (nfl->newpath, newname, NFS_NAME_MAX);
return;
}
diff --git a/xlators/nfs/server/src/nfs-inodes.h b/xlators/nfs/server/src/nfs-inodes.h
index 7c962b339..ba7a57124 100644
--- a/xlators/nfs/server/src/nfs-inodes.h
+++ b/xlators/nfs/server/src/nfs-inodes.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_INODES_H_
diff --git a/xlators/nfs/server/src/nfs-mem-types.h b/xlators/nfs/server/src/nfs-mem-types.h
index 005598c1f..450b6f2fe 100644
--- a/xlators/nfs/server/src/nfs-mem-types.h
+++ b/xlators/nfs/server/src/nfs-mem-types.h
@@ -2,19 +2,10 @@
Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -54,6 +45,8 @@ enum gf_nfs_mem_types_ {
gf_nfs_mt_nlm4_share,
gf_nfs_mt_aux_gids,
gf_nfs_mt_inode_ctx,
+ gf_nfs_mt_auth_spec,
+ gf_nfs_mt_arr,
gf_nfs_mt_end
};
#endif
diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c
index 466393e34..918e86312 100644
--- a/xlators/nfs/server/src/nfs.c
+++ b/xlators/nfs/server/src/nfs.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
/* This is the primary translator source for NFS.
@@ -43,9 +34,209 @@
#include "nlm4.h"
#include "options.h"
#include "acl3.h"
+#include "rpc-drc.h"
+#include "syscall.h"
+
+#define STRINGIFY(val) #val
+#define TOSTRING(val) STRINGIFY(val)
#define OPT_SERVER_AUX_GIDS "nfs.server-aux-gids"
#define OPT_SERVER_GID_CACHE_TIMEOUT "nfs.server.aux-gid-timeout"
+#define OPT_SERVER_RPC_STATD "nfs.rpc-statd"
+#define OPT_SERVER_RPC_STATD_PIDFILE "nfs.rpc-statd-pidfile"
+#define OPT_SERVER_RPC_STATD_NOTIFY_PIDFILE "nfs.rpc-statd-notify-pidfile"
+
+/* TODO: DATADIR should be based on configure's $(localstatedir) */
+#define DATADIR "/var/lib/glusterd"
+#define NFS_DATADIR DATADIR "/nfs"
+
+/* Forward declaration */
+int nfs_add_initer (struct list_head *list, nfs_version_initer_t init);
+
+/*
+ * nfs_init_version - add a single protocol initializer and start its program.
+ *
+ * @this: NFS xlator; this->private is used as the struct nfs_state.
+ * @init: initializer of the program to start (e.g. acl3svc_init).
+ *
+ * Adds @init to nfs->versions through nfs_add_initer(), walks the list to
+ * find the entry carrying @init, calls @init to obtain the rpcsvc program,
+ * registers that program with the RPC service and, when
+ * nfs->register_portmap is set, with the portmapper too.
+ *
+ * Returns -1 on bad arguments or on any failure, 0 once the portmap
+ * registration succeeded.  When portmap registration is disabled the
+ * (non -1) result of rpcsvc_program_register() is returned unchanged.
+ */
+static int
+nfs_init_version (xlator_t *this, nfs_version_initer_t init)
+{
+        int                     ret      = -1;
+        struct nfs_initer_list *version  = NULL;
+        struct nfs_initer_list *tmp      = NULL;
+        rpcsvc_program_t       *prog     = NULL;
+        struct list_head       *versions = NULL;
+        struct nfs_state       *nfs      = NULL;
+        gf_boolean_t            found    = _gf_false;
+
+        if ((!this) || (!this->private) || (!init))
+                return (-1);
+
+        nfs = (struct nfs_state *)this->private;
+
+        ret = nfs_add_initer (&nfs->versions, init);
+        if (ret == -1) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "Failed to add protocol initializer");
+                goto err;
+        }
+
+        versions = &nfs->versions;
+        list_for_each_entry_safe (version, tmp, versions, list) {
+                prog = version->program;
+                if (version->init == init) {
+                        prog = init(this);
+                        if (!prog) {
+                                ret = -1;
+                                goto err;
+                        }
+                        version->program = prog;
+                        found = _gf_true;
+                        break;
+                }
+        }
+
+        /* program not added */
+        if (!found) {
+                /* prog is whatever the last visited entry held (NULL for an
+                 * empty list or a not yet initialised entry), so it must
+                 * not be dereferenced blindly. */
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "Program: %s NOT found",
+                        (prog ? prog->progname : "(null)"));
+                /* ret still holds the success value of nfs_add_initer() */
+                ret = -1;
+                goto err;
+        }
+
+        /* Check if nfs.port is configured */
+        if (nfs->override_portnum)
+                prog->progport = nfs->override_portnum;
+
+        gf_log (GF_NFS, GF_LOG_DEBUG, "Starting program: %s", prog->progname);
+
+        ret = rpcsvc_program_register (nfs->rpcsvc, prog);
+        if (ret == -1) {
+                gf_log (GF_NFS, GF_LOG_ERROR, "Program: %s init failed",
+                        prog->progname);
+                goto err;
+        }
+
+        /* Registration with portmapper is disabled, Nothing to do */
+        if (!nfs->register_portmap)
+                goto err;
+
+        ret = rpcsvc_program_register_portmap (prog, prog->progport);
+        if (ret == -1) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "Program %s registration failed",
+                        prog->progname);
+                goto err;
+        }
+        ret = 0; /* All well */
+err:
+        return ret;
+}
+
+/*
+ * nfs_deinit_version - stop the program that was started by @init.
+ *
+ * Looks for the nfs->versions entry whose initializer is @init,
+ * unregisters its program from the RPC service and, if that worked,
+ * unlinks and frees the entry.
+ *
+ * Returns 0 when the entry was found and removed; -1 on bad arguments,
+ * when unregistering fails, or when no entry carries @init.
+ */
+static int
+nfs_deinit_version (struct nfs_state *nfs, nfs_version_initer_t init)
+{
+        struct nfs_initer_list *entry = NULL;
+        struct nfs_initer_list *next  = NULL;
+        struct list_head       *head  = NULL;
+
+        if (nfs == NULL || init == NULL)
+                return (-1);
+
+        head = &nfs->versions;
+        list_for_each_entry_safe (entry, next, head, list) {
+                if (entry->init != init)
+                        continue;
+
+                /* Keep the entry on the list if the RPC layer refuses */
+                if (rpcsvc_program_unregister (nfs->rpcsvc,
+                                               entry->program) != 0)
+                        return (-1);
+
+                list_del (&entry->list);
+                GF_FREE (entry);
+                return (0);
+        }
+
+        return (-1);
+}
+
+/*
+ * Bring the ACL3 program in line with nfs->enable_acl.
+ *
+ * Returns -1 on bad arguments, otherwise the result of
+ * nfs_init_version() (ACL enabled) or nfs_deinit_version() (ACL disabled).
+ */
+static int
+nfs_reconfigure_acl3 (xlator_t *this)
+{
+        struct nfs_state *state = NULL;
+
+        if (this == NULL || this->private == NULL)
+                return (-1);
+
+        state = (struct nfs_state *)this->private;
+
+        if (!state->enable_acl) {
+                /* ACL is disabled */
+                return nfs_deinit_version (state, acl3svc_init);
+        }
+
+        /* ACL is enabled */
+        return nfs_init_version (this, acl3svc_init);
+}
+
+/*
+ * Bring the NLM4 program in line with nfs->enable_nlm.
+ *
+ * Returns -1 on bad arguments, otherwise the result of
+ * nfs_init_version() (NLM enabled) or nfs_deinit_version() (NLM disabled).
+ */
+static int
+nfs_reconfigure_nlm4 (xlator_t *this)
+{
+        struct nfs_state *state = NULL;
+
+        if (this == NULL || this->private == NULL)
+                return (-1);
+
+        state = (struct nfs_state *)this->private;
+
+        if (!state->enable_nlm) {
+                /* NLM is disabled */
+                return nfs_deinit_version (state, nlm4svc_init);
+        }
+
+        /* NLM is enabled */
+        return nfs_init_version (this, nlm4svc_init);
+}
+
+/*
+ * Register every initialised program on nfs->versions with the portmapper.
+ *
+ * Entries without a program are skipped.  A configured
+ * nfs->override_portnum replaces the program's port before registering.
+ * Individual registration results are deliberately ignored.
+ *
+ * Returns -1 if @nfs is NULL, 0 otherwise.
+ */
+static int
+nfs_program_register_portmap_all (struct nfs_state *nfs)
+{
+        struct nfs_initer_list *entry   = NULL;
+        struct nfs_initer_list *next    = NULL;
+        struct list_head       *head    = NULL;
+        rpcsvc_program_t       *program = NULL;
+
+        if (!nfs)
+                return (-1);
+
+        head = &nfs->versions;
+        list_for_each_entry_safe (entry, next, head, list) {
+                program = entry->program;
+                if (program != NULL) {
+                        if (nfs->override_portnum)
+                                program->progport = nfs->override_portnum;
+                        (void) rpcsvc_program_register_portmap (program,
+                                                        program->progport);
+                }
+        }
+
+        return (0);
+}
+
+/*
+ * Remove every initialised program on nfs->versions from the portmapper.
+ *
+ * Entries without a program are skipped and individual results are
+ * deliberately ignored.
+ *
+ * Returns -1 if @nfs is NULL, 0 otherwise.
+ */
+static int
+nfs_program_unregister_portmap_all (struct nfs_state *nfs)
+{
+        struct nfs_initer_list *entry   = NULL;
+        struct nfs_initer_list *next    = NULL;
+        struct list_head       *head    = NULL;
+        rpcsvc_program_t       *program = NULL;
+
+        if (!nfs)
+                return (-1);
+
+        head = &nfs->versions;
+        list_for_each_entry_safe (entry, next, head, list) {
+                program = entry->program;
+                if (program != NULL)
+                        (void) rpcsvc_program_unregister_portmap (program);
+        }
+
+        return (0);
+}
/* Every NFS version must call this function with the init function
* for its particular version.
@@ -123,7 +314,7 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this)
ret = -1;
goto err;
}
-// prog->actorxl = this;
+
version->program = prog;
if (nfs->override_portnum)
prog->progport = nfs->override_portnum;
@@ -132,17 +323,21 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this)
ret = rpcsvc_program_register (nfs->rpcsvc, prog);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Program init failed");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Program: %s init failed",
+ prog->progname);
goto err;
}
- if (rpcsvc_register_portmap_enabled(nfs->rpcsvc)) {
+ if (nfs->register_portmap) {
ret = rpcsvc_program_register_portmap (prog,
prog->progport);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Program registration failed");
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Program %s registration failed",
+ prog->progname);
goto err;
}
}
+
}
ret = 0;
@@ -159,22 +354,22 @@ nfs_add_all_initiators (struct nfs_state *nfs)
/* Add the initializers for all versions. */
ret = nfs_add_initer (&nfs->versions, mnt3svc_init);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol"
- " initializer");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "MOUNT3 protocol initializer");
goto ret;
}
ret = nfs_add_initer (&nfs->versions, mnt1svc_init);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol"
- " initializer");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "MOUNT1 protocol initializer");
goto ret;
}
ret = nfs_add_initer (&nfs->versions, nfs3svc_init);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol"
- " initializer");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "NFS3 protocol initializer");
goto ret;
}
@@ -187,11 +382,13 @@ nfs_add_all_initiators (struct nfs_state *nfs)
}
}
- ret = nfs_add_initer (&nfs->versions, acl3svc_init);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol"
- " initializer");
- goto ret;
+ if (nfs->enable_acl == _gf_true) {
+ ret = nfs_add_initer (&nfs->versions, acl3svc_init);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "ACL protocol initializer");
+ goto ret;
+ }
}
ret = 0;
@@ -527,10 +724,11 @@ nfs_init_state (xlator_t *this)
if (!this)
return NULL;
- if ((!this->children) || (!this->children->xlator)) {
- gf_log (GF_NFS, GF_LOG_ERROR, "nfs must have at least one"
- " child subvolume");
- return NULL;
+ if (!this->children) {
+ gf_log (GF_NFS, GF_LOG_INFO,
+ "NFS is manually disabled: Exiting");
+ /* Nothing for nfs process to do, exit cleanly */
+ kill (getpid (), SIGTERM);
}
nfs = GF_CALLOC (1, sizeof (*nfs), gf_nfs_mt_nfs_state);
@@ -586,19 +784,17 @@ nfs_init_state (xlator_t *this)
}
nfs->enable_nlm = _gf_true;
- if (!dict_get_str (this->options, "nfs.nlm", &optstr)) {
-
- ret = gf_string2boolean (optstr, &boolt);
- if (ret < 0) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse"
- " bool string");
- goto free_foppool;
- }
+ ret = dict_get_str_boolean (this->options, "nfs.nlm", _gf_true);
+ if (ret == _gf_false) {
+ gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually disabled");
+ nfs->enable_nlm = _gf_false;
+ }
- if (boolt == _gf_false) {
- gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually disabled");
- nfs->enable_nlm = _gf_false;
- }
+ nfs->enable_acl = _gf_true;
+ ret = dict_get_str_boolean (this->options, "nfs.acl", _gf_true);
+ if (ret == _gf_false) {
+ gf_log (GF_NFS, GF_LOG_INFO, "ACL is manually disabled");
+ nfs->enable_acl = _gf_false;
}
nfs->enable_ino32 = 0;
@@ -682,6 +878,15 @@ nfs_init_state (xlator_t *this)
nfs->mount_udp = 1;
}
+ nfs->rmtab = gf_strdup (NFS_DATADIR "/rmtab");
+ if (dict_get(this->options, "nfs.mount-rmtab")) {
+ ret = dict_get_str (this->options, "nfs.mount-rmtab", &nfs->rmtab);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse dict");
+ goto free_foppool;
+ }
+ }
+
/* support both options rpc-auth.ports.insecure and
* rpc-auth-allow-insecure for backward compatibility
*/
@@ -741,30 +946,54 @@ nfs_init_state (xlator_t *this)
goto free_foppool;
}
}
+ GF_OPTION_INIT (OPT_SERVER_RPC_STATD, nfs->rpc_statd, path, free_foppool);
+
+ GF_OPTION_INIT (OPT_SERVER_RPC_STATD_PIDFILE, nfs->rpc_statd_pid_file, path, free_foppool);
GF_OPTION_INIT (OPT_SERVER_AUX_GIDS, nfs->server_aux_gids,
bool, free_foppool);
- GF_OPTION_INIT (OPT_SERVER_GID_CACHE_TIMEOUT, nfs->server_aux_gids_max_age,
+ GF_OPTION_INIT (OPT_SERVER_GID_CACHE_TIMEOUT,
+ nfs->server_aux_gids_max_age,
uint32, free_foppool);
- if (gid_cache_init(&nfs->gid_cache, nfs->server_aux_gids_max_age) < 0) {
- gf_log(GF_NFS, GF_LOG_ERROR, "Failed to initialize group cache.");
- goto free_foppool;
- }
+ if (gid_cache_init(&nfs->gid_cache, nfs->server_aux_gids_max_age) < 0) {
+ gf_log(GF_NFS, GF_LOG_ERROR, "Failed to initialize group cache.");
+ goto free_foppool;
+ }
- if (stat("/sbin/rpc.statd", &stbuf) == -1) {
- gf_log (GF_NFS, GF_LOG_WARNING, "/sbin/rpc.statd not found. "
- "Disabling NLM");
+ ret = sys_access (nfs->rpc_statd, X_OK);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_WARNING, "%s not enough permissions to"
+ " access. Disabling NLM", nfs->rpc_statd);
nfs->enable_nlm = _gf_false;
}
- nfs->rpcsvc = rpcsvc_init (this, this->ctx, this->options, 0);
+ ret = sys_stat (nfs->rpc_statd, &stbuf);
+ if (ret || !S_ISREG (stbuf.st_mode)) {
+ gf_log (GF_NFS, GF_LOG_WARNING, "%s not a regular file."
+ " Disabling NLM", nfs->rpc_statd);
+ nfs->enable_nlm = _gf_false;
+ }
+
+ nfs->rpcsvc = rpcsvc_init (this, this->ctx,
+ this->options, fopspoolsize);
if (!nfs->rpcsvc) {
ret = -1;
gf_log (GF_NFS, GF_LOG_ERROR, "RPC service init failed");
goto free_foppool;
}
+ ret = rpcsvc_set_outstanding_rpc_limit (nfs->rpcsvc,
+ this->options,
+ RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Failed to configure outstanding-rpc-limit");
+ goto free_foppool;
+ }
+
+ nfs->register_portmap = rpcsvc_register_portmap_enabled (nfs->rpcsvc);
+
this->private = (void *)nfs;
INIT_LIST_HEAD (&nfs->versions);
nfs->generation = 1965;
@@ -786,7 +1015,293 @@ free_rpcsvc:
return nfs;
}
+/*
+ * nfs_drc_init - initialise DRC on the NFS server's RPC service.
+ *
+ * Hands this->options to rpcsvc_drc_init() for the rpcsvc stored in the
+ * xlator's private nfs_state.
+ *
+ * Returns the result of rpcsvc_drc_init(), or -1 when @this, its private
+ * data or the rpcsvc is missing.
+ */
+int
+nfs_drc_init (xlator_t *this)
+{
+        int               ret = -1;
+        struct nfs_state *nfs = NULL;
+
+        GF_VALIDATE_OR_GOTO (GF_NFS, this, out);
+        GF_VALIDATE_OR_GOTO (GF_NFS, this->private, out);
+
+        nfs = (struct nfs_state *)(this->private);
+        if (nfs->rpcsvc)
+                ret = rpcsvc_drc_init (nfs->rpcsvc, this->options);
+
+out:
+        return ret;
+}
+
+/*
+ * nfs_reconfigure_state - apply changed NFS-wide options at runtime.
+ *
+ * @this:    NFS xlator; this->private is used as the struct nfs_state.
+ * @options: the new option dictionary.
+ *
+ * Options that cannot be changed without restarting the NFS server
+ * (nfs.port, nfs.transport-type, nfs.mem-factor, nfs.rpc-statd) are only
+ * detected and logged: the function then stops early, leaves all remaining
+ * options untouched and returns the current value of ret, which has not
+ * been set to an error at those points.
+ *
+ * The options handled afterwards are nfs.mount-rmtab, the aux-gid
+ * settings, nfs.dynamic-volumes, nfs.enable-ino32, nfs.nlm and nfs.acl.
+ *
+ * Returns 0 when the options were processed, a negative value when
+ * reading one of the options failed.
+ */
+int
+nfs_reconfigure_state (xlator_t *this, dict_t *options)
+{
+        int                 ret = 0;
+        int                 keyindx = 0;
+        char               *rmtab = NULL;
+        char               *rpc_statd = NULL;
+        gf_boolean_t        optbool;
+        uint32_t            optuint32;
+        struct nfs_state   *nfs = NULL;
+        char               *blacklist_keys[] = {
+                                        "nfs.port",
+                                        "nfs.transport-type",
+                                        "nfs.mem-factor",
+                                        NULL};
+
+        GF_VALIDATE_OR_GOTO (GF_NFS, this, out);
+        GF_VALIDATE_OR_GOTO (GF_NFS, this->private, out);
+        GF_VALIDATE_OR_GOTO (GF_NFS, options, out);
+
+        nfs = (struct nfs_state *)this->private;
+
+        /* Black listed options can't be reconfigured, they need
+         * NFS to be restarted. There are two cases 1. SET 2. UNSET.
+         * 1. SET */
+        while (blacklist_keys[keyindx]) {
+                if (dict_get (options, blacklist_keys[keyindx])) {
+                        gf_log (GF_NFS, GF_LOG_ERROR,
+                                "Reconfiguring %s needs NFS restart",
+                                blacklist_keys[keyindx]);
+                        goto out;
+                }
+                keyindx ++;
+        }
+
+        /* UNSET for nfs.mem-factor */
+        if ((!dict_get (options, "nfs.mem-factor")) &&
+            (nfs->memfactor != GF_NFS_DEFAULT_MEMFACTOR)) {
+                gf_log (GF_NFS, GF_LOG_INFO,
+                        "Reconfiguring nfs.mem-factor needs NFS restart");
+                goto out;
+        }
+
+        /* UNSET for nfs.port */
+        if ((!dict_get (options, "nfs.port")) &&
+            (nfs->override_portnum)) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "Reconfiguring nfs.port needs NFS restart");
+                goto out;
+        }
+
+        /* reconfig nfs.rpc-statd...
+         * The presence check has to use the very key that is read below;
+         * checking the pidfile option instead would fail the lookup when
+         * only the pidfile is set and would ignore a configured statd
+         * path when the pidfile is not set. */
+        rpc_statd = GF_RPC_STATD_PROG;
+        if (dict_get (options, OPT_SERVER_RPC_STATD)) {
+                ret = dict_get_str (options, OPT_SERVER_RPC_STATD,
+                                    &rpc_statd);
+                if (ret < 0) {
+                        gf_log (GF_NFS, GF_LOG_ERROR, "Failed to read "
+                                "reconfigured option: nfs.rpc-statd");
+                        goto out;
+                }
+        }
+
+        if (strcmp(nfs->rpc_statd, rpc_statd) != 0) {
+                gf_log (GF_NFS, GF_LOG_INFO,
+                        "Reconfiguring nfs.rpc-statd needs NFS restart");
+                goto out;
+        }
+
+        /* reconfig nfs.mount-rmtab */
+        rmtab = NFS_DATADIR "/rmtab";
+        if (dict_get (options, "nfs.mount-rmtab")) {
+                ret = dict_get_str (options, "nfs.mount-rmtab", &rmtab);
+                if (ret < 0) {
+                        gf_log (GF_NFS, GF_LOG_ERROR, "Failed to read "
+                                "reconfigured option: nfs.mount-rmtab");
+                        goto out;
+                }
+                gf_path_strip_trailing_slashes (rmtab);
+        }
+        if (strcmp (nfs->rmtab, rmtab) != 0) {
+                mount_rewrite_rmtab (nfs->mstate, rmtab);
+                gf_log (GF_NFS, GF_LOG_INFO,
+                        "Reconfigured nfs.mount-rmtab path: %s",
+                        nfs->rmtab);
+        }
+
+        GF_OPTION_RECONF (OPT_SERVER_AUX_GIDS, optbool,
+                                               options, bool, out);
+        if (nfs->server_aux_gids != optbool) {
+                nfs->server_aux_gids = optbool;
+                gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured %s with value %d",
+                        OPT_SERVER_AUX_GIDS, optbool);
+        }
+
+        GF_OPTION_RECONF (OPT_SERVER_GID_CACHE_TIMEOUT, optuint32,
+                                               options, uint32, out);
+        if (nfs->server_aux_gids_max_age != optuint32) {
+                nfs->server_aux_gids_max_age = optuint32;
+                gid_cache_reconf (&nfs->gid_cache, optuint32);
+                /* optuint32 is unsigned, so print it as such */
+                gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured %s with value %u",
+                        OPT_SERVER_GID_CACHE_TIMEOUT, optuint32);
+        }
+
+        /* reconfig nfs.dynamic-volumes */
+        ret = dict_get_str_boolean (options, "nfs.dynamic-volumes",
+                                             GF_NFS_DVM_OFF);
+        switch (ret) {
+        case GF_NFS_DVM_ON:
+        case GF_NFS_DVM_OFF:
+                optbool = ret;
+                break;
+        default:
+                /* any other result falls back to "off" */
+                optbool = GF_NFS_DVM_OFF;
+                break;
+        }
+        if (nfs->dynamicvolumes != optbool) {
+                nfs->dynamicvolumes = optbool;
+                gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured nfs.dynamic-volumes"
+                        " with value %d", optbool);
+        }
+
+        optbool = _gf_false;
+        if (dict_get (options, "nfs.enable-ino32")) {
+                ret = dict_get_str_boolean (options, "nfs.enable-ino32",
+                                                      _gf_false);
+                if (ret < 0) {
+                        gf_log (GF_NFS, GF_LOG_ERROR,
+                                "Failed to read reconfigured option: "
+                                "nfs.enable-ino32");
+                        goto out;
+                }
+                optbool = ret;
+        }
+        if (nfs->enable_ino32 != optbool) {
+                nfs->enable_ino32 = optbool;
+                gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured nfs.enable-ino32"
+                        " with value %d", optbool);
+        }
+
+        /* nfs.nlm is enabled by default */
+        ret = dict_get_str_boolean (options, "nfs.nlm", _gf_true);
+        if (ret < 0) {
+                optbool = _gf_true;
+        } else {
+                optbool = ret;
+        }
+        if (nfs->enable_nlm != optbool) {
+                gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually %s",
+                        (optbool ? "enabled":"disabled"));
+                nfs->enable_nlm = optbool;
+                /* start or stop the NLM4 program; result is not checked */
+                nfs_reconfigure_nlm4 (this);
+        }
+
+        /* nfs.acl is enabled by default */
+        ret = dict_get_str_boolean (options, "nfs.acl", _gf_true);
+        if (ret < 0) {
+                optbool = _gf_true;
+        } else {
+                optbool = ret;
+        }
+        if (nfs->enable_acl != optbool) {
+                gf_log (GF_NFS, GF_LOG_INFO, "ACL is manually %s",
+                        (optbool ? "enabled":"disabled"));
+                nfs->enable_acl = optbool;
+                /* start or stop the ACL3 program; result is not checked */
+                nfs_reconfigure_acl3 (this);
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+
+/*
+ * reconfigure() entry point of the NFS server xlator.
+ *
+ * Runs the reconfigure steps in a fixed order - NFS state, NFS3 state,
+ * mount state, rpcsvc options, outstanding-rpc-limit, portmap
+ * registration, DRC - and stops at the first step that reports an error.
+ *
+ * Returns 0 when every step succeeded, -1 on bad arguments or as soon as
+ * one step fails.
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        int               rc        = 0;
+        struct nfs_state *priv      = NULL;
+        gf_boolean_t      want_pmap = _gf_true;
+
+        if (this == NULL || this->private == NULL || options == NULL)
+                return (-1);
+
+        priv = (struct nfs_state *)this->private;
+
+        /* NFS-wide options */
+        rc = nfs_reconfigure_state(this, options);
+        if (rc != 0) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "nfs reconfigure state failed");
+                return (-1);
+        }
+
+        /* NFS3 options */
+        rc = nfs3_reconfigure_state(this, options);
+        if (rc != 0) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "nfs3 reconfigure state failed");
+                return (-1);
+        }
+
+        /* Mount options */
+        rc = mount_reconfigure_state(this, options);
+        if (rc != 0) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "mount reconfigure state failed");
+                return (-1);
+        }
+
+        /* RPC layer */
+        rc = rpcsvc_reconfigure_options (priv->rpcsvc, options);
+        if (rc != 0) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "rpcsvc reconfigure options failed");
+                return (-1);
+        }
+
+        /* rpc.outstanding-rpc-limit: only a negative result is an error */
+        rc = rpcsvc_set_outstanding_rpc_limit (priv->rpcsvc, options,
+                                        RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT);
+        if (rc < 0) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "Failed to reconfigure outstanding-rpc-limit");
+                return (-1);
+        }
+
+        /* Follow a change of the portmap registration setting; the
+         * results of the bulk (un)registration are ignored. */
+        want_pmap = rpcsvc_register_portmap_enabled(priv->rpcsvc);
+        if (priv->register_portmap != want_pmap) {
+                priv->register_portmap = want_pmap;
+                if (!want_pmap)
+                        (void) nfs_program_unregister_portmap_all (priv);
+                else
+                        (void) nfs_program_register_portmap_all (priv);
+        }
+
+        /* DRC */
+        rc = rpcsvc_drc_reconfigure (priv->rpcsvc, options);
+        if (rc != 0) {
+                gf_log (GF_NFS, GF_LOG_ERROR,
+                        "rpcsvc DRC reconfigure failed");
+                return (-1);
+        }
+
+        return (0);
+}
+/* Main init() routine for NFS server xlator. It inits NFS v3 protocol
+ * and its dependent protocols e.g. ACL, MOUNT v3 (mount3), NLM and
+ * DRC.
+ *
+ * Usage: glusterfsd:
+ * glusterfs_process_volfp() =>
+ * glusterfs_graph_activate() =>
+ * glusterfs_graph_init() =>
+ * xlator_init () => NFS init() routine
+ *
+ * If init() routine fails, the glusterfsd cleans up the NFS process
+ * by invoking cleanup_and_exit().
+ *
+ * RETURN:
+ * 0 (SUCCESS) if all protocol specific inits PASS.
+ * -1 (FAILURE) if any of them FAILS.
+ */
int
init (xlator_t *this) {
@@ -794,57 +1309,52 @@ init (xlator_t *this) {
int ret = -1;
if (!this)
- return -1;
+ return (-1);
nfs = nfs_init_state (this);
if (!nfs) {
gf_log (GF_NFS, GF_LOG_ERROR, "Failed to init nfs option");
- return -1;
+ return (-1);
}
ret = nfs_add_all_initiators (nfs);
- if (ret == -1) {
+ if (ret) {
gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add initiators");
- goto err;
+ return (-1);
}
ret = nfs_init_subvolumes (nfs, this->children);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NFS "
- "exports");
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NFS exports");
+ return (-1);
}
ret = mount_init_state (this);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init Mount"
- "state");
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init Mount state");
+ return (-1);
}
ret = nlm4_init_state (this);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NLM"
- "state");
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NLM state");
+ return (-1);
}
ret = nfs_init_versions (nfs, this);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to initialize "
- "protocols");
- /* Do not return an error on this. If we dont return
- * an error, the process keeps running and it helps
- * to point out where the log is by doing ps ax|grep gluster.
- */
- ret = 0;
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to initialize protocols");
+ return (-1);
}
- gf_log (GF_NFS, GF_LOG_INFO, "NFS service started");
-err:
+ ret = nfs_drc_init (this);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to initialize DRC");
+ return (-1);
+ }
- return ret;
+ gf_log (GF_NFS, GF_LOG_INFO, "NFS service started");
+ return (0); /* SUCCESS */
}
@@ -996,7 +1506,22 @@ nlm_priv (xlator_t *this);
int32_t
nfs_priv (xlator_t *this)
{
- return nlm_priv (this);
+ int32_t ret = -1;
+
+ /* DRC needs the global drc structure, xl is of no use to it. */
+ ret = rpcsvc_drc_priv (((struct nfs_state *)(this->private))->rpcsvc->drc);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Statedump of DRC failed");
+ goto out;
+ }
+
+ ret = nlm_priv (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Statedump of NLM failed");
+ goto out;
+ }
+ out:
+ return ret;
}
@@ -1019,29 +1544,53 @@ struct xlator_dumpops dumpops = {
struct volume_options options[] = {
{ .key = {"nfs3.read-size"},
.type = GF_OPTION_TYPE_SIZET,
- .description = "Size in which the client should issue read requests"
- " to the Gluster NFSv3 server. Must be a multiple of"
- " 4KB."
+ .min = GF_NFS3_RTMIN,
+ .max = GF_NFS3_RTMAX,
+ .default_value = TOSTRING(GF_NFS3_RTPREF),
+ .description = "Size in which the client should issue read requests "
+ "to the Gluster NFSv3 server. Must be a multiple of "
+ "4KB (4096). Min and Max supported values are 4KB "
+ "(4096) and 1MB (1048576) respectively. If the "
+ "specified value is within the supported range but "
+ "not a multiple of 4096, it is rounded up to the "
+ "nearest multiple of 4096."
},
{ .key = {"nfs3.write-size"},
.type = GF_OPTION_TYPE_SIZET,
- .description = "Size in which the client should issue write requests"
- " to the Gluster NFSv3 server. Must be a multiple of"
- " 4KB."
+ .min = GF_NFS3_WTMIN,
+ .max = GF_NFS3_WTMAX,
+ .default_value = TOSTRING(GF_NFS3_WTPREF),
+ .description = "Size in which the client should issue write requests "
+ "to the Gluster NFSv3 server. Must be a multiple of "
+ "1KB (1024). Min and Max supported values are "
+ "4KB (4096) and 1MB (1048576) respectively. If the "
+ "specified value is within the supported range but "
+ "not a multiple of 4096, it is rounded up to the "
+ "nearest multiple of 4096."
},
{ .key = {"nfs3.readdir-size"},
.type = GF_OPTION_TYPE_SIZET,
+ .min = GF_NFS3_DTMIN,
+ .max = GF_NFS3_DTMAX,
+ .default_value = TOSTRING(GF_NFS3_DTPREF),
.description = "Size in which the client should issue directory "
- " reading requests."
+ "reading requests to the Gluster NFSv3 server. Must "
+ "be a multiple of 1KB (1024). Min and Max supported "
+ "values are 4KB (4096) and 1MB (1048576) respectively. "
+ "If the specified value is within the supported range "
+ "but not a multiple of 4096, it is rounded up to the "
+ "nearest multiple of 4096."
},
{ .key = {"nfs3.*.volume-access"},
.type = GF_OPTION_TYPE_STR,
.value = {"read-only", "read-write"},
+ .default_value = "read-write",
.description = "Type of access desired for this subvolume: "
" read-only, read-write(default)"
},
{ .key = {"nfs3.*.trusted-write"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "On an UNSTABLE write from client, return STABLE flag"
" to force client to not send a COMMIT request. In "
"some environments, combined with a replicated "
@@ -1056,6 +1605,7 @@ struct volume_options options[] = {
},
{ .key = {"nfs3.*.trusted-sync"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "All writes and COMMIT requests are treated as async."
" This implies that no write requests are guaranteed"
" to be on server disks when the write reply is "
@@ -1065,25 +1615,40 @@ struct volume_options options[] = {
},
{ .key = {"nfs3.*.export-dir"},
.type = GF_OPTION_TYPE_PATH,
+ .default_value = "",
.description = "By default, all subvolumes of nfs are exported as "
"individual exports. There are cases where a "
"subdirectory or subdirectories in the volume need to "
"be exported separately. This option can also be used "
"in conjunction with nfs3.export-volumes option to "
"restrict exports only to the subdirectories specified"
- " through this option. Must be an absolute path."
+ " through this option. Must be an absolute path. Along"
+ " with the path, a list of allowed IPs/hostnames can be "
+ "associated with each subdirectory. If provided, "
+ "connections will be allowed only from these IPs. By "
+ "default, connections from all IPs are allowed. "
+ "Format: <dir>[(hostspec[|hostspec|...])][,...]. Where"
+ " hostspec can be an IP address, hostname or an IP "
+ "range in CIDR notation. "
+ "e.g. /foo(192.168.1.0/24|host1|10.1.1.8),/host2."
+ " NOTE: Care must be taken while configuring this "
+ "option as invalid entries and/or unreachable DNS "
+ "servers can introduce unwanted delay in all the mount"
+ " calls."
},
{ .key = {"nfs3.export-dirs"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "By default, all subvolumes of nfs are exported as "
"individual exports. There are cases where a "
"subdirectory or subdirectories in the volume need to "
"be exported separately. Enabling this option allows "
"any directory on a volumes to be exported separately."
- " Directory exports are enabled by default."
+ "Directory exports are enabled by default."
},
{ .key = {"nfs3.export-volumes"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Enable or disable exporting whole volumes, instead "
"if used in conjunction with nfs3.export-dir, can "
"allow setting up only subdirectories as exports. On "
@@ -1091,37 +1656,42 @@ struct volume_options options[] = {
},
{ .key = {"rpc-auth.auth-unix"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_UNIX authentication type."
- "Must always be enabled for better interoperability."
- "However, can be disabled if needed. Enabled by"
+ "Must always be enabled for better interoperability. "
+ "However, can be disabled if needed. Enabled by "
"default"
},
{ .key = {"rpc-auth.auth-null"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_NULL authentication type."
"Must always be enabled. This option is here only to"
" avoid unrecognized option warnings"
},
{ .key = {"rpc-auth.auth-unix.*"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_UNIX authentication type "
"for a particular exported volume overriding defaults"
" and general setting for AUTH_UNIX scheme. Must "
- "always be enabled for better interoperability."
- "However, can be disabled if needed. Enabled by"
+ "always be enabled for better interoperability. "
+ "However, can be disabled if needed. Enabled by "
"default."
},
{ .key = {"rpc-auth.auth-unix.*.allow"},
.type = GF_OPTION_TYPE_STR,
+ .default_value = "on",
.description = "Disable or enable the AUTH_UNIX authentication type "
"for a particular exported volume overriding defaults"
" and general setting for AUTH_UNIX scheme. Must "
- "always be enabled for better interoperability."
- "However, can be disabled if needed. Enabled by"
+ "always be enabled for better interoperability. "
+ "However, can be disabled if needed. Enabled by "
"default."
},
{ .key = {"rpc-auth.auth-null.*"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_NULL authentication type "
"for a particular exported volume overriding defaults"
" and general setting for AUTH_NULL. Must always be "
@@ -1129,35 +1699,40 @@ struct volume_options options[] = {
"unrecognized option warnings."
},
{ .key = {"rpc-auth.addr.allow"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "all",
.description = "Allow a comma separated list of addresses and/or"
" hostnames to connect to the server. By default, all"
" connections are allowed. This allows users to "
"define a general rule for all exported volumes."
},
{ .key = {"rpc-auth.addr.reject"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "none",
.description = "Reject a comma separated list of addresses and/or"
" hostnames from connecting to the server. By default,"
" all connections are allowed. This allows users to"
"define a general rule for all exported volumes."
},
{ .key = {"rpc-auth.addr.*.allow"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "all",
.description = "Allow a comma separated list of addresses and/or"
" hostnames to connect to the server. By default, all"
" connections are allowed. This allows users to "
"define a rule for a specific exported volume."
},
{ .key = {"rpc-auth.addr.*.reject"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "none",
.description = "Reject a comma separated list of addresses and/or"
" hostnames from connecting to the server. By default,"
- " all connections are allowed. This allows users to"
+ " all connections are allowed. This allows users to "
"define a rule for a specific exported volume."
},
{ .key = {"rpc-auth.ports.insecure"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Allow client connections from unprivileged ports. By "
"default only privileged ports are allowed. This is a"
"global setting in case insecure ports are to be "
@@ -1165,31 +1740,35 @@ struct volume_options options[] = {
},
{ .key = {"rpc-auth.ports.*.insecure"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Allow client connections from unprivileged ports. By "
"default only privileged ports are allowed. Use this"
" option to enable or disable insecure ports for "
- "a specific subvolume and to override the global setting "
- " set by the previous option."
+ "a specific subvolume and to override the global "
+ "setting set by the previous option."
},
{ .key = {"rpc-auth.addr.namelookup"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Users have the option of turning on name lookup for"
" incoming client connections using this option. Use this "
"option to turn on name lookups during address-based "
"authentication. Turning this on will enable you to"
- " use hostnames in rpc-auth.addr.* filters. In some "
+ " use hostnames in nfs.rpc-auth-* filters. In some "
"setups, the name server can take too long to reply to DNS "
- "queries resulting in timeouts of mount requests. By default, "
- " name lookup is off"
+ "queries resulting in timeouts of mount requests. By "
+ "default, name lookup is off"
},
{ .key = {"nfs.dynamic-volumes"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Internal option set to tell gnfs to use a different"
" scheme for encoding file handles when DVM is being"
" used."
},
{ .key = {"nfs3.*.volume-id"},
.type = GF_OPTION_TYPE_STR,
+ .default_value = "",
.description = "When nfs.dynamic-volumes is set, gnfs expects every "
"subvolume to have this option set for it, so that "
"gnfs can use this option to identify the volume. "
@@ -1198,22 +1777,34 @@ struct volume_options options[] = {
},
{ .key = {"nfs.enable-ino32"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
.description = "For nfs clients or apps that do not support 64-bit "
"inode numbers, use this option to make NFS return "
- "32-bit inode numbers instead. Disabled by default, so "
- "NFS returns 64-bit inode numbers."
+ "32-bit inode numbers instead. Disabled by default, so"
+ " NFS returns 64-bit inode numbers."
},
{ .key = {"rpc.register-with-portmap"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "For systems that need to run multiple nfs servers, "
"only one registration is possible with "
"portmap service. Use this option to turn off portmap "
"registration for Gluster NFS. On by default"
},
+ { .key = {"rpc.outstanding-rpc-limit"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = RPCSVC_MIN_OUTSTANDING_RPC_LIMIT,
+ .max = RPCSVC_MAX_OUTSTANDING_RPC_LIMIT,
+ .default_value = TOSTRING(RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT),
+ .description = "Parameter to throttle the number of incoming RPC "
+ "requests from a client. 0 means no limit (can "
+ "potentially run out of memory)"
+ },
{ .key = {"nfs.port"},
.type = GF_OPTION_TYPE_INT,
.min = 1,
.max = 0xffff,
+ .default_value = TOSTRING(GF_NFS3_PORT),
.description = "Use this option on systems that need Gluster NFS to "
"be associated with a non-default port number."
},
@@ -1221,6 +1812,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = 1,
.max = 1024,
+ .default_value = TOSTRING(GF_NFS_DEFAULT_MEMFACTOR),
.description = "Use this option to make NFS be faster on systems by "
"using more memory. This option specifies a multiple "
"that determines the total amount of memory used. "
@@ -1231,12 +1823,14 @@ struct volume_options options[] = {
},
{ .key = {"nfs.*.disable"},
.type = GF_OPTION_TYPE_BOOL,
- .description = "This option is used to start or stop NFS server"
- "for individual volume."
+ .default_value = "false",
+ .description = "This option is used to start or stop the NFS server "
+ "for individual volumes."
},
{ .key = {"nfs.nlm"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "This option, if set to 'off', disables NLM server "
"by not registering the service with the portmapper."
" Set it to 'on' to re-enable it. Default value: 'on'"
@@ -1244,11 +1838,33 @@ struct volume_options options[] = {
{ .key = {"nfs.mount-udp"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "set the option to 'on' to enable mountd on UDP. "
"Required for some Solaris and AIX NFS clients. "
"The need for enabling this option often depends "
"on the usage of NLM."
},
+ { .key = {"nfs.mount-rmtab"},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = DATADIR "/rmtab",
+ .description = "Set the location of the cache file that is used to "
+ "list all the NFS-clients that have connected "
+ "through the MOUNT protocol. If this is on shared "
+ "storage, all GlusterFS servers will update and "
+ "output (with 'showmount') the same list."
+ },
+ { .key = {OPT_SERVER_RPC_STATD},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = GF_RPC_STATD_PROG,
+ .description = "The executable of RPC statd utility. "
+ "Defaults to " GF_RPC_STATD_PROG
+ },
+ { .key = {OPT_SERVER_RPC_STATD_PIDFILE},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = GF_RPC_STATD_PIDFILE,
+ .description = "The pid file of RPC statd utility. "
+ "Defaults to " GF_RPC_STATD_PIDFILE
+ },
{ .key = {OPT_SERVER_AUX_GIDS},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
@@ -1267,7 +1883,23 @@ struct volume_options options[] = {
.description = "Number of seconds to cache auxiliary-GID data, when "
OPT_SERVER_AUX_GIDS " is set."
},
-
+ { .key = {"nfs.acl"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option is used to control ACL support for NFS."
+ },
+ { .key = {"nfs.drc"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "on",
+ .description = "Enable Duplicate Request Cache in gNFS server to "
+ "improve correctness of non-idempotent operations like "
+ "write, delete, link, et al"
+ },
+ { .key = {"nfs.drc-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0x20000",
+ .description = "Sets the number of non-idempotent "
+ "requests to cache in drc"
+ },
{ .key = {NULL} },
};
-
diff --git a/xlators/nfs/server/src/nfs.h b/xlators/nfs/server/src/nfs.h
index 7d5163dfe..fc745fbbd 100644
--- a/xlators/nfs/server/src/nfs.h
+++ b/xlators/nfs/server/src/nfs.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __NFS_H__
@@ -45,7 +36,7 @@
#define GF_NFS_MAX_MEMFACTOR 30
#define GF_NFS_DVM_ON 1
-#define GF_NFS_DVM_OFF 2
+#define GF_NFS_DVM_OFF 0
/* This corresponds to the max 16 number of group IDs that are sent through an
* RPC request. Since NFS is the only one going to set this, we can be safe
@@ -86,12 +77,17 @@ struct nfs_state {
unsigned int override_portnum;
int allow_insecure;
int enable_nlm;
+ int enable_acl;
int mount_udp;
+ char *rmtab;
struct rpc_clnt *rpc_clnt;
gf_boolean_t server_aux_gids;
uint32_t server_aux_gids_max_age;
gid_cache_t gid_cache;
uint32_t generation;
+ gf_boolean_t register_portmap;
+ char *rpc_statd;
+ char *rpc_statd_pid_file;
};
struct nfs_inode_ctx {
diff --git a/xlators/nfs/server/src/nfs3-fh.c b/xlators/nfs/server/src/nfs3-fh.c
index c4f59a622..e199c56dc 100644
--- a/xlators/nfs/server/src/nfs3-fh.c
+++ b/xlators/nfs/server/src/nfs3-fh.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -120,17 +111,17 @@ nfs3_fh_is_root_fh (struct nfs3_fh *fh)
void
-nfs3_fh_to_str (struct nfs3_fh *fh, char *str)
+nfs3_fh_to_str (struct nfs3_fh *fh, char *str, size_t len)
{
- char gfid[512];
- char exportid[512];
+ char gfid[GF_UUID_BUF_SIZE];
+ char exportid[GF_UUID_BUF_SIZE];
if ((!fh) || (!str))
return;
- sprintf (str, "FH: exportid %s, gfid %s",
- uuid_utoa_r (fh->exportid, exportid),
- uuid_utoa_r (fh->gfid, gfid));
+ snprintf (str, len, "FH: exportid %s, gfid %s",
+ uuid_utoa_r (fh->exportid, exportid),
+ uuid_utoa_r (fh->gfid, gfid));
}
void
diff --git a/xlators/nfs/server/src/nfs3-fh.h b/xlators/nfs/server/src/nfs3-fh.h
index 23957d977..1049cdc96 100644
--- a/xlators/nfs/server/src/nfs3-fh.h
+++ b/xlators/nfs/server/src/nfs3-fh.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_FH_H_
@@ -65,6 +56,11 @@ struct nfs3_fh {
/* File/dir gfid. */
uuid_t gfid;
+ /* This structure must be exactly NFS3_FHSIZE (64) bytes long.
+ Having the structure shorter results in buffer overflows
+ during XDR decoding.
+ */
+ unsigned char padding[NFS3_FHSIZE - GF_NFSFH_STATIC_SIZE];
} __attribute__((__packed__));
#define GF_NFS3FH_STATIC_INITIALIZER {{0},}
@@ -92,7 +88,7 @@ extern void
nfs3_log_fh (struct nfs3_fh *fh);
extern void
-nfs3_fh_to_str (struct nfs3_fh *fh, char *str);
+nfs3_fh_to_str (struct nfs3_fh *fh, char *str, size_t len);
extern int
nfs3_fh_build_parent_fh (struct nfs3_fh *child, struct iatt *newstat,
diff --git a/xlators/nfs/server/src/nfs3-helpers.c b/xlators/nfs/server/src/nfs3-helpers.c
index f7b1bb0cd..f67cccf1a 100644
--- a/xlators/nfs/server/src/nfs3-helpers.c
+++ b/xlators/nfs/server/src/nfs3-helpers.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -94,7 +85,7 @@ struct nfs3stat_strerror nfs3stat_strerror_table[] = {
{ NFS3ERR_SERVERFAULT, "Error occurred on the server or IO Error" },
{ NFS3ERR_BADTYPE, "Type not supported by the server" },
{ NFS3ERR_JUKEBOX, "Cannot complete server initiated request" },
- { -1, "IO Error" },
+ { NFS3ERR_END_OF_LIST, "IO Error" },
};
@@ -265,6 +256,20 @@ nfs3_errno_to_nfsstat3 (int errnum)
return stat;
}
+/*
+ * Special case: when op_ret is -1, op_errno is normally non-zero.
+ * An op_errno of 0 means something went wrong in the upper layer(s);
+ * if that ever happens, set the NFS3 status to NFS3ERR_SERVERFAULT.
+ */
+inline nfsstat3
+nfs3_cbk_errno_status (int32_t op_ret, int32_t op_errno)
+{
+ if ((op_ret == -1) && (op_errno == 0)) {
+ return NFS3ERR_SERVERFAULT;
+ }
+
+ return nfs3_errno_to_nfsstat3 (op_errno);
+}
void
nfs3_fill_lookup3res_error (lookup3res *res, nfsstat3 stat,
@@ -284,6 +289,9 @@ nfs3_stat_to_fattr3 (struct iatt *buf)
{
fattr3 fa = {0, };
+ if (buf == NULL)
+ goto out;
+
if (IA_ISDIR (buf->ia_type))
fa.type = NF3DIR;
else if (IA_ISREG (buf->ia_type))
@@ -353,6 +361,7 @@ nfs3_stat_to_fattr3 (struct iatt *buf)
fa.mtime.seconds = buf->ia_mtime;
fa.mtime.nseconds = buf->ia_mtime_nsec;
+out:
return fa;
}
@@ -496,7 +505,7 @@ nfs3_fill_fsinfo3res (struct nfs3_state *nfs3, fsinfo3res *res,
resok.wtpref = nfs3->writesize;
resok.wtmult = GF_NFS3_WTMULT;
resok.dtpref = nfs3->readdirsize;
- resok.maxfilesize = GF_NFS3_MAXFILE;
+ resok.maxfilesize = GF_NFS3_MAXFILESIZE;
resok.time_delta = tdelta;
resok.properties = GF_NFS3_FS_PROP;
@@ -534,7 +543,7 @@ char *
nfsstat3_strerror(int stat)
{
int i;
- for(i = 0; nfs3stat_strerror_table[i].stat != -1; i++) {
+ for(i = 0; nfs3stat_strerror_table[i].stat != NFS3ERR_END_OF_LIST ; i++) {
if (nfs3stat_strerror_table[i].stat == stat)
return nfs3stat_strerror_table[i].strerror;
}
@@ -1607,13 +1616,14 @@ err:
void
nfs3_stat_to_errstr (uint32_t xid, char *op, nfsstat3 stat, int pstat,
- char *errstr)
+ char *errstr, size_t len)
{
if ((!op) || (!errstr))
return;
- sprintf (errstr, "XID: %x, %s: NFS: %d(%s), POSIX: %d(%s)", xid, op,
- stat, nfsstat3_strerror (stat), pstat, strerror (pstat));
+ snprintf (errstr, len, "XID: %x, %s: NFS: %d(%s), POSIX: %d(%s)",
+ xid, op,stat, nfsstat3_strerror (stat), pstat,
+ strerror (pstat));
}
void
@@ -1621,10 +1631,10 @@ nfs3_log_common_call (uint32_t xid, char *op, struct nfs3_fh *fh)
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
- nfs3_fh_to_str (fh, fhstr);
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s", xid, op,
fhstr);
}
@@ -1636,9 +1646,9 @@ nfs3_log_fh_entry_call (uint32_t xid, char *op, struct nfs3_fh *fh,
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s, name: %s", xid,
op, fhstr, name);
}
@@ -1651,10 +1661,10 @@ nfs3_log_rename_call (uint32_t xid, struct nfs3_fh *src, char *sname,
char sfhstr[1024];
char dfhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (src, sfhstr);
- nfs3_fh_to_str (dst, dfhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (src, sfhstr, sizeof (sfhstr));
+ nfs3_fh_to_str (dst, dfhstr, sizeof (dfhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, RENAME: args: Src: %s, "
"name: %s, Dst: %s, name: %s", xid, sfhstr, sname, dfhstr,
dname);
@@ -1672,9 +1682,9 @@ nfs3_log_create_call (uint32_t xid, struct nfs3_fh *fh, char *name,
char unchkd[] = "UNCHECKED";
char guarded[] = "GUARDED";
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (mode == EXCLUSIVE)
modestr = exclmode;
else if (mode == GUARDED)
@@ -1697,9 +1707,9 @@ nfs3_log_mknod_call (uint32_t xid, struct nfs3_fh *fh, char *name, int type)
char sock[] = "SOCK";
char fifo[] = "FIFO";
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (type == NF3CHR)
modestr = chr;
else if (type == NF3BLK)
@@ -1720,9 +1730,9 @@ nfs3_log_symlink_call (uint32_t xid, struct nfs3_fh *fh, char *name, char *tgt)
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, SYMLINK: args: %s, name: %s,"
" target: %s", xid, fhstr, name, tgt);
}
@@ -1735,10 +1745,10 @@ nfs3_log_link_call (uint32_t xid, struct nfs3_fh *fh, char *name,
char dfhstr[1024];
char tfhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, dfhstr);
- nfs3_fh_to_str (tgt, tfhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, dfhstr, sizeof (dfhstr));
+ nfs3_fh_to_str (tgt, tfhstr, sizeof (tfhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, LINK: args: %s, name: %s,"
" target: %s", xid, dfhstr, name, tfhstr);
}
@@ -1750,9 +1760,9 @@ nfs3_log_rw_call (uint32_t xid, char *op, struct nfs3_fh *fh, offset3 offt,
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (stablewrite == -1)
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s, offset:"
" %"PRIu64", count: %"PRIu32, xid, op, fhstr, offt,
@@ -3383,11 +3393,11 @@ void
nfs3_log_common_res (uint32_t xid, int op, nfsstat3 stat, int pstat)
{
char errstr[1024];
- int ll = nfs3_loglevel (op, stat);
+ int ll = nfs3_loglevel (op, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s", errstr);
}
@@ -3395,14 +3405,14 @@ void
nfs3_log_readlink_res (uint32_t xid, nfsstat3 stat, int pstat, char *linkpath)
{
char errstr[1024];
- int ll = nfs3_loglevel (NFS3_READLINK, stat);
+ int ll = nfs3_loglevel (NFS3_READLINK, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
+ if (THIS->ctx->log.loglevel < ll)
+ return;
- nfs3_stat_to_errstr (xid, "READLINK", stat, pstat, errstr);
+ nfs3_stat_to_errstr (xid, "READLINK", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, target: %s",
- errstr, linkpath);
+ errstr, linkpath);
}
@@ -3411,12 +3421,12 @@ nfs3_log_read_res (uint32_t xid, nfsstat3 stat, int pstat, count3 count,
int is_eof, struct iovec *vec, int32_t veccount)
{
char errstr[1024];
- int ll = GF_LOG_DEBUG;
+ int ll = GF_LOG_DEBUG;
ll = nfs3_loglevel (NFS3_READ, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "READ", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "READ", stat, pstat, errstr, sizeof (errstr));
if (vec)
gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", is_eof:"
" %d, vector: count: %d, len: %zd", errstr, count,
@@ -3432,12 +3442,12 @@ nfs3_log_write_res (uint32_t xid, nfsstat3 stat, int pstat, count3 count,
int stable, uint64_t wverf)
{
char errstr[1024];
- int ll = nfs3_loglevel (NFS3_WRITE, stat);
+ int ll = nfs3_loglevel (NFS3_WRITE, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
+ if (THIS->ctx->log.loglevel < ll)
+ return;
- nfs3_stat_to_errstr (xid, "WRITE", stat, pstat, errstr);
+ nfs3_stat_to_errstr (xid, "WRITE", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", %s,wverf: %"PRIu64
, errstr, count, (stable == UNSTABLE)?"UNSTABLE":"STABLE",
wverf);
@@ -3450,12 +3460,12 @@ nfs3_log_newfh_res (uint32_t xid, int op, nfsstat3 stat, int pstat,
{
char errstr[1024];
char fhstr[1024];
- int ll = nfs3_loglevel (op, stat);
+ int ll = nfs3_loglevel (op, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr);
- nfs3_fh_to_str (newfh, fhstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr, sizeof (errstr));
+ nfs3_fh_to_str (newfh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, nfs3_loglevel (op, stat), "%s, %s", errstr, fhstr);
}
@@ -3466,11 +3476,11 @@ nfs3_log_readdir_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
count3 count, int is_eof)
{
char errstr[1024];
- int ll = nfs3_loglevel (NFS3_READDIR, stat);
+ int ll = nfs3_loglevel (NFS3_READDIR, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "READDIR", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "READDIR", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", cverf: %"PRIu64
", is_eof: %d", errstr, count, cverf, is_eof);
}
@@ -3483,9 +3493,9 @@ nfs3_log_readdirp_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
char errstr[1024];
int ll = nfs3_loglevel (NFS3_READDIRP, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "READDIRPLUS", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "READDIRPLUS", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, dircount: %"PRIu32", maxcount: %"
PRIu32", cverf: %"PRIu64", is_eof: %d", errstr, dircount,
maxcount, cverf, is_eof);
@@ -3498,9 +3508,9 @@ nfs3_log_commit_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t wverf)
char errstr[1024];
int ll = nfs3_loglevel (NFS3_COMMIT, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "COMMIT", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "COMMIT", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, wverf: %"PRIu64, errstr, wverf);
}
@@ -3511,10 +3521,10 @@ nfs3_log_readdir_call (uint32_t xid, struct nfs3_fh *fh, count3 dircount,
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
- nfs3_fh_to_str (fh, fhstr);
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (maxcount == 0)
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, READDIR: args: %s,"
diff --git a/xlators/nfs/server/src/nfs3-helpers.h b/xlators/nfs/server/src/nfs3-helpers.h
index cc96051e1..eada24221 100644
--- a/xlators/nfs/server/src/nfs3-helpers.h
+++ b/xlators/nfs/server/src/nfs3-helpers.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS3_HELPER_H_
@@ -44,6 +35,9 @@ nfs3_extract_lookup_name (lookup3args *args);
extern nfsstat3
nfs3_errno_to_nfsstat3 (int errnum);
+extern nfsstat3
+nfs3_cbk_errno_status (int32_t, int32_t);
+
extern void
nfs3_fill_lookup3res (lookup3res *res, nfsstat3 stat, struct nfs3_fh *newfh,
struct iatt *stbuf, struct iatt *postparent,
@@ -340,4 +334,7 @@ nfs3_is_parentdir_entry (char *entry);
uint32_t
nfs3_request_to_accessbits (int32_t accbits);
+void
+nfs3_map_deviceid_to_statdev (struct iatt *ia, uint64_t deviceid);
+
#endif
diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c
index 9b756712b..6361f9e20 100644
--- a/xlators/nfs/server/src/nfs3.c
+++ b/xlators/nfs/server/src/nfs3.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -226,32 +217,32 @@ out:
uuid_unparse (handle->exportid, exportid); \
uuid_unparse (handle->gfid, gfid); \
trans = rpcsvc_request_transport (req); \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to map " \
- "FH to vol: client=%s, exportid=%s, gfid=%s",\
- trans->peerinfo.identifier, exportid, \
- gfid); \
- gf_log (GF_NFS3, GF_LOG_ERROR, \
- "Stale nfs client %s must be trying to "\
- "connect to a deleted volume, please " \
- "unmount it.", trans->peerinfo.identifier);\
+ GF_LOG_OCCASIONALLY (nfs3state->occ_logger, \
+ GF_NFS3, GF_LOG_ERROR, "Failed to map " \
+ "FH to vol: client=%s, exportid=%s, " \
+ "gfid=%s", trans->peerinfo.identifier, \
+ exportid, gfid); \
+ GF_LOG_OCCASIONALLY (nfs3state->occ_logger, \
+ GF_NFS3, GF_LOG_ERROR, "Stale nfs " \
+ "client %s must be trying to connect to"\
+ " a deleted volume, please unmount it.",\
+ trans->peerinfo.identifier); \
status = NFS3ERR_STALE; \
goto label; \
} else { \
- gf_log (GF_NFS3, GF_LOG_TRACE, "FH to Volume: %s"\
- ,volume->name); \
- rpcsvc_request_set_private (req, volume); \
+ gf_log (GF_NFS3, GF_LOG_TRACE, "FH to Volume:" \
+ "%s", volume->name); \
+ rpcsvc_request_set_private (req, volume); \
} \
} while (0); \
#define nfs3_validate_gluster_fh(handle, status, errlabel) \
do { \
- if ((handle)) { \
- if (!nfs3_fh_validate (handle)) { \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Bad Handle");\
- status = NFS3ERR_BADHANDLE; \
- goto errlabel; \
- } \
+ if (!nfs3_fh_validate (handle)) { \
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Bad Handle"); \
+ status = NFS3ERR_BADHANDLE; \
+ goto errlabel; \
} \
} while (0) \
@@ -265,10 +256,12 @@ out:
xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
&cst->resolvefh); \
uuid_unparse (cst->resolvefh.gfid, gfid); \
- sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\
- xlatorp ? xlatorp->name : "ERR", gfid); \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Unable to resolve FH"\
- ": %s", buf); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid ); \
+ gf_log (GF_NFS3, GF_LOG_ERROR, "%s: %s", \
+ strerror(cst->resolve_errno), buf); \
nfstat = nfs3_errno_to_nfsstat3 (cst->resolve_errno);\
goto erlabl; \
} \
@@ -285,10 +278,12 @@ out:
xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
&cst->resolvefh); \
uuid_unparse (cst->resolvefh.gfid, gfid); \
- sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\
- xlatorp ? xlatorp->name : "ERR", gfid); \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Unable to resolve FH"\
- ": %s", buf); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid); \
+ gf_log (GF_NFS3, GF_LOG_ERROR, "%s: %s", \
+ strerror(cst->resolve_errno), buf); \
nfstat = nfs3_errno_to_nfsstat3 (cs->resolve_errno);\
goto erlabl; \
} \
@@ -554,23 +549,18 @@ nfs3svc_submit_reply (rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc)
iobref = iobref_new ();
if (!iobref) {
- iobuf_unref (iob);
gf_log (GF_NFS3, GF_LOG_ERROR, "failed on iobref_new()");
goto ret;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
-
- /* Now that we've done our job of handing the message to the RPC layer
- * we can safely unref the iob in the hope that RPC layer must have
- * ref'ed the iob on receiving into the txlist.
- */
- iobuf_unref (iob);
- iobref_unref (iobref);
-
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Reply submission failed");
goto ret;
@@ -578,6 +568,14 @@ nfs3svc_submit_reply (rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc)
ret = 0;
ret:
+ /* Now that we've done our job of handing the message to the RPC layer
+ * we can safely unref the iob in the hope that RPC layer must have
+ * ref'ed the iob on receiving into the txlist.
+ */
+ if (NULL != iob)
+ iobuf_unref (iob);
+ if (NULL != iobref)
+ iobref_unref (iobref);
return ret;
}
@@ -609,19 +607,14 @@ nfs3svc_submit_vector_reply (rpcsvc_request_t *req, void *arg,
new_iobref = 1;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, payload, vcount, iobref);
-
- /* Now that we've done our job of handing the message to the RPC layer
- * we can safely unref the iob in the hope that RPC layer must have
- * ref'ed the iob on receiving into the txlist.
- */
- iobuf_unref (iob);
- if (new_iobref)
- iobref_unref (iobref);
-
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Reply submission failed");
goto ret;
@@ -629,6 +622,14 @@ nfs3svc_submit_vector_reply (rpcsvc_request_t *req, void *arg,
ret = 0;
ret:
+ /* Now that we've done our job of handing the message to the RPC layer
+ * we can safely unref the iob in the hope that RPC layer must have
+ * ref'ed the iob on receiving into the txlist.
+ */
+ if (NULL != iob)
+ iobuf_unref (iob);
+ if (new_iobref)
+ iobref_unref (iobref);
return ret;
}
@@ -703,7 +704,7 @@ nfs3svc_getattr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
}
else {
nfs_fix_generation(this,inode);
@@ -733,7 +734,7 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
}
nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_GETATTR,
@@ -917,7 +918,7 @@ nfs3svc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -957,7 +958,7 @@ nfs3svc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -1013,7 +1014,7 @@ nfs3svc_setattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -1227,7 +1228,7 @@ nfs3svc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
(op_errno == ENOENT ? GF_LOG_TRACE : GF_LOG_WARNING),
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
goto xmit_res;
}
@@ -1271,7 +1272,7 @@ nfs3svc_lookup_parentdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
goto xmit_res;
}
@@ -1313,7 +1314,11 @@ nfs3_lookup_parentdir_resume (void *carg)
nfs3_call_state_t *cs = NULL;
inode_t *parent = NULL;
- GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err);
+ if (!carg) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument,"
+ " carg value NULL");
+ return EINVAL;
+ }
cs = (nfs3_call_state_t *)carg;
nfs3_check_fh_resolve_status (cs, stat, nfs3err);
@@ -1384,7 +1389,11 @@ nfs3_lookup_resume (void *carg)
nfs3_call_state_t *cs = NULL;
struct nfs3_fh newfh = {{0},};
- GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err);
+ if (!carg) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument,"
+ " carg value NULL");
+ return EINVAL;
+ }
cs = (nfs3_call_state_t *)carg;
nfs3_check_fh_resolve_status (cs, stat, nfs3err);
@@ -1521,7 +1530,7 @@ nfs3svc_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
}
nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_ACCESS, status,
op_errno);
@@ -1539,7 +1548,11 @@ nfs3_access_resume (void *carg)
nfs_user_t nfu = {0, };
nfs3_call_state_t *cs = NULL;
- GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err);
+ if (!carg) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument,"
+ " carg value NULL");
+ return EINVAL;
+ }
cs = (nfs3_call_state_t *)carg;
nfs3_check_fh_resolve_status (cs, stat, nfs3err);
@@ -1657,7 +1670,7 @@ nfs3svc_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -1824,7 +1837,7 @@ nfs3svc_read_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
} else
stat = NFS3_OK;
@@ -2010,7 +2023,7 @@ nfs3svc_write_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -2070,7 +2083,7 @@ nfs3svc_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
@@ -2322,7 +2335,7 @@ nfs3svc_create_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2355,7 +2368,7 @@ nfs3svc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2460,7 +2473,7 @@ nfs3svc_create_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
ret = -op_errno;
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2631,13 +2644,7 @@ nfs3svc_create (rpcsvc_request_t *req)
}
cval = (uint64_t *)args.how.createhow3_u.verf;
- if (cval)
- cverf = *cval;
- else {
- gf_log(GF_NFS3, GF_LOG_ERROR,
- "Error getting createverf3 from args");
- goto rpcerr;
- }
+ cverf = *cval;
ret = nfs3_create (req, &dirfh, name, args.how.mode,
&args.how.createhow3_u.obj_attributes, cverf);
@@ -2681,7 +2688,7 @@ nfs3svc_mkdir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2713,7 +2720,7 @@ nfs3svc_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2891,7 +2898,7 @@ nfs3svc_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3053,7 +3060,7 @@ nfs3svc_mknod_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3085,7 +3092,7 @@ nfs3svc_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3343,7 +3350,7 @@ nfs3svc_remove_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
}
if (op_ret == 0)
@@ -3509,7 +3516,7 @@ nfs3svc_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else {
stat = NFS3_OK;
}
@@ -3664,7 +3671,7 @@ nfs3svc_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: rename %s -> %s => -1 (%s)",
rpcsvc_request_xid (cs->req), cs->oploc.path,
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3869,7 +3876,7 @@ nfs3svc_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: link %s <- %s => -1 (%s)",
rpcsvc_request_xid (cs->req), cs->oploc.path,
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -4079,7 +4086,7 @@ nfs3svc_readdir_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -4134,7 +4141,7 @@ nfs3svc_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
@@ -4382,16 +4389,8 @@ nfs3svc_readdir (rpcsvc_request_t *req)
rpcsvc_request_seterr (req, GARBAGE_ARGS);
goto rpcerr;
}
-
cval = (uint64_t *) ra.cookieverf;
-
- if (cval)
- verf = *cval;
- else {
- gf_log(GF_NFS3, GF_LOG_ERROR,
- "Error getting cookieverf from readdir args");
- goto rpcerr;
- }
+ verf = *cval;
ret = nfs3_readdir (req, &fh, ra.cookie, verf, ra.count, 0);
if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
@@ -4422,16 +4421,8 @@ nfs3svc_readdirp (rpcsvc_request_t *req)
rpcsvc_request_seterr (req, GARBAGE_ARGS);
goto rpcerr;
}
-
cval = (uint64_t *) ra.cookieverf;
-
- if (cval)
- cverf = *cval;
- else {
- gf_log (GF_NFS3, GF_LOG_ERROR,
- "Error getting cookieverf from readdirp args");
- goto rpcerr;
- }
+ cverf = *cval;
ret = nfs3_readdir (req, &fh, ra.cookie, cverf, ra.dircount,
ra.maxcount);
@@ -4474,7 +4465,7 @@ nfs3_fsstat_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -4502,7 +4493,7 @@ nfs3_fsstat_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
ret = -op_errno;
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
@@ -4661,7 +4652,7 @@ nfs3svc_fsinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
}else
status = NFS3_OK;
@@ -4803,7 +4794,7 @@ nfs3svc_pathconf_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else {
/* If stat fop failed, we can still send the other components
* in a pathconf reply.
@@ -4947,7 +4938,7 @@ nfs3svc_commit_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -5108,28 +5099,28 @@ rpcerr:
rpcsvc_actor_t nfs3svc_actors[NFS3_PROC_COUNT] = {
- {"NULL", NFS3_NULL, nfs3svc_null, NULL, 0},
- {"GETATTR", NFS3_GETATTR, nfs3svc_getattr,NULL, 0},
- {"SETATTR", NFS3_SETATTR, nfs3svc_setattr,NULL, 0},
- {"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0},
- {"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0},
- {"READLINK", NFS3_READLINK, nfs3svc_readlink,NULL, 0},
- {"READ", NFS3_READ, nfs3svc_read, NULL, 0},
- {"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0},
- {"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0},
- {"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0},
- {"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink,NULL, 0},
- {"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0},
- {"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0},
- {"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0},
- {"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0},
- {"LINK", NFS3_LINK, nfs3svc_link, NULL, 0},
- {"READDIR", NFS3_READDIR, nfs3svc_readdir,NULL, 0},
- {"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp,NULL, 0},
- {"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0},
- {"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0},
- {"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf,NULL, 0},
- {"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0}
+ {"NULL", NFS3_NULL, nfs3svc_null, NULL, 0, DRC_IDEMPOTENT},
+ {"GETATTR", NFS3_GETATTR, nfs3svc_getattr, NULL, 0, DRC_IDEMPOTENT},
+ {"SETATTR", NFS3_SETATTR, nfs3svc_setattr, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0, DRC_IDEMPOTENT},
+ {"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0, DRC_IDEMPOTENT},
+ {"READLINK", NFS3_READLINK, nfs3svc_readlink, NULL, 0, DRC_IDEMPOTENT},
+ {"READ", NFS3_READ, nfs3svc_read, NULL, 0, DRC_IDEMPOTENT},
+ {"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0, DRC_NON_IDEMPOTENT},
+ {"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"LINK", NFS3_LINK, nfs3svc_link, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"READDIR", NFS3_READDIR, nfs3svc_readdir, NULL, 0, DRC_IDEMPOTENT},
+ {"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp, NULL, 0, DRC_IDEMPOTENT},
+ {"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0, DRC_IDEMPOTENT},
+ {"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0, DRC_IDEMPOTENT},
+ {"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf, NULL, 0, DRC_IDEMPOTENT},
+ {"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0, DRC_IDEMPOTENT}
};
@@ -5146,21 +5137,48 @@ rpcsvc_program_t nfs3prog = {
.min_auth = AUTH_NULL,
};
+/*
+ * This function rounds up the input value to multiple of 4096. Min and Max
+ * supported I/O size limits are 4KB (GF_NFS3_FILE_IO_SIZE_MIN) and
+ * 1MB (GF_NFS3_FILE_IO_SIZE_MAX).
+ */
+void
+nfs3_iosize_roundup_4KB (uint64_t *ioszptr)
+{
+ uint64_t iosize;
+ uint64_t iopages;
+
+ if (!ioszptr)
+ return;
+
+ iosize = *ioszptr;
+ iopages = (iosize + GF_NFS3_IO_SIZE -1) >> GF_NFS3_IO_SHIFT;
+ iosize = (iopages * GF_NFS3_IO_SIZE);
+
+ /* Double check - boundary conditions */
+ if (iosize < GF_NFS3_FILE_IO_SIZE_MIN) {
+ iosize = GF_NFS3_FILE_IO_SIZE_MIN;
+ } else if (iosize > GF_NFS3_FILE_IO_SIZE_MAX) {
+ iosize = GF_NFS3_FILE_IO_SIZE_MAX;
+ }
+
+ *ioszptr = iosize;
+}
int
-nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
+nfs3_init_options (struct nfs3_state *nfs3, dict_t *options)
{
int ret = -1;
char *optstr = NULL;
uint64_t size64 = 0;
- if ((!nfs3) || (!nfsx))
+ if ((!nfs3) || (!options))
return -1;
/* nfs3.read-size */
nfs3->readsize = GF_NFS3_RTPREF;
- if (dict_get (nfsx->options, "nfs3.read-size")) {
- ret = dict_get_str (nfsx->options, "nfs3.read-size", &optstr);
+ if (dict_get (options, "nfs3.read-size")) {
+ ret = dict_get_str (options, "nfs3.read-size", &optstr);
if (ret < 0) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read "
" option: nfs3.read-size");
@@ -5168,20 +5186,22 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
goto err;
}
- ret = gf_string2bytesize (optstr, &size64);
- nfs3->readsize = size64;
+ ret = gf_string2uint64 (optstr, &size64);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format"
" option: nfs3.read-size");
ret = -1;
goto err;
}
+
+ nfs3_iosize_roundup_4KB (&size64);
+ nfs3->readsize = size64;
}
/* nfs3.write-size */
nfs3->writesize = GF_NFS3_WTPREF;
- if (dict_get (nfsx->options, "nfs3.write-size")) {
- ret = dict_get_str (nfsx->options, "nfs3.write-size", &optstr);
+ if (dict_get (options, "nfs3.write-size")) {
+ ret = dict_get_str (options, "nfs3.write-size", &optstr);
if (ret < 0) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read "
" option: nfs3.write-size");
@@ -5189,20 +5209,22 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
goto err;
}
- ret = gf_string2bytesize (optstr, &size64);
- nfs3->writesize = size64;
+ ret = gf_string2uint64 (optstr, &size64);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format"
" option: nfs3.write-size");
ret = -1;
goto err;
}
+
+ nfs3_iosize_roundup_4KB (&size64);
+ nfs3->writesize = size64;
}
/* nfs3.readdir.size */
nfs3->readdirsize = GF_NFS3_DTPREF;
- if (dict_get (nfsx->options, "nfs3.readdir-size")) {
- ret = dict_get_str (nfsx->options,"nfs3.readdir-size", &optstr);
+ if (dict_get (options, "nfs3.readdir-size")) {
+ ret = dict_get_str (options,"nfs3.readdir-size", &optstr);
if (ret < 0) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read"
" option: nfs3.readdir-size");
@@ -5210,16 +5232,17 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
goto err;
}
- ret = gf_string2bytesize (optstr, &size64);
- nfs3->readdirsize = size64;
+ ret = gf_string2uint64 (optstr, &size64);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format"
" option: nfs3.readdir-size");
ret = -1;
goto err;
}
- }
+ nfs3_iosize_roundup_4KB (&size64);
+ nfs3->readdirsize = size64;
+ }
/* We want to use the size of the biggest param for the io buffer size.
*/
@@ -5233,16 +5256,15 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
* accommodate the NFS headers also in the same buffer. */
nfs3->iobsize = nfs3->iobsize * 2;
- /* mem-factor */
- nfs3->memfactor = GF_NFS3_DEFAULT_MEMFACTOR;
ret = 0;
err:
return ret;
}
-
int
-nfs3_init_subvolume_options (struct nfs3_state *nfs3, struct nfs3_export *exp)
+nfs3_init_subvolume_options (xlator_t *nfsx,
+ struct nfs3_export *exp,
+ dict_t *options)
{
int ret = -1;
char *optstr = NULL;
@@ -5250,14 +5272,20 @@ nfs3_init_subvolume_options (struct nfs3_state *nfs3, struct nfs3_export *exp)
char *name = NULL;
gf_boolean_t boolt = _gf_false;
uuid_t volumeid = {0, };
- dict_t *options = NULL;
- if ((!exp) || (!nfs3))
+ if ((!nfsx) || (!exp))
return -1;
- options = nfs3->nfsx->options;
+ /* For init, fetch options from xlator but for
+ * reconfigure, take the parameter */
+ if (!options)
+ options = nfsx->options;
+
+ if (!options)
+ return (-1);
+
uuid_clear (volumeid);
- if (gf_nfs_dvm_off (nfs_state (nfs3->nfsx)))
+ if (gf_nfs_dvm_off (nfs_state (nfsx)))
goto no_dvm;
ret = snprintf (searchkey, 1024, "nfs3.%s.volume-id",exp->subvol->name);
@@ -5423,7 +5451,7 @@ nfs3_init_subvolume (struct nfs3_state *nfs3, xlator_t *subvol)
INIT_LIST_HEAD (&exp->explist);
gf_log (GF_NFS3, GF_LOG_TRACE, "Initing state: %s", exp->subvol->name);
- ret = nfs3_init_subvolume_options (nfs3, exp);
+ ret = nfs3_init_subvolume_options (nfs3->nfsx, exp, NULL);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to init subvol");
goto exp_free;
@@ -5477,7 +5505,7 @@ nfs3_init_state (xlator_t *nfsx)
unsigned int localpool = 0;
struct nfs_state *nfs = NULL;
- if (!nfsx)
+ if ((!nfsx) || (!nfsx->private))
return NULL;
nfs3 = (struct nfs3_state *)GF_CALLOC (1, sizeof (*nfs3),
@@ -5488,7 +5516,7 @@ nfs3_init_state (xlator_t *nfsx)
}
nfs = nfsx->private;
- ret = nfs3_init_options (nfs3, nfsx);
+ ret = nfs3_init_options (nfs3, nfsx->options);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to init options");
goto ret;
@@ -5496,7 +5524,7 @@ nfs3_init_state (xlator_t *nfsx)
nfs3->iobpool = nfsx->ctx->iobuf_pool;
- localpool = nfs3->memfactor * GF_NFS_CONCURRENT_OPS_MULT;
+ localpool = nfs->memfactor * GF_NFS_CONCURRENT_OPS_MULT;
gf_log (GF_NFS3, GF_LOG_TRACE, "local pool: %d", localpool);
nfs3->localpool = mem_pool_new (nfs3_call_state_t, localpool);
if (!nfs3->localpool) {
@@ -5520,7 +5548,7 @@ nfs3_init_state (xlator_t *nfsx)
LOCK_INIT (&nfs3->fdlrulock);
nfs3->fdcount = 0;
- rpcsvc_create_listeners (nfs->rpcsvc, nfsx->options, nfsx->name);
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, nfsx->options, nfsx->name);
if (ret == -1) {
gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners");
goto free_localpool;
@@ -5562,4 +5590,39 @@ nfs3svc_init (xlator_t *nfsx)
return &nfs3prog;
}
+/* Re-apply NFSv3 options to a server whose nfs3 state already exists.
+ *
+ * Looks up the nfs3 state through nfsx->private, re-runs
+ * nfs3_init_options() with the supplied dictionary and then calls
+ * nfs3_init_subvolume_options() for every export on nfs3->exports,
+ * stopping at the first failure.
+ *
+ * @nfsx:    NFS xlator; nfsx->private must point to its struct nfs_state.
+ * @options: option dictionary to apply; must not be NULL.
+ *
+ * Returns 0 on success, -1 when an argument or the nfs3 state is missing,
+ * otherwise the non-zero value returned by the helper that failed.
+ */
+int
+nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options)
+{
+ int ret = -1;
+ struct nfs3_export *exp = NULL;
+ struct nfs_state *nfs = NULL;
+ struct nfs3_state *nfs3 = NULL;
+
+ /* ret is still -1 here, so every early exit reports failure */
+ if ((!nfsx) || (!nfsx->private) || (!options))
+ goto out;
+
+ nfs = (struct nfs_state *)nfsx->private;
+ nfs3 = nfs->nfs3state;
+ if (!nfs3)
+ goto out;
+
+ /* server-wide NFSv3 options first ... */
+ ret = nfs3_init_options (nfs3, options);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR,
+ "Failed to reconfigure options");
+ goto out;
+ }
+
+ /* ... then the per-export options, passing the new dictionary
+ * explicitly rather than NULL */
+ list_for_each_entry (exp, &nfs3->exports, explist) {
+ ret = nfs3_init_subvolume_options (nfsx, exp, options);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR,
+ "Failed to reconfigure subvol options");
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
diff --git a/xlators/nfs/server/src/nfs3.h b/xlators/nfs/server/src/nfs3.h
index 111542bc6..e64ef9d15 100644
--- a/xlators/nfs/server/src/nfs3.h
+++ b/xlators/nfs/server/src/nfs3.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS3_H_
@@ -48,16 +39,39 @@
/* Static values used for FSINFO
-FIXME: This should be configurable */
-#define GF_NFS3_RTMAX (64 * GF_UNIT_KB)
-#define GF_NFS3_RTPREF (64 * GF_UNIT_KB)
-#define GF_NFS3_RTMULT (4 * GF_UNIT_KB)
-#define GF_NFS3_WTMAX (64 * GF_UNIT_KB)
-#define GF_NFS3_WTPREF (64 * GF_UNIT_KB)
-#define GF_NFS3_WTMULT (4 * GF_UNIT_KB)
-#define GF_NFS3_DTMIN (4 * GF_UNIT_KB)
-#define GF_NFS3_DTPREF (64 * GF_UNIT_KB)
-#define GF_NFS3_MAXFILE (1 * GF_UNIT_PB)
+ * To change the maximum rsize and wsize supported by the NFS client, adjust
+ * GF_NFS3_FILE_IO_SIZE_MAX. The Gluster NFS server defaults to 1MB(1048576)
+ * (same as kernel NFS server). For slower network, rsize/wsize can be trimmed
+ * to 16/32/64-KB. rsize and wsize can be tuned through nfs.read-size and
+ * nfs.write-size respectively.
+ *
+ * NB: For Kernel-NFS, NFS_MAX_FILE_IO_SIZE is 1048576U (1MB).
+ */
+#define GF_NFS3_FILE_IO_SIZE_MAX (1 * GF_UNIT_MB) /* 1048576 */
+#define GF_NFS3_FILE_IO_SIZE_MIN (4 * GF_UNIT_KB) /* 4096 */
+
+#define GF_NFS3_FILE_IO_SIZE_DEF GF_NFS3_FILE_IO_SIZE_MAX
+
+#define GF_NFS3_RTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_RTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_RTPREF GF_NFS3_FILE_IO_SIZE_DEF
+#define GF_NFS3_RTMULT GF_NFS3_FILE_IO_SIZE_MIN
+
+#define GF_NFS3_WTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_WTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_WTPREF GF_NFS3_FILE_IO_SIZE_DEF
+#define GF_NFS3_WTMULT GF_NFS3_FILE_IO_SIZE_MIN
+
+/* This can be tuned through nfs.readdir-size */
+#define GF_NFS3_DTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_DTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_DTPREF GF_NFS3_FILE_IO_SIZE_DEF
+
+#define GF_NFS3_MAXFILESIZE (1 * GF_UNIT_PB)
+
+#define GF_NFS3_IO_SIZE 4096 /* 4-KB */
+#define GF_NFS3_IO_SHIFT 12 /* 2^12 = 4KB */
+
/* FIXME: Handle time resolutions */
#define GF_NFS3_TIMEDELTA_SECS {1,0}
#define GF_NFS3_TIMEDELTA_NSECS {0,1}
@@ -120,20 +134,19 @@ typedef struct nfs3_state {
uint64_t serverstart;
/* NFSv3 Protocol configurables */
- size_t readsize;
- size_t writesize;
- size_t readdirsize;
+ uint64_t readsize;
+ uint64_t writesize;
+ uint64_t readdirsize;
/* Size of the iobufs used, depends on the sizes of the three params
* above.
*/
- size_t iobsize;
-
- unsigned int memfactor;
+ uint64_t iobsize;
struct list_head fdlru;
gf_lock_t fdlrulock;
int fdcount;
+ uint32_t occ_logger;
} nfs3_state_t;
typedef enum nfs3_lookup_type {
@@ -262,9 +275,13 @@ struct inode_op_queue {
pthread_mutex_t qlock;
};
+extern rpcsvc_program_t *
+nfs3svc_init (xlator_t *nfsx);
+extern int
+nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options);
+extern uint64_t
+nfs3_request_xlator_deviceid (rpcsvc_request_t *req);
-extern rpcsvc_program_t *
-nfs3svc_init (xlator_t *nfsx);
#endif
diff --git a/xlators/nfs/server/src/nlm4.c b/xlators/nfs/server/src/nlm4.c
index 595738b2c..4d0083fe2 100644
--- a/xlators/nfs/server/src/nlm4.c
+++ b/xlators/nfs/server/src/nlm4.c
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -42,7 +33,6 @@
#include "nfs-generics.h"
#include "rpc-clnt.h"
#include "nsm-xdr.h"
-#include "nlmcbk-xdr.h"
#include "run.h"
#include <unistd.h>
#include <rpc/pmap_clnt.h>
@@ -149,8 +139,10 @@ nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh);
xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
&cst->resolvefh); \
uuid_unparse (cst->resolvefh.gfid, gfid); \
- sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\
- xlatorp ? xlatorp->name : "ERR", gfid); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid); \
gf_log (GF_NLM, GF_LOG_ERROR, "Unable to resolve FH"\
": %s", buf); \
nfstat = nlm4_errno_to_nlm4stat (cst->resolve_errno);\
@@ -164,9 +156,9 @@ nlm4_prep_nlm4_testargs (nlm4_testargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -174,9 +166,9 @@ nlm4_prep_nlm4_lockargs (nlm4_lockargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -184,9 +176,9 @@ nlm4_prep_nlm4_cancargs (nlm4_cancargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -194,9 +186,9 @@ nlm4_prep_nlm4_unlockargs (nlm4_unlockargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -204,9 +196,9 @@ nlm4_prep_shareargs (nlm4_shareargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->share.fh.n_bytes = (void *)fh;
- args->share.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->share.fh.nlm4_netobj_val = (void *)fh;
+ args->share.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -217,25 +209,25 @@ nlm4_prep_freeallargs (nlm4_freeallargs *args, nlm4_lkowner_t *oh)
}
void
-nlm_copy_lkowner (gf_lkowner_t *dst, netobj *src)
+nlm_copy_lkowner (gf_lkowner_t *dst, nlm4_netobj *src)
{
- dst->len = src->n_len;
- memcpy (dst->data, src->n_bytes, dst->len);
+ dst->len = src->nlm4_netobj_len;
+ memcpy (dst->data, src->nlm4_netobj_val, dst->len);
}
int
-nlm_is_oh_same_lkowner (gf_lkowner_t *a, netobj *b)
+nlm_is_oh_same_lkowner (gf_lkowner_t *a, nlm4_netobj *b)
{
if (!a || !b) {
gf_log (GF_NLM, GF_LOG_ERROR, "invalid args");
return -1;
}
- return (a->len == b->n_len &&
- !memcmp (a->data, b->n_bytes, a->len));
+ return (a->len == b->nlm4_netobj_len &&
+ !memcmp (a->data, b->nlm4_netobj_val, a->len));
}
-nfsstat3
+nlm4_stats
nlm4_errno_to_nlm4stat (int errnum)
{
nlm4_stats stat = nlm4_denied;
@@ -434,10 +426,11 @@ ret:
int
nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc)
{
- struct iovec outmsg = {0, };
- struct iobuf *iob = NULL;
- struct nfs3_state *nfs3 = NULL;
- int ret = -1;
+ struct iovec outmsg = {0, };
+ struct iobuf *iob = NULL;
+ struct nfs3_state *nfs3 = NULL;
+ int ret = -1;
+ ssize_t msglen = 0;
struct iobref *iobref = NULL;
if (!req)
@@ -462,7 +455,12 @@ nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc)
/* Use the given serializer to translate the given C structure in arg
* to XDR format which will be written into the buffer in outmsg.
*/
- outmsg.iov_len = sfunc (outmsg, arg);
+ msglen = sfunc (outmsg, arg);
+ if (msglen < 0) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "Failed to encode message");
+ goto ret;
+ }
+ outmsg.iov_len = msglen;
iobref = iobref_new ();
if (iobref == NULL) {
@@ -470,7 +468,11 @@ nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc)
goto ret;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
@@ -650,7 +652,7 @@ err:
}
int
-nlm4_generic_reply (rpcsvc_request_t *req, netobj cookie, nlm4_stats stat)
+nlm4_generic_reply (rpcsvc_request_t *req, nlm4_netobj cookie, nlm4_stats stat)
{
nlm4_res res;
@@ -915,6 +917,8 @@ nlm_rpcclnt_notify (struct rpc_clnt *rpc_clnt, void *mydata,
case RPC_CLNT_DISCONNECT:
nlm_unset_rpc_clnt (rpc_clnt);
break;
+ default:
+ break;
}
err:
@@ -956,7 +960,7 @@ nlm4_establish_callback (void *csarg)
case AF_INET:
inet_ntop (AF_INET, &sock_union.sin.sin_addr, peerip,
INET6_ADDRSTRLEN+1);
- inet_ntop (AF_INET, &(((struct sockaddr_in *)&cs->req->trans->myinfo.sockaddr)->sin_addr),
+ inet_ntop (AF_INET, &(((struct sockaddr_in *)&cs->trans->myinfo.sockaddr)->sin_addr),
myip, INET6_ADDRSTRLEN + 1);
break;
@@ -970,8 +974,10 @@ nlm4_establish_callback (void *csarg)
NLM_V4, IPPROTO_TCP);
if (port == 0) {
- gf_log (GF_NLM, GF_LOG_ERROR, "Unable to get NLM port of the "
- "client. Is the firewall running on client?");
+ gf_log (GF_NLM, GF_LOG_ERROR,
+ "Unable to get NLM port of the client."
+ " Is the firewall running on client?"
+ " OR Are RPC services running (rpcinfo -p)?");
goto err;
}
@@ -1094,7 +1100,11 @@ nlm4svc_send_granted (nfs3_call_state_t *cs)
goto ret;
}
- iobref_add (iobref, iobuf);
+ ret = iobref_add (iobref, iobuf);
+ if (ret) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
ret = rpc_clnt_submit (rpc_clnt, &nlm4clntprog, NLM4_GRANTED,
nlm4svc_send_granted_cbk, &outmsg, 1,
@@ -1814,7 +1824,7 @@ nlm4_add_share_to_inode (nlm_share_t *share)
inode = share->inode;
ret = inode_ctx_get (inode, this, &ctx);
- if (ret || !head) {
+ if (ret == -1) {
ictx = GF_CALLOC (1, sizeof (struct nfs_inode_ctx),
gf_nfs_mt_inode_ctx);
if (!ictx ) {
@@ -2286,34 +2296,34 @@ nlm4svc_sm_notify (struct nlm_sm_status *status)
rpcsvc_actor_t nlm4svc_actors[NLM4_PROC_COUNT] = {
/* 0 */
- {"NULL", NLM4_NULL, nlm4svc_null, NULL},
- {"TEST", NLM4_TEST, nlm4svc_test, NULL},
- {"LOCK", NLM4_LOCK, nlm4svc_lock, NULL},
- {"CANCEL", NLM4_CANCEL, nlm4svc_cancel, NULL},
- {"UNLOCK", NLM4_UNLOCK, nlm4svc_unlock, NULL},
+ {"NULL", NLM4_NULL, nlm4svc_null, NULL, 0, DRC_IDEMPOTENT},
+ {"TEST", NLM4_TEST, nlm4svc_test, NULL, 0, DRC_IDEMPOTENT},
+ {"LOCK", NLM4_LOCK, nlm4svc_lock, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"CANCEL", NLM4_CANCEL, nlm4svc_cancel, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"UNLOCK", NLM4_UNLOCK, nlm4svc_unlock, NULL, 0, DRC_NON_IDEMPOTENT},
/* 5 */
- {"GRANTED", NLM4_GRANTED, NULL, NULL},
- {"TEST", NLM4_TEST_MSG, NULL, NULL},
- {"LOCK", NLM4_LOCK_MSG, NULL, NULL},
- {"CANCEL", NLM4_CANCEL_MSG, NULL, NULL},
- {"UNLOCK", NLM4_UNLOCK_MSG, NULL, NULL},
+ {"GRANTED", NLM4_GRANTED, NULL, NULL, 0, DRC_NA},
+ {"TEST", NLM4_TEST_MSG, NULL, NULL, 0, DRC_NA},
+ {"LOCK", NLM4_LOCK_MSG, NULL, NULL, 0, DRC_NA},
+ {"CANCEL", NLM4_CANCEL_MSG, NULL, NULL, 0, DRC_NA},
+ {"UNLOCK", NLM4_UNLOCK_MSG, NULL, NULL, 0, DRC_NA},
/* 10 */
- {"GRANTED", NLM4_GRANTED_MSG, NULL, NULL},
- {"TEST", NLM4_TEST_RES, NULL, NULL},
- {"LOCK", NLM4_LOCK_RES, NULL, NULL},
- {"CANCEL", NLM4_CANCEL_RES, NULL, NULL},
- {"UNLOCK", NLM4_UNLOCK_RES, NULL, NULL},
+ {"GRANTED", NLM4_GRANTED_MSG, NULL, NULL, 0, DRC_NA},
+ {"TEST", NLM4_TEST_RES, NULL, NULL, 0, DRC_NA},
+ {"LOCK", NLM4_LOCK_RES, NULL, NULL, 0, DRC_NA},
+ {"CANCEL", NLM4_CANCEL_RES, NULL, NULL, 0, DRC_NA},
+ {"UNLOCK", NLM4_UNLOCK_RES, NULL, NULL, 0, DRC_NA},
/* 15 ; procedures 17,18,19 are not defined by nlm */
- {"GRANTED", NLM4_GRANTED_RES, NULL, NULL},
- {"SM_NOTIFY", NLM4_SM_NOTIFY, NULL, NULL},
- {"SEVENTEEN", NLM4_SEVENTEEN, NULL, NULL},
- {"EIGHTEEN", NLM4_EIGHTEEN, NULL, NULL},
- {"NINETEEN", NLM4_NINETEEN, NULL, NULL},
+ {"GRANTED", NLM4_GRANTED_RES, NULL, NULL, 0, DRC_NA},
+ {"SM_NOTIFY", NLM4_SM_NOTIFY, NULL, NULL, 0, DRC_NA},
+ {"SEVENTEEN", NLM4_SEVENTEEN, NULL, NULL, 0, DRC_NA},
+ {"EIGHTEEN", NLM4_EIGHTEEN, NULL, NULL, 0, DRC_NA},
+ {"NINETEEN", NLM4_NINETEEN, NULL, NULL, 0, DRC_NA},
/* 20 */
- {"SHARE", NLM4_SHARE, nlm4svc_share, NULL},
- {"UNSHARE", NLM4_UNSHARE, nlm4svc_unshare, NULL},
- {"NM_LOCK", NLM4_NM_LOCK, nlm4svc_nm_lock, NULL},
- {"FREE_ALL", NLM4_FREE_ALL, nlm4svc_free_all, NULL},
+ {"SHARE", NLM4_SHARE, nlm4svc_share, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"UNSHARE", NLM4_UNSHARE, nlm4svc_unshare, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"NM_LOCK", NLM4_NM_LOCK, nlm4svc_nm_lock, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"FREE_ALL", NLM4_FREE_ALL, nlm4svc_free_all, NULL, 0, DRC_IDEMPOTENT},
};
rpcsvc_program_t nlm4prog = {
@@ -2349,9 +2359,14 @@ nlm4svc_init(xlator_t *nfsx)
int ret = -1;
char *portstr = NULL;
pthread_t thr;
- struct timeval timeout = {0,};
+ struct timespec timeout = {0,};
FILE *pidfile = NULL;
pid_t pid = -1;
+ static gf_boolean_t nlm4_inited = _gf_false;
+
+ /* Already inited */
+ if (nlm4_inited)
+ return &nlm4prog;
nfs = (struct nfs_state*)nfsx->private;
@@ -2397,7 +2412,7 @@ nlm4svc_init(xlator_t *nfsx)
goto err;
}
- rpcsvc_create_listeners (nfs->rpcsvc, options, "NLM");
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, options, "NLM");
if (ret == -1) {
gf_log (GF_NLM, GF_LOG_ERROR, "Unable to create listeners");
dict_unref (options);
@@ -2409,9 +2424,21 @@ nlm4svc_init(xlator_t *nfsx)
/* unlink sm-notify.pid so that when we restart rpc.statd/sm-notify
* it thinks that the machine has restarted and sends NOTIFY to clients.
*/
- ret = unlink ("/var/run/sm-notify.pid");
+
+ /* TODO:
+ notify/rpc.statd is done differently on OSX
+
+ On OSX rpc.statd is controlled by rpc.lockd and both are part of launchd
+ (unified service management framework)
+
+ A runcmd() should be invoking "launchctl start com.apple.lockd"
+ instead. This is still a theory but we need to thoroughly test it
+ out. Until then NLM support is non-existent on OSX.
+ */
+ ret = unlink (GF_SM_NOTIFY_PIDFILE);
if (ret == -1 && errno != ENOENT) {
- gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink sm-notify");
+ gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink %s: %d",
+ GF_SM_NOTIFY_PIDFILE, errno);
goto err;
}
/* temporary work around to restart statd, not distro/OS independent.
@@ -2419,41 +2446,50 @@ nlm4svc_init(xlator_t *nfsx)
* killall will cause problems on solaris.
*/
- pidfile = fopen ("/var/run/rpc.statd.pid", "r");
+ char *pid_file = GF_RPC_STATD_PIDFILE;
+ if (nfs->rpc_statd_pid_file)
+ pid_file = nfs->rpc_statd_pid_file;
+ pidfile = fopen (pid_file, "r");
if (pidfile) {
ret = fscanf (pidfile, "%d", &pid);
if (ret <= 0) {
gf_log (GF_NLM, GF_LOG_WARNING, "unable to get pid of "
- "rpc.statd");
+ "rpc.statd from %s ", GF_RPC_STATD_PIDFILE);
ret = runcmd ("killall", "-9", "rpc.statd", NULL);
} else
kill (pid, SIGKILL);
fclose (pidfile);
} else {
- gf_log (GF_NLM, GF_LOG_WARNING, "opening the pid file of "
- "rpc.statd failed (%s)", strerror (errno));
+ gf_log (GF_NLM, GF_LOG_WARNING, "opening %s of "
+ "rpc.statd failed (%s)", pid_file, strerror (errno));
/* if ret == -1, do nothing - case either statd was not
* running or was running in valgrind mode
*/
ret = runcmd ("killall", "-9", "rpc.statd", NULL);
}
- ret = unlink ("/var/run/rpc.statd.pid");
+ ret = unlink (GF_RPC_STATD_PIDFILE);
if (ret == -1 && errno != ENOENT) {
- gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink rpc.statd");
+ gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink %s", pid_file);
goto err;
}
- ret = runcmd ("/sbin/rpc.statd", NULL);
+ ret = runcmd (nfs->rpc_statd, NULL);
if (ret == -1) {
- gf_log (GF_NLM, GF_LOG_ERROR, "unable to start rpc.statd");
+ gf_log (GF_NLM, GF_LOG_ERROR, "unable to start %s",
+ nfs->rpc_statd);
goto err;
}
+
+
pthread_create (&thr, NULL, nsm_thread, (void*)NULL);
timeout.tv_sec = nlm_grace_period;
+ timeout.tv_nsec = 0;
+
gf_timer_call_after (nfsx->ctx, timeout, nlm_grace_period_over, NULL);
+ nlm4_inited = _gf_true;
return &nlm4prog;
err:
return NULL;
diff --git a/xlators/nfs/server/src/nlm4.h b/xlators/nfs/server/src/nlm4.h
index 4659915aa..e234b6944 100644
--- a/xlators/nfs/server/src/nlm4.h
+++ b/xlators/nfs/server/src/nlm4.h
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NLM4_H_
@@ -40,9 +31,44 @@
#include "nlm4-xdr.h"
#include "lkowner.h"
+/* NLM version 4 procedure numbers, as used in the entries of the
+ * nlm4svc_actors table in nlm4.c. Numbers 17, 18 and 19 are placeholders:
+ * that table notes they are not defined by NLM. NLM4_PROC_COUNT is the
+ * number of entries (0..23) and sizes the actor table. */
+#define NLM4_NULL 0
+#define NLM4_TEST 1
+#define NLM4_LOCK 2
+#define NLM4_CANCEL 3
+#define NLM4_UNLOCK 4
+#define NLM4_GRANTED 5
+#define NLM4_TEST_MSG 6
+#define NLM4_LOCK_MSG 7
+#define NLM4_CANCEL_MSG 8
+#define NLM4_UNLOCK_MSG 9
+#define NLM4_GRANTED_MSG 10
+#define NLM4_TEST_RES 11
+#define NLM4_LOCK_RES 12
+#define NLM4_CANCEL_RES 13
+#define NLM4_UNLOCK_RES 14
+#define NLM4_GRANTED_RES 15
+#define NLM4_SM_NOTIFY 16
+#define NLM4_SEVENTEEN 17
+#define NLM4_EIGHTEEN 18
+#define NLM4_NINETEEN 19
+#define NLM4_SHARE 20
+#define NLM4_UNSHARE 21
+#define NLM4_NM_LOCK 22
+#define NLM4_FREE_ALL 23
+#define NLM4_PROC_COUNT 24
+
/* Registered with portmap */
#define GF_NLM4_PORT 38468
#define GF_NLM GF_NFS"-NLM"
+/* Platform-specific path of the rpc.statd executable and of the pid files
+ * of rpc.statd and sm-notify. */
+#ifdef GF_DARWIN_HOST_OS
+#define GF_RPC_STATD_PROG "/usr/sbin/rpc.statd"
+#define GF_RPC_STATD_PIDFILE "/var/run/statd.pid"
+#define GF_SM_NOTIFY_PIDFILE "/var/run/statd.notify.pid"
+#else
+/* The daemon binary is rpc.statd (note the trailing 'd'). */
+#define GF_RPC_STATD_PROG "/sbin/rpc.statd"
+#define GF_RPC_STATD_PIDFILE "/var/run/rpc.statd.pid"
+#define GF_SM_NOTIFY_PIDFILE "/var/run/sm-notify.pid"
+#endif
extern rpcsvc_program_t *
nlm4svc_init (xlator_t *nfsx);
diff --git a/xlators/nfs/server/src/nlmcbk_svc.c b/xlators/nfs/server/src/nlmcbk_svc.c
index 5401dc39b..20d3728d0 100644
--- a/xlators/nfs/server/src/nlmcbk_svc.c
+++ b/xlators/nfs/server/src/nlmcbk_svc.c
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
/*
@@ -22,7 +13,6 @@
* It was generated using rpcgen.
*/
-#include "nlmcbk-xdr.h"
#include "nlm4.h"
#include "logging.h"
#include <stdio.h>
@@ -92,10 +82,14 @@ void *
nsm_thread (void *argv)
{
register SVCXPRT *transp;
+ int ret = 0;
- pmap_unset (NLMCBK_PROGRAM, NLMCBK_V1);
-
- transp = svcudp_create(RPC_ANYSOCK);
+ ret = pmap_unset (NLMCBK_PROGRAM, NLMCBK_V1);
+ if (ret == 0) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "pmap_unset failed");
+ return NULL;
+ }
+ transp = svcudp_create(RPC_ANYSOCK);
if (transp == NULL) {
gf_log (GF_NLM, GF_LOG_ERROR, "cannot create udp service.");
return NULL;
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am
index f99e11829..a494190ba 100644
--- a/xlators/performance/Makefile.am
+++ b/xlators/performance/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind
+SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind
CLEANFILES =
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
index 30dc14a9c..facff5038 100644
--- a/xlators/performance/io-cache/src/io-cache.c
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -41,6 +41,9 @@ ioc_hashfn (void *data, int len)
return (offset >> ioc_log2_page_size);
}
+/* TODO: These functions are not used; uncomment them when we find a
+ use for them.
+
static inline ioc_inode_t *
ioc_inode_reupdate (ioc_inode_t *ioc_inode)
{
@@ -54,6 +57,7 @@ ioc_inode_reupdate (ioc_inode_t *ioc_inode)
return ioc_inode;
}
+
static inline ioc_inode_t *
ioc_get_inode (dict_t *dict, char *name)
{
@@ -77,6 +81,7 @@ ioc_get_inode (dict_t *dict, char *name)
return ioc_inode;
}
+*/
int32_t
ioc_inode_need_revalidate (ioc_inode_t *ioc_inode)
@@ -316,9 +321,11 @@ ioc_forget (xlator_t *this, inode_t *inode)
static int32_t
ioc_invalidate(xlator_t *this, inode_t *inode)
{
+ uint64_t ioc_addr = 0;
ioc_inode_t *ioc_inode = NULL;
- inode_ctx_get(inode, this, (uint64_t *) &ioc_inode);
+ inode_ctx_get(inode, this, (uint64_t *) &ioc_addr);
+ ioc_inode = (void *) ioc_addr;
if (ioc_inode)
ioc_inode_flush(ioc_inode);
@@ -989,6 +996,7 @@ ioc_dispatch_requests (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd,
"out of memory");
local->op_ret = -1;
local->op_errno = ENOMEM;
+ ioc_inode_unlock (ioc_inode);
goto out;
}
}
@@ -1422,6 +1430,58 @@ ioc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
return 0;
}
+/* Completion callback for a discard wound by ioc_discard(): hands the
+ * result (op_ret, op_errno, pre/post iatt, xdata) back unchanged through
+ * STACK_UNWIND_STRICT. Always returns 0. */
+static int32_t
+ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata);
+ return 0;
+}
+
+/* discard fop: flushes whatever io-cache has stored for fd->inode and then
+ * winds the discard to the first child, completing in ioc_discard_cbk().
+ * Always returns 0; the fop result is delivered through the callback. */
+static int32_t
+ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ /* stays 0 when no context is stored, so the flush below is skipped */
+ uint64_t ioc_inode = 0;
+
+ inode_ctx_get (fd->inode, this, &ioc_inode);
+
+ /* NOTE(review): presumably flushed so that cached pages cannot go
+ * stale once the range is discarded -- confirm */
+ if (ioc_inode)
+ ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+
+ STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+}
+
+/* Completion callback for a zerofill wound by ioc_zerofill(): hands the
+ * result (op_ret, op_errno, pre/post iatt, xdata) back unchanged through
+ * STACK_UNWIND_STRICT. Always returns 0. */
+static int32_t
+ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(zerofill, frame, op_ret,
+ op_errno, pre, post, xdata);
+ return 0;
+}
+
+/* zerofill fop: flushes whatever io-cache has stored for fd->inode and
+ * then winds the zerofill to the first child, completing in
+ * ioc_zerofill_cbk(). Always returns 0; the fop result is delivered
+ * through the callback. */
+static int32_t
+ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ /* stays 0 when no context is stored, so the flush below is skipped */
+ uint64_t ioc_inode = 0;
+
+ inode_ctx_get (fd->inode, this, &ioc_inode);
+
+ /* NOTE(review): presumably flushed so that cached pages cannot go
+ * stale once the range is zero-filled -- confirm */
+ if (ioc_inode)
+ ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+
+ STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+ return 0;
+}
+
+
int32_t
ioc_get_priority_list (const char *opt_str, struct list_head *first)
{
@@ -1608,12 +1668,12 @@ reconfigure (xlator_t *this, dict_t *options)
}
GF_OPTION_RECONF ("max-file-size", table->max_file_size,
- options, size, unlock);
+ options, size_uint64, unlock);
GF_OPTION_RECONF ("min-file-size", table->min_file_size,
- options, size, unlock);
+ options, size_uint64, unlock);
- if ((table->max_file_size >= 0) &&
+ if ((table->max_file_size <= UINT64_MAX) &&
(table->min_file_size > table->max_file_size)) {
gf_log (this->name, GF_LOG_ERROR, "minimum size (%"
PRIu64") of a file that can be cached is "
@@ -1624,7 +1684,7 @@ reconfigure (xlator_t *this, dict_t *options)
}
GF_OPTION_RECONF ("cache-size", cache_size_new,
- options, size, unlock);
+ options, size_uint64, unlock);
if (!check_cache_size_ok (this, cache_size_new)) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR,
@@ -1681,13 +1741,13 @@ init (xlator_t *this)
table->xl = this;
table->page_size = this->ctx->page_size;
- GF_OPTION_INIT ("cache-size", table->cache_size, size, out);
+ GF_OPTION_INIT ("cache-size", table->cache_size, size_uint64, out);
GF_OPTION_INIT ("cache-timeout", table->cache_timeout, int32, out);
- GF_OPTION_INIT ("min-file-size", table->min_file_size, size, out);
+ GF_OPTION_INIT ("min-file-size", table->min_file_size, size_uint64, out);
- GF_OPTION_INIT ("max-file-size", table->max_file_size, size, out);
+ GF_OPTION_INIT ("max-file-size", table->max_file_size, size_uint64, out);
if (!check_cache_size_ok (this, table->cache_size)) {
ret = -1;
@@ -1713,7 +1773,7 @@ init (xlator_t *this)
INIT_LIST_HEAD (&table->inodes);
- if ((table->max_file_size >= 0)
+ if ((table->max_file_size <= UINT64_MAX)
&& (table->min_file_size > table->max_file_size)) {
gf_log ("io-cache", GF_LOG_ERROR, "minimum size (%"
PRIu64") of a file that can be cached is "
@@ -1885,11 +1945,11 @@ int
ioc_inode_dump (xlator_t *this, inode_t *inode)
{
- char *path = NULL;
+ char *path = NULL;
int ret = -1;
char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, };
uint64_t tmp_ioc_inode = 0;
- ioc_inode_t *ioc_inode = NULL;
+ ioc_inode_t *ioc_inode = NULL;
gf_boolean_t section_added = _gf_false;
char uuid_str[64] = {0,};
@@ -1903,9 +1963,6 @@ ioc_inode_dump (xlator_t *this, inode_t *inode)
if (ioc_inode == NULL)
goto out;
- gf_proc_dump_add_section (key_prefix);
- section_added = _gf_true;
-
/* Similar to ioc_page_dump function its better to use
* pthread_mutex_trylock and not to use gf_log in statedump
* to avoid deadlocks.
@@ -1913,24 +1970,30 @@ ioc_inode_dump (xlator_t *this, inode_t *inode)
ret = pthread_mutex_trylock (&ioc_inode->inode_lock);
if (ret)
goto out;
- else
+
{
- gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight);
+ if (uuid_is_null (ioc_inode->inode->gfid))
+ goto unlock;
+
+ gf_proc_dump_add_section (key_prefix);
+ section_added = _gf_true;
- //inode_path takes blocking lock on the itable.
__inode_path (ioc_inode->inode, NULL, &path);
+ gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight);
+
if (path) {
gf_proc_dump_write ("path", "%s", path);
GF_FREE (path);
}
+
gf_proc_dump_write ("uuid", "%s", uuid_utoa_r
(ioc_inode->inode->gfid, uuid_str));
__ioc_cache_dump (ioc_inode, key_prefix);
__ioc_inode_waitq_dump (ioc_inode, key_prefix);
-
- pthread_mutex_unlock (&ioc_inode->inode_lock);
}
+unlock:
+ pthread_mutex_unlock (&ioc_inode->inode_lock);
out:
if (ret && ioc_inode) {
@@ -2044,6 +2107,8 @@ struct xlator_fops fops = {
.mknod = ioc_mknod,
.readdirp = ioc_readdirp,
+ .discard = ioc_discard,
+ .zerofill = ioc_zerofill,
};
diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c
index b2e20ba65..416cd5fe4 100644
--- a/xlators/performance/io-cache/src/page.c
+++ b/xlators/performance/io-cache/src/page.c
@@ -136,6 +136,7 @@ int64_t
ioc_page_destroy (ioc_page_t *page)
{
int64_t ret = 0;
+ struct ioc_inode *inode = NULL;
if (page == NULL) {
goto out;
@@ -143,9 +144,10 @@ ioc_page_destroy (ioc_page_t *page)
ioc_inode_lock (page->inode);
{
+ inode = page->inode;
ret = __ioc_page_destroy (page);
}
- ioc_inode_unlock (page->inode);
+ ioc_inode_unlock (inode);
out:
return ret;
@@ -315,6 +317,7 @@ __ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset,
local->op_errno = ENOMEM;
gf_log (frame->this->name, GF_LOG_WARNING,
"asked to wait on a NULL page");
+ goto out;
}
waitq = GF_CALLOC (1, sizeof (*waitq), gf_ioc_mt_ioc_waitq_t);
@@ -476,6 +479,7 @@ ioc_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
iobref_unref (page->iobref);
GF_FREE (page->vector);
page->vector = NULL;
+ page->iobref = NULL;
}
/* keep a copy of the page for our cache */
@@ -855,7 +859,10 @@ ioc_frame_unwind (call_frame_t *frame)
copied += (fill->count * sizeof (*vector));
- iobref_merge (iobref, fill->iobref);
+ if (iobref_merge (iobref, fill->iobref)) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ }
}
list_del (&fill->list);
@@ -987,7 +994,7 @@ __ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno)
waitq = page->waitq;
page->waitq = NULL;
- gf_log (page->inode->table->xl->name, GF_LOG_WARNING,
+ gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
"page error for page = %p & waitq = %p", page, waitq);
for (trav = waitq; trav; trav = trav->next) {
@@ -1027,6 +1034,7 @@ ioc_waitq_t *
ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno)
{
ioc_waitq_t *waitq = NULL;
+ struct ioc_inode *inode = NULL;
if (page == NULL) {
goto out;
@@ -1034,9 +1042,10 @@ ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno)
ioc_inode_lock (page->inode);
{
+ inode = page->inode;
waitq = __ioc_page_error (page, op_ret, op_errno);
}
- ioc_inode_unlock (page->inode);
+ ioc_inode_unlock (inode);
out:
return waitq;
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index 226c091f1..55bcfd16e 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -14,6 +14,7 @@
#endif
#include "call-stub.h"
+#include "defaults.h"
#include "glusterfs.h"
#include "logging.h"
#include "dict.h"
@@ -29,6 +30,28 @@ int iot_workers_scale (iot_conf_t *conf);
int __iot_workers_scale (iot_conf_t *conf);
struct volume_options options[];
+#define IOT_FOP(name, frame, this, args ...) \
+ do { \
+ call_stub_t *__stub = NULL; \
+ int __ret = -1; \
+ \
+ __stub = fop_##name##_stub(frame, default_##name##_resume, args); \
+ if (!__stub) { \
+ __ret = -ENOMEM; \
+ goto out; \
+ } \
+ \
+ __ret = iot_schedule (frame, this, __stub); \
+ \
+ out: \
+ if (__ret < 0) { \
+ default_##name##_failure_cbk (frame, -__ret); \
+ if (__stub != NULL) { \
+ call_stub_destroy (__stub); \
+ } \
+ } \
+ } while (0)
+
call_stub_t *
__iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep)
{
@@ -307,6 +330,9 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
case GF_FOP_XATTROP:
case GF_FOP_FXATTROP:
case GF_FOP_RCHECKSUM:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
pri = IOT_PRI_LO;
break;
@@ -315,90 +341,23 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
case GF_FOP_RELEASE:
case GF_FOP_RELEASEDIR:
case GF_FOP_GETSPEC:
+ case GF_FOP_IPC:
case GF_FOP_MAXVALUE:
//fail compilation on missing fop
//new fop must choose priority.
break;
}
out:
- ret = do_iot_schedule (this->private, stub, pri);
gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop",
gf_fop_list[stub->fop], iot_get_pri_meaning (pri));
+ ret = do_iot_schedule (this->private, stub, pri);
return ret;
}
int
-iot_lookup_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xdata,
- struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata,
- postparent);
- return 0;
-}
-
-
-int
-iot_lookup_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_lookup_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->lookup,
- loc, xdata);
- return 0;
-}
-
-
-int
iot_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_lookup_stub (frame, iot_lookup_wrapper, loc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create lookup stub (out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- STACK_UNWIND_STRICT (lookup, frame, -1, -ret, NULL, NULL, NULL,
- NULL);
- }
-
- return 0;
-}
-
-
-int
-iot_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop,
- xdata);
- return 0;
-}
-
-
-int
-iot_setattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- STACK_WIND (frame, iot_setattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr,
- loc, stbuf, valid, xdata);
+ IOT_FOP (lookup, frame, this, loc, xdata);
return 0;
}
@@ -407,51 +366,7 @@ int
iot_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_setattr_stub (frame, iot_setattr_wrapper, loc, stbuf, valid,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "Cannot create setattr stub"
- "(Out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
-
- STACK_UNWIND_STRICT (setattr, frame, -1, -ret, NULL, NULL, NULL);
- }
-
- return 0;
-}
-
-
-int
-iot_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, preop, postop,
- xdata);
- return 0;
-}
-
-
-int
-iot_fsetattr_wrapper (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsetattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid,
- xdata);
+ IOT_FOP (setattr, frame, this, loc, stbuf, valid, xdata);
return 0;
}
@@ -460,47 +375,7 @@ int
iot_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsetattr_stub (frame, iot_fsetattr_wrapper, fd, stbuf,
- valid, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fsetattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsetattr, frame, -1, -ret, NULL, NULL,
- NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_access_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t mask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_access_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->access, loc, mask, xdata);
+ IOT_FOP (fsetattr, frame, this, fd, stbuf, valid, xdata);
return 0;
}
@@ -509,49 +384,7 @@ int
iot_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_access_stub (frame, iot_access_wrapper, loc, mask, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create access stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (access, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, const char *path,
- struct iatt *stbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, stbuf,
- xdata);
- return 0;
-}
-
-
-int
-iot_readlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- size_t size, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readlink_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readlink,
- loc, size, xdata);
+ IOT_FOP (access, frame, this, loc, mask, xdata);
return 0;
}
@@ -559,51 +392,7 @@ iot_readlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
int
iot_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readlink_stub (frame, iot_readlink_wrapper, loc, size, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create readlink stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readlink, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_mknod_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t rdev, mode_t umask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_mknod_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->mknod, loc, mode, rdev, umask,
- xdata);
+ IOT_FOP (readlink, frame, this, loc, size, xdata);
return 0;
}
@@ -612,51 +401,7 @@ int
iot_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
dev_t rdev, mode_t umask, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_mknod_stub (frame, iot_mknod_wrapper, loc, mode, rdev,
- umask, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create mknod stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (mknod, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_mkdir_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_mkdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_mkdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->mkdir, loc, mode, umask, xdata);
+ IOT_FOP (mknod, frame, this, loc, mode, rdev, umask, xdata);
return 0;
}
@@ -665,49 +410,7 @@ int
iot_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
mode_t umask, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_mkdir_stub (frame, iot_mkdir_wrapper, loc, mode, umask,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create mkdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (mkdir, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent,
- postparent, xdata);
- return 0;
-}
-
-
-int
-iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_rmdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->rmdir, loc, flags, xdata);
+ IOT_FOP (mkdir, frame, this, loc, mode, umask, xdata);
return 0;
}
@@ -715,49 +418,7 @@ iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, d
int
iot_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_rmdir_stub (frame, iot_rmdir_wrapper, loc, flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create rmdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (rmdir, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_symlink_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_symlink_wrapper (call_frame_t *frame, xlator_t *this, const char *linkname,
- loc_t *loc, mode_t umask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_symlink_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->symlink, linkname, loc, umask,
- xdata);
+ IOT_FOP (rmdir, frame, this, loc, flags, xdata);
return 0;
}
@@ -766,52 +427,7 @@ int
iot_symlink (call_frame_t *frame, xlator_t *this, const char *linkname,
loc_t *loc, mode_t umask, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_symlink_stub (frame, iot_symlink_wrapper, linkname, loc,
- umask, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create symlink stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (symlink, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_rename_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, preoldparent,
- postoldparent, prenewparent, postnewparent, xdata);
- return 0;
-}
-
-
-int
-iot_rename_wrapper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
- loc_t *newloc, dict_t *xdata)
-{
- STACK_WIND (frame, iot_rename_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->rename, oldloc, newloc, xdata);
+ IOT_FOP (symlink, frame, this, linkname, loc, umask, xdata);
return 0;
}
@@ -820,106 +436,17 @@ int
iot_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_rename_stub (frame, iot_rename_wrapper, oldloc, newloc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_DEBUG, "cannot create rename stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (rename, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
+ IOT_FOP (rename, frame, this, oldloc, newloc, xdata);
return 0;
}
int
-iot_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
- return 0;
-}
-
-
-int
-iot_open_wrapper (call_frame_t * frame, xlator_t * this, loc_t *loc,
- int32_t flags, fd_t * fd, dict_t *xdata)
-{
- STACK_WIND (frame, iot_open_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->open, loc, flags, fd,
- xdata);
- return 0;
-}
-
-
-int
iot_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_open_stub (frame, iot_open_wrapper, loc, flags, fd,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create open call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (open, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_create_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, umask, fd, xdata);
- return 0;
+ IOT_FOP (open, frame, this, loc, flags, fd, xdata);
+ return 0;
}
@@ -927,159 +454,25 @@ int
iot_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_create_stub (frame, iot_create_wrapper, loc, flags, mode,
- umask, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create \"create\" call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (create, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
+ IOT_FOP (create, frame, this, loc, flags, mode, umask, fd, xdata);
return 0;
}
int
-iot_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vector,
- int32_t count, struct iatt *stbuf, struct iobref *iobref,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count,
- stbuf, iobref, xdata);
-
- return 0;
-}
-
-
-int
-iot_readv_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readv_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv,
- fd, size, offset, flags, xdata);
- return 0;
-}
-
-
-int
iot_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, uint32_t flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readv_stub (frame, iot_readv_wrapper, fd, size, offset,
- flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create readv call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readv, frame, -1, -ret, NULL, -1, NULL,
- NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- STACK_WIND (frame, iot_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd, xdata);
- return 0;
+ IOT_FOP (readv, frame, this, fd, size, offset, flags, xdata);
+ return 0;
}
int
iot_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_flush_stub (frame, iot_flush_wrapper, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create flush_cbk call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (flush, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
- return 0;
-}
-
-
-int
-iot_fsync_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsync_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsync,
- fd, datasync, xdata);
- return 0;
+ IOT_FOP (flush, frame, this, fd, xdata);
+ return 0;
}
@@ -1087,54 +480,8 @@ int
iot_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsync_stub (frame, iot_fsync_wrapper, fd, datasync, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fsync_cbk call stub"
- "(out of memory)");
- ret = -1;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsync, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
- return 0;
-}
-
-
-int
-iot_writev_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count,
- off_t offset, uint32_t flags, struct iobref *iobref,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd, vector, count, offset, flags, iobref, xdata);
- return 0;
+ IOT_FOP (fsync, frame, this, fd, datasync, xdata);
+ return 0;
}
@@ -1143,53 +490,9 @@ iot_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_writev_stub (frame, iot_writev_wrapper, fd, vector,
- count, offset, flags, iobref, xdata);
-
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create writev call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (writev, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int32_t
-iot_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, flock, xdata);
- return 0;
-}
-
-
-int
-iot_lk_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t cmd, struct gf_flock *flock, dict_t *xdata)
-{
- STACK_WIND (frame, iot_lk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lk,
- fd, cmd, flock, xdata);
- return 0;
+ IOT_FOP (writev, frame, this, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
}
@@ -1197,149 +500,24 @@ int
iot_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_lk_stub (frame, iot_lk_wrapper, fd, cmd, flock, xdata);
-
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_lk call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (lk, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);
- return 0;
-}
-
-
-int
-iot_stat_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- STACK_WIND (frame, iot_stat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat,
- loc, xdata);
- return 0;
+ IOT_FOP (lk, frame, this, fd, cmd, flock, xdata);
+ return 0;
}
int
iot_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_stat_stub (frame, iot_stat_wrapper, loc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_stat call stub"
- "(out of memory)");
- ret = -1;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (stat, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata);
- return 0;
-}
-
-
-int
-iot_fstat_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fstat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat,
- fd, xdata);
- return 0;
+ IOT_FOP (stat, frame, this, loc, xdata);
+ return 0;
}
int
iot_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fstat_stub (frame, iot_fstat_wrapper, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_fstat call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fstat, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf,
- postbuf, xdata);
- return 0;
-}
-
-
-int
-iot_truncate_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc, offset, xdata);
- return 0;
+ IOT_FOP (fstat, frame, this, fd, xdata);
+ return 0;
}
@@ -1347,55 +525,8 @@ int
iot_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
dict_t *xdata)
{
- call_stub_t *stub;
- int ret = -1;
-
- stub = fop_truncate_stub (frame, iot_truncate_wrapper, loc, offset,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_stat call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (truncate, frame, -1, -ret, NULL, NULL,
- NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf,
- postbuf, xdata);
- return 0;
-}
-
-
-int
-iot_ftruncate_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd, offset, xdata);
- return 0;
+ IOT_FOP (truncate, frame, this, loc, offset, xdata);
+ return 0;
}
@@ -1403,106 +534,17 @@ int
iot_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_ftruncate_stub (frame, iot_ftruncate_wrapper, fd, offset,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_ftruncate call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (ftruncate, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-
-int
-iot_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent,
- postparent, xdata);
- return 0;
+ IOT_FOP (ftruncate, frame, this, fd, offset, xdata);
+ return 0;
}
-int
-iot_unlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t xflag, dict_t *xdata)
-{
- STACK_WIND (frame, iot_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc, xflag, xdata);
- return 0;
-}
-
int
iot_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_unlink_stub (frame, iot_unlink_wrapper, loc, xflag, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_unlink call stub"
- "(out of memory)");
- ret = -1;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (unlink, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_link_wrapper (call_frame_t *frame, xlator_t *this, loc_t *old, loc_t *new,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_link_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->link, old, new, xdata);
-
+ IOT_FOP (unlink, frame, this, loc, xflag, xdata);
return 0;
}
@@ -1511,46 +553,7 @@ int
iot_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_link_stub (frame, iot_link_wrapper, oldloc, newloc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create link stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (link, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata);
- return 0;
-}
-
-
-int
-iot_opendir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_opendir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->opendir, loc, fd, xdata);
+ IOT_FOP (link, frame, this, oldloc, newloc, xdata);
return 0;
}
@@ -1559,45 +562,7 @@ int
iot_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_opendir_stub (frame, iot_opendir_wrapper, loc, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create opendir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (opendir, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_fsyncdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int datasync, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsyncdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsyncdir, fd, datasync, xdata);
+ IOT_FOP (opendir, frame, this, loc, fd, xdata);
return 0;
}
@@ -1606,47 +571,7 @@ int
iot_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsyncdir_stub (frame, iot_fsyncdir_wrapper, fd, datasync,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fsyncdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsyncdir, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *buf,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
- return 0;
-}
-
-
-int
-iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_statfs_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->statfs, loc, xdata);
+ IOT_FOP (fsyncdir, frame, this, fd, datasync, xdata);
return 0;
}
@@ -1654,45 +579,7 @@ iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
int
iot_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_statfs_stub (frame, iot_statfs_wrapper, loc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create statfs stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (statfs, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_setxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *dict, int32_t flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_setxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setxattr, loc, dict, flags, xdata);
+ IOT_FOP (statfs, frame, this, loc, xdata);
return 0;
}
@@ -1701,47 +588,7 @@ int
iot_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
int32_t flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_setxattr_stub (frame, iot_setxattr_wrapper, loc, dict,
- flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create setxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (setxattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
-}
-
-
-int
-iot_getxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_getxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->getxattr, loc, name, xdata);
+ IOT_FOP (setxattr, frame, this, loc, dict, flags, xdata);
return 0;
}
@@ -1750,47 +597,7 @@ int
iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_getxattr_stub (frame, iot_getxattr_wrapper, loc, name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create getxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
-}
-
-
-int
-iot_fgetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fgetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata);
+ IOT_FOP (getxattr, frame, this, loc, name, xdata);
return 0;
}
@@ -1799,46 +606,7 @@ int
iot_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fgetxattr_stub (frame, iot_fgetxattr_wrapper, fd, name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fgetxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fgetxattr, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_fsetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- dict_t *dict, int32_t flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags,
- xdata);
+ IOT_FOP (fgetxattr, frame, this, fd, name, xdata);
return 0;
}
@@ -1847,46 +615,7 @@ int
iot_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
int32_t flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsetxattr_stub (frame, iot_fsetxattr_wrapper, fd, dict,
- flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fsetxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsetxattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_removexattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_removexattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->removexattr, loc, name, xdata);
+ IOT_FOP (fsetxattr, frame, this, fd, dict, flags, xdata);
return 0;
}
@@ -1895,94 +624,15 @@ int
iot_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_removexattr_stub (frame, iot_removexattr_wrapper, loc,
- name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,"cannot get removexattr fop"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (removexattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-int
-iot_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);
+ IOT_FOP (removexattr, frame, this, loc, name, xdata);
return 0;
}
-
-int
-iot_fremovexattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fremovexattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fremovexattr, fd, name, xdata);
- return 0;
-}
-
-
int
iot_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fremovexattr_stub (frame, iot_fremovexattr_wrapper, fd,
- name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,"cannot get fremovexattr fop"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fremovexattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
- return 0;
-}
-
-
-int
-iot_readdirp_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readdirp_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readdirp, fd, size, offset, xdata);
+ IOT_FOP (fremovexattr, frame, this, fd, name, xdata);
return 0;
}
@@ -1991,47 +641,7 @@ int
iot_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readdirp_stub (frame, iot_readdirp_wrapper, fd, size,
- offset, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readdirp, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata);
- return 0;
-}
-
-
-int
-iot_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readdir, fd, size, offset, xdata);
+ IOT_FOP (readdirp, frame, this, fd, size, offset, xdata);
return 0;
}
@@ -2040,369 +650,94 @@ int
iot_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readdir_stub (frame, iot_readdir_wrapper, fd, size, offset,
- xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readdir, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
+ IOT_FOP (readdir, frame, this, fd, size, offset, xdata);
return 0;
}
int
-iot_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_inodelk_wrapper (call_frame_t *frame, xlator_t *this, const char *volume,
- loc_t *loc, int32_t cmd, struct gf_flock *lock,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_inodelk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->inodelk, volume, loc, cmd, lock,
- xdata);
- return 0;
-}
-
-
-int
iot_inodelk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_inodelk_stub (frame, iot_inodelk_wrapper,
- volume, loc, cmd, lock, xdata);
- if (!stub) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (inodelk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-int
-iot_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata);
+ IOT_FOP (inodelk, frame, this, volume, loc, cmd, lock, xdata);
return 0;
}
-
-int
-iot_finodelk_wrapper (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- STACK_WIND (frame, iot_finodelk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->finodelk, volume, fd, cmd, lock,
- xdata);
- return 0;
-}
-
-
int
iot_finodelk (call_frame_t *frame, xlator_t *this,
const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_finodelk_stub (frame, iot_finodelk_wrapper,
- volume, fd, cmd, lock, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get finodelk stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (finodelk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-int
-iot_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata);
+ IOT_FOP (finodelk, frame, this, volume, fd, cmd, lock, xdata);
return 0;
}
-
-int
-iot_entrylk_wrapper (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- STACK_WIND (frame, iot_entrylk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->entrylk,
- volume, loc, basename, cmd, type, xdata);
- return 0;
-}
-
-
int
iot_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_entrylk_stub (frame, iot_entrylk_wrapper,
- volume, loc, basename, cmd, type, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get entrylk stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (entrylk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
+ IOT_FOP (entrylk, frame, this, volume, loc, basename, cmd, type, xdata);
return 0;
}
int
-iot_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_fentrylk_wrapper (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fentrylk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fentrylk,
- volume, fd, basename, cmd, type, xdata);
- return 0;
-}
-
-
-int
iot_fentrylk (call_frame_t *frame, xlator_t *this,
const char *volume, fd_t *fd, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fentrylk_stub (frame, iot_fentrylk_wrapper,
- volume, fd, basename, cmd, type, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get fentrylk stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fentrylk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
+ IOT_FOP (fentrylk, frame, this, volume, fd, basename, cmd, type, xdata);
return 0;
}
int
-iot_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata)
+iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, xdata);
+ IOT_FOP (xattrop, frame, this, loc, optype, xattr, xdata);
return 0;
}
int
-iot_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- STACK_WIND (frame, iot_xattrop_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr, xdata);
+ IOT_FOP (fxattrop, frame, this, fd, optype, xattr, xdata);
return 0;
}
-int
-iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+int32_t
+iot_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ int32_t len, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_xattrop_stub (frame, iot_xattrop_wrapper, loc, optype,
- xattr, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create xattrop stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (xattrop, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
+ IOT_FOP (rchecksum, frame, this, fd, offset, len, xdata);
return 0;
}
-
int
-iot_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata)
+iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
{
- STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, xattr, xdata);
+ IOT_FOP (fallocate, frame, this, fd, mode, offset, len, xdata);
return 0;
}
int
-iot_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
{
- STACK_WIND (frame, iot_fxattrop_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr, xdata);
+ IOT_FOP (discard, frame, this, fd, offset, len, xdata);
return 0;
}
-
int
-iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fxattrop_stub (frame, iot_fxattrop_wrapper, fd, optype,
- xattr, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fxattrop stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fxattrop, frame, -1, -ret, NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int32_t
-iot_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, uint32_t weak_checksum,
- uint8_t *strong_checksum, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum,
- strong_checksum, xdata);
- return 0;
-}
-
-
-int32_t
-iot_rchecksum_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset, int32_t len, dict_t *xdata)
-{
- STACK_WIND (frame, iot_rchecksum_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
- return 0;
-}
-
-
-int32_t
-iot_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- int32_t len, dict_t *xdata)
-{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_rchecksum_stub (frame, iot_rchecksum_wrapper, fd, offset,
- len, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create rchecksum stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (rchecksum, frame, -1, -ret, -1, NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
+ IOT_FOP (zerofill, frame, this, fd, offset, len, xdata);
return 0;
}
@@ -2432,7 +767,7 @@ __iot_workers_scale (iot_conf_t *conf)
while (diff) {
diff --;
- ret = pthread_create (&thread, &conf->w_attr, iot_worker, conf);
+ ret = gf_thread_create (&thread, &conf->w_attr, iot_worker, conf);
if (ret == 0) {
conf->curr_count++;
gf_log (conf->this->name, GF_LOG_DEBUG,
@@ -2736,6 +1071,9 @@ struct xlator_fops fops = {
.xattrop = iot_xattrop,
.fxattrop = iot_fxattrop,
.rchecksum = iot_rchecksum,
+ .fallocate = iot_fallocate,
+ .discard = iot_discard,
+ .zerofill = iot_zerofill,
};
struct xlator_cbks cbks;
diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c
index ffb933118..ef156e309 100644
--- a/xlators/performance/md-cache/src/md-cache.c
+++ b/xlators/performance/md-cache/src/md-cache.c
@@ -18,6 +18,7 @@
#include "dict.h"
#include "xlator.h"
#include "md-cache-mem-types.h"
+#include "glusterfs-acl.h"
#include <assert.h>
#include <sys/time.h>
@@ -42,12 +43,12 @@ static struct mdc_key {
int check;
} mdc_keys[] = {
{
- .name = "system.posix_acl_access",
+ .name = POSIX_ACL_ACCESS_XATTR,
.load = 0,
.check = 1,
},
{
- .name = "system.posix_acl_default",
+ .name = POSIX_ACL_DEFAULT_XATTR,
.load = 0,
.check = 1,
},
@@ -132,6 +133,7 @@ struct mdc_local {
loc_t loc2;
fd_t *fd;
char *linkname;
+ char *key;
dict_t *xattr;
};
@@ -174,7 +176,7 @@ __mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc)
uint64_t mdc_int = 0;
mdc_int = (long) mdc;
- ret = __inode_ctx_set2 (inode, this, &mdc_int, 0);
+ ret = __inode_ctx_set (inode, this, &mdc_int);
return ret;
}
@@ -229,6 +231,8 @@ mdc_local_wipe (xlator_t *this, mdc_local_t *local)
GF_FREE (local->linkname);
+ GF_FREE (local->key);
+
if (local->xattr)
dict_unref (local->xattr);
@@ -585,6 +589,31 @@ out:
+/*
+ * Drop the single key `name` from the xattr dictionary cached on `inode`.
+ *
+ * Returns 0 once the key has been passed to dict_del(), or -1 when
+ * mdc_inode_prep() yields no cache object or `name` is NULL.
+ */
int
+mdc_inode_xatt_unset (xlator_t *this, inode_t *inode, char *name)
+{
+        struct md_cache *cache = NULL;
+
+        /* The cache object is looked up before the key is validated,
+           in the same order as before. */
+        cache = mdc_inode_prep (this, inode);
+        if (cache == NULL)
+                return -1;
+
+        if (name == NULL)
+                return -1;
+
+        LOCK (&cache->lock);
+        {
+                dict_del (cache->xattr, name);
+        }
+        UNLOCK (&cache->lock);
+
+        return 0;
+}
+
+
+int
mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict)
{
int ret = -1;
@@ -598,13 +627,15 @@ mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict)
LOCK (&mdc->lock);
{
+ ret = 0;
+ /* Missing xattr only means no keys were there, i.e
+ a negative cache for the "loaded" keys
+ */
if (!mdc->xattr)
goto unlock;
if (dict)
*dict = dict_ref (mdc->xattr);
-
- ret = 0;
}
unlock:
UNLOCK (&mdc->lock);
@@ -614,6 +645,46 @@ out:
}
+/*
+ * Reset the ia_time stamp of the md-cache object attached to `inode` to 0,
+ * under the cache lock.  NOTE(review): presumably a zero stamp makes the
+ * cached iatt fail its freshness check - confirm against the validity
+ * helper.
+ *
+ * Returns 0 when a cache object was found and reset, -1 when
+ * mdc_inode_ctx_get() reports that the inode has none.
+ */
+int
+mdc_inode_iatt_invalidate (xlator_t *this, inode_t *inode)
+{
+        int              ret = -1;
+        struct md_cache *mdc = NULL;
+
+        if (mdc_inode_ctx_get (this, inode, &mdc) != 0)
+                goto out;
+
+        LOCK (&mdc->lock);
+        {
+                mdc->ia_time = 0;
+        }
+        UNLOCK (&mdc->lock);
+
+        /* ret was never updated before, so success was reported as -1 */
+        ret = 0;
+out:
+        return ret;
+}
+
+
+/*
+ * Reset the xa_time stamp of the md-cache object attached to `inode` to 0,
+ * under the cache lock.  NOTE(review): presumably a zero stamp makes the
+ * cached xattrs fail their freshness check - confirm against the validity
+ * helper.
+ *
+ * Returns 0 when a cache object was found and reset, -1 when
+ * mdc_inode_ctx_get() reports that the inode has none.
+ */
+int
+mdc_inode_xatt_invalidate (xlator_t *this, inode_t *inode)
+{
+        int              ret = -1;
+        struct md_cache *mdc = NULL;
+
+        if (mdc_inode_ctx_get (this, inode, &mdc) != 0)
+                goto out;
+
+        LOCK (&mdc->lock);
+        {
+                mdc->xa_time = 0;
+        }
+        UNLOCK (&mdc->lock);
+
+        /* ret was never updated before, so success was reported as -1 */
+        ret = 0;
+out:
+        return ret;
+}
+
+
void
mdc_load_reqs (xlator_t *this, dict_t *dict)
{
@@ -647,7 +718,7 @@ is_mdc_key_satisfied (const char *key)
return 0;
for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) {
- if (!mdc_keys[i].check)
+ if (!mdc_keys[i].load)
continue;
if (strcmp (mdc_key, key) == 0)
return 1;
@@ -721,6 +792,7 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt stbuf = {0, };
struct iatt postparent = {0, };
dict_t *xattr_rsp = NULL;
+ dict_t *xattr_alloc = NULL;
mdc_local_t *local = NULL;
@@ -728,6 +800,13 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
if (!local)
goto uncached;
+ if (!loc->name)
+ /* A nameless discovery is dangerous to cache. We
+ perform nameless lookup with the intention of
+ re-establishing an inode "properly"
+ */
+ goto uncached;
+
loc_copy (&local->loc, loc);
ret = mdc_inode_iatt_get (this, loc->inode, &stbuf);
@@ -752,6 +831,8 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
uncached:
+ if (!xdata)
+ xdata = xattr_alloc = dict_new ();
if (xdata)
mdc_load_reqs (this, xdata);
@@ -760,7 +841,8 @@ uncached:
if (xattr_rsp)
dict_unref (xattr_rsp);
-
+ if (xattr_alloc)
+ dict_unref (xattr_alloc);
return 0;
}
@@ -1573,6 +1655,8 @@ mdc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
mdc_inode_xatt_update (this, local->loc.inode, local->xattr);
+ mdc_inode_iatt_invalidate (this, local->loc.inode);
+
out:
MDC_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
@@ -1614,6 +1698,7 @@ mdc_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
mdc_inode_xatt_update (this, local->fd->inode, local->xattr);
+ mdc_inode_iatt_invalidate (this, local->fd->inode);
out:
MDC_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
@@ -1666,6 +1751,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
dict_t *xdata)
{
int ret;
+ int op_errno = ENODATA;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
@@ -1682,10 +1768,12 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
if (ret != 0)
goto uncached;
- if (!dict_get (xattr, (char *)key))
- goto uncached;
+ if (!xattr || !dict_get (xattr, (char *)key)) {
+ ret = -1;
+ op_errno = ENODATA;
+ }
- MDC_STACK_UNWIND (getxattr, frame, 0, 0, xattr, xdata);
+ MDC_STACK_UNWIND (getxattr, frame, ret, op_errno, xattr, xdata);
return 0;
@@ -1727,6 +1815,7 @@ mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
int ret;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
+ int op_errno = ENODATA;
local = mdc_local_get (frame);
if (!local)
@@ -1741,10 +1830,12 @@ mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
if (ret != 0)
goto uncached;
- if (!dict_get (xattr, (char *)key))
- goto uncached;
+ if (!xattr || !dict_get (xattr, (char *)key)) {
+ ret = -1;
+ op_errno = ENODATA;
+ }
- MDC_STACK_UNWIND (fgetxattr, frame, 0, 0, xattr, xdata);
+ MDC_STACK_UNWIND (fgetxattr, frame, ret, op_errno, xattr, xdata);
return 0;
@@ -1755,6 +1846,97 @@ uncached:
return 0;
}
+/*
+ * Callback for removexattr.  On success, and when a local was recorded at
+ * wind time, either drop the removed key from the cached xattrs (key known)
+ * or invalidate the whole xattr cache (no key recorded), then invalidate
+ * the cached iatt.  The reply is always unwound to the caller unchanged.
+ */
+int
+mdc_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        mdc_local_t *local = frame->local;
+
+        if (op_ret == 0 && local != NULL) {
+                if (local->key == NULL)
+                        mdc_inode_xatt_invalidate (this, local->loc.inode);
+                else
+                        mdc_inode_xatt_unset (this, local->loc.inode,
+                                              local->key);
+
+                mdc_inode_iatt_invalidate (this, local->loc.inode);
+        }
+
+        MDC_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata);
+
+        return 0;
+}
+
+
+/*
+ * removexattr fop: remember the target loc and the key being removed in the
+ * frame-local, then wind the call to the first child.
+ *
+ * mdc_local_get() may return NULL (mdc_lookup and mdc_removexattr_cbk both
+ * test for that).  In that case nothing is recorded and the fop is still
+ * wound; the callback then skips its cache maintenance for this call
+ * instead of this function dereferencing a NULL pointer.
+ */
+int
+mdc_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                 const char *name, dict_t *xdata)
+{
+        mdc_local_t *local = NULL;
+
+        local = mdc_local_get (frame);
+        if (local) {
+                loc_copy (&local->loc, loc);
+
+                /* a NULL key makes the callback invalidate all xattrs */
+                local->key = gf_strdup (name);
+        }
+
+        STACK_WIND (frame, mdc_removexattr_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr,
+                    loc, name, xdata);
+        return 0;
+}
+
+
+/*
+ * Callback for fremovexattr.  On success, and when a local was recorded at
+ * wind time, either drop the removed key from the cached xattrs (key known)
+ * or invalidate the whole xattr cache (no key recorded), then invalidate
+ * the cached iatt of the fd's inode.  The reply is always unwound to the
+ * caller unchanged.
+ */
+int
+mdc_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+        mdc_local_t *local = frame->local;
+
+        if (op_ret == 0 && local != NULL) {
+                if (local->key == NULL)
+                        mdc_inode_xatt_invalidate (this, local->fd->inode);
+                else
+                        mdc_inode_xatt_unset (this, local->fd->inode,
+                                              local->key);
+
+                mdc_inode_iatt_invalidate (this, local->fd->inode);
+        }
+
+        MDC_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata);
+
+        return 0;
+}
+
+
+/*
+ * fremovexattr fop: remember a reference to the fd and the key being
+ * removed in the frame-local, then wind the call to the first child.
+ *
+ * mdc_local_get() may return NULL (mdc_lookup and mdc_fremovexattr_cbk both
+ * test for that).  In that case nothing is recorded and the fop is still
+ * wound; the callback then skips its cache maintenance for this call
+ * instead of this function dereferencing a NULL pointer.
+ */
+int
+mdc_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                  const char *name, dict_t *xdata)
+{
+        mdc_local_t *local = NULL;
+
+        local = mdc_local_get (frame);
+        if (local) {
+                local->fd = fd_ref (fd);
+
+                /* a NULL key makes the callback invalidate all xattrs */
+                local->key = gf_strdup (name);
+        }
+
+        STACK_WIND (frame, mdc_fremovexattr_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr,
+                    fd, name, xdata);
+        return 0;
+}
+
int
mdc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1782,9 +1964,18 @@ int
mdc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd,
size_t size, off_t offset, dict_t *xdata)
{
+ dict_t *xattr_alloc = NULL;
+
+ if (!xdata)
+ xdata = xattr_alloc = dict_new ();
+ if (xdata)
+ mdc_load_reqs (this, xdata);
+
STACK_WIND (frame, mdc_readdirp_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp,
fd, size, offset, xdata);
+ if (xattr_alloc)
+ dict_unref (xattr_alloc);
return 0;
}
@@ -1828,6 +2019,123 @@ mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
+/*
+ * Callback for fallocate.  When the operation succeeded and a local was
+ * recorded at wind time, pass the pre/post iatts for the fd's inode to
+ * mdc_inode_iatt_set_validate(); the reply is then unwound unchanged.
+ */
+int
+mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno,
+                  struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+        mdc_local_t *local = frame->local;
+
+        if (op_ret == 0 && local != NULL)
+                mdc_inode_iatt_set_validate (this, local->fd->inode,
+                                             prebuf, postbuf);
+
+        MDC_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+                          xdata);
+
+        return 0;
+}
+
+/*
+ * fallocate fop: keep a reference to the fd in the frame-local so that the
+ * callback can update the cached iatt, then wind to the first child.
+ *
+ * mdc_local_get() may return NULL (mdc_fallocate_cbk tests for it); the fop
+ * is then wound without a local instead of dereferencing a NULL pointer,
+ * and the callback skips the cache update.
+ */
+int mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                  off_t offset, size_t len, dict_t *xdata)
+{
+        mdc_local_t *local = NULL;
+
+        local = mdc_local_get(frame);
+        if (local)
+                local->fd = fd_ref(fd);
+
+        STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+                   xdata);
+
+        return 0;
+}
+
+/*
+ * Callback for discard.  When the operation succeeded and a local was
+ * recorded at wind time, pass the pre/post iatts for the fd's inode to
+ * mdc_inode_iatt_set_validate(); the reply is then unwound unchanged.
+ */
+int
+mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno,
+                struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+        mdc_local_t *local = frame->local;
+
+        if (op_ret == 0 && local != NULL)
+                mdc_inode_iatt_set_validate (this, local->fd->inode,
+                                             prebuf, postbuf);
+
+        MDC_STACK_UNWIND (discard, frame, op_ret, op_errno, prebuf, postbuf,
+                          xdata);
+
+        return 0;
+}
+
+/*
+ * discard fop: keep a reference to the fd in the frame-local so that the
+ * callback can update the cached iatt, then wind to the first child.
+ *
+ * mdc_local_get() may return NULL (mdc_discard_cbk tests for it); the fop
+ * is then wound without a local instead of dereferencing a NULL pointer,
+ * and the callback skips the cache update.
+ */
+int mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                size_t len, dict_t *xdata)
+{
+        mdc_local_t *local = NULL;
+
+        local = mdc_local_get(frame);
+        if (local)
+                local->fd = fd_ref(fd);
+
+        STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->discard, fd, offset, len,
+                   xdata);
+
+        return 0;
+}
+
+/*
+ * Callback for zerofill.  When the operation succeeded and a local was
+ * recorded at wind time, pass the pre/post iatts for the fd's inode to
+ * mdc_inode_iatt_set_validate(); the reply is then unwound unchanged.
+ */
+int
+mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+        mdc_local_t *local = frame->local;
+
+        if (op_ret == 0 && local != NULL)
+                mdc_inode_iatt_set_validate (this, local->fd->inode,
+                                             prebuf, postbuf);
+
+        MDC_STACK_UNWIND (zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+                          xdata);
+
+        return 0;
+}
+
+/*
+ * zerofill fop: keep a reference to the fd in the frame-local so that the
+ * callback can update the cached iatt, then wind to the first child.
+ *
+ * mdc_local_get() may return NULL (mdc_zerofill_cbk tests for it); the fop
+ * is then wound without a local instead of dereferencing a NULL pointer,
+ * and the callback skips the cache update.
+ */
+int mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                 off_t len, dict_t *xdata)
+{
+        mdc_local_t *local = NULL;
+
+        local = mdc_local_get(frame);
+        if (local)
+                local->fd = fd_ref(fd);
+
+        STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->zerofill, fd, offset, len,
+                   xdata);
+
+        return 0;
+}
+
int
mdc_forget (xlator_t *this, inode_t *inode)
@@ -1955,8 +2263,13 @@ struct xlator_fops fops = {
.fsetxattr = mdc_fsetxattr,
.getxattr = mdc_getxattr,
.fgetxattr = mdc_fgetxattr,
+ .removexattr = mdc_removexattr,
+ .fremovexattr= mdc_fremovexattr,
.readdirp = mdc_readdirp,
- .readdir = mdc_readdir
+ .readdir = mdc_readdir,
+ .fallocate = mdc_fallocate,
+ .discard = mdc_discard,
+ .zerofill = mdc_zerofill,
};
@@ -1986,4 +2299,5 @@ struct volume_options options[] = {
.description = "Convert all readdir requests to readdirplus to "
"collect stat info on each entry.",
},
+ { .key = {NULL} },
};
diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c
index e23a23325..742e4df3f 100644
--- a/xlators/performance/open-behind/src/open-behind.c
+++ b/xlators/performance/open-behind/src/open-behind.c
@@ -23,6 +23,11 @@ typedef struct ob_conf {
like mandatory locks
*/
gf_boolean_t lazy_open; /* delay backend open as much as possible */
+ gf_boolean_t read_after_open; /* instead of sending readvs on
+ anonymous fds, open the file
+ first and then send readv i.e
+ similar to what writev does
+ */
} ob_conf_t;
@@ -367,8 +372,14 @@ ob_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
{
call_stub_t *stub = NULL;
fd_t *wind_fd = NULL;
+ ob_conf_t *conf = NULL;
- wind_fd = ob_get_wind_fd (this, fd);
+ conf = this->private;
+
+ if (!conf->read_after_open)
+ wind_fd = ob_get_wind_fd (this, fd);
+ else
+ wind_fd = fd_ref (fd);
stub = fop_readv_stub (frame, default_readv_resume, wind_fd,
size, offset, flags, xdata);
@@ -681,6 +692,63 @@ err:
return 0;
}
+/*
+ * Wrap the fallocate call in a stub that resumes through
+ * default_fallocate_resume and hand the stub to open_and_resume(). When the
+ * stub cannot be created the call is unwound with -1/ENOMEM. Always returns 0.
+ */
+int
+ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+             off_t offset, size_t len, dict_t *xdata)
+{
+        call_stub_t *resume_stub;
+
+        resume_stub = fop_fallocate_stub(frame, default_fallocate_resume, fd,
+                                         mode, offset, len, xdata);
+        if (resume_stub) {
+                open_and_resume(this, fd, resume_stub);
+                return 0;
+        }
+
+        /* stub creation failed */
+        STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Wrap the discard call in a stub that resumes through
+ * default_discard_resume and hand the stub to open_and_resume(). When the
+ * stub cannot be created the call is unwound with -1/ENOMEM. Always returns 0.
+ */
+int
+ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+           size_t len, dict_t *xdata)
+{
+        call_stub_t *resume_stub;
+
+        resume_stub = fop_discard_stub(frame, default_discard_resume, fd,
+                                       offset, len, xdata);
+        if (resume_stub) {
+                open_and_resume(this, fd, resume_stub);
+                return 0;
+        }
+
+        /* stub creation failed */
+        STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * Wrap the zerofill call in a stub that resumes through
+ * default_zerofill_resume and hand the stub to open_and_resume(). When the
+ * stub cannot be created the call is unwound with -1/ENOMEM. Always returns 0.
+ */
+int
+ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+        call_stub_t *resume_stub;
+
+        resume_stub = fop_zerofill_stub(frame, default_zerofill_resume, fd,
+                                        offset, len, xdata);
+        if (resume_stub) {
+                open_and_resume(this, fd, resume_stub);
+                return 0;
+        }
+
+        /* stub creation failed */
+        STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+        return 0;
+}
+
int
ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
@@ -697,6 +765,8 @@ ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
fd = fd_lookup (loc->inode, 0);
open_and_resume (this, fd, stub);
+ if (fd)
+ fd_unref (fd);
return 0;
err:
@@ -721,6 +791,8 @@ ob_rename (call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst,
fd = fd_lookup (dst->inode, 0);
open_and_resume (this, fd, stub);
+ if (fd)
+ fd_unref (fd);
return 0;
err:
@@ -833,6 +905,8 @@ reconfigure (xlator_t *this, dict_t *options)
bool, out);
GF_OPTION_RECONF ("lazy-open", conf->lazy_open, options, bool, out);
+ GF_OPTION_RECONF ("read-after-open", conf->read_after_open, options,
+ bool, out);
ret = 0;
out:
@@ -863,7 +937,7 @@ init (xlator_t *this)
GF_OPTION_INIT ("use-anonymous-fd", conf->use_anonymous_fd, bool, err);
GF_OPTION_INIT ("lazy-open", conf->lazy_open, bool, err);
-
+ GF_OPTION_INIT ("read-after-open", conf->read_after_open, bool, err);
this->private = conf;
return 0;
@@ -903,6 +977,9 @@ struct xlator_fops fops = {
.fentrylk = ob_fentrylk,
.fxattrop = ob_fxattrop,
.fsetattr = ob_fsetattr,
+ .fallocate = ob_fallocate,
+ .discard = ob_discard,
+ .zerofill = ob_zerofill,
.unlink = ob_unlink,
.rename = ob_rename,
.lk = ob_lk,
@@ -922,15 +999,22 @@ struct volume_options options[] = {
{ .key = {"use-anonymous-fd"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "yes",
- .description = "For read operations, use anonymous FD when "
- "original FD is open-behind and not yet opened in the backend.",
+ .description = "For read operations, use anonymous FD when "
+ "original FD is open-behind and not yet opened in the backend.",
},
{ .key = {"lazy-open"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "yes",
- .description = "Perform open in the backend only when a necessary "
- "FOP arrives (e.g writev on the FD, unlink of the file). When option "
- "is disabled, perform backend open right after unwinding open().",
+ .description = "Perform open in the backend only when a necessary "
+ "FOP arrives (e.g writev on the FD, unlink of the file). When option "
+ "is disabled, perform backend open right after unwinding open().",
+ },
+ { .key = {"read-after-open"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .description = "read is sent only after actual open happens and real "
+ "fd is obtained, instead of doing on anonymous fd (similar to write)",
},
+ { .key = {NULL} }
};
diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c
index 1a0f8675e..b8b4c5326 100644
--- a/xlators/performance/quick-read/src/quick-read.c
+++ b/xlators/performance/quick-read/src/quick-read.c
@@ -101,6 +101,7 @@ qr_inode_ctx_get_or_new (xlator_t *this, inode_t *inode)
if (ret) {
__qr_inode_prune (&priv->table, qr_inode);
GF_FREE (qr_inode);
+ qr_inode = NULL;
}
}
unlock:
@@ -417,10 +418,11 @@ qr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (content) {
/* new content came along, always replace old content */
qr_inode = qr_inode_ctx_get_or_new (this, inode);
- if (!qr_inode)
+ if (!qr_inode) {
/* no harm done */
+ GF_FREE (content);
goto out;
-
+ }
qr_content_update (this, qr_inode, content, buf);
} else {
/* purge old content if necessary */
@@ -565,7 +567,6 @@ qr_readv_cached (call_frame_t *frame, qr_inode_t *qr_inode, size_t size,
iobref = iobref_new ();
if (!iobref) {
op_ret = -1;
- iobuf_unref (iobuf);
goto unlock;
}
@@ -622,6 +623,46 @@ wind:
int
+qr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+ int count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ /* prune this xlator's state for the inode before the write goes down */
+ qr_inode_prune (this, fd->inode);
+
+ /* pass the write through unchanged; the default callback unwinds it */
+ STACK_WIND (frame, default_writev_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
+ fd, iov, count, offset, flags, iobref, xdata);
+ return 0;
+}
+
+
+int
+qr_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ /* prune this xlator's state for the inode before the truncate goes down */
+ qr_inode_prune (this, loc->inode);
+
+ /* pass the truncate through unchanged; the default callback unwinds it */
+ STACK_WIND (frame, default_truncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+
+int
+qr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ /* prune this xlator's state for the inode before the ftruncate goes down */
+ qr_inode_prune (this, fd->inode);
+
+ /* pass the ftruncate through unchanged; the default callback unwinds it */
+ STACK_WIND (frame, default_ftruncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+
+int
qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
fd_t *fd, dict_t *xdata)
{
@@ -813,7 +854,7 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("cache-timeout", conf->cache_timeout, options, int32,
out);
- GF_OPTION_RECONF ("cache-size", cache_size_new, options, size, out);
+ GF_OPTION_RECONF ("cache-size", cache_size_new, options, size_uint64, out);
if (!check_cache_size_ok (this, cache_size_new)) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR,
@@ -954,11 +995,11 @@ init (xlator_t *this)
LOCK_INIT (&priv->table.lock);
conf = &priv->conf;
- GF_OPTION_INIT ("max-file-size", conf->max_file_size, size, out);
+ GF_OPTION_INIT ("max-file-size", conf->max_file_size, size_uint64, out);
GF_OPTION_INIT ("cache-timeout", conf->cache_timeout, int32, out);
- GF_OPTION_INIT ("cache-size", conf->cache_size, size, out);
+ GF_OPTION_INIT ("cache-size", conf->cache_size, size_uint64, out);
if (!check_cache_size_ok (this, conf->cache_size)) {
ret = -1;
goto out;
@@ -1066,6 +1107,9 @@ struct xlator_fops fops = {
.readdirp = qr_readdirp,
.open = qr_open,
.readv = qr_readv,
+ .writev = qr_writev,
+ .truncate = qr_truncate,
+ .ftruncate = qr_ftruncate
};
struct xlator_cbks cbks = {
@@ -1100,4 +1144,5 @@ struct volume_options options[] = {
.max = 1 * GF_UNIT_KB * 1000,
.default_value = "64KB",
},
+ { .key = {NULL} }
};
diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c
index e79e7ae78..6e5b52c5e 100644
--- a/xlators/performance/read-ahead/src/page.c
+++ b/xlators/performance/read-ahead/src/page.c
@@ -421,7 +421,12 @@ ra_frame_unwind (call_frame_t *frame)
fill->count * sizeof (*vector));
copied += (fill->count * sizeof (*vector));
- iobref_merge (iobref, fill->iobref);
+ if (iobref_merge (iobref, fill->iobref)) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ iobref_unref (iobref);
+ iobref = NULL;
+ }
}
fill->next->prev = fill->prev;
diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c
index 522fa52b8..01c861d52 100644
--- a/xlators/performance/read-ahead/src/read-ahead.c
+++ b/xlators/performance/read-ahead/src/read-ahead.c
@@ -942,6 +942,106 @@ unwind:
return 0;
}
+/*
+ * Callback for ra_discard: asserts that a frame was supplied and unwinds the
+ * discard result (op_ret, op_errno, prebuf, postbuf, xdata) unchanged.
+ */
+int
+ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame);
+
+ STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+/*
+ * discard entry point. Before winding the call down, walk every fd open on
+ * the inode (under inode->lock) and call flush_region() for the discarded
+ * range on each fd that carries a context for this xlator.
+ * If 'this' or 'fd' is NULL the call is unwound with -1/EINVAL instead.
+ */
+static int
+ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ uint64_t tmp_file = 0;
+ int32_t op_errno = EINVAL;
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
+ GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ /*
+ * NOTE(review): the return value is ignored and tmp_file is
+ * not reset per iteration; if fd_ctx_get() leaves it untouched
+ * on failure, the previous fd's file would be flushed again --
+ * confirm.
+ */
+ fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+ if (!file)
+ continue;
+
+ flush_region(frame, file, offset, len, 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_discard_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+/*
+ * Callback for ra_zerofill: asserts that a frame was supplied and unwinds the
+ * zerofill result (op_ret, op_errno, prebuf, postbuf, xdata) unchanged.
+ */
+int
+ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame);
+
+ STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+/*
+ * zerofill entry point. Before winding the call down, walk every fd open on
+ * the inode (under inode->lock) and call flush_region() for the zero-filled
+ * range on each fd that carries a context for this xlator.
+ * If 'this' or 'fd' is NULL the call is unwound with -1/EINVAL instead.
+ */
+static int
+ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ uint64_t tmp_file = 0;
+ int32_t op_errno = EINVAL;
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
+ GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ /*
+ * NOTE(review): the return value is ignored and tmp_file is
+ * not reset per iteration; if fd_ctx_get() leaves it untouched
+ * on failure, the previous fd's file would be flushed again --
+ * confirm.
+ */
+ fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+ if (!file)
+ continue;
+
+ flush_region(frame, file, offset, len, 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_zerofill_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->zerofill, fd,
+ offset, len, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
int
ra_priv_dump (xlator_t *this)
@@ -1024,7 +1124,8 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("page-count", conf->page_count, options, uint32, out);
- GF_OPTION_RECONF ("page-size", conf->page_size, options, size, out);
+ GF_OPTION_RECONF ("page-size", conf->page_size, options, size_uint64,
+ out);
ret = 0;
out:
@@ -1058,7 +1159,7 @@ init (xlator_t *this)
conf->page_size = this->ctx->page_size;
- GF_OPTION_INIT ("page-size", conf->page_size, size, out);
+ GF_OPTION_INIT ("page-size", conf->page_size, size_uint64, out);
GF_OPTION_INIT ("page-count", conf->page_count, uint32, out);
@@ -1123,6 +1224,8 @@ struct xlator_fops fops = {
.truncate = ra_truncate,
.ftruncate = ra_ftruncate,
.fstat = ra_fstat,
+ .discard = ra_discard,
+ .zerofill = ra_zerofill,
};
struct xlator_cbks cbks = {
diff --git a/xlators/performance/readdir-ahead/Makefile.am b/xlators/performance/readdir-ahead/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/performance/readdir-ahead/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am
new file mode 100644
index 000000000..539d6ede4
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/Makefile.am
@@ -0,0 +1,15 @@
+xlator_LTLIBRARIES = readdir-ahead.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+readdir_ahead_la_LDFLAGS = -module -avoid-version
+
+readdir_ahead_la_SOURCES = readdir-ahead.c
+readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
new file mode 100644
index 000000000..39e2c5369
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __RDA_MEM_TYPES_H__
+#define __RDA_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+/*
+ * Memory-accounting type ids for readdir-ahead, numbered after the common
+ * ids. gf_rda_mt_end is the terminator; mem_acct_init() passes
+ * gf_rda_mt_end + 1 to xlator_mem_acct_init().
+ */
+enum gf_rda_mem_types_ {
+ gf_rda_mt_rda_local = gf_common_mt_end + 1,
+ gf_rda_mt_rda_fd_ctx, /* struct rda_fd_ctx */
+ gf_rda_mt_rda_priv, /* struct rda_priv */
+ gf_rda_mt_end
+};
+
+#endif
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c
new file mode 100644
index 000000000..ba96bfcd3
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c
@@ -0,0 +1,560 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/*
+ * performance/readdir-ahead preloads a local buffer with directory entries
+ * on opendir. The optimization involves using maximum sized gluster rpc
+ * requests (128k) to minimize overhead of smaller client requests.
+ *
+ * For example, fuse currently supports a maximum readdir buffer of 4k
+ * (regardless of the filesystem client's buffer size). readdir-ahead should
+ * effectively convert these smaller requests into fewer, larger sized requests
+ * for simple, sequential workloads (i.e., ls).
+ *
+ * The translator is currently designed to handle the simple, sequential case
+ * only. If a non-sequential directory read occurs, readdir-ahead disables
+ * preloads on the directory.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "call-stub.h"
+#include "readdir-ahead.h"
+#include "readdir-ahead-mem-types.h"
+#include "defaults.h"
+
+static int rda_fill_fd(call_frame_t *, xlator_t *, fd_t *);
+
+/*
+ * Look up this xlator's context on the fd, creating and attaching a fresh
+ * one (state RDA_FD_NEW, empty entry list, zeroed offsets) when the fd has
+ * none yet. The lookup and the attach both happen under fd->lock.
+ *
+ * Returns the context, or NULL when allocation or attaching fails.
+ */
+static struct rda_fd_ctx *
+get_rda_fd_ctx(fd_t *fd, xlator_t *this)
+{
+        uint64_t           stored = 0;
+        struct rda_fd_ctx *fd_ctx = NULL;
+
+        LOCK(&fd->lock);
+
+        if (__fd_ctx_get(fd, this, &stored) >= 0) {
+                /* already attached */
+                fd_ctx = (struct rda_fd_ctx *) stored;
+                goto unlock;
+        }
+
+        fd_ctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx),
+                           gf_rda_mt_rda_fd_ctx);
+        if (!fd_ctx)
+                goto unlock;
+
+        /* the zeroed allocation already leaves every offset at 0 */
+        LOCK_INIT(&fd_ctx->lock);
+        INIT_LIST_HEAD(&fd_ctx->entries.list);
+        fd_ctx->state = RDA_FD_NEW;
+
+        if (__fd_ctx_set(fd, this, (uint64_t) fd_ctx) < 0) {
+                GF_FREE(fd_ctx);
+                fd_ctx = NULL;
+        }
+
+unlock:
+        UNLOCK(&fd->lock);
+        return fd_ctx;
+}
+
+/*
+ * Return the context to its just-created tracking state: free every
+ * preloaded entry, zero the head/tail offsets and the preload size, and mark
+ * the context RDA_FD_NEW.
+ */
+static void
+rda_reset_ctx(struct rda_fd_ctx *ctx)
+{
+        gf_dirent_free(&ctx->entries);
+
+        ctx->next_offset = 0;
+        ctx->cur_size = 0;
+        ctx->cur_offset = 0;
+        ctx->state = RDA_FD_NEW;
+}
+
+/*
+ * Decide whether a readdirp request can be answered from the preload right
+ * now. The caller has already verified the offset. A request is servable
+ * when the preload has finished (end of directory or error), or when it is
+ * not plugged and holds at least some data. request_size is not consulted.
+ */
+static gf_boolean_t
+rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size)
+{
+        /* completion status, error included, can always be reported */
+        if (ctx->state & (RDA_FD_EOD | RDA_FD_ERROR))
+                return _gf_true;
+
+        if (ctx->state & RDA_FD_PLUGGED)
+                return _gf_false;
+
+        return (ctx->cur_size > 0) ? _gf_true : _gf_false;
+}
+
+/*
+ * Move preloaded entries, in order, from the head of ctx->entries onto
+ * 'entries', stopping before the first entry that would push the running
+ * total past request_size. ctx->cur_size and ctx->cur_offset are updated as
+ * entries are handed out, and the context is plugged once the preload has
+ * drained to the low watermark or below. The caller must hold ctx->lock.
+ *
+ * Returns the number of entries moved.
+ */
+static int32_t
+__rda_serve_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size,
+                     struct rda_fd_ctx *ctx)
+{
+        struct rda_priv *priv = this->private;
+        gf_dirent_t     *cur = NULL;
+        gf_dirent_t     *next = NULL;
+        size_t           served = 0;
+        int32_t          nr_entries = 0;
+
+        list_for_each_entry_safe(cur, next, &ctx->entries.list, list) {
+                size_t entry_size = gf_dirent_size(cur->d_name);
+
+                if (served + entry_size > request_size)
+                        break;
+
+                /* unlink from the preload and append to the reply */
+                list_del_init(&cur->list);
+                list_add_tail(&cur->list, &entries->list);
+
+                served += entry_size;
+                ctx->cur_size -= entry_size;
+                ctx->cur_offset = cur->d_off;
+                nr_entries++;
+        }
+
+        if (ctx->cur_size <= priv->rda_low_wmark)
+                ctx->state |= RDA_FD_PLUGGED;
+
+        return nr_entries;
+}
+
+/*
+ * Resume handler for a stubbed readdirp: hand the caller whatever preloaded
+ * entries fit in 'size' and unwind.
+ *
+ * Both call sites in this file invoke call_resume() on the stub while holding
+ * ctx->lock, which is what __rda_serve_readdirp() requires.
+ *
+ * If nothing could be served and an error is pending on the context, the
+ * request fails with that error, the error flag is cleared and the context
+ * switches to bypass mode. If the fd context cannot be obtained the request
+ * fails with ENOMEM.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+static int32_t
+rda_readdirp_stub(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+                  off_t offset, dict_t *xdata)
+{
+        gf_dirent_t entries;
+        int32_t ret;
+        struct rda_fd_ctx *ctx;
+        int op_errno = 0;
+
+        ctx = get_rda_fd_ctx(fd, this);
+        if (!ctx) {
+                /*
+                 * get_rda_fd_ctx() returns NULL on allocation/attach failure;
+                 * never dereference it. Fail as rda_readdirp() does.
+                 */
+                STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+                return 0;
+        }
+
+        INIT_LIST_HEAD(&entries.list);
+        ret = __rda_serve_readdirp(this, &entries, size, ctx);
+
+        if (!ret && (ctx->state & RDA_FD_ERROR)) {
+                ret = -1;
+                op_errno = ctx->op_errno;
+                ctx->state &= ~RDA_FD_ERROR;
+
+                /*
+                 * the preload has stopped running in the event of an error, so
+                 * pass all future requests along
+                 */
+                ctx->state |= RDA_FD_BYPASS;
+        }
+
+        STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata);
+        gf_dirent_free(&entries);
+
+        return 0;
+}
+
+/*
+ * readdirp entry point. Serves the request from the preload when possible,
+ * parks it as a stub on the context until the preload can satisfy it, or
+ * winds it straight to the child when the context is in bypass mode.
+ * Unwinds with -1/ENOMEM when the fd context or the stub cannot be obtained.
+ * Always returns 0.
+ */
+static int32_t
+rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
+{
+ struct rda_fd_ctx *ctx;
+ call_stub_t *stub;
+ int fill = 0;
+
+ ctx = get_rda_fd_ctx(fd, this);
+ if (!ctx)
+ goto err;
+
+ /* unlocked fast-path check; repeated below under the lock */
+ if (ctx->state & RDA_FD_BYPASS)
+ goto bypass;
+
+ LOCK(&ctx->lock);
+
+ /* recheck now that we have the lock */
+ if (ctx->state & RDA_FD_BYPASS) {
+ UNLOCK(&ctx->lock);
+ goto bypass;
+ }
+
+ /*
+ * If a new read comes in at offset 0 and the buffer has been
+ * completed, reset the context and kickstart the filler again.
+ */
+ if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) {
+ rda_reset_ctx(ctx);
+ fill = 1;
+ }
+
+ /*
+ * If a readdir occurs at an unexpected offset or we already have a
+ * request pending, admit defeat and just get out of the way.
+ */
+ if (off != ctx->cur_offset || ctx->stub) {
+ ctx->state |= RDA_FD_BYPASS;
+ UNLOCK(&ctx->lock);
+ goto bypass;
+ }
+
+ stub = fop_readdirp_stub(frame, rda_readdirp_stub, fd, size, off, xdata);
+ if (!stub) {
+ UNLOCK(&ctx->lock);
+ goto err;
+ }
+
+ /*
+ * If we haven't bypassed the preload, this means we can either serve
+ * the request out of the preload or the request that enables us to do
+ * so is in flight...
+ */
+ if (rda_can_serve_readdirp(ctx, size))
+ call_resume(stub); /* runs rda_readdirp_stub with ctx->lock held */
+ else
+ ctx->stub = stub; /* rda_fill_fd_cbk resumes it later */
+
+ UNLOCK(&ctx->lock);
+
+ /* restart the preload only after dropping the lock */
+ if (fill)
+ rda_fill_fd(frame, this, fd);
+
+ return 0;
+
+bypass:
+ STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+/*
+ * Callback for the preload readdirp issued by rda_fill_fd(). Appends the
+ * returned entries to the context, updates the plug/EOD/error state, resumes
+ * a pending stub if it can now be served, and either issues the next preload
+ * request or destroys the fill frame once the preload is no longer running.
+ * Always returns 0.
+ */
+static int32_t
+rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ gf_dirent_t *dirent, *tmp;
+ struct rda_local *local = frame->local;
+ struct rda_fd_ctx *ctx = local->ctx;
+ struct rda_priv *priv = this->private;
+ int fill = 1;
+
+ LOCK(&ctx->lock);
+
+ /* Verify that the preload buffer is still pending on this data. */
+ if (ctx->next_offset != local->offset) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Out of sequence directory preload.");
+ ctx->state |= (RDA_FD_BYPASS|RDA_FD_ERROR);
+ ctx->op_errno = EUCLEAN;
+
+ goto out;
+ }
+
+ if (entries) {
+ list_for_each_entry_safe(dirent, tmp, &entries->list, list) {
+ list_del_init(&dirent->list);
+ /* must preserve entry order */
+ list_add_tail(&dirent->list, &ctx->entries.list);
+
+ ctx->cur_size += gf_dirent_size(dirent->d_name);
+ ctx->next_offset = dirent->d_off;
+ }
+ }
+
+ /* enough data buffered: allow readers to be served again */
+ if (ctx->cur_size >= priv->rda_high_wmark)
+ ctx->state &= ~RDA_FD_PLUGGED;
+
+ if (!op_ret) {
+ /* we've hit eod */
+ ctx->state &= ~RDA_FD_RUNNING;
+ ctx->state |= RDA_FD_EOD;
+ } else if (op_ret == -1) {
+ /* kill the preload and pend the error */
+ ctx->state &= ~RDA_FD_RUNNING;
+ ctx->state |= RDA_FD_ERROR;
+ ctx->op_errno = op_errno;
+ }
+
+ /*
+ * NOTE: The strict bypass logic in readdirp() means a pending request
+ * is always based on ctx->cur_offset.
+ */
+ if (ctx->stub &&
+ rda_can_serve_readdirp(ctx, ctx->stub->args.size)) {
+ call_resume(ctx->stub);
+ ctx->stub = NULL;
+ }
+
+out:
+ /*
+ * If we have been marked for bypass and have no pending stub, clear the
+ * run state so we stop preloading the context with entries.
+ */
+ if ((ctx->state & RDA_FD_BYPASS) && !ctx->stub)
+ ctx->state &= ~RDA_FD_RUNNING;
+
+ /* preload finished or abandoned: the fill frame is no longer needed */
+ if (!(ctx->state & RDA_FD_RUNNING)) {
+ fill = 0;
+ STACK_DESTROY(ctx->fill_frame->root);
+ ctx->fill_frame = NULL;
+ }
+
+ UNLOCK(&ctx->lock);
+
+ /* still running: request the next batch, outside the lock */
+ if (fill)
+ rda_fill_fd(frame, this, local->fd);
+
+ return 0;
+}
+
+/*
+ * Issue one preload readdirp for the fd, starting at the context's tail
+ * offset (ctx->next_offset) and asking for priv->rda_req_size bytes.
+ *
+ * On the first call for a context a dedicated fill frame is created with
+ * copy_frame() and cached in ctx->fill_frame; later calls reuse it. A context
+ * in state RDA_FD_NEW is switched to RDA_FD_RUNNING, and starts out plugged
+ * when a low watermark is configured.
+ *
+ * Returns 0 once the request has been wound (rda_fill_fd_cbk handles the
+ * reply), or -1 if the fd context, the frame or its local state could not be
+ * obtained.
+ */
+static int
+rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+        call_frame_t *nframe = NULL;
+        struct rda_local *local = NULL;
+        struct rda_fd_ctx *ctx;
+        off_t offset;
+        struct rda_priv *priv = this->private;
+
+        ctx = get_rda_fd_ctx(fd, this);
+        if (!ctx)
+                goto err;
+
+        LOCK(&ctx->lock);
+
+        if (ctx->state & RDA_FD_NEW) {
+                ctx->state &= ~RDA_FD_NEW;
+                ctx->state |= RDA_FD_RUNNING;
+                if (priv->rda_low_wmark)
+                        ctx->state |= RDA_FD_PLUGGED;
+        }
+
+        offset = ctx->next_offset;
+
+        if (!ctx->fill_frame) {
+                nframe = copy_frame(frame);
+                if (!nframe) {
+                        UNLOCK(&ctx->lock);
+                        goto err;
+                }
+
+                local = mem_get0(this->local_pool);
+                if (!local) {
+                        UNLOCK(&ctx->lock);
+                        goto err;
+                }
+
+                local->ctx = ctx;
+                local->fd = fd;
+                nframe->local = local;
+
+                ctx->fill_frame = nframe;
+        } else {
+                nframe = ctx->fill_frame;
+                local = nframe->local;
+        }
+
+        /* remembered so the callback can detect an out-of-sequence reply */
+        local->offset = offset;
+
+        UNLOCK(&ctx->lock);
+
+        STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size,
+                   offset, NULL);
+
+        return 0;
+
+err:
+        /*
+         * Only a frame freshly created by copy_frame() can reach this point.
+         * Tear it down through its root, the same way the cached fill frame
+         * is destroyed elsewhere in this file.
+         */
+        if (nframe)
+                STACK_DESTROY(nframe->root);
+
+        return -1;
+}
+
+/*
+ * opendir callback: when the open succeeded (op_ret is 0), start preloading
+ * directory entries for the new fd, then unwind the opendir result unchanged.
+ * The result of rda_fill_fd() is not checked.
+ */
+static int32_t
+rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+        if (op_ret == 0) {
+                rda_fill_fd(frame, this, fd);
+        }
+
+        STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata);
+        return 0;
+}
+
+/*
+ * opendir entry point: winds the call to the child unchanged, with
+ * rda_opendir_cbk as the callback.
+ */
+static int32_t
+rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+ return 0;
+}
+
+/*
+ * releasedir callback: detach this xlator's context from the fd and free it,
+ * along with any preloaded entries and the cached fill frame. A stub still
+ * pending on the context is only reported in the log.
+ *
+ * Returns -1 when the context cannot be removed from the fd, 0 otherwise.
+ */
+static int32_t
+rda_releasedir(xlator_t *this, fd_t *fd)
+{
+        uint64_t           stored = 0;
+        struct rda_fd_ctx *fd_ctx = NULL;
+
+        if (fd_ctx_del(fd, this, &stored) < 0)
+                return -1;
+
+        fd_ctx = (struct rda_fd_ctx *) stored;
+        if (fd_ctx == NULL)
+                return 0;
+
+        /* drops the preloaded entries and resets the offsets */
+        rda_reset_ctx(fd_ctx);
+
+        if (fd_ctx->fill_frame != NULL)
+                STACK_DESTROY(fd_ctx->fill_frame->root);
+
+        if (fd_ctx->stub != NULL)
+                gf_log(this->name, GF_LOG_ERROR,
+                       "released a directory with a pending stub");
+
+        GF_FREE(fd_ctx);
+        return 0;
+}
+
+/*
+ * Set up memory accounting for this xlator with room for all readdir-ahead
+ * type ids (gf_rda_mt_end + 1).
+ *
+ * Returns the result of xlator_mem_acct_init(), or -1 when 'this' is NULL.
+ * A failure is logged.
+ */
+int32_t
+mem_acct_init(xlator_t *this)
+{
+        int ret = -1;
+
+        if (!this)
+                goto out;
+
+        ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1);
+
+        /* one literal: the split form used to log "...initfailed" */
+        if (ret != 0)
+                gf_log(this->name, GF_LOG_ERROR,
+                       "Memory accounting init failed");
+
+out:
+        return ret;
+}
+
+/*
+ * Re-read the three tunables (request size, low and high watermark) from
+ * 'options' into the private configuration.
+ * Returns 0 on success; each GF_OPTION_RECONF is given the 'err' label, which
+ * returns -1.
+ */
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ struct rda_priv *priv = this->private;
+
+ GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options,
+ uint32, err);
+ GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size_uint64,
+ err);
+ GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options, size_uint64,
+ err);
+
+ return 0;
+err:
+ return -1;
+}
+
+/*
+ * Translator initialisation: verify there is exactly one child, allocate the
+ * private configuration and the pool for per-frame local state, and load the
+ * three tunables.
+ *
+ * Returns 0 on success and -1 on failure. On failure everything allocated
+ * here is released and the xlator's pointers to it are cleared.
+ */
+int
+init(xlator_t *this)
+{
+        struct rda_priv *priv = NULL;
+
+        GF_VALIDATE_OR_GOTO("readdir-ahead", this, err);
+
+        if (!this->children || this->children->next) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "FATAL: readdir-ahead not configured with exactly one"
+                       " child");
+                goto err;
+        }
+
+        if (!this->parents) {
+                gf_log(this->name, GF_LOG_WARNING,
+                       "dangling volume. check volfile ");
+        }
+
+        priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv);
+        if (!priv)
+                goto err;
+        this->private = priv;
+
+        this->local_pool = mem_pool_new(struct rda_local, 32);
+        if (!this->local_pool)
+                goto err;
+
+        GF_OPTION_INIT("rda-request-size", priv->rda_req_size, uint32, err);
+        GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size_uint64, err);
+        GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size_uint64, err);
+
+        return 0;
+
+err:
+        /* the validation at the top jumps here with this == NULL */
+        if (this && this->local_pool) {
+                mem_pool_destroy(this->local_pool);
+                this->local_pool = NULL;
+        }
+        if (priv) {
+                /* priv is only non-NULL after 'this' has been validated */
+                GF_FREE(priv);
+                this->private = NULL;
+        }
+
+        return -1;
+}
+
+
+/*
+ * Translator teardown: frees the private configuration.
+ * NOTE(review): this->local_pool, created in init(), is not destroyed here --
+ * confirm whether the framework releases it.
+ */
+void
+fini(xlator_t *this)
+{
+ GF_VALIDATE_OR_GOTO ("readdir-ahead", this, out);
+
+ GF_FREE(this->private);
+
+out:
+ return;
+}
+
+/* File operations implemented by readdir-ahead. */
+struct xlator_fops fops = {
+ .opendir = rda_opendir,
+ .readdirp = rda_readdirp,
+};
+
+/* Callbacks: the fd context is freed when a directory fd is released. */
+struct xlator_cbks cbks = {
+ .releasedir = rda_releasedir,
+};
+
+/*
+ * Tunables read by init() and reconfigure(). The table ends with a
+ * NULL-key entry.
+ */
+struct volume_options options[] = {
+ /* size passed to the child's readdirp for each preload request */
+ { .key = {"rda-request-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 4096,
+ .max = 131072,
+ .default_value = "131072",
+ .description = "readdir-ahead request size",
+ },
+ /* preload size at or below which the context is plugged */
+ { .key = {"rda-low-wmark"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 0,
+ .max = 10 * GF_UNIT_MB,
+ .default_value = "4096",
+ .description = "the value under which we plug",
+ },
+ /* preload size at or above which the context is unplugged */
+ { .key = {"rda-high-wmark"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 0,
+ .max = 100 * GF_UNIT_MB,
+ .default_value = "131072",
+ .description = "the value over which we unplug",
+ },
+ { .key = {NULL} },
+};
+
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h
new file mode 100644
index 000000000..e48786dae
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __READDIR_AHEAD_H
+#define __READDIR_AHEAD_H
+
+/* state flags */
+#define RDA_FD_NEW (1 << 0)
+#define RDA_FD_RUNNING (1 << 1)
+#define RDA_FD_EOD (1 << 2)
+#define RDA_FD_ERROR (1 << 3)
+#define RDA_FD_BYPASS (1 << 4)
+#define RDA_FD_PLUGGED (1 << 5)
+
+/* Per-fd preload state, attached to the fd as this xlator's context. */
+struct rda_fd_ctx {
+ off_t cur_offset; /* current head of the ctx */
+ size_t cur_size; /* current size of the preload */
+ off_t next_offset; /* tail of the ctx */
+ uint32_t state; /* bitmask of the RDA_FD_* state flags */
+ gf_lock_t lock; /* taken around updates to this context */
+ gf_dirent_t entries; /* list of preloaded directory entries */
+ call_frame_t *fill_frame; /* frame reused for preload requests */
+ call_stub_t *stub; /* readdirp waiting for the preload, if any */
+ int op_errno; /* error pended for the next reader */
+};
+
+/* Frame-local state of the preload (fill) frame. */
+struct rda_local {
+ struct rda_fd_ctx *ctx; /* context being filled */
+ fd_t *fd; /* directory fd being read; stored without taking a ref */
+ off_t offset; /* offset the in-flight preload request was issued at */
+};
+
+/* Translator-wide configuration, loaded from the rda-* volume options. */
+struct rda_priv {
+ uint32_t rda_req_size; /* size of each preload readdirp request */
+ uint64_t rda_low_wmark; /* plug when the preload is at or below this */
+ uint64_t rda_high_wmark; /* unplug when the preload is at or above this */
+};
+
+#endif /* __READDIR_AHEAD_H */
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index ebb8410a0..3cb0d449b 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -108,6 +108,7 @@ typedef struct wb_inode {
after it arrived (i.e, those that have a
liability generation higher than itself)
*/
+ size_t size; /* Size of the file to catch write after EOF. */
gf_lock_t lock;
xlator_t *this;
} wb_inode_t;
@@ -213,11 +214,7 @@ wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno)
int32_t tmp = 0;
if (fd_ctx_get (fd, this, &value) == 0) {
- if (value != EBADF) {
- fd_ctx_set (fd, this, EBADF);
- }
-
- if (op_errno != NULL) {
+ if (op_errno) {
tmp = value;
*op_errno = tmp;
}
@@ -507,8 +504,23 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
switch (stub->fop) {
case GF_FOP_WRITE:
- req->ordering.off = stub->args.offset;
- req->ordering.size = req->write_size;
+ LOCK (&wb_inode->lock);
+ {
+ if (wb_inode->size < stub->args.offset) {
+ req->ordering.off = wb_inode->size;
+ req->ordering.size = stub->args.offset
+ + req->write_size
+ - wb_inode->size;
+ } else {
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = req->write_size;
+ }
+
+ if (wb_inode->size < stub->args.offset + req->write_size)
+ wb_inode->size = stub->args.offset
+ + req->write_size;
+ }
+ UNLOCK (&wb_inode->lock);
req->fd = fd_ref (stub->args.fd);
@@ -523,10 +535,20 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
case GF_FOP_TRUNCATE:
req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK (&wb_inode->lock);
break;
case GF_FOP_FTRUNCATE:
req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK (&wb_inode->lock);
req->fd = fd_ref (stub->args.fd);
@@ -749,7 +771,7 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
} while (0)
-void
+int
wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
{
struct iovec vector[MAX_VECTOR_COUNT];
@@ -770,8 +792,9 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
list_for_each_entry (req, &head->winds, winds) {
WB_IOV_LOAD (vector, count, req, head);
- iobref_merge (head->stub->args.iobref,
- req->stub->args.iobref);
+ if (iobref_merge (head->stub->args.iobref,
+ req->stub->args.iobref))
+ goto err;
}
if (wb_fd_err (head->fd, this, NULL)) {
@@ -799,22 +822,23 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
head->stub->args.flags,
head->stub->args.iobref, NULL);
- return;
+ return 0;
err:
if (!fderr) {
/* frame creation failure */
- wb_fulfill_err (head, ENOMEM);
+ fderr = ENOMEM;
+ wb_fulfill_err (head, fderr);
}
wb_head_done (head);
- return;
+ return fderr;
}
#define NEXT_HEAD(head, req) do { \
if (head) \
- wb_fulfill_head (wb_inode, head); \
+ ret |= wb_fulfill_head (wb_inode, head); \
head = req; \
expected_offset = req->stub->args.offset + \
req->write_size; \
@@ -823,7 +847,7 @@ err:
} while (0)
-void
+int
wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
{
wb_request_t *req = NULL;
@@ -833,6 +857,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
off_t expected_offset = 0;
size_t curr_aggregate = 0;
size_t vector_count = 0;
+ int ret = 0;
conf = wb_inode->this->private;
@@ -876,8 +901,9 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
}
if (head)
- wb_fulfill_head (wb_inode, head);
- return;
+ ret |= wb_fulfill_head (wb_inode, head);
+
+ return ret;
}
@@ -967,11 +993,11 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
ret = iobref_add (iobref, iobuf);
if (ret != 0) {
- iobuf_unref (iobuf);
- iobref_unref (iobref);
gf_log (req->wb_inode->this->name, GF_LOG_WARNING,
"cannot add iobuf (%p) into iobref (%p)",
iobuf, iobref);
+ iobuf_unref (iobuf);
+ iobref_unref (iobref);
goto out;
}
@@ -1162,32 +1188,54 @@ wb_process_queue (wb_inode_t *wb_inode)
list_head_t tasks = {0, };
list_head_t lies = {0, };
list_head_t liabilities = {0, };
+ int retry = 0;
INIT_LIST_HEAD (&tasks);
INIT_LIST_HEAD (&lies);
INIT_LIST_HEAD (&liabilities);
- LOCK (&wb_inode->lock);
- {
- __wb_preprocess_winds (wb_inode);
+ do {
+ LOCK (&wb_inode->lock);
+ {
+ __wb_preprocess_winds (wb_inode);
- __wb_pick_winds (wb_inode, &tasks, &liabilities);
+ __wb_pick_winds (wb_inode, &tasks, &liabilities);
- __wb_pick_unwinds (wb_inode, &lies);
+ __wb_pick_unwinds (wb_inode, &lies);
- }
- UNLOCK (&wb_inode->lock);
+ }
+ UNLOCK (&wb_inode->lock);
- wb_do_unwinds (wb_inode, &lies);
+ wb_do_unwinds (wb_inode, &lies);
- wb_do_winds (wb_inode, &tasks);
+ wb_do_winds (wb_inode, &tasks);
- wb_fulfill (wb_inode, &liabilities);
+ /* The fd might have been marked bad due to previous errors.
+ * Since the caller of wb_process_queue might be the last fop on
+ * the inode, keep processing the request queue until there
+ * are no requests left.
+ */
+ retry = wb_fulfill (wb_inode, &liabilities);
+ } while (retry);
return;
}
+/*
+ * wb_set_inode_size - store a newly reported file size in the write-behind
+ * inode context.
+ *
+ * @wb_inode: write-behind inode context to update (asserted non-NULL).
+ * @postbuf: attributes to take the size from (asserted non-NULL); only
+ * ia_size is read.
+ *
+ * The assignment to wb_inode->size is made while holding wb_inode->lock.
+ */
+void
+wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf)
+{
+ GF_ASSERT (wb_inode);
+ GF_ASSERT (postbuf);
+
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = postbuf->ia_size;
+ }
+ UNLOCK (&wb_inode->lock);
+}
+
+
int
wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -1254,8 +1302,7 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
if (fd->flags & (O_SYNC|O_DSYNC|o_direct))
wb_disabled = 1;
- if (flags & (O_SYNC|O_DSYNC|O_DIRECT))
- /* O_DIRECT flag in params of writev must _always_ be honored */
+ if (flags & (O_SYNC|O_DSYNC|o_direct))
wb_disabled = 1;
if (wb_disabled)
@@ -1414,7 +1461,7 @@ wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
if (!wb_enqueue (wb_inode, stub))
goto unwind;
- wb_process_queue (wb_inode);
+ wb_process_queue (wb_inode);
return 0;
@@ -1569,11 +1616,29 @@ noqueue:
}
+/*
+ * wb_truncate_cbk - reply handler for the truncate wound by
+ * wb_truncate_helper.
+ *
+ * frame->local is expected to hold the wb_inode that wb_truncate stored
+ * there. When the truncate succeeded (op_ret == 0) the cached size is
+ * refreshed from postbuf. All reply arguments are then passed up
+ * unchanged. Always returns 0.
+ */
+int32_t
+wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame->local);
+
+ if (op_ret == 0)
+ wb_set_inode_size (frame->local, postbuf);
+
+ /* NOTE(review): cleared before unwinding, presumably so frame teardown
+ * does not treat the borrowed wb_inode pointer as owned - confirm. */
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+
int
wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
off_t offset, dict_t *xdata)
{
- STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this),
+ STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
return 0;
}
@@ -1590,6 +1655,8 @@ wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
if (!wb_inode)
goto unwind;
+ frame->local = wb_inode;
+
stub = fop_truncate_stub (frame, wb_truncate_helper, loc,
offset, xdata);
if (!stub)
@@ -1612,11 +1679,29 @@ unwind:
}
+/*
+ * wb_ftruncate_cbk - reply handler for the ftruncate wound by
+ * wb_ftruncate_helper.
+ *
+ * frame->local is expected to hold the wb_inode that wb_ftruncate stored
+ * there. When the ftruncate succeeded (op_ret == 0) the cached size is
+ * refreshed from postbuf. All reply arguments are then passed up
+ * unchanged. Always returns 0.
+ */
+int32_t
+wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame->local);
+
+ if (op_ret == 0)
+ wb_set_inode_size (frame->local, postbuf);
+
+ /* NOTE(review): cleared before unwinding, presumably so frame teardown
+ * does not treat the borrowed wb_inode pointer as owned - confirm. */
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+
int
wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
off_t offset, dict_t *xdata)
{
- STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this),
+ STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
return 0;
}
@@ -1639,6 +1724,8 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
if (wb_fd_err (fd, this, &op_errno))
goto unwind;
+ frame->local = wb_inode;
+
stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd,
offset, xdata);
if (!stub) {
@@ -1656,6 +1743,8 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
return 0;
unwind:
+ frame->local = NULL;
+
STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
if (stub)
@@ -1756,6 +1845,81 @@ noqueue:
}
+/*
+ * wb_create - create fop entry point for write-behind.
+ *
+ * Obtains a write-behind context for fd->inode from wb_inode_create(). If
+ * none can be obtained, the fop is unwound with -1/ENOMEM. When the file is
+ * opened for writing (O_RDWR or O_WRONLY) together with O_TRUNC, the cached
+ * size is reset to 0 before the request is passed on. The request is then
+ * tail-wound to the first child. Always returns 0.
+ */
+int32_t
+wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ wb_inode = wb_inode_create (this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ /* NOTE(review): unlike wb_set_inode_size(), this store is made without
+ * taking wb_inode->lock and before the child has replied - confirm
+ * that is intended. */
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
+
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create, loc, flags, mode,
+ umask, fd, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
+}
+
+
+/*
+ * wb_open - open fop entry point for write-behind.
+ *
+ * Obtains a write-behind context for fd->inode from wb_inode_create(). If
+ * none can be obtained, the fop is unwound with -1/ENOMEM. When the file is
+ * opened for writing (O_RDWR or O_WRONLY) together with O_TRUNC, the cached
+ * size is reset to 0 before the request is passed on. The request is then
+ * tail-wound to the first child. Always returns 0.
+ */
+int32_t
+wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ wb_inode = wb_inode_create (this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ /* NOTE(review): unlike wb_set_inode_size(), this store is made without
+ * taking wb_inode->lock and before the child has replied - confirm
+ * that is intended. */
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
+
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+
+/*
+ * wb_lookup_cbk - reply handler for the lookup wound by wb_lookup.
+ *
+ * On a successful lookup (op_ret == 0), if the inode already has a
+ * write-behind context, its cached size is refreshed from buf. Inodes
+ * without a context are left alone. The reply is passed up unchanged.
+ * Always returns 0.
+ */
+int32_t
+wb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ if (op_ret == 0) {
+ wb_inode_t *wb_inode = wb_inode_ctx_get (this, inode);
+ if (wb_inode)
+ wb_set_inode_size (wb_inode, buf);
+ }
+
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
+
+/*
+ * wb_lookup - lookup fop entry point for write-behind.
+ *
+ * Winds the lookup to the first child unchanged and registers
+ * wb_lookup_cbk as the reply handler. Always returns 0.
+ */
+int32_t
+wb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ STACK_WIND (frame, wb_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
+
+
int
wb_forget (xlator_t *this, inode_t *inode)
{
@@ -1952,7 +2116,7 @@ reconfigure (xlator_t *this, dict_t *options)
conf = this->private;
- GF_OPTION_RECONF ("cache-size", conf->window_size, options, size, out);
+ GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, out);
GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,
out);
@@ -1999,7 +2163,7 @@ init (xlator_t *this)
conf->aggregate_size = WB_AGGREGATE_SIZE;
/* configure 'option window-size <size>' */
- GF_OPTION_INIT ("cache-size", conf->window_size, size, out);
+ GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out);
if (!conf->window_size && conf->aggregate_size) {
gf_log (this->name, GF_LOG_WARNING,
diff --git a/xlators/playground/Makefile.am b/xlators/playground/Makefile.am
new file mode 100644
index 000000000..e7de6b31a
--- /dev/null
+++ b/xlators/playground/Makefile.am
@@ -0,0 +1,2 @@
+SUBDIRS = template
+CLEANFILES =
diff --git a/xlators/playground/template/Makefile.am b/xlators/playground/template/Makefile.am
new file mode 100644
index 000000000..f26892443
--- /dev/null
+++ b/xlators/playground/template/Makefile.am
@@ -0,0 +1,2 @@
+SUBDIRS = src
+
diff --git a/xlators/playground/template/src/Makefile.am b/xlators/playground/template/src/Makefile.am
new file mode 100644
index 000000000..21f1c5f6b
--- /dev/null
+++ b/xlators/playground/template/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = template.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features
+
+template_la_LDFLAGS = -module -avoid-version
+
+template_la_SOURCES = template.c
+template_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = template.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/playground/template/src/template.c b/xlators/playground/template/src/template.c
new file mode 100644
index 000000000..37a7794a0
--- /dev/null
+++ b/xlators/playground/template/src/template.c
@@ -0,0 +1,49 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "template.h"
+
+/*
+ * init - translator initialisation hook for the template xlator.
+ *
+ * Returns -1 (after logging an error) unless the translator has exactly
+ * one child. A translator without parents is only warned about. Returns 0
+ * otherwise. No private state is allocated.
+ */
+int32_t
+init (xlator_t *this)
+{
+
+ /* no child at all, or more than one child */
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "not configured with exactly one child. exiting");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ return 0;
+}
+
+/*
+ * fini - translator teardown hook for the template xlator.
+ *
+ * Nothing to release, because init() allocates nothing.
+ */
+void
+fini (xlator_t *this)
+{
+ return;
+}
+
+/* File operation table: intentionally empty in the template.
+ * NOTE(review): presumably unset entries fall back to the default
+ * pass-through fops - confirm against the xlator loader. */
+struct xlator_fops fops = {
+};
+
+/* Callback table: intentionally empty in the template. */
+struct xlator_cbks cbks = {
+};
+
+/* Option table: contains only the terminating entry with a NULL key. */
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/playground/template/src/template.h b/xlators/playground/template/src/template.h
new file mode 100644
index 000000000..d6aced501
--- /dev/null
+++ b/xlators/playground/template/src/template.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __TEMPLATE_H__
+#define __TEMPLATE_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "defaults.h"
+
+#endif /* __TEMPLATE_H__ */
diff --git a/xlators/protocol/auth/addr/src/addr.c b/xlators/protocol/auth/addr/src/addr.c
index 199fc6db0..181d091bd 100644
--- a/xlators/protocol/auth/addr/src/addr.c
+++ b/xlators/protocol/auth/addr/src/addr.c
@@ -2,19 +2,10 @@
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -190,6 +181,7 @@ gf_auth (dict_t *input_params, dict_t *config_params)
addr_str = strtok_r (NULL, ADDR_DELIMITER, &tmp);
}
GF_FREE (addr_cpy);
+ addr_cpy = NULL;
}
if (allow_addr) {
diff --git a/xlators/protocol/auth/login/src/login.c b/xlators/protocol/auth/login/src/login.c
index 702a876ac..c2f0bf0d0 100644
--- a/xlators/protocol/auth/login/src/login.c
+++ b/xlators/protocol/auth/login/src/login.c
@@ -2,19 +2,10 @@
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c
index ff0e162c6..85b0f757b 100644
--- a/xlators/protocol/client/src/client-handshake.c
+++ b/xlators/protocol/client/src/client-handshake.c
@@ -53,7 +53,7 @@ rpc_client_ping_timer_expired (void *data)
rpc_clnt_connection_t *conn = NULL;
int disconnect = 0;
int transport_activity = 0;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
struct timeval current = {0, };
struct rpc_clnt *clnt = NULL;
xlator_t *this = NULL;
@@ -101,7 +101,7 @@ rpc_client_ping_timer_expired (void *data)
"ping timer expired but transport activity "
"detected - not bailing transport");
timeout.tv_sec = conf->opt.ping_timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
conn->ping_timer =
gf_timer_call_after (this->ctx, timeout,
@@ -140,7 +140,7 @@ client_start_ping (void *data)
clnt_conf_t *conf = NULL;
rpc_clnt_connection_t *conn = NULL;
int32_t ret = -1;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
call_frame_t *frame = NULL;
int frame_count = 0;
@@ -196,7 +196,7 @@ client_start_ping (void *data)
}
timeout.tv_sec = conf->opt.ping_timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
conn->ping_timer =
gf_timer_call_after (this->ctx, timeout,
@@ -241,7 +241,7 @@ client_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
{
xlator_t *this = NULL;
rpc_clnt_connection_t *conn = NULL;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
call_frame_t *frame = NULL;
clnt_conf_t *conf = NULL;
@@ -281,7 +281,7 @@ client_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
timeout.tv_sec = conf->opt.ping_timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
gf_timer_call_cancel (this->ctx,
conn->ping_timer);
@@ -465,17 +465,23 @@ client_set_lk_version (xlator_t *this)
clnt_conf_t *conf = NULL;
call_frame_t *frame = NULL;
gf_set_lk_ver_req req = {0, };
+ char *process_uuid = NULL;
GF_VALIDATE_OR_GOTO ("client", this, err);
conf = (clnt_conf_t *) this->private;
req.lk_ver = client_get_lk_ver (conf);
- ret = gf_asprintf (&req.uid, "%s-%s-%d",
- this->ctx->process_uuid, this->name,
- this->graph->id);
- if (ret == -1)
+ ret = dict_get_str (this->options, "process-uuid", &process_uuid);
+ if (!process_uuid) {
+ ret = -1;
goto err;
+ }
+ req.uid = gf_strdup (process_uuid);
+ if (!req.uid) {
+ ret = -1;
+ goto err;
+ }
frame = create_frame (this, this->ctx->pool);
if (!frame) {
@@ -1452,7 +1458,7 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, void *m
gf_log (this->name, GF_LOG_INFO,
"Connected to %s, attached to remote volume '%s'.",
- conf->rpc->conn.trans->peerinfo.identifier,
+ conf->rpc->conn.name,
remote_subvol);
rpc_clnt_set_connected (&conf->rpc->conn);
@@ -1475,6 +1481,7 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, void *m
gf_log (this->name, GF_LOG_INFO, "Server and Client "
"lk-version numbers are same, no need to "
"reopen the fds");
+ client_notify_parents_child_up (frame->this);
}
out:
@@ -1523,6 +1530,7 @@ client_setvolume (xlator_t *this, struct rpc_clnt *rpc)
char *process_uuid_xl = NULL;
clnt_conf_t *conf = NULL;
dict_t *options = NULL;
+ char counter_str[32] = {0};
options = this->options;
conf = this->private;
@@ -1548,13 +1556,24 @@ client_setvolume (xlator_t *this, struct rpc_clnt *rpc)
}
}
- /* With multiple graphs possible in the same process, we need a
+ /* When lock-heal is enabled:
+ * With multiple graphs possible in the same process, we need a
field to bring the uniqueness. Graph-ID should be enough to get the
- job done
+ job done.
+ * When lock-heal is disabled, connection-id should always be unique so
+ * that server never gets to reuse the previous connection resources
+ * so it cleans up the resources on every disconnect. Otherwise
+ * it may lead to stale resources, i.e. leaked file descriptors,
+ * inode/entry locks
*/
- ret = gf_asprintf (&process_uuid_xl, "%s-%s-%d",
+ if (!conf->lk_heal) {
+ snprintf (counter_str, sizeof (counter_str),
+ "-%"PRIu64, conf->setvol_count);
+ conf->setvol_count++;
+ }
+ ret = gf_asprintf (&process_uuid_xl, "%s-%s-%d%s",
this->ctx->process_uuid, this->name,
- this->graph->id);
+ this->graph->id, counter_str);
if (-1 == ret) {
gf_log (this->name, GF_LOG_ERROR,
"asprintf failed while setting process_uuid");
@@ -1744,6 +1763,7 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi
}
conf->portmap_err_logged = 0;
+ conf->disconnect_err_logged = 0;
config.remote_port = rsp.port;
rpc_clnt_reconfig (conf->rpc, &config);
diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c
index 1fd8f0d50..b3c36a420 100644
--- a/xlators/protocol/client/src/client-lk.c
+++ b/xlators/protocol/client/src/client-lk.c
@@ -227,7 +227,7 @@ subtract_locks (client_posix_lock_t *big, client_posix_lock_t *small)
/* LOG-TODO : decide what more info is required here*/
gf_log ("client-protocol", GF_LOG_CRITICAL,
"Unexpected case in subtract_locks. Please send "
- "a bug report to gluster-devel@nongnu.org");
+ "a bug report to gluster-devel@gluster.org");
}
return v;
diff --git a/xlators/protocol/client/src/client-rpc-fops.c b/xlators/protocol/client/src/client-rpc-fops.c
index a6d0a591a..55d62dd39 100644
--- a/xlators/protocol/client/src/client-rpc-fops.c
+++ b/xlators/protocol/client/src/client-rpc-fops.c
@@ -1936,6 +1936,221 @@ out:
return 0;
}
+/*
+ * client3_3_fallocate_cbk - RPC reply handler for FALLOCATE.
+ *
+ * @req: completed RPC request; rpc_status == -1 is reported as ENOTCONN.
+ * @iov: reply payload, decoded as gfs3_fallocate_rsp.
+ * @count: unused here.
+ * @myframe: the call frame to unwind.
+ *
+ * Decodes the reply, converts the pre/post stats when the remote call did
+ * not fail, unserializes the reply xdata and unwinds the fallocate fop.
+ * A decode failure is unwound as -1/EINVAL. Always returns 0.
+ */
+int
+client3_3_fallocate_cbk (struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ call_frame_t *frame = NULL;
+ gfs3_fallocate_rsp rsp = {0,};
+ struct iatt prestat = {0,};
+ struct iatt poststat = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+
+
+ this = THIS;
+
+ frame = myframe;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = ENOTCONN;
+ goto out;
+ }
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gfs3_fallocate_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ /* stats are only meaningful when the remote operation succeeded */
+ if (-1 != rsp.op_ret) {
+ gf_stat_to_iatt (&rsp.statpre, &prestat);
+ gf_stat_to_iatt (&rsp.statpost, &poststat);
+ }
+
+ /* NOTE(review): the macro receives ret, rsp.op_errno and the 'out'
+ * label, so it presumably records an error and jumps on failure. */
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), ret,
+ rsp.op_errno, out);
+
+out:
+ if (rsp.op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s",
+ strerror (gf_error_to_errno (rsp.op_errno)));
+ }
+ CLIENT_STACK_UNWIND (fallocate, frame, rsp.op_ret,
+ gf_error_to_errno (rsp.op_errno), &prestat,
+ &poststat, xdata);
+
+ /* NOTE(review): plain free(), presumably because the XDR decoder
+ * allocated this buffer with the libc allocator - confirm. */
+ free (rsp.xdata.xdata_val);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+/*
+ * client3_3_discard_cbk - RPC reply handler for DISCARD.
+ *
+ * @req: completed RPC request; rpc_status == -1 is reported as ENOTCONN.
+ * @iov: reply payload, decoded as gfs3_discard_rsp.
+ * @count: unused here.
+ * @myframe: the call frame to unwind.
+ *
+ * Decodes the reply, converts the pre/post stats when the remote call did
+ * not fail, unserializes the reply xdata and unwinds the discard fop.
+ * A decode failure is unwound as -1/EINVAL. Always returns 0.
+ */
+int
+client3_3_discard_cbk(struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ call_frame_t *frame = NULL;
+ gfs3_discard_rsp rsp = {0,};
+ struct iatt prestat = {0,};
+ struct iatt poststat = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+
+ this = THIS;
+
+ frame = myframe;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = ENOTCONN;
+ goto out;
+ }
+ ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gfs3_discard_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ /* stats are only meaningful when the remote operation succeeded */
+ if (-1 != rsp.op_ret) {
+ gf_stat_to_iatt (&rsp.statpre, &prestat);
+ gf_stat_to_iatt (&rsp.statpost, &poststat);
+ }
+
+ /* NOTE(review): the macro receives ret, rsp.op_errno and the 'out'
+ * label, so it presumably records an error and jumps on failure. */
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), ret,
+ rsp.op_errno, out);
+
+out:
+ if (rsp.op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s",
+ strerror (gf_error_to_errno (rsp.op_errno)));
+ }
+ CLIENT_STACK_UNWIND (discard, frame, rsp.op_ret,
+ gf_error_to_errno (rsp.op_errno), &prestat,
+ &poststat, xdata);
+
+ /* NOTE(review): plain free(), presumably because the XDR decoder
+ * allocated this buffer with the libc allocator - confirm. */
+ free (rsp.xdata.xdata_val);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+/*
+ * client3_3_zerofill_cbk - RPC reply handler for ZEROFILL.
+ *
+ * @req: completed RPC request; rpc_status == -1 is reported as ENOTCONN.
+ * @iov: reply payload, decoded as gfs3_zerofill_rsp.
+ * @count: unused here.
+ * @myframe: the call frame to unwind.
+ *
+ * Decodes the reply, converts the pre/post stats when the remote call did
+ * not fail, unserializes the reply xdata and unwinds the zerofill fop.
+ * A decode failure is unwound as -1/EINVAL. Always returns 0.
+ */
+int
+client3_3_zerofill_cbk(struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ call_frame_t *frame = NULL;
+ gfs3_zerofill_rsp rsp = {0,};
+ struct iatt prestat = {0,};
+ struct iatt poststat = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+
+ this = THIS;
+
+ frame = myframe;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = ENOTCONN;
+ goto out;
+ }
+ ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gfs3_zerofill_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ /* stats are only meaningful when the remote operation succeeded */
+ if (-1 != rsp.op_ret) {
+ gf_stat_to_iatt (&rsp.statpre, &prestat);
+ gf_stat_to_iatt (&rsp.statpost, &poststat);
+ }
+
+ /* NOTE(review): the macro receives ret, rsp.op_errno and the 'out'
+ * label, so it presumably records an error and jumps on failure. */
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), ret,
+ rsp.op_errno, out);
+
+out:
+ if (rsp.op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "remote operation failed: %s",
+ strerror (gf_error_to_errno (rsp.op_errno)));
+ }
+ CLIENT_STACK_UNWIND (zerofill, frame, rsp.op_ret,
+ gf_error_to_errno (rsp.op_errno), &prestat,
+ &poststat, xdata);
+
+ /* NOTE(review): plain free(), presumably because the XDR decoder
+ * allocated this buffer with the libc allocator - confirm. */
+ free (rsp.xdata.xdata_val);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+/*
+ * client3_3_ipc_cbk - RPC reply handler for the IPC fop.
+ *
+ * @req: completed RPC request; rpc_status == -1 is reported as ENOTCONN.
+ * @iov: reply payload, decoded with xdr_gf_common_rsp.
+ * @count: unused here.
+ * @myframe: the call frame to unwind.
+ *
+ * Decodes the reply, unserializes the reply xdata and unwinds the ipc fop
+ * with op_ret, the translated op_errno and xdata. A decode failure is
+ * unwound as -1/EINVAL. Always returns 0.
+ */
+int
+client3_3_ipc_cbk (struct rpc_req *req, struct iovec *iov, int count,
+                   void *myframe)
+{
+        call_frame_t    *frame = NULL;
+        /* Must be the structure xdr_gf_common_rsp decodes into. With a
+         * different layout (previously gfs3_zerofill_rsp) the decoded xdata
+         * is written at the wrong offset, so rsp.xdata stays empty and the
+         * decoded buffer is never freed.
+         * NOTE(review): if the protocol defines a dedicated IPC reply type,
+         * switch this declaration and the xdrproc below together. */
+        gf_common_rsp    rsp   = {0,};
+        int              ret   = 0;
+        xlator_t        *this  = NULL;
+        dict_t          *xdata = NULL;
+
+        this = THIS;
+
+        frame = myframe;
+
+        if (-1 == req->rpc_status) {
+                rsp.op_ret   = -1;
+                rsp.op_errno = ENOTCONN;
+                goto out;
+        }
+        ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gf_common_rsp);
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+                rsp.op_ret   = -1;
+                rsp.op_errno = EINVAL;
+                goto out;
+        }
+
+        GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+                                      (rsp.xdata.xdata_len), ret,
+                                      rsp.op_errno, out);
+
+out:
+        if (rsp.op_ret == -1) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "remote operation failed: %s",
+                        strerror (gf_error_to_errno (rsp.op_errno)));
+        }
+        CLIENT_STACK_UNWIND (ipc, frame,
+                             rsp.op_ret, gf_error_to_errno (rsp.op_errno),
+                             xdata);
+
+        /* NOTE(review): plain free(), presumably because the XDR decoder
+         * allocated this buffer with the libc allocator - confirm. */
+        free (rsp.xdata.xdata_val);
+
+        if (xdata)
+                dict_unref (xdata);
+
+        return 0;
+}
int
client3_3_setattr_cbk (struct rpc_req *req, struct iovec *iov, int count,
@@ -2597,7 +2812,8 @@ out:
rsp.op_errno = op_errno;
if (rsp.op_ret == -1) {
/* any error other than ENOENT */
- if (rsp.op_errno != ENOENT)
+ if (!(local->loc.name && rsp.op_errno == ENOENT) &&
+ !(rsp.op_errno == ESTALE))
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s. Path: %s (%s)",
strerror (rsp.op_errno), local->loc.path,
@@ -2757,7 +2973,7 @@ client_fdctx_destroy (xlator_t *this, clnt_fd_ctx_t *fdctx)
if (fdctx->is_dir) {
gfs3_releasedir_req req = {{0,},};
req.fd = fdctx->remote_fd;
- gf_log (this->name, GF_LOG_DEBUG, "sending releasedir on fd");
+ gf_log (this->name, GF_LOG_TRACE, "sending releasedir on fd");
client_submit_request (this, &req, fr, &clnt3_3_fop_prog,
GFS3_OP_RELEASEDIR,
client3_3_releasedir_cbk,
@@ -2766,7 +2982,7 @@ client_fdctx_destroy (xlator_t *this, clnt_fd_ctx_t *fdctx)
} else {
gfs3_release_req req = {{0,},};
req.fd = fdctx->remote_fd;
- gf_log (this->name, GF_LOG_DEBUG, "sending release on fd");
+ gf_log (this->name, GF_LOG_TRACE, "sending release on fd");
client_submit_request (this, &req, fr, &clnt3_3_fop_prog,
GFS3_OP_RELEASE,
client3_3_release_cbk, NULL,
@@ -2901,13 +3117,13 @@ client3_3_lookup (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
if (!(args->loc && args->loc->inode))
goto unwind;
loc_copy (&local->loc, args->loc);
loc_path (&local->loc, NULL);
- frame->local = local;
if (args->loc->parent) {
if (!uuid_is_null (args->loc->parent->gfid))
@@ -3625,13 +3841,13 @@ client3_3_mknod (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
if (!(args->loc && args->loc->parent))
goto unwind;
loc_copy (&local->loc, args->loc);
loc_path (&local->loc, NULL);
- frame->local = local;
if (!uuid_is_null (args->loc->parent->gfid))
memcpy (req.pargfid, args->loc->parent->gfid, 16);
@@ -3693,13 +3909,13 @@ client3_3_mkdir (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
if (!(args->loc && args->loc->parent))
goto unwind;
loc_copy (&local->loc, args->loc);
loc_path (&local->loc, NULL);
- frame->local = local;
if (!uuid_is_null (args->loc->parent->gfid))
memcpy (req.pargfid, args->loc->parent->gfid, 16);
@@ -3760,6 +3976,8 @@ client3_3_create (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
+
if (!(args->loc && args->loc->parent))
goto unwind;
@@ -3768,7 +3986,6 @@ client3_3_create (call_frame_t *frame, xlator_t *this,
loc_copy (&local->loc, args->loc);
loc_path (&local->loc, NULL);
- frame->local = local;
if (!uuid_is_null (args->loc->parent->gfid))
memcpy (req.pargfid, args->loc->parent->gfid, 16);
@@ -3831,6 +4048,8 @@ client3_3_open (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
+
if (!(args->loc && args->loc->inode))
goto unwind;
@@ -3838,7 +4057,6 @@ client3_3_open (call_frame_t *frame, xlator_t *this,
local->flags = args->flags;
loc_copy (&local->loc, args->loc);
loc_path (&local->loc, NULL);
- frame->local = local;
if (!uuid_is_null (args->loc->inode->gfid))
memcpy (req.gfid, args->loc->inode->gfid, 16);
@@ -4221,13 +4439,14 @@ client3_3_opendir (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
+
if (!(args->loc && args->loc->inode))
goto unwind;
local->fd = fd_ref (args->fd);
loc_copy (&local->loc, args->loc);
loc_path (&local->loc, NULL);
- frame->local = local;
if (!uuid_is_null (args->loc->inode->gfid))
memcpy (req.gfid, args->loc->inode->gfid, 16);
@@ -5084,6 +5303,7 @@ client3_3_lk (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
remote_fd, op_errno, unwind);
@@ -5111,7 +5331,6 @@ client3_3_lk (call_frame_t *frame, xlator_t *this,
local->owner = frame->root->lk_owner;
local->cmd = args->cmd;
local->fd = fd_ref (args->fd);
- frame->local = local;
req.fd = remote_fd;
req.cmd = gf_cmd;
@@ -5786,7 +6005,181 @@ unwind:
return 0;
}
+/*
+ * client3_3_fallocate - encode and submit a FALLOCATE request.
+ *
+ * @frame: call frame of the fop; must not be NULL.
+ * @this: client translator instance.
+ * @data: clnt_args_t carrying fd, flags, offset, size and xdata.
+ *
+ * Resolves the remote fd, fills in the request, serializes xdata and
+ * submits it with client3_3_fallocate_cbk as reply handler. If an argument
+ * is missing, or the remote fd or xdata cannot be prepared, the fop is
+ * unwound with -1 and op_errno (ESTALE unless a helper macro overrides
+ * it). A submit failure is only logged here. Always returns 0.
+ */
+int32_t
+client3_3_fallocate(call_frame_t *frame, xlator_t *this, void *data)
+{
+        clnt_args_t        *args      = NULL;
+        int64_t             remote_fd = -1;
+        clnt_conf_t        *conf      = NULL;
+        gfs3_fallocate_req  req       = {{0},};
+        int                 op_errno  = ESTALE;
+        int                 ret       = 0;
+
+        /* The unwind path needs a valid frame, so a NULL frame cannot be
+         * handled by jumping there; assert instead, like the sibling
+         * client3_3_zerofill and client3_3_ipc. */
+        GF_ASSERT (frame);
+
+        if (!this || !data)
+                goto unwind;
+
+        args = data;
+        conf = this->private;
+
+        CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+                              remote_fd, op_errno, unwind);
+
+        req.fd     = remote_fd;
+        req.flags  = args->flags;
+        req.offset = args->offset;
+        req.size   = args->size;
+        memcpy(req.gfid, args->fd->inode->gfid, 16);
+
+        GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+                                    req.xdata.xdata_len, op_errno, unwind);
+
+        ret = client_submit_request (this, &req, frame, conf->fops,
+                                     GFS3_OP_FALLOCATE,
+                                     client3_3_fallocate_cbk, NULL,
+                                     NULL, 0, NULL, 0,
+                                     NULL, (xdrproc_t)xdr_gfs3_fallocate_req);
+        if (ret) {
+                gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+        }
+
+        /* the serialized xdata buffer is released on both paths */
+        GF_FREE (req.xdata.xdata_val);
+
+        return 0;
+unwind:
+        CLIENT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+        GF_FREE (req.xdata.xdata_val);
+
+        return 0;
+}
+
+/*
+ * client3_3_discard - encode and submit a DISCARD request.
+ *
+ * @frame: call frame of the fop; must not be NULL.
+ * @this: client translator instance.
+ * @data: clnt_args_t carrying fd, offset, size and xdata.
+ *
+ * Resolves the remote fd, fills in the request, serializes xdata and
+ * submits it with client3_3_discard_cbk as reply handler. If an argument
+ * is missing, or the remote fd or xdata cannot be prepared, the fop is
+ * unwound with -1 and op_errno (ESTALE unless a helper macro overrides
+ * it). A submit failure is only logged here. Always returns 0.
+ */
+int32_t
+client3_3_discard(call_frame_t *frame, xlator_t *this, void *data)
+{
+        clnt_args_t      *args      = NULL;
+        int64_t           remote_fd = -1;
+        clnt_conf_t      *conf      = NULL;
+        gfs3_discard_req  req       = {{0},};
+        int               op_errno  = ESTALE;
+        int               ret       = 0;
+
+        /* The unwind path needs a valid frame, so a NULL frame cannot be
+         * handled by jumping there; assert instead, like the sibling
+         * client3_3_zerofill and client3_3_ipc. */
+        GF_ASSERT (frame);
+
+        if (!this || !data)
+                goto unwind;
+
+        args = data;
+        conf = this->private;
+
+        CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+                              remote_fd, op_errno, unwind);
+
+        req.fd     = remote_fd;
+        req.offset = args->offset;
+        req.size   = args->size;
+        memcpy(req.gfid, args->fd->inode->gfid, 16);
+
+        GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+                                    req.xdata.xdata_len, op_errno, unwind);
+
+        ret = client_submit_request(this, &req, frame, conf->fops,
+                                    GFS3_OP_DISCARD, client3_3_discard_cbk,
+                                    NULL, NULL, 0, NULL, 0, NULL,
+                                    (xdrproc_t) xdr_gfs3_discard_req);
+        if (ret)
+                gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+
+        /* the serialized xdata buffer is released on both paths */
+        GF_FREE (req.xdata.xdata_val);
+
+        return 0;
+unwind:
+        CLIENT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+        GF_FREE (req.xdata.xdata_val);
+
+        return 0;
+}
+
+/*
+ * client3_3_zerofill - encode and submit a ZEROFILL request.
+ *
+ * @frame: call frame of the fop (asserted non-NULL).
+ * @this: client translator instance.
+ * @data: clnt_args_t carrying fd, offset, size and xdata.
+ *
+ * Resolves the remote fd, fills in the request, serializes xdata and
+ * submits it with client3_3_zerofill_cbk as reply handler. If an argument
+ * is missing, or the remote fd or xdata cannot be prepared, the fop is
+ * unwound with -1 and op_errno (ESTALE unless a helper macro overrides
+ * it). A submit failure is only logged here. Always returns 0.
+ */
+int32_t
+client3_3_zerofill(call_frame_t *frame, xlator_t *this, void *data)
+{
+ clnt_args_t *args = NULL;
+ int64_t remote_fd = -1;
+ clnt_conf_t *conf = NULL;
+ gfs3_zerofill_req req = {{0},};
+ int op_errno = ESTALE;
+ int ret = 0;
+
+ GF_ASSERT (frame);
+
+ if (!this || !data)
+ goto unwind;
+
+ args = data;
+ conf = this->private;
+
+ /* NOTE(review): receives op_errno and the 'unwind' label, so it
+ * presumably jumps there when no remote fd is available. */
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
+
+ req.fd = remote_fd;
+ req.offset = args->offset;
+ req.size = args->size;
+ memcpy(req.gfid, args->fd->inode->gfid, 16);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+ req.xdata.xdata_len, op_errno, unwind);
+
+ ret = client_submit_request(this, &req, frame, conf->fops,
+ GFS3_OP_ZEROFILL, client3_3_zerofill_cbk,
+ NULL, NULL, 0, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_zerofill_req);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+
+ /* the serialized xdata buffer is released on both paths */
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+unwind:
+ CLIENT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+}
+
+/*
+ * client3_3_ipc - encode and submit an IPC request.
+ *
+ * @frame: call frame of the fop (asserted non-NULL).
+ * @this: client translator instance.
+ * @data: clnt_args_t; cmd is sent as the request op, xdata is serialized.
+ *
+ * No fd is involved, so no remote fd is resolved. If an argument is missing
+ * or xdata cannot be serialized, the fop is unwound with -1 and op_errno
+ * (ESTALE unless the serialize macro overrides it). A submit failure is
+ * only logged here. Always returns 0.
+ */
+int32_t
+client3_3_ipc (call_frame_t *frame, xlator_t *this, void *data)
+{
+ clnt_args_t *args = NULL;
+ clnt_conf_t *conf = NULL;
+ gfs3_ipc_req req = {0,};
+ int op_errno = ESTALE;
+ int ret = 0;
+ GF_ASSERT (frame);
+
+ if (!this || !data)
+ goto unwind;
+
+ args = data;
+ conf = this->private;
+
+ req.op = args->cmd;
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+ req.xdata.xdata_len, op_errno, unwind);
+
+ ret = client_submit_request(this, &req, frame, conf->fops,
+ GFS3_OP_IPC, client3_3_ipc_cbk,
+ NULL, NULL, 0, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_ipc_req);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+
+ /* the serialized xdata buffer is released on both paths */
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+unwind:
+ CLIENT_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+}
/* Table Specific to FOPS */
@@ -5833,10 +6226,14 @@ rpc_clnt_procedure_t clnt3_3_fop_actors[GF_FOP_MAXVALUE] = {
[GF_FOP_SETATTR] = { "SETATTR", client3_3_setattr },
[GF_FOP_FSETATTR] = { "FSETATTR", client3_3_fsetattr },
[GF_FOP_READDIRP] = { "READDIRP", client3_3_readdirp },
+ [GF_FOP_FALLOCATE] = { "FALLOCATE", client3_3_fallocate },
+ [GF_FOP_DISCARD] = { "DISCARD", client3_3_discard },
+ [GF_FOP_ZEROFILL] = { "ZEROFILL", client3_3_zerofill},
[GF_FOP_RELEASE] = { "RELEASE", client3_3_release },
[GF_FOP_RELEASEDIR] = { "RELEASEDIR", client3_3_releasedir },
[GF_FOP_GETSPEC] = { "GETSPEC", client3_getspec },
- [GF_FOP_FREMOVEXATTR] = { "FREMOVEXATTR", client3_3_fremovexattr },
+ [GF_FOP_FREMOVEXATTR]= { "FREMOVEXATTR",client3_3_fremovexattr },
+ [GF_FOP_IPC] = { "IPC", client3_3_ipc },
};
/* Used From RPC-CLNT library to log proper name of procedure based on number */
@@ -5885,6 +6282,11 @@ char *clnt3_3_fop_names[GFS3_OP_MAXVALUE] = {
[GFS3_OP_RELEASE] = "RELEASE",
[GFS3_OP_RELEASEDIR] = "RELEASEDIR",
[GFS3_OP_FREMOVEXATTR] = "FREMOVEXATTR",
+ [GFS3_OP_FALLOCATE] = "FALLOCATE",
+ [GFS3_OP_DISCARD] = "DISCARD",
+ [GFS3_OP_ZEROFILL] = "ZEROFILL",
+ [GFS3_OP_IPC] = "IPC",
+
};
rpc_clnt_prog_t clnt3_3_fop_prog = {
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 3b5ecb0aa..f1415899e 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -21,6 +21,7 @@
#include "statedump.h"
#include "compat-errno.h"
+#include "xdr-rpc.h"
#include "glusterfs3.h"
extern rpc_clnt_prog_t clnt_handshake_prog;
@@ -130,7 +131,7 @@ client_register_grace_timer (xlator_t *this, clnt_conf_t *conf)
conf->grace_timer =
gf_timer_call_after (this->ctx,
- conf->grace_tv,
+ conf->grace_ts,
client_grace_timeout,
conf->rpc);
}
@@ -943,6 +944,7 @@ client_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
args.vector = vector;
args.count = count;
args.offset = off;
+ args.size = iov_length (vector, count);
args.flags = flags;
args.iobref = iobref;
args.xdata = xdata;
@@ -1961,6 +1963,142 @@ out:
return 0;
}
+/*
+ * client_fallocate - protocol/client entry point for the fallocate fop.
+ *
+ * @frame:  call frame to hand to the rpc procedure (or to unwind on error).
+ * @this:   client xlator; this->private holds the clnt_conf_t.
+ * @fd:     passed through as args.fd.
+ * @mode:   passed through as args.flags.
+ * @offset: passed through as args.offset.
+ * @len:    passed through as args.size.
+ * @xdata:  passed through as args.xdata.
+ *
+ * Dispatches to the handler registered in the GF_FOP_FALLOCATE slot of the
+ * negotiated fop program.  If there is no conf / fop program, no handler in
+ * that slot, or the handler returns non-zero, the frame is unwound with
+ * op_ret -1 and ENOTCONN.
+ *
+ * Returns 0 in all cases.
+ */
+int32_t
+client_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+                 off_t offset, size_t len, dict_t *xdata)
+{
+        int                   ret  = -1;
+        clnt_conf_t          *conf = NULL;
+        rpc_clnt_procedure_t *proc = NULL;
+        clnt_args_t           args = {0,};
+
+        conf = this->private;
+        if (!conf || !conf->fops)
+                goto out;
+
+        args.fd     = fd;
+        args.flags  = mode;
+        args.offset = offset;
+        args.size   = len;
+        args.xdata  = xdata;
+
+        /* The address of a proctable slot is never NULL, so testing 'proc'
+         * could not detect a missing procedure; the handler pointer is what
+         * tells us whether the fop is actually implemented. */
+        proc = &conf->fops->proctable[GF_FOP_FALLOCATE];
+        if (!proc->fn) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "rpc procedure not found for %s",
+                        gf_fop_list[GF_FOP_FALLOCATE]);
+                goto out;
+        }
+
+        ret = proc->fn (frame, this, &args);
+out:
+        if (ret)
+                STACK_UNWIND_STRICT (fallocate, frame, -1, ENOTCONN,
+                                     NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * client_discard - protocol/client entry point for the discard fop.
+ *
+ * @frame:  call frame to hand to the rpc procedure (or to unwind on error).
+ * @this:   client xlator; this->private holds the clnt_conf_t.
+ * @fd:     passed through as args.fd.
+ * @offset: passed through as args.offset.
+ * @len:    passed through as args.size.
+ * @xdata:  passed through as args.xdata.
+ *
+ * Dispatches to the handler registered in the GF_FOP_DISCARD slot of the
+ * negotiated fop program.  If there is no conf / fop program, no handler in
+ * that slot, or the handler returns non-zero, the frame is unwound with
+ * op_ret -1 and ENOTCONN.
+ *
+ * Returns 0 in all cases.
+ */
+int32_t
+client_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+               size_t len, dict_t *xdata)
+{
+        int                   ret  = -1;
+        clnt_conf_t          *conf = NULL;
+        rpc_clnt_procedure_t *proc = NULL;
+        clnt_args_t           args = {0,};
+
+        conf = this->private;
+        if (!conf || !conf->fops)
+                goto out;
+
+        args.fd     = fd;
+        args.offset = offset;
+        args.size   = len;
+        args.xdata  = xdata;
+
+        /* The address of a proctable slot is never NULL, so testing 'proc'
+         * could not detect a missing procedure; the handler pointer is what
+         * tells us whether the fop is actually implemented. */
+        proc = &conf->fops->proctable[GF_FOP_DISCARD];
+        if (!proc->fn) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "rpc procedure not found for %s",
+                        gf_fop_list[GF_FOP_DISCARD]);
+                goto out;
+        }
+
+        ret = proc->fn (frame, this, &args);
+out:
+        if (ret)
+                STACK_UNWIND_STRICT(discard, frame, -1, ENOTCONN,
+                                    NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * client_zerofill - protocol/client entry point for the zerofill fop.
+ *
+ * @frame:  call frame to hand to the rpc procedure (or to unwind on error).
+ * @this:   client xlator; this->private holds the clnt_conf_t.
+ * @fd:     passed through as args.fd.
+ * @offset: passed through as args.offset.
+ * @len:    passed through as args.size.
+ * @xdata:  passed through as args.xdata.
+ *
+ * Dispatches to the handler registered in the GF_FOP_ZEROFILL slot of the
+ * negotiated fop program.  If there is no conf / fop program, no handler in
+ * that slot, or the handler returns non-zero, the frame is unwound with
+ * op_ret -1 and ENOTCONN.
+ *
+ * Returns 0 in all cases.
+ */
+int32_t
+client_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+                off_t len, dict_t *xdata)
+{
+        int                   ret  = -1;
+        clnt_conf_t          *conf = NULL;
+        rpc_clnt_procedure_t *proc = NULL;
+        clnt_args_t           args = {0,};
+
+        conf = this->private;
+        if (!conf || !conf->fops)
+                goto out;
+
+        args.fd     = fd;
+        args.offset = offset;
+        args.size   = len;
+        args.xdata  = xdata;
+
+        /* The address of a proctable slot is never NULL, so testing 'proc'
+         * could not detect a missing procedure; the handler pointer is what
+         * tells us whether the fop is actually implemented. */
+        proc = &conf->fops->proctable[GF_FOP_ZEROFILL];
+        if (!proc->fn) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "rpc procedure not found for %s",
+                        gf_fop_list[GF_FOP_ZEROFILL]);
+                goto out;
+        }
+
+        ret = proc->fn (frame, this, &args);
+out:
+        if (ret)
+                STACK_UNWIND_STRICT(zerofill, frame, -1, ENOTCONN,
+                                    NULL, NULL, NULL);
+
+        return 0;
+}
+
+
+/*
+ * client_ipc - protocol/client entry point for the ipc fop.
+ *
+ * @frame: call frame to hand to the rpc procedure (or to unwind on error).
+ * @this:  client xlator; this->private holds the clnt_conf_t.
+ * @op:    passed through as args.cmd.
+ * @xdata: passed through as args.xdata.
+ *
+ * Dispatches to the handler registered in the GF_FOP_IPC slot of the
+ * negotiated fop program.  If there is no conf / fop program, no handler in
+ * that slot, or the handler returns non-zero, the frame is unwound with
+ * op_ret -1 and ENOTCONN.
+ *
+ * Returns 0 in all cases.
+ */
+int32_t
+client_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+        int                   ret  = -1;
+        clnt_conf_t          *conf = NULL;
+        rpc_clnt_procedure_t *proc = NULL;
+        clnt_args_t           args = {0,};
+
+        conf = this->private;
+        if (!conf || !conf->fops)
+                goto out;
+
+        args.cmd   = op;
+        args.xdata = xdata;
+
+        /* The address of a proctable slot is never NULL, so testing 'proc'
+         * could not detect a missing procedure; the handler pointer is what
+         * tells us whether the fop is actually implemented. */
+        proc = &conf->fops->proctable[GF_FOP_IPC];
+        if (!proc->fn) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "rpc procedure not found for %s",
+                        gf_fop_list[GF_FOP_IPC]);
+                goto out;
+        }
+
+        ret = proc->fn (frame, this, &args);
+out:
+        if (ret)
+                STACK_UNWIND_STRICT(ipc, frame, -1, ENOTCONN, NULL);
+
+        return 0;
+}
+
int32_t
client_getspec (call_frame_t *frame, xlator_t *this, const char *key,
@@ -2092,9 +2230,19 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
client_register_grace_timer (this, conf);
if (!conf->skip_notify) {
- if (conf->connected)
- gf_log (this->name, GF_LOG_INFO,
- "disconnected");
+ if (conf->connected) {
+ gf_log (this->name,
+ ((!conf->disconnect_err_logged)
+ ? GF_LOG_INFO : GF_LOG_DEBUG),
+ "disconnected from %s. Client process "
+ "will keep trying to connect to "
+ "glusterd until brick's port is "
+ "available",
+ conf->rpc->conn.name);
+
+ if (conf->portmap_err_logged)
+ conf->disconnect_err_logged = 1;
+ }
/* If the CHILD_DOWN event goes to parent xlator
multiple times, the logic of parent xlator notify
@@ -2336,14 +2484,14 @@ client_init_grace_timer (xlator_t *this, dict_t *options,
ret = dict_get_int32 (options, "grace-timeout", &grace_timeout);
if (!ret)
- conf->grace_tv.tv_sec = grace_timeout;
+ conf->grace_ts.tv_sec = grace_timeout;
else
- conf->grace_tv.tv_sec = 10;
+ conf->grace_ts.tv_sec = 10;
- conf->grace_tv.tv_usec = 0;
+ conf->grace_ts.tv_nsec = 0;
gf_log (this->name, GF_LOG_DEBUG, "Client grace timeout "
- "value = %"PRIu64, conf->grace_tv.tv_sec);
+ "value = %"GF_PRI_SECOND, conf->grace_ts.tv_sec);
ret = 0;
out:
@@ -2669,7 +2817,11 @@ struct xlator_fops fops = {
.fxattrop = client_fxattrop,
.setattr = client_setattr,
.fsetattr = client_fsetattr,
+ .fallocate = client_fallocate,
+ .discard = client_discard,
+ .zerofill = client_zerofill,
.getspec = client_getspec,
+ .ipc = client_ipc,
};
@@ -2724,13 +2876,19 @@ struct volume_options options[] = {
{ .key = {"lk-heal"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
- .description = "Enables or disables the lock heal."
+ .description = "When the connection to client is lost, server "
+ "cleans up all the locks held by the client. After "
+ "the connection is restored, the client reacquires "
+ "(heals) the fcntl locks released by the server."
},
{ .key = {"grace-timeout"},
.type = GF_OPTION_TYPE_INT,
.min = 10,
.max = 1800,
- .description = "Sets the grace-timeout value. Valid range 10-1800."
+ .default_value = "10",
+ .description = "Specifies the duration for the lock state to be "
+ "maintained on the client after a network "
+ "disconnection. Range 10-1800 seconds."
},
{.key = {"tcp-window-size"},
.type = GF_OPTION_TYPE_SIZET,
diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h
index 0a27c095c..bc0f5d0e9 100644
--- a/xlators/protocol/client/src/client.h
+++ b/xlators/protocol/client/src/client.h
@@ -93,6 +93,10 @@ typedef struct clnt_conf {
which was sent earlier */
char portmap_err_logged; /* flag used to prevent
excessive logging */
+ char disconnect_err_logged; /* flag used to prevent
+ excessive disconnect
+ logging */
+
char need_different_port; /* flag used to change the
portmap path in case of
'tcp,rdma' on server */
@@ -100,7 +104,7 @@ typedef struct clnt_conf {
uint16_t lk_version; /* this variable is used to distinguish
client-server transaction while
performing lock healing */
- struct timeval grace_tv;
+ struct timespec grace_ts;
gf_timer_t *grace_timer;
gf_boolean_t grace_timer_needed; /* The state of this flag will
be used to decide whether
@@ -116,6 +120,11 @@ typedef struct clnt_conf {
*/
gf_boolean_t filter_o_direct; /* if set, filter O_DIRECT from
the flags list of open() */
+ /* set volume is the op which results in creating/re-using
+ * the conn-id and is called once per connection, this remembers
+ * how many times set_volume is called
+ */
+ uint64_t setvol_count;
} clnt_conf_t;
typedef struct _client_fd_ctx {
diff --git a/xlators/protocol/server/src/Makefile.am b/xlators/protocol/server/src/Makefile.am
index 25d6706cc..6a18bf025 100644
--- a/xlators/protocol/server/src/Makefile.am
+++ b/xlators/protocol/server/src/Makefile.am
@@ -16,9 +16,9 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) \
-I$(top_srcdir)/libglusterfs/src \
-DCONFDIR=\"$(sysconfdir)/glusterfs\" \
-DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \
- -I$(top_srcdir)/xlators/protocol/lib/src \
- -I$(top_srcdir)/rpc/rpc-lib/src/ \
- -I$(top_srcdir)/rpc/xdr/src/
+ -I$(top_srcdir)/xlators/protocol/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/rpc/xdr/src
AM_CFLAGS = -Wall $(GF_CFLAGS) \
-DDATADIR=\"$(localstatedir)\"
diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c
index ed3d75860..a459f89e1 100644
--- a/xlators/protocol/server/src/server-handshake.c
+++ b/xlators/protocol/server/src/server-handshake.c
@@ -94,9 +94,9 @@ _volfile_update_checksum (xlator_t *this, char *key, uint32_t checksum)
if (temp_volfile->checksum != checksum) {
gf_log (this->name, GF_LOG_INFO,
- "the volume file got modified between earlier access "
- "and now, this may lead to inconsistency between "
- "clients, advised to remount client");
+ "the volume file was modified between a prior access "
+ "and now. This may lead to inconsistency between "
+ "clients, you are advised to remount client");
temp_volfile->checksum = checksum;
}
@@ -109,10 +109,10 @@ static size_t
getspec_build_volfile_path (xlator_t *this, const char *key, char *path,
size_t path_len)
{
- int ret = -1;
+ char *filename = NULL;
+ server_conf_t *conf = NULL;
+ int ret = -1;
int free_filename = 0;
- char *filename = NULL;
- server_conf_t *conf = NULL;
char data_key[256] = {0,};
conf = this->private;
@@ -329,14 +329,15 @@ server_setvolume (rpcsvc_request_t *req)
{
gf_setvolume_req args = {{0,},};
gf_setvolume_rsp rsp = {0,};
- server_connection_t *conn = NULL;
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
server_conf_t *conf = NULL;
peer_info_t *peerinfo = NULL;
dict_t *reply = NULL;
dict_t *config_params = NULL;
dict_t *params = NULL;
char *name = NULL;
- char *process_uuid = NULL;
+ char *client_uid = NULL;
char *clnt_version = NULL;
xlator_t *xl = NULL;
char *msg = NULL;
@@ -393,7 +394,7 @@ server_setvolume (rpcsvc_request_t *req)
params->extra_free = buf;
buf = NULL;
- ret = dict_get_str (params, "process-uuid", &process_uuid);
+ ret = dict_get_str (params, "process-uuid", &client_uid);
if (ret < 0) {
ret = dict_set_str (reply, "ERROR",
"UUID not specified");
@@ -420,25 +421,34 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- conn = server_connection_get (this, process_uuid);
- if (!conn) {
+ client = gf_client_get (this, &req->cred, client_uid);
+ if (client == NULL) {
op_ret = -1;
op_errno = ENOMEM;
goto fail;
}
- gf_log (this->name, GF_LOG_DEBUG, "Connected to %s", conn->id);
- cancelled = server_cancel_conn_timer (this, conn);
- if (cancelled)//Do connection_put on behalf of grace-timer-handler.
- server_connection_put (this, conn, NULL);
- if (conn->lk_version != 0 &&
- conn->lk_version != lk_version) {
- (void) server_connection_cleanup (this, conn,
+ gf_log (this->name, GF_LOG_DEBUG, "Connected to %s", client->client_uid);
+ cancelled = server_cancel_grace_timer (this, client);
+ if (cancelled)//Do gf_client_put on behalf of grace-timer-handler.
+ gf_client_put (client, NULL);
+
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto fail;
+ }
+
+ if (serv_ctx->lk_version != 0 &&
+ serv_ctx->lk_version != lk_version) {
+ (void) server_connection_cleanup (this, client,
INTERNAL_LOCKS | POSIX_LOCKS);
}
- if (req->trans->xl_private != conn)
- req->trans->xl_private = conn;
+ if (req->trans->xl_private != client)
+ req->trans->xl_private = client;
+
+ auth_set_username_passwd (params, config_params, client);
ret = dict_get_int32 (params, "fops-version", &fop_version);
if (ret < 0) {
@@ -563,10 +573,10 @@ server_setvolume (rpcsvc_request_t *req)
gf_log (this->name, GF_LOG_INFO,
"accepted client from %s (version: %s)",
- conn->id,
+ client->client_uid,
(clnt_version) ? clnt_version : "old");
op_ret = 0;
- conn->bound_xl = xl;
+ client->bound_xl = xl;
ret = dict_set_str (reply, "ERROR", "Success");
if (ret < 0)
gf_log (this->name, GF_LOG_DEBUG,
@@ -574,7 +584,7 @@ server_setvolume (rpcsvc_request_t *req)
} else {
gf_log (this->name, GF_LOG_ERROR,
"Cannot authenticate client from %s %s",
- conn->id,
+ client->client_uid,
(clnt_version) ? clnt_version : "old");
op_ret = -1;
@@ -586,7 +596,7 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- if (conn->bound_xl == NULL) {
+ if (client->bound_xl == NULL) {
ret = dict_set_str (reply, "ERROR",
"Check volfile and handshake "
"options in protocol/client");
@@ -599,20 +609,21 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- if ((conn->bound_xl != NULL) &&
+ if ((client->bound_xl != NULL) &&
(ret >= 0) &&
- (conn->bound_xl->itable == NULL)) {
+ (client->bound_xl->itable == NULL)) {
/* create inode table for this bound_xl, if one doesn't
already exist */
gf_log (this->name, GF_LOG_TRACE,
"creating inode table with lru_limit=%"PRId32", "
"xlator=%s", conf->inode_lru_limit,
- conn->bound_xl->name);
+ client->bound_xl->name);
/* TODO: what is this ? */
- conn->bound_xl->itable = inode_table_new (conf->inode_lru_limit,
- conn->bound_xl);
+ client->bound_xl->itable =
+ inode_table_new (conf->inode_lru_limit,
+ client->bound_xl);
}
ret = dict_set_str (reply, "process-uuid",
@@ -621,8 +632,7 @@ server_setvolume (rpcsvc_request_t *req)
gf_log (this->name, GF_LOG_DEBUG,
"failed to set 'process-uuid'");
- ret = dict_set_uint32 (reply, "clnt-lk-version",
- conn->lk_version);
+ ret = dict_set_uint32 (reply, "clnt-lk-version", serv_ctx->lk_version);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
"failed to set 'clnt-lk-version'");
@@ -635,7 +645,7 @@ server_setvolume (rpcsvc_request_t *req)
fail:
rsp.dict.dict_len = dict_serialized_length (reply);
- if (rsp.dict.dict_len < 0) {
+ if (rsp.dict.dict_len > UINT_MAX) {
gf_log ("server-handshake", GF_LOG_DEBUG,
"failed to get serialized length of reply dict");
op_ret = -1;
@@ -664,15 +674,15 @@ fail:
* list of connections the server is maintaining and might segfault
* during statedump when bound_xl of the connection is accessed.
*/
- if (op_ret && conn && !xl) {
+ if (op_ret && !xl) {
/* We would have set the xl_private of the transport to the
* @conn. But if we have put the connection i.e shutting down
* the connection, then we should set xl_private to NULL as it
* would be pointing to a freed memory and would segfault when
* accessed upon getting DISCONNECT.
*/
- if (server_connection_put (this, conn, NULL) == NULL)
- req->trans->xl_private = NULL;
+ gf_client_put (client, NULL);
+ req->trans->xl_private = NULL;
}
server_submit_reply (NULL, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_setvolume_rsp);
@@ -709,12 +719,13 @@ server_ping (rpcsvc_request_t *req)
int
server_set_lk_version (rpcsvc_request_t *req)
{
- int op_ret = -1;
- int op_errno = EINVAL;
- gf_set_lk_ver_req args = {0, };
- gf_set_lk_ver_rsp rsp = {0,};
- server_connection_t *conn = NULL;
- xlator_t *this = NULL;
+ int op_ret = -1;
+ int op_errno = EINVAL;
+ gf_set_lk_ver_req args = {0,};
+ gf_set_lk_ver_rsp rsp = {0,};
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ xlator_t *this = NULL;
this = req->svc->mydata;
//TODO: Decide on an appropriate errno for the error-path
@@ -730,9 +741,15 @@ server_set_lk_version (rpcsvc_request_t *req)
goto fail;
}
- conn = server_connection_get (this, args.uid);
- conn->lk_version = args.lk_ver;
- server_connection_put (this, conn, NULL);
+ client = gf_client_get (this, &req->cred, args.uid);
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto fail;
+ }
+
+ serv_ctx->lk_version = args.lk_ver;
+ gf_client_put (client, NULL);
rsp.lk_ver = args.lk_ver;
@@ -749,11 +766,11 @@ fail:
}
rpcsvc_actor_t gluster_handshake_actors[] = {
- [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, 0},
- [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, 0},
- [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0},
- [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, 0},
- [GF_HNDSK_SET_LK_VER] = {"SET_LK_VER", GF_HNDSK_SET_LK_VER, server_set_lk_version, NULL, 0},
+ [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, 0, DRC_NA},
+ [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, 0, DRC_NA},
+ [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA},
+ [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, 0, DRC_NA},
+ [GF_HNDSK_SET_LK_VER] = {"SET_LK_VER", GF_HNDSK_SET_LK_VER, server_set_lk_version, NULL, 0, DRC_NA},
};
diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c
index e2bbf4bcb..600a311c3 100644
--- a/xlators/protocol/server/src/server-helpers.c
+++ b/xlators/protocol/server/src/server-helpers.c
@@ -26,10 +26,14 @@ server_decode_groups (call_frame_t *frame, rpcsvc_request_t *req)
GF_VALIDATE_OR_GOTO ("server", frame, out);
GF_VALIDATE_OR_GOTO ("server", req, out);
+ if (call_stack_alloc_groups (frame->root, req->auxgidcount) != 0)
+ return -1;
+
frame->root->ngrps = req->auxgidcount;
if (frame->root->ngrps == 0)
return 0;
+ /* ngrps cannot be bigger than USHRT_MAX(65535) */
if (frame->root->ngrps > GF_MAX_AUX_GROUPS)
return -1;
@@ -39,6 +43,7 @@ out:
return 0;
}
+
void
server_loc_wipe (loc_t *loc)
{
@@ -70,11 +75,6 @@ server_resolve_wipe (server_resolve_t *resolve)
void
free_state (server_state_t *state)
{
- if (state->conn) {
- //xprt_svc_unref (state->conn);
- state->conn = NULL;
- }
-
if (state->xprt) {
rpc_transport_unref (state->xprt);
state->xprt = NULL;
@@ -123,301 +123,26 @@ free_state (server_state_t *state)
}
-int
-gf_add_locker (server_connection_t *conn, const char *volume,
- loc_t *loc, fd_t *fd, pid_t pid, gf_lkowner_t *owner,
- glusterfs_fop_t type)
-{
- int32_t ret = -1;
- struct _locker *new = NULL;
- struct _lock_table *table = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", volume, out);
-
- new = GF_CALLOC (1, sizeof (struct _locker), gf_server_mt_locker_t);
- if (new == NULL) {
- goto out;
- }
- INIT_LIST_HEAD (&new->lockers);
-
- new->volume = gf_strdup (volume);
-
- if (fd == NULL) {
- loc_copy (&new->loc, loc);
- } else {
- new->fd = fd_ref (fd);
- }
-
- new->pid = pid;
- new->owner = *owner;
-
- pthread_mutex_lock (&conn->lock);
- {
- table = conn->ltable;
- if (type == GF_FOP_ENTRYLK)
- list_add_tail (&new->lockers, &table->entrylk_lockers);
- else
- list_add_tail (&new->lockers, &table->inodelk_lockers);
- }
- pthread_mutex_unlock (&conn->lock);
-out:
- return ret;
-}
-
-
-int
-gf_del_locker (server_connection_t *conn, const char *volume,
- loc_t *loc, fd_t *fd, gf_lkowner_t *owner,
- glusterfs_fop_t type)
-{
- struct _locker *locker = NULL;
- struct _locker *tmp = NULL;
- int32_t ret = -1;
- struct list_head *head = NULL;
- struct _lock_table *table = NULL;
- int found = 0;
-
- GF_VALIDATE_OR_GOTO ("server", volume, out);
-
- pthread_mutex_lock (&conn->lock);
- {
- table = conn->ltable;
- if (type == GF_FOP_ENTRYLK) {
- head = &table->entrylk_lockers;
- } else {
- head = &table->inodelk_lockers;
- }
-
- list_for_each_entry_safe (locker, tmp, head, lockers) {
- if (!is_same_lkowner (&locker->owner, owner) ||
- strcmp (locker->volume, volume))
- continue;
-
- if (locker->fd && fd && (locker->fd == fd))
- found = 1;
- else if (locker->loc.inode && loc &&
- (locker->loc.inode == loc->inode))
- found = 1;
- if (found) {
- list_del_init (&locker->lockers);
- break;
- }
- }
- if (!found)
- locker = NULL;
- }
- pthread_mutex_unlock (&conn->lock);
-
- if (locker) {
- if (locker->fd)
- fd_unref (locker->fd);
- else
- loc_wipe (&locker->loc);
-
- GF_FREE (locker->volume);
- GF_FREE (locker);
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-static struct _lock_table *
-gf_lock_table_new (void)
-{
- struct _lock_table *new = NULL;
-
- new = GF_CALLOC (1, sizeof (struct _lock_table), gf_server_mt_lock_table_t);
- if (new == NULL) {
- goto out;
- }
- INIT_LIST_HEAD (&new->entrylk_lockers);
- INIT_LIST_HEAD (&new->inodelk_lockers);
-out:
- return new;
-}
-
-static int
-server_nop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- int ret = -1;
- server_state_t *state = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", frame, out);
- GF_VALIDATE_OR_GOTO ("server", cookie, out);
- GF_VALIDATE_OR_GOTO ("server", this, out);
-
- if (frame->root->trans)
- server_conn_unref (frame->root->trans);
- state = CALL_STATE(frame);
-
- if (state)
- free_state (state);
- STACK_DESTROY (frame->root);
-
- ret = 0;
-out:
- return ret;
-}
-
-int
-do_lock_table_cleanup (xlator_t *this, server_connection_t *conn,
- call_frame_t *frame, struct _lock_table *ltable)
-{
- struct list_head inodelk_lockers, entrylk_lockers;
- call_frame_t *tmp_frame = NULL;
- struct gf_flock flock = {0, };
- xlator_t *bound_xl = NULL;
- struct _locker *locker = NULL, *tmp = NULL;
- int ret = -1;
- char *path = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
- GF_VALIDATE_OR_GOTO ("server", frame, out);
- GF_VALIDATE_OR_GOTO ("server", ltable, out);
-
- bound_xl = conn->bound_xl;
- INIT_LIST_HEAD (&inodelk_lockers);
- INIT_LIST_HEAD (&entrylk_lockers);
-
- list_splice_init (&ltable->inodelk_lockers,
- &inodelk_lockers);
-
- list_splice_init (&ltable->entrylk_lockers, &entrylk_lockers);
- GF_FREE (ltable);
-
- flock.l_type = F_UNLCK;
- flock.l_start = 0;
- flock.l_len = 0;
- list_for_each_entry_safe (locker,
- tmp, &inodelk_lockers, lockers) {
- tmp_frame = copy_frame (frame);
- if (tmp_frame == NULL) {
- goto out;
- }
- /*
- lock owner = 0 is a special case that tells posix-locks
- to release all locks from this transport
- */
- tmp_frame->root->pid = 0;
- tmp_frame->root->trans = server_conn_ref (conn);
- memset (&tmp_frame->root->lk_owner, 0, sizeof (gf_lkowner_t));
-
- if (locker->fd) {
- GF_ASSERT (locker->fd->inode);
-
- ret = inode_path (locker->fd->inode, NULL, &path);
-
- if (ret > 0) {
- gf_log (this->name, GF_LOG_INFO, "finodelk "
- "released on %s", path);
- GF_FREE (path);
- } else {
-
- gf_log (this->name, GF_LOG_INFO, "finodelk "
- "released on inode with gfid %s",
- uuid_utoa (locker->fd->inode->gfid));
- }
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->finodelk,
- locker->volume,
- locker->fd, F_SETLK, &flock, NULL);
- fd_unref (locker->fd);
- } else {
- gf_log (this->name, GF_LOG_INFO, "inodelk released "
- "on %s", locker->loc.path);
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->inodelk,
- locker->volume,
- &(locker->loc), F_SETLK, &flock, NULL);
- loc_wipe (&locker->loc);
- }
-
- GF_FREE (locker->volume);
-
- list_del_init (&locker->lockers);
- GF_FREE (locker);
- }
-
- tmp = NULL;
- locker = NULL;
- list_for_each_entry_safe (locker, tmp, &entrylk_lockers, lockers) {
- tmp_frame = copy_frame (frame);
-
- tmp_frame->root->pid = 0;
- tmp_frame->root->trans = server_conn_ref (conn);
- memset (&tmp_frame->root->lk_owner, 0, sizeof (gf_lkowner_t));
-
- if (locker->fd) {
- GF_ASSERT (locker->fd->inode);
-
- ret = inode_path (locker->fd->inode, NULL, &path);
-
- if (ret > 0) {
- gf_log (this->name, GF_LOG_INFO, "fentrylk "
- "released on %s", path);
- GF_FREE (path);
- } else {
-
- gf_log (this->name, GF_LOG_INFO, "fentrylk "
- "released on inode with gfid %s",
- uuid_utoa (locker->fd->inode->gfid));
- }
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->fentrylk,
- locker->volume,
- locker->fd, NULL,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
- fd_unref (locker->fd);
- } else {
- gf_log (this->name, GF_LOG_INFO, "entrylk released "
- "on %s", locker->loc.path);
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->entrylk,
- locker->volume,
- &(locker->loc), NULL,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
- loc_wipe (&locker->loc);
- }
-
- GF_FREE (locker->volume);
-
- list_del_init (&locker->lockers);
- GF_FREE (locker);
- }
- ret = 0;
-
-out:
- return ret;
-}
-
-
static int
server_connection_cleanup_flush_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
{
- int32_t ret = -1;
- fd_t *fd = NULL;
+ int32_t ret = -1;
+ fd_t *fd = NULL;
+ client_t *client = NULL;
GF_VALIDATE_OR_GOTO ("server", this, out);
GF_VALIDATE_OR_GOTO ("server", cookie, out);
GF_VALIDATE_OR_GOTO ("server", frame, out);
fd = frame->local;
+ client = frame->root->client;
fd_unref (fd);
frame->local = NULL;
- if (frame->root->trans)
- server_conn_unref (frame->root->trans);
+ gf_client_unref (client);
STACK_DESTROY (frame->root);
ret = 0;
@@ -426,9 +151,8 @@ out:
}
-int
-do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame,
- fdentry_t *fdentries, int fd_count)
+static int
+do_fd_cleanup (xlator_t *this, client_t* client, fdentry_t *fdentries, int fd_count)
{
fd_t *fd = NULL;
int i = 0, ret = -1;
@@ -437,16 +161,14 @@ do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame,
char *path = NULL;
GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
- GF_VALIDATE_OR_GOTO ("server", frame, out);
GF_VALIDATE_OR_GOTO ("server", fdentries, out);
- bound_xl = conn->bound_xl;
+ bound_xl = client->bound_xl;
for (i = 0;i < fd_count; i++) {
fd = fdentries[i].fd;
if (fd != NULL) {
- tmp_frame = copy_frame (frame);
+ tmp_frame = create_frame (this, this->ctx->pool);
if (tmp_frame == NULL) {
goto out;
}
@@ -456,20 +178,20 @@ do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame,
ret = inode_path (fd->inode, NULL, &path);
if (ret > 0) {
- gf_log (this->name, GF_LOG_INFO, "fd cleanup on "
- "%s", path);
+ gf_log (this->name, GF_LOG_INFO,
+ "fd cleanup on %s", path);
GF_FREE (path);
} else {
- gf_log (this->name, GF_LOG_INFO, "fd cleanup on"
- " inode with gfid %s",
+ gf_log (this->name, GF_LOG_INFO,
+ "fd cleanup on inode with gfid %s",
uuid_utoa (fd->inode->gfid));
}
tmp_frame->local = fd;
tmp_frame->root->pid = 0;
- tmp_frame->root->trans = server_conn_ref (conn);
+ gf_client_ref (client);
memset (&tmp_frame->root->lk_owner, 0,
sizeof (gf_lkowner_t));
@@ -486,268 +208,72 @@ out:
return ret;
}
-int
-do_connection_cleanup (xlator_t *this, server_connection_t *conn,
- struct _lock_table *ltable, fdentry_t *fdentries, int fd_count)
-{
- int ret = 0;
- int saved_ret = 0;
- call_frame_t *frame = NULL;
- server_state_t *state = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
-
- if (!ltable && !fdentries)
- goto out;
-
- frame = create_frame (this, this->ctx->pool);
- if (frame == NULL) {
- goto out;
- }
-
- if (ltable)
- saved_ret = do_lock_table_cleanup (this, conn, frame, ltable);
-
- if (fdentries != NULL) {
- ret = do_fd_cleanup (this, conn, frame, fdentries, fd_count);
- }
-
- state = CALL_STATE (frame);
- GF_FREE (state);
-
- STACK_DESTROY (frame->root);
-
- if (saved_ret || ret) {
- ret = -1;
- }
-
-out:
- return ret;
-}
-
int
-server_connection_cleanup (xlator_t *this, server_connection_t *conn,
+server_connection_cleanup (xlator_t *this, client_t *client,
int32_t flags)
{
- struct _lock_table *ltable = NULL;
- fdentry_t *fdentries = NULL;
- uint32_t fd_count = 0;
- int ret = 0;
+ server_ctx_t *serv_ctx = NULL;
+ fdentry_t *fdentries = NULL;
+ uint32_t fd_count = 0;
+ int cd_ret = 0;
+ int ret = 0;
GF_VALIDATE_OR_GOTO (this->name, this, out);
- GF_VALIDATE_OR_GOTO (this->name, conn, out);
+ GF_VALIDATE_OR_GOTO (this->name, client, out);
GF_VALIDATE_OR_GOTO (this->name, flags, out);
- pthread_mutex_lock (&conn->lock);
- {
- if (conn->ltable && (flags & INTERNAL_LOCKS)) {
- ltable = conn->ltable;
- conn->ltable = gf_lock_table_new ();
- }
-
- if (conn->fdtable && (flags & POSIX_LOCKS))
- fdentries = gf_fd_fdtable_get_all_fds (conn->fdtable,
- &fd_count);
- }
- pthread_mutex_unlock (&conn->lock);
-
- if (conn->bound_xl)
- ret = do_connection_cleanup (this, conn, ltable,
- fdentries, fd_count);
-
-out:
- return ret;
-}
-
-
-int
-server_connection_destroy (xlator_t *this, server_connection_t *conn)
-{
- xlator_t *bound_xl = NULL;
- int32_t ret = -1;
- struct list_head inodelk_lockers;
- struct list_head entrylk_lockers;
- struct _lock_table *ltable = NULL;
- fdtable_t *fdtable = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
-
- bound_xl = (xlator_t *) (conn->bound_xl);
+ serv_ctx = server_ctx_get (client, client->this);
- if (bound_xl) {
- pthread_mutex_lock (&(conn->lock));
- {
- if (conn->ltable) {
- ltable = conn->ltable;
- conn->ltable = NULL;
- }
- if (conn->fdtable) {
- fdtable = conn->fdtable;
- conn->fdtable = NULL;
- }
- }
- pthread_mutex_unlock (&conn->lock);
-
- INIT_LIST_HEAD (&inodelk_lockers);
- INIT_LIST_HEAD (&entrylk_lockers);
-
- if (ltable) {
- list_splice_init (&ltable->inodelk_lockers,
- &inodelk_lockers);
-
- list_splice_init (&ltable->entrylk_lockers,
- &entrylk_lockers);
- GF_FREE (ltable);
- }
-
- GF_ASSERT (list_empty (&inodelk_lockers));
- GF_ASSERT (list_empty (&entrylk_lockers));
-
- if (fdtable)
- gf_fd_fdtable_destroy (fdtable);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
}
- gf_log (this->name, GF_LOG_INFO, "destroyed connection of %s",
- conn->id);
-
- pthread_mutex_destroy (&conn->lock);
- GF_FREE (conn->id);
- GF_FREE (conn);
- ret = 0;
-out:
- return ret;
-}
-
-server_connection_t*
-server_conn_unref (server_connection_t *conn)
-{
- server_connection_t *todel = NULL;
- xlator_t *this = NULL;
-
- pthread_mutex_lock (&conn->lock);
+ LOCK (&serv_ctx->fdtable_lock);
{
- conn->ref--;
-
- if (!conn->ref) {
- todel = conn;
- }
+ if (serv_ctx->fdtable && (flags & POSIX_LOCKS))
+ fdentries = gf_fd_fdtable_get_all_fds (serv_ctx->fdtable,
+ &fd_count);
}
- pthread_mutex_unlock (&conn->lock);
+ UNLOCK (&serv_ctx->fdtable_lock);
- if (todel) {
- this = THIS;
- server_connection_destroy (this, todel);
- conn = NULL;
- }
- return conn;
-}
+ if (client->bound_xl == NULL)
+ goto out;
-server_connection_t*
-server_conn_ref (server_connection_t *conn)
-{
- pthread_mutex_lock (&conn->lock);
- {
- conn->ref++;
+ if (flags & INTERNAL_LOCKS) {
+ cd_ret = gf_client_disconnect (client);
}
- pthread_mutex_unlock (&conn->lock);
-
- return conn;
-}
-server_connection_t *
-server_connection_get (xlator_t *this, const char *id)
-{
- server_connection_t *conn = NULL;
- server_connection_t *trav = NULL;
- server_conf_t *conf = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", id, out);
+ if (fdentries != NULL)
+ ret = do_fd_cleanup (this, client, fdentries, fd_count);
+ else
+ gf_log (this->name, GF_LOG_INFO, "no fdentries to clean");
- conf = this->private;
-
- pthread_mutex_lock (&conf->mutex);
- {
- list_for_each_entry (trav, &conf->conns, list) {
- if (!strcmp (trav->id, id)) {
- conn = trav;
- conn->bind_ref++;
- goto unlock;
- }
- }
-
- conn = (void *) GF_CALLOC (1, sizeof (*conn),
- gf_server_mt_conn_t);
- if (!conn)
- goto unlock;
-
- conn->id = gf_strdup (id);
- /*'0' denotes uninitialised lock state*/
- conn->lk_version = 0;
- conn->fdtable = gf_fd_fdtable_alloc ();
- conn->ltable = gf_lock_table_new ();
- conn->this = this;
- conn->bind_ref = 1;
- conn->ref = 1;//when bind_ref becomes 0 it calls conn_unref
- pthread_mutex_init (&conn->lock, NULL);
- list_add (&conn->list, &conf->conns);
+ if (cd_ret || ret)
+ ret = -1;
- }
-unlock:
- pthread_mutex_unlock (&conf->mutex);
out:
- return conn;
+ return ret;
}
-server_connection_t*
-server_connection_put (xlator_t *this, server_connection_t *conn,
- gf_boolean_t *detached)
-{
- server_conf_t *conf = NULL;
- gf_boolean_t unref = _gf_false;
-
- if (detached)
- *detached = _gf_false;
- conf = this->private;
- pthread_mutex_lock (&conf->mutex);
- {
- conn->bind_ref--;
- if (!conn->bind_ref) {
- list_del_init (&conn->list);
- unref = _gf_true;
- }
- }
- pthread_mutex_unlock (&conf->mutex);
- if (unref) {
- gf_log (this->name, GF_LOG_INFO, "Shutting down connection %s",
- conn->id);
- if (detached)
- *detached = _gf_true;
- server_conn_unref (conn);
- conn = NULL;
- }
- return conn;
-}
static call_frame_t *
server_alloc_frame (rpcsvc_request_t *req)
{
- call_frame_t *frame = NULL;
- server_state_t *state = NULL;
- server_connection_t *conn = NULL;
+ call_frame_t *frame = NULL;
+ server_state_t *state = NULL;
+ client_t *client = NULL;
GF_VALIDATE_OR_GOTO ("server", req, out);
GF_VALIDATE_OR_GOTO ("server", req->trans, out);
GF_VALIDATE_OR_GOTO ("server", req->svc, out);
GF_VALIDATE_OR_GOTO ("server", req->svc->ctx, out);
- conn = (server_connection_t *)req->trans->xl_private;
- GF_VALIDATE_OR_GOTO ("server", conn, out);
+ client = req->trans->xl_private;
+ GF_VALIDATE_OR_GOTO ("server", client, out);
- frame = create_frame (conn->this, req->svc->ctx->pool);
+ frame = create_frame (client->this, req->svc->ctx->pool);
if (!frame)
goto out;
@@ -755,45 +281,102 @@ server_alloc_frame (rpcsvc_request_t *req)
if (!state)
goto out;
- if (conn->bound_xl)
- state->itable = conn->bound_xl->itable;
+ if (client->bound_xl)
+ state->itable = client->bound_xl->itable;
state->xprt = rpc_transport_ref (req->trans);
- state->conn = conn;
-
state->resolve.fd_no = -1;
state->resolve2.fd_no = -1;
+ frame->root->client = client;
frame->root->state = state; /* which socket */
frame->root->unique = 0; /* which call */
- frame->this = conn->this;
+ frame->this = client->this;
out:
return frame;
}
-
call_frame_t *
get_frame_from_request (rpcsvc_request_t *req)
{
- call_frame_t *frame = NULL;
+ call_frame_t *frame = NULL;
+ client_t *client = NULL;
+ client_t *tmp_client = NULL;
+ xlator_t *this = NULL;
+ server_conf_t *priv = NULL;
+ clienttable_t *clienttable = NULL;
+ unsigned int i = 0;
GF_VALIDATE_OR_GOTO ("server", req, out);
+ client = req->trans->xl_private;
+
frame = server_alloc_frame (req);
if (!frame)
goto out;
frame->root->op = req->procnum;
- frame->root->type = req->type;
frame->root->unique = req->xid;
+ client = req->trans->xl_private;
+ this = req->trans->xl;
+ priv = this->private;
+ clienttable = this->ctx->clienttable;
+
+ for (i = 0; i < clienttable->max_clients; i++) {
+ tmp_client = clienttable->cliententries[i].client;
+ if (client == tmp_client) {
+ /* for non trusted clients username and password
+ would not have been set. So for non trusted clients
+ (i.e clients not from the same machine as the brick,
+ and clients from outside the storage pool)
+ do the root-squashing.
+ TODO: If any client within the storage pool (i.e
+ mounting within a machine from the pool but using
+ other machine's ip/hostname from the same pool)
+ is present treat it as a trusted client
+ */
+ if (!client->auth.username && req->pid != NFS_PID)
+ RPC_AUTH_ROOT_SQUASH (req);
+
+ /* Problem: If we just check whether the client is
+ trusted client and do not do root squashing for
+ them, then for smb clients and UFO clients root
+ squashing will never happen as they use the fuse
+ mounts done within the trusted pool (i.e they are
+ trusted clients).
+ Solution: To fix it, do root squashing for trusted
+ clients also. If one wants to have a client within
+ the storage pool for which root-squashing does not
+ happen, then the client has to be mounted with
+ --no-root-squash option. But for defrag client and
+ gsyncd client do not do root-squashing.
+ */
+ if (client->auth.username &&
+ req->pid != GF_CLIENT_PID_NO_ROOT_SQUASH &&
+ req->pid != GF_CLIENT_PID_GSYNCD &&
+ req->pid != GF_CLIENT_PID_DEFRAG)
+ RPC_AUTH_ROOT_SQUASH (req);
+
+ /* For nfs clients the server processes will be running
+ within the trusted storage pool machines. So if we
+ do not do root-squashing for nfs servers, thinking
+ that its a trusted client, then root-squashing wont
+ work for nfs clients.
+ */
+ if (req->pid == NFS_PID)
+ RPC_AUTH_ROOT_SQUASH (req);
+ }
+ }
+
frame->root->uid = req->uid;
frame->root->gid = req->gid;
frame->root->pid = req->pid;
- frame->root->trans = server_conn_ref (req->trans->xl_private);
+ gf_client_ref (client);
+ frame->root->client = client;
frame->root->lk_owner = req->lk_owner;
server_decode_groups (frame, req);
@@ -878,84 +461,6 @@ out:
return ret;
}
-void
-put_server_conn_state (xlator_t *this, rpc_transport_t *xprt)
-{
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", xprt, out);
-
- xprt->xl_private = NULL;
-out:
- return;
-}
-
-server_connection_t *
-get_server_conn_state (xlator_t *this, rpc_transport_t *xprt)
-{
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", xprt, out);
-
- return (server_connection_t *)xprt->xl_private;
-out:
- return NULL;
-}
-
-server_connection_t *
-create_server_conn_state (xlator_t *this, rpc_transport_t *xprt)
-{
- server_connection_t *conn = NULL;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", xprt, out);
-
- conn = GF_CALLOC (1, sizeof (*conn), gf_server_mt_conn_t);
- if (!conn)
- goto out;
-
- pthread_mutex_init (&conn->lock, NULL);
-
- conn->fdtable = gf_fd_fdtable_alloc ();
- if (!conn->fdtable)
- goto out;
-
- conn->ltable = gf_lock_table_new ();
- if (!conn->ltable)
- goto out;
-
- conn->this = this;
-
- xprt->xl_private = conn;
-
- ret = 0;
-out:
- if (ret)
- destroy_server_conn_state (conn);
-
- return conn;
-}
-
-void
-destroy_server_conn_state (server_connection_t *conn)
-{
- GF_VALIDATE_OR_GOTO ("server", conn, out);
-
- if (conn->ltable) {
- /* TODO */
- //FREE (conn->ltable);
- ;
- }
-
- if (conn->fdtable)
- gf_fd_fdtable_destroy (conn->fdtable);
-
- pthread_mutex_destroy (&conn->lock);
-
- GF_FREE (conn);
-out:
- return;
-}
-
void
print_caller (char *str, int size, call_frame_t *frame)
@@ -1082,12 +587,15 @@ server_print_params (char *str, int size, server_state_t *state)
filled += snprintf (str + filled, size - filled,
"volume=%s,", state->volume);
+/* FIXME
snprintf (str + filled, size - filled,
- "bound_xl=%s}", state->conn->bound_xl->name);
+ "bound_xl=%s}", state->client->bound_xl->name);
+*/
out:
return;
}
+
int
server_resolve_is_empty (server_resolve_t *resolve)
{
@@ -1103,6 +611,7 @@ server_resolve_is_empty (server_resolve_t *resolve)
return 1;
}
+
void
server_print_reply (call_frame_t *frame, int op_ret, int op_errno)
{
@@ -1148,16 +657,16 @@ out:
void
server_print_request (call_frame_t *frame)
{
- server_conf_t *conf = NULL;
- xlator_t *this = NULL;
+ server_conf_t *conf = NULL;
+ xlator_t *this = NULL;
server_state_t *state = NULL;
+ char *op = "UNKNOWN";
char resolve_vars[256];
char resolve2_vars[256];
char loc_vars[256];
char loc2_vars[256];
char other_vars[512];
char caller[512];
- char *op = "UNKNOWN";
GF_VALIDATE_OR_GOTO ("server", frame, out);
@@ -1208,13 +717,14 @@ out:
return;
}
+
int
serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp)
{
gf_dirent_t *entry = NULL;
- gfs3_dirplist *trav = NULL;
- gfs3_dirplist *prev = NULL;
- int ret = -1;
+ gfs3_dirplist *trav = NULL;
+ gfs3_dirplist *prev = NULL;
+ int ret = -1;
GF_VALIDATE_OR_GOTO ("server", entries, out);
GF_VALIDATE_OR_GOTO ("server", rsp, out);
@@ -1235,7 +745,7 @@ serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp)
/* if 'dict' is present, pack it */
if (entry->dict) {
trav->dict.dict_len = dict_serialized_length (entry->dict);
- if (trav->dict.dict_len < 0) {
+ if (trav->dict.dict_len > UINT_MAX) {
gf_log (THIS->name, GF_LOG_ERROR,
"failed to get serialized length "
"of reply dict");
@@ -1282,10 +792,10 @@ out:
int
serialize_rsp_dirent (gf_dirent_t *entries, gfs3_readdir_rsp *rsp)
{
- gf_dirent_t *entry = NULL;
- gfs3_dirlist *trav = NULL;
- gfs3_dirlist *prev = NULL;
- int ret = -1;
+ gf_dirent_t *entry = NULL;
+ gfs3_dirlist *trav = NULL;
+ gfs3_dirlist *prev = NULL;
+ int ret = -1;
GF_VALIDATE_OR_GOTO ("server", entries, out);
GF_VALIDATE_OR_GOTO ("server", rsp, out);
@@ -1312,11 +822,12 @@ out:
return ret;
}
+
int
readdir_rsp_cleanup (gfs3_readdir_rsp *rsp)
{
- gfs3_dirlist *prev = NULL;
- gfs3_dirlist *trav = NULL;
+ gfs3_dirlist *prev = NULL;
+ gfs3_dirlist *trav = NULL;
trav = rsp->reply;
prev = trav;
@@ -1329,6 +840,7 @@ readdir_rsp_cleanup (gfs3_readdir_rsp *rsp)
return 0;
}
+
int
readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp)
{
@@ -1347,6 +859,7 @@ readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp)
return 0;
}
+
int
gf_server_check_getxattr_cmd (call_frame_t *frame, const char *key)
{
@@ -1375,13 +888,14 @@ gf_server_check_getxattr_cmd (call_frame_t *frame, const char *key)
return 0;
}
+
int
gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict)
{
- server_conf_t *conf = NULL;
- rpc_transport_t *xprt = NULL;
- uint64_t total_read = 0;
+ server_conf_t *conf = NULL;
+ rpc_transport_t *xprt = NULL;
+ uint64_t total_read = 0;
uint64_t total_write = 0;
conf = frame->this->private;
@@ -1402,32 +916,504 @@ gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict)
return 0;
}
+
gf_boolean_t
-server_cancel_conn_timer (xlator_t *this, server_connection_t *conn)
+server_cancel_grace_timer (xlator_t *this, client_t *client)
{
- gf_timer_t *timer = NULL;
- gf_boolean_t cancelled = _gf_false;
+ server_ctx_t *serv_ctx = NULL;
+ gf_timer_t *timer = NULL;
+ gf_boolean_t cancelled = _gf_false;
- if (!this || !conn) {
- gf_log (THIS->name, GF_LOG_ERROR, "Invalid arguments to "
- "cancel connection timer");
+ if (!this || !client) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Invalid arguments to cancel connection timer");
return cancelled;
}
- pthread_mutex_lock (&conn->lock);
- {
- if (!conn->timer)
- goto unlock;
+ serv_ctx = server_ctx_get (client, client->this);
- timer = conn->timer;
- conn->timer = NULL;
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
}
-unlock:
- pthread_mutex_unlock (&conn->lock);
+
+ LOCK (&serv_ctx->fdtable_lock);
+ {
+ if (serv_ctx->grace_timer) {
+ timer = serv_ctx->grace_timer;
+ serv_ctx->grace_timer = NULL;
+ }
+ }
+ UNLOCK (&serv_ctx->fdtable_lock);
if (timer) {
gf_timer_call_cancel (this->ctx, timer);
cancelled = _gf_true;
}
+out:
return cancelled;
}
+
+/*
+ * Return the server context stored on 'client' for 'xlator', creating and
+ * attaching a fresh one (with its own fd table and fdtable_lock) when none
+ * is stored yet.
+ *
+ * Returns NULL when the context or its fd table cannot be allocated, or when
+ * the new context cannot be attached to the client; in the latter case every
+ * resource built here is released again.
+ */
+server_ctx_t*
+server_ctx_get (client_t *client, xlator_t *xlator)
+{
+        void         *tmp = NULL;
+        server_ctx_t *ctx = NULL;
+
+        client_ctx_get (client, xlator, &tmp);
+
+        ctx = tmp;
+
+        if (ctx != NULL)
+                goto out;
+
+        ctx = GF_CALLOC (1, sizeof (server_ctx_t), gf_server_mt_server_conf_t);
+
+        if (ctx == NULL)
+                goto out;
+
+        /* ctx->lk_version = 0; redundant */
+        ctx->fdtable = gf_fd_fdtable_alloc ();
+
+        if (ctx->fdtable == NULL) {
+                GF_FREE (ctx);
+                ctx = NULL;
+                goto out;
+        }
+
+        LOCK_INIT (&ctx->fdtable_lock);
+
+        if (client_ctx_set (client, xlator, ctx) != 0) {
+                /* undo everything built above, including the fd table,
+                   which would otherwise be leaked */
+                LOCK_DESTROY (&ctx->fdtable_lock);
+                gf_fd_fdtable_destroy (ctx->fdtable);
+                GF_FREE (ctx);
+                ctx = NULL;
+        }
+
+out:
+        return ctx;
+}
+
+/*
+ * Store the login credentials sent by a client on 'client->auth' once they
+ * match the configuration.
+ *
+ * The "username" from 'input_params' is matched (fnmatch) against each entry
+ * of the "auth.login.<remote-subvolume>.allow" option in 'config_params';
+ * for the first matching entry the supplied "password" is compared with the
+ * "auth.login.<username>.password" option.
+ *
+ * Returns 0 when no username was supplied or when the credentials matched
+ * and were copied; -1 when "remote-subvolume" is missing, when the password
+ * option for the user is missing, when the password is wrong, or when the
+ * password key cannot be built; the dict_get_str() result when "password"
+ * is missing.
+ *
+ * NOTE(review): when there is no allow option, or no allow entry matches,
+ * 'ret' still holds whatever the first gf_asprintf() returned (presumably
+ * the formatted length, i.e. non-zero) -- confirm what callers expect.
+ */
+int
+auth_set_username_passwd (dict_t *input_params, dict_t *config_params,
+                          client_t *client)
+{
+        int      ret          = 0;
+        data_t  *allow_user   = NULL;
+        data_t  *passwd_data  = NULL;
+        char    *username     = NULL;
+        char    *password     = NULL;
+        char    *brick_name   = NULL;
+        char    *searchstr    = NULL;
+        char    *username_str = NULL;
+        char    *tmp          = NULL;
+        char    *username_cpy = NULL;
+
+        ret = dict_get_str (input_params, "username", &username);
+        if (ret) {
+                gf_log ("auth/login", GF_LOG_DEBUG,
+                        "username not found, returning DONT-CARE");
+                /* For non trusted clients username and password
+                   will not be there. So dont reject the client.
+                */
+                ret = 0;
+                goto out;
+        }
+
+        ret = dict_get_str (input_params, "password", &password);
+        if (ret) {
+                gf_log ("auth/login", GF_LOG_WARNING,
+                        "password not found, returning DONT-CARE");
+                goto out;
+        }
+
+        ret = dict_get_str (input_params, "remote-subvolume", &brick_name);
+        if (ret) {
+                gf_log ("auth/login", GF_LOG_ERROR,
+                        "remote-subvolume not specified");
+                ret = -1;
+                goto out;
+        }
+
+        ret = gf_asprintf (&searchstr, "auth.login.%s.allow", brick_name);
+        if (-1 == ret) {
+                ret = 0;
+                goto out;
+        }
+
+        allow_user = dict_get (config_params, searchstr);
+        GF_FREE (searchstr);
+
+        if (allow_user) {
+                /* strtok_r() modifies its input, so tokenize a copy */
+                username_cpy = gf_strdup (allow_user->data);
+                if (!username_cpy)
+                        goto out;
+
+                username_str = strtok_r (username_cpy, " ,", &tmp);
+
+                while (username_str) {
+                        if (!fnmatch (username_str, username, 0)) {
+                                ret = gf_asprintf (&searchstr,
+                                                   "auth.login.%s.password",
+                                                   username);
+                                if (-1 == ret)
+                                        goto out;
+
+                                passwd_data = dict_get (config_params,
+                                                        searchstr);
+                                GF_FREE (searchstr);
+
+                                if (!passwd_data) {
+                                        gf_log ("auth/login", GF_LOG_ERROR,
+                                                "wrong username/password "
+                                                "combination");
+                                        ret = -1;
+                                        goto out;
+                                }
+
+                                /* The previous "!(strcmp ? 0 : -1)" form could
+                                   only yield 0 or 1, so the wrong-password
+                                   log below was unreachable. */
+                                if (strcmp (data_to_str (passwd_data),
+                                            password) == 0) {
+                                        ret = 0;
+                                        client->auth.username =
+                                                gf_strdup (username);
+                                        client->auth.passwd =
+                                                gf_strdup (password);
+                                } else {
+                                        ret = -1;
+                                        gf_log ("auth/login", GF_LOG_ERROR,
+                                                "wrong password for user %s",
+                                                username);
+                                }
+                                break;
+                        }
+                        username_str = strtok_r (NULL, " ,", &tmp);
+                }
+        }
+
+out:
+        GF_FREE (username_cpy);
+
+        return ret;
+}
+
+/*
+ * Submit one queued (barriered) reply and release what the payload holds.
+ *
+ * The reply is handed to rpcsvc_submit_generic(); the payload's iobuf is
+ * unreferenced right after.  If submission returns -1 and the client does
+ * not have lk_heal set, the client's connection is cleaned up.  In every
+ * case past validation the call state is freed, the client reference
+ * dropped and the frame's stack destroyed, and the iobref is unreferenced
+ * when the payload asks for it.
+ *
+ * Returns 0 unless validation fails or submission returns -1, in which
+ * case -1 is returned.
+ */
+int32_t
+gf_barrier_transmit (server_conf_t *conf, gf_barrier_payload_t *payload)
+{
+        int32_t         rc      = -1;
+        call_frame_t   *frame   = NULL;
+        server_state_t *state   = NULL;
+        client_t       *client  = NULL;
+        gf_boolean_t    lk_heal = _gf_false;
+
+        GF_VALIDATE_OR_GOTO ("barrier", conf, out);
+        GF_VALIDATE_OR_GOTO ("barrier", conf->barrier, out);
+        GF_VALIDATE_OR_GOTO ("barrier", payload, out);
+
+        frame = payload->frame;
+        if (frame != NULL) {
+                state        = CALL_STATE (frame);
+                frame->local = NULL;
+                client       = frame->root->client;
+        }
+
+        /* currently lk fops are not barrier'ed. This is reflecting code in
+         * server_submit_reply */
+        if (client != NULL)
+                lk_heal = ((server_conf_t *) client->this->private)->lk_heal;
+
+        rc = rpcsvc_submit_generic (payload->req, &payload->rsp, 1,
+                                    payload->payload, payload->payload_count,
+                                    payload->iobref);
+        iobuf_unref (payload->iob);
+
+        if (rc != -1) {
+                rc = 0;
+        } else {
+                gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed");
+                if (frame && client && !lk_heal)
+                        server_connection_cleanup (frame->this, client,
+                                                  INTERNAL_LOCKS | POSIX_LOCKS);
+                /* TODO: Failure of open(dir), create, inodelk, entrylk
+                   or lk fops send failure must be handled specially. */
+        }
+
+        if (state != NULL)
+                free_state (state);
+
+        if (frame != NULL) {
+                gf_client_unref (client);
+                STACK_DESTROY (frame->root);
+        }
+
+        if (payload->free_iobref)
+                iobref_unref (payload->iobref);
+out:
+        return rc;
+}
+
+/*
+ * Detach and return the first payload on the barrier queue, decrementing
+ * cur_size.  Returns NULL when 'barrier' is NULL or its queue is empty.
+ * The function takes no lock itself.
+ */
+gf_barrier_payload_t *
+gf_barrier_dequeue (gf_barrier_t *barrier)
+{
+        gf_barrier_payload_t *head = NULL;
+
+        if (barrier == NULL)
+                return NULL;
+
+        if (list_empty (&barrier->queue))
+                return NULL;
+
+        head = list_entry (barrier->queue.next, gf_barrier_payload_t, list);
+        if (head == NULL)
+                return head;
+
+        list_del_init (&head->list);
+        barrier->cur_size--;
+
+        return head;
+}
+
+
+/*
+ * Drain the barrier queue: while cur_size is non-zero, dequeue a payload,
+ * transmit it and free it.  The whole drain runs with barrier->lock held.
+ *
+ * 'data' is the server_conf_t owning the barrier.  The signature matches a
+ * thread start routine; the return value is always NULL.
+ */
+void*
+gf_barrier_dequeue_start (void *data)
+{
+        server_conf_t        *conf    = data;
+        gf_barrier_t         *barrier = NULL;
+        gf_barrier_payload_t *item    = NULL;
+
+        if (conf == NULL)
+                return NULL;
+
+        barrier = conf->barrier;
+        if (barrier == NULL)
+                return NULL;
+
+        LOCK (&barrier->lock);
+        {
+                while (barrier->cur_size) {
+                        item = gf_barrier_dequeue (barrier);
+                        if (item == NULL)
+                                continue;
+
+                        if (gf_barrier_transmit (conf, item) != 0)
+                                gf_log ("server", GF_LOG_WARNING,
+                                        "Failed to transmit");
+
+                        GF_FREE (item);
+                }
+        }
+        UNLOCK (&barrier->lock);
+
+        return NULL;
+}
+
+/*
+ * Timer callback for the barrier: switch the barrier off and, if it was on,
+ * drain the queued replies in the calling thread.
+ *
+ * 'data' is the server_conf_t owning the barrier; nothing happens when it
+ * or its barrier is NULL.
+ */
+void
+gf_barrier_timeout (void *data)
+{
+        server_conf_t *conf    = data;
+        gf_barrier_t  *barrier = NULL;
+        gf_boolean_t   was_on  = _gf_false;
+
+        if (conf == NULL || conf->barrier == NULL)
+                return;
+
+        barrier = conf->barrier;
+
+        gf_log ("", GF_LOG_INFO, "barrier timed-out");
+
+        /* read and clear the flag atomically with respect to the lock */
+        LOCK (&barrier->lock);
+        {
+                was_on      = barrier->on;
+                barrier->on = _gf_false;
+        }
+        UNLOCK (&barrier->lock);
+
+        if (was_on == _gf_true)
+                gf_barrier_dequeue_start (data);
+}
+
+
+/*
+ * Turn the barrier on and arm its timeout timer.
+ *
+ * If the barrier is already on, the running timer is cancelled first and a
+ * new one is armed for barrier->time_out seconds; gf_barrier_timeout() is
+ * the timer callback.
+ *
+ * Returns 0 on success.  Returns -1 when 'this', its private conf or the
+ * barrier is missing, when the running timer cannot be cancelled, or when
+ * the new timer cannot be created (the barrier is then switched off again).
+ */
+int32_t
+gf_barrier_start (xlator_t *this)
+{
+        server_conf_t   *conf    = NULL;
+        gf_barrier_t    *barrier = NULL;
+        int32_t          ret     = -1;
+        struct timespec  delay   = {0,};
+
+        /* validate 'this' before touching this->private */
+        GF_VALIDATE_OR_GOTO ("server", this, out);
+
+        conf = this->private;
+
+        GF_VALIDATE_OR_GOTO (this->name, conf, out);
+        GF_VALIDATE_OR_GOTO (this->name, conf->barrier, out);
+
+        barrier = conf->barrier;
+
+        gf_log (this->name, GF_LOG_INFO, "barrier start called");
+        LOCK (&barrier->lock);
+        {
+                /* if barrier is on, reset timer */
+                if (barrier->on == _gf_true) {
+                        ret = gf_timer_call_cancel (this->ctx, barrier->timer);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR, "Failed to "
+                                        "unset timer, failing barrier start");
+                                ret = -1;
+                                goto unlock;
+                        }
+                }
+
+                barrier->on   = _gf_true;
+                delay.tv_sec  = barrier->time_out;
+                delay.tv_nsec = 0;
+
+                barrier->timer = gf_timer_call_after (this->ctx, delay,
+                                                      gf_barrier_timeout,
+                                                      (void *)conf);
+                if (!barrier->timer) {
+                        gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+                                "timer, failing barrier start");
+                        barrier->on = _gf_false;
+                        ret = -1;
+                        goto unlock;
+                }
+
+                /* only a fully armed barrier counts as success */
+                ret = 0;
+        }
+unlock:
+        UNLOCK (&barrier->lock);
+out:
+        return ret;
+}
+
+/*
+ * Turn the barrier off and, if it was on, cancel its timer and start a
+ * thread (gf_barrier_dequeue_start) that drains the queued replies.
+ *
+ * Returns 0 on success (including when the barrier was already off), -1
+ * when 'this', its private conf or the barrier is missing, and the
+ * gf_thread_create() result when the drain thread cannot be started.
+ */
+int32_t
+gf_barrier_stop (xlator_t *this)
+{
+        server_conf_t *conf         = NULL;
+        gf_barrier_t  *barrier      = NULL;
+        int32_t        ret          = -1;
+        gf_boolean_t   need_dequeue = _gf_false;
+
+        /* validate 'this' before touching this->private */
+        GF_VALIDATE_OR_GOTO ("server", this, out);
+
+        conf = this->private;
+
+        GF_VALIDATE_OR_GOTO (this->name, conf, out);
+        GF_VALIDATE_OR_GOTO (this->name, conf->barrier, out);
+
+        barrier = conf->barrier;
+
+        gf_log (this->name, GF_LOG_INFO, "barrier stop called");
+        LOCK (&barrier->lock);
+        {
+                need_dequeue = barrier->on;
+                barrier->on = _gf_false;
+        }
+        UNLOCK (&barrier->lock);
+
+        if (need_dequeue == _gf_true) {
+                gf_timer_call_cancel (this->ctx, barrier->timer);
+                ret = gf_thread_create (&conf->barrier_th, NULL,
+                                        gf_barrier_dequeue_start,
+                                        conf);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_CRITICAL,
+                                "Failed to start un-barriering");
+                        goto out;
+                }
+        }
+        ret = 0;
+out:
+        return ret;
+}
+
+/*
+ * Build the bitmask of fops to be barriered and store it in barrier->fops
+ * under barrier->lock.
+ *
+ * 'str' is a comma-separated list of fop names (writev, fsync, read,
+ * rename, flush, ftruncate, fallocate, rmdir); unknown names are logged and
+ * skipped.  When 'str' is NULL, cannot be duplicated, or holds no token,
+ * the default mask (fsync and flush) is stored instead.
+ *
+ * Always returns 0.
+ */
+int32_t
+gf_barrier_fops_configure (xlator_t *this, gf_barrier_t *barrier, char *str)
+{
+        int32_t   ret      = -1;
+        char     *dup_str  = NULL;
+        char     *str_tok  = NULL;
+        char     *save_ptr = NULL;
+        uint64_t  fops     = 0;
+
+        /* by default fsync & flush need to be barriered; shift a 64-bit one
+           so the mask is built the same way as in the loop below */
+        fops |= ((uint64_t)1 << GFS3_OP_FSYNC);
+        fops |= ((uint64_t)1 << GFS3_OP_FLUSH);
+
+        if (!str)
+                goto done;
+
+        /* strtok_r() modifies its input, so tokenize a copy */
+        dup_str = gf_strdup (str);
+        if (!dup_str)
+                goto done;
+
+        str_tok = strtok_r (dup_str, ",", &save_ptr);
+        if (!str_tok)
+                goto done;
+
+        /* an explicit list replaces the defaults */
+        fops = 0;
+        while (str_tok) {
+                if (!strcmp(str_tok, "writev")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_WRITE);
+                } else if (!strcmp(str_tok, "fsync")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_FSYNC);
+                } else if (!strcmp(str_tok, "read")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_READ);
+                } else if (!strcmp(str_tok, "rename")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_RENAME);
+                } else if (!strcmp(str_tok, "flush")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_FLUSH);
+                } else if (!strcmp(str_tok, "ftruncate")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_FTRUNCATE);
+                } else if (!strcmp(str_tok, "fallocate")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_FALLOCATE);
+                } else if (!strcmp(str_tok, "rmdir")) {
+                        fops |= ((uint64_t)1 << GFS3_OP_RMDIR);
+                } else {
+                        gf_log ("barrier", GF_LOG_ERROR,
+                                "Invalid barrier fop %s", str_tok);
+                }
+
+                str_tok = strtok_r (NULL, ",", &save_ptr);
+        }
+done:
+        LOCK (&barrier->lock);
+        {
+                barrier->fops = fops;
+        }
+        UNLOCK (&barrier->lock);
+        ret = 0;
+
+        GF_FREE (dup_str);
+        return ret;
+}
+
+/*
+ * Append 'payload' to the tail of the barrier queue and count it in
+ * cur_size.  The function takes no lock itself.
+ */
+void
+gf_barrier_enqueue (gf_barrier_t *barrier, gf_barrier_payload_t *payload)
+{
+        barrier->cur_size++;
+        list_add_tail (&payload->list, &barrier->queue);
+}
+
+/*
+ * Allocate a barrier payload describing one reply to be sent later.
+ *
+ * The iovec behind 'rsp' is copied into the payload; every other argument
+ * (request, frame, payload vector and count, iobref, iobuf, free_iobref
+ * flag) is stored as given.
+ *
+ * Returns the new payload, or NULL when 'rsp' is NULL or the allocation
+ * fails.
+ */
+gf_barrier_payload_t *
+gf_barrier_payload (rpcsvc_request_t *req, struct iovec *rsp,
+                    call_frame_t *frame, struct iovec *payload_orig,
+                    int payloadcount, struct iobref *iobref,
+                    struct iobuf *iob, gf_boolean_t free_iobref)
+{
+        gf_barrier_payload_t *entry = NULL;
+
+        if (rsp == NULL)
+                goto out;
+
+        entry = GF_CALLOC (1, sizeof (*entry),1);
+        if (entry == NULL)
+                goto out;
+
+        INIT_LIST_HEAD (&entry->list);
+
+        memcpy (&entry->rsp, rsp, sizeof (struct iovec));
+        entry->req           = req;
+        entry->frame         = frame;
+        entry->iob           = iob;
+        entry->iobref        = iobref;
+        entry->free_iobref   = free_iobref;
+        entry->payload       = payload_orig;
+        entry->payload_count = payloadcount;
+out:
+        return entry;
+}
diff --git a/xlators/protocol/server/src/server-helpers.h b/xlators/protocol/server/src/server-helpers.h
index 0b7424bab..486048b8a 100644
--- a/xlators/protocol/server/src/server-helpers.h
+++ b/xlators/protocol/server/src/server-helpers.h
@@ -15,13 +15,8 @@
#define CALL_STATE(frame) ((server_state_t *)frame->root->state)
-#define BOUND_XL(frame) ((xlator_t *) CALL_STATE(frame)->conn->bound_xl)
-
#define XPRT_FROM_FRAME(frame) ((rpc_transport_t *) CALL_STATE(frame)->xprt)
-#define SERVER_CONNECTION(frame) \
- ((server_connection_t *) CALL_STATE(frame)->conn)
-
#define SERVER_CONF(frame) \
((server_conf_t *)XPRT_FROM_FRAME(frame)->this->private)
@@ -34,45 +29,26 @@
#define IS_NOT_ROOT(pathlen) ((pathlen > 2)? 1 : 0)
+/* Non-zero when the bit for 'procnum' is set in the 64-bit mask 'fops'.
+   Arguments are parenthesized so expression arguments expand safely. */
+#define is_fop_barriered(fops, procnum) ((fops) & ((uint64_t)1 << (procnum)))
+
+/* Non-zero while the barrier is on or entries are still counted in its
+   queue (cur_size). */
+#define barrier_add_to_queue(barrier) ((barrier)->on || (barrier)->cur_size)
+
void free_state (server_state_t *state);
void server_loc_wipe (loc_t *loc);
-int32_t
-gf_add_locker (server_connection_t *conn, const char *volume,
- loc_t *loc,
- fd_t *fd,
- pid_t pid,
- gf_lkowner_t *owner,
- glusterfs_fop_t type);
-
-int32_t
-gf_del_locker (server_connection_t *conn, const char *volume,
- loc_t *loc,
- fd_t *fd,
- gf_lkowner_t *owner,
- glusterfs_fop_t type);
-
void
server_print_request (call_frame_t *frame);
call_frame_t *
get_frame_from_request (rpcsvc_request_t *req);
-gf_boolean_t
-server_cancel_conn_timer (xlator_t *this, server_connection_t *conn);
-
-void
-put_server_conn_state (xlator_t *this, rpc_transport_t *xprt);
-
-server_connection_t *
-get_server_conn_state (xlator_t *this, rpc_transport_t *xptr);
-
-server_connection_t *
-create_server_conn_state (xlator_t *this, rpc_transport_t *xptr);
+int
+server_connection_cleanup (xlator_t *this, struct _client_t *client,
+ int32_t flags);
-void
-destroy_server_conn_state (server_connection_t *conn);
+gf_boolean_t
+server_cancel_grace_timer (xlator_t *this, struct _client_t *client);
int
server_build_config (xlator_t *this, server_conf_t *conf);
@@ -81,5 +57,20 @@ int serialize_rsp_dirent (gf_dirent_t *entries, gfs3_readdir_rsp *rsp);
int serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp);
int readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp);
int readdir_rsp_cleanup (gfs3_readdir_rsp *rsp);
+int auth_set_username_passwd (dict_t *input_params, dict_t *config_params,
+ struct _client_t *client);
+
+server_ctx_t *server_ctx_get (client_t *client, xlator_t *xlator);
+
+int32_t gf_barrier_start (xlator_t *this);
+int32_t gf_barrier_stop (xlator_t *this);
+int32_t gf_barrier_fops_configure (xlator_t *this, gf_barrier_t *barrier,
+ char *str);
+void gf_barrier_enqueue (gf_barrier_t *barrier, gf_barrier_payload_t *stub);
+gf_barrier_payload_t *
+gf_barrier_payload (rpcsvc_request_t *req, struct iovec *rsp,
+ call_frame_t *frame, struct iovec *payload,
+ int payloadcount, struct iobref *iobref,
+ struct iobuf *iob, gf_boolean_t free_iobref);
#endif /* !_SERVER_HELPERS_H */
diff --git a/xlators/protocol/server/src/server-resolve.c b/xlators/protocol/server/src/server-resolve.c
index 4a91a4104..b2bff5c53 100644
--- a/xlators/protocol/server/src/server-resolve.c
+++ b/xlators/protocol/server/src/server-resolve.c
@@ -147,7 +147,8 @@ resolve_gfid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
(char **) &resolve_loc->path);
STACK_WIND (frame, resolve_gfid_entry_cbk,
- BOUND_XL (frame), BOUND_XL (frame)->fops->lookup,
+ frame->root->client->bound_xl,
+ frame->root->client->bound_xl->fops->lookup,
&resolve->resolve_loc, NULL);
return 0;
out:
@@ -179,7 +180,8 @@ resolve_gfid (call_frame_t *frame)
ret = loc_path (resolve_loc, NULL);
STACK_WIND (frame, resolve_gfid_cbk,
- BOUND_XL (frame), BOUND_XL (frame)->fops->lookup,
+ frame->root->client->bound_xl,
+ frame->root->client->bound_xl->fops->lookup,
&resolve->resolve_loc, NULL);
return 0;
}
@@ -244,7 +246,7 @@ resolve_entry_simple (call_frame_t *frame)
/* simple resolution is indecisive. need to perform
deep resolution */
resolve->op_ret = -1;
- resolve->op_errno = ENOENT;
+ resolve->op_errno = ESTALE;
ret = 1;
goto out;
}
@@ -341,7 +343,7 @@ resolve_inode_simple (call_frame_t *frame)
if (!inode) {
resolve->op_ret = -1;
- resolve->op_errno = ENOENT;
+ resolve->op_errno = ESTALE;
ret = 1;
goto out;
}
@@ -449,14 +451,14 @@ server_resolve_anonfd (call_frame_t *frame)
int
server_resolve_fd (call_frame_t *frame)
{
- server_state_t *state = NULL;
- server_resolve_t *resolve = NULL;
- server_connection_t *conn = NULL;
- uint64_t fd_no = -1;
+ server_ctx_t *serv_ctx = NULL;
+ server_state_t *state = NULL;
+ client_t *client = NULL;
+ server_resolve_t *resolve = NULL;
+ uint64_t fd_no = -1;
state = CALL_STATE (frame);
resolve = state->resolve_now;
- conn = SERVER_CONNECTION (frame);
fd_no = resolve->fd_no;
@@ -465,7 +467,18 @@ server_resolve_fd (call_frame_t *frame)
return 0;
}
- state->fd = gf_fd_fdptr_get (conn->fdtable, fd_no);
+ client = frame->root->client;
+
+ serv_ctx = server_ctx_get (client, client->this);
+
+ if (serv_ctx == NULL) {
+ gf_log ("", GF_LOG_INFO, "server_ctx_get() failed");
+ resolve->op_ret = -1;
+ resolve->op_errno = ENOMEM;
+ return 0;
+ }
+
+ state->fd = gf_fd_fdptr_get (serv_ctx->fdtable, fd_no);
if (!state->fd) {
gf_log ("", GF_LOG_INFO, "fd not found in context");
@@ -520,14 +533,12 @@ int
server_resolve_done (call_frame_t *frame)
{
server_state_t *state = NULL;
- xlator_t *bound_xl = NULL;
state = CALL_STATE (frame);
- bound_xl = BOUND_XL (frame);
server_print_request (frame);
- state->resume_fn (frame, bound_xl);
+ state->resume_fn (frame, frame->root->client->bound_xl);
return 0;
}
diff --git a/xlators/protocol/server/src/server-rpc-fops.c b/xlators/protocol/server/src/server-rpc-fops.c
index a9d2ee5b6..70b8ab3a6 100644
--- a/xlators/protocol/server/src/server-rpc-fops.c
+++ b/xlators/protocol/server/src/server-rpc-fops.c
@@ -24,6 +24,11 @@
#include "xdr-nfs3.h"
+#define SERVER_REQ_SET_ERROR(req, ret) \
+ do { \
+ rpcsvc_request_seterr (req, GARBAGE_ARGS); \
+ ret = RPCSVC_ACTOR_ERROR; \
+ } while (0)
/* Callback function section */
int
@@ -31,17 +36,14 @@ server_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct statvfs *buf,
dict_t *xdata)
{
- gfs3_statfs_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
+ gfs3_statfs_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%"PRId64": STATFS (%s)",
+ gf_log (this->name, GF_LOG_WARNING, "%"PRId64": STATFS (%s)",
frame->root->unique, strerror (op_errno));
goto out;
}
@@ -52,7 +54,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
-
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_statfs_rsp);
@@ -67,16 +69,15 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *stbuf, dict_t *xdata,
struct iatt *postparent)
{
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
- inode_t *root_inode = NULL;
- inode_t *link_inode = NULL;
- loc_t fresh_loc = {0,};
- gfs3_lookup_rsp rsp = {0,};
- uuid_t rootgfid = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
+ inode_t *root_inode = NULL;
+ inode_t *link_inode = NULL;
+ loc_t fresh_loc = {0,};
+ gfs3_lookup_rsp rsp = {0,};
+ uuid_t rootgfid = {0,};
- req = frame->local;
- state = CALL_STATE(frame);
+ state = CALL_STATE (frame);
if (state->is_revalidate == 1 && op_ret == -1) {
state->is_revalidate = 2;
@@ -84,8 +85,9 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_unref (fresh_loc.inode);
fresh_loc.inode = inode_new (state->itable);
- STACK_WIND (frame, server_lookup_cbk, BOUND_XL (frame),
- BOUND_XL (frame)->fops->lookup,
+ STACK_WIND (frame, server_lookup_cbk,
+ frame->root->client->bound_xl,
+ frame->root->client->bound_xl->fops->lookup,
&fresh_loc, state->xdata);
loc_wipe (&fresh_loc);
@@ -94,7 +96,7 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_stat_from_iatt (&rsp.postparent, postparent);
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
@@ -108,7 +110,7 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- root_inode = BOUND_XL(frame)->itable->root;
+ root_inode = frame->root->client->bound_xl->itable->root;
if (inode == root_inode) {
/* we just looked up root ("/") */
stbuf->ia_ino = 1;
@@ -137,8 +139,9 @@ out:
if (state->resolve.bname) {
gf_log (this->name, ((op_errno == ENOENT) ?
GF_LOG_TRACE : GF_LOG_INFO),
- "%"PRId64": LOOKUP %s (%s/%s) ==> (%s)",
- frame->root->unique, state->loc.path,
+ "%"PRId64": LOOKUP %s (%s/%s) ==> "
+ "(%s)", frame->root->unique,
+ state->loc.path,
uuid_utoa (state->resolve.pargfid),
state->resolve.bname,
strerror (op_errno));
@@ -152,6 +155,7 @@ out:
}
}
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_lookup_rsp);
@@ -166,21 +170,20 @@ server_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
dict_t *xdata)
{
- gfs3_lk_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_lk_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": LK %"PRId64" (%s) ==> (%s)",
- frame->root->unique, state->resolve.fd_no,
+ "%"PRId64": LK %"PRId64" (%s) ==> "
+ "(%s)", frame->root->unique,
+ state->resolve.fd_no,
uuid_utoa (state->resolve.gfid),
strerror (op_errno));
}
@@ -209,6 +212,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_lk_rsp);
@@ -222,21 +226,19 @@ int
server_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
- gf_log (this->name, GF_LOG_INFO,
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": INODELK %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid),
@@ -245,20 +247,11 @@ server_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (state->flock.l_type == F_UNLCK)
- gf_del_locker (conn, state->volume,
- &state->loc, NULL, &frame->root->lk_owner,
- GF_FOP_INODELK);
- else
- gf_add_locker (conn, state->volume,
- &state->loc, NULL, frame->root->pid,
- &frame->root->lk_owner,
- GF_FOP_INODELK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -272,43 +265,32 @@ int
server_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_state_t *state = NULL;
- server_connection_t *conn = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": FINODELK %"PRId64" (%s) ==> (%s)",
- frame->root->unique, state->resolve.fd_no,
+ "%"PRId64": FINODELK %"PRId64" (%s) "
+ "==> (%s)", frame->root->unique,
+ state->resolve.fd_no,
uuid_utoa (state->resolve.gfid),
strerror (op_errno));
}
goto out;
}
- if (state->flock.l_type == F_UNLCK)
- gf_del_locker (conn, state->volume,
- NULL, state->fd,
- &frame->root->lk_owner, GF_FOP_INODELK);
- else
- gf_add_locker (conn, state->volume,
- NULL, state->fd,
- frame->root->pid,
- &frame->root->lk_owner, GF_FOP_INODELK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -321,18 +303,15 @@ int
server_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- gf_common_rsp rsp = {0,};
-
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
gf_log (this->name, GF_LOG_INFO,
@@ -344,20 +323,11 @@ server_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (state->cmd == ENTRYLK_UNLOCK)
- gf_del_locker (conn, state->volume,
- &state->loc, NULL, &frame->root->lk_owner,
- GF_FOP_ENTRYLK);
- else
- gf_add_locker (conn, state->volume,
- &state->loc, NULL, frame->root->pid,
- &frame->root->lk_owner,
- GF_FOP_ENTRYLK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -371,18 +341,15 @@ int
server_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
gf_log (this->name, GF_LOG_INFO,
@@ -394,19 +361,11 @@ server_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (state->cmd == ENTRYLK_UNLOCK)
- gf_del_locker (conn, state->volume,
- NULL, state->fd, &frame->root->lk_owner,
- GF_FOP_ENTRYLK);
- else
- gf_add_locker (conn, state->volume,
- NULL, state->fd, frame->root->pid,
- &frame->root->lk_owner, GF_FOP_ENTRYLK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -420,17 +379,15 @@ int
server_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": ACCESS %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -443,6 +400,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -456,17 +414,16 @@ server_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_rmdir_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *parent = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_rmdir_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *parent = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": RMDIR %s (%s/%s) ==> (%s)",
@@ -495,6 +452,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_rmdir_rsp);
@@ -509,17 +467,16 @@ server_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_mkdir_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_mkdir_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": MKDIR %s (%s/%s) ==> (%s)",
@@ -542,6 +499,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_mkdir_rsp);
@@ -556,17 +514,16 @@ server_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_mknod_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_mknod_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": MKNOD %s (%s/%s) ==> (%s)",
@@ -589,6 +546,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_mknod_rsp);
@@ -601,17 +559,15 @@ int
server_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSYNCDIR %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -624,6 +580,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -637,18 +594,16 @@ server_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- gfs3_readdir_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- int ret = 0;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_readdir_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+ int ret = 0;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READDIR %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -671,6 +626,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_readdir_rsp);
@@ -685,29 +641,33 @@ int
server_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- gfs3_opendir_rsp rsp = {0,};
- uint64_t fd_no = 0;
-
- req = frame->local;
- conn = SERVER_CONNECTION (frame);
- state = CALL_STATE (frame);
+ server_state_t *state = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ rpcsvc_request_t *req = NULL;
+ gfs3_opendir_rsp rsp = {0,};
+ uint64_t fd_no = 0;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": OPENDIR %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid), strerror (op_errno));
goto out;
}
+ serv_ctx = server_ctx_get (frame->root->client, this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
+
fd_bind (fd);
- fd_no = gf_fd_unused_get (conn->fdtable, fd);
+ fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd);
fd_ref (fd); // on behalf of the client
out:
@@ -715,6 +675,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_opendir_rsp);
@@ -727,17 +688,15 @@ int
server_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": REMOVEXATTR %s (%s) of key %s ==> (%s)",
frame->root->unique, state->loc.path,
@@ -750,6 +709,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -762,17 +722,15 @@ int
server_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
+ gf_common_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FREMOVEXATTR %"PRId64" (%s) (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -785,6 +743,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -798,17 +757,15 @@ server_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_getxattr_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_getxattr_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
gf_log (this->name, (((op_errno == ENOTSUP) ||
(op_errno == ENODATA) ||
(op_errno == ENOENT)) ?
@@ -820,13 +777,14 @@ server_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_getxattr_rsp);
@@ -843,18 +801,18 @@ server_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_fgetxattr_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_fgetxattr_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
- gf_log (this->name, ((op_errno == ENOTSUP) ?
+ state = CALL_STATE (frame);
+ gf_log (this->name, (((op_errno == ENOTSUP) ||
+ (op_errno == ENODATA) ||
+ (op_errno == ENOENT)) ?
GF_LOG_DEBUG : GF_LOG_INFO),
"%"PRId64": FGETXATTR %"PRId64" (%s) (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -863,7 +821,7 @@ server_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
@@ -871,6 +829,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fgetxattr_rsp);
@@ -886,11 +845,11 @@ static int
_gf_server_log_setxattr_failure (dict_t *d, char *k, data_t *v,
void *tmp)
{
- server_state_t *state = NULL;
- call_frame_t *frame = NULL;
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
frame = tmp;
- state = CALL_STATE(frame);
+ state = CALL_STATE (frame);
gf_log (THIS->name, GF_LOG_INFO,
"%"PRId64": SETXATTR %s (%s) ==> %s",
@@ -907,13 +866,11 @@ server_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
if (op_errno != ENOTSUP)
dict_foreach (state->dict,
_gf_server_log_setxattr_failure,
@@ -929,6 +886,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -942,11 +900,11 @@ static int
_gf_server_log_fsetxattr_failure (dict_t *d, char *k, data_t *v,
void *tmp)
{
- call_frame_t *frame = NULL;
- server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+ server_state_t *state = NULL;
frame = tmp;
- state = CALL_STATE(frame);
+ state = CALL_STATE (frame);
gf_log (THIS->name, GF_LOG_INFO,
"%"PRId64": FSETXATTR %"PRId64" (%s) ==> %s",
@@ -964,13 +922,11 @@ server_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
if (op_errno != ENOTSUP) {
dict_foreach (state->dict,
_gf_server_log_fsetxattr_failure,
@@ -986,6 +942,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -1001,20 +958,19 @@ server_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *prenewparent, struct iatt *postnewparent,
dict_t *xdata)
{
- gfs3_rename_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- inode_t *tmp_inode = NULL;
- inode_t *tmp_parent = NULL;
- char oldpar_str[50] = {0,};
- char newpar_str[50] = {0,};
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_rename_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+ inode_t *tmp_inode = NULL;
+ inode_t *tmp_parent = NULL;
+ char oldpar_str[50] = {0,};
+ char newpar_str[50] = {0,};
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret == -1) {
uuid_utoa_r (state->resolve.gfid, oldpar_str);
uuid_utoa_r (state->resolve2.gfid, newpar_str);
@@ -1029,7 +985,7 @@ server_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
stbuf->ia_type = state->loc.inode->ia_type;
/* TODO: log gfid of the inodes */
- gf_log (state->conn->bound_xl->name, GF_LOG_TRACE,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE,
"%"PRId64": RENAME_CBK %s ==> %s",
frame->root->unique, state->loc.name, state->loc2.name);
@@ -1071,6 +1027,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_rename_rsp);
@@ -1084,17 +1041,16 @@ server_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_unlink_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *parent = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_unlink_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *parent = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret) {
gf_log (this->name, (op_errno == ENOENT)?
GF_LOG_DEBUG:GF_LOG_ERROR,
@@ -1106,7 +1062,7 @@ server_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
/* TODO: log gfid of the inodes */
- gf_log (state->conn->bound_xl->name, GF_LOG_TRACE,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE,
"%"PRId64": UNLINK_CBK %s",
frame->root->unique, state->loc.name);
@@ -1126,6 +1082,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_unlink_rsp);
@@ -1140,17 +1097,16 @@ server_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_symlink_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_symlink_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": SYMLINK %s (%s/%s) ==> (%s)",
@@ -1173,6 +1129,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_symlink_rsp);
@@ -1188,27 +1145,27 @@ server_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_link_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
+ gfs3_link_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
char gfid_str[50] = {0,};
char newpar_str[50] = {0,};
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret) {
uuid_utoa_r (state->resolve.gfid, gfid_str);
uuid_utoa_r (state->resolve2.pargfid, newpar_str);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": LINK %s (%s) -> %s/%s ==> (%s)",
- frame->root->unique, state->loc.path, gfid_str,
- newpar_str, state->resolve2.bname, strerror (op_errno));
+ frame->root->unique, state->loc.path,
+ gfid_str, newpar_str, state->resolve2.bname,
+ strerror (op_errno));
goto out;
}
@@ -1224,6 +1181,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_link_rsp);
@@ -1237,17 +1195,15 @@ server_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_truncate_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_truncate_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": TRUNCATE %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1262,6 +1218,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_truncate_rsp);
@@ -1275,17 +1232,15 @@ server_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
dict_t *xdata)
{
- gfs3_fstat_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_fstat_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSTAT %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1299,6 +1254,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fstat_rsp);
@@ -1312,17 +1268,15 @@ server_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_ftruncate_rsp rsp = {0};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_ftruncate_rsp rsp = {0};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FTRUNCATE %"PRId64" (%s)==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1337,6 +1291,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_ftruncate_rsp);
@@ -1349,21 +1304,20 @@ int
server_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": FLUSH %"PRId64" (%s) ==> (%s)",
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
+ "%"PRId64": FLUSH %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
- uuid_utoa (state->resolve.gfid), strerror (op_errno));
+ uuid_utoa (state->resolve.gfid), strerror (op_errno));
goto out;
}
@@ -1371,6 +1325,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -1384,17 +1339,15 @@ server_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_fsync_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_fsync_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSYNC %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1409,6 +1362,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fsync_rsp);
@@ -1422,17 +1376,15 @@ server_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_write_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_write_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": WRITEV %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1447,6 +1399,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_write_rsp);
@@ -1462,12 +1415,9 @@ server_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iovec *vector, int32_t count,
struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
{
- gfs3_read_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_read_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
#ifdef GF_TESTING_IO_XDATA
{
@@ -1479,10 +1429,11 @@ server_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"testing-xdata-value");
}
#endif
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READV %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1497,6 +1448,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, vector, count, iobref,
(xdrproc_t)xdr_gfs3_read_rsp);
@@ -1511,17 +1463,15 @@ server_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uint32_t weak_checksum, uint8_t *strong_checksum,
dict_t *xdata)
{
- gfs3_rchecksum_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
+ gfs3_rchecksum_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": RCHECKSUM %"PRId64" (%s)==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1538,6 +1488,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_rchecksum_rsp);
@@ -1551,21 +1502,19 @@ int
server_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- uint64_t fd_no = 0;
- gfs3_open_rsp rsp = {0,};
-
- req = frame->local;
- conn = SERVER_CONNECTION (frame);
- state = CALL_STATE (frame);
+ server_state_t *state = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ rpcsvc_request_t *req = NULL;
+ uint64_t fd_no = 0;
+ gfs3_open_rsp rsp = {0,};
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": OPEN %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid),
@@ -1573,8 +1522,14 @@ server_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
+ serv_ctx = server_ctx_get (frame->root->client, this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
+
fd_bind (fd);
- fd_no = gf_fd_unused_get (conn->fdtable, fd);
+ fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd);
fd_ref (fd);
rsp.fd = fd_no;
@@ -1582,6 +1537,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_open_rsp);
GF_FREE (rsp.xdata.xdata_val);
@@ -1596,20 +1552,18 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- server_connection_t *conn = NULL;
server_state_t *state = NULL;
+ server_ctx_t *serv_ctx = NULL;
inode_t *link_inode = NULL;
rpcsvc_request_t *req = NULL;
uint64_t fd_no = 0;
gfs3_create_rsp rsp = {0,};
- req = frame->local;
- conn = SERVER_CONNECTION (frame);
- state = CALL_STATE (frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": CREATE %s (%s/%s) ==> (%s)",
@@ -1620,7 +1574,7 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
/* TODO: log gfid too */
- gf_log (state->conn->bound_xl->name, GF_LOG_TRACE,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE,
"%"PRId64": CREATE %s (%s)",
frame->root->unique, state->loc.name,
uuid_utoa (stbuf->ia_gfid));
@@ -1647,13 +1601,18 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_lookup (link_inode);
inode_unref (link_inode);
- fd_bind (fd);
+ serv_ctx = server_ctx_get (frame->root->client, this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
- fd_no = gf_fd_unused_get (conn->fdtable, fd);
+ fd_bind (fd);
+ fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd);
fd_ref (fd);
- if ((fd_no < 0) || (fd == 0)) {
- op_ret = fd_no;
+ if ((fd_no > UINT64_MAX) || (fd == 0)) {
+ op_ret = -1;
op_errno = errno;
}
@@ -1666,6 +1625,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_create_rsp);
@@ -1679,17 +1639,15 @@ server_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, const char *buf,
struct iatt *stbuf, dict_t *xdata)
{
- gfs3_readlink_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_readlink_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READLINK %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1708,6 +1666,7 @@ out:
if (!rsp.path)
rsp.path = "";
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_readlink_rsp);
@@ -1721,18 +1680,17 @@ server_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
dict_t *xdata)
{
- gfs3_stat_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_stat_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": STAT %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid),
@@ -1746,6 +1704,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_stat_rsp);
@@ -1760,17 +1719,15 @@ server_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
{
- gfs3_setattr_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_setattr_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": SETATTR %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1786,6 +1743,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_setattr_rsp);
@@ -1799,17 +1757,15 @@ server_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
{
- gfs3_fsetattr_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_fsetattr_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSETATTR %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1825,6 +1781,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fsetattr_rsp);
@@ -1839,17 +1796,15 @@ server_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_xattrop_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_xattrop_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": XATTROP %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1858,13 +1813,14 @@ server_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_xattrop_rsp);
@@ -1881,17 +1837,15 @@ server_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_xattrop_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_xattrop_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FXATTROP %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1900,13 +1854,14 @@ server_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fxattrop_rsp);
@@ -1920,20 +1875,19 @@ out:
int
server_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
{
- gfs3_readdirp_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- int ret = 0;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_readdirp_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+ int ret = 0;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READDIRP %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1959,6 +1913,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_readdirp_rsp);
@@ -1969,6 +1924,158 @@ out:
return 0;
}
+int
+server_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
+{
+ gfs3_fallocate_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ state = CALL_STATE (frame);
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": FALLOCATE %"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+ gf_stat_from_iatt (&rsp.statpre, statpre);
+ gf_stat_from_iatt (&rsp.statpost, statpost);
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ req = frame->local;
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_fallocate_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+int
+server_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
+{
+ gfs3_discard_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ state = CALL_STATE (frame);
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": DISCARD %"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+ gf_stat_from_iatt (&rsp.statpre, statpre);
+ gf_stat_from_iatt (&rsp.statpost, statpost);
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ req = frame->local;
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_discard_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+int
+server_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
+{
+ gfs3_zerofill_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ req = frame->local;
+ state = CALL_STATE (frame);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": ZEROFILL%"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+ gf_stat_from_iatt (&rsp.statpre, statpre);
+ gf_stat_from_iatt (&rsp.statpost, statpost);
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_zerofill_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+
+int
+server_ipc_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ req = frame->local;
+ state = CALL_STATE (frame);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": IPC%"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gf_common_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+
/* Resume function section */
int
@@ -2132,6 +2239,7 @@ err:
int
server_fentrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
state = CALL_STATE (frame);
@@ -2139,6 +2247,13 @@ server_fentrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_fentrylk_cbk, bound_xl,
bound_xl->fops->fentrylk,
state->volume, state->fd, state->name,
@@ -2155,6 +2270,7 @@ err:
int
server_entrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
state = CALL_STATE (frame);
@@ -2162,6 +2278,13 @@ server_entrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_entrylk_cbk,
bound_xl, bound_xl->fops->entrylk,
state->volume, &state->loc, state->name,
@@ -2177,13 +2300,24 @@ err:
int
server_finodelk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
+ gf_log (bound_xl->name, GF_LOG_DEBUG, "frame %p, xlator %p",
+ frame, bound_xl);
+
state = CALL_STATE (frame);
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_finodelk_cbk, bound_xl,
bound_xl->fops->finodelk, state->volume, state->fd,
state->cmd, &state->flock, state->xdata);
@@ -2199,13 +2333,24 @@ err:
int
server_inodelk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
+ gf_log (bound_xl->name, GF_LOG_DEBUG, "frame %p, xlator %p",
+ frame, bound_xl);
+
state = CALL_STATE (frame);
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_inodelk_cbk, bound_xl,
bound_xl->fops->inodelk, state->volume, &state->loc,
state->cmd, &state->flock, state->xdata);
@@ -2914,7 +3059,69 @@ err:
return 0;
}
+int
+server_fallocate_resume (call_frame_t *frame, xlator_t *bound_xl)
+{
+ server_state_t *state = NULL;
+
+ state = CALL_STATE (frame);
+
+ if (state->resolve.op_ret != 0)
+ goto err;
+
+ STACK_WIND (frame, server_fallocate_cbk,
+ bound_xl, bound_xl->fops->fallocate,
+ state->fd, state->flags, state->offset, state->size,
+ state->xdata);
+ return 0;
+err:
+ server_fallocate_cbk(frame, NULL, frame->this, state->resolve.op_ret,
+ state->resolve.op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+server_discard_resume (call_frame_t *frame, xlator_t *bound_xl)
+{
+ server_state_t *state = NULL;
+
+ state = CALL_STATE (frame);
+
+ if (state->resolve.op_ret != 0)
+ goto err;
+
+ STACK_WIND (frame, server_discard_cbk,
+ bound_xl, bound_xl->fops->discard,
+ state->fd, state->offset, state->size, state->xdata);
+ return 0;
+err:
+ server_discard_cbk(frame, NULL, frame->this, state->resolve.op_ret,
+ state->resolve.op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+server_zerofill_resume (call_frame_t *frame, xlator_t *bound_xl)
+{
+ server_state_t *state = NULL;
+ state = CALL_STATE (frame);
+
+ if (state->resolve.op_ret != 0)
+ goto err;
+
+ STACK_WIND (frame, server_zerofill_cbk,
+ bound_xl, bound_xl->fops->zerofill,
+ state->fd, state->offset, state->size, state->xdata);
+ return 0;
+err:
+ server_zerofill_cbk(frame, NULL, frame->this, state->resolve.op_ret,
+ state->resolve.op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
/* Fop section */
@@ -2936,31 +3143,31 @@ server3_3_stat (rpcsvc_request_t *req)
ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_stat_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
- // something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_STAT;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
@@ -2971,7 +3178,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -2993,22 +3200,22 @@ server3_3_setattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_setattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_SETATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3018,9 +3225,10 @@ server3_3_setattr (rpcsvc_request_t *req)
gf_stat_to_iatt (&args.stbuf, &state->stbuf);
state->valid = args.valid;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3028,7 +3236,7 @@ server3_3_setattr (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.xdata.xdata_val);
@@ -3052,22 +3260,21 @@ server3_3_fsetattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fsetattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
- // something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSETATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3077,13 +3284,196 @@ server3_3_fsetattr (rpcsvc_request_t *req)
gf_stat_to_iatt (&args.stbuf, &state->stbuf);
state->valid = args.valid;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
+ op_errno, out);
+
+ ret = 0;
+ resolve_and_resume (frame, server_fsetattr_resume);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ SERVER_REQ_SET_ERROR (req, ret);
+
+ return ret;
+}
+
+int
+server3_3_fallocate(rpcsvc_request_t *req)
+{
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+ gfs3_fallocate_req args = {{0},};
+ int ret = -1;
+ int op_errno = 0;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fallocate_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ // something wrong, mostly insufficient memory
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+ frame->root->op = GF_FOP_FALLOCATE;
+
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ state->resolve.type = RESOLVE_MUST;
+ state->resolve.fd_no = args.fd;
+
+ state->flags = args.flags;
+ state->offset = args.offset;
+ state->size = args.size;
+ memcpy(state->resolve.gfid, args.gfid, 16);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
+ op_errno, out);
+
+ ret = 0;
+ resolve_and_resume (frame, server_fallocate_resume);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ SERVER_REQ_SET_ERROR (req, ret);
+
+ return ret;
+}
+
+
+int
+server3_3_discard(rpcsvc_request_t *req)
+{
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+ gfs3_discard_req args = {{0},};
+ int ret = -1;
+ int op_errno = 0;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_discard_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ // something wrong, mostly insufficient memory
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+ frame->root->op = GF_FOP_DISCARD;
+
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ state->resolve.type = RESOLVE_MUST;
+ state->resolve.fd_no = args.fd;
+
+ state->offset = args.offset;
+ state->size = args.size;
+ memcpy(state->resolve.gfid, args.gfid, 16);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
+ op_errno, out);
+
+ ret = 0;
+ resolve_and_resume (frame, server_discard_resume);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ SERVER_REQ_SET_ERROR (req, ret);
+
+ return ret;
+}
+
+
+int
+server3_3_zerofill(rpcsvc_request_t *req)
+{
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+ gfs3_zerofill_req args = {{0},};
+ int ret = -1;
+ int op_errno = 0;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_zerofill_req);
+ if (ret < 0) {
+ /*failed to decode msg*/;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ /* something wrong, mostly insufficient memory*/
+ req->rpc_err = GARBAGE_ARGS; /* TODO */
+ goto out;
+ }
+ frame->root->op = GF_FOP_ZEROFILL;
+
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ state->resolve.type = RESOLVE_MUST;
+ state->resolve.fd_no = args.fd;
+
+ state->offset = args.offset;
+ state->size = args.size;
+ memcpy(state->resolve.gfid, args.gfid, 16);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, state->xdata,
(args.xdata.xdata_val),
(args.xdata.xdata_len), ret,
op_errno, out);
ret = 0;
- resolve_and_resume (frame, server_fsetattr_resume);
+ resolve_and_resume (frame, server_zerofill_resume);
out:
free (args.xdata.xdata_val);
@@ -3094,6 +3484,60 @@ out:
return ret;
}
+int
+server3_3_ipc (rpcsvc_request_t *req)
+{
+ call_frame_t *frame = NULL;
+ gfs3_ipc_req args = {0,};
+ int ret = -1;
+ int op_errno = 0;
+ dict_t *xdata = NULL;
+ xlator_t *bound_xl = NULL;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_ipc_req);
+ if (ret < 0) {
+ /*failed to decode msg*/;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ /* something wrong, mostly insufficient memory*/
+ req->rpc_err = GARBAGE_ARGS; /* TODO */
+ goto out;
+ }
+ frame->root->op = GF_FOP_IPC;
+
+ bound_xl = frame->root->client->bound_xl;
+ if (!bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (bound_xl, xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len,
+ ret, op_errno, out);
+
+ ret = 0;
+ STACK_WIND (frame, server_ipc_cbk, bound_xl, bound_xl->fops->ipc,
+ args.op, xdata);
+ free(xdata);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ req->rpc_err = GARBAGE_ARGS;
+
+ return ret;
+}
int
server3_3_readlink (rpcsvc_request_t *req)
@@ -3111,22 +3555,22 @@ server3_3_readlink (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_readlink_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READLINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3135,9 +3579,10 @@ server3_3_readlink (rpcsvc_request_t *req)
state->size = args.size;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3147,7 +3592,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3171,22 +3616,22 @@ server3_3_create (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_create_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_CREATE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3204,9 +3649,10 @@ server3_3_create (rpcsvc_request_t *req)
}
/* TODO: can do alloca for xdata field instead of stdalloc */
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3217,7 +3663,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3238,22 +3684,22 @@ server3_3_open (rpcsvc_request_t *req)
ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_open_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_OPEN;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3262,16 +3708,17 @@ server3_3_open (rpcsvc_request_t *req)
state->flags = gf_flags_to_flags (args.flags);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
resolve_and_resume (frame, server_open_resume);
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.xdata.xdata_val);
@@ -3294,22 +3741,22 @@ server3_3_readv (rpcsvc_request_t *req)
ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_read_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READ;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3321,9 +3768,10 @@ server3_3_readv (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3333,7 +3781,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3357,28 +3805,29 @@ server3_3_writev (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_write_req);
if (len < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_WRITE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
state->resolve.fd_no = args.fd;
state->offset = args.offset;
+ state->size = args.size;
state->flags = args.flag;
state->iobref = iobref_ref (req->iobref);
memcpy (state->resolve.gfid, args.gfid, 16);
@@ -3400,9 +3849,10 @@ server3_3_writev (rpcsvc_request_t *req)
state->size += state->payload_vector[i].iov_len;
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
#ifdef GF_TESTING_IO_XDATA
@@ -3415,7 +3865,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3481,26 +3931,36 @@ server3_3_writev_vecsizer (int state, ssize_t *readsize, char *base_addr,
int
server3_3_release (rpcsvc_request_t *req)
{
- server_connection_t *conn = NULL;
- gfs3_release_req args = {{0,},};
- gf_common_rsp rsp = {0,};
- int ret = -1;
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ gfs3_release_req args = {{0,},};
+ gf_common_rsp rsp = {0,};
+ int ret = -1;
ret = xdr_to_generic (req->msg[0], &args,
(xdrproc_t)xdr_gfs3_release_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
- conn = req->trans->xl_private;
- if (!conn) {
+ client = req->trans->xl_private;
+ if (!client) {
/* Handshake is not complete yet. */
req->rpc_err = SYSTEM_ERR;
goto out;
}
- gf_fd_put (conn->fdtable, args.fd);
+
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (req->trans->name, GF_LOG_INFO,
+ "server_ctx_get() failed");
+ req->rpc_err = SYSTEM_ERR;
+ goto out;
+ }
+
+ gf_fd_put (serv_ctx->fdtable, args.fd);
server_submit_reply (NULL, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -3513,26 +3973,35 @@ out:
int
server3_3_releasedir (rpcsvc_request_t *req)
{
- server_connection_t *conn = NULL;
- gfs3_releasedir_req args = {{0,},};
- gf_common_rsp rsp = {0,};
- int ret = -1;
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ gfs3_releasedir_req args = {{0,},};
+ gf_common_rsp rsp = {0,};
+ int ret = -1;
ret = xdr_to_generic (req->msg[0], &args,
(xdrproc_t)xdr_gfs3_release_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
- conn = req->trans->xl_private;
- if (!conn) {
- req->rpc_err = GARBAGE_ARGS;
+ client = req->trans->xl_private;
+ if (!client) {
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (req->trans->name, GF_LOG_INFO,
+ "server_ctx_get() failed");
+ req->rpc_err = SYSTEM_ERR;
goto out;
}
- gf_fd_put (conn->fdtable, args.fd);
+ gf_fd_put (serv_ctx->fdtable, args.fd);
server_submit_reply (NULL, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -3559,22 +4028,22 @@ server3_3_fsync (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fsync_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSYNC;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3583,9 +4052,10 @@ server3_3_fsync (rpcsvc_request_t *req)
state->flags = args.data;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3594,7 +4064,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3617,22 +4087,22 @@ server3_3_flush (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_flush_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FLUSH;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3640,9 +4110,10 @@ server3_3_flush (rpcsvc_request_t *req)
state->resolve.fd_no = args.fd;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3651,7 +4122,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3674,22 +4145,22 @@ server3_3_ftruncate (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_ftruncate_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FTRUNCATE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3698,9 +4169,10 @@ server3_3_ftruncate (rpcsvc_request_t *req)
state->offset = args.offset;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3709,7 +4181,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3731,22 +4203,22 @@ server3_3_fstat (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fstat_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSTAT;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3754,9 +4226,10 @@ server3_3_fstat (rpcsvc_request_t *req)
state->resolve.fd_no = args.fd;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3765,7 +4238,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3787,22 +4260,22 @@ server3_3_truncate (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_truncate_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_TRUNCATE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3810,9 +4283,10 @@ server3_3_truncate (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->offset = args.offset;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3821,7 +4295,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3846,22 +4320,22 @@ server3_3_unlink (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_unlink_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_UNLINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3871,9 +4345,10 @@ server3_3_unlink (rpcsvc_request_t *req)
state->flags = args.xflags;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3882,7 +4357,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3907,22 +4382,22 @@ server3_3_setxattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_setxattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_SETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3930,7 +4405,8 @@ server3_3_setxattr (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
@@ -3940,9 +4416,10 @@ server3_3_setxattr (rpcsvc_request_t *req)
/* There can be some commands hidden in key, check and proceed */
gf_server_check_setxattr_cmd (frame, dict);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3953,7 +4430,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -3981,22 +4458,22 @@ server3_3_fsetxattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fsetxattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4005,16 +4482,18 @@ server3_3_fsetxattr (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
state->dict = dict;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4025,7 +4504,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -4053,22 +4532,22 @@ server3_3_fxattrop (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fxattrop_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FXATTROP;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4077,16 +4556,18 @@ server3_3_fxattrop (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
state->dict = dict;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4098,7 +4579,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -4127,22 +4608,22 @@ server3_3_xattrop (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_xattrop_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_XATTROP;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4150,16 +4631,18 @@ server3_3_xattrop (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
state->dict = dict;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4170,7 +4653,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -4197,22 +4680,22 @@ server3_3_getxattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_getxattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_GETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4225,9 +4708,10 @@ server3_3_getxattr (rpcsvc_request_t *req)
gf_server_check_getxattr_cmd (frame, state->name);
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4236,7 +4720,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4259,22 +4743,22 @@ server3_3_fgetxattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fgetxattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FGETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4285,9 +4769,10 @@ server3_3_fgetxattr (rpcsvc_request_t *req)
if (args.namelen)
state->name = gf_strdup (args.name);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4296,7 +4781,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4321,22 +4806,22 @@ server3_3_removexattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_removexattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_REMOVEXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4344,9 +4829,10 @@ server3_3_removexattr (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->name = gf_strdup (args.name);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4355,7 +4841,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4378,22 +4864,22 @@ server3_3_fremovexattr (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fremovexattr_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FREMOVEXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4402,9 +4888,10 @@ server3_3_fremovexattr (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->name = gf_strdup (args.name);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4413,7 +4900,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4437,31 +4924,32 @@ server3_3_opendir (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_opendir_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_OPENDIR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4470,7 +4958,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4493,22 +4981,22 @@ server3_3_readdirp (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_readdirp_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READDIRP;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4529,7 +5017,8 @@ server3_3_readdirp (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
/* here, dict itself works as xdata */
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
@@ -4539,7 +5028,7 @@ server3_3_readdirp (rpcsvc_request_t *req)
resolve_and_resume (frame, server_readdirp_resume);
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.dict.dict_val);
@@ -4563,22 +5052,22 @@ server3_3_readdir (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_readdir_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READDIR;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4598,9 +5087,10 @@ server3_3_readdir (rpcsvc_request_t *req)
state->offset = args.offset;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4609,7 +5099,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4630,22 +5120,22 @@ server3_3_fsyncdir (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fsyncdir_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSYNCDIR;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4654,9 +5144,10 @@ server3_3_fsyncdir (rpcsvc_request_t *req)
state->flags = args.data;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4665,7 +5156,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4690,22 +5181,22 @@ server3_3_mknod (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_mknod_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_MKNOD;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4717,9 +5208,10 @@ server3_3_mknod (rpcsvc_request_t *req)
state->dev = args.dev;
state->umask = args.umask;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4727,7 +5219,7 @@ server3_3_mknod (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
/* memory allocated by libc, don't use GF_FREE */
free (args.xdata.xdata_val);
@@ -4755,22 +5247,22 @@ server3_3_mkdir (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_mkdir_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_MKDIR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4782,9 +5274,10 @@ server3_3_mkdir (rpcsvc_request_t *req)
state->umask = args.umask;
/* TODO: can do alloca for xdata field instead of stdalloc */
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4792,7 +5285,7 @@ server3_3_mkdir (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.xdata.xdata_val);
@@ -4818,22 +5311,22 @@ server3_3_rmdir (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_rmdir_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_RMDIR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4843,9 +5336,10 @@ server3_3_rmdir (rpcsvc_request_t *req)
state->flags = args.xflags;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4854,7 +5348,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4880,22 +5374,22 @@ server3_3_inodelk (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_inodelk_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_INODELK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4932,9 +5426,10 @@ server3_3_inodelk (rpcsvc_request_t *req)
break;
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4945,7 +5440,7 @@ out:
free (args.flock.lk_owner.lk_owner_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4967,22 +5462,22 @@ server3_3_finodelk (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_finodelk_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FINODELK;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5020,9 +5515,10 @@ server3_3_finodelk (rpcsvc_request_t *req)
break;
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5033,7 +5529,7 @@ out:
free (args.flock.lk_owner.lk_owner_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5058,22 +5554,22 @@ server3_3_entrylk (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_entrylk_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_ENTRYLK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5087,9 +5583,10 @@ server3_3_entrylk (rpcsvc_request_t *req)
state->cmd = args.cmd;
state->type = args.type;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5098,7 +5595,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5122,22 +5619,22 @@ server3_3_fentrylk (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_fentrylk_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FENTRYLK;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5151,9 +5648,10 @@ server3_3_fentrylk (rpcsvc_request_t *req)
state->name = gf_strdup (args.name);
state->volume = gf_strdup (args.volume);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5162,7 +5660,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5183,22 +5681,22 @@ server3_3_access (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_access_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_ACCESS;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5206,9 +5704,10 @@ server3_3_access (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->mask = args.mask;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5217,7 +5716,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5243,22 +5742,22 @@ server3_3_symlink (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_symlink_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_SYMLINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5268,9 +5767,10 @@ server3_3_symlink (rpcsvc_request_t *req)
state->name = gf_strdup (args.linkname);
state->umask = args.umask;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5278,7 +5778,7 @@ server3_3_symlink (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
/* memory allocated by libc, don't use GF_FREE */
free (args.xdata.xdata_val);
@@ -5305,22 +5805,22 @@ server3_3_link (rpcsvc_request_t *req)
ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_link_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_LINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5331,9 +5831,10 @@ server3_3_link (rpcsvc_request_t *req)
state->resolve2.bname = gf_strdup (args.newbname);
memcpy (state->resolve2.pargfid, args.newgfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5342,7 +5843,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5367,22 +5868,22 @@ server3_3_rename (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_rename_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_RENAME;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5394,9 +5895,10 @@ server3_3_rename (rpcsvc_request_t *req)
state->resolve2.bname = gf_strdup (args.newbname);
memcpy (state->resolve2.pargfid, args.newgfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5405,7 +5907,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5425,22 +5927,22 @@ server3_3_lk (rpcsvc_request_t *req)
ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_lk_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_LK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5488,7 +5990,7 @@ server3_3_lk (rpcsvc_request_t *req)
state->flock.l_type = F_UNLCK;
break;
default:
- gf_log (state->conn->bound_xl->name, GF_LOG_ERROR,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_ERROR,
"fd - %"PRId64" (%s): Unknown lock type: %"PRId32"!",
state->resolve.fd_no,
uuid_utoa (state->fd->inode->gfid), state->type);
@@ -5496,9 +5998,10 @@ server3_3_lk (rpcsvc_request_t *req)
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5509,7 +6012,7 @@ out:
free (args.flock.lk_owner.lk_owner_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5531,22 +6034,22 @@ server3_3_rchecksum (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_rchecksum_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_RCHECKSUM;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5555,9 +6058,10 @@ server3_3_rchecksum (rpcsvc_request_t *req)
state->offset = args.offset;
state->size = args.len;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5566,7 +6070,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5603,14 +6107,14 @@ server3_3_lookup (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_lookup_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto err;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto err;
}
frame->root->op = GF_FOP_LOOKUP;
@@ -5620,9 +6124,9 @@ server3_3_lookup (rpcsvc_request_t *req)
*/
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5635,9 +6139,10 @@ server3_3_lookup (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5669,88 +6174,93 @@ server3_3_statfs (rpcsvc_request_t *req)
(xdrproc_t)xdr_gfs3_statfs_req);
if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_STATFS;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
resolve_and_resume (frame, server_statfs_resume);
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
rpcsvc_actor_t glusterfs3_3_fop_actors[] = {
- [GFS3_OP_NULL] = { "NULL", GFS3_OP_NULL, server_null, NULL, 0},
- [GFS3_OP_STAT] = { "STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0},
- [GFS3_OP_READLINK] = { "READLINK", GFS3_OP_READLINK, server3_3_readlink, NULL, 0},
- [GFS3_OP_MKNOD] = { "MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0},
- [GFS3_OP_MKDIR] = { "MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0},
- [GFS3_OP_UNLINK] = { "UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0},
- [GFS3_OP_RMDIR] = { "RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0},
- [GFS3_OP_SYMLINK] = { "SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0},
- [GFS3_OP_RENAME] = { "RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0},
- [GFS3_OP_LINK] = { "LINK", GFS3_OP_LINK, server3_3_link, NULL, 0},
- [GFS3_OP_TRUNCATE] = { "TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate, NULL, 0},
- [GFS3_OP_OPEN] = { "OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0},
- [GFS3_OP_READ] = { "READ", GFS3_OP_READ, server3_3_readv, NULL, 0},
- [GFS3_OP_WRITE] = { "WRITE", GFS3_OP_WRITE, server3_3_writev, server3_3_writev_vecsizer, 0},
- [GFS3_OP_STATFS] = { "STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0},
- [GFS3_OP_FLUSH] = { "FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0},
- [GFS3_OP_FSYNC] = { "FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0},
- [GFS3_OP_SETXATTR] = { "SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr, NULL, 0},
- [GFS3_OP_GETXATTR] = { "GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr, NULL, 0},
- [GFS3_OP_REMOVEXATTR] = { "REMOVEXATTR", GFS3_OP_REMOVEXATTR, server3_3_removexattr, NULL, 0},
- [GFS3_OP_OPENDIR] = { "OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0},
- [GFS3_OP_FSYNCDIR] = { "FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir, NULL, 0},
- [GFS3_OP_ACCESS] = { "ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0},
- [GFS3_OP_CREATE] = { "CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0},
- [GFS3_OP_FTRUNCATE] = { "FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate, NULL, 0},
- [GFS3_OP_FSTAT] = { "FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0},
- [GFS3_OP_LK] = { "LK", GFS3_OP_LK, server3_3_lk, NULL, 0},
- [GFS3_OP_LOOKUP] = { "LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0},
- [GFS3_OP_READDIR] = { "READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0},
- [GFS3_OP_INODELK] = { "INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0},
- [GFS3_OP_FINODELK] = { "FINODELK", GFS3_OP_FINODELK, server3_3_finodelk, NULL, 0},
- [GFS3_OP_ENTRYLK] = { "ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0},
- [GFS3_OP_FENTRYLK] = { "FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk, NULL, 0},
- [GFS3_OP_XATTROP] = { "XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0},
- [GFS3_OP_FXATTROP] = { "FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop, NULL, 0},
- [GFS3_OP_FGETXATTR] = { "FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr, NULL, 0},
- [GFS3_OP_FSETXATTR] = { "FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr, NULL, 0},
- [GFS3_OP_RCHECKSUM] = { "RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum, NULL, 0},
- [GFS3_OP_SETATTR] = { "SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0},
- [GFS3_OP_FSETATTR] = { "FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr, NULL, 0},
- [GFS3_OP_READDIRP] = { "READDIRP", GFS3_OP_READDIRP, server3_3_readdirp, NULL, 0},
- [GFS3_OP_RELEASE] = { "RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0},
- [GFS3_OP_RELEASEDIR] = { "RELEASEDIR", GFS3_OP_RELEASEDIR, server3_3_releasedir, NULL, 0},
- [GFS3_OP_FREMOVEXATTR] = { "FREMOVEXATTR", GFS3_OP_FREMOVEXATTR, server3_3_fremovexattr, NULL, 0},
+ [GFS3_OP_NULL] = {"NULL", GFS3_OP_NULL, server_null, NULL, 0, DRC_NA},
+ [GFS3_OP_STAT] = {"STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0, DRC_NA},
+ [GFS3_OP_READLINK] = {"READLINK", GFS3_OP_READLINK, server3_3_readlink, NULL, 0, DRC_NA},
+ [GFS3_OP_MKNOD] = {"MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0, DRC_NA},
+ [GFS3_OP_MKDIR] = {"MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0, DRC_NA},
+ [GFS3_OP_UNLINK] = {"UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0, DRC_NA},
+ [GFS3_OP_RMDIR] = {"RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0, DRC_NA},
+ [GFS3_OP_SYMLINK] = {"SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0, DRC_NA},
+ [GFS3_OP_RENAME] = {"RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0, DRC_NA},
+ [GFS3_OP_LINK] = {"LINK", GFS3_OP_LINK, server3_3_link, NULL, 0, DRC_NA},
+ [GFS3_OP_TRUNCATE] = {"TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate, NULL, 0, DRC_NA},
+ [GFS3_OP_OPEN] = {"OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0, DRC_NA},
+ [GFS3_OP_READ] = {"READ", GFS3_OP_READ, server3_3_readv, NULL, 0, DRC_NA},
+ [GFS3_OP_WRITE] = {"WRITE", GFS3_OP_WRITE, server3_3_writev, server3_3_writev_vecsizer, 0, DRC_NA},
+ [GFS3_OP_STATFS] = {"STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0, DRC_NA},
+ [GFS3_OP_FLUSH] = {"FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0, DRC_NA},
+ [GFS3_OP_FSYNC] = {"FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0, DRC_NA},
+ [GFS3_OP_SETXATTR] = {"SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_GETXATTR] = {"GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_REMOVEXATTR] = {"REMOVEXATTR", GFS3_OP_REMOVEXATTR, server3_3_removexattr, NULL, 0, DRC_NA},
+ [GFS3_OP_OPENDIR] = {"OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0, DRC_NA},
+ [GFS3_OP_FSYNCDIR] = {"FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir, NULL, 0, DRC_NA},
+ [GFS3_OP_ACCESS] = {"ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0, DRC_NA},
+ [GFS3_OP_CREATE] = {"CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0, DRC_NA},
+ [GFS3_OP_FTRUNCATE] = {"FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate, NULL, 0, DRC_NA},
+ [GFS3_OP_FSTAT] = {"FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0, DRC_NA},
+ [GFS3_OP_LK] = {"LK", GFS3_OP_LK, server3_3_lk, NULL, 0, DRC_NA},
+ [GFS3_OP_LOOKUP] = {"LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0, DRC_NA},
+ [GFS3_OP_READDIR] = {"READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0, DRC_NA},
+ [GFS3_OP_INODELK] = {"INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0, DRC_NA},
+ [GFS3_OP_FINODELK] = {"FINODELK", GFS3_OP_FINODELK, server3_3_finodelk, NULL, 0, DRC_NA},
+ [GFS3_OP_ENTRYLK] = {"ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0, DRC_NA},
+ [GFS3_OP_FENTRYLK] = {"FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk, NULL, 0, DRC_NA},
+ [GFS3_OP_XATTROP] = {"XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0, DRC_NA},
+ [GFS3_OP_FXATTROP] = {"FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop, NULL, 0, DRC_NA},
+ [GFS3_OP_FGETXATTR] = {"FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_FSETXATTR] = {"FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_RCHECKSUM] = {"RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum, NULL, 0, DRC_NA},
+ [GFS3_OP_SETATTR] = {"SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0, DRC_NA},
+ [GFS3_OP_FSETATTR] = {"FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr, NULL, 0, DRC_NA},
+ [GFS3_OP_READDIRP] = {"READDIRP", GFS3_OP_READDIRP, server3_3_readdirp, NULL, 0, DRC_NA},
+ [GFS3_OP_RELEASE] = {"RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0, DRC_NA},
+ [GFS3_OP_RELEASEDIR] = {"RELEASEDIR", GFS3_OP_RELEASEDIR, server3_3_releasedir, NULL, 0, DRC_NA},
+ [GFS3_OP_FREMOVEXATTR] = {"FREMOVEXATTR", GFS3_OP_FREMOVEXATTR, server3_3_fremovexattr, NULL, 0, DRC_NA},
+ [GFS3_OP_FALLOCATE] = {"FALLOCATE", GFS3_OP_FALLOCATE, server3_3_fallocate, NULL, 0, DRC_NA},
+ [GFS3_OP_DISCARD] = {"DISCARD", GFS3_OP_DISCARD, server3_3_discard, NULL, 0, DRC_NA},
+ [GFS3_OP_ZEROFILL] = {"ZEROFILL", GFS3_OP_ZEROFILL, server3_3_zerofill, NULL, 0, DRC_NA},
+ [GFS3_OP_IPC] = {"IPC", GFS3_OP_IPC, server3_3_ipc, NULL, 0, DRC_NA},
};
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
index 7aafbdf0a..6bd00cac0 100644
--- a/xlators/protocol/server/src/server.c
+++ b/xlators/protocol/server/src/server.c
@@ -25,33 +25,56 @@
#include "statedump.h"
#include "defaults.h"
#include "authenticate.h"
-#include "rpcsvc.h"
void
grace_time_handler (void *data)
{
- server_connection_t *conn = NULL;
- xlator_t *this = NULL;
- gf_boolean_t cancelled = _gf_false;
- gf_boolean_t detached = _gf_false;
+ client_t *client = NULL;
+ xlator_t *this = NULL;
+ gf_timer_t *timer = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ gf_boolean_t cancelled = _gf_false;
+ gf_boolean_t detached = _gf_false;
- conn = data;
- this = conn->this;
+ client = data;
+ this = client->this;
- GF_VALIDATE_OR_GOTO (THIS->name, conn, out);
GF_VALIDATE_OR_GOTO (THIS->name, this, out);
- gf_log (this->name, GF_LOG_INFO, "grace timer expired for %s", conn->id);
+ gf_log (this->name, GF_LOG_INFO, "grace timer expired for %s",
+ client->client_uid);
- cancelled = server_cancel_conn_timer (this, conn);
+ serv_ctx = server_ctx_get (client, this);
+
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
+
+ LOCK (&serv_ctx->fdtable_lock);
+ {
+ if (serv_ctx->grace_timer) {
+ timer = serv_ctx->grace_timer;
+ serv_ctx->grace_timer = NULL;
+ }
+ }
+ UNLOCK (&serv_ctx->fdtable_lock);
+ if (timer) {
+ gf_timer_call_cancel (this->ctx, timer);
+ cancelled = _gf_true;
+ }
if (cancelled) {
- //conn should not be destroyed in conn_put, so take a ref.
- server_conn_ref (conn);
- server_connection_put (this, conn, &detached);
+
+ /*
+ * client must not be destroyed in gf_client_put(),
+ * so take a ref.
+ */
+ gf_client_ref (client);
+ gf_client_put (client, &detached);
if (detached)//reconnection did not happen :-(
- server_connection_cleanup (this, conn,
- INTERNAL_LOCKS | POSIX_LOCKS);
- server_conn_unref (conn);
+ server_connection_cleanup (this, client,
+ INTERNAL_LOCKS | POSIX_LOCKS);
+ gf_client_unref (client);
}
out:
return;
@@ -107,8 +130,6 @@ ret:
return iob;
}
-
-
int
server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
struct iovec *payload, int payloadcount,
@@ -119,19 +140,24 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
struct iovec rsp = {0,};
server_state_t *state = NULL;
char new_iobref = 0;
- server_connection_t *conn = NULL;
+ client_t *client = NULL;
gf_boolean_t lk_heal = _gf_false;
+ server_conf_t *conf = NULL;
+ gf_barrier_t *barrier = NULL;
+ gf_barrier_payload_t *stub = NULL;
+ gf_boolean_t barriered = _gf_false;
GF_VALIDATE_OR_GOTO ("server", req, ret);
if (frame) {
state = CALL_STATE (frame);
frame->local = NULL;
- conn = SERVER_CONNECTION(frame);
+ client = frame->root->client;
+ conf = (server_conf_t *) client->this->private;
}
- if (conn)
- lk_heal = ((server_conf_t *) conn->this->private)->lk_heal;
+ if (client)
+ lk_heal = ((server_conf_t *) client->this->private)->lk_heal;
if (!iobref) {
iobref = iobref_new ();
@@ -150,6 +176,32 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
iobref_add (iobref, iob);
+ if (conf)
+ barrier = conf->barrier;
+ if (barrier) {
+ /* todo: write's with fd flags set to O_SYNC and O_DIRECT */
+ LOCK (&barrier->lock);
+ {
+ if (is_fop_barriered (barrier->fops, req->procnum) &&
+ (barrier_add_to_queue (barrier))) {
+ stub = gf_barrier_payload (req, &rsp, frame,
+ payload,
+ payloadcount, iobref,
+ iob, new_iobref);
+ if (stub) {
+ gf_barrier_enqueue (barrier, stub);
+ barriered = _gf_true;
+ } else {
+ gf_log ("", GF_LOG_ERROR, "Failed to "
+ " barrier fop %"PRIu64,
+ ((uint64_t)1 << req->procnum));
+ }
+ }
+ }
+ UNLOCK (&barrier->lock);
+ if (barriered == _gf_true)
+ goto out;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_generic (req, &rsp, 1, payload, payloadcount,
iobref);
@@ -165,13 +217,14 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
iobuf_unref (iob);
if (ret == -1) {
gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed");
- if (frame && conn && !lk_heal) {
- server_connection_cleanup (frame->this, conn,
+ if (frame && client && !lk_heal) {
+ server_connection_cleanup (frame->this, client,
INTERNAL_LOCKS | POSIX_LOCKS);
} else {
+ gf_log_callingfn ("", GF_LOG_ERROR,
+ "Reply submission failed");
/* TODO: Failure of open(dir), create, inodelk, entrylk
or lk fops send failure must be handled specially. */
- ;
}
goto ret;
}
@@ -183,185 +236,17 @@ ret:
}
if (frame) {
- if (frame->root->trans)
- server_conn_unref (frame->root->trans);
+ gf_client_unref (client);
STACK_DESTROY (frame->root);
}
if (new_iobref) {
iobref_unref (iobref);
}
-
- return ret;
-}
-
-/* */
-int
-server_fd_to_dict (xlator_t *this, dict_t *dict)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[GF_DUMP_MAX_BUF_LEN] = {0,};
- int count = 0;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO (THIS->name, this, out);
- GF_VALIDATE_OR_GOTO (this->name, dict, out);
-
- conf = this->private;
- if (!conf)
- return -1;
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- return -1;
-
- list_for_each_entry (trav, &conf->conns, list) {
- memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "conn%d", count++);
- fdtable_dump_to_dict (trav->fdtable, key, dict);
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = dict_set_int32 (dict, "conncount", count);
out:
return ret;
}
-int
-server_fd (xlator_t *this)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 1;
- int ret = -1;
- gf_boolean_t section_added = _gf_false;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
-
- conf = this->private;
- if (!conf) {
- gf_log (this->name, GF_LOG_WARNING,
- "conf null in xlator");
- return -1;
- }
-
- gf_proc_dump_add_section("xlator.protocol.server.conn");
- section_added = _gf_true;
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- goto out;
-
- list_for_each_entry (trav, &conf->conns, list) {
- if (trav->id) {
- gf_proc_dump_build_key(key,
- "conn","%d.id", i);
- gf_proc_dump_write(key, "%s", trav->id);
- }
-
- gf_proc_dump_build_key(key,"conn","%d.ref",i)
- gf_proc_dump_write(key, "%d", trav->ref);
- if (trav->bound_xl) {
- gf_proc_dump_build_key(key,
- "conn","%d.bound_xl", i);
- gf_proc_dump_write(key, "%s", trav->bound_xl->name);
- }
-
- gf_proc_dump_build_key(key,
- "conn","%d.id", i);
- fdtable_dump(trav->fdtable,key);
- i++;
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = 0;
-out:
- if (ret) {
- if (section_added == _gf_false)
- gf_proc_dump_add_section("xlator.protocol.server.conn");
- gf_proc_dump_write ("Unable to dump the list of connections",
- "(Lock acquisition failed) %s",
- this?this->name:"server");
- }
- return ret;
-}
-
-void
-ltable_dump (server_connection_t *trav)
-{
- char key[GF_DUMP_MAX_BUF_LEN] = {0,};
- struct _locker *locker = NULL;
- char locker_data[GF_MAX_LOCK_OWNER_LEN] = {0,};
- int count = 0;
-
- gf_proc_dump_build_key(key,
- "conn","bound_xl.ltable.inodelk.%s",
- trav->bound_xl?trav->bound_xl->name:"");
- gf_proc_dump_add_section(key);
-
- list_for_each_entry (locker, &trav->ltable->inodelk_lockers, lockers) {
- count++;
- gf_proc_dump_write("volume", "%s", locker->volume);
- if (locker->fd) {
- gf_proc_dump_write("fd", "%p", locker->fd);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->fd->inode->gfid));
- } else {
- gf_proc_dump_write("fd", "%s", locker->loc.path);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->loc.inode->gfid));
- }
- gf_proc_dump_write("pid", "%d", locker->pid);
- gf_proc_dump_write("lock length", "%d", locker->owner.len);
- lkowner_unparse (&locker->owner, locker_data,
- locker->owner.len);
- gf_proc_dump_write("lock owner", "%s", locker_data);
- memset (locker_data, 0, sizeof (locker_data));
-
- gf_proc_dump_build_key (key, "inode", "%d", count);
- gf_proc_dump_add_section (key);
- if (locker->fd)
- inode_dump (locker->fd->inode, key);
- else
- inode_dump (locker->loc.inode, key);
- }
-
- count = 0;
- locker = NULL;
- gf_proc_dump_build_key(key,
- "conn","bound_xl.ltable.entrylk.%s",
- trav->bound_xl?trav->bound_xl->name:"");
- gf_proc_dump_add_section(key);
-
- list_for_each_entry (locker, &trav->ltable->entrylk_lockers,
- lockers) {
- count++;
- gf_proc_dump_write("volume", "%s", locker->volume);
- if (locker->fd) {
- gf_proc_dump_write("fd", "%p", locker->fd);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->fd->inode->gfid));
- } else {
- gf_proc_dump_write("fd", "%s", locker->loc.path);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->loc.inode->gfid));
- }
- gf_proc_dump_write("pid", "%d", locker->pid);
- gf_proc_dump_write("lock length", "%d", locker->owner.len);
- lkowner_unparse (&locker->owner, locker_data, locker->owner.len);
- gf_proc_dump_write("lock data", "%s", locker_data);
- memset (locker_data, 0, sizeof (locker_data));
-
- gf_proc_dump_build_key (key, "inode", "%d", count);
- gf_proc_dump_add_section (key);
- if (locker->fd)
- inode_dump (locker->fd->inode, key);
- else
- inode_dump (locker->loc.inode, key);
- }
-}
int
server_priv_to_dict (xlator_t *this, dict_t *dict)
@@ -468,104 +353,6 @@ out:
return ret;
}
-int
-server_inode_to_dict (xlator_t *this, dict_t *dict)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[32] = {0,};
- int count = 0;
- int ret = -1;
- xlator_t *prev_bound_xl = NULL;
-
- GF_VALIDATE_OR_GOTO (THIS->name, this, out);
- GF_VALIDATE_OR_GOTO (this->name, dict, out);
-
- conf = this->private;
- if (!conf)
- return -1;
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- return -1;
-
- list_for_each_entry (trav, &conf->conns, list) {
- if (trav->bound_xl && trav->bound_xl->itable) {
- /* Presently every brick contains only one
- * bound_xl for all connections. This will lead
- * to duplicating of the inode lists, if listing
- * is done for every connection. This simple check
- * prevents duplication in the present case. If
- * need arises the check can be improved.
- */
- if (trav->bound_xl == prev_bound_xl)
- continue;
- prev_bound_xl = trav->bound_xl;
-
- memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "conn%d", count);
- inode_table_dump_to_dict (trav->bound_xl->itable,
- key, dict);
- count++;
- }
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = dict_set_int32 (dict, "conncount", count);
-
-out:
- if (prev_bound_xl)
- prev_bound_xl = NULL;
- return ret;
-}
-
-int
-server_inode (xlator_t *this)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 1;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
-
- conf = this->private;
- if (!conf) {
- gf_log (this->name, GF_LOG_WARNING,
- "conf null in xlator");
- return -1;
- }
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- goto out;
-
- list_for_each_entry (trav, &conf->conns, list) {
- ret = pthread_mutex_trylock (&trav->lock);
- if (!ret)
- {
- gf_proc_dump_build_key(key,
- "conn","%d.ltable", i);
- gf_proc_dump_add_section(key);
- ltable_dump (trav);
- i++;
- pthread_mutex_unlock (&trav->lock);
- }else
- continue;
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = 0;
-out:
- if (ret)
- gf_proc_dump_write ("Unable to dump the lock table",
- "(Lock acquisition failed) %s",
- this?this->name:"server");
-
- return ret;
-}
-
static int
get_auth_types (dict_t *this, char *key, data_t *value, void *data)
@@ -708,9 +495,10 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
{
gf_boolean_t detached = _gf_false;
xlator_t *this = NULL;
- rpc_transport_t *xprt = NULL;
- server_connection_t *conn = NULL;
+ rpc_transport_t *trans = NULL;
server_conf_t *conf = NULL;
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
if (!xl || !data) {
gf_log_callingfn ("server", GF_LOG_WARNING,
@@ -719,7 +507,7 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
}
this = xl;
- xprt = data;
+ trans = data;
conf = this->private;
switch (event) {
@@ -727,17 +515,17 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
{
/* Have a structure per new connection */
/* TODO: Should we create anything here at all ? * /
- conn = create_server_conn_state (this, xprt);
- if (!conn)
+ client->conn = create_server_conn_state (this, trans);
+ if (!client->conn)
goto out;
- xprt->protocol_private = conn;
+ trans->protocol_private = client->conn;
*/
- INIT_LIST_HEAD (&xprt->list);
+ INIT_LIST_HEAD (&trans->list);
pthread_mutex_lock (&conf->mutex);
{
- list_add_tail (&xprt->list, &conf->xprt_list);
+ list_add_tail (&trans->list, &conf->xprt_list);
}
pthread_mutex_unlock (&conf->mutex);
@@ -750,50 +538,58 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
*/
pthread_mutex_lock (&conf->mutex);
{
- list_del_init (&xprt->list);
+ list_del_init (&trans->list);
}
pthread_mutex_unlock (&conf->mutex);
- conn = get_server_conn_state (this, xprt);
- if (!conn)
+ client = trans->xl_private;
+ if (!client)
break;
gf_log (this->name, GF_LOG_INFO, "disconnecting connection"
- "from %s", conn->id);
+ "from %s", client->client_uid);
/* If lock self heal is off, then destroy the
conn object, else register a grace timer event */
if (!conf->lk_heal) {
- server_conn_ref (conn);
- server_connection_put (this, conn, &detached);
+ gf_client_ref (client);
+ gf_client_put (client, &detached);
if (detached)
- server_connection_cleanup (this, conn,
- INTERNAL_LOCKS |
- POSIX_LOCKS);
- server_conn_unref (conn);
- } else {
- put_server_conn_state (this, xprt);
- server_connection_cleanup (this, conn, INTERNAL_LOCKS);
+ server_connection_cleanup (this, client,
+ INTERNAL_LOCKS | POSIX_LOCKS);
+ gf_client_unref (client);
+ break;
+ }
+ trans->xl_private = NULL;
+ server_connection_cleanup (this, client, INTERNAL_LOCKS);
- pthread_mutex_lock (&conn->lock);
- {
- if (conn->timer)
- goto unlock;
+ serv_ctx = server_ctx_get (client, this);
- gf_log (this->name, GF_LOG_INFO, "starting a grace "
- "timer for %s", conn->id);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO,
+ "server_ctx_get() failed");
+ goto out;
+ }
+
+ LOCK (&serv_ctx->fdtable_lock);
+ {
+ if (!serv_ctx->grace_timer) {
- conn->timer = gf_timer_call_after (this->ctx,
- conf->grace_tv,
- grace_time_handler,
- conn);
+ gf_log (this->name, GF_LOG_INFO,
+ "starting a grace timer for %s",
+ client->client_uid);
+
+ serv_ctx->grace_timer =
+ gf_timer_call_after (this->ctx,
+ conf->grace_ts,
+ grace_time_handler,
+ client);
}
- unlock:
- pthread_mutex_unlock (&conn->lock);
}
+ UNLOCK (&serv_ctx->fdtable_lock);
break;
case RPCSVC_EVENT_TRANSPORT_DESTROY:
- /*- conn obj has been disassociated from xprt on first
+ /*- conn obj has been disassociated from trans on first
* disconnect.
* conn cleanup and destruction is handed over to
* grace_time_handler or the subsequent handler that 'owns'
@@ -887,14 +683,14 @@ server_init_grace_timer (xlator_t *this, dict_t *options,
ret = dict_get_int32 (options, "grace-timeout", &grace_timeout);
if (!ret)
- conf->grace_tv.tv_sec = grace_timeout;
+ conf->grace_ts.tv_sec = grace_timeout;
else
- conf->grace_tv.tv_sec = 10;
+ conf->grace_ts.tv_sec = 10;
gf_log (this->name, GF_LOG_DEBUG, "Server grace timeout "
- "value = %"PRIu64, conf->grace_tv.tv_sec);
+ "value = %"GF_PRI_SECOND, conf->grace_ts.tv_sec);
- conf->grace_tv.tv_usec = 0;
+ conf->grace_ts.tv_nsec = 0;
ret = 0;
out:
@@ -941,12 +737,6 @@ reconfigure (xlator_t *this, dict_t *options)
}
- /*ret = dict_get_str (options, "statedump-path", &statedump_path);
- if (!ret) {
- gf_path_strip_trailing_slashes (statedump_path);
- GF_FREE (this->ctx->statedump_path);
- this->ctx->statedump_path = gf_strdup (statedump_path);
- }*/
GF_OPTION_RECONF ("statedump-path", statedump_path,
options, path, out);
if (!statedump_path) {
@@ -985,6 +775,15 @@ reconfigure (xlator_t *this, dict_t *options)
(void) rpcsvc_set_allow_insecure (rpc_conf, options);
(void) rpcsvc_set_root_squash (rpc_conf, options);
+
+ ret = rpcsvc_set_outstanding_rpc_limit (rpc_conf, options,
+ RPCSVC_DEFAULT_OUTSTANDING_RPC_LIMIT);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to reconfigure outstanding-rpc-limit");
+ goto out;
+ }
+
list_for_each_entry (listeners, &(rpc_conf->listeners), list) {
if (listeners->trans != NULL) {
if (listeners->trans->reconfigure )
@@ -1001,6 +800,26 @@ out:
return ret;
}
+/*
+ * client_destroy callback of this xlator (wired up through cbks).
+ *
+ * Detaches the context this xlator stored on @client and, if one was
+ * present, releases its fd table, its lock and the context allocation.
+ * Always returns 0.
+ */
+static int32_t
+client_destroy_cbk (xlator_t *this, client_t *client)
+{
+        void         *stored   = NULL;
+        server_ctx_t *serv_ctx = NULL;
+
+        client_ctx_del (client, this, &stored);
+        serv_ctx = stored;
+
+        /* no context was ever attached for this client: nothing to free */
+        if (serv_ctx != NULL) {
+                gf_fd_fdtable_destroy (serv_ctx->fdtable);
+                LOCK_DESTROY (&serv_ctx->fdtable_lock);
+                GF_FREE (serv_ctx);
+        }
+
+        return 0;
+}
+
int
init (xlator_t *this)
{
@@ -1008,6 +827,8 @@ init (xlator_t *this)
server_conf_t *conf = NULL;
rpcsvc_listener_t *listener = NULL;
char *statedump_path = NULL;
+ gf_barrier_t *barrier = NULL;
+ char *str = NULL;
GF_VALIDATE_OR_GOTO ("init", this, out);
if (this->children == NULL) {
@@ -1027,7 +848,6 @@ init (xlator_t *this)
GF_VALIDATE_OR_GOTO(this->name, conf, out);
- INIT_LIST_HEAD (&conf->conns);
INIT_LIST_HEAD (&conf->xprt_list);
pthread_mutex_init (&conf->mutex, NULL);
@@ -1079,12 +899,20 @@ init (xlator_t *this)
/* RPC related */
conf->rpc = rpcsvc_init (this, this->ctx, this->options, 0);
if (conf->rpc == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_ERROR,
"creation of rpcsvc failed");
ret = -1;
goto out;
}
+ ret = rpcsvc_set_outstanding_rpc_limit (conf->rpc, this->options,
+ RPCSVC_DEFAULT_OUTSTANDING_RPC_LIMIT);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to configure outstanding-rpc-limit");
+ goto out;
+ }
+
ret = rpcsvc_create_listeners (conf->rpc, this->options,
this->name);
if (ret < 1) {
@@ -1149,6 +977,37 @@ init (xlator_t *this)
}
}
#endif
+ /* barrier related */
+ barrier = GF_CALLOC (1, sizeof (*barrier),1);
+ if (!barrier) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "WARNING: Failed to allocate barrier");
+ ret = -1;
+ goto out;
+ }
+
+ LOCK_INIT (&barrier->lock);
+ INIT_LIST_HEAD (&barrier->queue);
+ barrier->on = _gf_false;
+
+ GF_OPTION_INIT ("barrier-queue-length", barrier->max_size,
+ int64, out);
+ GF_OPTION_INIT ("barrier-timeout", barrier->time_out,
+ uint64, out);
+
+ ret = dict_get_str (this->options, "barrier-fops", &str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting barrier fops to default value");
+ }
+ ret = gf_barrier_fops_configure (this, barrier, str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "invalid barrier fops specified");
+ goto out;
+ }
+
+ conf->barrier = barrier;
this->private = conf;
ret = 0;
@@ -1202,27 +1061,65 @@ int
notify (xlator_t *this, int32_t event, void *data, ...)
{
int ret = 0;
+ int32_t val = 0;
+ dict_t *dict = NULL;
+ dict_t *output = NULL;
+ va_list ap;
+
+ dict = data;
+ va_start (ap, data);
+ output = va_arg (ap, dict_t*);
+ va_end (ap);
+
switch (event) {
+ case GF_EVENT_VOLUME_BARRIER_OP:
+ ret = dict_get_int32 (dict, "barrier", &val);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Wrong BARRIER event");
+ goto out;
+ }
+ /* !val un-barrier, if val, barrier */
+ if (val) {
+ ret = gf_barrier_start (this);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Barrier start failed");
+ } else {
+ ret = gf_barrier_stop (this);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Barrier stop failed");
+ }
+ ret = dict_set_int32 (output, "barrier-status", ret);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set barrier-status in dict");
+ break;
+
+ /* todo: call default_notify to make other xlators handle it.*/
default:
default_notify (this, event, data);
break;
}
-
+out:
return ret;
}
struct xlator_fops fops;
-struct xlator_cbks cbks;
+struct xlator_cbks cbks = {
+ .client_destroy = client_destroy_cbk,
+};
struct xlator_dumpops dumpops = {
.priv = server_priv,
- .fd = server_fd,
- .inode = server_inode,
+ .fd = gf_client_dump_fdtables,
+ .inode = gf_client_dump_inodes,
.priv_to_dict = server_priv_to_dict,
- .fd_to_dict = server_fd_to_dict,
- .inode_to_dict = server_inode_to_dict,
+ .fd_to_dict = gf_client_dump_fdtables_to_dict,
+ .inode_to_dict = gf_client_dump_inodes_to_dict,
};
@@ -1269,10 +1166,26 @@ struct volume_options options[] = {
{ .key = {"root-squash"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
- .description = "Map requests from uid/gid 0 to the anonymous "
- "uid/gid. Note that this does not apply to any other"
- "uids or gids that might be equally sensitive, such as"
- "user bin or group staff."
+ .description = "Map requests from uid/gid 0 to the anonymous "
+ "uid/gid. Note that this does not apply to any other "
+ "uids or gids that might be equally sensitive, such "
+ "as user bin or group staff."
+ },
+ { .key = {"anonuid"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "65534", /* RPC_NOBODY_UID */
+ .min = 0,
+ .max = (uint32_t) -1,
+ .description = "value of the uid used for the anonymous "
+ "user/nfsnobody when root-squash is enabled."
+ },
+ { .key = {"anongid"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "65534", /* RPC_NOBODY_GID */
+ .min = 0,
+ .max = (uint32_t) -1,
+ .description = "value of the gid used for the anonymous "
+ "user/nfsnobody when root-squash is enabled."
},
{ .key = {"statedump-path"},
.type = GF_OPTION_TYPE_PATH,
@@ -1302,15 +1215,34 @@ struct volume_options options[] = {
{ .key = {"auth.addr.*.allow"},
.type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
.description = "Allow a comma separated list of addresses and/or "
- "hostnames to connect to the server. By default, all"
- " connections are allowed."
+ "hostnames to connect to the server. Option "
+ "auth.reject overrides this option. By default, all "
+ "connections are allowed."
},
{ .key = {"auth.addr.*.reject"},
.type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
.description = "Reject a comma separated list of addresses and/or "
- "hostnames to connect to the server. By default, all"
+ "hostnames to connect to the server. This option "
+ "overrides the auth.allow option. By default, all"
" connections are allowed."
},
-
+ {.key = {"barrier-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "60",
+ .min = 0,
+ .max = 360,
+ .description = "Barrier timeout in seconds",
+ },
+ {.key = {"barrier-queue-length"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "4096",
+ .min = 0,
+ .max = 16384,
+ .description = "Barrier queue length",
+ },
+ {.key = {"barrier-fops"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Allow a comma seperated fop lists",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h
index e7eccfd11..782327d77 100644
--- a/xlators/protocol/server/src/server.h
+++ b/xlators/protocol/server/src/server.h
@@ -13,6 +13,7 @@
#include <pthread.h>
+#include "fd.h"
#include "rpcsvc.h"
#include "fd.h"
@@ -20,69 +21,47 @@
#include "server-mem-types.h"
#include "glusterfs3.h"
#include "timer.h"
+#include "client_t.h"
#define DEFAULT_BLOCK_SIZE 4194304 /* 4MB */
#define DEFAULT_VOLUME_FILE_PATH CONFDIR "/glusterfs.vol"
#define GF_MAX_SOCKET_WINDOW_SIZE (1 * GF_UNIT_MB)
#define GF_MIN_SOCKET_WINDOW_SIZE (0)
-typedef enum {
- INTERNAL_LOCKS = 1,
- POSIX_LOCKS = 2,
-} server_lock_flags_t;
-
-typedef struct _server_state server_state_t;
-
-struct _locker {
- struct list_head lockers;
- char *volume;
- loc_t loc;
- fd_t *fd;
- gf_lkowner_t owner;
- pid_t pid;
+struct _gf_barrier_payload {
+ rpcsvc_request_t *req;
+ struct iovec rsp;
+ call_frame_t *frame;
+ struct iovec *payload;
+ struct iobref *iobref;
+ struct iobuf *iob;
+ int payload_count;
+ gf_boolean_t free_iobref;
+ struct list_head list;
};
-struct _lock_table {
- struct list_head inodelk_lockers;
- struct list_head entrylk_lockers;
+typedef struct _gf_barrier_payload gf_barrier_payload_t;
+
+struct _gf_barrier {
+ gf_lock_t lock;
+ gf_boolean_t on;
+ gf_boolean_t force;
+ size_t cur_size;
+ int64_t max_size;
+ uint64_t fops;
+ gf_timer_t *timer;
+ uint64_t time_out;
+ struct list_head queue;
};
-/* private structure per connection (transport object)
- * used as transport_t->xl_private
- */
-struct _server_connection {
- struct list_head list;
- char *id;
- int ref;
- int bind_ref;
- pthread_mutex_t lock;
- fdtable_t *fdtable;
- struct _lock_table *ltable;
- gf_timer_t *timer;
- xlator_t *bound_xl;
- xlator_t *this;
- uint32_t lk_version;
-};
-
-typedef struct _server_connection server_connection_t;
-
-
-server_connection_t *
-server_connection_get (xlator_t *this, const char *id);
+typedef struct _gf_barrier gf_barrier_t;
-server_connection_t *
-server_connection_put (xlator_t *this, server_connection_t *conn,
- gf_boolean_t *detached);
-
-server_connection_t*
-server_conn_unref (server_connection_t *conn);
-
-server_connection_t*
-server_conn_ref (server_connection_t *conn);
+typedef enum {
+ INTERNAL_LOCKS = 1,
+ POSIX_LOCKS = 2,
+} server_lock_flags_t;
-int
-server_connection_cleanup (xlator_t *this, server_connection_t *conn,
- int32_t flags);
+typedef struct _server_state server_state_t;
int server_null (rpcsvc_request_t *req);
@@ -102,11 +81,12 @@ struct server_conf {
heal is on else off. */
char *conf_dir;
struct _volfile_ctx *volfile;
- struct timeval grace_tv;
+ struct timespec grace_ts;
dict_t *auth_modules;
pthread_mutex_t mutex;
- struct list_head conns;
+ gf_barrier_t *barrier;
struct list_head xprt_list;
+ pthread_t barrier_th;
};
typedef struct server_conf server_conf_t;
@@ -144,11 +124,10 @@ int
resolve_and_resume (call_frame_t *frame, server_resume_fn_t fn);
struct _server_state {
- server_connection_t *conn;
- rpc_transport_t *xprt;
- inode_table_t *itable;
+ rpc_transport_t *xprt;
+ inode_table_t *itable;
- server_resume_fn_t resume_fn;
+ server_resume_fn_t resume_fn;
loc_t loc;
loc_t loc2;
@@ -184,7 +163,7 @@ struct _server_state {
int mask;
char is_revalidate;
dict_t *dict;
- struct gf_flock flock;
+ struct gf_flock flock;
const char *volume;
dir_entry_t *entry;
@@ -192,10 +171,20 @@ struct _server_state {
mode_t umask;
};
+
extern struct rpcsvc_program gluster_handshake_prog;
extern struct rpcsvc_program glusterfs3_3_fop_prog;
extern struct rpcsvc_program gluster_ping_prog;
+
+/* Per-client state of the server xlator; looked up with server_ctx_get (client, this). */
+typedef struct _server_ctx {
+ gf_lock_t fdtable_lock; /* also held while grace_timer is checked/armed on disconnect */
+ fdtable_t *fdtable; /* destroyed with gf_fd_fdtable_destroy () when the client goes away */
+ struct _gf_timer *grace_timer; /* non-NULL once a grace timer has been registered */
+ uint32_t lk_version;
+} server_ctx_t;
+
+
int
server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
struct iovec *payload, int payloadcount,
@@ -204,6 +193,4 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
int gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict);
int gf_server_check_getxattr_cmd (call_frame_t *frame, const char *name);
-void ltable_dump (server_connection_t *conn);
-
#endif /* !_SERVER_H */
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am
index e1316a127..c08e8e41b 100644
--- a/xlators/storage/Makefile.am
+++ b/xlators/storage/Makefile.am
@@ -1,6 +1,7 @@
SUBDIRS = posix
if ENABLE_BD_XLATOR
-SUBDIRS += bd_map
+SUBDIRS += bd
endif
+
CLEANFILES =
diff --git a/xlators/storage/bd/Makefile.am b/xlators/storage/bd/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/storage/bd/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/storage/bd_map/src/Makefile.am b/xlators/storage/bd/src/Makefile.am
index 91412e91d..60ceff31b 100644
--- a/xlators/storage/bd_map/src/Makefile.am
+++ b/xlators/storage/bd/src/Makefile.am
@@ -1,14 +1,13 @@
-
if ENABLE_BD_XLATOR
-xlator_LTLIBRARIES = bd_map.la
+xlator_LTLIBRARIES = bd.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
-bd_map_la_LDFLAGS = -module -avoid-version
+bd_la_LDFLAGS = -module -avoid-version
LIBBD = -llvm2app -lrt
-bd_map_la_SOURCES = bd_map.c bd_map_help.c
-bd_map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD)
+bd_la_SOURCES = bd.c bd-helper.c bd-aio.c
+bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO)
-noinst_HEADERS = bd_map.h bd_map_help.h
+noinst_HEADERS = bd.h bd-aio.h bd-mem-types.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(top_srcdir)/rpc/xdr/src \
diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c
new file mode 100644
index 000000000..9dc13b3ec
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.c
@@ -0,0 +1,528 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author: M. Mohan Kumar <mohan@in.ibm.com>
+
+ Based on posix-aio.c
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <lvm2app.h>
+#include <sys/uio.h>
+
+#include "xlator.h"
+#include "glusterfs.h"
+#include "defaults.h"
+#include "bd.h"
+#include "bd-aio.h"
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#include "bd-mem-types.h"
+
+/* Book-keeping for one in-flight AIO request; allocated at submit, freed by the completion handler. */
+struct bd_aio_cb {
+ struct iocb iocb; /* request handed to io_submit (); iocb.data points back at this struct */
+ call_frame_t *frame; /* frame unwound when the request completes */
+ struct iobuf *iobuf; /* destination buffer (set for reads only) */
+ struct iobref *iobref; /* reference held on the caller's iobref (set for writes only) */
+ struct iatt prebuf; /* attributes copied from the inode context before a write is submitted */
+ int op; /* GF_FOP_READ or GF_FOP_WRITE; selects the completion handler */
+ off_t offset; /* file offset of the request */
+ fd_t *fd;
+};
+
+/*
+ * Bring the O_DIRECT status flag of the backing descriptor (bd_fd->fd) in
+ * line with the upcoming request and remember the result in bd_fd->odirect.
+ *
+ * O_DIRECT is wanted when either the fd or the request carries it
+ * (fd->flags | opflags). Otherwise it is wanted only when both @offset and
+ * @size have their low 12 bits clear, i.e. are multiples of 4096.
+ *
+ * The cached bd_fd->odirect is updated only after fcntl () succeeded, so a
+ * failed switch is retried on the next request instead of being recorded
+ * as done. Failures are logged and otherwise ignored.
+ *
+ * The callers in this file invoke this with fd->lock held.
+ */
+void
+__bd_fd_set_odirect (fd_t *fd, bd_fd_t *bd_fd, int opflags,
+                     off_t offset, size_t size)
+{
+        int odirect  = 0;
+        int flags    = 0;
+        int newflags = 0;
+
+        if ((fd->flags|opflags) & O_DIRECT) {
+                /* if instructed, use O_DIRECT always */
+                odirect = 1;
+        } else {
+                /* else use O_DIRECT when feasible */
+                odirect = ((offset|size) & 0xfff) ? 0 : 1;
+        }
+
+        /* descriptor is already in the wanted mode */
+        if (!odirect == !bd_fd->odirect)
+                return;
+
+        flags = fcntl (bd_fd->fd, F_GETFL);
+        if (flags == -1) {
+                /* do not feed -1 into F_SETFL as if it were a flag word */
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fcntl(F_GETFL) failed (%s). fd=%d odirect=%d",
+                        strerror (errno), bd_fd->fd, bd_fd->odirect);
+                return;
+        }
+
+        newflags = odirect ? (flags | O_DIRECT) : (flags & (~O_DIRECT));
+        if (fcntl (bd_fd->fd, F_SETFL, newflags) == -1) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "fcntl(F_SETFL) failed (%s). fd=%d flags=%d odirect=%d",
+                        strerror (errno), bd_fd->fd, newflags,
+                        bd_fd->odirect);
+                return;
+        }
+
+        bd_fd->odirect = odirect;
+}
+
+/*
+ * Completion handler for an asynchronous read; called from bd_aio_thread ().
+ *
+ * @paiocb: request descriptor filled in by bd_aio_readv (); freed here.
+ * @res:    result from the completion event: byte count on success, a
+ *          negative errno value on failure.
+ * @res2:   second result word of the event; unused.
+ *
+ * Unwinds the readv fop on paiocb->frame, drops the reference on the read
+ * buffer and always returns 0. A read whose inode carries no BD context
+ * is failed with EINVAL rather than dereferencing a NULL attribute
+ * pointer.
+ */
+int
+bd_aio_readv_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t   *frame    = NULL;
+        xlator_t       *this     = NULL;
+        struct iobuf   *iobuf    = NULL;
+        struct iatt     postbuf  = {0,};
+        int             op_ret   = -1;
+        int             op_errno = 0;
+        struct iovec    iov      = {0,};
+        struct iobref  *iobref   = NULL;
+        off_t           offset   = 0;
+        bd_attr_t      *bdatt    = NULL;
+
+        frame = paiocb->frame;
+        this = frame->this;
+        iobuf = paiocb->iobuf;
+        offset = paiocb->offset;
+
+        if (res < 0) {
+                op_ret = -1;
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readv(async) failed fd=%p,size=%lu,offset=%llu (%d/%s)",
+                        paiocb->fd, paiocb->iocb.u.c.nbytes,
+                        (unsigned long long) paiocb->offset,
+                        res, strerror (op_errno));
+                goto out;
+        }
+
+        if (bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt) || !bdatt) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "readv(async): no bd inode context for fd=%p",
+                        paiocb->fd);
+                goto out;
+        }
+        memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_ret = res;
+        op_errno = 0;
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        iobref_add (iobref, iobuf);
+
+        iov.iov_base = iobuf_ptr (iobuf);
+        iov.iov_len = op_ret;
+
+        /* Hack to notify higher layers of EOF. */
+        if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
+                op_errno = ENOENT;
+
+out:
+        /* iov is zero-initialised, so error paths never expose stack junk */
+        STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1,
+                             &postbuf, iobref, NULL);
+        if (iobuf)
+                iobuf_unref (iobuf);
+        if (iobref)
+                iobref_unref (iobref);
+
+        GF_FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * readv fop implemented with Linux AIO.
+ *
+ * When @fd carries no BD fd context the call is wound to the first child
+ * unchanged. Otherwise a buffer of @size bytes and a request descriptor
+ * are allocated and the read is queued with io_submit ();
+ * bd_aio_readv_complete () unwinds the frame once the kernel reports
+ * completion.
+ *
+ * Any failure before or during submission unwinds the frame here with
+ * op_ret -1: EINVAL for a zero @size or failed argument validation, ENOMEM
+ * for allocation failures, and the io_submit () error otherwise.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
+              size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+        int32_t            op_errno = EINVAL;
+        int                _fd      = -1;
+        struct iobuf      *iobuf    = NULL;
+        bd_fd_t           *bd_fd    = NULL;
+        int                ret      = -1;
+        struct bd_aio_cb  *paiocb   = NULL;
+        bd_priv_t         *priv     = NULL;
+        struct iocb       *iocb     = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->readv, fd, size, offset,
+                            flags, xdata);
+                return 0;
+        }
+        _fd = bd_fd->fd;
+
+        if (!size) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+                goto err;
+        }
+
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_bd_aio_cb);
+        if (!paiocb) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb->frame = frame;
+        paiocb->iobuf = iobuf;
+        paiocb->offset = offset;
+        paiocb->op = GF_FOP_READ;
+        paiocb->fd = fd;
+
+        /* iocb.data is how bd_aio_thread () finds this descriptor again */
+        paiocb->iocb.data = paiocb;
+        paiocb->iocb.aio_fildes = _fd;
+        paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD;
+        paiocb->iocb.aio_reqprio = 0;
+        paiocb->iocb.u.c.buf = iobuf_ptr (iobuf);
+        paiocb->iocb.u.c.nbytes = size;
+        paiocb->iocb.u.c.offset = offset;
+
+        iocb = &paiocb->iocb;
+
+        /* switch O_DIRECT and submit under one lock so another request on
+         * the same fd cannot change the mode in between */
+        LOCK (&fd->lock);
+        {
+                __bd_fd_set_odirect (fd, bd_fd, flags, offset, size);
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                /* negative return is -errno; never unwind with errno 0 */
+                op_errno = (ret < 0) ? -ret : EIO;
+                goto err;
+        }
+
+        return 0;
+err:
+        STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        if (paiocb)
+                GF_FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * Completion handler for an asynchronous write; called from
+ * bd_aio_thread ().
+ *
+ * @paiocb: request descriptor filled in by bd_aio_writev (); freed here
+ *          together with the iobref reference it holds.
+ * @res:    result from the completion event: byte count on success, a
+ *          negative errno value on failure.
+ * @res2:   second result word of the event; unused.
+ *
+ * On success the mtime kept in the inode's BD context is refreshed and
+ * returned as the post-op attributes. A missing BD inode context fails the
+ * fop with EINVAL instead of dereferencing a NULL attribute pointer.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_writev_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+        call_frame_t   *frame    = NULL;
+        xlator_t       *this     = NULL;
+        struct iatt     prebuf   = {0,};
+        struct iatt     postbuf  = {0,};
+        int             op_ret   = -1;
+        int             op_errno = 0;
+        bd_attr_t      *bdatt    = NULL;
+
+        frame = paiocb->frame;
+        prebuf = paiocb->prebuf;
+        this = frame->this;
+
+        if (res < 0) {
+                op_ret = -1;
+                op_errno = -res;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "writev(async) failed fd=%p,offset=%llu (%d/%s)",
+                        paiocb->fd, (unsigned long long) paiocb->offset, res,
+                        strerror (op_errno));
+
+                goto out;
+        }
+
+        if (bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt) || !bdatt) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "writev(async): no bd inode context for fd=%p",
+                        paiocb->fd);
+                goto out;
+        }
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+        memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_ret = res;
+        op_errno = 0;
+
+out:
+        STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf,
+                             NULL);
+
+        /* paiocb was dereferenced above, so it is known to be non-NULL */
+        if (paiocb->iobref)
+                iobref_unref (paiocb->iobref);
+        GF_FREE (paiocb);
+
+        return 0;
+}
+
+/*
+ * writev fop implemented with Linux AIO.
+ *
+ * When @fd carries no BD fd context the call is wound to the first child
+ * unchanged. Otherwise the current attributes from the inode's BD context
+ * are saved as pre-op attributes, a reference is taken on @iobref, and the
+ * vectored write is queued with io_submit (); bd_aio_writev_complete ()
+ * unwinds the frame on completion.
+ *
+ * Failures before or during submission unwind the frame here with op_ret
+ * -1: EINVAL for failed argument validation or a missing BD inode context,
+ * ENOMEM for allocation failure, and the io_submit () error otherwise.
+ *
+ * Always returns 0.
+ */
+int
+bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+               struct iovec *iov, int count, off_t offset, uint32_t flags,
+               struct iobref *iobref, dict_t *xdata)
+{
+        int32_t            op_errno = EINVAL;
+        int                _fd      = -1;
+        bd_fd_t           *bd_fd    = NULL;
+        int                ret      = -1;
+        struct bd_aio_cb  *paiocb   = NULL;
+        bd_priv_t         *priv     = NULL;
+        struct iocb       *iocb     = NULL;
+        bd_attr_t         *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        priv = this->private;
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_writev_cbk,
+                            FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+                            fd, iov, count, offset, flags, iobref, xdata);
+                return 0;
+        }
+
+        /* prebuf is copied out of bdatt below; bail out before allocating
+         * anything if the inode has no BD context */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) || !bdatt) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "no bd inode context for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        _fd = bd_fd->fd;
+
+        paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_bd_aio_cb);
+        if (!paiocb) {
+                op_errno = ENOMEM;
+                goto err;
+        }
+
+        paiocb->frame = frame;
+        paiocb->offset = offset;
+        paiocb->op = GF_FOP_WRITE;
+        paiocb->fd = fd;
+
+        /* iocb.data is how bd_aio_thread () finds this descriptor again */
+        paiocb->iocb.data = paiocb;
+        paiocb->iocb.aio_fildes = _fd;
+        paiocb->iobref = iobref_ref (iobref);
+        paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV;
+        paiocb->iocb.aio_reqprio = 0;
+        paiocb->iocb.u.v.vec = iov;
+        paiocb->iocb.u.v.nr = count;
+        paiocb->iocb.u.v.offset = offset;
+
+        iocb = &paiocb->iocb;
+
+        memcpy (&paiocb->prebuf, &bdatt->iatt, sizeof (struct iatt));
+
+        /* switch O_DIRECT and submit under one lock so another request on
+         * the same fd cannot change the mode in between */
+        LOCK (&fd->lock);
+        {
+                __bd_fd_set_odirect (fd, bd_fd, flags, offset,
+                                     iov_length (iov, count));
+
+                ret = io_submit (priv->ctxp, 1, &iocb);
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "io_submit() returned %d", ret);
+                /* negative return is -errno; never unwind with errno 0 */
+                op_errno = (ret < 0) ? -ret : EIO;
+                goto err;
+        }
+
+        return 0;
+err:
+        STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0);
+
+        if (paiocb) {
+                if (paiocb->iobref)
+                        iobref_unref (paiocb->iobref);
+                GF_FREE (paiocb);
+        }
+
+        return 0;
+}
+
+/*
+ * Body of the AIO completion thread started by bd_aio_init ().
+ *
+ * @data is the xlator. The thread loops on io_getevents (), asking for at
+ * least one and at most BD_AIO_MAX_NR_GETEVENTS events with a 5 second
+ * timeout, and hands each reaped event to the read or write completion
+ * handler according to the op recorded in its request descriptor.
+ *
+ * The loop is left, and NULL returned, only when io_getevents () fails
+ * with something other than -EINTR.
+ */
+void *
+bd_aio_thread (void *data)
+{
+        xlator_t          *this   = data;
+        bd_priv_t         *priv   = NULL;
+        struct bd_aio_cb  *done   = NULL;
+        struct io_event    events[BD_AIO_MAX_NR_GETEVENTS];
+        struct timespec    ts     = {0, };
+        int                reaped = 0;
+        int                idx    = 0;
+
+        THIS = this;
+        priv = this->private;
+
+        ts.tv_sec = 5;
+
+        while (1) {
+                memset (events, 0, sizeof (events));
+                reaped = io_getevents (priv->ctxp, 1, BD_AIO_MAX_NR_GETEVENTS,
+                                       events, &ts);
+
+                /* interrupted wait: just try again */
+                if (reaped == -EINTR)
+                        continue;
+
+                if (reaped < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "io_getevents() returned %d, exiting", reaped);
+                        break;
+                }
+
+                for (idx = 0; idx < reaped; idx++) {
+                        /* data was set to the request descriptor at submit */
+                        done = events[idx].data;
+
+                        if (done->op == GF_FOP_READ) {
+                                bd_aio_readv_complete (done, events[idx].res,
+                                                       events[idx].res2);
+                        } else if (done->op == GF_FOP_WRITE) {
+                                bd_aio_writev_complete (done, events[idx].res,
+                                                        events[idx].res2);
+                        } else {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "unknown op %d found in piocb",
+                                        done->op);
+                        }
+                }
+        }
+
+        return NULL;
+}
+
+/*
+ * Create the kernel AIO context and start the completion thread.
+ *
+ * priv->aio_capable is set to true only when both steps succeeded, and in
+ * that case the AIO readv/writev fops are installed.
+ *
+ * Returns 0 on success and also when the kernel has no AIO support
+ * (ENOSYS); in the latter case aio_capable stays false so the synchronous
+ * fops remain in place, as the log message promises. Returns the negative
+ * io_setup () result or the pthread_create () error code otherwise.
+ */
+int
+bd_aio_init (xlator_t *this)
+{
+        bd_priv_t *priv = NULL;
+        int        ret  = 0;
+
+        priv = this->private;
+        priv->aio_capable = _gf_false;
+
+        ret = io_setup (BD_AIO_MAX_NR_EVENTS, &priv->ctxp);
+        if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Linux AIO not available at run-time."
+                        " Continuing with synchronous IO");
+                ret = 0;
+                goto out;
+        }
+
+        if (ret < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "io_setup() failed. ret=%d, errno=%d",
+                        ret, errno);
+                goto out;
+        }
+
+        ret = pthread_create (&priv->aiothread, NULL,
+                              bd_aio_thread, this);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "could not create AIO completion thread (%s)",
+                        strerror (ret));
+                io_destroy (priv->ctxp);
+                goto out;
+        }
+
+        priv->aio_capable = _gf_true;
+        this->fops->readv = bd_aio_readv;
+        this->fops->writev = bd_aio_writev;
+out:
+        return ret;
+}
+
+
+/*
+ * Switch the xlator to asynchronous readv/writev.
+ *
+ * The AIO machinery is initialised on the first call only
+ * (priv->aio_init_done); later calls simply re-install the AIO fops when
+ * that initialisation found AIO usable.
+ *
+ * Returns the result of bd_aio_init () on the first call and 0 afterwards.
+ */
+int
+bd_aio_on (xlator_t *this)
+{
+        bd_priv_t *priv = NULL;
+        int        ret  = 0;
+
+        priv = this->private;
+
+        if (!priv->aio_init_done) {
+                /* bd_aio_init () records usability in priv->aio_capable;
+                 * a 0 return alone does not mean AIO is available */
+                ret = bd_aio_init (this);
+                priv->aio_init_done = _gf_true;
+        }
+
+        if (priv->aio_capable) {
+                this->fops->readv = bd_aio_readv;
+                this->fops->writev = bd_aio_writev;
+        }
+
+        return ret;
+}
+
+/*
+ * Switch the xlator back to the synchronous readv/writev implementations.
+ * Always returns 0.
+ */
+int
+bd_aio_off (xlator_t *this)
+{
+        struct xlator_fops *fops = this->fops;
+
+        fops->readv  = bd_readv;
+        fops->writev = bd_writev;
+
+        return 0;
+}
+
+#else
+
+/*
+ * Variant compiled when HAVE_LIBAIO is not defined: AIO cannot be enabled,
+ * so this only logs that synchronous IO stays in use. Always returns 0.
+ */
+int
+bd_aio_on (xlator_t *this)
+{
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+/*
+ * Variant compiled when HAVE_LIBAIO is not defined: there is nothing to
+ * switch off, so this only logs the same notice. Always returns 0.
+ */
+int
+bd_aio_off (xlator_t *this)
+{
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+/*
+ * Variant compiled when HAVE_LIBAIO is not defined: all arguments are
+ * ignored and the descriptor flags are left untouched; only a notice is
+ * logged against the current xlator (THIS).
+ */
+void
+__bd_fd_set_odirect (fd_t *fd, struct bd_fd *pfd, int opflags,
+ off_t offset, size_t size)
+{
+ xlator_t *this = THIS;
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return;
+}
+#endif
diff --git a/xlators/storage/bd/src/bd-aio.h b/xlators/storage/bd/src/bd-aio.h
new file mode 100644
index 000000000..16f686a4c
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.h
@@ -0,0 +1,41 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _BD_AIO_H
+#define _BD_AIO_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "glusterfs.h"
+
+/*
+ * Maximum number of concurrently submitted IO events. The heaviest load
+ * GlusterFS has been able to handle had 60-80 concurrent calls
+ */
+#define BD_AIO_MAX_NR_EVENTS 256
+
+/* Maximum number of completed IO operations to reap per getevents syscall */
+#define BD_AIO_MAX_NR_GETEVENTS 16
+
+int bd_aio_on (xlator_t *this);
+int bd_aio_off (xlator_t *this);
+
+int bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+int bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata);
+
+#endif /* !_BD_AIO_H */
diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c
new file mode 100644
index 000000000..d598e5755
--- /dev/null
+++ b/xlators/storage/bd/src/bd-helper.c
@@ -0,0 +1,1021 @@
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include "bd.h"
+#include "bd-mem-types.h"
+#include "run.h"
+#include "lvm-defaults.h"
+
+/*
+ * Stores the bd_attr_t pointer @ctx as this translator's context value on
+ * @inode.  Returns the result of inode_ctx_set(), or -1 when @inode or
+ * @ctx is rejected by validation.
+ */
+int
+bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx)
+{
+        uint64_t value  = 0;
+        int      status = -1;
+
+        GF_VALIDATE_OR_GOTO (this->name, inode, done);
+        GF_VALIDATE_OR_GOTO (this->name, ctx, done);
+
+        /* the context slot holds a 64-bit integer, so the pointer is
+         * stored as one */
+        value  = (long)ctx;
+        status = inode_ctx_set (inode, this, &value);
+done:
+        return status;
+}
+
+/*
+ * Fetches this translator's context value from @inode and, when @ctx is
+ * not NULL, hands it back as a bd_attr_t pointer.  Returns the result of
+ * inode_ctx_get(), or -1 when @inode is rejected by validation; *ctx is
+ * only written on success.
+ */
+int
+bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx)
+{
+        uint64_t value  = 0;
+        int      status = -1;
+
+        GF_VALIDATE_OR_GOTO (this->name, inode, done);
+
+        status = inode_ctx_get (inode, this, &value);
+        if (!status && ctx)
+                *ctx = (bd_attr_t *) value;
+done:
+        return status;
+}
+
+/*
+ * Releases everything a bd_local_t holds (fd or loc, dict, inode and the
+ * bd_attr_t together with its type string) and returns the local to its
+ * memory pool.  A NULL @local is ignored.
+ */
+void
+bd_local_free (xlator_t *this, bd_local_t *local)
+{
+        bd_attr_t *attr = NULL;
+
+        if (local == NULL)
+                return;
+
+        /* the loc is only wiped when no fd is held */
+        if (local->fd != NULL) {
+                fd_unref (local->fd);
+        } else if (local->loc.path != NULL) {
+                loc_wipe (&local->loc);
+        }
+
+        if (local->dict != NULL)
+                dict_unref (local->dict);
+
+        if (local->inode != NULL)
+                inode_unref (local->inode);
+
+        attr = local->bdatt;
+        if (attr != NULL) {
+                GF_FREE (attr->type);
+                GF_FREE (attr);
+        }
+
+        mem_put (local);
+}
+
+/*
+ * Takes a zeroed bd_local_t from this->local_pool, attaches it to
+ * frame->local and returns it (NULL when the pool allocation fails).
+ */
+bd_local_t *
+bd_local_init (call_frame_t *frame, xlator_t *this)
+{
+        bd_local_t *local = mem_get0 (this->local_pool);
+
+        frame->local = local;
+
+        return local;
+}
+
+/*
+ * VGs are tagged in GF_XATTR_VOL_ID_KEY:<uuid> format.
+ * This function validates that tag against the "volume-id" option from
+ * the volume file.  It also walks the LV list to find out whether a
+ * thin-pool is configured; when one is found its name is stored in
+ * priv->pool and BD_CAPS_THIN is set in priv->caps.
+ *
+ * Returns 0 when no "volume-id" option is set or the tag matches it,
+ * ENOENT when the VG cannot be opened, and -1 when the tag is absent,
+ * malformed or does not match.
+ */
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv)
+{
+        vg_t                  brick      = NULL;
+        data_t               *tmp_data   = NULL;
+        struct dm_list       *tags       = NULL;
+        int                   op_ret     = -1;
+        uuid_t                dict_uuid  = {0, };
+        uuid_t                vg_uuid    = {0, };
+        gf_boolean_t          uuid       = _gf_false;
+        lvm_str_list_t       *strl       = NULL;
+        struct dm_list       *lv_dm_list = NULL;
+        lv_list_t            *lv_list    = NULL;
+        struct dm_list       *dm_seglist = NULL;
+        lvseg_list_t         *seglist    = NULL;
+        lvm_property_value_t  prop       = {0, };
+        gf_boolean_t          thin       = _gf_false;
+        const char           *lv_name    = NULL;
+
+        brick = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!brick) {
+                gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        lv_dm_list = lvm_vg_list_lvs (brick);
+        if (!lv_dm_list)
+                goto check;
+
+        /*
+         * NOTE(review): the break below only leaves the segment loop; if a
+         * later LV also has a thin-pool segment, priv->pool is overwritten
+         * without releasing the earlier copy.
+         */
+        dm_list_iterate_items (lv_list, lv_dm_list) {
+                dm_seglist = lvm_lv_list_lvsegs (lv_list->lv);
+                if (!dm_seglist)
+                        continue;
+                dm_list_iterate_items (seglist, dm_seglist) {
+                        prop = lvm_lvseg_get_property (seglist->lvseg,
+                                                       "segtype");
+                        if (!prop.is_valid || !prop.value.string)
+                                continue;
+                        if (!strcmp (prop.value.string, "thin-pool")) {
+                                thin = _gf_true;
+                                lv_name = lvm_lv_get_name (lv_list->lv);
+                                priv->pool = gf_strdup (lv_name);
+                                gf_log (THIS->name, GF_LOG_INFO, "Thin Pool "
+                                        "\"%s\" will be used for thin LVs",
+                                        lv_name);
+                                break;
+                        }
+                }
+        }
+
+check:
+        /* If there is no volume-id set in dict, we cant validate */
+        tmp_data = dict_get (this->options, "volume-id");
+        if (!tmp_data) {
+                op_ret = 0;
+                goto out;
+        }
+
+        op_ret = uuid_parse (tmp_data->data, dict_uuid);
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong volume-id (%s) set in volume file",
+                        tmp_data->data);
+                op_ret = -1;
+                goto out;
+        }
+
+        tags = lvm_vg_get_tags (brick);
+        if (!tags) { /* no tags in the VG */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Extended attribute trusted.glusterfs."
+                        "volume-id is absent");
+                op_ret = -1;
+                goto out;
+        }
+        dm_list_iterate_items (strl, tags) {
+                if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY,
+                              strlen (GF_XATTR_VOL_ID_KEY))) {
+                        uuid = _gf_true;
+                        break;
+                }
+        }
+        /* UUID tag is not set in VG */
+        if (!uuid) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Extended attribute trusted.glusterfs."
+                        "volume-id is absent");
+                op_ret = -1;
+                goto out;
+        }
+
+        /* skip "<key>" and the separator that follows it */
+        op_ret = uuid_parse (strl->str + strlen (GF_XATTR_VOL_ID_KEY) + 1,
+                             vg_uuid);
+        if (op_ret < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "wrong volume-id (%s) set in VG", strl->str);
+                op_ret = -1;
+                goto out;
+        }
+        if (uuid_compare (dict_uuid, vg_uuid)) {
+                /* vg_uuid is a binary uuid_t, not a C string: format it
+                 * before handing it to a %s conversion */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "mismatching volume-id (%s) received. "
+                        "already is a part of volume %s ",
+                        tmp_data->data, uuid_utoa (vg_uuid));
+                op_ret = -1;
+                goto out;
+        }
+
+        op_ret = 0;
+
+out:
+        lvm_vg_close (brick);
+
+        if (!thin)
+                gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in "
+                        "VG %s\n", priv->vg);
+        else
+                priv->caps |= BD_CAPS_THIN;
+
+        return op_ret;
+}
+
+/*
+ * Allocates a zeroed buffer of @size bytes plus ALIGN_SIZE of slack and
+ * stores in *aligned_buf the GF_ALIGN_BUF()-aligned address inside it.
+ *
+ * Returns the raw allocation, which is the pointer to hand to GF_FREE()
+ * later (not the aligned address), or NULL when the allocation fails.
+ * On failure *aligned_buf is set to NULL as well.
+ *
+ * FIXME: Move this code to common place, so posix and bd xlator can use
+ */
+char *
+page_aligned_alloc (size_t size, char **aligned_buf)
+{
+        char *alloc_buf = NULL;
+
+        alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char);
+        if (!alloc_buf) {
+                /* bd_clone() tests the aligned pointer, so never leave a
+                 * stale value from an earlier call behind */
+                *aligned_buf = NULL;
+                return NULL;
+        }
+
+        /* page aligned buffer */
+        *aligned_buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+
+        return alloc_buf;
+}
+
+/*
+ * Looks up, or creates on first use, the bd_fd_t attached to @fd.
+ * bd_fd_ctx_get() calls this with fd->lock held.
+ *
+ * When the fd's inode is not a regular file or has no BD inode context,
+ * 0 is returned and *bdfd_p is left untouched.  Otherwise the LV device
+ * /dev/<vg>/<gfid> is opened read-write and the new bd_fd_t is stored in
+ * the fd context.
+ *
+ * Returns 0 on success, the open() errno when the device cannot be
+ * opened, or another non-zero value on allocation or context failures.
+ */
+static int
+__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p)
+{
+        int         ret      = -1;
+        int         _fd      = -1;
+        char       *devpath  = NULL;
+        bd_fd_t    *bdfd     = NULL;
+        uint64_t    tmp_bdfd = 0;
+        bd_priv_t  *priv     = this->private;
+        bd_gfid_t   gfid     = {0, };
+        bd_attr_t  *bdatt    = NULL;
+
+        /* not bd file */
+        if (fd->inode->ia_type != IA_IFREG ||
+            bd_inode_ctx_get (fd->inode, this, &bdatt))
+                return 0;
+
+        ret = __fd_ctx_get (fd, this, &tmp_bdfd);
+        if (ret == 0) {
+                bdfd = (void *)(long) tmp_bdfd;
+                *bdfd_p = bdfd;
+                return 0;
+        }
+
+        uuid_utoa_r (fd->inode->gfid, gfid);
+        gf_asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid);
+        if (!devpath)
+                goto out;
+
+        _fd = open (devpath, O_RDWR | O_LARGEFILE, 0);
+        if (_fd < 0) {
+                ret = errno;
+                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+                        strerror (ret));
+                goto out;
+        }
+        bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+        BD_VALIDATE_MEM_ALLOC (bdfd, ret, out);
+
+        bdfd->fd = _fd;
+        bdfd->flag = O_RDWR | O_LARGEFILE;
+        /* ret still holds the __fd_ctx_get() failure here, so a failed
+         * set is reported as an error through the cleanup below */
+        if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set the fd context fd=%p", fd);
+                goto out;
+        }
+
+        *bdfd_p = bdfd;
+
+        ret = 0;
+out:
+        GF_FREE (devpath);
+        if (ret) {
+                /* the device may never have been opened on this path */
+                if (_fd >= 0)
+                        close (_fd);
+                GF_FREE (bdfd);
+        }
+        return ret;
+}
+
+/*
+ * Locked wrapper around __bd_fd_ctx_get(): takes fd->lock for the
+ * duration of the lookup and returns its result.
+ */
+int
+bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd)
+{
+        int status = 0;
+
+        /* FIXME: Is it ok to fd->lock here ? */
+        LOCK (&fd->lock);
+        status = __bd_fd_ctx_get (this, fd, bdfd);
+        UNLOCK (&fd->lock);
+
+        return status;
+}
+
+/*
+ * Validates the BD xattr value @bd ("<type>" or "<type>:<size>") against
+ * the LV named after @uuid.  @bd is modified in place: it is cut at the
+ * last ':' so that only the type remains.
+ *
+ * On a recognised type, *type receives a gf_strdup()ed copy of it, which
+ * is set before the LV checks and therefore also on later failures.
+ *
+ * Returns  0 if the LV exists and its size matches the xattr size (also
+ *            returned when the device path cannot be allocated),
+ *         -1 if the type is unknown or the LV does not exist,
+ *          1 if the LV exists but its size differs; *lv_size then holds
+ *            the actual LV size.
+ */
+int
+bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+                      uint64_t *lv_size, uuid_t uuid)
+{
+        char        *path  = NULL;
+        int          ret   = -1;
+        bd_gfid_t    gfid  = {0, };
+        bd_priv_t   *priv  = this->private;
+        struct stat  stbuf = {0, };
+        uint64_t     size  = 0;
+        vg_t         vg    = NULL;
+        lv_t         lv    = NULL;
+        char        *bytes = NULL;
+
+        bytes = strrchr (bd, ':');
+        if (bytes) {
+                *bytes = '\0';
+                bytes++;
+                gf_string2bytesize (bytes, &size);
+        }
+
+        if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "invalid xattr %s", bd);
+                return -1;
+        }
+        *type = gf_strdup (bd);
+
+        /*
+         * Check if LV really exist, there could be a failure
+         * after setxattr and successful LV creation
+         */
+        uuid_utoa_r (uuid, gfid);
+        gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid);
+        if (!path) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "insufficient memory");
+                return 0;
+        }
+
+        /* Destination file does not exist */
+        if (stat (path, &stbuf)) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "lstat failed for path %s", path);
+                /* leave through the common exit so that path is freed */
+                ret = -1;
+                goto out;
+        }
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+        if (!vg) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "VG %s does not exist?", priv->vg);
+                ret = -1;
+                goto out;
+        }
+
+        lv = lvm_lv_from_name (vg, gfid);
+        if (!lv) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "LV %s does not exist", gfid);
+                ret = -1;
+                goto out;
+        }
+
+        *lv_size = lvm_lv_get_size (lv);
+        if (size == *lv_size) {
+                ret = 0;
+                goto out;
+        }
+
+        ret = 1;
+
+out:
+        if (vg)
+                lvm_vg_close (vg);
+
+        GF_FREE (path);
+        return ret;
+}
+
+/*
+ * Creates the thin LV @lv with a virtual size of @extent bytes in pool
+ * @pool of VG @vg by running LVM_CREATE.  The command's own status is not
+ * inspected; success is judged by /dev/<vg>/<lv> existing afterwards.
+ *
+ * Returns 0 on success, ENOMEM when the device path cannot be built and
+ * EAGAIN when the device node is missing after the command ran.
+ */
+static int
+create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent)
+{
+        int          ret    = -1;
+        runner_t     runner = {0, };
+        char        *path   = NULL;
+        struct stat  stbuf  = {0, };
+
+        runinit (&runner);
+        runner_add_args (&runner, LVM_CREATE, NULL);
+        runner_add_args (&runner, "--thin", NULL);
+        runner_argprintf (&runner, "%s/%s", vg, pool);
+        runner_add_args (&runner, "--name", NULL);
+        runner_argprintf (&runner, "%s", lv);
+        runner_add_args (&runner, "--virtualsize", NULL);
+        /* extent is 64 bits wide on every platform; print it as such
+         * instead of relying on long having the same width */
+        runner_argprintf (&runner, "%lluB", (unsigned long long) extent);
+        runner_start (&runner);
+        runner_end (&runner);
+
+        gf_asprintf (&path, "/dev/%s/%s", vg, lv);
+        if (!path) {
+                ret = ENOMEM;
+                goto out;
+        }
+        if (lstat (path, &stbuf) < 0)
+                ret = EAGAIN;
+        else
+                ret = 0;
+out:
+        GF_FREE (path);
+        return ret;
+}
+
+/*
+ * Creates the LV named after @uuid with @size bytes in priv->vg.
+ * BD_THIN types are created through create_thin_lv(); every other type
+ * becomes a linear LV.
+ *
+ * Returns 0 on success, ENOENT when the VG cannot be opened, otherwise a
+ * non-zero error value.
+ */
+int
+bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv)
+{
+        int        ret  = 0;
+        vg_t       vg   = NULL;
+        bd_gfid_t  gfid = {0, };
+
+        uuid_utoa_r (uuid, gfid);
+
+        if (!strcmp (type, BD_THIN))
+                return create_thin_lv (priv->vg, priv->pool, gfid,
+                                       size);
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        if (!lvm_vg_create_lv_linear (vg, gfid, size)) {
+                /* pick up errno before logging can overwrite it, and
+                 * never report success when the library left it at 0 */
+                ret = errno ? errno : EIO;
+                gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear "
+                        "failed");
+        }
+
+        lvm_vg_close (vg);
+
+        return ret;
+}
+
+/*
+ * Resizes the LV named after @uuid to @size bytes by running LVM_RESIZE,
+ * then re-reads the LV size to verify the result.
+ *
+ * Returns 0 when the new size matches, EAGAIN when the VG cannot be
+ * opened, and EIO when the LV is missing or has a different size.
+ */
+int32_t
+bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size)
+{
+        bd_gfid_t  lv_name = {0, };
+        runner_t   cmd     = {0, };
+        vg_t       vg      = NULL;
+        lv_t       lv      = NULL;
+        uint64_t   actual  = 0;
+        int        status  = 0;
+
+        uuid_utoa_r (uuid, lv_name);
+
+        /* LVM_RESIZE <vg>/<lv> -L<size>b -f */
+        runinit (&cmd);
+        runner_add_args (&cmd, LVM_RESIZE, NULL);
+        runner_argprintf (&cmd, "%s/%s", priv->vg, lv_name);
+        runner_argprintf (&cmd, "-L%ldb", size);
+        runner_add_args (&cmd, "-f", NULL);
+        runner_start (&cmd);
+        runner_end (&cmd);
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (vg == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return EAGAIN;
+        }
+
+        lv = lvm_lv_from_name (vg, lv_name);
+        if (lv == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found",
+                        lv_name);
+                status = EIO;
+                goto close_vg;
+        }
+
+        actual = lvm_lv_get_size (lv);
+        if (actual != size) {
+                gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %ld does "
+                        "not match requested size %ld", actual, size);
+                status = EIO;
+        }
+
+close_vg:
+        lvm_vg_close (vg);
+        return status;
+}
+
+/*
+ * Returns the extent size of priv->vg as reported by
+ * lvm_vg_get_extent_size(), or 0 when the VG cannot be opened.
+ */
+uint64_t
+bd_get_default_extent (bd_priv_t *priv)
+{
+        uint64_t extent = 0;
+        vg_t     handle = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+
+        if (handle == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                return 0;
+        }
+
+        extent = lvm_vg_get_extent_size (handle);
+        lvm_vg_close (handle);
+
+        return extent;
+}
+
+/*
+ * Rounds the user specified @size up to a whole number of VG extents.
+ * Returns 0 when the extent size cannot be determined.
+ */
+uint64_t
+bd_adjust_size (bd_priv_t *priv, uint64_t size)
+{
+        uint64_t unit   = bd_get_default_extent (priv);
+        uint64_t chunks = 0;
+
+        if (unit == 0)
+                return 0;
+
+        /* one more extent when size is not an exact multiple */
+        chunks = size / unit + ((size % unit) ? 1 : 0);
+
+        return unit * chunks;
+}
+
+/*
+ * Removes the LV @lv_name from priv->vg.
+ *
+ * Returns the lvm_vg_remove_lv() result on the removal path, or -1 when
+ * the VG cannot be opened or the LV does not exist.  *op_errno is reset
+ * to 0 on entry and set to ENOENT for a missing VG/LV, or to an errno
+ * value when the removal itself fails.
+ */
+int
+bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno)
+{
+        vg_t  vg          = NULL;
+        lv_t  lv          = NULL;
+        int   ret         = -1;
+        int   saved_errno = 0;
+
+        *op_errno = 0;
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (!vg) {
+                gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                *op_errno = ENOENT;
+                return -1;
+        }
+        lv = lvm_lv_from_name (vg, lv_name);
+        if (!lv) {
+                gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name);
+                *op_errno = ENOENT;
+                goto out;
+        }
+        ret = lvm_vg_remove_lv (lv);
+        if (ret < 0) {
+                /* read errno before logging can overwrite it; a failure
+                 * must not be reported with an errno of 0 */
+                saved_errno = errno;
+                gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed",
+                        lv_name);
+                *op_errno = saved_errno ? saved_errno : EIO;
+                goto out;
+        }
+out:
+        lvm_vg_close (vg);
+
+        return ret;
+}
+
+/*
+ * Stamps @iatt with the current CLOCK_REALTIME time: the access time when
+ * GF_SET_ATTR_ATIME is set in @flag, the modification time when
+ * GF_SET_ATTR_MTIME is set.
+ */
+inline void
+bd_update_amtime(struct iatt *iatt, int flag)
+{
+        struct timespec now = {0, };
+
+        clock_gettime (CLOCK_REALTIME, &now);
+
+        if ((flag & GF_SET_ATTR_ATIME) != 0) {
+                iatt->ia_atime      = now.tv_sec;
+                iatt->ia_atime_nsec = now.tv_nsec;
+        }
+
+        if ((flag & GF_SET_ATTR_MTIME) != 0) {
+                iatt->ia_mtime      = now.tv_sec;
+                iatt->ia_mtime_nsec = now.tv_nsec;
+        }
+}
+
+/*
+ * Creates a snapshot LV, named after local->dloc's gfid, of the LV named
+ * after local->loc's gfid by running LVM_CREATE --snapshot.  An explicit
+ * -L size is only passed when the origin is not of type BD_THIN.
+ *
+ * Returns 0 when the snapshot device exists afterwards, ENOMEM when its
+ * path cannot be built and EIO when the device node is missing.
+ */
+int
+bd_snapshot_create (bd_local_t *local, bd_priv_t *priv)
+{
+        bd_gfid_t    snap_name   = {0, };
+        bd_gfid_t    origin_name = {0, };
+        runner_t     cmd         = {0, };
+        struct stat  stbuf       = {0, };
+        char        *snap_path   = NULL;
+        int          status      = 0;
+
+        uuid_utoa_r (local->dloc->gfid, snap_name);
+        uuid_utoa_r (local->loc.gfid, origin_name);
+
+        gf_asprintf (&snap_path, "/dev/%s/%s", priv->vg, snap_name);
+        if (snap_path == NULL) {
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "Insufficient memory");
+                return ENOMEM;
+        }
+
+        runinit (&cmd);
+        runner_add_args (&cmd, LVM_CREATE, NULL);
+        runner_add_args (&cmd, "--snapshot", NULL);
+        runner_argprintf (&cmd, "/dev/%s/%s", priv->vg, origin_name);
+        runner_add_args (&cmd, "--name", NULL);
+        runner_argprintf (&cmd, "%s", snap_name);
+        if (strcmp (local->bdatt->type, BD_THIN) != 0)
+                runner_argprintf (&cmd, "-L%ldB", local->size);
+        runner_start (&cmd);
+        runner_end (&cmd);
+
+        /* the command status is not checked; the device node is */
+        if (lstat (snap_path, &stbuf) < 0)
+                status = EIO;
+
+        GF_FREE (snap_path);
+        return status;
+}
+
+/*
+ * Creates a new LV named after local->dloc's gfid (same type and size as
+ * recorded in @local) and copies the content of the LV named after
+ * local->loc's gfid into it, using O_DIRECT reads and writes through
+ * IOV_NR page aligned buffers of IOV_SIZE bytes each.
+ *
+ * Returns 0 on success, ENOMEM on allocation failure, otherwise the
+ * bd_create() result or the errno of the failing open/read/write.
+ */
+int
+bd_clone (bd_local_t *local, bd_priv_t *priv)
+{
+        int           ret          = ENOMEM;
+        int           fd1          = -1;
+        int           fd2          = -1;
+        int           i            = 0;
+        char         *buff         = NULL;
+        ssize_t       bytes        = 0;
+        char         *spath        = NULL;
+        char         *dpath        = NULL;
+        struct iovec *vec          = NULL;
+        bd_gfid_t     source       = {0, };
+        bd_gfid_t     dest         = {0, };
+        void         *bufp[IOV_NR] = {0, };
+
+        vec = GF_CALLOC (IOV_NR, sizeof (struct iovec), gf_common_mt_iovec);
+        if (!vec)
+                return ENOMEM;
+
+        for (i = 0; i < IOV_NR; i++) {
+                /* test the returned allocation: buff would still hold the
+                 * previous iteration's address if this call failed */
+                bufp[i] = page_aligned_alloc (IOV_SIZE, &buff);
+                if (!bufp[i])
+                        goto out;
+                vec[i].iov_base = buff;
+                vec[i].iov_len = IOV_SIZE;
+        }
+
+        uuid_utoa_r (local->loc.gfid, source);
+        uuid_utoa_r (local->dloc->gfid, dest);
+
+        gf_asprintf (&spath, "/dev/%s/%s", priv->vg, source);
+        gf_asprintf (&dpath, "/dev/%s/%s", priv->vg, dest);
+        if (!spath || !dpath)
+                goto out;
+
+        ret = bd_create (local->dloc->gfid, local->size,
+                         local->bdatt->type, priv);
+        if (ret)
+                goto out;
+
+        fd1 = open (spath, O_RDONLY | O_DIRECT);
+        if (fd1 < 0) {
+                ret = errno;
+                goto out;
+        }
+        fd2 = open (dpath, O_WRONLY | O_DIRECT);
+        if (fd2 < 0) {
+                ret = errno;
+                goto out;
+        }
+
+        /*
+         * NOTE(review): every round writes all IOV_NR vectors regardless
+         * of how many bytes the read returned, and short writes are not
+         * retried - confirm that LV sizes make this safe.
+         */
+        while (1) {
+                bytes = readv (fd1, vec, IOV_NR);
+                if (bytes < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_WARNING, "read failed: %s",
+                                strerror (ret));
+                        goto out;
+                }
+                if (!bytes)
+                        break;
+                bytes = writev (fd2, vec, IOV_NR);
+                if (bytes < 0) {
+                        ret = errno;
+                        gf_log (THIS->name, GF_LOG_WARNING,
+                                "write failed: %s", strerror (ret));
+                        goto out;
+                }
+        }
+        ret = 0;
+
+out:
+        for (i = 0; i < IOV_NR; i++)
+                GF_FREE (bufp[i]);
+        GF_FREE (vec);
+
+        if (fd1 != -1)
+                close (fd1);
+        if (fd2 != -1)
+                close (fd2);
+
+        GF_FREE (spath);
+        GF_FREE (dpath);
+
+        return ret;
+}
+
+/*
+ * Merges the snapshot LV named after @gfid into its origin LV by running
+ * LVM_CONVERT --merge.
+ *
+ * Returns 0 when the snapshot device is gone afterwards, EIO when it
+ * still exists and ENOMEM when its path cannot be built.
+ */
+int
+bd_merge (bd_priv_t *priv, uuid_t gfid)
+{
+        bd_gfid_t    dest   = {0, };
+        char        *path   = NULL;
+        struct stat  stbuf  = {0, };
+        runner_t     runner = {0, };
+        int          ret    = 0;
+
+        uuid_utoa_r (gfid, dest);
+        gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest);
+        if (!path) {
+                /* do not hand a NULL path to the command or to lstat() */
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "Insufficient memory");
+                return ENOMEM;
+        }
+
+        runinit (&runner);
+        runner_add_args (&runner, LVM_CONVERT, NULL);
+        runner_add_args (&runner, "--merge", NULL);
+        runner_argprintf (&runner, "%s", path);
+        runner_start (&runner);
+        runner_end (&runner);
+
+        /* the snapshot still being present means the merge failed */
+        if (!lstat (path, &stbuf))
+                ret = EIO;
+
+        GF_FREE (path);
+
+        return ret;
+}
+
+/*
+ * Looks up the "origin" property of the LV named after the gfid of
+ * fd->inode (or loc->inode when @fd is NULL) and stores a copy of it in
+ * @dict under BD_ORIGIN.
+ *
+ * Returns the dict_set_dynstr() result, ENOENT when the VG or LV cannot
+ * be found, or ENODATA when the LV has no usable origin property.
+ */
+int
+bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict)
+{
+        lvm_property_value_t  value   = {0, };
+        bd_gfid_t             lv_name = {0, };
+        vg_t                  vg      = NULL;
+        lv_t                  lv      = NULL;
+        inode_t              *inode   = NULL;
+        char                 *copy    = NULL;
+        int                   status  = -1;
+
+        vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+        if (vg == NULL) {
+                gf_log (THIS->name, GF_LOG_CRITICAL, "VG %s is not found",
+                        priv->vg);
+                return ENOENT;
+        }
+
+        inode = fd ? fd->inode : loc->inode;
+        uuid_utoa_r (inode->gfid, lv_name);
+
+        lv = lvm_lv_from_name (vg, lv_name);
+        if (lv == NULL) {
+                gf_log (THIS->name, GF_LOG_CRITICAL, "LV %s not found",
+                        lv_name);
+                status = ENOENT;
+                goto close_vg;
+        }
+
+        value = lvm_lv_get_property (lv, "origin");
+        if (value.is_valid && value.value.string) {
+                copy   = gf_strdup (value.value.string);
+                status = dict_set_dynstr (dict, BD_ORIGIN, copy);
+        } else {
+                status = ENODATA;
+        }
+
+close_vg:
+        lvm_vg_close (vg);
+        return status;
+}
+
+#ifndef BLKZEROOUT
+
+/*
+ * Writes @len zero bytes to @fd starting at @offset with writev(), using
+ * one zeroed buffer shared by all iovecs.  The buffer is page aligned
+ * when @o_direct is set.
+ *
+ * Returns 0 on success (including len == 0) and -1 on allocation, seek
+ * or write failure.
+ */
+int
+bd_do_manual_zerofill (int fd, off_t offset, off_t len, int o_direct)
+{
+        struct iovec *iov      = NULL;
+        char         *zeroes   = NULL;   /* what every iovec points at */
+        char         *raw      = NULL;   /* allocation to release */
+        off_t         nvec     = 0;
+        off_t         rounds   = 1;
+        off_t         leftover = 0;      /* vectors beyond full rounds */
+        off_t         tail     = 0;      /* bytes beyond the last vector */
+        int           chunk    = IOV_SIZE;
+        int           i        = 0;
+        int           status   = -1;
+
+        if (len == 0)
+                return 0;
+
+        if (len < IOV_SIZE)
+                chunk = len;
+
+        nvec = len / (chunk);
+        tail = len % chunk;
+
+        if (nvec > MAX_NO_VECT) {
+                leftover = nvec % MAX_NO_VECT;
+                rounds   = nvec / MAX_NO_VECT;
+                nvec     = MAX_NO_VECT;
+        }
+
+        iov = GF_CALLOC (nvec, sizeof(struct iovec),
+                         gf_common_mt_iovec);
+        if (!iov)
+                return -1;
+
+        if (o_direct) {
+                raw = page_aligned_alloc (chunk, &zeroes);
+                if (!raw) {
+                        gf_log ("bd_do_manual_zerofill", GF_LOG_DEBUG,
+                                "memory alloc failed, vect_size %d: %s",
+                                chunk, strerror (errno));
+                        GF_FREE (iov);
+                        return -1;
+                }
+        } else {
+                zeroes = GF_CALLOC (chunk, sizeof(char),
+                                    gf_common_mt_char);
+                if (!zeroes) {
+                        GF_FREE (iov);
+                        return -1;
+                }
+                /* without O_DIRECT the buffer is its own allocation */
+                raw = zeroes;
+        }
+
+        for (i = 0; i < nvec; i++) {
+                iov[i].iov_base = zeroes;
+                iov[i].iov_len  = chunk;
+        }
+
+        if (lseek (fd, offset, SEEK_SET) < 0) {
+                status = -1;
+                goto done;
+        }
+
+        /* full rounds of nvec vectors each */
+        for (i = 0; i < rounds; i++) {
+                status = writev (fd, iov, nvec);
+                if (status < 0)
+                        goto done;
+        }
+        /* vectors that did not fill a whole round */
+        if (leftover) {
+                status = writev (fd, iov, leftover);
+                if (status < 0)
+                        goto done;
+        }
+        /* bytes that did not fill a whole vector */
+        if (tail) {
+                iov[0].iov_len = tail;
+                status = writev (fd, iov , 1);
+                if (status < 0)
+                        goto done;
+        }
+        status = 0;
+done:
+        GF_FREE (raw);
+        GF_FREE (iov);
+        return status;
+}
+
+#else
+
+/*
+ * Issue Linux ZEROOUT ioctl to write '0' to a scsi device at given offset
+ * and number of bytes. Each SCSI device's maximum write same bytes are
+ * exported in a sysfs file. Sending an ioctl request larger than that
+ * results in slow performance, so the file is read to get the maximum and
+ * a single ZEROOUT request is broken down into multiple requests that do
+ * not exceed it.  The device mapper name is derived from the VG and LV
+ * name by resolving the /dev/<vg>/<lv> symlink:
+ * /sys/block/<block-device>/queue/write_same_max_bytes
+ *
+ * When the limit cannot be read a single ioctl covers the whole range.
+ * @priv is not used.
+ *
+ * Returns 0 on success or the errno of the failing ioctl.
+ */
+int
+bd_do_ioctl_zerofill (bd_priv_t *priv, bd_attr_t *bdatt, int fd, char *vg,
+                      off_t offset, off_t len)
+{
+        char      *dm           = NULL;
+        char       dmname[4096] = {0, };
+        char       lvname[4096] = {0, };
+        char       sysfs[4096]  = {0, };
+        bd_gfid_t  uuid         = {0, };
+        char      *p            = NULL;
+        off_t      max_bytes    = 0;
+        int        sysfd        = -1;
+        uint64_t   param[2]     = {0, 0};
+        off_t      nr_loop      = 0;
+        char       buff[16]     = {0, };
+        ssize_t    nread        = 0;
+
+        uuid_utoa_r (bdatt->iatt.ia_gfid, uuid);
+        /* bounded: an over-long name is truncated instead of overflowing */
+        snprintf (lvname, sizeof (lvname), "/dev/%s/%s", vg, uuid);
+
+        /* readlink() does not NUL-terminate, so keep the last byte of the
+         * zeroed buffer untouched */
+        if (readlink (lvname, dmname, sizeof (dmname) - 1) < 0)
+                dmname[0] = '\0';
+
+        p = strrchr (dmname, '/');
+        if (p)
+                dm = p + 1;
+        else
+                dm = dmname;
+
+        snprintf (sysfs, sizeof (sysfs),
+                  "/sys/block/%s/queue/write_same_max_bytes", dm);
+        sysfd = open (sysfs, O_RDONLY);
+        if (sysfd < 0) {
+                gf_log ("bd_do_ioctl_zerofill", GF_LOG_DEBUG,
+                        "sysfs file %s does not exist", sysfs);
+                goto skip;
+        }
+
+        /* leave room for the terminator atoll() relies on */
+        nread = read (sysfd, buff, sizeof (buff) - 1);
+        close (sysfd);
+
+        if (nread > 0)
+                max_bytes = atoll (buff);
+        /* a nonsensical limit is treated like an unknown one */
+        if (max_bytes < 0)
+                max_bytes = 0;
+
+skip:
+        /*
+         * If requested len is less than write_same_max_bytes,
+         * issue single ioctl to zeroout. Otherwise split the ioctls
+         */
+        if (!max_bytes || len <= max_bytes) {
+                param[0] = offset;
+                param[1] = len;
+
+                if (ioctl (fd, BLKZEROOUT, param) < 0)
+                        return errno;
+                return 0;
+        }
+
+        /* Split ioctls to max write_same_max_bytes */
+        nr_loop = len / max_bytes;
+        for (; nr_loop; nr_loop--) {
+                param[0] = offset;
+                param[1] = max_bytes;
+
+                if (ioctl (fd, BLKZEROOUT, param) < 0)
+                        return errno;
+
+                offset += max_bytes;
+        }
+
+        if (!(len % max_bytes))
+                return 0;
+
+        param[0] = offset;
+        param[1] = len % max_bytes;
+
+        if (ioctl (fd, BLKZEROOUT, param) < 0)
+                return errno;
+
+        return 0;
+}
+#endif
+
+/*
+ * Zero-fills @len bytes at @offset of the LV behind @fd, through the
+ * BLKZEROOUT ioctl when it is available at build time and through manual
+ * writes otherwise.  When the fd was opened with O_SYNC or O_DSYNC the
+ * device is fsync()ed afterwards.
+ *
+ * On success *prebuf receives the cached iatt as it was before the
+ * operation and *postbuf the iatt after its mtime has been refreshed.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int
+bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+               off_t offset, off_t len, struct iatt *prebuf,
+               struct iatt *postbuf)
+{
+        int         ret   = -1;
+        bd_fd_t    *bd_fd = NULL;
+        bd_priv_t  *priv  = this->private;
+        bd_attr_t  *bdatt = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (priv, out);
+
+        /* bd_fd_ctx_get() may fail with a positive errno, and returns 0
+         * without a bd_fd for files that are not BD mapped */
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret || !bd_fd) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "bd_fd is NULL from fd=%p", fd);
+                if (!ret)
+                        ret = -1;
+                goto out;
+        }
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) || !bdatt) {
+                ret = EINVAL;
+                goto out;
+        }
+#ifndef BLKZEROOUT
+        ret = bd_do_manual_zerofill(bd_fd->fd, offset, len,
+                                    bd_fd->flag & O_DIRECT);
+#else
+        ret = bd_do_ioctl_zerofill(priv, bdatt, bd_fd->fd, priv->vg, offset,
+                                   len);
+#endif
+        if (ret) {
+                gf_log(this->name, GF_LOG_ERROR,
+                       "zerofill failed on fd %d length %ld %s",
+                       bd_fd->fd, len, strerror (ret));
+                goto out;
+        }
+
+        if (bd_fd->flag & (O_SYNC|O_DSYNC)) {
+                if (fsync (bd_fd->fd)) {
+                        /* read errno before logging can overwrite it */
+                        ret = errno;
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "fsync() in writev on fd %d failed: %s",
+                                bd_fd->fd, strerror (ret));
+                        return ret;
+                }
+        }
+
+        /* copy into the caller's structures, not over the local pointer
+         * variables themselves */
+        if (prebuf)
+                memcpy (prebuf, &bdatt->iatt, sizeof (*prebuf));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+        if (postbuf)
+                memcpy (postbuf, &bdatt->iatt, sizeof (*postbuf));
+
+out:
+
+        return ret;
+}
diff --git a/xlators/storage/bd/src/bd-mem-types.h b/xlators/storage/bd/src/bd-mem-types.h
new file mode 100644
index 000000000..58b448342
--- /dev/null
+++ b/xlators/storage/bd/src/bd-mem-types.h
@@ -0,0 +1,27 @@
+/*
+ Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __BD_MEM_TYPES_H__
+#define __BD_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+/*
+ * Memory accounting types the BD translator passes to GF_CALLOC() for its
+ * own allocations.  The values continue after the common types
+ * (gf_common_mt_end); gf_bd_mt_end marks the end of the list.
+ */
+enum gf_bd_mem_types_ {
+ gf_bd_private = gf_common_mt_end + 1,
+ gf_bd_attr,
+ gf_bd_fd,
+ gf_bd_loc_t,
+ gf_bd_int32_t,
+ gf_bd_aio_cb,
+ gf_bd_mt_end
+};
+
+#endif
diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c
new file mode 100644
index 000000000..750b00943
--- /dev/null
+++ b/xlators/storage/bd/src/bd.c
@@ -0,0 +1,2450 @@
+/*
+ BD translator V2 - Exports Block devices on server side as regular
+ files to client
+
+ Now only exporting Logical volumes supported.
+
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#include <openssl/md5.h>
+#include <time.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "bd.h"
+#include "bd-aio.h"
+#include "bd-mem-types.h"
+#include "defaults.h"
+#include "glusterfs3-xdr.h"
+#include "run.h"
+#include "protocol-common.h"
+#include "checksum.h"
+#include "syscall.h"
+#include "lvm-defaults.h"
+
+/*
+ * Callback for the setxattr and removexattr calls wound from
+ * bd_get_bd_info().  The result of the operation is ignored; the call
+ * stack the frame belongs to is destroyed.
+ * FIXME: How to handle remove/setxattr failure
+ */
+int
+bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+/*
+ * Decides from the BD_XATTR value in @xattr whether the file identified
+ * by @gfid is mapped to a BD (LV), and repairs the xattr when it
+ * disagrees with the LV:
+ *  - LV missing: a removexattr of BD_XATTR is wound to the first child,
+ *  - size mismatch: a setxattr with the actual LV size is wound instead.
+ * Both are fire-and-forget calls on a copy of @frame.  BD_XATTR is
+ * removed from @xattr before returning.
+ *
+ * Returns 0 when the file is BD mapped (*type and *size are then set by
+ * bd_validate_bd_xattr()), 1 when @xattr is NULL or has no BD_XATTR, and
+ * another non-zero value on failure.
+ */
+int
+bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid,
+                char **type, uint64_t *size)
+{
+        char         *bd_xattr = NULL;
+        char         *bd       = NULL;
+        int           ret      = -1;
+        loc_t         loc      = {0, };
+        dict_t       *dict     = NULL;
+        char         *p        = NULL;
+        call_frame_t *bd_frame = NULL;
+
+        if (!xattr)
+                return 1;
+
+        if (dict_get_str (xattr, BD_XATTR, &p))
+                return 1;
+
+        /* work on a copy: bd_validate_bd_xattr() modifies the string */
+        bd_xattr = gf_strdup (p);
+        BD_VALIDATE_MEM_ALLOC (bd_xattr, ret, out);
+
+        memcpy (loc.gfid, gfid, sizeof (uuid_t));
+
+        bd_frame = copy_frame (frame);
+        BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out);
+
+        ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid);
+        if (ret < 0) {/* LV does not exist */
+                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->removexattr, &loc,
+                            BD_XATTR, NULL);
+                /* the callback destroys the copied frame */
+                bd_frame = NULL;
+
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Mapped LV not available for posix file <gfid:%s>, "
+                        "deleting mapping", uuid_utoa (gfid));
+        } else if (ret == 1) {
+                /* BD_XATTR size and LV size mismatch. Update BD_XATTR */
+                gf_asprintf (&bd, "%s:%ld", *type, *size);
+
+                dict = dict_new ();
+                BD_VALIDATE_MEM_ALLOC (dict, ret, out);
+
+                /* NOTE(review): on failure bd is still freed below, as
+                 * before - confirm dict_set_dynstr() leaves it alone */
+                ret = dict_set_dynstr (dict, BD_XATTR, bd);
+                if (ret)
+                        goto out;
+                /* the dict owns the string now; do not free it here */
+                bd = NULL;
+
+                STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0,
+                            NULL);
+                bd_frame = NULL;
+        }
+
+out:
+        /* a copied frame that was never wound has no callback to free it */
+        if (bd_frame)
+                STACK_DESTROY (bd_frame->root);
+        if (dict)
+                dict_unref (dict);
+        dict_del (xattr, BD_XATTR);
+        GF_FREE (bd_xattr);
+        GF_FREE (bd);
+        return ret;
+}
+
+/*
+ * bd_lookup_cbk: Call back from posix_lookup.
+ *
+ * For regular files that turn out to be BD mapped, a bd_attr_t holding
+ * the child's iatt with the LV size is cached in the inode context and
+ * @buf is replaced with it before unwinding.  Files whose context is
+ * already cached get @buf replaced straight from the cache.
+ */
+int32_t
+bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr,
+               struct iatt *postparent)
+{
+        int         ret   = -1;
+        bd_attr_t  *bdatt = NULL;
+        uint64_t    size  = 0;
+        char       *type  = BD_TYPE_NONE;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto out;
+
+        /* iatt already cached */
+        if (!bd_inode_ctx_get (inode, this, &bdatt))
+                goto next;
+
+        if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid, &type, &size))
+                goto out;
+
+        /* BD file, update buf */
+        bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        if (!bdatt) {
+                /* type was allocated for us and has no owner yet */
+                GF_FREE (type);
+                op_errno = ENOMEM;
+                goto out;
+        }
+        memcpy (&bdatt->iatt, buf, sizeof (struct iatt));
+        bdatt->type = type;
+
+        /* Cache LV size in inode_ctx */
+        ret = bd_inode_ctx_set (inode, this, bdatt);
+        if (ret < 0) {
+                GF_FREE (bdatt->type);
+                GF_FREE (bdatt);
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        bdatt->iatt.ia_size = size;
+        bdatt->iatt.ia_blocks = size / 512;
+
+next:
+        dict_del (xattr, GF_CONTENT_KEY);
+        memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+        BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+                         xattr, postparent);
+        return 0;
+}
+
+/*
+ * bd_lookup: Issues posix_lookup to find out if file is mapped to BD
+ * bd_lookup -> posix_lookup -> bd_lookup_cbk
+ *
+ * When the inode has no cached BD context yet, BD_XATTR is added to the
+ * request dict (a dict is created when the caller passed none) before
+ * the lookup is wound to the first child.
+ */
+int32_t
+bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+        dict_t     *bd_xattr = NULL;
+        bd_attr_t  *bdatt    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (loc->path, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) {
+                if (!xattr_req) {
+                        bd_xattr = dict_new ();
+                        BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out);
+                        xattr_req = bd_xattr;
+                }
+                if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0)
+                        goto out;
+        }
+
+        STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lookup, loc, xattr_req);
+
+        if (bd_xattr)
+                dict_unref (bd_xattr);
+        return 0;
+out:
+        /* drop the dict created above when the request is never wound */
+        if (bd_xattr)
+                dict_unref (bd_xattr);
+        BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * Drops the BD context cached on @inode: the context entry is deleted and
+ * the bd_attr_t, together with its type string, is released.
+ * Always returns 0.
+ */
+int
+bd_forget (xlator_t *this, inode_t *inode)
+{
+        int         ret   = -1;
+        uint64_t    ctx   = 0;
+        bd_attr_t  *bdatt = NULL;
+
+        ret = bd_inode_ctx_get (inode, this, &bdatt);
+        if (!ret) {
+                inode_ctx_del (inode, this, &ctx);
+                if (bdatt) {
+                        /* the type string is owned by the bd_attr_t, as
+                         * in bd_local_free() */
+                        GF_FREE (bdatt->type);
+                        GF_FREE (bdatt);
+                }
+        }
+        return 0;
+}
+
+/*
+ * Callback of bd_readdirp: for every regular-file entry that is BD
+ * mapped, the size and block count in the entry's stat are replaced with
+ * those of the LV before the reply is unwound.
+ */
+int
+bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                 int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+        gf_dirent_t *entry = NULL;
+        uint64_t     size  = 0;
+        char        *type  = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        list_for_each_entry (entry, &entries->list, list) {
+                if (entry->d_type != DT_REG)
+                        continue;
+                type = NULL;
+                if (!bd_get_bd_info (frame, this, entry->dict,
+                                     entry->d_stat.ia_gfid, &type, &size)) {
+                        entry->d_stat.ia_size = size;
+                        entry->d_stat.ia_blocks = size / 512;
+                }
+                /* the type string may have been allocated even when the
+                 * lookup failed; it is not needed here either way */
+                GF_FREE (type);
+        }
+
+out:
+        BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata);
+        return 0;
+}
+
+/*
+ * bd_readdirp: adds the BD_XATTR key to the request dict (creating one in
+ * the frame's local when the caller passed none) and winds readdirp to
+ * the first child.  bd_readdirp_cbk then replaces ia_size of BD mapped
+ * entries with the LV size.
+ */
+int32_t
+bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+             off_t off, dict_t *dict)
+{
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (fd, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+
+        if (dict == NULL) {
+                local = bd_local_init (frame, this);
+                BD_VALIDATE_MEM_ALLOC (local, op_errno, unwind);
+
+                local->dict = dict_new ();
+                BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, unwind);
+
+                dict = local->dict;
+        }
+
+        if (dict_set_int8 (dict, BD_XATTR, 0) != 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set key %s", BD_XATTR);
+                goto unwind;
+        }
+
+        STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict);
+
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict);
+        return 0;
+}
+
+/*
+ * Callback of bd_stat: for regular files with a cached BD context the
+ * child's iatt in @buf is replaced with the cached one, which carries the
+ * LV size, before unwinding.
+ */
+int
+bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+             int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+        bd_attr_t  *bdatt = NULL;
+
+        /* only regular files are part of BD object */
+        if (op_ret < 0 || buf->ia_type != IA_IFREG)
+                goto out;
+
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+        /* update buf with LV size; copy from the iatt member explicitly,
+         * as bd_fstat_cbk does, instead of from the enclosing struct */
+        if (!bd_inode_ctx_get (local->inode, this, &bdatt))
+                memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+        BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
+        return 0;
+}
+
+/*
+ * bd_stat: answers directly from the iatt cached in the inode context
+ * when there is one; otherwise stat is wound to the first child and
+ * completed in bd_stat_cbk.
+ */
+int
+bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+        bd_attr_t  *cached   = NULL;
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (loc, unwind);
+        VALIDATE_OR_GOTO (loc->path, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+
+        if (bd_inode_ctx_get (loc->inode, this, &cached) == 0) {
+                BD_STACK_UNWIND (stat, frame, 0, 0, &cached->iatt, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, unwind);
+
+        /* the callback looks the context up again through this inode */
+        local->inode = inode_ref (loc->inode);
+
+        STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this),
+                   FIRST_CHILD(this)->fops->stat, loc, xdata);
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata);
+        return 0;
+}
+
+/*
+ * Callback of bd_statfs: adds the total and free size of the VG,
+ * expressed in f_frsize units, to the block counts reported by the
+ * child.  Fails with EAGAIN when the VG cannot be opened.
+ */
+int
+bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+               int op_errno, struct statvfs *buff, dict_t *xdata)
+{
+        bd_priv_t *priv     = NULL;
+        vg_t       handle   = NULL;
+        uint64_t   vg_total = 0;
+        uint64_t   vg_free  = 0;
+
+        if (op_ret < 0)
+                goto unwind;
+
+        priv = this->private;
+
+        handle = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+        if (handle == NULL) {
+                gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed",
+                        priv->vg);
+                op_ret   = -1;
+                op_errno = EAGAIN;
+                goto unwind;
+        }
+
+        vg_total = lvm_vg_get_size (handle);
+        vg_free  = lvm_vg_get_free_size (handle);
+        lvm_vg_close (handle);
+
+        buff->f_blocks += vg_total / buff->f_frsize;
+        buff->f_bfree  += vg_free / buff->f_frsize;
+        buff->f_bavail += vg_free / buff->f_frsize;
+
+unwind:
+        BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata);
+        return 0;
+}
+
+/*
+ * bd_statfs: winds statfs to the first child; bd_statfs_cbk then adds the
+ * VG's total and free space to the child's answer.
+ */
+int
+bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+        VALIDATE_OR_GOTO (frame, unwind);
+        VALIDATE_OR_GOTO (this, unwind);
+        VALIDATE_OR_GOTO (this->private, unwind);
+        VALIDATE_OR_GOTO (loc, unwind);
+
+        STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->statfs, loc, xdata);
+
+        return 0;
+
+unwind:
+        BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_fstat_cbk: callback for the child's fstat.
+ *
+ * For a successful reply about a regular file, the reply buffer is
+ * overwritten with the iatt stored in the inode's BD context, when one
+ * exists. Failures and other file types are passed through unchanged.
+ */
+int
+bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+              int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local  = frame->local;
+        bd_attr_t  *cached = NULL;
+
+        /* only regular files are part of BD object */
+        if (op_ret >= 0 && buf->ia_type == IA_IFREG) {
+                BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+                /* update buf with LV size */
+                if (bd_inode_ctx_get (local->inode, this, &cached) == 0)
+                        memcpy (buf, &cached->iatt, sizeof (struct iatt));
+        }
+
+out:
+        BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
+        return 0;
+}
+
+/*
+ * bd_fstat: answers fstat from the attributes stored in the inode's BD
+ * context when bd_inode_ctx_get() finds one; otherwise forwards fstat to
+ * the first child with bd_fstat_cbk as the callback.
+ *
+ * Unwinds with -1/EINVAL when an argument check fails.
+ */
+int
+bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+        bd_attr_t  *cached   = NULL;
+        bd_local_t *local    = NULL;
+        int         op_errno = EINVAL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        if (bd_inode_ctx_get (fd->inode, this, &cached) != 0) {
+                /* not cached: let the child answer, remember the inode */
+                local = bd_local_init (frame, this);
+                BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+                local->inode = inode_ref (fd->inode);
+
+                STACK_WIND (frame, bd_fstat_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fstat, fd, xdata);
+                return 0;
+        }
+
+        /* if its already cached return it */
+        BD_STACK_UNWIND (fstat, frame, 0, 0, &cached->iatt, xdata);
+        return 0;
+
+out:
+        BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata);
+        return 0;
+}
+
+/*
+ * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD
+ * file
+ *
+ * When the fd carries no BD fd context the request is forwarded to the
+ * child unchanged. Otherwise the data is read with pread() from the BD fd
+ * into a freshly obtained iobuf and the fop is unwound here.
+ *
+ * On success op_ret is the number of bytes read; op_errno is ENOENT when
+ * the read reached or passed the cached size (or that size is 0), else 0.
+ * On failure op_ret is -1 and op_errno is EINVAL (bad argument, zero size,
+ * missing inode context), ENOMEM (iobuf/iobref allocation) or the errno
+ * left by pread().
+ *
+ * The iatt handed to the unwind is a local copy, so the error paths that
+ * run before the inode context is fetched never touch a NULL bdatt.
+ */
+int
+bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+          off_t offset, uint32_t flags, dict_t *xdata)
+{
+        int            ret      = -1;
+        int            _fd      = -1;
+        int32_t        op_ret   = -1;
+        int32_t        op_errno = EINVAL;
+        ssize_t        nread    = -1;
+        bd_fd_t       *bd_fd    = NULL;
+        struct iovec   vec      = {0, };
+        struct iobuf  *iobuf    = NULL;
+        struct iobref *iobref   = NULL;
+        uint64_t       bd_size  = 0;
+        bd_attr_t     *bdatt    = NULL;
+        struct iatt    stbuf    = {0, };
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                /* no BD fd attached: plain posix file */
+                STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->readv,
+                            fd, size, offset, flags, xdata);
+                return 0;
+        }
+        if (!size) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+                goto out;
+        }
+        iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+        if (!iobuf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+        _fd = bd_fd->fd;
+        nread = pread (_fd, iobuf->ptr, size, offset);
+        if (nread == -1) {
+                op_errno = errno;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "read failed on fd=%p: %s", fd,
+                        strerror (op_errno));
+                goto out;
+        }
+
+        iobref = iobref_new ();
+        if (!iobref) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+        iobref_add (iobref, iobuf);
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        vec.iov_base = iobuf->ptr;
+        vec.iov_len = nread;
+
+        /* ENOENT on a successful read tells the caller EOF was reached */
+        op_errno = 0;
+        bd_size = bdatt->iatt.ia_size;
+        if (!bd_size || (offset + vec.iov_len) >= bd_size)
+                op_errno = ENOENT;
+
+        op_ret = vec.iov_len;
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME);
+        memcpy (&stbuf, &bdatt->iatt, sizeof (stbuf));
+
+out:
+        BD_STACK_UNWIND (readv, frame, op_ret, op_errno,
+                         &vec, 1, &stbuf, iobref, NULL);
+
+        if (iobref)
+                iobref_unref (iobref);
+        if (iobuf)
+                iobuf_unref (iobuf);
+
+        return 0;
+}
+
+#ifdef BLKDISCARD
+/*
+ * bd_discard: Sends BLKDISCARD ioctl to the block device
+ *
+ * Files without a BD inode context are forwarded to the child. For BD
+ * files the {offset, len} pair is passed to the BLKDISCARD ioctl on the BD
+ * fd; on success the cached mtime is refreshed and the fop is unwound with
+ * the attributes from before and after that refresh.
+ *
+ * Failure unwinds with -1 and EINVAL (bad argument / missing fd context),
+ * ENOSYS (ioctl reported ENOTTY) or the errno left by ioctl().
+ */
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            size_t len, dict_t *xdata)
+{
+        int          ret      = -1;
+        int          op_errno = EINVAL;
+        bd_fd_t     *bd_fd    = NULL;
+        uint64_t     param[2] = {0, };
+        bd_attr_t   *bdatt    = NULL;
+        struct iatt  prebuf   = {0, };
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        /* posix */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_discard_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->discard,
+                            fd, offset, len, xdata);
+                return 0;
+        }
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        param[0] = offset;
+        param[1] = len;
+        ret = ioctl (bd_fd->fd, BLKDISCARD, param);
+        if (ret < 0) {
+                op_errno = (errno == ENOTTY) ? ENOSYS : errno;
+                goto out;
+        }
+        memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+
+        /* success must not carry the EINVAL op_errno was initialised with */
+        BD_STACK_UNWIND (discard, frame, ret, 0, &prebuf,
+                         &bdatt->iatt, xdata);
+        return 0;
+
+out:
+        BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+#else
+
+/*
+ * bd_discard: variant compiled when BLKDISCARD is not defined; the fop is
+ * always failed with ENOSYS.
+ */
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL);
+ return 0;
+}
+#endif
+
+/*
+ * Call back from posix_open for opening the backing posix file
+ * If it failed, close BD fd
+ *
+ * A zero op_ret, or an inode without a BD context, is passed straight
+ * through. Otherwise the BD fd that bd_open stored in the fd context is
+ * removed from that context before it is closed and freed, so the context
+ * no longer holds a pointer to freed memory for bd_release to find.
+ */
+int
+bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+        bd_fd_t   *bd_fd   = NULL;
+        bd_attr_t *bdatt   = NULL;
+        uint64_t   tmp_bfd = 0;
+
+        if (!op_ret)
+                goto out;
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (!bdatt) /* posix file */
+                goto out;
+
+        /* posix open failed: detach the BD fd from the fd context */
+        if (fd_ctx_del (fd, this, &tmp_bfd) < 0 || !tmp_bfd) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bd_fd is NULL from fd=%p", fd);
+                goto out;
+        }
+        bd_fd = (bd_fd_t *)(long)tmp_bfd;
+
+        close (bd_fd->fd);
+        GF_FREE (bd_fd);
+
+out:
+        BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_open: Opens BD file if given posix file is mapped to BD. Also opens
+ * posix file.
+ * fd contains both posix and BD fd
+ *
+ * For a regular file with a BD inode context, the device
+ * /dev/<vg>/<gfid> is opened with the caller's flags plus O_LARGEFILE and
+ * the resulting descriptor is stored in the fd context. In every
+ * non-failing case the open is then forwarded to the child with
+ * bd_open_cbk as the callback.
+ *
+ * Failure unwinds with -1 and EINVAL (bad argument, fd context could not
+ * be set), the errno left by open(), or the errno chosen by
+ * BD_VALIDATE_MEM_ALLOC.
+ */
+int32_t
+bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+         fd_t *fd, dict_t *xdata)
+{
+        int32_t    ret     = EINVAL;
+        bd_fd_t   *bd_fd   = NULL;
+        bd_attr_t *bdatt   = NULL;
+        bd_gfid_t  gfid    = {0, };
+        char      *devpath = NULL;
+        bd_priv_t *priv    = NULL;
+        int        _fd     = -1;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (loc, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        /* read only after 'this' has been checked */
+        priv = this->private;
+
+        /* not bd file */
+        if (fd->inode->ia_type != IA_IFREG ||
+            bd_inode_ctx_get (fd->inode, this, &bdatt))
+                goto posix;
+
+        uuid_utoa_r (fd->inode->gfid, gfid);
+        gf_asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid);
+        BD_VALIDATE_MEM_ALLOC (devpath, ret, out);
+
+        _fd = open (devpath, flags | O_LARGEFILE, 0);
+        if (_fd < 0) {
+                ret = errno;
+                gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+                        strerror (ret));
+                goto out;
+        }
+        bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+        BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out);
+
+        bd_fd->fd = _fd;
+        bd_fd->flag = flags | O_LARGEFILE;
+
+        if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to set the fd context fd=%p", fd);
+                ret = EINVAL;
+                goto out;
+        }
+
+        /* the path was only needed to open the device */
+        GF_FREE (devpath);
+        devpath = NULL;
+
+posix:
+
+        /* open posix equivalent of this file, fd needed for fd related
+           operations like fsetxattr, ftruncate etc */
+        STACK_WIND (frame, bd_open_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->open, loc, flags, fd, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL);
+
+        /* every jump to 'out' is a failure: release what was acquired */
+        GF_FREE (devpath);
+        if (_fd >= 0)
+                close (_fd);
+        GF_FREE (bd_fd);
+
+        return 0;
+}
+
+/*
+ * call back from posix_setattr after updating iatt to posix file.
+ *
+ * Completes the fsync fop: the child's op_ret/op_errno are passed through
+ * and the attribute copy kept in local->bdatt is reported as both the
+ * pre- and the post-operation iatt. The pre/post/xdata arguments from the
+ * child are not used.
+ */
+int
+bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+ bd_attr_t *bdatt = local->bdatt;
+
+ BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &bdatt->iatt,
+ &bdatt->iatt, NULL);
+ return 0;
+}
+
+/*
+ * bd_do_fsync: flushes fd with sys_fdatasync() when datasync is non-zero,
+ * with sys_fsync() otherwise.
+ *
+ * Returns 0 on success, or the errno captured right after the failing
+ * call (which is also logged).
+ */
+int
+bd_do_fsync (int fd, int datasync)
+{
+        int saved_errno = 0;
+
+        if (!datasync) {
+                if (sys_fsync (fd) != 0) {
+                        saved_errno = errno;
+                        gf_log (THIS->name, GF_LOG_ERROR,
+                                "fsync on fd=%d failed: %s",
+                                fd, strerror (saved_errno));
+                }
+                return saved_errno;
+        }
+
+        if (sys_fdatasync (fd) != 0) {
+                saved_errno = errno;
+                gf_log (THIS->name, GF_LOG_ERROR,
+                        "fdatasync on fd=%d failed: %s",
+                        fd, strerror (errno));
+        }
+
+        return saved_errno;
+}
+
+/*
+ * bd_fsync: Syncs if BD fd, forwards the request to posix
+ * fsync -> posix_setattr -> posix_fsync
+ *
+ * Without both a BD inode context and a BD fd context the request is
+ * forwarded to the child. Otherwise the BD fd is synced via bd_do_fsync.
+ *  - datasync != 0: the fop is unwound here, with op_ret 0 on success.
+ *  - datasync == 0: a copy of the cached attributes with refreshed
+ *    atime/mtime is pushed to the child through setattr, and
+ *    bd_fsync_setattr_cbk completes the fop.
+ *
+ * Failure unwinds with -1 and EINVAL (bad argument), the errno returned by
+ * bd_do_fsync, or the errno chosen by BD_VALIDATE_MEM_ALLOC. The pre/post
+ * attributes passed on unwind are local copies, so they are valid even
+ * when the failure happened before the inode context was fetched.
+ */
+int32_t
+bd_fsync (call_frame_t *frame, xlator_t *this,
+          fd_t *fd, int32_t datasync, dict_t *xdata)
+{
+        int          ret      = -1;
+        int32_t      op_ret   = -1;
+        int32_t      op_errno = EINVAL;
+        bd_fd_t     *bd_fd    = NULL;
+        bd_attr_t   *bdatt    = NULL;
+        bd_local_t  *local    = NULL;
+        int          valid    = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+        struct iatt  prebuf   = {0, };
+        struct iatt  postbuf  = {0, };
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        /* result is judged through bdatt below, not the return code */
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd || !bdatt) {
+                STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fsync, fd, datasync,
+                            xdata);
+                return 0;
+        }
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+        memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+        op_errno = bd_do_fsync (bd_fd->fd, datasync);
+        if (op_errno)
+                goto out;
+
+        /* For BD, Update the a|mtime during full fsync only */
+        if (!datasync) {
+                local = bd_local_init (frame, this);
+                /* In case of mem failure, should posix flush called ? */
+                BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+                local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+                BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+                local->bdatt->type = gf_strdup (bdatt->type);
+                memcpy (&local->bdatt->iatt, &bdatt->iatt,
+                        sizeof (struct iatt));
+                bd_update_amtime (&local->bdatt->iatt, valid);
+                uuid_copy (local->loc.gfid, fd->inode->gfid);
+                STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setattr, &local->loc,
+                            &local->bdatt->iatt,
+                            valid, NULL);
+                return 0;
+        }
+
+        /* fdatasync succeeded and no timestamp update is required */
+        op_ret = 0;
+
+out:
+        BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf,
+                         &postbuf, NULL);
+        return 0;
+}
+
+/*
+ * bd_flush_setattr_cbk: callback for the setattr wound by bd_flush.
+ * Completes the flush fop with the child's op_ret, op_errno and xdata; the
+ * pre/post attributes are ignored.
+ */
+int
+bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+/*
+ * bd_flush: for an fd that has both a BD inode context and a BD fd
+ * context, pushes the cached attributes (atime and mtime flagged valid) to
+ * the child through setattr; bd_flush_setattr_cbk then completes the
+ * flush. In every other case, including failed argument checks and a
+ * failed local allocation, the flush is forwarded to the child unchanged.
+ */
+int
+bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+        bd_attr_t  *bdatt    = NULL;
+        bd_fd_t    *bd_fd    = NULL;
+        bd_local_t *local    = NULL;
+        loc_t       attr_loc = {0, };
+        int         op_errno = EINVAL;
+        int         valid    = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (bdatt == NULL)
+                goto out;
+
+        if (bd_fd_ctx_get (this, fd, &bd_fd) < 0 || bd_fd == NULL) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bdfd/bdatt is NULL from fd=%p", fd);
+                goto out;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->fd = fd_ref (fd);
+        uuid_copy (attr_loc.gfid, bdatt->iatt.ia_gfid);
+
+        /* Update the a|mtime during flush */
+        STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->setattr, &attr_loc,
+                    &bdatt->iatt, valid, NULL);
+
+        return 0;
+
+out:
+        STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->flush, fd, xdata);
+
+        return 0;
+}
+
+/*
+ * bd_release: called when an fd is released. For an inode that has a BD
+ * context, removes the BD fd from the fd context, closes its descriptor
+ * and frees it. Files without a BD context are left alone.
+ *
+ * Always returns 0.
+ */
+int32_t
+bd_release (xlator_t *this, fd_t *fd)
+{
+        int        ret     = -1;
+        bd_fd_t   *bd_fd   = NULL;
+        uint64_t   tmp_bfd = 0;
+        bd_attr_t *bdatt   = NULL;
+        bd_priv_t *priv    = NULL;
+
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        /* read only after 'this' has been checked */
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv, out);
+
+        ret = bd_inode_ctx_get (fd->inode, this, &bdatt);
+        if (ret || !bdatt) /* posix file */
+                goto out;
+
+        /* FIXME: Update amtime during release */
+
+        ret = fd_ctx_del (fd, this, &tmp_bfd);
+        bd_fd = (bd_fd_t *)(long)tmp_bfd;
+        if (ret < 0 || !bd_fd) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "bfd is NULL from fd=%p", fd);
+                goto out;
+        }
+
+        close (bd_fd->fd);
+        GF_FREE (bd_fd);
+out:
+        return 0;
+}
+
+/*
+ * Call back for removexattr after removing BD_XATTR in case of
+ * bd create failure
+ *
+ * The original [f]setxattr request is always failed with EIO here,
+ * whatever the removal returned. The fop type used for the unwind follows
+ * the request: fsetxattr when local->fd is set, setxattr otherwise.
+ */
+int
+bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (local->fd) {
+                BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, xdata);
+        } else {
+                BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata);
+        }
+        return 0;
+
+}
+
+/*
+ * Call back after setting BD_XATTR. Creates BD. If BD creation is a failure
+ * invokes posix_removexattr to remove created BD_XATTR
+ *
+ * Flow:
+ *  - child [f]setxattr failed: unwind with the child's result.
+ *  - bd_create() returned non-zero: wind [f]removexattr for BD_XATTR;
+ *    bd_setx_rm_xattr_cbk finishes the fop.
+ *  - bd_create() returned 0: store a copy of local->bdatt in the inode
+ *    context and unwind with the child's op_ret.
+ *
+ * The fop type used for the unwind follows the request: fsetxattr when
+ * local->fd is set, setxattr otherwise.
+ */
+int
+bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+        bd_attr_t  *bdatt = NULL;
+
+        if (op_ret < 0)
+                goto next;
+
+        /* Create LV */
+        op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size,
+                              local->bdatt->type, this->private);
+        if (!op_errno)
+                goto out;
+
+        /* LV creation failed, remove BD_XATTR */
+        if (local->fd)
+                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fremovexattr,
+                            local->fd, BD_XATTR, NULL);
+        else
+                STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->removexattr,
+                            &local->loc, BD_XATTR, NULL);
+
+        return 0;
+out:
+
+        bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        if (!bdatt) {
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto next;
+        }
+
+        memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt));
+        bdatt->type = gf_strdup (local->bdatt->type);
+
+        bd_inode_ctx_set (local->inode, THIS, bdatt);
+
+next:
+        if (local->fd) {
+                BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
+        } else {
+                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+        }
+        return 0;
+
+}
+
+/*
+ * Call back from posix_stat
+ *
+ * Parses the BD_XATTR value kept in local->data, formatted as
+ * "<type>[:<size>]":
+ *  - <type> must be BD_LV or BD_THIN (the latter only when the volume
+ *    advertises BD_CAPS_THIN);
+ *  - <size> is optional; without it bd_get_default_extent() is used.
+ * The normalised "<type>:<bytes>" string is then set as BD_XATTR on the
+ * file through [f]setxattr, and bd_setx_setx_cbk continues from there.
+ *
+ * Failure unwinds the original [f]setxattr with -1 and EOPNOTSUPP (not a
+ * regular file, thin LV unsupported), EINVAL (malformed value, dict
+ * failure), the errno chosen by BD_VALIDATE_MEM_ALLOC, or the child's
+ * errno when the stat itself failed.
+ */
+int
+bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int op_ret, int op_errno, struct iatt *iatt,
+                  dict_t *xdata)
+{
+        char       *param  = NULL;
+        char       *type   = NULL;
+        char       *s_size = NULL;
+        char       *p      = NULL;
+        char       *copy   = NULL;
+        bd_local_t *local  = frame->local;
+        bd_priv_t  *priv   = this->private;
+        char       *bd     = NULL;
+        uint64_t    size   = 0;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (!IA_ISREG (iatt->ia_type)) {
+                op_errno = EOPNOTSUPP;
+                goto out;
+        }
+
+        /* work on a private, NUL-terminated copy: strtok_r modifies it */
+        param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+        BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+
+        strncpy (param, local->data->data, local->data->len);
+
+        type = strtok_r (param, ":", &p);
+        if (!type) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given",
+                        type);
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (!strcmp (type, BD_THIN) && !(priv->caps & BD_CAPS_THIN)) {
+                gf_log (this->name, GF_LOG_WARNING, "THIN lv not supported by "
+                        "this volume");
+                op_errno = EOPNOTSUPP;
+                goto out;
+        }
+
+        s_size = strtok_r (NULL, ":", &p);
+
+        /* If size not specified get default size */
+        if (!s_size) {
+                size = bd_get_default_extent (priv);
+        } else if (gf_string2bytesize (s_size, &size) != 0) {
+                /* reject an unparsable size instead of using whatever
+                   value was left in 'size' */
+                gf_log (this->name, GF_LOG_WARNING, "Invalid size %s given",
+                        s_size);
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        gf_asprintf (&bd, "%s:%ld", type, size);
+        BD_VALIDATE_MEM_ALLOC (bd, op_errno, out);
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        local->bdatt->type = gf_strdup (type);
+        memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt));
+        local->bdatt->iatt.ia_size = size;
+
+        /* 'type' pointed into the copy; it is no longer needed */
+        GF_FREE (copy);
+        copy = NULL;
+
+        if (local->fd)
+                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        else
+                STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+
+        return 0;
+
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata);
+
+        GF_FREE (bd);
+        GF_FREE (copy);
+        return 0;
+}
+
+/*
+ * bd_offload_rm_xattr_cbk: callback for the removexattr wound by
+ * bd_offload_setx_cbk. The original request is always failed with EIO,
+ * whatever the removal returned; it is unwound as fsetxattr when
+ * local->fd is set and as setxattr otherwise.
+ */
+int
+bd_offload_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+
+ if (local->fd)
+ BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, NULL);
+ else
+ BD_STACK_UNWIND (setxattr, frame, -1, EIO, NULL);
+
+ return 0;
+}
+
+/*
+ * bd_offload_setx_cbk: callback for the setxattr that tags the
+ * destination file with BD_XATTR.
+ *
+ * On a successful setxattr the snapshot (local->offload ==
+ * BD_OF_SNAPSHOT) or clone is performed. A non-zero result from that step
+ * triggers removal of BD_XATTR from the destination, with
+ * bd_offload_rm_xattr_cbk finishing the fop. Otherwise the original
+ * [f]setxattr is unwound with the current op_ret/op_errno.
+ */
+int
+bd_offload_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t *local = frame->local;
+
+        if (op_ret >= 0) {
+                if (local->offload != BD_OF_SNAPSHOT)
+                        op_ret = bd_clone (frame->local, this->private);
+                else
+                        op_ret = bd_snapshot_create (frame->local,
+                                                     this->private);
+
+                if (op_ret != 0) {
+                        /* offload failed: undo the xattr set earlier */
+                        STACK_WIND (frame, bd_offload_rm_xattr_cbk,
+                                    FIRST_CHILD (this),
+                                    FIRST_CHILD (this)->fops->removexattr,
+                                    local->dloc, BD_XATTR, NULL);
+                        return 0;
+                }
+        }
+
+        if (local->fd) {
+                BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
+        } else {
+                BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/*
+ * bd_offload_getx_cbk: callback for the getxattr of BD_XATTR on the
+ * source file.
+ *
+ * The source value has the form "<type>:<size>". The type is reused for
+ * the destination; when local->size is 0 the source size is used as well.
+ * The resulting "<type>:<size>" string replaces the BD_XATTR and LINKTO
+ * entries in local->dict, which is then set on the destination
+ * (local->dloc) through setxattr; bd_offload_setx_cbk continues.
+ *
+ * Failure unwinds the original [f]setxattr with -1 and EINVAL (missing or
+ * malformed xattr, dict failure), ENOMEM, or the child's errno.
+ */
+int
+bd_offload_getx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+        char       *bd    = NULL;
+        bd_local_t *local = frame->local;
+        char       *type  = NULL;
+        char       *p     = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        if (dict_get_str (xattr, BD_XATTR, &p)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        /* private copy: it is split in place at the last ':' */
+        type = gf_strdup (p);
+        BD_VALIDATE_MEM_ALLOC (type, op_errno, out);
+
+        p = strrchr (type, ':');
+        if (!p) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING,
+                        "source file xattr %s corrupted?", type);
+                goto out;
+        }
+
+        *p='\0';
+
+        /* For clone size is taken from source LV */
+        if (!local->size) {
+                p++;
+                gf_string2bytesize (p, &local->size);
+        }
+        if (gf_asprintf (&bd, "%s:%ld", type, local->size) < 0 || !bd) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+        local->bdatt->type = gf_strdup (type);
+
+        /* the split copy has served its purpose */
+        GF_FREE (type);
+        type = NULL;
+
+        dict_del (local->dict, BD_XATTR);
+        dict_del (local->dict, LINKTO);
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd)) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        STACK_WIND (frame, bd_offload_setx_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->setxattr,
+                    local->dloc, local->dict, 0, NULL);
+
+        return 0;
+
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        GF_FREE (type);
+        GF_FREE (bd);
+
+        return 0;
+}
+
+/*
+ * bd_offload_dest_lookup_cbk: callback for the lookup of the destination
+ * gfid of a clone/snapshot request.
+ *
+ * The destination must be a regular file that carries neither a LINKTO
+ * xattr nor an existing BD_XATTR. When those checks pass, BD_XATTR of the
+ * source (local->loc) is fetched and bd_offload_getx_cbk continues.
+ *
+ * Failure unwinds the original [f]setxattr with -1 and EINVAL, EEXIST
+ * (destination already has BD_XATTR) or the errno chosen by
+ * BD_VALIDATE_MEM_ALLOC.
+ */
+int
+bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                            int op_ret, int op_errno,
+                            inode_t *inode, struct iatt *iatt,
+                            dict_t *xattr, struct iatt *postparent)
+{
+        bd_local_t *local      = frame->local;
+        char       *bd_value   = NULL;
+        char       *linkto_val = NULL;
+
+        if (op_ret < 0 && op_errno != ENODATA) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (!IA_ISREG (iatt->ia_type)) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "destination gfid is not a "
+                        "regular file");
+                goto out;
+        }
+
+        /* presence is judged by the out-pointer, not the return code */
+        dict_get_str (xattr, LINKTO, &linkto_val);
+        if (linkto_val != NULL) {
+                op_errno = EINVAL;
+                gf_log (this->name, GF_LOG_WARNING, "destination file not "
+                        "present in same brick");
+                goto out;
+        }
+
+        dict_get_str (xattr, BD_XATTR, &bd_value);
+        if (bd_value != NULL) {
+                op_errno = EEXIST;
+                goto out;
+        }
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->getxattr,
+                    &local->loc, BD_XATTR, NULL);
+
+        return 0;
+out:
+        if (local->fd) {
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        } else {
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/*
+ * bd_merge_unlink_cbk: callback for the unlink wound by bd_do_merge.
+ * Completes the original setxattr request with the unlink's
+ * op_ret/op_errno; the parent attributes and xdata are not forwarded.
+ */
+int
+bd_merge_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ /* FIXME: if delete failed, remove xattr */
+
+ BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+ return 0;
+}
+
+/*
+ * bd_do_merge: handles the BD_MERGE setxattr request.
+ *
+ * Calls bd_merge() for the inode's gfid and then unlinks the posix file
+ * through the child; bd_merge_unlink_cbk completes the setxattr fop.
+ *
+ * Returns 0 when the unlink was wound. On failure the setxattr fop is
+ * unwound with -1 and the error (from bd_merge(), or EINVAL when the
+ * parent inode cannot be found), and that error is returned.
+ */
+int
+bd_do_merge(call_frame_t *frame, xlator_t *this)
+{
+        bd_local_t *local    = frame->local;
+        inode_t    *parent   = NULL;
+        char       *p        = NULL;
+        int         op_errno = 0;
+
+        op_errno = bd_merge (this->private, local->inode->gfid);
+        if (op_errno)
+                goto out;
+
+        /*
+         * posix_unlink needs loc->pargfid to be valid, but setxattr FOP does
+         * not have loc->pargfid set. Get parent's gfid by getting parents inode
+         */
+        parent = inode_parent (local->inode, NULL, NULL);
+        if (!parent) {
+                /*
+                 * FIXME: Snapshot LV already deleted.
+                 * remove xattr, instead of returning failure
+                 */
+                op_errno = EINVAL;
+                goto out;
+        }
+        uuid_copy (local->loc.pargfid, parent->gfid);
+        /* only the gfid was needed. inode_parent() is expected to hand
+           back a referenced inode (TODO confirm), so release it here */
+        inode_unref (parent);
+
+        /* loc.name is the last path component, or NULL if unavailable */
+        if (local->loc.path) {
+                p = strrchr (local->loc.path, '/');
+                if (p)
+                        p++;
+        }
+        local->loc.name = p;
+
+        STACK_WIND (frame, bd_merge_unlink_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->unlink,
+                    &local->loc, 0, NULL);
+
+        return 0;
+out:
+        /* unwound as setxattr, matching bd_merge_unlink_cbk */
+        BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        return op_errno;
+}
+
+/*
+ * bd_offload: starts a clone or snapshot request.
+ *
+ * The request value kept in local->data has the form
+ * "<dest-gfid>[:<size>]". The destination gfid is parsed into
+ * local->dloc, the optional size into local->size (snapshots without a
+ * size use bd_get_default_extent()), and a lookup asking for BD_XATTR and
+ * LINKTO is wound on the destination; bd_offload_dest_lookup_cbk
+ * continues.
+ *
+ * frame->local must already be set up by the caller. 'loc' is not used;
+ * 'fd' only selects whether a failure is unwound as fsetxattr or
+ * setxattr. Failure unwinds with -1 and EINVAL (malformed value, dict
+ * failure) or the errno chosen by BD_VALIDATE_MEM_ALLOC. Always returns 0.
+ */
+int
+bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc,
+            fd_t *fd, bd_offload_t offload)
+{
+        char       *param    = NULL;
+        char       *p        = NULL;
+        char       *size     = NULL;
+        char       *gfid     = NULL;
+        int         op_errno = 0;
+        bd_local_t *local    = frame->local;
+
+        /* private NUL-terminated copy: strtok_r modifies it */
+        param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+        BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        local->dloc = GF_CALLOC (1, sizeof (loc_t), gf_bd_loc_t);
+        BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out);
+
+        strncpy (param, local->data->data, local->data->len);
+
+        gfid = strtok_r (param, ":", &p);
+        if (!gfid) {
+                /* empty value: there is no destination to work on */
+                op_errno = EINVAL;
+                goto out;
+        }
+        size = strtok_r (NULL, ":", &p);
+        if (size)
+                gf_string2bytesize (size, &local->size);
+        else if (offload != BD_OF_CLONE)
+                local->size = bd_get_default_extent (this->private);
+
+        if (dict_set_int8 (local->dict, BD_XATTR, 1) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+        if (dict_set_int8 (local->dict, LINKTO, 1) < 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (uuid_parse (gfid, local->dloc->gfid) < 0) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "invalid destination gfid %s", gfid);
+                op_errno = EINVAL;
+                goto out;
+        }
+        local->offload = offload;
+
+        /* gfid and size pointed into the copy; both are consumed now */
+        GF_FREE (param);
+        param = NULL;
+
+        STACK_WIND (frame, bd_offload_dest_lookup_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->lookup, local->dloc,
+                    local->dict);
+
+        return 0;
+
+out:
+        if (fd)
+                BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+        else
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+        GF_FREE (param);
+        return 0;
+}
+
+/*
+ * bd_setxattr: Used to create & map an LV to a posix file using
+ * BD_XATTR xattr
+ * bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ *
+ * Dispatch on the key found in 'dict':
+ *  - BD_XATTR: map the file to a new LV (the file must not be mapped yet,
+ *    else EEXIST);
+ *  - BD_CLONE / BD_SNAPSHOT: handed to bd_offload() (the source must be
+ *    mapped, else EINVAL);
+ *  - BD_MERGE: handed to bd_do_merge() (the source must be mapped);
+ *  - none of these: forwarded to the child unchanged.
+ *
+ * Errors detected here unwind the fop with -1 and the errno above, EINVAL
+ * for a failed argument check, or the errno chosen by
+ * BD_VALIDATE_MEM_ALLOC.
+ */
+int
+bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+             int flags, dict_t *xdata)
+{
+        int           op_errno = EINVAL;
+        data_t       *data     = NULL;
+        bd_local_t   *local    = NULL;
+        bd_attr_t    *bdatt    = NULL;
+        bd_offload_t  cl_type  = BD_OF_NONE;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        if ((data = dict_get (dict, BD_XATTR)))
+                cl_type = BD_OF_NONE;
+        else if ((data = dict_get (dict, BD_CLONE)))
+                cl_type = BD_OF_CLONE;
+        else if ((data = dict_get (dict, BD_SNAPSHOT)))
+                cl_type = BD_OF_SNAPSHOT;
+        else if ((data = dict_get (dict, BD_MERGE)))
+                cl_type = BD_OF_MERGE;
+
+        bd_inode_ctx_get (loc->inode, this, &bdatt);
+        if (!cl_type && !data) {
+                /* no BD key in the request: not ours */
+                STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr, loc, dict,
+                            flags, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->data = data;
+        loc_copy (&local->loc, loc);
+        local->inode = inode_ref (loc->inode);
+
+        if (cl_type) {
+                /* For cloning/snapshot, source file must be mapped to LV */
+                if (!bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s not mapped to BD", loc->path);
+                        op_errno = EINVAL;
+                        goto out;
+                }
+                if (cl_type == BD_OF_MERGE)
+                        bd_do_merge (frame, this);
+                else
+                        bd_offload (frame, this, loc, NULL, cl_type);
+        } else if (data) {
+                if (bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "%s already mapped to BD", loc->path);
+                        op_errno = EEXIST;
+                        goto out;
+                }
+                STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->stat, loc, xdata);
+        }
+
+        return 0;
+out:
+        /* Every jump here is an error. Unwind through BD_STACK_UNWIND, as
+           every other fop in this file does, so whatever it does with
+           frame->local applies here too. A NULL frame cannot be unwound. */
+        if (frame) {
+                BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata);
+        }
+
+        return 0;
+}
+
+/*
+ * bd_fsetxattr: Used to create/map an LV to a posix file using
+ * BD_XATTR xattr
+ * bd_fsetxattr -> posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ * -> bd_fsetxattr_cbk
+ *
+ * Dispatch on the key found in 'dict':
+ *  - BD_XATTR: map the file to a new LV (must not be mapped yet, else
+ *    EEXIST);
+ *  - BD_CLONE / BD_SNAPSHOT: handed to bd_offload() (the source must be
+ *    mapped, else EINVAL);
+ *  - BD_MERGE: rejected with EOPNOTSUPP;
+ *  - none of these: forwarded to the child unchanged.
+ *
+ * Errors detected here unwind the fsetxattr fop with -1 and the errno
+ * above, EINVAL for a failed argument check, or the errno chosen by
+ * BD_VALIDATE_MEM_ALLOC.
+ */
+int32_t
+bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+              int flags, dict_t *xdata)
+{
+        int           op_errno = EINVAL;
+        data_t       *data     = NULL;
+        bd_attr_t    *bdatt    = NULL;
+        bd_local_t   *local    = NULL;
+        bd_offload_t  cl_type  = BD_OF_NONE;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        if ((data = dict_get (dict, BD_XATTR)))
+                cl_type = BD_OF_NONE;
+        else if ((data = dict_get (dict, BD_CLONE)))
+                cl_type = BD_OF_CLONE;
+        else if ((data = dict_get (dict, BD_SNAPSHOT)))
+                cl_type = BD_OF_SNAPSHOT;
+        else if ((data = dict_get (dict, BD_MERGE))) {
+                /*
+                 * bd_merge is not supported for fsetxattr, because snapshot LV
+                 * is opened and it causes problem in snapshot merge
+                 */
+                op_errno = EOPNOTSUPP;
+                goto out;
+        }
+
+        bd_inode_ctx_get (fd->inode, this, &bdatt);
+
+        if (!cl_type && !data) {
+                /* non bd file object */
+                STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fsetxattr,
+                            fd, dict, flags, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->inode = inode_ref (fd->inode);
+        local->fd = fd_ref (fd);
+        local->data = data;
+
+        if (cl_type) {
+                /* For cloning/snapshot, source file must be mapped to LV */
+                if (!bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "fd %p not mapped to BD", fd);
+                        op_errno = EINVAL;
+                        goto out;
+
+                }
+                bd_offload (frame, this, NULL, fd, cl_type);
+        } else if (data) {
+                if (bdatt) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "fd %p already mapped to BD", fd);
+                        op_errno = EEXIST;
+                        goto out;
+                }
+                STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fstat, fd, xdata);
+        }
+
+        return 0;
+out:
+
+        /* this is the fsetxattr fop, so unwind it as such */
+        BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/*
+ * bd_removexattr: refuses to remove BD_XATTR (unwinds with -1/ENODATA);
+ * every other xattr name is forwarded to the child.
+ */
+int32_t
+bd_removexattr (call_frame_t *frame, xlator_t *this,
+                loc_t *loc, const char *name, dict_t *xdata)
+{
+        if (strcmp (name, BD_XATTR) == 0) {
+                BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->removexattr, loc, name, xdata);
+        return 0;
+}
+
+/*
+ * bd_fremovexattr: refuses to remove BD_XATTR (unwinds with -1/ENODATA);
+ * every other xattr name is forwarded to the child's fremovexattr, using
+ * the matching default fremovexattr callback.
+ */
+int32_t
+bd_fremovexattr (call_frame_t *frame, xlator_t *this,
+                 fd_t *fd, const char *name, dict_t *xdata)
+{
+        if (!strcmp (name, BD_XATTR))
+                goto out;
+
+        STACK_WIND (frame, default_fremovexattr_cbk, FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->fremovexattr, fd, name, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL);
+        return 0;
+}
+
+/*
+ * bd_trunc_setxattr_setx_cbk: callback for the [f]setxattr that
+ * bd_trunc_setxattr_cbk winds on its revert path. The original request
+ * is always failed with EIO, whatever this setxattr returned; it is
+ * unwound as ftruncate when local->fd is set and as truncate otherwise.
+ */
+int
+bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+
+ if (local->fd)
+ BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+ else
+ BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/*
+ * Call back for setxattr after setting BD_XATTR_SIZE.
+ *
+ * With the new size recorded in BD_XATTR, the LV is resized to
+ * local->bdatt->iatt.ia_size. On success the cached size is updated and
+ * the [f]truncate fop is unwound with the attributes from before and
+ * after the change.
+ *
+ * If the resize fails, BD_XATTR is set back to the cached "<type>:<size>"
+ * value through another [f]setxattr; bd_trunc_setxattr_setx_cbk then
+ * fails the fop with EIO. If the setxattr itself failed, the inode has no
+ * BD context (previous size unknown), or the revert value cannot be built,
+ * the fop is failed with EIO directly.
+ */
+int
+bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int op_ret, int op_errno, dict_t *xdata)
+{
+        bd_local_t  *local  = frame->local;
+        bd_attr_t   *bdatt  = NULL;
+        struct iatt  prebuf = {0, };
+        char        *bd     = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        bd_inode_ctx_get (local->inode, this, &bdatt);
+        if (!bdatt) {
+                /* without the cached attributes there is no previous
+                   size to restore, so a revert cannot be built */
+                goto out;
+        }
+
+        op_errno = bd_resize (this->private, local->inode->gfid,
+                              local->bdatt->iatt.ia_size);
+        if (op_errno)
+                goto revert_xattr;
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+        /* LV resized, update new size in the cache */
+        bdatt->iatt.ia_size = local->bdatt->iatt.ia_size;
+
+        if (local->fd)
+                BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+                                 NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+                                 NULL);
+
+        return 0;
+
+revert_xattr:
+        /* revert setxattr: put the previous size back into the dict. The
+           value currently stored there belongs to the dict and must not be
+           freed here; setting the key again replaces it. */
+        if (gf_asprintf (&bd, "%s:%ld", bdatt->type,
+                         bdatt->iatt.ia_size) < 0 || !bd)
+                goto out;
+
+        if (dict_set_dynstr (local->dict, BD_XATTR, bd)) {
+                /* NOTE(review): who owns 'bd' after a failed
+                   dict_set_dynstr is not visible here; it is deliberately
+                   not freed to rule out a double free */
+                goto out;
+        }
+
+        if (local->fd)
+                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+                            FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        else
+                STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+                            FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+
+        return 0;
+out:
+        if (local->fd)
+                BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+
+        return 0;
+}
+
+/*
+ * Callback for the [f]stat wound by bd_do_trunc.
+ *
+ * Builds a "<type>:<size>" value from the cached BD type and the size
+ * held in local->bdatt, stores it under BD_XATTR in a new dict and winds
+ * [f]setxattr with it; bd_trunc_setxattr_cbk continues from there.
+ *
+ * Failure unwinds the [f]truncate fop with -1 and the child's errno,
+ * EINVAL (no BD context, dict failure) or the errno chosen by
+ * BD_VALIDATE_MEM_ALLOC.
+ */
+int
+bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int op_ret, int op_errno, struct iatt *buf, dict_t *xdata)
+{
+        bd_local_t *local     = frame->local;
+        bd_attr_t  *cached    = NULL;
+        char       *xattr_val = NULL;
+
+        if (op_ret < 0)
+                goto out;
+
+        local->dict = dict_new ();
+        BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+        bd_inode_ctx_get (local->inode, this, &cached);
+        if (cached == NULL) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        gf_asprintf (&xattr_val, "%s:%ld", cached->type,
+                     local->bdatt->iatt.ia_size);
+        if (dict_set_dynstr (local->dict, BD_XATTR, xattr_val) != 0) {
+                op_errno = EINVAL;
+                goto out;
+        }
+
+        if (local->fd) {
+                STACK_WIND (frame, bd_trunc_setxattr_cbk,
+                            FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fsetxattr,
+                            local->fd, local->dict, 0, NULL);
+        } else {
+                STACK_WIND (frame, bd_trunc_setxattr_cbk,
+                            FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->setxattr,
+                            &local->loc, local->dict, 0, NULL);
+        }
+
+        return 0;
+out:
+        if (local->fd) {
+                BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL,
+                                 NULL);
+        } else {
+                BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL,
+                                 NULL);
+        }
+        GF_FREE (xattr_val);
+        return 0;
+}
+
+/*
+ * bd_do_trunc: common body of bd_truncate (loc set, fd NULL) and
+ * bd_ftruncate (fd set, loc NULL) for a file mapped to an LV.
+ *
+ *  - offset <= cached size: nothing is resized; the cached mtime is
+ *    refreshed and the fop is unwound with success.
+ *  - offset  > cached size: the target size (via bd_adjust_size()) is
+ *    remembered in local->bdatt and a stat of the posix file is wound,
+ *    fstat on the fd or stat on the loc, matching how the request
+ *    arrived; bd_trunc_stat_cbk continues.
+ *
+ * A failed allocation unwinds with -1 and the errno chosen by
+ * BD_VALIDATE_MEM_ALLOC.
+ */
+void
+bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc,
+             off_t offset, bd_attr_t *bdatt)
+{
+        bd_local_t  *local    = NULL;
+        struct iatt  prebuf   = {0, };
+        int          op_errno = 0;
+        int          op_ret   = -1;
+
+        /* If requested size is less than LV size, return success */
+        if (offset <= bdatt->iatt.ia_size) {
+                memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+                bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+                op_ret = 0;
+                goto out;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+        BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+        if (fd) {
+                local->inode = inode_ref (fd->inode);
+                local->fd = fd_ref (fd);
+        } else {
+                local->inode = inode_ref (loc->inode);
+                loc_copy (&local->loc, loc);
+        }
+
+        local->bdatt->iatt.ia_size =
+                bd_adjust_size (this->private, offset);
+
+        /* truncate arrives without an fd, so it must stat by loc */
+        if (fd)
+                STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->fstat, fd, NULL);
+        else
+                STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->stat, &local->loc,
+                            NULL);
+
+        return;
+
+out:
+        if (fd)
+                BD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno,
+                                 &prebuf, &bdatt->iatt, NULL);
+        else
+                BD_STACK_UNWIND (truncate, frame, op_ret, op_errno,
+                                 &prebuf, &bdatt->iatt, NULL);
+        return;
+}
+
+/*
+ * bd_ftruncate: Resizes a LV if fd belongs to BD.
+ *
+ * An inode without a BD context is a plain posix file and the request is
+ * forwarded to the child unchanged. Otherwise bd_do_trunc takes over and
+ * unwinds the frame itself. Always returns 0.
+ */
+int32_t
+bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              dict_t *xdata)
+{
+        /* "out" is reached only when an argument check fails */
+        int        op_errno = EINVAL;
+        bd_attr_t *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_ftruncate_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->ftruncate, fd,
+                            offset, xdata);
+                return 0;
+        }
+
+        bd_do_trunc (frame, this, fd, NULL, offset, bdatt);
+        return 0;
+out:
+        BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_truncate: Resizes a LV if file maps to LV.
+ *
+ * An inode without a BD context is a plain posix file and the request is
+ * forwarded to the child unchanged. Otherwise bd_do_trunc takes over and
+ * unwinds the frame itself. Always returns 0.
+ */
+int32_t
+bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+             dict_t *xdata)
+{
+        /* "out" is reached only when an argument check fails */
+        int        op_errno = EINVAL;
+        bd_attr_t *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_truncate_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->truncate, loc,
+                            offset, xdata);
+                return 0;
+        }
+
+        bd_do_trunc (frame, this, NULL, loc, offset, bdatt);
+        return 0;
+
+out:
+        BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * __bd_pwritev: write @count iovecs to @fd at @offset with one pwritev().
+ *
+ * Returns the number of bytes written, -EFAULT when @vector is NULL, or
+ * -errno when pwritev() fails. @bd_size is accepted for interface
+ * compatibility but is not consulted: the write is not clamped to the
+ * device size here.
+ */
+int32_t
+__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset,
+              uint64_t bd_size)
+{
+        int retval   = 0;
+        int op_errno = 0;
+
+        if (!vector)
+                return -EFAULT;
+
+        retval = pwritev (fd, vector, count, offset);
+        if (retval == -1) {
+                /* save errno before logging has a chance to change it */
+                op_errno = errno;
+                gf_log (THIS->name, GF_LOG_WARNING,
+                        "base %p, length %ld, offset %ld, message %s",
+                        (count > 0) ? vector[0].iov_base : NULL,
+                        (count > 0) ? (long) vector[0].iov_len : 0L,
+                        (long) offset, strerror (op_errno));
+                retval = -op_errno;
+        }
+
+        return retval;
+}
+
+/*
+ * bd_writev: Writes to LV if its BD file or forwards the request to posix_write
+ * bd_writev -> posix_writev -> bd_writev_cbk
+ *
+ * For a BD fd the data is written straight to the device fd and the cached
+ * iatt gets a fresh mtime; the pre-write copy of that iatt is returned as
+ * prebuf. Always returns 0; the result travels in the unwind.
+ */
+int
+bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+           int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+           dict_t *xdict)
+{
+        int32_t      op_ret   = -1;
+        int32_t      op_errno = EINVAL; /* argument checks jump to out */
+        int          _fd      = -1;
+        bd_fd_t     *bd_fd    = NULL;
+        int          ret      = -1;
+        uint64_t     size     = 0;
+        struct iatt  prebuf   = {0, };
+        struct iatt *postbuf  = NULL;
+        bd_attr_t   *bdatt    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (vector, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) { /* posix fd */
+                STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->writev, fd, vector, count,
+                            offset, flags, iobref, xdict);
+                return 0;
+        }
+
+        _fd = bd_fd->fd;
+
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+                op_ret = -1;
+                op_errno = EINVAL;
+                goto out;
+        }
+        /* only form the iatt address once bdatt is known to be valid */
+        postbuf = &bdatt->iatt;
+        size = bdatt->iatt.ia_size;
+
+        op_ret = __bd_pwritev (_fd, vector, count, offset, size);
+        if (op_ret < 0) {
+                /* __bd_pwritev reports failure as a negated errno */
+                op_errno = -op_ret;
+                op_ret = -1;
+                gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64
+                        ", %s", (uint64_t) offset, strerror (op_errno));
+                goto out;
+        }
+        op_errno = 0;
+
+        memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+        bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+out:
+
+        BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf,
+                         postbuf, NULL);
+        return 0;
+}
+
+/*
+ * bd_setattr_cbk: reply handler for the setattr wound by bd_setattr.
+ *
+ * @cookie is the heap copy of the caller's "valid" mask; it is freed here.
+ * On success every attribute selected in the mask is copied from @postbuf
+ * into the cached iatt, ctime is refreshed, and the cached iatt is handed
+ * back as the post-op attributes.
+ */
+int
+bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+                int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+                dict_t *xdata)
+{
+        bd_attr_t  *bdatt = NULL;
+        int        *valid = cookie;
+        bd_local_t *local = frame->local;
+
+        if (op_ret < 0 || !valid || !local)
+                goto out;
+
+        if (bd_inode_ctx_get (local->inode, this, &bdatt))
+                goto out;
+
+        /*
+         * "valid" is a bitmask and one request may carry several bits,
+         * so each bit is tested on its own rather than in an else-if chain.
+         */
+        if (*valid & GF_SET_ATTR_UID)
+                bdatt->iatt.ia_uid = postbuf->ia_uid;
+
+        if (*valid & GF_SET_ATTR_GID)
+                bdatt->iatt.ia_gid = postbuf->ia_gid;
+
+        if (*valid & GF_SET_ATTR_MODE) {
+                bdatt->iatt.ia_type = postbuf->ia_type;
+                bdatt->iatt.ia_prot = postbuf->ia_prot;
+        }
+
+        if (*valid & GF_SET_ATTR_ATIME) {
+                bdatt->iatt.ia_atime = postbuf->ia_atime;
+                bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec;
+        }
+
+        if (*valid & GF_SET_ATTR_MTIME) {
+                bdatt->iatt.ia_mtime = postbuf->ia_mtime;
+                bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec;
+        }
+
+        bdatt->iatt.ia_ctime = postbuf->ia_ctime;
+        bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec;
+
+        memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt));
+out:
+        GF_FREE (valid);
+        BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf,
+                         postbuf, xdata);
+        return 0;
+}
+
+/*
+ * bd_setattr: forwards setattr to the child. For an inode with a BD
+ * context the "valid" mask is copied to the heap and passed as the wind
+ * cookie so bd_setattr_cbk can refresh the cached iatt.
+ */
+int
+bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+            int32_t valid, dict_t *xdata)
+{
+        int        *mask     = NULL;
+        bd_attr_t  *bdatt    = NULL;
+        bd_local_t *local    = NULL;
+        int         op_errno = 0;
+
+        /* no BD context: nothing to cache, let the child answer directly */
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt) != 0) {
+                STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->setattr,
+                           loc, stbuf, valid, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        mask = GF_CALLOC (1, sizeof (valid), gf_bd_int32_t);
+        BD_VALIDATE_MEM_ALLOC (mask, op_errno, out);
+
+        *mask = valid;
+        local->inode = inode_ref (loc->inode);
+
+        /* the callback owns "mask" from here on and frees it */
+        STACK_WIND_COOKIE (frame, bd_setattr_cbk, mask, FIRST_CHILD(this),
+                           FIRST_CHILD(this)->fops->setattr,
+                           loc, stbuf, valid, xdata);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata);
+        return 0;
+}
+
+/*
+ * bd_link_cbk: reply handler for link. When the link succeeded and the
+ * inode has a BD context, ctime and nlink from the reply are folded into
+ * the cached iatt and the cached iatt replaces @buf in the answer.
+ */
+int
+bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+             struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+        bd_attr_t *cached = NULL;
+
+        if (op_ret >= 0 && !bd_inode_ctx_get (inode, this, &cached)) {
+                cached->iatt.ia_ctime      = buf->ia_ctime;
+                cached->iatt.ia_ctime_nsec = buf->ia_ctime_nsec;
+                cached->iatt.ia_nlink      = buf->ia_nlink;
+                memcpy (buf, &cached->iatt, sizeof (struct iatt));
+        }
+
+        BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf,
+                         preparent, postparent, NULL);
+        return 0;
+}
+
+/*
+ * bd_link: winds the link fop to the first child with its arguments
+ * unchanged; bd_link_cbk handles the reply.
+ */
+int
+bd_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
+}
+
+/*
+ * bd_handle_special_xattrs: answers getxattr/fgetxattr for the names the
+ * bd translator serves itself.
+ *
+ * VOL_TYPE yields 1, VOL_CAPS yields priv->caps, and any other name is
+ * passed to bd_get_origin. The reply is unwound as getxattr when @loc is
+ * set and as fgetxattr otherwise. Always returns 0.
+ */
+int
+bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc,
+                          fd_t *fd, const char *name, dict_t *xdata)
+{
+        dict_t    *xattr    = NULL;
+        int        op_ret   = -1;
+        int        op_errno = ENOMEM;
+        bd_priv_t *priv     = this->private;
+
+        xattr = dict_new ();
+        if (!xattr)
+                goto out;
+
+        if (!strcmp (name, VOL_TYPE))
+                op_ret = dict_set_int64 (xattr, (char *)name, 1);
+        else if (!strcmp (name, VOL_CAPS))
+                op_ret = dict_set_int64 (xattr, (char *)name, priv->caps);
+        else
+                op_ret = bd_get_origin (priv, loc, fd, xattr);
+
+        /* a successful reply must not carry the ENOMEM placeholder */
+        if (!op_ret)
+                op_errno = 0;
+
+out:
+        if (loc)
+                BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr,
+                                 xdata);
+        else
+                BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
+                                 xdata);
+
+        /* xattr is NULL when dict_new failed: nothing to release then */
+        if (xattr) {
+                dict_reset (xattr);
+                dict_unref (xattr);
+        }
+
+        return 0;
+}
+
+/*
+ * bd_fgetxattr: VOL_TYPE, VOL_CAPS and BD_ORIGIN are answered by
+ * bd_handle_special_xattrs; every other name (or no name) goes to the
+ * child.
+ */
+int
+bd_fgetxattr (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, const char *name, dict_t *xdata)
+{
+        if (!name || (strcmp (name, VOL_TYPE) && strcmp (name, VOL_CAPS)
+                      && strcmp (name, BD_ORIGIN))) {
+                STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->fgetxattr,
+                            fd, name, xdata);
+                return 0;
+        }
+
+        bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata);
+        return 0;
+}
+
+/*
+ * bd_getxattr: VOL_TYPE, VOL_CAPS and BD_ORIGIN are answered by
+ * bd_handle_special_xattrs; every other name (or no name) goes to the
+ * child.
+ */
+int
+bd_getxattr (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, const char *name, dict_t *xdata)
+{
+        if (!name || (strcmp (name, VOL_TYPE) && strcmp (name, VOL_CAPS)
+                      && strcmp (name, BD_ORIGIN))) {
+                STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->getxattr,
+                            loc, name, xdata);
+                return 0;
+        }
+
+        bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata);
+        return 0;
+}
+
+/*
+ * bd_unlink_lookup_cbk: second stage of bd_unlink.
+ *
+ * The lookup reply tells how many links the file has. The LV is deleted
+ * only when this is the last link; a missing LV (ENOENT) is tolerated.
+ * Afterwards the posix file is unlinked through the child. Note that the
+ * unlink is wound with xflag 0 and no xdata.
+ */
+int
+bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int op_ret, int op_errno, inode_t *inode,
+                      struct iatt *buf, dict_t *xattr,
+                      struct iatt *postparent)
+{
+        bd_gfid_t   gfid  = {0, };
+        bd_local_t *local = frame->local;
+
+        /* local->loc is needed on every path that winds the unlink */
+        BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+        /* buf and inode carry no usable data when the lookup failed */
+        if (op_ret < 0)
+                goto out;
+
+        if (buf->ia_nlink > 1)
+                goto posix;
+
+        uuid_utoa_r (inode->gfid, gfid);
+        if (bd_delete_lv (this->private, gfid, &op_errno) < 0) {
+                if (op_errno != ENOENT)
+                        goto out;
+        }
+
+posix:
+        /* remove posix */
+        STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->unlink,
+                    &local->loc, 0, NULL);
+
+        return 0;
+out:
+        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/*
+ * bd_unlink: an inode without a BD context is unlinked by the child
+ * directly. For a BD inode the loc is saved in the frame local and a
+ * lookup is wound first; bd_unlink_lookup_cbk then removes the LV and the
+ * posix file. Always returns 0.
+ */
+int
+bd_unlink (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, int xflag, dict_t *xdata)
+{
+        /* until an allocation fails, only argument checks reach "out" */
+        int         op_errno = EINVAL;
+        bd_attr_t  *bdatt    = NULL;
+        bd_local_t *local    = NULL;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (loc, out);
+
+        if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+                STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->unlink,
+                            loc, xflag, xdata);
+                return 0;
+        }
+
+        local = bd_local_init (frame, this);
+        BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+        loc_copy (&local->loc, loc);
+
+        STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->lookup, loc, NULL);
+        return 0;
+out:
+        BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+        return 0;
+}
+
+/* bd_priv: stub that dumps nothing and returns 0; used as dumpops.priv. */
+int32_t
+bd_priv (xlator_t *this)
+{
+ return 0;
+}
+
+/* bd_inode: stub that dumps nothing and returns 0; used as dumpops.inode. */
+int32_t
+bd_inode (xlator_t *this)
+{
+ return 0;
+}
+
+/*
+ * bd_rchecksum: computes the weak and strong rsync checksums of up to
+ * @len bytes at @offset of a BD fd. A fd without a BD fd context is
+ * forwarded to the child.
+ *
+ * The checksums cover only the bytes pread() actually returned, so a short
+ * read near the end of the device never mixes uninitialised buffer
+ * contents into the result. Always returns 0.
+ */
+int32_t
+bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+              int32_t len, dict_t *xdata)
+{
+        int            op_ret        = -1;
+        int            op_errno      = EINVAL;
+        int            ret           = 0;
+        int            _fd           = -1;
+        char          *alloc_buf     = NULL;
+        char          *buf           = NULL;
+        int32_t        weak_checksum = 0;
+        bd_fd_t       *bd_fd         = NULL;
+        unsigned char  strong_checksum[MD5_DIGEST_LENGTH] = {0};
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+
+        ret = bd_fd_ctx_get (this, fd, &bd_fd);
+        if (ret < 0 || !bd_fd) {
+                STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->rchecksum, fd, offset,
+                            len, xdata);
+                return 0;
+        }
+
+        /* a negative length would turn into a huge size_t below */
+        if (len < 0)
+                goto out;
+
+        alloc_buf = page_aligned_alloc (len, &buf);
+        if (!alloc_buf) {
+                op_errno = ENOMEM;
+                goto out;
+        }
+
+        _fd = bd_fd->fd;
+
+        LOCK (&fd->lock);
+        {
+                ret = pread (_fd, buf, len, offset);
+                if (ret < 0) {
+                        /* save errno before logging can change it */
+                        op_errno = errno;
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "pread of %d bytes returned %d (%s)",
+                                len, ret, strerror (op_errno));
+                }
+        }
+        UNLOCK (&fd->lock);
+
+        if (ret < 0)
+                goto out;
+
+        weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf,
+                                                (size_t) ret);
+        gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) ret,
+                                  (unsigned char *) strong_checksum);
+
+        op_ret = 0;
+        op_errno = 0;
+out:
+        BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno,
+                         weak_checksum, strong_checksum, NULL);
+
+        GF_FREE (alloc_buf);
+
+        return 0;
+}
+
+/*
+ * bd_zerofill: a fd whose inode has no BD context is forwarded to the
+ * child. Otherwise bd_do_zerofill does the work; a non-zero result from it
+ * is reported as the op_errno of a failed reply.
+ */
+static int
+bd_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+            off_t len, dict_t *xdata)
+{
+        bd_attr_t   *bdatt = NULL;
+        struct iatt  pre   = {0,};
+        struct iatt  post  = {0,};
+        int32_t      rc    = 0;
+
+        /* iatt already cached */
+        if (bd_inode_ctx_get (fd->inode, this, &bdatt) < 0) {
+                STACK_WIND (frame, default_zerofill_cbk, FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->zerofill,
+                            fd, offset, len, xdata);
+                return 0;
+        }
+
+        rc = bd_do_zerofill(frame, this, fd, offset, len, &pre, &post);
+        if (rc) {
+                STACK_UNWIND_STRICT(zerofill, frame, -1, rc, NULL, NULL, NULL);
+                return 0;
+        }
+
+        STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &pre, &post, NULL);
+        return 0;
+}
+
+/**
+ * notify - answers GF_EVENT_PARENT_UP by sending GF_EVENT_CHILD_UP through
+ * default_notify; every other event is ignored. Always returns 0.
+ */
+int32_t
+notify (xlator_t *this,
+        int32_t event,
+        void *data,
+        ...)
+{
+        /* Tell the parent that bd xlator is up */
+        if (event == GF_EVENT_PARENT_UP)
+                default_notify (this, GF_EVENT_CHILD_UP, data);
+
+        return 0;
+}
+
+/*
+ * mem_acct_init: sets up memory accounting for gf_bd_mt_end + 1 types.
+ * Returns -1 when @this is NULL, otherwise the result of
+ * xlator_mem_acct_init (0 on success).
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int ret = -1;
+
+        if (!this)
+                return ret;
+
+        ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1);
+
+        if (ret != 0)
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed");
+
+        return ret;
+}
+
+/*
+ * reconfigure: re-reads the "bd-aio" option and switches AIO on or off to
+ * match. Returns 0 on success and -1 when the option cannot be read
+ * (GF_OPTION_RECONF jumps to "out").
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+        bd_priv_t *priv = this->private;
+        int        ret  = -1;
+
+        GF_OPTION_RECONF ("bd-aio", priv->aio_configured, options,
+                          bool, out);
+
+        if (!priv->aio_configured)
+                bd_aio_off (this);
+        else
+                bd_aio_on (this);
+
+        ret = 0;
+out:
+        return ret;
+}
+
+/**
+ * bd xlator init - Validate configured VG
+ *
+ * Reads the "export" (volume group) and "device" options, creates the
+ * frame-local memory pool, opens an LVM handle, scans the VG and optionally
+ * enables AIO. Returns 0 on success and -1 on failure; on failure
+ * everything set up so far is released and this->private and
+ * this->local_pool are reset, so a later fini() cannot touch freed memory.
+ */
+int
+init (xlator_t *this)
+{
+        char      *vg_data  = NULL;
+        char      *device   = NULL;
+        bd_priv_t *_private = NULL;
+
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: storage/bd needs posix as subvolume");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "Volume is dangling. Please check the volume file.");
+        }
+
+        GF_OPTION_INIT ("export", vg_data, str, error);
+        GF_OPTION_INIT ("device", device, str, error);
+
+        /* "export" has no default value, so it may still be unset here */
+        if (!vg_data) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: option \"export\" (volume group) not set");
+                return -1;
+        }
+
+        /* Now we support only LV device */
+        if (!device || strcasecmp (device, BACKEND_VG)) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: unknown %s backend %s", BD_XLATOR,
+                        device ? device : "(null)");
+                return -1;
+        }
+
+        this->local_pool = mem_pool_new (bd_local_t, 64);
+        if (!this->local_pool) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "FATAL: Failed to create bd memory pool");
+                return -1;
+        }
+
+        _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private);
+        if (!_private)
+                goto error;
+
+        this->private = _private;
+        _private->vg = gf_strdup (vg_data);
+        if (!_private->vg)
+                goto error;
+
+        _private->handle = lvm_init (NULL);
+        if (!_private->handle) {
+                gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed");
+                goto error;
+        }
+        _private->caps = BD_CAPS_BD;
+        if (bd_scan_vg (this, _private))
+                goto error;
+
+        _private->aio_init_done = _gf_false;
+        _private->aio_capable = _gf_false;
+
+        GF_OPTION_INIT ("bd-aio", _private->aio_configured, bool, error);
+        if (_private->aio_configured) {
+                if (bd_aio_on (this)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "BD AIO init failed");
+                        goto error;
+                }
+        }
+
+        _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT |
+                BD_CAPS_OFFLOAD_ZERO;
+
+        return 0;
+error:
+        if (_private) {
+                GF_FREE (_private->vg);
+                if (_private->handle)
+                        lvm_quit (_private->handle);
+                GF_FREE (_private);
+        }
+        /* _private is gone: do not leave a dangling pointer behind */
+        this->private = NULL;
+
+        mem_pool_destroy (this->local_pool);
+        this->local_pool = NULL;
+
+        return -1;
+}
+
+/*
+ * fini: destroys the frame-local memory pool and, when private data
+ * exists, closes the LVM handle and frees the private structure.
+ */
+void
+fini (xlator_t *this)
+{
+        bd_priv_t *priv = this->private;
+
+        mem_pool_destroy (this->local_pool);
+        this->local_pool = NULL;
+
+        if (priv) {
+                lvm_quit (priv->handle);
+                GF_FREE (priv->vg);
+                this->private = NULL;
+                GF_FREE (priv);
+        }
+
+        return;
+}
+
+/* State-dump hooks; bd_priv and bd_inode are stubs that return 0. */
+struct xlator_dumpops dumpops = {
+ .priv = bd_priv,
+ .inode = bd_inode,
+};
+
+/* File operations implemented by the bd translator. */
+struct xlator_fops fops = {
+        /* lookup and attributes */
+        .lookup       = bd_lookup,
+        .readdirp     = bd_readdirp,
+        .stat         = bd_stat,
+        .fstat        = bd_fstat,
+        .statfs       = bd_statfs,
+        .setattr      = bd_setattr,
+
+        /* open files and data */
+        .open         = bd_open,
+        .readv        = bd_readv,
+        .writev       = bd_writev,
+        .flush        = bd_flush,
+        .fsync        = bd_fsync,
+        .rchecksum    = bd_rchecksum,
+        .truncate     = bd_truncate,
+        .ftruncate    = bd_ftruncate,
+        .discard      = bd_discard,
+        .zerofill     = bd_zerofill,
+
+        /* extended attributes */
+        .setxattr     = bd_setxattr,
+        .fsetxattr    = bd_fsetxattr,
+        .getxattr     = bd_getxattr,
+        .fgetxattr    = bd_fgetxattr,
+        .removexattr  = bd_removexattr,
+        .fremovexattr = bd_fremovexattr,
+
+        /* links */
+        .link         = bd_link,
+        .unlink       = bd_unlink,
+};
+
+/* Callbacks for fd release and inode forget. */
+struct xlator_cbks cbks = {
+ .release = bd_release,
+ .forget = bd_forget,
+};
+
+/*
+ * Volume options:
+ * export - string read by init() and duplicated into bd_priv_t.vg
+ * device - backend type; init() rejects anything other than BACKEND_VG
+ * bd-aio - boolean read by init() and re-read by reconfigure()
+ * The table ends with an entry whose key is NULL.
+ */
+struct volume_options options[] = {
+ { .key = {"export"},
+ .type = GF_OPTION_TYPE_STR},
+ { .key = {"device"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = BACKEND_VG},
+ {
+ .key = {"bd-aio"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Support for native Linux AIO"
+ },
+
+ { .key = {NULL} }
+};
diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h
new file mode 100644
index 000000000..62add16cd
--- /dev/null
+++ b/xlators/storage/bd/src/bd.h
@@ -0,0 +1,173 @@
+/*
+ BD translator - Exports Block devices on server side as regular
+ files to client
+
+ Copyright IBM, Corp. 2012
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _BD_H
+#define _BD_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "xlator.h"
+#include "mem-types.h"
+
+#define BD_XLATOR                "block device mapper xlator"
+#define BACKEND_VG               "vg"
+
+/* xattr carrying the "<type>:<size>" description of a BD file */
+#define GF_XATTR                 "user.glusterfs"
+#define BD_XATTR                 GF_XATTR ".bd"
+
+#define BD_LV                    "lv"
+#define BD_THIN                  "thin"
+
+/* virtual xattr names answered by the translator itself */
+#define VOL_TYPE                 "volume.type"
+#define VOL_CAPS                 "volume.caps"
+
+#define ALIGN_SIZE               4096
+
+/* bits of bd_priv_t.caps */
+#define BD_CAPS_BD               0x01
+#define BD_CAPS_THIN             0x02
+#define BD_CAPS_OFFLOAD_COPY     0x04
+#define BD_CAPS_OFFLOAD_SNAPSHOT 0x08
+#define BD_CAPS_OFFLOAD_ZERO     0x20
+
+#define BD_CLONE                 "clone"
+#define BD_SNAPSHOT              "snapshot"
+#define BD_MERGE                 "merge"
+#define BD_ORIGIN                "list-origin"
+
+#define IOV_NR                   4
+#define IOV_SIZE                 (64 * 1024)
+
+#define LINKTO                   "trusted.glusterfs.dht.linkto"
+
+#define MAX_NO_VECT              1024
+
+
+/*
+ * BD_VALIDATE_MEM_ALLOC: when @buff is NULL, store ENOMEM in @op_errno, log
+ * "out of memory" and jump to @label. Expects a variable named "this" in
+ * the calling scope. Wrapped in do/while (0) so it behaves as a single
+ * statement inside if/else.
+ */
+#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label)                    \
+        do {                                                            \
+                if (!(buff)) {                                          \
+                        (op_errno) = ENOMEM;                            \
+                        gf_log (this->name, GF_LOG_ERROR,               \
+                                "out of memory");                       \
+                        goto label;                                     \
+                }                                                       \
+        } while (0)
+
+/*
+ * BD_VALIDATE_LOCAL_OR_GOTO: when @local is NULL, store EINVAL in
+ * @op_errno and jump to @label. Wrapped in do/while (0) so it behaves as a
+ * single statement inside if/else.
+ */
+#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label)               \
+        do {                                                            \
+                if (!(local)) {                                         \
+                        (op_errno) = EINVAL;                            \
+                        goto label;                                     \
+                }                                                       \
+        } while (0)
+
+/*
+ * BD_STACK_UNWIND: detaches frame->local, unwinds the frame with
+ * STACK_UNWIND_STRICT and then releases the detached local with
+ * bd_local_free. frame->local and frame->this are read before the unwind,
+ * so the macro does not touch the frame afterwards.
+ */
+#define BD_STACK_UNWIND(typ, frame, args ...) do { \
+ bd_local_t *__local = frame->local; \
+ xlator_t *__this = frame->this; \
+ \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (typ, frame, args); \
+ if (__local) \
+ bd_local_free (__this, __local); \
+ } while (0)
+
+/* Buffer sized to hold a gfid rendered as text (GF_UUID_BUF_SIZE bytes). */
+typedef char bd_gfid_t[GF_UUID_BUF_SIZE];
+
+/**
+ * bd_fd - internal structure
+ *
+ * Per-fd context returned by bd_fd_ctx_get.
+ */
+typedef struct bd_fd {
+ int fd; /* descriptor used for pread/pwritev on the device */
+ int32_t flag; /* NOTE(review): presumably the open flags - confirm */
+ int odirect; /* NOTE(review): presumably non-zero for O_DIRECT - confirm */
+} bd_fd_t;
+
+/* Per-translator private data, stored in xlator_t.private. */
+typedef struct bd_priv {
+ lvm_t handle; /* LVM handle obtained from lvm_init */
+ char *vg; /* copy of the "export" option */
+ char *pool; /* NOTE(review): presumably the thin pool name - confirm */
+ int caps; /* bitmask of BD_CAPS_* */
+ gf_boolean_t aio_init_done;
+ gf_boolean_t aio_capable;
+ gf_boolean_t aio_configured; /* value of the "bd-aio" option */
+#ifdef HAVE_LIBAIO
+ io_context_t ctxp;
+ pthread_t aiothread;
+#endif
+} bd_priv_t;
+
+
+/* Kind of block device behind a file: none, or a logical volume. */
+typedef enum bd_type {
+ BD_TYPE_NONE,
+ BD_TYPE_LV,
+} bd_type_t;
+
+/* Per-inode context returned by bd_inode_ctx_get. */
+typedef struct {
+ struct iatt iatt; /* cached attributes returned to callers */
+ char *type; /* type string, the first part of the BD xattr value */
+} bd_attr_t;
+
+/* Offload operation selector; the values mirror BD_CLONE/BD_SNAPSHOT/BD_MERGE. */
+typedef enum {
+ BD_OF_NONE,
+ BD_OF_CLONE,
+ BD_OF_SNAPSHOT,
+ BD_OF_MERGE,
+} bd_offload_t;
+
+/*
+ * Per-request state kept in frame->local; created by bd_local_init and
+ * released by bd_local_free (called from BD_STACK_UNWIND).
+ */
+typedef struct {
+ dict_t *dict; /* xattr dictionary built for a follow-up setxattr */
+ bd_attr_t *bdatt; /* request-private attributes, e.g. the new size */
+ inode_t *inode; /* referenced inode of the request */
+ loc_t loc; /* copy of the request loc (path-based fops) */
+ fd_t *fd; /* referenced fd (fd-based fops), else NULL */
+ data_t *data; /* for setxattr */
+ bd_offload_t offload;
+ uint64_t size;
+ loc_t *dloc;
+} bd_local_t;
+
+/* Prototypes */
+
+/* inode / fd context and frame-local helpers */
+int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx);
+int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx);
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv);
+bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this);
+void bd_local_free (xlator_t *this, bd_local_t *local);
+int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd);
+char *page_aligned_alloc (size_t size, char **aligned_buf);
+int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+                          uint64_t *lv_size, uuid_t uuid);
+
+/* LV management */
+uint64_t bd_get_default_extent (bd_priv_t *priv);
+uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size);
+int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv);
+int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size);
+int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno);
+
+/* offload operations */
+int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv);
+int bd_clone (bd_local_t *local, bd_priv_t *priv);
+int bd_merge (bd_priv_t *priv, uuid_t gfid);
+int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict);
+int bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+                   off_t offset, off_t len, struct iatt *prebuf,
+                   struct iatt *postbuf);
+
+void bd_update_amtime(struct iatt *iatt, int flag);
+
+#endif
diff --git a/xlators/storage/bd_map/src/bd_map.c b/xlators/storage/bd_map/src/bd_map.c
deleted file mode 100644
index 9c8f69c64..000000000
--- a/xlators/storage/bd_map/src/bd_map.c
+++ /dev/null
@@ -1,2580 +0,0 @@
-/*
- BD translator - Exports Block devices on server side as regular
- files to client
-
- Now only exporting Logical volumes supported.
-
- Copyright IBM, Corp. 2012
-
- This file is part of GlusterFS.
-
- Author:
- M. Mohan Kumar <mohan@in.ibm.com>
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <time.h>
-#include <lvm2app.h>
-#include <openssl/md5.h>
-
-#include "bd_map.h"
-#include "bd_map_help.h"
-#include "defaults.h"
-#include "glusterfs3-xdr.h"
-#include "run.h"
-#include "protocol-common.h"
-
-/* Regular fops */
-
-int
-bd_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char path[PATH_MAX] = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
-
- sprintf (path, "/dev/mapper/%s", loc->path);
- op_ret = access (path, mask & 07);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR, "access failed on %s: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- op_ret = 0;
-out:
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, NULL);
-
- return 0;
-}
-
-#define LV_RENAME "/sbin/lvrename"
-
-int bd_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *new_path = NULL;
- char *np = NULL;
- struct iatt stbuf = {0, };
- struct iatt preoldparent = {0, };
- struct iatt postoldparent = {0, };
- struct iatt prenewparent = {0, };
- struct iatt postnewparent = {0, };
- bd_priv_t *priv = NULL;
- bd_entry_t *lventry = NULL;
- bd_entry_t *newp_entry = NULL;
- char *path = NULL;
- struct stat v_stat = {0, };
- runner_t runner = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (oldloc, out);
- VALIDATE_OR_GOTO (newloc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, lventry, oldloc->path);
- if (lventry->refcnt > 1) {
- op_errno = EBUSY;
- goto out;
- }
-
- memcpy (&preoldparent, lventry->parent->attr, sizeof(preoldparent));
-
- new_path = np = gf_strdup (newloc->path);
- if (!new_path)
- goto out;
- new_path = strrchr (np, '/');
- if (!new_path) {
- op_errno = EINVAL;
- goto out;
- }
-
- *new_path = '\0';
- BD_ENTRY (priv, newp_entry, np);
-
- memcpy (&prenewparent, newp_entry->parent->attr, sizeof(preoldparent));
-
- runinit (&runner);
-
- runner_add_args (&runner, LV_RENAME, NULL);
- runner_add_args (&runner, lventry->parent->name, NULL);
- runner_add_args (&runner, oldloc->name, NULL);
- runner_add_args (&runner, newloc->name, NULL);
-
- runner_start (&runner);
- runner_end (&runner);
-
- /* verify */
- gf_asprintf (&path, "/dev/%s", newloc->path);
- if (stat (path, &v_stat) < 0) {
- op_errno = EIO;
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (lventry);
- BD_ENTRY_UPDATE_MTIME (newp_entry);
- memcpy (&postoldparent, lventry->parent->attr, sizeof(postoldparent));
- memcpy (&postnewparent, newp_entry->parent->attr,
- sizeof(postoldparent));
- BD_WR_LOCK (&priv->lock);
- strncpy (lventry->name, newloc->name, sizeof(lventry->name));
- memcpy (&stbuf, lventry->attr, sizeof(stbuf));
- BD_UNLOCK (&priv->lock);
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (newp_entry)
- BD_PUT_ENTRY (priv, newp_entry);
- if (np)
- GF_FREE (np);
- if (path)
- GF_FREE (path);
-
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, &stbuf,
- &preoldparent, &postoldparent, &prenewparent,
- &postnewparent, NULL);
- return 0;
-}
-
-int32_t
-bd_delete_lv (bd_priv_t *priv, bd_entry_t *p_entry, bd_entry_t *lventry,
- const char *path, int *op_errno)
-{
- vg_t vg = NULL;
- lv_t lv = NULL;
- int op_ret = -1;
-
- *op_errno = 0;
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, p_entry->name, "w", 0);
- if (!vg) {
- *op_errno = ENOENT;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- lv = lvm_lv_from_name (vg, lventry->name);
- if (!lv) {
- lvm_vg_close (vg);
- *op_errno = ENOENT;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- op_ret = lvm_vg_remove_lv (lv);
- if (op_ret < 0) {
- *op_errno = errno;
- lvm_vg_close (vg);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- lvm_vg_close (vg);
-
- op_ret = bd_entry_rm (path);
- if (op_ret < 0) {
- *op_errno = EIO;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (p_entry);
-
- op_ret = 0;
- op_errno = 0;
-
- BD_UNLOCK (&priv->lock);
- op_ret = 0;
-out:
- return op_ret;
-}
-
-int32_t
-bd_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int xflag, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = ENOENT;
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_priv_t *priv = NULL;
- bd_entry_t *lventry = NULL;
- bd_entry_t *p_entry = NULL;
- char *vg_name = NULL;
- char *volume = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- volume = vg_name = gf_strdup (loc->path);
- if (!volume)
- goto out;
- volume = strrchr (volume, '/');
- if (!volume) {
- op_errno = EINVAL;
- goto out;
- }
- /* creating under non VG directory not permited */
- if (vg_name == volume) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- *volume = '\0';
-
- BD_ENTRY (priv, p_entry, vg_name);
- BD_ENTRY (priv, lventry, loc->path);
- if (!p_entry || !lventry)
- goto out;
-
- memcpy (&preparent, p_entry->attr, sizeof(preparent));
- op_ret = bd_delete_lv (priv, p_entry, lventry, loc->path, &op_errno);
- memcpy (&postparent, p_entry->attr, sizeof(postparent));
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (vg_name)
- GF_FREE (vg_name);
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
- &preparent, &postparent, NULL);
-
- return 0;
-}
-
-#define LVM_CREATE "/sbin/lvcreate"
-
-#define IOV_NR 4
-#define IOV_SIZE (4 * 1024)
-
-int bd_clone_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output,
- const char *vg_name, const char *lv_name,
- const char *dest_lv_name, struct iatt *stbuf)
-{
- int32_t ret = -1;
- vg_t vg = NULL;
- lv_t lv = NULL;
- ssize_t size = 0;
- uint64_t extent = 0;
- int fd1 = -1;
- int fd2 = -1;
- struct iatt iattr = {0, };
- bd_entry_t *lventry = NULL;
- char path[512] = {0, };
- struct iovec *vec = NULL;
- int i = 0;
- ssize_t bytes = 0;
- int nr_iov = 0;
-
- vec = GF_CALLOC (IOV_NR, sizeof(struct iovec), gf_common_mt_iovec);
- if (!vec)
- goto out;
-
- for (i = 0; i < IOV_NR; i++) {
- vec[i].iov_base = GF_MALLOC (IOV_SIZE, gf_common_mt_char);
- if (!vec[i].iov_base)
- goto out;
- vec[i].iov_len = IOV_SIZE;
- }
-
- vg = lvm_vg_open (priv->handle, vg_name, "w", 0);
- if (!vg) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "lvm_vg_open %s failed", vg_name);
- ret = -1;
- goto out;
- }
- lv = lvm_lv_from_name (vg, lv_name);
- if (!lv) {
- gf_log (THIS->name, GF_LOG_ERROR, "lvm_lv_from_name failed");
- ret = -1;
- goto out;
- }
-
- size = lvm_lv_get_size (lv);
- extent = size / lvm_vg_get_extent_size (vg);
-
- if (lvm_vg_create_lv_linear (vg, dest_lv_name, size) == NULL) {
- gf_log (THIS->name, GF_LOG_ERROR, "lv_create:%s",
- lvm_errmsg(priv->handle));
- ret = -1;
- goto out;
- }
- sprintf (path, "/dev/%s/%s", vg_name, lv_name);
- fd1 = open (path, O_RDONLY);
- if (fd1 < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path);
- goto out;
- }
- sprintf (path, "/dev/%s/%s", vg_name, dest_lv_name);
- fd2 = open (path, O_WRONLY);
- if (fd2 < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "opening %s failed", path);
- goto out;
- }
-
- bd_entry_istat (path, &iattr, IA_IFREG);
- iattr.ia_size = size;
-
- bytes = size;
- while (bytes) {
- size = readv(fd1, vec, IOV_NR);
- if (size < 0) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "read failed:%s", strerror(errno));
- goto out;
- }
- if (size < IOV_NR * IOV_SIZE) {
- vec[size / IOV_SIZE].iov_len = size % IOV_SIZE;
- nr_iov = (size / IOV_SIZE) + 1;
- } else
- nr_iov = IOV_NR;
- bytes -= size;
- size = writev (fd2, vec, nr_iov);
- if (size < 0) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "write failed:%s", strerror(errno));
- goto out;
- }
- }
-
- lventry = bd_entry_add (p_entry, dest_lv_name, &iattr, IA_IFREG);
- if (!lventry) {
- ret = EAGAIN;
- goto out;
- }
-
- if (stbuf)
- memcpy (stbuf, &iattr, sizeof(iattr));
-
- ret = 0;
- gf_log (THIS->name, GF_LOG_INFO, "Clone completed");
-out:
- if (vg)
- lvm_vg_close (vg);
- if (fd1 != -1)
- close (fd1);
- if (fd2 != -1)
- close (fd2);
- if (vec)
- iov_free (vec, IOV_NR);
- return ret;
-}
-
-int bd_snapshot_lv (bd_priv_t *priv, bd_entry_t *p_entry, dict_t *output,
- const char *lv_name, const char *dest_lv, char *size,
- struct iatt *stbuf)
-{
- int32_t ret = -1;
- struct iatt iattr = {0, };
- struct stat stat = {0, };
- bd_entry_t *lventry = NULL;
- char *error = NULL;
- int retval = -1;
- runner_t runner = {0, };
- char *path = NULL;
- vg_t vg = NULL;
- lv_t lv = NULL;
-
- runinit (&runner);
-
- runner_add_args (&runner, LVM_CREATE, NULL);
- runner_add_args (&runner, "--snapshot", NULL);
- runner_argprintf (&runner, "/dev/%s/%s", p_entry->name, lv_name);
- runner_add_args (&runner, "--name", NULL);
- runner_argprintf (&runner, "%s", dest_lv);
- runner_argprintf (&runner, "-L%s", size);
-
- runner_start (&runner);
- runner_end (&runner);
-
- gf_asprintf (&path, "/dev/%s/%s", p_entry->name, dest_lv);
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
- if (lstat (path, &stat) < 0) {
- ret = -EAGAIN;
- if (output)
- gf_asprintf (&error, "try again");
- goto out;
- }
-
- vg = lvm_vg_open (priv->handle, p_entry->name, "r", 0);
- if (!vg) {
- ret = -EIO;
- if (output)
- gf_asprintf (&error, "can't open vg %s", p_entry->name);
- goto out;
- }
- lv = lvm_lv_from_name (vg, lv_name);
- if (!lv) {
- ret = -EIO;
- if (output)
- gf_asprintf (&error, "can't open lv %s", lv_name);
- goto out;
- }
- bd_entry_istat (path, &iattr, IA_IFREG);
- iattr.ia_size = lvm_lv_get_size (lv);
- lventry = bd_entry_add (p_entry, dest_lv, &iattr, IA_IFREG);
- if (!lventry) {
- if (output)
- gf_asprintf (&error, "try again");
- ret = -EAGAIN;
- goto out;
- }
- if (stbuf)
- memcpy (stbuf, &iattr, sizeof(iattr));
- ret = 0;
-out:
- if (vg)
- lvm_vg_close (vg);
- if (error && output)
- retval = dict_set_str (output, "error", error);
- GF_FREE (path);
- return ret;
-}
-
-/*
- * Creates a snapshot of given LV
- */
-int
-bd_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_entry_t *lventry = NULL;
- char *name = NULL;
- char *np = NULL;
- char *volume = NULL;
- char *vg_name = NULL;
- char *path = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- if (strchr (loc->path, '/')) {
- vg_name = gf_strdup (loc->path);
- volume = strrchr (vg_name, '/');
- if (!volume) {
- op_errno = EINVAL;
- goto out;
- }
- /* creating under non VG directory not permited */
- if (vg_name == volume) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- GF_FREE (vg_name);
- vg_name = NULL;
- }
-
- /*
- * symlink creation for BD xlator is different
- * source (LV) has to exist for creation of symbolic link (snapshot)
- */
- if (strchr (linkname, '/')) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- gf_asprintf (&path, "%s/%s", priv->vg, linkname);
- if (!path) {
- op_errno = -ENOMEM;
- goto out;
- }
- BD_ENTRY (priv, lventry, path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
-
- name = np = gf_strdup (loc->path);
- if (!name)
- goto out;
-
- /* Get LV name from loc->path */
- name = strrchr (loc->path, '/');
- if (name != loc->path)
- name++;
-
- memcpy (&preparent, lventry->parent->attr, sizeof(preparent));
- if (bd_snapshot_lv (priv, lventry->parent, NULL, lventry->name,
- name, "1", &stbuf) < 0) {
- op_errno = EAGAIN;
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (lventry->parent);
- memcpy (&postparent, lventry->parent->attr, sizeof (postparent));
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (np)
- GF_FREE (np);
- if (vg_name)
- GF_FREE (vg_name);
- if (path)
- GF_FREE (path);
-
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-/*
- * bd_link: Does full clone of given logical volume
- * A new logical volume with source logical volume's size created
- * and entire content copied
- */
-int
-bd_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_entry_t *lventry = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (oldloc, out);
- VALIDATE_OR_GOTO (newloc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, lventry, oldloc->path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
- memcpy (&postparent, lventry->parent->attr, sizeof (postparent));
- if (bd_clone_lv (priv, lventry->parent, NULL, lventry->parent->name,
- lventry->name, newloc->name, &stbuf) < 0) {
- op_errno = EAGAIN;
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (lventry->parent);
- memcpy (&preparent, lventry->parent->attr, sizeof (preparent));
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
-
-
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno,
- (oldloc)?oldloc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-int32_t
-bd_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t _fd = -1;
- bd_fd_t *bd_fd = NULL;
- bd_entry_t *lventry = NULL;
- bd_priv_t *priv = NULL;
- char *devpath = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
-
- gf_asprintf (&devpath, "/dev/%s/%s", lventry->parent->name,
- lventry->name);
- _fd = open (devpath, flags, 0);
- if (_fd == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "open on %s: %s", devpath, strerror (op_errno));
- goto out;
- }
-
- bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd);
- if (!bd_fd) {
- op_errno = errno;
- goto out;
- }
- bd_fd->entry = lventry;
- bd_fd->fd = _fd;
-
- op_ret = fd_ctx_set (fd, this, (uint64_t)(long)bd_fd);
- if (op_ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set the fd context path=%s fd=%p",
- loc->name, fd);
- goto out;
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- if (_fd != -1)
- close (_fd);
- /* FIXME: Should we call fd_ctx_set with NULL? */
- if (bd_fd)
- GF_FREE (bd_fd);
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- }
- if (devpath)
- GF_FREE (devpath);
-
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL);
-
- return 0;
-}
-
-int
-bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
-{
- uint64_t tmp_bd_fd = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
- bd_priv_t *priv = NULL;
- struct iobuf *iobuf = NULL;
- struct iobref *iobref = NULL;
- struct iovec vec = {0, };
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- struct iatt stbuf = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- op_errno = -EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL from fd=%p", fd);
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
- if (!size) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
- goto out;
- }
- iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
- if (!iobuf) {
- op_errno = ENOMEM;
- goto out;
- }
- _fd = bd_fd->fd;
- op_ret = pread (_fd, iobuf->ptr, size, offset);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "read failed on fd=%p: %s", fd,
- strerror (op_errno));
- goto out;
- }
-
- vec.iov_base = iobuf->ptr;
- vec.iov_len = op_ret;
-
- iobref = iobref_new ();
- iobref_add (iobref, iobuf);
- BD_ENTRY_UPDATE_ATIME (bd_fd->entry);
-
- memcpy (&stbuf, bd_fd->entry->attr, sizeof(stbuf));
-
- /* Hack to notify higher layers of EOF. */
- if (bd_fd->entry->size == 0)
- op_errno = ENOENT;
- else if ((offset + vec.iov_len) >= bd_fd->entry->size)
- op_errno = ENOENT;
- op_ret = vec.iov_len;
-out:
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
- &vec, 1, &stbuf, iobref, NULL);
-
- if (iobref)
- iobref_unref (iobref);
- if (iobuf)
- iobuf_unref (iobuf);
- return 0;
-}
-
-#define LVM_RESIZE "/sbin/lvresize"
-
-int32_t
-bd_resize (bd_priv_t *priv, bd_entry_t *lventry, off_t *size)
-{
- bd_entry_t *vgentry = NULL;
- uint64_t extent = 0;
- int32_t op_ret = -1;
- vg_t vg = NULL;
- uint32_t nr_ex = 0;
- lv_t lv = NULL;
- uint64_t new_size = 0;
- runner_t runner = {0, };
-
- BD_ENTRY (priv, vgentry, lventry->parent->name);
- if (!vgentry) {
- op_ret = ENOENT;
- goto out;
- }
-
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0);
- if (!vg) {
- op_ret = lvm_errno (priv->handle);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- extent = lvm_vg_get_extent_size (vg);
- lvm_vg_close (vg);
- BD_UNLOCK (&priv->lock);
-
- nr_ex = *size / extent;
- if (*size % extent)
- nr_ex++;
- *size = extent * nr_ex;
-
- runinit (&runner);
-
- runner_add_args (&runner, LVM_RESIZE, NULL);
- runner_argprintf (&runner, "/dev/%s/%s", lventry->parent->name,
- lventry->name);
- runner_argprintf (&runner, "-l%ld", nr_ex);
- runner_add_args (&runner, "-f", NULL);
-
- runner_start (&runner);
- runner_end (&runner);
-
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, vgentry->name, "w", 0);
- if (!vg) {
- op_ret = lvm_errno (priv->handle);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- lv = lvm_lv_from_name (vg, lventry->name);
- if (!lv) {
- op_ret = lvm_errno (priv->handle);
- lvm_vg_close (vg);
- BD_UNLOCK (&priv->lock);
- goto out;
- }
- new_size = lvm_lv_get_size (lv);
- lvm_vg_close (vg);
- if (new_size != *size) {
- op_ret = EIO;
- BD_UNLOCK (&priv->lock);
- goto out;
- }
-
- BD_UNLOCK (&priv->lock);
- op_ret = 0;
-
-out:
- if (vgentry)
- BD_PUT_ENTRY (priv, vgentry);
-
- return op_ret;
-}
-
- int32_t
-bd_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- struct iatt preop = {0, };
- struct iatt postop = {0, };
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- uint64_t tmp_bd_fd = 0;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
-
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL, fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
-
- memcpy (&preop, bd_fd->entry->attr, sizeof(preop));
- if (offset > bd_fd->entry->size) {
- op_errno = bd_resize (priv, bd_fd->entry, &offset);
- if (op_errno)
- goto out;
- if (offset > bd_fd->entry->size) {
- bd_fd->entry->attr->ia_size = offset;
- bd_fd->entry->size = offset;
- }
- }
- /* If the requested size is less then current size
- * we will not update that in bd_fd->entry->attr
- * because it will result in showing size of this file less
- * instead we will return 0 for less size truncation
- */
- BD_ENTRY_UPDATE_MTIME (bd_fd->entry);
- memcpy (&postop, bd_fd->entry->attr, sizeof(postop));
-
- op_ret = 0;
-out:
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, &preop,
- &postop, NULL);
- return 0;
-}
-
-int32_t
-bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- struct iatt prebuf = {0, };
- struct iatt postbuf = {0, };
- bd_entry_t *lventry = NULL;
- bd_priv_t *priv = NULL;
- off_t size = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR,
- "pre-operation lstat on %s failed: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- memcpy (&prebuf, lventry->attr, sizeof(prebuf));
- if (offset > lventry->size) {
- op_errno = bd_resize (priv, lventry, &size);
- if (op_errno)
- goto out;
- if (lventry->size < offset) {
- lventry->attr->ia_size = offset;
- lventry->size = size;
- }
- }
- BD_ENTRY_UPDATE_MTIME (lventry);
- memcpy (&postbuf, lventry->attr, sizeof(postbuf));
- BD_PUT_ENTRY (priv, lventry);
- op_ret = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno,
- &prebuf, &postbuf, NULL);
- return 0;
-}
-
-int32_t
-__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset,
- uint64_t bd_size)
-{
- int32_t op_ret = 0;
- int index = 0;
- int retval = 0;
- off_t internal_offset = 0;
- int no_space = 0;
-
- if (!vector)
- return -EFAULT;
-
- internal_offset = offset;
- for (index = 0; index < count; index++) {
- if (internal_offset >= bd_size) {
- op_ret = -ENOSPC;
- goto err;
- }
- if (internal_offset + vector[index].iov_len >= bd_size) {
- vector[index].iov_len = bd_size - internal_offset;
- no_space = 1;
- }
-
- retval = pwrite (fd, vector[index].iov_base,
- vector[index].iov_len, internal_offset);
- if (retval == -1) {
- gf_log (THIS->name, GF_LOG_WARNING,
- "base %p, length %ld, offset %ld, message %s",
- vector[index].iov_base, vector[index].iov_len,
- internal_offset, strerror (errno));
- op_ret = -errno;
- goto err;
- }
- op_ret += retval;
- internal_offset += retval;
- if (no_space)
- break;
- }
-err:
- return op_ret;
-}
-
-int bd_create_lv (bd_priv_t *priv, bd_entry_t *p_entry, const char *vg_name,
- const char *lv_name, char *size, mode_t mode)
-{
- vg_t vg = NULL;
- int ret = -1;
- char *path = NULL;
- struct iatt iattr = {0, };
- bd_entry_t *lventry = NULL;
- uint64_t extent = 0;
-
- BD_WR_LOCK (&priv->lock);
- vg = lvm_vg_open (priv->handle, vg_name, "w", 0);
- if (!vg) {
- ret = -1;
- goto out;
- }
- extent = lvm_vg_get_extent_size (vg);
- if (size)
- gf_string2bytesize (size, &extent);
-
- if (lvm_vg_create_lv_linear (vg, lv_name, extent) == NULL) {
- ret = -EAGAIN;
- lvm_vg_close (vg);
- goto out;
- }
- lvm_vg_close (vg);
-
- gf_asprintf (&path, "/dev/%s/%s", vg_name, lv_name);
- if (!path) {
- ret = -ENOMEM;
- lvm_vg_close (vg);
- goto out;
- }
- bd_entry_istat (path, &iattr, IA_IFREG);
- iattr.ia_size = extent;
- if (!mode)
- mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
-
- iattr.ia_type = ia_type_from_st_mode (mode);
- iattr.ia_prot = ia_prot_from_st_mode (mode);
- lventry = bd_entry_add (p_entry, lv_name, &iattr, IA_IFREG);
- if (!lventry) {
- ret = -EAGAIN;
- goto out;
- }
- ret = 0;
-out:
- BD_UNLOCK (&priv->lock);
- if (path)
- GF_FREE (path);
- return ret;
-}
-
-int bd_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *params)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t _fd = -1;
- bd_priv_t *priv = NULL;
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- bd_fd_t *pfd = NULL;
- char *vg_name = NULL;
- char *volume = NULL;
- char *path = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- volume = vg_name = gf_strdup (loc->path);
- if (!volume)
- goto out;
- volume = strrchr (volume, '/');
- if (!volume) {
- op_errno = EINVAL;
- goto out;
- }
- /* creating under non VG directory not permited */
- if (vg_name == volume) {
- op_errno = EOPNOTSUPP;
- goto out;
- }
- *volume = '\0';
-
- BD_ENTRY (priv, p_entry, vg_name);
- if (!p_entry) {
- op_errno = ENOENT;
- goto out;
- }
-
- memcpy (&preparent, p_entry->attr, sizeof(preparent));
-
- op_errno = bd_create_lv (priv, p_entry, p_entry->name, loc->name, 0,
- mode);
- if (op_errno)
- goto out;
-
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- gf_log (this->name, GF_LOG_WARNING,
- "newly created LV not available %s", loc->path);
- op_errno = EAGAIN;
- goto out;
- }
-
- /* Mask O_CREATE since we created LV */
- flags &= ~(O_CREAT | O_EXCL);
-
- gf_asprintf (&path, "/dev/%s/%s", p_entry->name, loc->name);
- if (!path) {
- op_errno = ENOMEM;
- goto out;
- }
- _fd = open (path, flags, 0);
- if (_fd == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "open on %s: %s", path, strerror (op_errno));
- goto out;
- }
-
- memcpy (&stbuf, lventry->attr, sizeof(stbuf));
-
- pfd = GF_CALLOC (1, sizeof(*pfd), gf_bd_fd);
- if (!pfd) {
- op_errno = errno;
- goto out;
- }
- pfd->flag = flags;
- pfd->fd = _fd;
- pfd->entry = lventry;
-
- if (fd_ctx_set (fd, this, (uint64_t)(long)pfd)) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to set the fd context path=%s fd=%p",
- loc->name, fd);
- goto out;
- }
-
- op_ret = 0;
-
- memcpy (&postparent, p_entry->attr, sizeof(postparent));
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (path)
- GF_FREE (path);
- if (op_ret < 0 && lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (vg_name)
- GF_FREE (vg_name);
-
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-/*
- * We don't do actual setattr on devices on the host side, we just update
- * the entries in server process & they are not persistent
- */
-int bd_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- struct iatt statpre = {0, };
- struct iatt statpost = {0, };
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- bd_fd_t *pfd = NULL;
- int ret = 0;
- uint64_t tmp_pfd = 0;
- int _fd = -1;
-
- priv = this->private;
-
- ret = fd_ctx_get (fd, this, &tmp_pfd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "pfd is NULL, fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
- pfd = (bd_fd_t *)(long)tmp_pfd;
-
- _fd = pfd->fd;
- memcpy (&statpre, pfd->entry->attr, sizeof(statpre));
- op_ret = 0;
-
- if (valid & GF_SET_ATTR_MODE)
- pfd->entry->attr->ia_prot = stbuf->ia_prot;
- if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- if (valid & GF_SET_ATTR_UID)
- pfd->entry->attr->ia_uid = stbuf->ia_uid;
- if (valid & GF_SET_ATTR_GID)
- pfd->entry->attr->ia_gid = stbuf->ia_gid;
- }
- if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- pfd->entry->attr->ia_atime = stbuf->ia_atime;
- pfd->entry->attr->ia_atime_nsec = stbuf->ia_atime_nsec;
- pfd->entry->attr->ia_mtime = stbuf->ia_mtime;
- pfd->entry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec;
- }
- memcpy (&statpost, pfd->entry->attr, sizeof(statpost));
- op_errno = 0;
-out:
- STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL);
- return 0;
-}
-
-int bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- struct iatt statpre = {0, };
- struct iatt statpost = {0, };
- bd_entry_t *lventry = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- char path[PATH_MAX] = {0, };
-
- priv = this->private;
-
- /*
- * We don't allow to do setattr on / on host side
- * ie /dev
- */
- if (!strcmp (loc->path, "/")) {
- op_ret = 0;
- goto out;
- }
-
- BD_ENTRY (priv, lventry, loc->path);
- if (!lventry) {
- op_errno = ENOENT;
- goto out;
- }
- sprintf (path, "/dev/%s/%s", lventry->parent->name, lventry->name);
-
- memcpy (&statpre, lventry->attr, sizeof(statpre));
- if (valid & GF_SET_ATTR_MODE)
- lventry->attr->ia_prot = stbuf->ia_prot;
- if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- if (valid & GF_SET_ATTR_UID)
- lventry->attr->ia_uid = stbuf->ia_uid;
- if (valid & GF_SET_ATTR_GID)
- lventry->attr->ia_gid = stbuf->ia_gid;
- }
- if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- lventry->attr->ia_atime = stbuf->ia_atime;
- lventry->attr->ia_atime_nsec = stbuf->ia_atime_nsec;
- lventry->attr->ia_mtime = stbuf->ia_mtime;
- lventry->attr->ia_mtime_nsec = stbuf->ia_mtime_nsec;
- }
- memcpy (&statpost, lventry->attr, sizeof(statpost));
- op_errno = 0;
-out:
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- STACK_UNWIND_STRICT (setattr, frame, 0, 0, &statpre, &statpost, NULL);
- return 0;
-}
-
-int
-bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- uint32_t flags, struct iobref *iobref, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int _fd = -1;
- bd_priv_t *priv = NULL;
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- struct iatt preop = {0, };
- struct iatt postop = {0, };
- uint64_t tmp_bd_fd = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (vector, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- op_errno = -ret;
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL from fd=%p", fd);
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
- _fd = bd_fd->fd;
-
- memcpy (&preop, bd_fd->entry->attr, sizeof(preop));
- op_ret = __bd_pwritev (_fd, vector, count, offset, bd_fd->entry->size);
- if (op_ret < 0) {
- op_errno = -op_ret;
- op_ret = -1;
- gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64
- ", %s", offset, strerror (op_errno));
- goto out;
- }
- BD_ENTRY_UPDATE_MTIME (bd_fd->entry);
- memcpy (&postop, bd_fd->entry->attr, sizeof(postop));
-
-out:
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop,
- &postop, NULL);
-
- return 0;
-}
-
-int32_t
-bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
-{
- struct iatt buf = {0, };
- int32_t op_ret = -1;
- int32_t entry_ret = 0;
- int32_t op_errno = 0;
- char *pathdup = NULL;
- bd_entry_t *bdentry = NULL;
- struct iatt postparent = {0, };
- bd_priv_t *priv = NULL;
- char *p = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (loc->path, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, bdentry, loc->path);
- if (!bdentry) {
- op_errno = ENOENT;
- entry_ret = -1;
- goto parent;
- }
- memcpy (&buf, bdentry->attr, sizeof(buf));
- BD_PUT_ENTRY (priv, bdentry);
-
-parent:
- if (loc->parent) {
- pathdup = p = gf_strdup (loc->path);
- if (!pathdup) {
- op_errno = ENOMEM;
- entry_ret = -1;
- goto out;
- }
- p = strrchr (pathdup, '/');
- if (p == pathdup)
- *(p+1) = '\0';
- else
- *p = '\0';
- BD_ENTRY (priv, bdentry, pathdup);
- if (!bdentry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR,
- "post-operation lookup on parent of %s "
- "failed: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- memcpy (&postparent, bdentry->attr, sizeof(postparent));
- BD_PUT_ENTRY (priv, bdentry);
- }
-
- op_ret = entry_ret;
-out:
- if (pathdup)
- GF_FREE (pathdup);
-
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno,
- (loc)?loc->inode:NULL, &buf, NULL, &postparent);
-
- return 0;
-}
-
-int32_t
-bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- struct iatt buf = {0,};
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- bd_entry_t *bdentry = NULL;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, bdentry, loc->path);
- if (!bdentry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR, "stat on %s failed: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- memcpy (&buf, bdentry->attr, sizeof(buf));
- BD_PUT_ENTRY (priv, bdentry);
- op_ret = 0;
-
-out:
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL);
-
- return 0;
-}
-
-int32_t
-bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- int ret = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- uint64_t tmp_bd_fd = 0;
- struct iatt buf = {0, };
- bd_fd_t *bd_fd = NULL;
- int _fd = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL, fd=%p", fd);
- op_errno = -EINVAL;
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
- _fd = bd_fd->fd;
-
- memcpy (&buf, bd_fd->entry->attr, sizeof(buf));
- op_ret = 0;
-
-out:
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, &buf, NULL);
- return 0;
-}
-
-int32_t
-bd_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
- bd_fd_t *bd_fd = NULL;
- bd_entry_t *bdentry = NULL;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (loc->path, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- BD_ENTRY (priv, bdentry, loc->path);
- if (!bdentry) {
- op_errno = ENOENT;
- gf_log (this->name, GF_LOG_ERROR, "opendir failed on %s: %s",
- loc->path, strerror (op_errno));
- goto out;
- }
- bd_fd = GF_CALLOC (1, sizeof(*bd_fd), gf_bd_fd);
- if (!bd_fd) {
- op_errno = errno;
- BD_PUT_ENTRY (priv, bdentry);
- goto out;
- }
-
- bd_fd->p_entry = bdentry;
-
- bdentry = list_entry ((&bdentry->child)->next, typeof(*bdentry), child);
- if (!bdentry) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL");
- goto out;
- }
- bdentry = list_entry ((&bdentry->sibling), typeof(*bdentry), sibling);
- if (!bdentry) {
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_ERROR, "bd_entry NULL");
- goto out;
- }
-
- bd_fd->entry = bdentry;
-
- op_ret = fd_ctx_set (fd, this, (uint64_t) (long)bd_fd);
- if (op_ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set the fd context path=%s fd=%p",
- loc->path, fd);
- goto out;
- }
-
- op_ret = 0;
-out:
- if (op_ret == -1) {
- BD_PUT_ENTRY (priv, bd_fd->p_entry);
- if (bd_fd)
- GF_FREE (bd_fd);
- }
-
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, NULL);
- return 0;
-}
-
-int32_t
-bd_releasedir (xlator_t *this, fd_t *fd)
-{
- bd_fd_t *bd_fd = NULL;
- uint64_t tmp_bd_fd = 0;
- int ret = 0;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_del (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "bd_fd from fd=%p is NULL",
- fd);
- goto out;
- }
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- BD_PUT_ENTRY (priv, bd_fd->p_entry);
-
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- GF_FREE (bd_fd);
-out:
- return 0;
-}
-
-/*
- * bd_statfs: Mimics statfs by returning used/free extents in the VG
- * TODO: IF more than one VG allowed per volume, this functions needs some
- * change
- */
-int32_t
-bd_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xdata)
-{
- int32_t op_ret = -1;
- int32_t ret = -1;
- int32_t op_errno = 0;
- bd_priv_t *priv = NULL;
- struct statvfs buf = {0, };
- vg_t vg = NULL;
- char *vg_name = NULL;
- uint64_t size = 0;
- uint64_t fr_size = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = dict_get_str (this->options, "export", &vg_name);
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd does not specify volume groups");
- op_errno = EINVAL;
- goto out;
- }
-
- BD_RD_LOCK (&priv->lock);
-
- vg = lvm_vg_open (priv->handle, vg_name, "r", 0);
- size += lvm_vg_get_size (vg);
- fr_size += lvm_vg_get_free_size (vg);
- lvm_vg_close (vg);
-
- BD_UNLOCK (&priv->lock);
-
- if (statvfs ("/", &buf) < 0) {
- op_errno = errno;
- goto out;
- }
- op_ret = 0;
- buf.f_blocks = size / buf.f_frsize;
- buf.f_bfree = fr_size / buf.f_frsize;
- buf.f_bavail = fr_size / buf.f_frsize;
-out:
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, &buf, NULL);
- return 0;
-}
-
-int32_t
-bd_release (xlator_t *this, fd_t *fd)
-{
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- uint64_t tmp_bd_fd = 0;
- bd_priv_t *priv = NULL;
-
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL from fd=%p",
- fd);
- goto out;
- }
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- close (bd_fd->fd);
- BD_PUT_ENTRY (priv, bd_fd->entry);
-
- GF_FREE (bd_fd);
-out:
- return 0;
-}
-
-int32_t
-bd_fsync (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t datasync, dict_t *xdata)
-{
- int _fd = -1;
- int ret = -1;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- uint64_t tmp_bd_fd = 0;
- bd_fd_t *bd_fd = NULL;
- struct iatt preop = {0, };
- struct iatt postop = {0, };
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL, fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
- bd_fd = (bd_fd_t *)(long)tmp_bd_fd;
-
- _fd = bd_fd->fd;
- memcpy (&preop, &bd_fd->entry->attr, sizeof(preop));
- if (datasync) {
- ;
-#ifdef HAVE_FDATASYNC
- op_ret = fdatasync (_fd);
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "fdatasync on fd=%p failed: %s",
- fd, strerror (errno));
- }
-#endif
- } else {
- op_ret = fsync (_fd);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name, GF_LOG_ERROR,
- "fsync on fd=%p failed: %s",
- fd, strerror (op_errno));
- goto out;
- }
- }
-
- memcpy (&postop, bd_fd->entry->attr, sizeof(postop));
- op_ret = 0;
-
-out:
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, &preop,
- &postop, NULL);
-
- return 0;
-}
-
-int32_t
-bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdict)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int ret = -1;
- uint64_t tmp_bd_fd = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- op_errno = -EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "bd_fd is NULL on fd=%p", fd);
- goto out;
- }
- op_ret = 0;
-out:
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, NULL);
-
- return 0;
-}
-
-int
-__bd_fill_readdir (pthread_rwlock_t *bd_lock, bd_fd_t *bd_fd, off_t off,
- size_t size, gf_dirent_t *entries)
-{
- size_t filled = 0;
- int count = 0;
- struct dirent entry = {0, };
- int32_t this_size = -1;
- gf_dirent_t *this_entry = NULL;
- bd_entry_t *bdentry = NULL;
- bd_entry_t *cur_entry = NULL;
- bd_entry_t *n_entry = NULL;
-
- BD_RD_LOCK (bd_lock);
-
- bdentry = list_entry ((&bd_fd->p_entry->child)->next, typeof(*n_entry),
- child);
-
- if (off) {
- int i = 0;
- list_for_each_entry (n_entry, &bd_fd->entry->sibling, sibling) {
- if (i == off && strcmp (n_entry->name, "")) {
- bd_fd->entry = n_entry;
- break;
- }
- }
- } else
- bd_fd->entry = list_entry ((&bdentry->sibling),
- typeof(*n_entry), sibling);
-
- while (filled <= size) {
- cur_entry = bd_fd->entry;
-
- n_entry = list_entry ((&bd_fd->entry->sibling)->next,
- typeof (*cur_entry), sibling);
- if (&n_entry->sibling == (&bdentry->sibling))
- break;
-
- strcpy (entry.d_name, n_entry->name);
- entry.d_ino = n_entry->attr->ia_ino;
- entry.d_off = off;
- if (n_entry->attr->ia_type == IA_IFDIR)
- entry.d_type = DT_DIR;
- else
- entry.d_type = DT_REG;
-
- this_size = max (sizeof(gf_dirent_t),
- sizeof (gfs3_dirplist))
- + strlen (entry.d_name) + 1;
-
- if (this_size + filled > size)
- break;
-
- bd_fd->entry = n_entry;
-
- this_entry = gf_dirent_for_name (entry.d_name);
- if (!this_entry) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "could not create gf_dirent for entry %s",
- entry.d_name);
- goto out;
- }
- this_entry->d_off = off;
- this_entry->d_ino = entry.d_ino;
- this_entry->d_type = entry.d_type;
- off++;
-
- list_add_tail (&this_entry->list, &entries->list);
-
- filled += this_size;
- count++;
- }
-out:
- BD_UNLOCK (bd_lock);
- return count;
-}
-
-int32_t
-bd_do_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, int whichop)
-{
- uint64_t tmp_bd_fd = 0;
- bd_fd_t *bd_fd = NULL;
- int ret = -1;
- int count = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- gf_dirent_t entries;
- gf_dirent_t *tmp_entry = NULL;
- bd_entry_t *bdentry = NULL;
- bd_priv_t *priv = NULL;
- char *devpath = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- INIT_LIST_HEAD (&entries.list);
-
- ret = fd_ctx_get (fd, this, &tmp_bd_fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING, "bd_fd is NULL, fd=%p", fd);
- op_errno = -EINVAL;
- goto out;
- }
- bd_fd = (bd_fd_t *) (long)tmp_bd_fd;
- LOCK (&fd->lock);
- {
- count = __bd_fill_readdir (&priv->lock, bd_fd, off,
- size, &entries);
- }
- UNLOCK (&fd->lock);
-
- /* pick ENOENT to indicate EOF */
- op_errno = errno;
- op_ret = count;
-
- if (whichop != GF_FOP_READDIRP)
- goto out;
-
- BD_RD_LOCK (&priv->lock);
- list_for_each_entry (tmp_entry, &entries.list, list) {
- char path[PATH_MAX];
- sprintf (path, "%s/%s", bd_fd->p_entry->name,
- tmp_entry->d_name);
- bdentry = bd_entry_get (path);
- if (!bdentry) {
- gf_log (this->name, GF_LOG_WARNING,
- "entry failed %s\n", tmp_entry->d_name);
- continue;
- }
- if (bdentry->attr->ia_ino)
- tmp_entry->d_ino = bdentry->attr->ia_ino;
- memcpy (&tmp_entry->d_stat,
- bdentry->attr, sizeof (tmp_entry->d_stat));
- bd_entry_put (bdentry);
- GF_FREE (devpath);
- }
- BD_UNLOCK (&priv->lock);
-
-out:
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, &entries, NULL);
-
- gf_dirent_free (&entries);
-
- return 0;
-}
-
-int32_t
-bd_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, dict_t *dict)
-{
- bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIR);
- return 0;
-}
-
-
-int32_t
-bd_readdirp (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, dict_t *dict)
-{
- bd_do_readdir (frame, this, fd, size, off, GF_FOP_READDIRP);
- return 0;
-}
-
-int32_t
-bd_priv (xlator_t *this)
-{
- return 0;
-}
-
-int32_t
-bd_inode (xlator_t *this)
-{
- return 0;
-}
-
-/* unsupported interfaces */
-int32_t
-bd_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size, dict_t *xdata)
-{
- struct iatt stbuf = {0, };
- char *dest = NULL;
-
- dest = alloca (size + 1);
- STACK_UNWIND_STRICT (readlink, frame, -1, ENOSYS, dest, &stbuf, NULL);
- return 0;
-}
-
-int
-bd_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t dev, mode_t umask, dict_t *xdata)
-{
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
-
- STACK_UNWIND_STRICT (mknod, frame, -1, ENOSYS,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-int
-bd_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata)
-{
- struct iatt stbuf = {0, };
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
-
- STACK_UNWIND_STRICT (mkdir, frame, -1, ENOSYS,
- (loc)?loc->inode:NULL, &stbuf, &preparent,
- &postparent, NULL);
- return 0;
-}
-
-int
-bd_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
- dict_t *xdata)
-{
- struct iatt preparent = {0, };
- struct iatt postparent = {0, };
-
- STACK_UNWIND_STRICT (rmdir, frame, -1, ENOSYS,
- &preparent, &postparent, NULL);
- return 0;
-}
-
-int32_t
-bd_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *dict, int flags, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setxattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (getxattr, frame, -1, ENOSYS, NULL, NULL);
- return 0;
-}
-
-int32_t
-bd_fgetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOSYS, NULL, NULL);
-
- return 0;
-}
-
-int32_t
-bd_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (removexattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fremovexattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fsyncdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int datasync, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsyncdir, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-static int gf_bd_lk_log;
-int32_t
-bd_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
-{
- struct gf_flock nullock = {0, };
-
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (lk, frame, -1, ENOSYS, &nullock, NULL);
- return 0;
-}
-
-int32_t
-bd_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (inodelk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (finodelk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-
-int32_t
-bd_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (entrylk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- GF_LOG_OCCASIONALLY (gf_bd_lk_log, this->name, GF_LOG_CRITICAL,
- "\"features/locks\" translator is "
- "not loaded. You need to use it for proper "
- "functioning of your application.");
-
- STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOSYS, NULL);
- return 0;
-}
-
-int32_t
-bd_rchecksum (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset, int32_t len, dict_t *xdata)
-{
- int32_t weak_checksum = 0;
- unsigned char strong_checksum[MD5_DIGEST_LENGTH];
-
- STACK_UNWIND_STRICT (rchecksum, frame, -1, ENOSYS,
- weak_checksum, strong_checksum, NULL);
- return 0;
-}
-
-int
-bd_xattrop (call_frame_t *frame, xlator_t *this,
- loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL);
- return 0;
-}
-
-
-int
-bd_fxattrop (call_frame_t *frame, xlator_t *this,
- fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (xattrop, frame, -1, ENOSYS, xattr, NULL);
- return 0;
-}
-
-int bd_xl_op_create (bd_priv_t *priv, dict_t *input, dict_t *output)
-{
- char *vg = NULL;
- char *lv = NULL;
- char *path = NULL;
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- char *size = 0;
- int ret = -1;
- char *error = NULL;
- int retval = -1;
- char *buff = NULL;
- char *buffp = NULL;
- char *save = NULL;
-
- ret = dict_get_str (input, "size", &size);
- if (ret) {
- gf_asprintf (&error, "no size specified");
- goto out;
- }
- ret = dict_get_str (input, "path", &path);
- if (ret) {
- gf_asprintf (&error, "no path specified");
- goto out;
- }
-
- buff = buffp = gf_strdup (path);
-
- vg = strtok_r (buff, "/", &save);
- lv = strtok_r (NULL, "/", &save);
-
- if (!vg || !lv) {
- gf_asprintf (&error, "invalid path %s", path);
- ret = -1;
- goto out;
- }
-
- BD_ENTRY (priv, p_entry, vg);
- if (!p_entry) {
- ret = -ENOENT;
- goto out;
- }
- BD_ENTRY (priv, lventry, path);
- if (lventry) {
- ret = -EEXIST;
- gf_asprintf (&error, "%s already exists", lv);
- BD_PUT_ENTRY (priv, lventry);
- goto out;
- }
-
- ret = bd_create_lv (priv, p_entry, vg, lv, size, 0);
- if (ret < 0) {
- gf_asprintf (&error, "bd_create_lv error %d", -ret);
- goto out;
- }
- ret = 0;
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
-
- if (buffp)
- GF_FREE (buffp);
-
- if (error)
- retval = dict_set_dynstr (output, "error", error);
- return ret;
-}
-
-int bd_xl_op_delete (bd_priv_t *priv, dict_t *input, dict_t *output)
-{
- char *vg = NULL;
- char *path = NULL;
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- int ret = -1;
- char *error = NULL;
- int retval = -1;
- char *buff = NULL;
- char *buffp = NULL;
- char *save = NULL;
- int op_errno = 0;
-
- ret = dict_get_str (input, "path", &path);
- if (ret) {
- gf_asprintf (&error, "no path specified");
- goto out;
- }
-
- buff = buffp = gf_strdup (path);
-
- vg = strtok_r (buff, "/", &save);
- if (!vg) {
- gf_asprintf (&error, "invalid path %s", path);
- op_errno = EINVAL;
- ret = -1;
- goto out;
- }
-
- BD_ENTRY (priv, p_entry, vg);
- BD_ENTRY (priv, lventry, path);
- if (!p_entry || !lventry) {
- op_errno = -ENOENT;
- gf_asprintf (&error, "%s not found", path);
- ret = -1;
- goto out;
- }
- ret = bd_delete_lv (priv, p_entry, lventry, path, &op_errno);
- if (ret < 0) {
- gf_asprintf (&error, "bd_delete_lv error, error:%d", op_errno);
- goto out;
- }
- ret = 0;
-out:
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (lventry)
- BD_PUT_ENTRY (priv, lventry);
- if (buffp)
- GF_FREE (buffp);
- if (error)
- retval = dict_set_dynstr (output, "error", error);
- return ret;
-}
-
-int bd_xl_op_clone(bd_priv_t *priv, int subop, dict_t *input, dict_t *output)
-{
- bd_entry_t *p_entry = NULL;
- bd_entry_t *lventry = NULL;
- int ret = -1;
- char *error = NULL;
- int retval = -1;
- char *vg = NULL;
- char *lv = NULL;
- char *dest_lv = NULL;
- char *size = NULL;
- char *buff = NULL;
- char *buffp = NULL;
- char *path = NULL;
- char *save = NULL;
- char *npath = NULL;
-
- ret = dict_get_str (input, "path", &path);
- ret = dict_get_str (input, "dest_lv", &dest_lv);
- ret = dict_get_str (input, "size", &size);
-
- if (!path || !dest_lv) {
- gf_asprintf (&error, "invalid arguments");
- ret = -1;
- goto out;
- }
-
- buff = buffp = gf_strdup (path);
-
- vg = strtok_r (buff, "/", &save);
- lv = strtok_r (NULL, "/", &save);
- if (!lv) {
- gf_asprintf (&error, "lv not given %s", path);
- ret = -1;
- goto out;
- }
-
- BD_ENTRY (priv, p_entry, vg);
- if (!p_entry) {
- gf_asprintf (&error, "%s does not exist", vg);
- retval = dict_set_str (output, "error", error);
- goto out;
- }
-
- BD_ENTRY (priv, lventry, path);
- if (!lventry) {
- gf_asprintf (&error, "%s does not exist", path);
- ret = -1;
- goto out;
- }
- BD_PUT_ENTRY (priv, lventry);
- lventry = NULL;
- gf_asprintf (&npath, "/%s/%s", vg, dest_lv);
- BD_ENTRY (priv, lventry, npath);
- if (lventry) {
- gf_asprintf (&error, "%s already exists", dest_lv);
- BD_PUT_ENTRY (priv, lventry);
- ret = -1;
- goto out;
- }
-
- if (subop == GF_BD_OP_SNAPSHOT_BD) {
- if (!size) {
- gf_asprintf (&error, "size not given");
- ret = -1;
- goto out;
- }
- ret = bd_snapshot_lv (priv, p_entry, output, lv, dest_lv,
- size, NULL);
- } else
- ret = bd_clone_lv (priv, p_entry, output, vg, lv, dest_lv,
- NULL);
-
- if (ret)
- goto out;
- ret = 0;
-out:
- if (error)
- retval = dict_set_dynstr (output, "error", error);
- if (p_entry)
- BD_PUT_ENTRY (priv, p_entry);
- if (npath)
- GF_FREE (npath);
- if (buffp)
- GF_FREE (buffp);
- return ret;
-}
-
-int32_t
-bd_notify (xlator_t *this, dict_t *input, dict_t *output)
-{
- int ret = -1;
- int retval = -1;
- int32_t bdop = -1;
- bd_priv_t *priv = NULL;
- char *error = NULL;
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv, out);
-
- ret = dict_get_int32 (input, "bd-op", (int32_t *)&bdop);
- if (ret) {
- gf_asprintf (&error, "no sub-op specified");
- goto out;
- }
-
- switch (bdop)
- {
- case GF_BD_OP_NEW_BD:
- ret = bd_xl_op_create (priv, input, output);
- break;
- case GF_BD_OP_DELETE_BD:
- ret = bd_xl_op_delete (priv, input, output);
- break;
- case GF_BD_OP_CLONE_BD:
- case GF_BD_OP_SNAPSHOT_BD:
- ret = bd_xl_op_clone (priv, bdop, input, output);
- break;
- default:
- gf_asprintf (&error, "invalid bd-op %d specified", bdop);
- retval = dict_set_dynstr (output, "error", error);
- goto out;
- }
-
-out:
- return ret;
-}
-
-/**
- * notify - when parent sends PARENT_UP, send CHILD_UP event from here
- */
-int32_t
-notify (xlator_t *this,
- int32_t event,
- void *data,
- ...)
-{
- va_list ap;
- int ret = 0;
- void *data2 = NULL;
- dict_t *input = NULL;
- dict_t *output = NULL;
-
- va_start (ap, data);
- data2 = va_arg (ap, dict_t *);
- va_end (ap);
-
- switch (event)
- {
- case GF_EVENT_PARENT_UP:
- {
- /* Tell the parent that bd xlator is up */
- default_notify (this, GF_EVENT_CHILD_UP, data);
- }
- break;
- case GF_EVENT_TRANSLATOR_OP:
- input = data;
- output = data2;
- if (!output)
- output = dict_new ();
- ret = bd_notify (this, input, output);
- break;
-
- default:
- break;
- }
- return ret;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-
- return ret;
-}
-
-
-/**
- * init - Constructs lists of LVs in the given VG
- */
-int
-init (xlator_t *this)
-{
- bd_priv_t *_private = NULL;
- int ret = 0;
- char *vg = NULL;
- char *device = NULL;
-
- LOCK_INIT (&inode_lk);
-
- bd_rootp = bd_entry_add_root ();
- if (!bd_rootp) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: adding root entry failed");
- return -1;
- }
-
- if (this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd cannot have subvolumes");
- ret = -1;
- goto out;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "Volume is dangling. Please check the volume file.");
- }
-
- ret = dict_get_str (this->options, "device", &device);
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd does not specify backend");
- return -1;
- }
-
- /* Now we support only LV device */
- if (strcasecmp (device, BACKEND_VG)) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: unknown %s backend %s", BD_XLATOR, device);
- return -1;
- }
-
- ret = dict_get_str (this->options, "export", &vg);
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "FATAL: storage/bd does not specify volume groups");
- return -1;
- }
-
- ret = 0;
- _private = GF_CALLOC (1, sizeof(*_private), gf_bd_private);
- if (!_private)
- goto error;
-
- pthread_rwlock_init (&_private->lock, NULL);
- this->private = (void *)_private;
- _private->handle = NULL;
- _private->vg = gf_strdup (vg);
- if (!_private->vg) {
- goto error;
- }
-
- if (bd_build_lv_list (this->private, vg) < 0)
- goto error;
-
-out:
- return 0;
-error:
- BD_WR_LOCK (&_private->lock);
- bd_entry_cleanup ();
- lvm_quit (_private->handle);
- if (_private->vg)
- GF_FREE (_private->vg);
- GF_FREE (_private);
- return -1;
-}
-
-void
-fini (xlator_t *this)
-{
- bd_priv_t *priv = this->private;
- if (!priv)
- return;
- lvm_quit (priv->handle);
- BD_WR_LOCK (&priv->lock);
- bd_entry_cleanup ();
- BD_UNLOCK (&priv->lock);
- GF_FREE (priv->vg);
- this->private = NULL;
- GF_FREE (priv);
- return;
-}
-
-struct xlator_dumpops dumpops = {
- .priv = bd_priv,
- .inode = bd_inode,
-};
-
-struct xlator_fops fops = {
- /* Not supported */
- .readlink = bd_readlink,
- .mknod = bd_mknod,
- .mkdir = bd_mkdir,
- .rmdir = bd_rmdir,
- .setxattr = bd_setxattr,
- .fsetxattr = bd_fsetxattr,
- .getxattr = bd_getxattr,
- .fgetxattr = bd_fgetxattr,
- .removexattr = bd_removexattr,
- .fremovexattr= bd_fremovexattr,
- .fsyncdir = bd_fsyncdir,
- .lk = bd_lk,
- .inodelk = bd_inodelk,
- .finodelk = bd_finodelk,
- .entrylk = bd_entrylk,
- .fentrylk = bd_fentrylk,
- .rchecksum = bd_rchecksum,
- .xattrop = bd_xattrop,
-
- /* Supported */
- .lookup = bd_lookup,
- .opendir = bd_opendir,
- .readdir = bd_readdir,
- .readdirp = bd_readdirp,
- .stat = bd_stat,
- .statfs = bd_statfs,
- .open = bd_open,
- .access = bd_access,
- .flush = bd_flush,
- .readv = bd_readv,
- .fstat = bd_fstat,
- .truncate = bd_truncate,
- .ftruncate = bd_ftruncate,
- .fsync = bd_fsync,
- .writev = bd_writev,
- .fstat = bd_fstat,
- .create = bd_create,
- .setattr = bd_setattr,
- .fsetattr = bd_fsetattr,
- .unlink = bd_unlink,
- .link = bd_link,
- .symlink = bd_symlink,
- .rename = bd_rename,
-};
-
-struct xlator_cbks cbks = {
- .releasedir = bd_releasedir,
- .release = bd_release,
-};
-
-struct volume_options options[] = {
- { .key = {"export"},
- .type = GF_OPTION_TYPE_STR},
- { .key = {"device"},
- .type = GF_OPTION_TYPE_STR},
- { .key = {NULL} }
-};
diff --git a/xlators/storage/bd_map/src/bd_map.h b/xlators/storage/bd_map/src/bd_map.h
deleted file mode 100644
index 1a0f4248e..000000000
--- a/xlators/storage/bd_map/src/bd_map.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- BD translator - Exports Block devices on server side as regular
- files to client
-
- Copyright IBM, Corp. 2012
-
- This file is part of GlusterFS.
-
- Author:
- M. Mohan Kumar <mohan@in.ibm.com>
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _BD_MAP_H
-#define _BD_MAP_H
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "xlator.h"
-#include "mem-types.h"
-
-#define BD_XLATOR "block device mapper xlator"
-
-#define BACKEND_VG "vg"
-
-enum gf_bd_mem_types_ {
- gf_bd_fd = gf_common_mt_end + 1,
- gf_bd_private,
- gf_bd_entry,
- gf_bd_attr,
- gf_bd_mt_end
-};
-
-/*
- * Each BD/LV is represented by this data structure
- * Usually root entry will have only children and there is no sibling for that
- * All other entries may have children and/or sibling entries
- * If an entry is a Volume Group it will have child (. & .. and Logical
- * Volumes) and also other Volume groups will be a sibling for this
- */
-typedef struct bd_entry {
- struct list_head child; /* List to child */
- struct list_head sibling; /* List of siblings */
- struct bd_entry *parent;/* Parent of this node */
- struct bd_entry *link; /* Link to actual entry, if its . or .. */
- char name[NAME_MAX];
- struct iatt *attr;
- int refcnt;
- uint64_t size;
- pthread_rwlock_t lock;
-} bd_entry_t;
-
-/**
- * bd_fd - internal structure common to file and directory fd's
- */
-typedef struct bd_fd {
- bd_entry_t *entry;
- bd_entry_t *p_entry; /* Parent entry */
- int fd;
- int32_t flag;
-} bd_fd_t;
-
-typedef struct bd_priv {
- lvm_t handle;
- pthread_rwlock_t lock;
- char *vg;
-} bd_priv_t;
-
-#endif
diff --git a/xlators/storage/bd_map/src/bd_map_help.c b/xlators/storage/bd_map/src/bd_map_help.c
deleted file mode 100644
index 0613aa383..000000000
--- a/xlators/storage/bd_map/src/bd_map_help.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- BD translator - Exports Block devices on server side as regular
- files to client
-
- Copyright IBM, Corp. 2012
-
- This file is part of GlusterFS.
-
- Author:
- M. Mohan Kumar <mohan@in.ibm.com>
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#define __XOPEN_SOURCE 500
-
-#include <libgen.h>
-#include <time.h>
-#include <lvm2app.h>
-
-#include "bd_map.h"
-#include "bd_map_help.h"
-#include "defaults.h"
-#include "glusterfs3-xdr.h"
-
-#define CHILD_ENTRY(node) list_entry ((&node->child)->next, typeof(*node), \
- child)
-
-bd_entry_t *bd_rootp;
-gf_lock_t inode_lk;
-static uint64_t bd_entry_ino = 5000; /* Starting inode */
-
-static void bd_entry_get_ino (uint64_t *inode)
-{
- LOCK (&inode_lk);
- {
- *inode = bd_entry_ino++;
- }
- UNLOCK (&inode_lk);
-}
-
-void bd_update_time (bd_entry_t *entry, int type)
-{
- struct timespec ts;
-
- clock_gettime (CLOCK_REALTIME, &ts);
- if (type == 0) {
- entry->attr->ia_mtime = ts.tv_sec;
- entry->attr->ia_mtime_nsec = ts.tv_nsec;
- entry->attr->ia_atime = ts.tv_sec;
- entry->attr->ia_atime_nsec = ts.tv_nsec;
- } else if (type == 1) {
- entry->attr->ia_mtime = ts.tv_sec;
- entry->attr->ia_mtime_nsec = ts.tv_nsec;
- } else {
- entry->attr->ia_atime = ts.tv_sec;
- entry->attr->ia_atime_nsec = ts.tv_nsec;
- }
-}
-
-static bd_entry_t *bd_entry_init (const char *name)
-{
- bd_entry_t *bdentry;
-
- bdentry = GF_MALLOC (sizeof(bd_entry_t), gf_bd_entry);
- if (!bdentry)
- return NULL;
-
- bdentry->attr = GF_MALLOC (sizeof(struct iatt), gf_bd_attr);
- if (!bdentry->attr) {
- GF_FREE (bdentry);