diff -Nru drbd8-8.3.7/.filelist drbd8-8.4.1+git55a81dc~cmd1/.filelist --- drbd8-8.3.7/.filelist 2010-01-13 16:17:27.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.filelist 1970-01-01 00:00:00.000000000 +0000 @@ -1,107 +0,0 @@ -drbd-8.3.7/.gitignore -drbd-8.3.7/COPYING -drbd-8.3.7/ChangeLog -drbd-8.3.7/Makefile.in -drbd-8.3.7/README -drbd-8.3.7/autogen.sh -drbd-8.3.7/benchmark/Makefile -drbd-8.3.7/benchmark/README -drbd-8.3.7/benchmark/dm.c -drbd-8.3.7/benchmark/io-latency-test.c -drbd-8.3.7/configure.ac -drbd-8.3.7/documentation/Makefile.in -drbd-8.3.7/documentation/Makefile.lang -drbd-8.3.7/documentation/aspell.en.per -drbd-8.3.7/documentation/drbd.conf.xml -drbd-8.3.7/documentation/drbd.xml -drbd-8.3.7/documentation/drbdadm.xml -drbd-8.3.7/documentation/drbddisk.xml -drbd-8.3.7/documentation/drbdmeta.xml -drbd-8.3.7/documentation/drbdsetup.xml -drbd-8.3.7/drbd-km.spec.in -drbd-8.3.7/drbd.spec.in -drbd-8.3.7/drbd/Kconfig -drbd-8.3.7/drbd/Makefile -drbd-8.3.7/drbd/Makefile-2.6 -drbd-8.3.7/drbd/cn_queue.c -drbd-8.3.7/drbd/connector.c -drbd-8.3.7/drbd/drbd_actlog.c -drbd-8.3.7/drbd/drbd_bitmap.c -drbd-8.3.7/drbd/drbd_int.h -drbd-8.3.7/drbd/drbd_main.c -drbd-8.3.7/drbd/drbd_nl.c -drbd-8.3.7/drbd/drbd_proc.c -drbd-8.3.7/drbd/drbd_receiver.c -drbd-8.3.7/drbd/drbd_req.c -drbd-8.3.7/drbd/drbd_req.h -drbd-8.3.7/drbd/drbd_strings.c -drbd-8.3.7/drbd/drbd_tracing.c -drbd-8.3.7/drbd/drbd_tracing.h -drbd-8.3.7/drbd/drbd_vli.h -drbd-8.3.7/drbd/drbd_worker.c -drbd-8.3.7/drbd/drbd_wrappers.h -drbd-8.3.7/drbd/linux/connector.h -drbd-8.3.7/drbd/linux/drbd.h -drbd-8.3.7/drbd/linux/drbd_config.h -drbd-8.3.7/drbd/linux/drbd_limits.h -drbd-8.3.7/drbd/linux/drbd_nl.h -drbd-8.3.7/drbd/linux/drbd_tag_magic.h -drbd-8.3.7/drbd/linux/hardirq.h -drbd-8.3.7/drbd/linux/lru_cache.h -drbd-8.3.7/drbd/linux/memcontrol.h -drbd-8.3.7/drbd/linux/mutex.h -drbd-8.3.7/drbd/linux/tracepoint.h -drbd-8.3.7/drbd/lru_cache.c -drbd-8.3.7/scripts/Makefile.in -drbd-8.3.7/scripts/README 
-drbd-8.3.7/scripts/adjust_drbd_config_h.sh -drbd-8.3.7/scripts/block-drbd -drbd-8.3.7/scripts/crm-fence-peer.sh -drbd-8.3.7/scripts/drbd -drbd-8.3.7/scripts/drbd-overview.pl -drbd-8.3.7/scripts/drbd.conf -drbd-8.3.7/scripts/drbd.conf.example -drbd-8.3.7/scripts/drbd.gentoo -drbd-8.3.7/scripts/drbd.metadata.rhcs -drbd-8.3.7/scripts/drbd.ocf -drbd-8.3.7/scripts/drbd.rules -drbd-8.3.7/scripts/drbd.sh.rhcs -drbd-8.3.7/scripts/drbdadm.bash_completion -drbd-8.3.7/scripts/drbddisk -drbd-8.3.7/scripts/drbdupper -drbd-8.3.7/scripts/get_uts_release.sh -drbd-8.3.7/scripts/global_common.conf -drbd-8.3.7/scripts/notify.sh -drbd-8.3.7/scripts/outdate-peer.sh -drbd-8.3.7/scripts/patch-kernel -drbd-8.3.7/scripts/pretty-proc-drbd.sh -drbd-8.3.7/scripts/snapshot-resync-target-lvm.sh -drbd-8.3.7/scripts/unsnapshot-resync-target-lvm.sh -drbd-8.3.7/user/Makefile.in -drbd-8.3.7/user/drbd_endian.h -drbd-8.3.7/user/drbdadm.h -drbd-8.3.7/user/drbdadm_adjust.c -drbd-8.3.7/user/drbdadm_main.c -drbd-8.3.7/user/drbdadm_minor_table.c -drbd-8.3.7/user/drbdadm_parser.c -drbd-8.3.7/user/drbdadm_parser.h -drbd-8.3.7/user/drbdadm_scanner.fl -drbd-8.3.7/user/drbdadm_usage_cnt.c -drbd-8.3.7/user/drbdmeta.c -drbd-8.3.7/user/drbdmeta_parser.h -drbd-8.3.7/user/drbdmeta_scanner.fl -drbd-8.3.7/user/drbdsetup.c -drbd-8.3.7/user/drbdtool_common.c -drbd-8.3.7/user/drbdtool_common.h -drbd-8.3.7/user/unaligned.h -drbd-8.3.7/documentation/drbdsetup.8 -drbd-8.3.7/documentation/drbd.conf.5 -drbd-8.3.7/documentation/drbd.8 -drbd-8.3.7/documentation/drbdadm.8 -drbd-8.3.7/documentation/drbdmeta.8 -drbd-8.3.7/documentation/drbddisk.8 -drbd-8.3.7/drbd_config.h -drbd-8.3.7/drbd/drbd_buildtag.c -drbd-8.3.7/.filelist -drbd-8.3.7/configure -drbd-8.3.7/user/config.h.in diff -Nru drbd8-8.3.7/.git/COMMIT_EDITMSG drbd8-8.4.1+git55a81dc~cmd1/.git/COMMIT_EDITMSG --- drbd8-8.3.7/.git/COMMIT_EDITMSG 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/COMMIT_EDITMSG 2012-09-03 21:31:23.000000000 +0000 @@ -0,0 
+1,25 @@ +drbd: fix binary-incompatible build on some platforms + +The type of the make_request_fn changed from int to void in upstream +kernel. To be compatible with both older and newer kernels, our compat.h +auto-detect magic tries to figure this out. + +However, at least on Ubuntu Lucid, the compiler ignores the +pragma -Werror we rely on there, leading to us using void, +where the rest of the kernel expects an int return code. + +This caused interesting BUGs like (short version): + BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 + IP: [] clone_endio+0x34/0xe0 + Pid: 3517, comm: kdmflush Not tainted 2.6.32-38-server #83-Ubuntu X8DTN + Call Trace: + [] bio_endio+0x1d/0x40 + [] drbd_make_request+0x34b/0x350 [drbd] + [] generic_make_request+0x1b1/0x4f0 + [] __map_bio+0xad/0x130 + [] __clone_and_map+0x4ad/0x4c0 + [] __split_and_process_bio+0x108/0x190 + [] dm_flush+0x56/0x70 + +Fix: in compat/tests/have_void_make_request.c, don't rely on -Werror, but + BUILD_BUG_ON(!(__same_type(&drbd_make_request, make_request_fn))); diff -Nru drbd8-8.3.7/.git/FETCH_HEAD drbd8-8.4.1+git55a81dc~cmd1/.git/FETCH_HEAD --- drbd8-8.3.7/.git/FETCH_HEAD 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/FETCH_HEAD 2012-09-03 21:30:57.000000000 +0000 @@ -0,0 +1,5 @@ +56ed9398f8e74a8b1b1e3e4b8770acc58fd1cd17 not-for-merge branch 'master' of git://git.drbd.org/drbd-8.4 +e5b8bd4b2b2bfb746de97fd4881feefb47f0b8fc not-for-merge branch 'zero-copy-receive' of git://git.drbd.org/drbd-8.4 +7a59a5b69271df94c1f10b5a4dad48a5a3b1aea5 not-for-merge tag 'drbd-8.4.2rc1' of git://git.drbd.org/drbd-8.4 +5ab1ece053485cf9b9b3e775fe58a746ed1c20df not-for-merge tag 'drbd-8.4.2rc2' of git://git.drbd.org/drbd-8.4 +ac15f759f6055a930b3aad30b57781315b5abbef not-for-merge tag 'drbd-8.4.2rc3' of git://git.drbd.org/drbd-8.4 diff -Nru drbd8-8.3.7/.git/HEAD drbd8-8.4.1+git55a81dc~cmd1/.git/HEAD --- drbd8-8.3.7/.git/HEAD 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/.git/HEAD 2012-09-03 21:31:23.000000000 +0000 @@ -0,0 +1 @@ +e3169387b068d825dd433287f7fd7ba48ed07919 diff -Nru drbd8-8.3.7/.git/config drbd8-8.4.1+git55a81dc~cmd1/.git/config --- drbd8-8.3.7/.git/config 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/config 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,11 @@ +[core] + repositoryformatversion = 0 + filemode = true + bare = false + logallrefupdates = true +[remote "origin"] + fetch = +refs/heads/*:refs/remotes/origin/* + url = git://git.drbd.org/drbd-8.4.git +[branch "master"] + remote = origin + merge = refs/heads/master diff -Nru drbd8-8.3.7/.git/description drbd8-8.4.1+git55a81dc~cmd1/.git/description --- drbd8-8.3.7/.git/description 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/description 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1 @@ +Unnamed repository; edit this file 'description' to name the repository. diff -Nru drbd8-8.3.7/.git/hooks/applypatch-msg.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/applypatch-msg.sample --- drbd8-8.3.7/.git/hooks/applypatch-msg.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/applypatch-msg.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,15 @@ +#!/bin/sh +# +# An example hook script to check the commit log message taken by +# applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. The hook is +# allowed to edit the commit message file. +# +# To enable this hook, rename this file to "applypatch-msg". + +. 
git-sh-setup +test -x "$GIT_DIR/hooks/commit-msg" && + exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"} +: diff -Nru drbd8-8.3.7/.git/hooks/commit-msg.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/commit-msg.sample --- drbd8-8.3.7/.git/hooks/commit-msg.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/commit-msg.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,24 @@ +#!/bin/sh +# +# An example hook script to check the commit log message. +# Called by git-commit with one argument, the name of the file +# that has the commit message. The hook should exit with non-zero +# status after issuing an appropriate message if it wants to stop the +# commit. The hook is allowed to edit the commit message file. +# +# To enable this hook, rename this file to "commit-msg". + +# Uncomment the below to add a Signed-off-by line to the message. +# Doing this in a hook is a bad idea in general, but the prepare-commit-msg +# hook is more suited to it. +# +# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" + +# This example catches duplicate Signed-off-by lines. + +test "" = "$(grep '^Signed-off-by: ' "$1" | + sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { + echo >&2 Duplicate Signed-off-by lines. + exit 1 +} diff -Nru drbd8-8.3.7/.git/hooks/post-commit.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/post-commit.sample --- drbd8-8.3.7/.git/hooks/post-commit.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/post-commit.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,8 @@ +#!/bin/sh +# +# An example hook script that is called after a successful +# commit is made. +# +# To enable this hook, rename this file to "post-commit". 
+ +: Nothing diff -Nru drbd8-8.3.7/.git/hooks/post-receive.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/post-receive.sample --- drbd8-8.3.7/.git/hooks/post-receive.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/post-receive.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,15 @@ +#!/bin/sh +# +# An example hook script for the "post-receive" event. +# +# The "post-receive" script is run after receive-pack has accepted a pack +# and the repository has been updated. It is passed arguments in through +# stdin in the form +# +# For example: +# aa453216d1b3e49e7f6f98441fa56946ddcd6a20 68f7abf4e6f922807889f52bc043ecd31b79f814 refs/heads/master +# +# see contrib/hooks/ for a sample, or uncomment the next line and +# rename the file to "post-receive". + +#. /usr/share/doc/git-core/contrib/hooks/post-receive-email diff -Nru drbd8-8.3.7/.git/hooks/post-update.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/post-update.sample --- drbd8-8.3.7/.git/hooks/post-update.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/post-update.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,8 @@ +#!/bin/sh +# +# An example hook script to prepare a packed repository for use over +# dumb transports. +# +# To enable this hook, rename this file to "post-update". + +exec git-update-server-info diff -Nru drbd8-8.3.7/.git/hooks/pre-applypatch.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/pre-applypatch.sample --- drbd8-8.3.7/.git/hooks/pre-applypatch.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/pre-applypatch.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,14 @@ +#!/bin/sh +# +# An example hook script to verify what is about to be committed +# by applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-applypatch". + +. 
git-sh-setup +test -x "$GIT_DIR/hooks/pre-commit" && + exec "$GIT_DIR/hooks/pre-commit" ${1+"$@"} +: diff -Nru drbd8-8.3.7/.git/hooks/pre-commit.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/pre-commit.sample --- drbd8-8.3.7/.git/hooks/pre-commit.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/pre-commit.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,46 @@ +#!/bin/sh +# +# An example hook script to verify what is about to be committed. +# Called by git-commit with no arguments. The hook should +# exit with non-zero status after issuing an appropriate message if +# it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-commit". + +if git-rev-parse --verify HEAD >/dev/null 2>&1 +then + against=HEAD +else + # Initial commit: diff against an empty tree object + against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 +fi + +# If you want to allow non-ascii filenames set this variable to true. +allownonascii=$(git config hooks.allownonascii) + +# Cross platform projects tend to avoid non-ascii filenames; prevent +# them from being added to the repository. We exploit the fact that the +# printable range starts at the space character and ends with tilde. +if [ "$allownonascii" != "true" ] && + # Note that the use of brackets around a tr range is ok here, (it's + # even required, for portability to Solaris 10's /usr/bin/tr), since + # the square bracket bytes happen to fall in the designated range. + test "$(git diff --cached --name-only --diff-filter=A -z $against | + LC_ALL=C tr -d '[ -~]\0')" +then + echo "Error: Attempt to add a non-ascii file name." + echo + echo "This can cause problems if you want to work" + echo "with people on other platforms." + echo + echo "To be portable it is advisable to rename the file ..." 
+ echo + echo "If you know what you are doing you can disable this" + echo "check using:" + echo + echo " git config hooks.allownonascii true" + echo + exit 1 +fi + +exec git diff-index --check --cached $against -- diff -Nru drbd8-8.3.7/.git/hooks/pre-rebase.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/pre-rebase.sample --- drbd8-8.3.7/.git/hooks/pre-rebase.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/pre-rebase.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,169 @@ +#!/bin/sh +# +# Copyright (c) 2006, 2008 Junio C Hamano +# +# The "pre-rebase" hook is run just before "git-rebase" starts doing +# its job, and can prevent the command from running by exiting with +# non-zero status. +# +# The hook is called with the following parameters: +# +# $1 -- the upstream the series was forked from. +# $2 -- the branch being rebased (or empty when rebasing the current branch). +# +# This sample shows how to prevent topic branches that are already +# merged to 'next' branch from getting rebased, because allowing it +# would result in rebasing already published history. + +publish=next +basebranch="$1" +if test "$#" = 2 +then + topic="refs/heads/$2" +else + topic=`git symbolic-ref HEAD` || + exit 0 ;# we do not interrupt rebasing detached HEAD +fi + +case "$topic" in +refs/heads/??/*) + ;; +*) + exit 0 ;# we do not interrupt others. + ;; +esac + +# Now we are dealing with a topic branch being rebased +# on top of master. Is it OK to rebase it? + +# Does the topic really exist? +git show-ref -q "$topic" || { + echo >&2 "No such branch $topic" + exit 1 +} + +# Is topic fully merged to master? +not_in_master=`git-rev-list --pretty=oneline ^master "$topic"` +if test -z "$not_in_master" +then + echo >&2 "$topic is fully merged to master; better remove it." + exit 1 ;# we could allow it, but there is no point. +fi + +# Is topic ever merged to next? If so you should not be rebasing it. 
+only_next_1=`git-rev-list ^master "^$topic" ${publish} | sort` +only_next_2=`git-rev-list ^master ${publish} | sort` +if test "$only_next_1" = "$only_next_2" +then + not_in_topic=`git-rev-list "^$topic" master` + if test -z "$not_in_topic" + then + echo >&2 "$topic is already up-to-date with master" + exit 1 ;# we could allow it, but there is no point. + else + exit 0 + fi +else + not_in_next=`git-rev-list --pretty=oneline ^${publish} "$topic"` + perl -e ' + my $topic = $ARGV[0]; + my $msg = "* $topic has commits already merged to public branch:\n"; + my (%not_in_next) = map { + /^([0-9a-f]+) /; + ($1 => 1); + } split(/\n/, $ARGV[1]); + for my $elem (map { + /^([0-9a-f]+) (.*)$/; + [$1 => $2]; + } split(/\n/, $ARGV[2])) { + if (!exists $not_in_next{$elem->[0]}) { + if ($msg) { + print STDERR $msg; + undef $msg; + } + print STDERR " $elem->[1]\n"; + } + } + ' "$topic" "$not_in_next" "$not_in_master" + exit 1 +fi + +exit 0 + +################################################################ + +This sample hook safeguards topic branches that have been +published from being rewound. + +The workflow assumed here is: + + * Once a topic branch forks from "master", "master" is never + merged into it again (either directly or indirectly). + + * Once a topic branch is fully cooked and merged into "master", + it is deleted. If you need to build on top of it to correct + earlier mistakes, a new topic branch is created by forking at + the tip of the "master". This is not strictly necessary, but + it makes it easier to keep your history simple. + + * Whenever you need to test or publish your changes to topic + branches, merge them into "next" branch. + +The script, being an example, hardcodes the publish branch name +to be "next", but it is trivial to make it configurable via +$GIT_DIR/config mechanism. + +With this workflow, you would want to know: + +(1) ... if a topic branch has ever been merged to "next". 
Young + topic branches can have stupid mistakes you would rather + clean up before publishing, and things that have not been + merged into other branches can be easily rebased without + affecting other people. But once it is published, you would + not want to rewind it. + +(2) ... if a topic branch has been fully merged to "master". + Then you can delete it. More importantly, you should not + build on top of it -- other people may already want to + change things related to the topic as patches against your + "master", so if you need further changes, it is better to + fork the topic (perhaps with the same name) afresh from the + tip of "master". + +Let's look at this example: + + o---o---o---o---o---o---o---o---o---o "next" + / / / / + / a---a---b A / / + / / / / + / / c---c---c---c B / + / / / \ / + / / / b---b C \ / + / / / / \ / + ---o---o---o---o---o---o---o---o---o---o---o "master" + + +A, B and C are topic branches. + + * A has one fix since it was merged up to "next". + + * B has finished. It has been fully merged up to "master" and "next", + and is ready to be deleted. + + * C has not merged to "next" at all. + +We would want to allow C to be rebased, refuse A, and encourage +B to be deleted. + +To compute (1): + + git-rev-list ^master ^topic next + git-rev-list ^master next + + if these match, topic has not merged in next at all. + +To compute (2): + + git-rev-list master..topic + + if this is empty, it is fully merged to "master". diff -Nru drbd8-8.3.7/.git/hooks/prepare-commit-msg.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/prepare-commit-msg.sample --- drbd8-8.3.7/.git/hooks/prepare-commit-msg.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/prepare-commit-msg.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,36 @@ +#!/bin/sh +# +# An example hook script to prepare the commit log message. 
+# Called by git-commit with the name of the file that has the +# commit message, followed by the description of the commit +# message's source. The hook's purpose is to edit the commit +# message file. If the hook fails with a non-zero status, +# the commit is aborted. +# +# To enable this hook, rename this file to "prepare-commit-msg". + +# This hook includes three examples. The first comments out the +# "Conflicts:" part of a merge commit. +# +# The second includes the output of "git diff --name-status -r" +# into the message, just before the "git status" output. It is +# commented because it doesn't cope with --amend or with squashed +# commits. +# +# The third example adds a Signed-off-by line to the message, that can +# still be edited. This is rarely a good idea. + +case "$2,$3" in + merge,) + perl -i.bak -ne 's/^/# /, s/^# #/#/ if /^Conflicts/ .. /#/; print' "$1" ;; + +# ,|template,) +# perl -i.bak -pe ' +# print "\n" . `git diff --cached --name-status -r` +# if /^#/ && $first++ == 0' "$1" ;; + + *) ;; +esac + +# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" diff -Nru drbd8-8.3.7/.git/hooks/update.sample drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/update.sample --- drbd8-8.3.7/.git/hooks/update.sample 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/hooks/update.sample 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,128 @@ +#!/bin/sh +# +# An example hook script to blocks unannotated tags from entering. +# Called by git-receive-pack with arguments: refname sha1-old sha1-new +# +# To enable this hook, rename this file to "update". +# +# Config +# ------ +# hooks.allowunannotated +# This boolean sets whether unannotated tags will be allowed into the +# repository. By default they won't be. +# hooks.allowdeletetag +# This boolean sets whether deleting tags will be allowed in the +# repository. By default they won't be. 
+# hooks.allowmodifytag +# This boolean sets whether a tag may be modified after creation. By default +# it won't be. +# hooks.allowdeletebranch +# This boolean sets whether deleting branches will be allowed in the +# repository. By default they won't be. +# hooks.denycreatebranch +# This boolean sets whether remotely creating branches will be denied +# in the repository. By default this is allowed. +# + +# --- Command line +refname="$1" +oldrev="$2" +newrev="$3" + +# --- Safety check +if [ -z "$GIT_DIR" ]; then + echo "Don't run this script from the command line." >&2 + echo " (if you want, you could supply GIT_DIR then run" >&2 + echo " $0 )" >&2 + exit 1 +fi + +if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# --- Config +allowunannotated=$(git config --bool hooks.allowunannotated) +allowdeletebranch=$(git config --bool hooks.allowdeletebranch) +denycreatebranch=$(git config --bool hooks.denycreatebranch) +allowdeletetag=$(git config --bool hooks.allowdeletetag) +allowmodifytag=$(git config --bool hooks.allowmodifytag) + +# check for no description +projectdesc=$(sed -e '1q' "$GIT_DIR/description") +case "$projectdesc" in +"Unnamed repository"* | "") + echo "*** Project description file hasn't been set" >&2 + exit 1 + ;; +esac + +# --- Check types +# if $newrev is 0000...0000, it's a commit to delete a ref. +zero="0000000000000000000000000000000000000000" +if [ "$newrev" = "$zero" ]; then + newrev_type=delete +else + newrev_type=$(git-cat-file -t $newrev) +fi + +case "$refname","$newrev_type" in + refs/tags/*,commit) + # un-annotated tag + short_refname=${refname##refs/tags/} + if [ "$allowunannotated" != "true" ]; then + echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2 + echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." 
>&2 + exit 1 + fi + ;; + refs/tags/*,delete) + # delete tag + if [ "$allowdeletetag" != "true" ]; then + echo "*** Deleting a tag is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/tags/*,tag) + # annotated tag + if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 + then + echo "*** Tag '$refname' already exists." >&2 + echo "*** Modifying a tag is not allowed in this repository." >&2 + exit 1 + fi + ;; + refs/heads/*,commit) + # branch + if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then + echo "*** Creating a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/heads/*,delete) + # delete branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/remotes/*,commit) + # tracking branch + ;; + refs/remotes/*,delete) + # delete tracking branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a tracking branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + *) + # Anything else (is there anything else?) + echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 + exit 1 + ;; +esac + +# --- Finished +exit 0 Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/index and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/index differ diff -Nru drbd8-8.3.7/.git/info/exclude drbd8-8.4.1+git55a81dc~cmd1/.git/info/exclude --- drbd8-8.3.7/.git/info/exclude 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/info/exclude 2012-02-02 14:09:06.000000000 +0000 @@ -0,0 +1,6 @@ +# git-ls-files --others --exclude-from=.git/info/exclude +# Lines that start with '#' are comments. 
+# For a project mostly in C, the following would be a good set of +# exclude patterns (uncomment them if you want to use them): +# *.[oa] +# *~ diff -Nru drbd8-8.3.7/.git/logs/HEAD drbd8-8.4.1+git55a81dc~cmd1/.git/logs/HEAD --- drbd8-8.3.7/.git/logs/HEAD 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/logs/HEAD 2012-09-03 21:31:23.000000000 +0000 @@ -0,0 +1,3 @@ +0000000000000000000000000000000000000000 91b4c048c1a0e06777b5f65d312b38d47abaea80 root 1328191754 -0600 clone: from git://git.drbd.org/drbd-8.4.git +91b4c048c1a0e06777b5f65d312b38d47abaea80 91b4c048c1a0e06777b5f65d312b38d47abaea80 root 1328191774 -0600 checkout: moving from master to drbd-8.4.1 +91b4c048c1a0e06777b5f65d312b38d47abaea80 e3169387b068d825dd433287f7fd7ba48ed07919 Ildefonso Camargo 1346707883 -0430 cherry-pick: drbd: fix binary-incompatible build on some platforms diff -Nru drbd8-8.3.7/.git/logs/refs/heads/master drbd8-8.4.1+git55a81dc~cmd1/.git/logs/refs/heads/master --- drbd8-8.3.7/.git/logs/refs/heads/master 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/logs/refs/heads/master 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1 @@ +0000000000000000000000000000000000000000 91b4c048c1a0e06777b5f65d312b38d47abaea80 root 1328191754 -0600 clone: from git://git.drbd.org/drbd-8.4.git diff -Nru drbd8-8.3.7/.git/logs/refs/remotes/origin/master drbd8-8.4.1+git55a81dc~cmd1/.git/logs/refs/remotes/origin/master --- drbd8-8.3.7/.git/logs/refs/remotes/origin/master 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/logs/refs/remotes/origin/master 2012-09-03 21:30:57.000000000 +0000 @@ -0,0 +1 @@ +91b4c048c1a0e06777b5f65d312b38d47abaea80 56ed9398f8e74a8b1b1e3e4b8770acc58fd1cd17 Ildefonso Camargo 1346707857 -0430 fetch: fast-forward Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/objects/19/57f36865f12f413186301639c90f18c5b6383a and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/objects/19/57f36865f12f413186301639c90f18c5b6383a differ 
Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/objects/39/d387426544fdd10293597bfae99a825ac48cf7 and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/objects/39/d387426544fdd10293597bfae99a825ac48cf7 differ Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/objects/e3/169387b068d825dd433287f7fd7ba48ed07919 and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/objects/e3/169387b068d825dd433287f7fd7ba48ed07919 differ Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/objects/pack/pack-7cd34c0c501af2938b3310c64932d0fb210658d7.idx and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/objects/pack/pack-7cd34c0c501af2938b3310c64932d0fb210658d7.idx differ Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/objects/pack/pack-7cd34c0c501af2938b3310c64932d0fb210658d7.pack and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/objects/pack/pack-7cd34c0c501af2938b3310c64932d0fb210658d7.pack differ Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/objects/pack/pack-fe1890efcc4e4e5aea4829936564928a4a8a3b08.idx and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/objects/pack/pack-fe1890efcc4e4e5aea4829936564928a4a8a3b08.idx differ Binary files /tmp/eLC1L4pXeJ/drbd8-8.3.7/.git/objects/pack/pack-fe1890efcc4e4e5aea4829936564928a4a8a3b08.pack and /tmp/EtA86naCDw/drbd8-8.4.1+git55a81dc~cmd1/.git/objects/pack/pack-fe1890efcc4e4e5aea4829936564928a4a8a3b08.pack differ diff -Nru drbd8-8.3.7/.git/packed-refs drbd8-8.4.1+git55a81dc~cmd1/.git/packed-refs --- drbd8-8.3.7/.git/packed-refs 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/packed-refs 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,19 @@ +# pack-refs with: peeled +311dc112cb6f3c9c2658c3e9e3de33d788af7902 refs/tags/drbd-8.4.1rc2 +^23a65b276f93f211aebb992513aade5bb0e76a69 +004fa9964f91021633ac05b3fac0bc2a3ce16941 refs/tags/drbd-8.4.1rc1 +^eb252062fcc810de0b7d6ff5a67867ba4bf3f9b1 +66248dfa824afe2525aa33e99d8f71a9899c7ea4 refs/tags/drbd-8.4.1 +^91b4c048c1a0e06777b5f65d312b38d47abaea80 +a96878f6e589e4a3e83b2925ea927f6e22699a82 
refs/tags/drbd-8.4.0rc4 +^4cae5718c19dbcb1505dadc6d078626e6af205ed +5f6efb0772b729d2396b18db5ba3c89a2923eb9e refs/tags/drbd-8.4.0rc3 +^a222a5af13886743c47e9d44329923cf94ba8d18 +7ec7671696654597347758a6fbd4931717dbdb80 refs/tags/drbd-8.4.0rc2 +^c0014a5ec7b162d3e4a3a81df829f9d4d84de94a +c9b4428e663c24c74ec33af97f36f5acc4b6bc2c refs/tags/drbd-8.4.0rc1 +^2712ba1a920636bfa324920409f949dcd4e0f5d5 +7a7f1aebba8cbbb3651ff6babea441d06e0a36fc refs/tags/drbd-8.4.0 +^28753f559ab51b549d16bcf487fe625d5919c49c +e5b8bd4b2b2bfb746de97fd4881feefb47f0b8fc refs/remotes/origin/zero-copy-receive +91b4c048c1a0e06777b5f65d312b38d47abaea80 refs/remotes/origin/master diff -Nru drbd8-8.3.7/.git/refs/heads/master drbd8-8.4.1+git55a81dc~cmd1/.git/refs/heads/master --- drbd8-8.3.7/.git/refs/heads/master 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/refs/heads/master 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1 @@ +91b4c048c1a0e06777b5f65d312b38d47abaea80 diff -Nru drbd8-8.3.7/.git/refs/remotes/origin/HEAD drbd8-8.4.1+git55a81dc~cmd1/.git/refs/remotes/origin/HEAD --- drbd8-8.3.7/.git/refs/remotes/origin/HEAD 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/refs/remotes/origin/HEAD 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1 @@ +ref: refs/remotes/origin/master diff -Nru drbd8-8.3.7/.git/refs/remotes/origin/master drbd8-8.4.1+git55a81dc~cmd1/.git/refs/remotes/origin/master --- drbd8-8.3.7/.git/refs/remotes/origin/master 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/refs/remotes/origin/master 2012-09-03 21:30:57.000000000 +0000 @@ -0,0 +1 @@ +56ed9398f8e74a8b1b1e3e4b8770acc58fd1cd17 diff -Nru drbd8-8.3.7/.git/refs/tags/drbd-8.4.2rc1 drbd8-8.4.1+git55a81dc~cmd1/.git/refs/tags/drbd-8.4.2rc1 --- drbd8-8.3.7/.git/refs/tags/drbd-8.4.2rc1 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/refs/tags/drbd-8.4.2rc1 2012-09-03 21:30:57.000000000 +0000 @@ -0,0 +1 @@ +7a59a5b69271df94c1f10b5a4dad48a5a3b1aea5 
diff -Nru drbd8-8.3.7/.git/refs/tags/drbd-8.4.2rc2 drbd8-8.4.1+git55a81dc~cmd1/.git/refs/tags/drbd-8.4.2rc2 --- drbd8-8.3.7/.git/refs/tags/drbd-8.4.2rc2 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/refs/tags/drbd-8.4.2rc2 2012-09-03 21:30:57.000000000 +0000 @@ -0,0 +1 @@ +5ab1ece053485cf9b9b3e775fe58a746ed1c20df diff -Nru drbd8-8.3.7/.git/refs/tags/drbd-8.4.2rc3 drbd8-8.4.1+git55a81dc~cmd1/.git/refs/tags/drbd-8.4.2rc3 --- drbd8-8.3.7/.git/refs/tags/drbd-8.4.2rc3 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.git/refs/tags/drbd-8.4.2rc3 2012-09-03 21:30:57.000000000 +0000 @@ -0,0 +1 @@ +ac15f759f6055a930b3aad30b57781315b5abbef diff -Nru drbd8-8.3.7/.gitignore drbd8-8.4.1+git55a81dc~cmd1/.gitignore --- drbd8-8.3.7/.gitignore 2008-11-24 10:43:32.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/.gitignore 2012-02-02 14:09:14.000000000 +0000 @@ -1,35 +1,46 @@ -ID -TODO -tags +/autom4te.cache +/config.log +/config.status +/configure +/drbd-*.tar.gz +/drbd.spec +/drbd-kernel.spec +/drbd-km.spec +/ID +/TODO +/tags +/Makefile + +user/Makefile +scripts/Makefile +documentation/Makefile ./.filelist ./drbd_config.h +*.gcda +*.gcno *.o drbd/drbd.ko -drbd/.drbd.ko.cmd -drbd/.drbd.mod.o.cmd -drbd/.drbd.o.cmd -drbd/.drbd_actlog.o.cmd -drbd/.drbd_bitmap.o.cmd -drbd/.drbd_buildtag.o.cmd -drbd/.drbd_kernelrelease -drbd/.drbd_main.o.cmd -drbd/.drbd_nl.o.cmd -drbd/.drbd_proc.o.cmd -drbd/.drbd_receiver.o.cmd -drbd/.drbd_req.o.cmd -drbd/.drbd_strings.o.cmd -drbd/.drbd_worker.o.cmd +drbd/drbd.ko.unsigned +drbd/.*.cmd +drbd/compat/.*.cmd +drbd/.compat.h.d +drbd/.config.timestamp drbd/.kernel.config.gz -drbd/.lru_cache.o.cmd +drbd/.drbd_kernelrelease +drbd/.drbd_kernelrelease.new drbd/.tmp_versions drbd/Module.symvers +drbd/compat.h drbd/drbd.mod.c drbd/drbd_buildtag.c +drbd/modules.order drbd/linux/drbd_config.h.orig +user/config.h +user/config.h.in user/drbd_buildtag.c user/drbd_strings.c user/drbdadm @@ -46,5 +57,6 @@ 
documentation/drbdsetup.8 documentation/manpage.links documentation/manpage.refs +documentation/drbdsetup_*.xml benchmark/dm diff -Nru drbd8-8.3.7/ChangeLog drbd8-8.4.1+git55a81dc~cmd1/ChangeLog --- drbd8-8.3.7/ChangeLog 2010-01-13 16:13:58.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/ChangeLog 2012-02-02 14:09:14.000000000 +0000 @@ -2,7 +2,166 @@ ------ For even more detail, use "git log" or visit http://git.drbd.org/. -8.3.7 (api:86/proto:86-91) +8.4.1 (api:genl1/proto:86-100) +-------- + * Fixed a bug that might cause in kernel list corruption triggered by + simultaneous IO on multiple volumes in a single resource + * Fixed a bug that might cause a kernel OOPS in the worker thread while + the receiver tied to establish a connection (drbd-8.4.0 regression) + * Fixed an issue in the receiver that could cause connection triggered by + simultaneous IO on multiple volumes in a single resource + * Consider the discard-my-data flag for all volumes + * Fixed attaching to backing devices that do not support barriers/flushes, + when barriers/flushes are not disabled by the configuration. + (drbd-8.4.0 regression) + * Fixed a rare compatibility issue with DRBD's older than 8.3.7 + when negotiating the bio_size + * Fixed a rare race condition where an empty resync could stall with + if pause/unpause events happen in parallel + * Made the re-establishing of connections quicker, if it got a broken pipe + once. Previously there was a bug in the code caused it to waste the first + successful established connection after a broken pipe event. 
+ * crm-fence-peer.sh: Can now deal with multiple DRBD instances being in + a master/slave group + * Optional load balancing for read requests: new keyword "read-balance" + +8.4.0 (api:genl1/proto:86-100) +-------- + * Fixed handling of read errors during online verify + * Fix for connecting on high latency network links + * Fixed state transitions if fence-peer handler returns after connection was + established again + * Go into inconsistent disk state with on-io-error=pass-on policy + * Timeouts for requests processing on the peer (previously that + worked only if the data socket was congested) + * Reworked Linux backward compatibility mechanism + * Conflicting write detection is now based on an interval tree, + removed the hash-tables (necessary for the unlimited BIO sizes) + * Removed the tracing framework + * Support for multiple volumes (minors, block devices) per connection; + up to 65536 volumes per connection supported + * Reduced IO latencies during some state changes (esp. start resync) + * New on disk format for the AL: double capacity; 4k aligned IO; same space + * Multiple AL changes in a single transaction (precondition for + unlimited BIO sizes) + * DRBD no longer imposes any limit on BIO sizes + * Removed DRBD's limits on the number of minor devices + * DRBD's minors can now be removed (not only unconfigured) + * Switched the user space interface form connector to generic netlink + * drbdadm, configuration changes: volume sections; syncer section removed; + bool options got yes/no values, that improves option inheritance; + resource options + * drbdsetup: new commands for creating and removing resources + and minors + * drbdsetup: new commands for changing disk options while the disk + is attached; ...for changing net options while the connection is + established + * drbdsetup/drbdadm the wire-protocol is now a regular connection option + * Removed drbdadm option --force + * IO freezing/thawing is done on connection (all volumes) level + * fencing 
is done on connection (all volumes) level + * Enforce application of activity log after primary crash in user space + * Features from drbd-8.3: Allow detach from frozen backing devices with the + new --force option; configurable timeout for backing devices by the new + disk-timeout option + * Renamed --dry-run of connect to --tentative; plus alias in drbdsetup + * drbdadm got a "help" sub command, that shows the specific options + * drbdadm now knows all drbdsetup options, and verify ... + * drbdadm can now process all options in random order, and ignores the "--" + separator; compatibility aliases with the old calling conventions; now it + is compatible with the pre 8.4 way of calling. + * New default values (compared to drbd-8.3) for: minor-count, ko-count, al-extents, + c-plan-ahead, c-fill-target, c-min-rate, use-rle, on-io-error + +8.3.10 (api:88/proto:86-96) +-------- + * Fixed a subtle performance degradation that might affected synchronous + work loads (databases) (introduced in 8.3.9) + * Fixed a locking regression (introduced in 8.3.9) + * Fixed on-no-data-accessible for Primary, SyncTarget nodes (Bugz 332) + * Progress bar for online verify + * Optionally use the resync speed control loop code for the online verify + process as well + * Added code to detect false positives when using data-integrity-alg + * New config option on-congestion and new connection states ahead and behind + * Reduced IO latencies during resync, bitmap exchange and temporal states + * Only build a single kernel module package on distributions that provide + the infrastructure to have kernel version independent modules + * On 64bit architectures allow device sizes up to one petabyte + +8.3.9 (api:88/proto:86-95) +-------- + * Fix for possible deadlock on IO error during resync + * Fixed a race condition between adding and removing network configuration. + Lead to a BUG_ON() when triggered. 
+ * Fixed spurious full syncs that could happen after an empty resync and + concurrent connection loss. + * Fixed spurious full syncs that happened when connection got lost while + one node was in WFSyncUUID state (Bugz 318) + * Fixed a race in the meta-data update code path, that could lead to forgotten + updates to the meta-data. That in fact could lead to unexpected behavior + at the next connect + * Fixed potential deadlock on detach + * Fixed potential data divergence after multiple failures + * Implicitly create unconfigured devices which are referenced in sync-after + dependencies. + * OCF RA now also works with pacemaker 1.1 + * Allow BIO sizes of up to 128kByte. Note: In case drbd-proxy is used, at least + version 1.0.16 of drbd-proxy is required. + * New configuration keyword on-no-data-accessible. Possible values + io-error, and suspend-io. The default is "io-error", which matches the + previous behavior. + * If the fencing policy is set to resource-and-stonith, the primary node + will creates the new current UUID _after_ the fencing handler + returned. (Before it did immediately) + * Rewrote the resync speed control loop code. New configuration parameters + c-plan-ahead, c-fill-target, c-delay-target, c-max-rate, c-min-rate. + * Disable activity log updates when all blocks of an unconnected device is + are out of sync. That can be activated by using "invalidate-remote" on an + unconnected primary. 
+ * Improved IPv6 support: link local addresses + * Improved resync speed display in /proc/drbd + +8.3.8 (api:88/proto:86-94) +-------- + * Do not expose failed local READs to upper layers, regression introduced + in 8.3.3 + * Fixed support for devices with 4k hard sector size (again) + * Fixed a potential Oops in the disconnect code + * Fixed a race condition that could cause DRBD to consider the peers disk + as Inconstent after resync instead of UpToDate (Bugz 271) + * Fixed a reace condition that could cause DRBD to consider the peers disk + as Outdated instead of Inconsistent during resync (Bugz 277) + * Disallow to start a resync with invalidate / invalidate-remote when the + source disk is not UpToDate + * Forcing primary works now also for Consistent, not only for Outdated and + Inconsistent (Bugz 266) + * Improved robustness against corrupt or malicous sector addresses when + receiving data + * Added the initial-split-brain, it gets called also if the split-brain gets + automatically resolved + * Added the --assume-clean option for the resize command, it causes drbd to + not resync the new storage after an online grow operation + * drbdadm: Do not segfault if stacked-on-top-of refers to an undefined res + * drbdadm: Do not consider configs with invalid after statements as invalid + * drbdadm: Do not segfault if the peer's proxy section is missing + * drbdadm: Allow nullglob in include statement + * drbdadm: Fixed the use of waitpid + * init script: fix insserv headers (Debian 576901) + * Gave the receiving code the ability to use multiple BIOs for writing a + single data packet; now DRBD works with BIOs up to 32kByte also on LVM + devices; from now on the use_bmbv config option does nothing + * New command check-resize, that allows DRBD to detect offline resizing + and to move internal meta-data accordingly + * Added a control loop, that allows DRBD to find auto tune the resync + speed, on connections with large queues (drbd-proxy) + * --dry-run option for 
connect; disconnects after sync handshake + * --overwrite-data-of-peer got an alias named --force + * Improvements to crm-fence-peer + * Fixed option parsing and stacking in snapshot-resync-target-lvm.sh + * Compiles on 2.6.33 and 2.6.34 + +8.3.7 (api:88/proto:86-91) -------- * Lots of fixes to the new RPM packaging * Lots of fixes to the autoconfig stuff @@ -29,12 +188,12 @@ * Following Linux upstream changes 2.6.32 (SHASH and in_flight issues) * New /etc/drbd.conf example that suggests the use of /etc/drbd.d/xxx.res -8.3.6 (api:86/proto:86-91) +8.3.6 (api:88/proto:86-91) -------- * Make sure that we ship all unplug events * Introduced autoconf, new RPM packaging -8.3.5 (api:86/proto:86-91) +8.3.5 (api:88/proto:86-91) -------- * Fixed a regression introduced shortly before 8.3.3, which might case a deadlock in DRBD's disconnect code path. (Bugz 258) @@ -44,12 +203,12 @@ to avoid unnecessary migrations * Do not display the usage count dialog for /etc/inti.d/drbd status -8.3.4 (api:86/proto:86-91) +8.3.4 (api:88/proto:86-91) -------- * Fixed a regression in the connector backport introduced with 8.3.3. Affected only kernels older than 2.6.14. I.e. RHEL4 and SLES9. 
-8.3.3 (api:86/proto:86-91) +8.3.3 (api:88/proto:86-91) -------- * Correctly deal with large bitmaps (Bugz 239, 240) * Fixed a segfault in drbdadm's parser for unknown sync-after dependencies @@ -71,7 +230,7 @@ * Install bash completion stuff on SLES11 * Following Linux upstream changes 2.6.31 -8.3.2 (api:86/proto:86-90) +8.3.2 (api:88/proto:86-90) -------- * Fixed the "Not a digest" issue for hash functions already ported to shash * Fixed a race condition between device configuration and de-configuration @@ -103,7 +262,7 @@ * Using Linux's own tracing framework instead of our own * Compatibility with Linux 2.6.30 and 2.6.31-rc1 -8.3.1 (api:86/proto:86-89) +8.3.1 (api:88/proto:86-89) -------- * Fixed drbdadm invalidate on disconnected devices (reg in 8.2.7) * Fixed a hard to trigger spinlock deadlock when using device stacking @@ -123,7 +282,7 @@ * Do not force a full resync after a detach on a primary node * Compatibility with Linux 2.6.27, 2.6.28 and 2.6.29 -8.3.0 (api:86/proto:86-89) +8.3.0 (api:88/proto:86-89) -------- * Fixed 'sleep with spinlock held' in case online verify found a difference * Fixed error code pathes in request processing. @@ -149,7 +308,7 @@ * More build compatibility with older vendor kernels * Added drbd-overview.pl to the packages -8.2.7 (api:86/proto:86-88) +8.2.7 (api:88/proto:86-88) -------- * Fixed possible Oops on connection loss during sync handshake * Fixed various possible deadlocks in the disconnect/reconnect and @@ -163,7 +322,7 @@ node. New config options: no-disk-barrier, no-disk-drain * Merged all changes from 8.0.12 -> 8.0.14 into 8.2 -8.2.6 (api:86/proto:86-88) +8.2.6 (api:88/proto:86-88) -------- * The details of the LRU data structures is now hidden from /proc/drbd but can be re-enabled by echoing 1 to @@ -184,7 +343,7 @@ * Fixed online resizing in case it is triggered from the secondary node. -8.2.5 (api:86/proto:86-88) +8.2.5 (api:88/proto:86-88) -------- * Fixed a race between online-verify and application writes. 
It caused drbd to report false positives, and very likely @@ -195,13 +354,13 @@ lockup after the first connection loss. Fixed. * Merged all changes from 8.0.8 -> 8.0.11 into 8.2 -8.2.4 (api:86/proto:86-88) +8.2.4 (api:88/proto:86-88) -------- * Fixed the online-verify and data-integrity-alg features. While preparing DRBD for Linux-2.6.24 a bug was introduced that rendered most digest based functionality in DRBD useless. -8.2.3 (api:86/proto:86-88) +8.2.3 (api:88/proto:86-88) -------- * Released the online-verify feature from DRBD+ into drbd-8.2 * Fixed the data-integrity-alg feature to work correctly diff -Nru drbd8-8.3.7/Makefile drbd8-8.4.1+git55a81dc~cmd1/Makefile --- drbd8-8.3.7/Makefile 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/Makefile 2012-09-03 22:37:14.000000000 +0000 @@ -25,9 +25,7 @@ # and call those from here. -- lge # variables set by configure -GIT = -KDIR ?= -KVER ?= +GIT = /usr/bin/git LN_S = ln -s PREFIX = /usr RPMBUILD = @@ -40,16 +38,27 @@ WITH_XEN = yes WITH_PACEMAKER = yes WITH_HEARTBEAT = yes -WITH_RGMANAGER = yes +WITH_RGMANAGER = no WITH_BASHCOMPLETION = yes +# default for KDIR/KVER +ifndef KVER + ifndef KDIR +KVER = `uname -r` +KDIR = /lib/modules/$(KVER)/build + else +KVER := $(shell make -s -C $(KDIR) kernelrelease) + endif +endif +KDIR ?= /lib/modules/$(KVER)/build + # for some reason some of the commands below only work correctly in bash, # and not in e.g. dash. I'm too lazy to fix it to be compatible. 
SHELL=/bin/bash SUBDIRS = user scripts documentation drbd -REL_VERSION := $(shell $(SED) -ne '/REL_VERSION/{s/^[^"]*"\([^ "]*\).*/\1/;p;q;}' drbd/linux/drbd_config.h) +REL_VERSION := $(shell $(SED) -ne '/^\#define REL_VERSION/{s/^[^"]*"\([^ "]*\).*/\1/;p;q;}' drbd/linux/drbd_config.h) ifdef FORCE # # NOTE to generate a tgz even if too lazy to update the changelogs, @@ -84,7 +93,7 @@ .PHONY: module module: check-kdir ifeq ($(WITH_KM),yes) - @ $(MAKE) -C drbd + @ $(MAKE) -C drbd KVER=$(KVER) KDIR=$(KDIR) @ echo -e "\n\tModule build was successful." endif @@ -133,6 +142,18 @@ then \ echo -e "\n\t%changelog in drbd.spec.in needs update"; \ up2date=false; fi; \ + in_changelog=$$(sed -n -e '0,/^%changelog/d' \ + -e '/- '"$$dver_re"'-/p' < drbd-km.spec.in) ; \ + if test -z "$$in_changelog" ; \ + then \ + echo -e "\n\t%changelog in drbd-km.spec.in needs update"; \ + up2date=false; fi; \ + in_changelog=$$(sed -n -e '0,/^%changelog/d' \ + -e '/- '"$$dver_re"'-/p' < drbd-kernel.spec.in) ; \ + if test -z "$$in_changelog" ; \ + then \ + echo -e "\n\t%changelog in drbd-kernel.spec.in needs update"; \ + up2date=false; fi; \ if ! grep "^$$dver_re\>" >/dev/null 2>&1 ChangeLog; \ then \ echo -e "\n\tChangeLog needs update"; \ @@ -141,7 +162,7 @@ then \ echo -e "\n\tconfigure.ac needs update"; \ up2date=false; fi ; \ - if ! grep "^drbd8 ($$dver_re-" >/dev/null 2>&1 debian/changelog; \ + if ! 
grep "^drbd8 (2:$$dver_re-" >/dev/null 2>&1 debian/changelog; \ then \ echo -e "\n\tdebian/changelog needs update [ignored]\n"; \ : do not fail the build because of outdated debian/changelog ; fi ; \ @@ -161,12 +182,12 @@ @$(GIT) ls-files | sed '$(if $(PRESERVE_DEBIAN),,/^debian/d);s#^#drbd-$(DIST_VERSION)/#' > .filelist @[ -s .filelist ] # assert there is something in .filelist now @find documentation -name "[^.]*.[58]" -o -name "*.html" | \ - sed "s/^/drbd-$(DIST_VERSION)\//" >> .filelist ;\ - echo drbd-$(DIST_VERSION)/drbd_config.h >> .filelist ;\ - echo drbd-$(DIST_VERSION)/drbd/drbd_buildtag.c >> .filelist ;\ - echo drbd-$(DIST_VERSION)/.filelist >> .filelist ;\ - echo drbd-$(DIST_VERSION)/configure >> .filelist ;\ - echo drbd-$(DIST_VERSION)/user/config.h.in >> .filelist ;\ + sed "s/^/drbd-$(DIST_VERSION)\//" >> .filelist ; \ + echo drbd-$(DIST_VERSION)/drbd_config.h >> .filelist ; \ + echo drbd-$(DIST_VERSION)/drbd/drbd_buildtag.c >> .filelist ; \ + echo drbd-$(DIST_VERSION)/.filelist >> .filelist ; \ + echo drbd-$(DIST_VERSION)/configure >> .filelist ; \ + echo drbd-$(DIST_VERSION)/user/config.h.in >> .filelist ; \ echo "./.filelist updated." # tgz will no longer automatically update .filelist, @@ -206,7 +227,7 @@ tarball: check_all_committed distclean doc configure .filelist $(MAKE) tgz -all tools doc .filelist: drbd/drbd_buildtag.c +all module tools doc .filelist: drbd/drbd_buildtag.c kernel-patch: drbd/drbd_buildtag.c set -o errexit; \ @@ -223,6 +244,9 @@ drbd-km.spec: drbd-km.spec.in configure ./configure --enable-spec --without-utils --with-km +drbd-kernel.spec: drbd-kernel.spec.in configure + ./configure --enable-spec --without-utils --with-km + .PHONY: rpm rpm: tgz drbd.spec cp drbd-$(FDIST_VERSION).tar.gz `rpm -E "%_sourcedir"` @@ -241,13 +265,29 @@ drbd-km.spec @echo "You have now:" ; find `rpm -E "%_rpmdir"` -name *.rpm +# kernel module package using the system macros. +# result is kABI aware and uses the weak-updates mechanism. 
+# Only define %kernel_version, it it was set outside of this file, +# i.e. was inherited from environment, or set explicitly on command line. +# If unset, the macro will figure it out internally, and not depend on +# uname -r, which may be wrong in a chroot build environment. +.PHONY: kmp-rpm +kmp-rpm: tgz drbd-kernel.spec + cp drbd-$(FDIST_VERSION).tar.gz `rpm -E "%_sourcedir"` + $(RPMBUILD) -bb \ + $(if $(filter file,$(origin KVER)), --define "kernel_version $(KVER)") \ + $(RPMOPT) \ + drbd-kernel.spec + @echo "You have now:" ; find `rpm -E "%_rpmdir"` -name *.rpm + .PHONY: srpm srpm: tgz drbd.spec drbd-km.spec cp drbd-$(FDIST_VERSION).tar.gz `rpm -E "%_sourcedir"` $(RPMBUILD) -bs \ --define "kernelversion $(KVER)" \ + --define "kernel_version $(KVER)" \ --define "kdir $(KDIR)" \ $(RPMOPT) \ - drbd.spec drbd-km.spec + drbd.spec drbd-km.spec drbd-kernel.spec @echo "You have now:" ; find `rpm -E "%_srcrpmdir"` -name *.src.rpm endif diff -Nru drbd8-8.3.7/Makefile.in drbd8-8.4.1+git55a81dc~cmd1/Makefile.in --- drbd8-8.3.7/Makefile.in 2010-01-13 16:04:50.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/Makefile.in 2012-02-02 14:09:14.000000000 +0000 @@ -26,8 +26,6 @@ # variables set by configure GIT = @GIT@ -KDIR ?= @KDIR@ -KVER ?= @KVER@ LN_S = @LN_S@ PREFIX = @prefix@ RPMBUILD = @RPMBUILD@ @@ -43,13 +41,24 @@ WITH_RGMANAGER = @WITH_RGMANAGER@ WITH_BASHCOMPLETION = @WITH_BASHCOMPLETION@ +# default for KDIR/KVER +ifndef KVER + ifndef KDIR +KVER = `uname -r` +KDIR = /lib/modules/$(KVER)/build + else +KVER := $(shell make -s -C $(KDIR) kernelrelease) + endif +endif +KDIR ?= /lib/modules/$(KVER)/build + # for some reason some of the commands below only work correctly in bash, # and not in e.g. dash. I'm too lazy to fix it to be compatible. 
SHELL=/bin/bash SUBDIRS = user scripts documentation drbd -REL_VERSION := $(shell $(SED) -ne '/REL_VERSION/{s/^[^"]*"\([^ "]*\).*/\1/;p;q;}' drbd/linux/drbd_config.h) +REL_VERSION := $(shell $(SED) -ne '/^\#define REL_VERSION/{s/^[^"]*"\([^ "]*\).*/\1/;p;q;}' drbd/linux/drbd_config.h) ifdef FORCE # # NOTE to generate a tgz even if too lazy to update the changelogs, @@ -84,7 +93,7 @@ .PHONY: module module: check-kdir ifeq ($(WITH_KM),yes) - @ $(MAKE) -C drbd + @ $(MAKE) -C drbd KVER=$(KVER) KDIR=$(KDIR) @ echo -e "\n\tModule build was successful." endif @@ -133,6 +142,18 @@ then \ echo -e "\n\t%changelog in drbd.spec.in needs update"; \ up2date=false; fi; \ + in_changelog=$$(sed -n -e '0,/^%changelog/d' \ + -e '/- '"$$dver_re"'-/p' < drbd-km.spec.in) ; \ + if test -z "$$in_changelog" ; \ + then \ + echo -e "\n\t%changelog in drbd-km.spec.in needs update"; \ + up2date=false; fi; \ + in_changelog=$$(sed -n -e '0,/^%changelog/d' \ + -e '/- '"$$dver_re"'-/p' < drbd-kernel.spec.in) ; \ + if test -z "$$in_changelog" ; \ + then \ + echo -e "\n\t%changelog in drbd-kernel.spec.in needs update"; \ + up2date=false; fi; \ if ! grep "^$$dver_re\>" >/dev/null 2>&1 ChangeLog; \ then \ echo -e "\n\tChangeLog needs update"; \ @@ -141,7 +162,7 @@ then \ echo -e "\n\tconfigure.ac needs update"; \ up2date=false; fi ; \ - if ! grep "^drbd8 ($$dver_re-" >/dev/null 2>&1 debian/changelog; \ + if ! 
grep "^drbd8 (2:$$dver_re-" >/dev/null 2>&1 debian/changelog; \ then \ echo -e "\n\tdebian/changelog needs update [ignored]\n"; \ : do not fail the build because of outdated debian/changelog ; fi ; \ @@ -161,12 +182,12 @@ @$(GIT) ls-files | sed '$(if $(PRESERVE_DEBIAN),,/^debian/d);s#^#drbd-$(DIST_VERSION)/#' > .filelist @[ -s .filelist ] # assert there is something in .filelist now @find documentation -name "[^.]*.[58]" -o -name "*.html" | \ - sed "s/^/drbd-$(DIST_VERSION)\//" >> .filelist ;\ - echo drbd-$(DIST_VERSION)/drbd_config.h >> .filelist ;\ - echo drbd-$(DIST_VERSION)/drbd/drbd_buildtag.c >> .filelist ;\ - echo drbd-$(DIST_VERSION)/.filelist >> .filelist ;\ - echo drbd-$(DIST_VERSION)/configure >> .filelist ;\ - echo drbd-$(DIST_VERSION)/user/config.h.in >> .filelist ;\ + sed "s/^/drbd-$(DIST_VERSION)\//" >> .filelist ; \ + echo drbd-$(DIST_VERSION)/drbd_config.h >> .filelist ; \ + echo drbd-$(DIST_VERSION)/drbd/drbd_buildtag.c >> .filelist ; \ + echo drbd-$(DIST_VERSION)/.filelist >> .filelist ; \ + echo drbd-$(DIST_VERSION)/configure >> .filelist ; \ + echo drbd-$(DIST_VERSION)/user/config.h.in >> .filelist ; \ echo "./.filelist updated." # tgz will no longer automatically update .filelist, @@ -206,7 +227,7 @@ tarball: check_all_committed distclean doc configure .filelist $(MAKE) tgz -all tools doc .filelist: drbd/drbd_buildtag.c +all module tools doc .filelist: drbd/drbd_buildtag.c kernel-patch: drbd/drbd_buildtag.c set -o errexit; \ @@ -223,6 +244,9 @@ drbd-km.spec: drbd-km.spec.in configure ./configure --enable-spec --without-utils --with-km +drbd-kernel.spec: drbd-kernel.spec.in configure + ./configure --enable-spec --without-utils --with-km + .PHONY: rpm rpm: tgz drbd.spec cp drbd-$(FDIST_VERSION).tar.gz `rpm -E "%_sourcedir"` @@ -241,13 +265,29 @@ drbd-km.spec @echo "You have now:" ; find `rpm -E "%_rpmdir"` -name *.rpm +# kernel module package using the system macros. +# result is kABI aware and uses the weak-updates mechanism. 
+# Only define %kernel_version, it it was set outside of this file, +# i.e. was inherited from environment, or set explicitly on command line. +# If unset, the macro will figure it out internally, and not depend on +# uname -r, which may be wrong in a chroot build environment. +.PHONY: kmp-rpm +kmp-rpm: tgz drbd-kernel.spec + cp drbd-$(FDIST_VERSION).tar.gz `rpm -E "%_sourcedir"` + $(RPMBUILD) -bb \ + $(if $(filter file,$(origin KVER)), --define "kernel_version $(KVER)") \ + $(RPMOPT) \ + drbd-kernel.spec + @echo "You have now:" ; find `rpm -E "%_rpmdir"` -name *.rpm + .PHONY: srpm srpm: tgz drbd.spec drbd-km.spec cp drbd-$(FDIST_VERSION).tar.gz `rpm -E "%_sourcedir"` $(RPMBUILD) -bs \ --define "kernelversion $(KVER)" \ + --define "kernel_version $(KVER)" \ --define "kdir $(KDIR)" \ $(RPMOPT) \ - drbd.spec drbd-km.spec + drbd.spec drbd-km.spec drbd-kernel.spec @echo "You have now:" ; find `rpm -E "%_srcrpmdir"` -name *.src.rpm endif diff -Nru drbd8-8.3.7/autogen.sh drbd8-8.4.1+git55a81dc~cmd1/autogen.sh --- drbd8-8.3.7/autogen.sh 2010-01-13 16:04:50.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/autogen.sh 2012-02-02 14:09:14.000000000 +0000 @@ -8,5 +8,8 @@ echo " suggested configure parameters: +# prepare for rpmbuild, only generate spec files +./configure --with-km --enable-spec +# or prepare for direct build ./configure --prefix=/usr --localstatedir=/var --sysconfdir=/etc " diff -Nru drbd8-8.3.7/benchmark/io-latency-test.c drbd8-8.4.1+git55a81dc~cmd1/benchmark/io-latency-test.c --- drbd8-8.3.7/benchmark/io-latency-test.c 2009-07-27 08:47:42.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/benchmark/io-latency-test.c 2012-02-02 14:09:14.000000000 +0000 @@ -57,6 +57,7 @@ unsigned long record_nr; unsigned int write_duration_us; unsigned int write_duration_records; + unsigned int max_write_duration_us; double avg_write_duration; }; @@ -64,7 +65,7 @@ { struct shared_data *data = (struct shared_data*) arg; unsigned long last_record_nr=-1, current_record_nr=0; - 
unsigned int avg_write,wd,wr; + unsigned int avg_write,wd,wr,mwd; double avg_write_duration; enum { IO_RUNNING, IO_BLOCKED } io_state = IO_RUNNING; @@ -76,8 +77,10 @@ current_record_nr = data->record_nr; wd = data->write_duration_us; wr = data->write_duration_records; + mwd = data->max_write_duration_us; data->write_duration_us = 0; data->write_duration_records = 0; + data->max_write_duration_us = 0; avg_write_duration = data->avg_write_duration; pthread_mutex_unlock(&data->mutex); @@ -106,7 +109,8 @@ last_record_nr = current_record_nr; case IO_BLOCKED: if(current_record_nr != last_record_nr) { - printf("IO just resumed.\n"); + printf("IO just resumed. Blocked for %d.%02dms\n", + mwd/1000, (mwd%1000)/10); io_state = IO_RUNNING; } } @@ -216,6 +220,7 @@ data.record_nr = record_nr; data.write_duration_us = 0; data.write_duration_records = 1; + data.max_write_duration_us = 0; pthread_create(&watch_dog,NULL,wd_thread,&data); for( ; !records || record_nr < records ; record_nr++) { @@ -268,6 +273,8 @@ data.write_duration_us += write_duration_us; data.write_duration_records++; data.avg_write_duration = avg_write_duration; + if (write_duration_us > data.max_write_duration_us) + data.max_write_duration_us = write_duration_us; pthread_mutex_unlock(&data.mutex); if(write_duration_us < record_time ) { diff -Nru drbd8-8.3.7/config.log drbd8-8.4.1+git55a81dc~cmd1/config.log --- drbd8-8.3.7/config.log 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/config.log 2012-09-03 22:37:14.000000000 +0000 @@ -1,20 +1,20 @@ This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by DRBD configure 8.3.7, which was -generated by GNU Autoconf 2.64. Invocation command line was +It was created by DRBD configure 8.4.1, which was +generated by GNU Autoconf 2.65. 
Invocation command line was - $ ./configure --prefix=/usr --localstatedir=/var --sysconfdir=/etc --with-utils --with-udev --with-xen --with-pacemaker --with-rgmanager --with-bashcompletion + $ ./configure --prefix=/usr --localstatedir=/var --sysconfdir=/etc ## --------- ## ## Platform. ## ## --------- ## -hostname = lucid -uname -m = i686 -uname -r = 2.6.32-13-generic-pae +hostname = rexy +uname -m = x86_64 +uname -r = 3.2.27 uname -s = Linux -uname -v = #18-Ubuntu SMP Wed Feb 10 22:52:52 UTC 2010 +uname -v = #1 SMP Sat Aug 11 17:21:58 VET 2012 /usr/bin/uname -p = unknown /bin/uname -X = unknown @@ -40,111 +40,118 @@ ## Core tests. ## ## ----------- ## -configure:1939: checking for gcc -configure:1955: found /usr/bin/gcc -configure:1966: result: gcc -configure:2195: checking for C compiler version -configure:2204: gcc --version >&5 -gcc-4.4.real (Ubuntu 4.4.3-2ubuntu2) 4.4.3 -Copyright (C) 2010 Free Software Foundation, Inc. +configure:1951: checking for gcc +configure:1967: found /usr/bin/gcc +configure:1978: result: gcc +configure:2207: checking for C compiler version +configure:2216: gcc --version >&5 +gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3 +Copyright (C) 2011 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -configure:2215: $? = 0 -configure:2204: gcc -v >&5 +configure:2227: $? = 0 +configure:2216: gcc -v >&5 Using built-in specs. 
-Target: i486-linux-gnu -Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.4.3-2ubuntu2' --with-bugurl=file:///usr/share/doc/gcc-4.4/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared --enable-multiarch --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.4 --program-suffix=-4.4 --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug --enable-objc-gc --enable-targets=all --disable-werror --with-arch-32=i486 --with-tune=generic --enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu --target=i486-linux-gnu +COLLECT_GCC=gcc +COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6/lto-wrapper +Target: x86_64-linux-gnu +Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.3-1ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu Thread model: posix -gcc version 4.4.3 (Ubuntu 4.4.3-2ubuntu2) -configure:2215: $? = 0 -configure:2204: gcc -V >&5 -gcc-4.4.real: '-V' option must have argument -configure:2215: $? = 1 -configure:2204: gcc -qversion >&5 -gcc-4.4.real: unrecognized option '-qversion' -gcc-4.4.real: no input files -configure:2215: $? 
= 1 -configure:2237: checking for C compiler default output file name -configure:2259: gcc -Wall -g -O2 -Wl,-Bsymbolic-functions conftest.c >&5 -configure:2263: $? = 0 -configure:2300: result: a.out -configure:2316: checking whether the C compiler works -configure:2325: ./a.out -configure:2329: $? = 0 -configure:2344: result: yes -configure:2351: checking whether we are cross compiling -configure:2353: result: no -configure:2356: checking for suffix of executables -configure:2363: gcc -o conftest -Wall -g -O2 -Wl,-Bsymbolic-functions conftest.c >&5 -configure:2367: $? = 0 -configure:2389: result: -configure:2395: checking for suffix of object files -configure:2417: gcc -c -Wall -g -O2 conftest.c >&5 -configure:2421: $? = 0 -configure:2442: result: o -configure:2446: checking whether we are using the GNU C compiler -configure:2465: gcc -c -Wall -g -O2 conftest.c >&5 -configure:2465: $? = 0 -configure:2474: result: yes -configure:2483: checking whether gcc accepts -g -configure:2503: gcc -c -g conftest.c >&5 -configure:2503: $? = 0 -configure:2544: result: yes -configure:2561: checking for gcc option to accept ISO C89 -configure:2625: gcc -c -Wall -g -O2 conftest.c >&5 -configure:2625: $? 
= 0 -configure:2638: result: none needed -configure:2658: checking whether ln -s works -configure:2662: result: yes -configure:2671: checking for sed -configure:2689: found /bin/sed -configure:2701: result: /bin/sed -configure:2711: checking for grep -configure:2729: found /bin/grep -configure:2741: result: /bin/grep -configure:2751: checking for flex -configure:2769: found /usr/bin/flex -configure:2781: result: /usr/bin/flex -configure:2791: checking for rpmbuild -configure:2824: result: no -configure:2831: checking for xsltproc -configure:2849: found /usr/bin/xsltproc -configure:2861: result: /usr/bin/xsltproc -configure:2871: checking for tar -configure:2889: found /bin/tar -configure:2901: result: /bin/tar -configure:2911: checking for git -configure:2944: result: no -configure:2951: checking for dpkg-buildpackage -configure:2969: found /usr/bin/dpkg-buildpackage -configure:2981: result: /usr/bin/dpkg-buildpackage -configure:2991: checking for udevadm -configure:3009: found /sbin/udevadm -configure:3021: result: /sbin/udevadm -configure:3031: checking for udevinfo -configure:3064: result: no -configure:3086: WARNING: No rpmbuild found, building RPM packages is disabled. -configure:3101: WARNING: Cannot update buildtag without git. You may safely ignore this warning when building from a tarball. -configure:3161: checking for /etc/gentoo-release -configure:3174: result: no -configure:3180: checking for /etc/redhat-release -configure:3193: result: no -configure:3199: checking for /etc/slackware-version -configure:3212: result: no -configure:3218: checking for /etc/debian_version -configure:3231: result: yes -configure:3237: checking for /etc/SuSE-release -configure:3250: result: no -configure:3309: configured for Debian (includes Ubuntu). -configure:3504: creating ./config.status +gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5) +configure:2227: $? 
= 0 +configure:2216: gcc -V >&5 +gcc: error: unrecognized option '-V' +gcc: fatal error: no input files +compilation terminated. +configure:2227: $? = 4 +configure:2216: gcc -qversion >&5 +gcc: error: unrecognized option '-qversion' +gcc: fatal error: no input files +compilation terminated. +configure:2227: $? = 4 +configure:2247: checking whether the C compiler works +configure:2269: gcc -Wall -g -O2 -D_FORTIFY_SOURCE=2 -Wl,-Bsymbolic-functions -Wl,-z,relro conftest.c >&5 +configure:2273: $? = 0 +configure:2322: result: yes +configure:2325: checking for C compiler default output file name +configure:2327: result: a.out +configure:2333: checking for suffix of executables +configure:2340: gcc -o conftest -Wall -g -O2 -D_FORTIFY_SOURCE=2 -Wl,-Bsymbolic-functions -Wl,-z,relro conftest.c >&5 +configure:2344: $? = 0 +configure:2366: result: +configure:2388: checking whether we are cross compiling +configure:2396: gcc -o conftest -Wall -g -O2 -D_FORTIFY_SOURCE=2 -Wl,-Bsymbolic-functions -Wl,-z,relro conftest.c >&5 +configure:2400: $? = 0 +configure:2407: ./conftest +configure:2411: $? = 0 +configure:2426: result: no +configure:2431: checking for suffix of object files +configure:2453: gcc -c -Wall -g -O2 -D_FORTIFY_SOURCE=2 conftest.c >&5 +configure:2457: $? = 0 +configure:2478: result: o +configure:2482: checking whether we are using the GNU C compiler +configure:2501: gcc -c -Wall -g -O2 -D_FORTIFY_SOURCE=2 conftest.c >&5 +configure:2501: $? = 0 +configure:2510: result: yes +configure:2519: checking whether gcc accepts -g +configure:2539: gcc -c -g -D_FORTIFY_SOURCE=2 conftest.c >&5 +configure:2539: $? = 0 +configure:2580: result: yes +configure:2597: checking for gcc option to accept ISO C89 +configure:2661: gcc -c -Wall -g -O2 -D_FORTIFY_SOURCE=2 conftest.c >&5 +configure:2661: $? 
= 0 +configure:2674: result: none needed +configure:2694: checking whether ln -s works +configure:2698: result: yes +configure:2707: checking for sed +configure:2725: found /bin/sed +configure:2737: result: /bin/sed +configure:2747: checking for grep +configure:2765: found /bin/grep +configure:2777: result: /bin/grep +configure:2787: checking for flex +configure:2805: found /usr/bin/flex +configure:2817: result: /usr/bin/flex +configure:2827: checking for rpmbuild +configure:2860: result: no +configure:2867: checking for xsltproc +configure:2885: found /usr/bin/xsltproc +configure:2897: result: /usr/bin/xsltproc +configure:2907: checking for tar +configure:2925: found /bin/tar +configure:2937: result: /bin/tar +configure:2947: checking for git +configure:2965: found /usr/bin/git +configure:2977: result: /usr/bin/git +configure:2987: checking for dpkg-buildpackage +configure:3005: found /usr/bin/dpkg-buildpackage +configure:3017: result: /usr/bin/dpkg-buildpackage +configure:3027: checking for udevadm +configure:3045: found /sbin/udevadm +configure:3058: result: /sbin/udevadm +configure:3068: checking for udevinfo +configure:3099: result: false +configure:3124: WARNING: No rpmbuild found, building RPM packages is disabled. +configure:3168: checking for /etc/gentoo-release +configure:3181: result: no +configure:3187: checking for /etc/redhat-release +configure:3200: result: no +configure:3206: checking for /etc/slackware-version +configure:3219: result: no +configure:3225: checking for /etc/debian_version +configure:3238: result: yes +configure:3244: checking for /etc/SuSE-release +configure:3257: result: no +configure:3316: configured for Debian (includes Ubuntu). +configure:3522: creating ./config.status ## ---------------------- ## ## Running config.status. ## ## ---------------------- ## -This file was extended by DRBD config.status 8.3.7, which was -generated by GNU Autoconf 2.64. 
Invocation command line was +This file was extended by DRBD config.status 8.4.1, which was +generated by GNU Autoconf 2.65. Invocation command line was CONFIG_FILES = CONFIG_HEADERS = @@ -152,13 +159,17 @@ CONFIG_COMMANDS = $ ./config.status -on lucid +on rexy -config.status:828: creating Makefile -config.status:828: creating user/Makefile -config.status:828: creating scripts/Makefile -config.status:828: creating documentation/Makefile -config.status:828: creating user/config.h +config.status:836: creating Makefile +config.status:836: creating user/Makefile +config.status:836: creating user/legacy/Makefile +config.status:836: creating scripts/Makefile +config.status:836: creating documentation/Makefile +config.status:836: creating user/config.h +config.status:997: user/config.h is unchanged +config.status:836: creating user/legacy/config.h +config.status:997: user/legacy/config.h is unchanged ## ---------------- ## ## Cache variables. ## @@ -170,9 +181,9 @@ ac_cv_env_CFLAGS_set=set ac_cv_env_CFLAGS_value='-Wall -g -O2' ac_cv_env_CPPFLAGS_set=set -ac_cv_env_CPPFLAGS_value= +ac_cv_env_CPPFLAGS_value=-D_FORTIFY_SOURCE=2 ac_cv_env_LDFLAGS_set=set -ac_cv_env_LDFLAGS_value=-Wl,-Bsymbolic-functions +ac_cv_env_LDFLAGS_value='-Wl,-Bsymbolic-functions -Wl,-z,relro' ac_cv_env_LIBS_set= ac_cv_env_LIBS_value= ac_cv_env_build_alias_set= @@ -189,10 +200,12 @@ ac_cv_objext=o ac_cv_path_DPKG_BUILDPACKAGE=/usr/bin/dpkg-buildpackage ac_cv_path_FLEX=/usr/bin/flex +ac_cv_path_GIT=/usr/bin/git ac_cv_path_GREP=/bin/grep ac_cv_path_SED=/bin/sed ac_cv_path_TAR=/bin/tar ac_cv_path_UDEVADM=/sbin/udevadm +ac_cv_path_UDEVINFO=false ac_cv_path_XSLTPROC=/usr/bin/xsltproc ac_cv_prog_ac_ct_CC=gcc ac_cv_prog_cc_c89= @@ -205,7 +218,7 @@ BASH_COMPLETION_SUFFIX='' CC='gcc' CFLAGS='-Wall -g -O2' -CPPFLAGS='' +CPPFLAGS='-D_FORTIFY_SOURCE=2' DEFS='-DHAVE_CONFIG_H' DISTRO='debian' DPKG_BUILDPACKAGE='/usr/bin/dpkg-buildpackage' @@ -214,13 +227,11 @@ ECHO_T='' EXEEXT='' FLEX='/usr/bin/flex' -GIT='' 
+GIT='/usr/bin/git' GREP='/bin/grep' INITDIR='/etc/init.d' INITSCRIPT_SYMLINK='' -KDIR='' -KVER='' -LDFLAGS='-Wl,-Bsymbolic-functions' +LDFLAGS='-Wl,-Bsymbolic-functions -Wl,-z,relro' LIBOBJS='' LIBS='' LN_S='ln -s' @@ -228,10 +239,10 @@ OBJEXT='o' PACKAGE_BUGREPORT='drbd-dev@lists.linbit.com' PACKAGE_NAME='DRBD' -PACKAGE_STRING='DRBD 8.3.7' +PACKAGE_STRING='DRBD 8.4.1' PACKAGE_TARNAME='drbd' PACKAGE_URL='' -PACKAGE_VERSION='8.3.7' +PACKAGE_VERSION='8.4.1' PATH_SEPARATOR=':' RPMBUILD='' RPM_BUILDREQ_DEFAULT='gcc flex glibc-devel make' @@ -249,13 +260,14 @@ SHELL='/bin/bash' TAR='/bin/tar' UDEVADM='/sbin/udevadm' -UDEVINFO='' +UDEVINFO='false' UDEV_RULE_SUFFIX='' WITH_BASHCOMPLETION='yes' WITH_HEARTBEAT='yes' WITH_KM='no' +WITH_LEGACY_UTILS='yes' WITH_PACEMAKER='yes' -WITH_RGMANAGER='yes' +WITH_RGMANAGER='no' WITH_UDEV='yes' WITH_UTILS='yes' WITH_XEN='yes' @@ -294,12 +306,14 @@ /* confdefs.h */ #define PACKAGE_NAME "DRBD" #define PACKAGE_TARNAME "drbd" -#define PACKAGE_VERSION "8.3.7" -#define PACKAGE_STRING "DRBD 8.3.7" +#define PACKAGE_VERSION "8.4.1" +#define PACKAGE_STRING "DRBD 8.4.1" #define PACKAGE_BUGREPORT "drbd-dev@lists.linbit.com" #define PACKAGE_URL "" #define DRBD_LIB_DIR "/var/lib/drbd" +#define DRBD_RUN_DIR "/var/run/drbd" #define DRBD_LOCK_DIR "/var/lock" #define DRBD_CONFIG_DIR "/etc" +#define DRBD_LEGACY_83 1 configure: exit 0 diff -Nru drbd8-8.3.7/config.status drbd8-8.4.1+git55a81dc~cmd1/config.status --- drbd8-8.3.7/config.status 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/config.status 2012-09-03 22:37:14.000000000 +0000 @@ -391,8 +391,8 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by DRBD $as_me 8.3.7, which was -generated by GNU Autoconf 2.64. Invocation command line was +This file was extended by DRBD $as_me 8.4.1, which was +generated by GNU Autoconf 2.65. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS @@ -404,8 +404,8 @@ " # Files that config.status was made for. -config_files=" Makefile user/Makefile scripts/Makefile documentation/Makefile" -config_headers=" user/config.h" +config_files=" Makefile user/Makefile user/legacy/Makefile scripts/Makefile documentation/Makefile" +config_headers=" user/config.h user/legacy/config.h" ac_cs_usage="\ \`$as_me' instantiates files and other configuration actions @@ -416,6 +416,7 @@ -h, --help print this help, then exit -V, --version print version number and configuration settings, then exit + --config print configuration, then exit -q, --quiet, --silent do not print progress messages -d, --debug don't remove temporary files @@ -433,16 +434,17 @@ Report bugs to ." +ac_cs_config="'--prefix=/usr' '--localstatedir=/var' '--sysconfdir=/etc' 'CFLAGS=-Wall -g -O2' 'LDFLAGS=-Wl,-Bsymbolic-functions -Wl,-z,relro' 'CPPFLAGS=-D_FORTIFY_SOURCE=2'" ac_cs_version="\ -DRBD config.status 8.3.7 -configured by ./configure, generated by GNU Autoconf 2.64, - with options \"'--prefix=/usr' '--localstatedir=/var' '--sysconfdir=/etc' '--with-utils' '--with-udev' '--with-xen' '--with-pacemaker' '--with-rgmanager' '--with-bashcompletion' 'CFLAGS=-Wall -g -O2' 'LDFLAGS=-Wl,-Bsymbolic-functions' 'CPPFLAGS='\" +DRBD config.status 8.4.1 +configured by ./configure, generated by GNU Autoconf 2.65, + with options \"$ac_cs_config\" Copyright (C) 2009 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." -ac_pwd='/home/ivoks/Cluster/drbd8-8.3.7' +ac_pwd='/home/ildefonso/trabajo/commandprompt/cmd/drbd/drbd-8.4.1' srcdir='.' test -n "$AWK" || AWK=awk # The default lists apply if the user does not specify any file. 
@@ -468,6 +470,8 @@ ac_cs_recheck=: ;; --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; --debug | --debu | --deb | --de | --d | -d ) debug=: ;; --file | --fil | --fi | --f ) @@ -513,7 +517,7 @@ fi if $ac_cs_recheck; then - set X '/bin/bash' './configure' '--prefix=/usr' '--localstatedir=/var' '--sysconfdir=/etc' '--with-utils' '--with-udev' '--with-xen' '--with-pacemaker' '--with-rgmanager' '--with-bashcompletion' 'CFLAGS=-Wall -g -O2' 'LDFLAGS=-Wl,-Bsymbolic-functions' 'CPPFLAGS=' $ac_configure_extra_args --no-create --no-recursion + set X '/bin/bash' './configure' '--prefix=/usr' '--localstatedir=/var' '--sysconfdir=/etc' 'CFLAGS=-Wall -g -O2' 'LDFLAGS=-Wl,-Bsymbolic-functions -Wl,-z,relro' 'CPPFLAGS=-D_FORTIFY_SOURCE=2' $ac_configure_extra_args --no-create --no-recursion shift $as_echo "running CONFIG_SHELL=/bin/bash $*" >&6 CONFIG_SHELL='/bin/bash' @@ -537,11 +541,14 @@ case $ac_config_target in "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; "user/Makefile") CONFIG_FILES="$CONFIG_FILES user/Makefile" ;; + "user/legacy/Makefile") CONFIG_FILES="$CONFIG_FILES user/legacy/Makefile" ;; "scripts/Makefile") CONFIG_FILES="$CONFIG_FILES scripts/Makefile" ;; "documentation/Makefile") CONFIG_FILES="$CONFIG_FILES documentation/Makefile" ;; "user/config.h") CONFIG_HEADERS="$CONFIG_HEADERS user/config.h" ;; + "user/legacy/config.h") CONFIG_HEADERS="$CONFIG_HEADERS user/legacy/config.h" ;; "drbd.spec") CONFIG_FILES="$CONFIG_FILES drbd.spec" ;; "drbd-km.spec") CONFIG_FILES="$CONFIG_FILES drbd-km.spec" ;; + "drbd-kernel.spec") CONFIG_FILES="$CONFIG_FILES drbd-kernel.spec" ;; *) as_fn_error "invalid argument: \`$ac_config_target'" "$LINENO" 5;; esac @@ -622,12 +629,10 @@ S["BASH_COMPLETION_SUFFIX"]="" S["INITDIR"]="/etc/init.d" S["DISTRO"]="debian" -S["KDIR"]="" -S["KVER"]="" -S["UDEVINFO"]="" +S["UDEVINFO"]="false" 
S["UDEVADM"]="/sbin/udevadm" S["DPKG_BUILDPACKAGE"]="/usr/bin/dpkg-buildpackage" -S["GIT"]="" +S["GIT"]="/usr/bin/git" S["TAR"]="/bin/tar" S["XSLTPROC"]="/usr/bin/xsltproc" S["RPMBUILD"]="" @@ -638,17 +643,18 @@ S["OBJEXT"]="o" S["EXEEXT"]="" S["ac_ct_CC"]="gcc" -S["CPPFLAGS"]="" -S["LDFLAGS"]="-Wl,-Bsymbolic-functions" +S["CPPFLAGS"]="-D_FORTIFY_SOURCE=2" +S["LDFLAGS"]="-Wl,-Bsymbolic-functions -Wl,-z,relro" S["CFLAGS"]="-Wall -g -O2" S["CC"]="gcc" S["WITH_BASHCOMPLETION"]="yes" -S["WITH_RGMANAGER"]="yes" +S["WITH_RGMANAGER"]="no" S["WITH_HEARTBEAT"]="yes" S["WITH_PACEMAKER"]="yes" S["WITH_XEN"]="yes" S["WITH_UDEV"]="yes" S["WITH_KM"]="no" +S["WITH_LEGACY_UTILS"]="yes" S["WITH_UTILS"]="yes" S["target_alias"]="" S["host_alias"]="" @@ -682,8 +688,8 @@ S["exec_prefix"]="/usr" S["PACKAGE_URL"]="" S["PACKAGE_BUGREPORT"]="drbd-dev@lists.linbit.com" -S["PACKAGE_STRING"]="DRBD 8.3.7" -S["PACKAGE_VERSION"]="8.3.7" +S["PACKAGE_STRING"]="DRBD 8.4.1" +S["PACKAGE_VERSION"]="8.4.1" S["PACKAGE_TARNAME"]="drbd" S["PACKAGE_NAME"]="DRBD" S["PATH_SEPARATOR"]=":" @@ -731,13 +737,15 @@ BEGIN { D["PACKAGE_NAME"]=" \"DRBD\"" D["PACKAGE_TARNAME"]=" \"drbd\"" -D["PACKAGE_VERSION"]=" \"8.3.7\"" -D["PACKAGE_STRING"]=" \"DRBD 8.3.7\"" +D["PACKAGE_VERSION"]=" \"8.4.1\"" +D["PACKAGE_STRING"]=" \"DRBD 8.4.1\"" D["PACKAGE_BUGREPORT"]=" \"drbd-dev@lists.linbit.com\"" D["PACKAGE_URL"]=" \"\"" D["DRBD_LIB_DIR"]=" \"/var/lib/drbd\"" +D["DRBD_RUN_DIR"]=" \"/var/run/drbd\"" D["DRBD_LOCK_DIR"]=" \"/var/lock\"" D["DRBD_CONFIG_DIR"]=" \"/etc\"" +D["DRBD_LEGACY_83"]=" 1" for (key in D) D_is_set[key] = 1 FS = "" } diff -Nru drbd8-8.3.7/configure drbd8-8.4.1+git55a81dc~cmd1/configure --- drbd8-8.3.7/configure 2010-01-13 16:17:27.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/configure 2012-02-02 14:09:44.000000000 +0000 @@ -1,12 +1,14 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.64 for DRBD 8.3.7. 
+# Generated by GNU Autoconf 2.65 for DRBD 8.4.1. # # Report bugs to . # +# # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, -# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software -# Foundation, Inc. +# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, +# Inc. +# # # This configure script is free software; the Free Software Foundation # gives unlimited permission to copy, distribute and modify it. @@ -526,7 +528,8 @@ as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" -exec 7<&0 &1 +test -n "$DJDIR" || exec 7<&0 &1 # Name of the host. # hostname on some systems (SVR3.2, Linux) returns a bogus exit status, @@ -548,8 +551,8 @@ # Identity of this package. PACKAGE_NAME='DRBD' PACKAGE_TARNAME='drbd' -PACKAGE_VERSION='8.3.7' -PACKAGE_STRING='DRBD 8.3.7' +PACKAGE_VERSION='8.4.1' +PACKAGE_STRING='DRBD 8.4.1' PACKAGE_BUGREPORT='drbd-dev@lists.linbit.com' PACKAGE_URL='' @@ -571,8 +574,6 @@ BASH_COMPLETION_SUFFIX INITDIR DISTRO -KDIR -KVER UDEVINFO UDEVADM DPKG_BUILDPACKAGE @@ -598,6 +599,7 @@ WITH_XEN WITH_UDEV WITH_KM +WITH_LEGACY_UTILS WITH_UTILS target_alias host_alias @@ -641,6 +643,7 @@ ac_user_opts=' enable_option_checking with_utils +with_legacy_utils with_km with_udev with_xen @@ -1202,7 +1205,7 @@ # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures DRBD 8.3.7 to adapt to many kinds of systems. +\`configure' configures DRBD 8.4.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1263,7 +1266,7 @@ if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of DRBD 8.3.7:";; + short | recursive ) echo "Configuration of DRBD 8.4.1:";; esac cat <<\_ACEOF @@ -1278,6 +1281,7 @@ --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) --with-utils Enable management utilities + --without-legacy_utils Do not include legacy <= 8.3 drbdsetup/drbdadm --with-km Enable kernel module --with-udev Enable udev integration --with-xen Enable Xen integration @@ -1300,7 +1304,7 @@ LDFLAGS linker flags, e.g. -L if you have libraries in a nonstandard directory LIBS libraries to pass to the linker, e.g. -l - CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I if + CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I if you have headers in a nonstandard directory Use these variables to override the choices made by `configure' or to help @@ -1369,8 +1373,8 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -DRBD configure 8.3.7 -generated by GNU Autoconf 2.64 +DRBD configure 8.4.1 +generated by GNU Autoconf 2.65 Copyright (C) 2009 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation @@ -1417,15 +1421,15 @@ ac_retval=1 fi eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;} - return $ac_retval + as_fn_set_status $ac_retval } # ac_fn_c_try_compile cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by DRBD $as_me 8.3.7, which was -generated by GNU Autoconf 2.64. Invocation command line was +It was created by DRBD $as_me 8.4.1, which was +generated by GNU Autoconf 2.65. 
Invocation command line was $ $0 $@ @@ -1678,7 +1682,7 @@ for ac_site_file in "$ac_site_file1" "$ac_site_file2" do test "x$ac_site_file" = xNONE && continue - if test -r "$ac_site_file"; then + if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 $as_echo "$as_me: loading site script $ac_site_file" >&6;} sed 's/^/| /' "$ac_site_file" >&5 @@ -1687,9 +1691,9 @@ done if test -r "$cache_file"; then - # Some versions of bash will fail to source /dev/null (special - # files actually), so we avoid doing that. - if test -f "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special files + # actually), so we avoid doing that. DJGPP emulates it as a regular file. + if test /dev/null != "$cache_file" && test -f "$cache_file"; then { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 $as_echo "$as_me: loading cache $cache_file" >&6;} case $cache_file in @@ -1795,6 +1799,7 @@ docdir="`eval echo ${docdir}`" WITH_UTILS=yes +WITH_LEGACY_UTILS=yes WITH_KM=no WITH_UDEV=yes WITH_XEN=yes @@ -1810,6 +1815,12 @@ fi +# Check whether --with-legacy_utils was given. +if test "${with_legacy_utils+set}" = set; then : + withval=$with_legacy_utils; WITH_LEGACY_UTILS=$withval +fi + + # Check whether --with-km was given. if test "${with_km+set}" = set; then : withval=$with_km; WITH_KM=$withval @@ -1888,6 +1899,7 @@ + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -2210,32 +2222,30 @@ ... rest of stderr output deleted ... 10q' conftest.err >conftest.er1 cat conftest.er1 >&5 - rm -f conftest.er1 conftest.err fi + rm -f conftest.er1 conftest.err $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } done cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ -#include + int main () { -FILE *f = fopen ("conftest.out", "w"); - return ferror (f) || fclose (f) != 0; ; return 0; } _ACEOF ac_clean_files_save=$ac_clean_files -ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out conftest.out" +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" # Try to create an executable without -o first, disregard a.out. # It will help us diagnose broken compilers, and finding out an intuition # of exeext. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 -$as_echo_n "checking for C compiler default output file name... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 +$as_echo_n "checking whether the C compiler works... " >&6; } ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` # The possible output files: @@ -2297,10 +2307,10 @@ else ac_file='' fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 -$as_echo "$ac_file" >&6; } if test -z "$ac_file"; then : - $as_echo "$as_me: failed program was:" >&5 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +$as_echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 @@ -2308,51 +2318,18 @@ { as_fn_set_status 77 as_fn_error "C compiler cannot create executables See \`config.log' for more details." "$LINENO" 5; }; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 +$as_echo_n "checking for C compiler default output file name... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } ac_exeext=$ac_cv_exeext -# Check that the compiler produces executables we can run. If not, either -# the compiler is broken, or we cross compile. 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 -$as_echo_n "checking whether the C compiler works... " >&6; } -# If not cross compiling, check that we can run a simple program. -if test "$cross_compiling" != yes; then - if { ac_try='./$ac_file' - { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then - cross_compiling=no - else - if test "$cross_compiling" = maybe; then - cross_compiling=yes - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error "cannot run C compiled programs. -If you meant to cross compile, use \`--host'. -See \`config.log' for more details." "$LINENO" 5; } - fi - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - -rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out conftest.out +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out ac_clean_files=$ac_clean_files_save -# Check that the compiler produces executables we can run. If not, either -# the compiler is broken, or we cross compile. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 -$as_echo_n "checking whether we are cross compiling... " >&6; } -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 -$as_echo "$cross_compiling" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 $as_echo_n "checking for suffix of executables... " >&6; } if { { ac_try="$ac_link" @@ -2385,13 +2362,72 @@ as_fn_error "cannot compute suffix of executables: cannot compile and link See \`config.log' for more details." 
"$LINENO" 5; } fi -rm -f conftest$ac_cv_exeext +rm -f conftest conftest$ac_cv_exeext { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 $as_echo "$ac_cv_exeext" >&6; } rm -f conftest.$ac_ext EXEEXT=$ac_cv_exeext ac_exeext=$EXEEXT +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +FILE *f = fopen ("conftest.out", "w"); + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +ac_clean_files="$ac_clean_files conftest.out" +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... " >&6; } +if test "$cross_compiling" != yes; then + { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + if { ac_try='./conftest$ac_cv_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error "cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details." 
"$LINENO" 5; } + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out +ac_clean_files=$ac_clean_files_save { $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 $as_echo_n "checking for suffix of object files... " >&6; } if test "${ac_cv_objext+set}" = set; then : @@ -2999,7 +3035,7 @@ ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH +for as_dir in /sbin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. @@ -3013,6 +3049,7 @@ done IFS=$as_save_IFS + test -z "$ac_cv_path_UDEVADM" && ac_cv_path_UDEVADM="false" ;; esac fi @@ -3039,7 +3076,7 @@ ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH +for as_dir in /sbin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. @@ -3053,6 +3090,7 @@ done IFS=$as_save_IFS + test -z "$ac_cv_path_UDEVINFO" && ac_cv_path_UDEVINFO="false" ;; esac fi @@ -3067,7 +3105,7 @@ -if test -z $CC; then +if test -z "$CC"; then if test "$WITH_UTILS" = "yes"; then as_fn_error "Cannot build utils without a C compiler, either install a compiler or pass the --without-utils option." "$LINENO" 5 fi @@ -3095,6 +3133,7 @@ if test -z $XSLTPROC; then { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Cannot build man pages without xsltproc. You may safely ignore this warning when building from a tarball." >&5 $as_echo "$as_me: WARNING: Cannot build man pages without xsltproc. You may safely ignore this warning when building from a tarball." >&2;} + XSLTPROC=xsltproc fi if test -z $GIT; then @@ -3102,7 +3141,7 @@ $as_echo "$as_me: WARNING: Cannot update buildtag without git. You may safely ignore this warning when building from a tarball." 
>&2;} fi -if test -z $UDEVADM && test -z $UDEVINFO; then +if test $UDEVADM = false && test $UDEVINFO = false; then if test "$WITH_UDEV" = "yes"; then { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: udev support enabled, but neither udevadm nor udevinfo found on this system." >&5 $as_echo "$as_me: WARNING: udev support enabled, but neither udevadm nor udevinfo found on this system." >&2;} @@ -3111,38 +3150,6 @@ -if test "$WITH_KM" = "yes"; then - as_ac_File=`$as_echo "ac_cv_file_$KDIR/Makefile" | $as_tr_sh` -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $KDIR/Makefile" >&5 -$as_echo_n "checking for $KDIR/Makefile... " >&6; } -if { as_var=$as_ac_File; eval "test \"\${$as_var+set}\" = set"; }; then : - $as_echo_n "(cached) " >&6 -else - test "$cross_compiling" = yes && - as_fn_error "cannot check for file existence when cross compiling" "$LINENO" 5 -if test -r "$KDIR/Makefile"; then - eval "$as_ac_File=yes" -else - eval "$as_ac_File=no" -fi -fi -eval ac_res=\$$as_ac_File - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } -eval as_val=\$$as_ac_File - if test "x$as_val" = x""yes; then : - -else - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Unable to find a kernel Makefile in $KDIR. You will have to set KDIR correctly when invoking make." >&5 -$as_echo "$as_me: WARNING: Unable to find a kernel Makefile in $KDIR. You will have to set KDIR correctly when invoking make." 
>&2;} -fi - - KVER="`uname -r`" - KDIR="/lib/modules/$KVER/build" -fi - - - BASH_COMPLETION_SUFFIX="" UDEV_RULE_SUFFIX="" RPM_DIST_TAG="" @@ -3338,8 +3345,8 @@ test -z $INITDIR && INITDIR="$sysconfdir/init.d" if test "$WITH_UDEV" = "yes"; then - udev_version=`$UDEVADM version 2>/dev/null` || udev_version=`$UDEVINFO -V | cut -d " " -f 3` || udev_version=0 - if test $udev_version -lt 85; then + udev_version=`$UDEVADM version 2>/dev/null` || udev_version=`$UDEVINFO -V | cut -d " " -f 3` + if test -z $udev_version || test $udev_version -lt 85; then UDEV_RULE_SUFFIX=".disabled" { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Obsolete or unknown udev version. Installing disabled udev rules." >&5 $as_echo "$as_me: WARNING: Obsolete or unknown udev version. Installing disabled udev rules." >&2;} @@ -3370,11 +3377,17 @@ + + cat >>confdefs.h <<_ACEOF #define DRBD_LIB_DIR "$localstatedir/lib/$PACKAGE_TARNAME" _ACEOF cat >>confdefs.h <<_ACEOF +#define DRBD_RUN_DIR "$localstatedir/run/$PACKAGE_TARNAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF #define DRBD_LOCK_DIR "$localstatedir/lock" _ACEOF @@ -3383,10 +3396,15 @@ _ACEOF +if test "$WITH_LEGACY_UTILS" = "yes"; then + $as_echo "#define DRBD_LEGACY_83 1" >>confdefs.h + +fi + if test -z $SPECMODE; then - ac_config_files="$ac_config_files Makefile user/Makefile scripts/Makefile documentation/Makefile" + ac_config_files="$ac_config_files Makefile user/Makefile user/legacy/Makefile scripts/Makefile documentation/Makefile" - ac_config_headers="$ac_config_headers user/config.h" + ac_config_headers="$ac_config_headers user/config.h user/legacy/config.h" else if test "$WITH_UTILS" = "yes"; then @@ -3394,7 +3412,7 @@ fi if test "$WITH_KM" = "yes"; then - ac_config_files="$ac_config_files drbd-km.spec" + ac_config_files="$ac_config_files drbd-km.spec drbd-kernel.spec" fi fi @@ -3904,8 +3922,8 @@ # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. 
ac_log=" -This file was extended by DRBD $as_me 8.3.7, which was -generated by GNU Autoconf 2.64. Invocation command line was +This file was extended by DRBD $as_me 8.4.1, which was +generated by GNU Autoconf 2.65. Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS @@ -3944,6 +3962,7 @@ -h, --help print this help, then exit -V, --version print version number and configuration settings, then exit + --config print configuration, then exit -q, --quiet, --silent do not print progress messages -d, --debug don't remove temporary files @@ -3963,10 +3982,11 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -DRBD config.status 8.3.7 -configured by $0, generated by GNU Autoconf 2.64, - with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" +DRBD config.status 8.4.1 +configured by $0, generated by GNU Autoconf 2.65, + with options \\"\$ac_cs_config\\" Copyright (C) 2009 Free Software Foundation, Inc. 
This config.status script is free software; the Free Software Foundation @@ -4001,6 +4021,8 @@ ac_cs_recheck=: ;; --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; --debug | --debu | --deb | --de | --d | -d ) debug=: ;; --file | --fil | --fi | --f ) @@ -4079,11 +4101,14 @@ case $ac_config_target in "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; "user/Makefile") CONFIG_FILES="$CONFIG_FILES user/Makefile" ;; + "user/legacy/Makefile") CONFIG_FILES="$CONFIG_FILES user/legacy/Makefile" ;; "scripts/Makefile") CONFIG_FILES="$CONFIG_FILES scripts/Makefile" ;; "documentation/Makefile") CONFIG_FILES="$CONFIG_FILES documentation/Makefile" ;; "user/config.h") CONFIG_HEADERS="$CONFIG_HEADERS user/config.h" ;; + "user/legacy/config.h") CONFIG_HEADERS="$CONFIG_HEADERS user/legacy/config.h" ;; "drbd.spec") CONFIG_FILES="$CONFIG_FILES drbd.spec" ;; "drbd-km.spec") CONFIG_FILES="$CONFIG_FILES drbd-km.spec" ;; + "drbd-kernel.spec") CONFIG_FILES="$CONFIG_FILES drbd-kernel.spec" ;; *) as_fn_error "invalid argument: \`$ac_config_target'" "$LINENO" 5;; esac @@ -4186,7 +4211,7 @@ t delim :nl h -s/\(.\{148\}\).*/\1/ +s/\(.\{148\}\)..*/\1/ t more1 s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ p @@ -4200,7 +4225,7 @@ t nl :delim h -s/\(.\{148\}\).*/\1/ +s/\(.\{148\}\)..*/\1/ t more2 s/["\\]/\\&/g; s/^/"/; s/$/"/ p diff -Nru drbd8-8.3.7/configure.ac drbd8-8.4.1+git55a81dc~cmd1/configure.ac --- drbd8-8.3.7/configure.ac 2010-01-13 16:14:13.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/configure.ac 2012-02-02 14:09:14.000000000 +0000 @@ -7,7 +7,7 @@ AC_PREREQ(2.53) dnl What we are, our version, who to bug in case of problems -AC_INIT(DRBD, 8.3.7, [drbd-dev@lists.linbit.com]) +AC_INIT(DRBD, 8.4.1, [drbd-dev@lists.linbit.com]) dnl Sanitize $prefix. 
Autoconf does this by itself, but so late in the dnl generated configure script that the expansion does not occur until @@ -41,6 +41,7 @@ dnl "--with-" options (all except km enabled by default, pass --without- to disable) WITH_UTILS=yes +WITH_LEGACY_UTILS=yes WITH_KM=no WITH_UDEV=yes WITH_XEN=yes @@ -53,6 +54,10 @@ [AS_HELP_STRING([--with-utils], [Enable management utilities])], [WITH_UTILS=$withval]) +AC_ARG_WITH([legacy_utils], + [AS_HELP_STRING([--without-legacy_utils], + [Do not include legacy <= 8.3 drbdsetup/drbdadm])], + [WITH_LEGACY_UTILS=$withval]) AC_ARG_WITH([km], [AS_HELP_STRING([--with-km], [Enable kernel module])], @@ -101,6 +106,7 @@ AC_SUBST(WITH_UTILS) +AC_SUBST(WITH_LEGACY_UTILS) AC_SUBST(WITH_KM) AC_SUBST(WITH_UDEV) AC_SUBST(WITH_XEN) @@ -121,10 +127,10 @@ AC_PATH_PROG(TAR, tar) AC_PATH_PROG(GIT, git) AC_PATH_PROG(DPKG_BUILDPACKAGE, dpkg-buildpackage) -AC_PATH_PROG(UDEVADM, udevadm) -AC_PATH_PROG(UDEVINFO, udevinfo) +AC_PATH_PROG(UDEVADM, udevadm, [false], [/sbin$PATH_SEPARATOR$PATH]) +AC_PATH_PROG(UDEVINFO, udevinfo, [false], [/sbin$PATH_SEPARATOR$PATH]) -if test -z $CC; then +if test -z "$CC"; then if test "$WITH_UTILS" = "yes"; then AC_MSG_ERROR([Cannot build utils without a C compiler, either install a compiler or pass the --without-utils option.]) fi @@ -149,13 +155,17 @@ if test -z $XSLTPROC; then AC_MSG_WARN([Cannot build man pages without xsltproc. You may safely ignore this warning when building from a tarball.]) + dnl default to some sane value at least, + dnl so the error message about command not found makes sense + dnl otherwise you get "--xinclude ... command not found" :-/ + XSLTPROC=xsltproc fi if test -z $GIT; then AC_MSG_WARN(Cannot update buildtag without git. You may safely ignore this warning when building from a tarball.) 
fi -if test -z $UDEVADM && test -z $UDEVINFO; then +if test $UDEVADM = false && test $UDEVINFO = false; then if test "$WITH_UDEV" = "yes"; then AC_MSG_WARN([udev support enabled, but neither udevadm nor udevinfo found on this system.]) fi @@ -164,17 +174,6 @@ dnl Checks for system services -dnl figure out the kernel versin and kernel headers directory -if test "$WITH_KM" = "yes"; then - AC_CHECK_FILE($KDIR/Makefile, - , - AC_MSG_WARN([Unable to find a kernel Makefile in $KDIR. You will have to set KDIR correctly when invoking make.])) - KVER="`uname -r`" - KDIR="/lib/modules/$KVER/build" -fi -AC_SUBST(KVER) -AC_SUBST(KDIR) - BASH_COMPLETION_SUFFIX="" UDEV_RULE_SUFFIX="" RPM_DIST_TAG="" @@ -262,8 +261,8 @@ dnl Our udev rules file is known to work only with udev >= 85 if test "$WITH_UDEV" = "yes"; then - udev_version=`$UDEVADM version 2>/dev/null` || udev_version=`$UDEVINFO -V | cut -d " " -f 3` || udev_version=0 - if test $udev_version -lt 85; then + udev_version=`$UDEVADM version 2>/dev/null` || udev_version=`$UDEVINFO -V | cut -d " " -f 3` + if test -z $udev_version || test $udev_version -lt 85; then UDEV_RULE_SUFFIX=".disabled" AC_MSG_WARN([Obsolete or unknown udev version. Installing disabled udev rules.]) fi @@ -293,27 +292,35 @@ AH_TEMPLATE(DRBD_LIB_DIR, [Local state directory. Commonly /var/lib/drbd or /usr/local/var/lib/drbd]) +AH_TEMPLATE(DRBD_RUN_DIR, [Runtime state directory. Commonly + /var/run/drbd or /usr/local/var/run/drbd]) AH_TEMPLATE(DRBD_LOCK_DIR, [Local lock directory. Commonly /var/lock or /usr/local/var/lock]) AH_TEMPLATE(DRBD_CONFIG_DIR, [Local configuration directory. 
Commonly /etc or /usr/local/etc]) +AH_TEMPLATE(DRBD_LEGACY_83, [Include support for drbd-8.3 kernel code]) AC_DEFINE_UNQUOTED(DRBD_LIB_DIR, ["$localstatedir/lib/$PACKAGE_TARNAME"]) +AC_DEFINE_UNQUOTED(DRBD_RUN_DIR, ["$localstatedir/run/$PACKAGE_TARNAME"]) AC_DEFINE_UNQUOTED(DRBD_LOCK_DIR, ["$localstatedir/lock"]) AC_DEFINE_UNQUOTED(DRBD_CONFIG_DIR, ["$sysconfdir"]) +if test "$WITH_LEGACY_UTILS" = "yes"; then + AC_DEFINE(DRBD_LEGACY_83, [1]) +fi + dnl The configuration files we create (from their .in template) if test -z $SPECMODE; then - AC_CONFIG_FILES(Makefile user/Makefile scripts/Makefile documentation/Makefile) - AC_CONFIG_HEADERS(user/config.h) + AC_CONFIG_FILES(Makefile user/Makefile user/legacy/Makefile scripts/Makefile documentation/Makefile) + AC_CONFIG_HEADERS(user/config.h user/legacy/config.h) else if test "$WITH_UTILS" = "yes"; then AC_CONFIG_FILES(drbd.spec) fi if test "$WITH_KM" = "yes"; then - AC_CONFIG_FILES(drbd-km.spec) + AC_CONFIG_FILES(drbd-km.spec drbd-kernel.spec) fi fi diff -Nru drbd8-8.3.7/debian/README.Debian drbd8-8.4.1+git55a81dc~cmd1/debian/README.Debian --- drbd8-8.3.7/debian/README.Debian 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/README.Debian 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,10 @@ +drbd for Debian +--------------- + +To make sure the default installation of drbd is non-interactive, I +have set the default value of the inittimeout parameter to be a +negative number. This may not be what you would like for a production +setup. See the drbd.conf man page and pay special attention to the +inittimeout, skip-wait, and load-only options. + + -- David Krovich , Tue May 25 12:47:11 2004 diff -Nru drbd8-8.3.7/debian/TODO drbd8-8.4.1+git55a81dc~cmd1/debian/TODO --- drbd8-8.3.7/debian/TODO 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/TODO 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,4 @@ +Create lintian overrides to deal with bash not handling extglob +syntax. 
I'll also contact lintian package maintainers to make sure +this is the correct thing to do. Refer to #247605 in the BTS for +background on this. diff -Nru drbd8-8.3.7/debian/changelog drbd8-8.4.1+git55a81dc~cmd1/debian/changelog --- drbd8-8.3.7/debian/changelog 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/changelog 2012-09-03 22:50:38.000000000 +0000 @@ -1,951 +1,221 @@ -drbd8 (2:8.3.7-1ubuntu2) lucid; urgency=low +drbd8 (2:8.4.1+git55a81dc~cmd1-1~lucid1) lucid; urgency=low - * Drop recommends on kernel-package, we use dkms instead. + * Add git commit 55a81dcc13bf199cfa1bb6695267deec5adc92ca, for compatibility with older kernels. - -- Steve Langasek Thu, 15 Apr 2010 13:24:49 +0000 + -- Jose Ildefonso Camargo Tolosa Mon, 03 Sep 2012 18:04:49 -0430 -drbd8 (2:8.3.7-1ubuntu1) lucid; urgency=low - - * Merge from Debian, remaining Ubuntu changes: - - switch to DKMS: - + debian/control: drbd8-utils depends on drbd8-source - + debian/control: drbd8-source depends on dkms and kernel headers - + debian/dkms.conf - + debian/drbd8-source.postinst - + debian/drbd8-source.prerm - + debian/rules - + removed 10_different-kernels.dpatch; not needed with DKMS - + debian/drbd8-source.dirs: removed modass directory - * Use /usr/lib/dkms/common.postinst in debian/drbd8-source.postinst - (LP: #497149) - - -- Ante Karamatic Wed, 27 Jan 2010 17:57:54 +0000 - -drbd8 (2:8.3.7-1) unstable; urgency=low +drbd8 (2:8.4.1-0) unstable; urgency=low * New upstream release. - + Correct dependencies in init.d script. (closes: 547566, #563783) - * Acknowledge NMU of 2:8.3.4-1.1, thanks Iustin for taking care of this! - (closes: #499516) - * Ship scripts/adjust_drbd_config_h.sh and run it before building the kernel - module. (closes: #551479, #552439) - - -- Norbert Tretkowski Mon, 18 Jan 2010 16:29:43 +0100 - -drbd8 (2:8.3.4-1.1) unstable; urgency=low - * Non-maintainer upload. 
- * Fix watch file - * No longer stops the drbd resources on upgrades of the drbd8-utils - package, since this is not needed and shutdowns must be done by the - administrator. (closes: #499516) + -- Philipp Reisner Tue, 20 Dec 2011 13:00:00 +0200 - -- Iustin Pop Fri, 13 Nov 2009 19:58:29 +0100 - -drbd8 (2:8.3.4-1) unstable; urgency=low +drbd8 (2:8.4.0-0) unstable; urgency=low * New upstream release. - -- Norbert Tretkowski Fri, 16 Oct 2009 09:18:11 +0200 - -drbd8 (2:8.3.3-0ubuntu1) karmic; urgency=low - - * Merge from Debian, remaining Ubuntu changes: - - switch to DKMS: - + debian/control: drbd8-utils depends on drbd8-source - + debian/control: drbd8-source depends on dkms and kernel headers - + debian/dkms.conf - + debian/drbd8-source.postinst - + debian/drbd8-source.prerm - + debian/rules - - removed 10_different-kernels.dpatch; not needed with DKMS - * Source update to final 8.3.3 version - - -- Ante Karamatic Wed, 07 Oct 2009 09:47:00 +0200 - -drbd8 (2:8.3.3~rc3-1) experimental; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Tue, 29 Sep 2009 21:43:01 +0200 - -drbd8 (2:8.3.3~rc2-1) experimental; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Wed, 16 Sep 2009 19:17:52 +0200 - -drbd8 (2:8.3.3~rc1-1) experimental; urgency=low - - * New upstream release candidate. + -- Philipp Reisner Mon, 18 Jul 2011 15:05:06 +0200 - -- Norbert Tretkowski Mon, 31 Aug 2009 11:44:25 +0200 - -drbd8 (2:8.3.2-3) unstable; urgency=low - - * Drop DKMS support for now, to get the package back into testing. - (closes: #537986, #539218, #539219) - - -- Norbert Tretkowski Sun, 16 Aug 2009 12:23:59 +0200 - -drbd8 (2:8.3.2-2) unstable; urgency=low - - * Switch to DKMS, patch from Ante Karamatić (Ubuntu). - - -- Norbert Tretkowski Tue, 21 Jul 2009 20:17:47 +0200 - -drbd8 (2:8.3.2-1) unstable; urgency=low +drbd8 (2:8.3.10-0) unstable; urgency=low * New upstream release. - * Section of drbd8-source is kernel. 
- - -- Norbert Tretkowski Mon, 06 Jul 2009 21:17:28 +0200 - -drbd8 (2:8.3.2~rc2-1) unstable; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Sat, 27 Jun 2009 16:28:20 +0200 - -drbd8 (2:8.3.2~rc1-1) unstable; urgency=low - - * New upstream release candidate. - + Make it compile on Linux 2.6.30. (closes: #533261, #533654) - * Update Standards-Version to 3.8.2, no changes required. - * Fix maintainer-script-ignores-errors lintian warning. - - -- Norbert Tretkowski Mon, 22 Jun 2009 20:52:22 +0200 - -drbd8 (2:8.3.1-2) unstable; urgency=medium - - * Added a new patch from Michael Prokop to fix build with kernel 2.6.28 and - older. (closes: #522891) - -- Norbert Tretkowski Fri, 03 Apr 2009 19:54:44 +0200 + -- Philipp Reisner Fri, 28 Jan 2011 11:44:13 +0200 -drbd8 (2:8.3.1-1) unstable; urgency=low +drbd8 (2:8.3.9-0) unstable; urgency=low * New upstream release. - -- Norbert Tretkowski Fri, 27 Mar 2009 14:16:36 +0100 + -- Philipp Reisner Fri, 22 Oct 2010 15:01:16 +0200 -drbd8 (2:8.3.1~rc2-1) experimental; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Thu, 26 Mar 2009 10:52:06 +0100 - -drbd8 (2:8.3.1~rc1-1) experimental; urgency=low - - * New upstream release candidate. - * Drop patch 10_lsb-init-script.dpatch, merged upstream. - - -- Norbert Tretkowski Tue, 17 Mar 2009 16:38:52 +0100 - -drbd8 (2:8.3.0-3) unstable; urgency=low - - [ Martin G. Loschwitz ] - * Sigh. Remove SVN-Files from debian-diff file. - - -- Martin Loschwitz Wed, 11 Mar 2009 17:46:00 +0100 - -drbd8 (2:8.3.0-2) unstable; urgency=low - - [ Norbert Tretkowski ] - * Package is now team-maintained. - * Use dpatch for patch management. - * New patch 10_lsb-init-script.dpatch to make init-script a bit more LSB - compliant. - - [ Martin G. Loschwitz ] - * Add myself to the Uploaders:-Field. - - -- Martin Loschwitz Wed, 11 Mar 2009 16:45:00 +0100 - -drbd8 (2:8.3.0-1) unstable; urgency=low +drbd8 (2:8.3.8-0) unstable; urgency=low * New upstream release. 
- -- Norbert Tretkowski Wed, 24 Dec 2008 15:05:27 +0100 - -drbd8 (2:8.3.0~rc3-1) experimental; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Mon, 15 Dec 2008 10:14:37 +0100 + -- Philipp Reisner Wed, 2 Jun 2010 10:44:46 +0200 -drbd8 (2:8.3.0~rc2-1) experimental; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Sun, 07 Dec 2008 15:09:01 +0100 - -drbd8 (2:8.3.0~rc1-1) experimental; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Mon, 01 Dec 2008 16:19:08 +0100 - -drbd8 (2:8.2.7-2) experimental; urgency=low - - * Merge 2:8.0.14-2. - - -- Norbert Tretkowski Thu, 13 Nov 2008 12:47:22 +0100 - -drbd8 (2:8.2.7-1) experimental; urgency=low +drbd8 (2:8.3.7-0) unstable; urgency=low * New upstream release. - * Merge 2:8.0.14~rc1-1 and 2:8.0.14-1. - - -- Norbert Tretkowski Thu, 13 Nov 2008 10:55:02 +0100 - -drbd8 (2:8.2.7~rc2-1) experimental; urgency=low - - * New upstream release candidate. - - -- Norbert Tretkowski Wed, 29 Oct 2008 17:34:12 +0100 - -drbd8 (2:8.2.6-4) experimental; urgency=low - - * Fix kernel panic during verify. - - -- Norbert Tretkowski Thu, 23 Oct 2008 08:51:09 +0200 -drbd8 (2:8.2.6-3) experimental; urgency=low + -- Philipp Reisner Thu, 13 Jan 2010 13:00:00 +0200 - * Merge 2:8.0.13-1 and 2:8.0.13-2. - - -- Norbert Tretkowski Thu, 09 Oct 2008 16:26:00 +0200 - -drbd8 (2:8.2.6-2) experimental; urgency=low - - * Fix build on Linux 2.6.26. - - -- Norbert Tretkowski Wed, 27 Aug 2008 11:40:53 +0200 - -drbd8 (2:8.2.6-1) experimental; urgency=low +drbd8 (2:8.3.6-0) unstable; urgency=low * New upstream release. - * Merge changes from 2:8.0.10-1, 2:8.0.11-1 and 2:8.0.12-1. - - -- Norbert Tretkowski Tue, 17 Jun 2008 14:28:39 +0200 - -drbd8 (2:8.2.4-1) experimental; urgency=low - - * New upstream release - * Upload for experimental - - -- Philipp Hug Mon, 21 Jan 2008 21:35:06 +0100 - -drbd8 (2:8.0.14-2) unstable; urgency=low - - * Drop dpatch build-dependency. 
- * Drop homepage from description. - * Don't ignore make clean errors. - -- Norbert Tretkowski Thu, 13 Nov 2008 11:46:25 +0100 + -- Philipp Reisner Sun, 8 Nov 2009 10:04:24 +0200 -drbd8 (2:8.0.14-1) unstable; urgency=low +drbd8 (2:8.3.5-0) unstable; urgency=low * New upstream release. - -- Norbert Tretkowski Thu, 13 Nov 2008 10:50:27 +0100 + -- Philipp Reisner Tue, 27 Oct 2009 12:30:41 +0200 -drbd8 (2:8.0.14~rc1-1) unstable; urgency=low +drbd8 (2:8.3.4-12) unstable; urgency=low - * New upstream release candidate. - * New maintainer. (closes: #500353) + * Packaging makeover. - -- Norbert Tretkowski Wed, 29 Oct 2008 17:43:24 +0100 + -- Florian Haas Wed, 21 Oct 2009 13:55:45 +0200 -drbd8 (2:8.0.13-2) unstable; urgency=low - - * Run depmod from generated module package using dh_installmodules. - (closes: #496882) - - -- Norbert Tretkowski Wed, 03 Sep 2008 12:07:22 +0200 - -drbd8 (2:8.0.13-1) unstable; urgency=low +drbd8 (2:8.3.4-0) unstable; urgency=low * New upstream release. - + Make it compile on Linux 2.6.26. (closes: #493145) - -- Norbert Tretkowski Mon, 04 Aug 2008 17:56:03 +0200 + -- Philipp Reisner Tue, 6 Oct 2009 14:32:15 +0200 -drbd8 (2:8.0.12-1) unstable; urgency=low +drbd8 (2:8.3.3-0) unstable; urgency=low * New upstream release. - + Make it compile on Linux 2.6.25. (closes: #480418, #481992, #483676) - * Add myself as co-maintainer. 
- - -- Norbert Tretkowski Tue, 17 Jun 2008 14:18:36 +0200 -drbd8 (2:8.0.11-1) unstable; urgency=low + -- Philipp Reisner Mon, 5 Oct 2009 11:25:55 +0200 - * New upstream release - - -- Philipp Hug Wed, 13 Feb 2008 19:12:53 +0100 - -drbd8 (2:8.0.10-1) unstable; urgency=low - - * New upstream release - - -- Philipp Hug Tue, 12 Feb 2008 22:10:04 +0100 - -drbd8 (2:8.0.8-1) unstable; urgency=low - - * New upstream release - * Provide drbd8-module-source for compatibility - * Use EXTRA_CFLAGS (Closes: #461750) - - -- Philipp Hug Mon, 21 Jan 2008 21:23:02 +0100 - -drbd8 (2:8.0.7-1) unstable; urgency=low - - * New upstream release (Closes: #449241) - * Integrated NMU changes (Closes: #448876) +drbd8 (2:8.3.2-0) unstable; urgency=low - -- Philipp Hug Sun, 04 Nov 2007 13:46:53 +0100 + * New upstream release. -drbd8 (2:8.0.6-0.1) unstable; urgency=low + -- Philipp Reisner Fri, 3 Jul 2009 15:33:12 +0200 - * Non-Maintainer upload with permission of Philipp Hug. - * New upstream release (Closes: #438167) - * Switch to debhelper 5. - * Rename kernel module package from drbd8-module-source to - drbd8-source. - * Compress the module source tarball with bzip2. - * Provide modules/drbd8 to allow parallel installation with - drbd0.7-module-source, and remove obsolete Conflict header. - * Provide own Makefiles in the module tarball for automated building - within linux-modules-extra-2.6 (closes: #431771) +drbd8 (2:8.3.1-0) unstable; urgency=low - -- Frederik Schüler Thu, 01 Nov 2007 15:13:29 +0100 + * New upstream release. -drbd8 (2:8.0.4-1) unstable; urgency=low + -- Philipp Reisner Fri, 27 Mar 2009 12:16:00 +0200 - * New upstream release (Closes: #432104) +drbd8 (2:8.3.0-0) unstable; urgency=low - -- Philipp Hug Sun, 08 Jul 2007 12:45:33 +0200 + * New upstream release. 
-drbd8 (2:8.0.3-2) unstable; urgency=low + -- Philipp Reisner Thu, 18 Dec 2008 14:03:03 +0200 - * Updated Maintainer in control.modules.in - * Added documentation about how to install source package +drbd8 (8.2.7-0) unstable; urgency=low - -- Philipp Hug Thu, 21 Jun 2007 18:05:16 +0100 + * New upstream release. -drbd8 (2:8.0.3-1) unstable; urgency=low + -- Philipp Reisner Thu, 12 Nov 2008 10:01:00 +0200 - * New upstream release - * Added watch file +drbd8 (8.2.6-0) unstable; urgency=low - -- Philipp Hug Tue, 22 May 2007 21:59:01 +0200 + * New upstream release. -drbd8 (2:8.0.2-1) unstable; urgency=low + -- Philipp Reisner Fri, 30 May 2008 09:51:15 +0200 - * New upstream release +drbd8 (8.2.5-0) unstable; urgency=low - -- Philipp Hug Sat, 14 Apr 2007 19:49:41 +0200 + * New upstream release. -drbd8 (2:8.0.1-1) unstable; urgency=low + -- Philipp Reisner Tue, 12 Feb 2008 13:37:59 +0200 - * New upstream release +drbd8 (8.2.4-0) unstable; urgency=low + + * New upstream release. - -- Philipp Hug Thu, 8 Mar 2007 11:50:35 +0100 + -- Philipp Reisner Fri, 11 Jan 2008 13:37:50 +0200 -drbd8 (2:8.0.0-1) unstable; urgency=low +drbd8 (8.2.3-0) unstable; urgency=low - * New upstream release - * debian/control: updated Maintainer and Uploaders fields to match - reality. + * New upstream release. - -- Philipp Hug Sun, 18 Feb 2007 18:50:04 +0100 + -- Philipp Reisner Wed, 9 Jan 2008 15:27:53 +0200 -drbd8 (2:8.0pre5-1) unstable; urgency=low +drbd8 (8.2.1-0) unstable; urgency=low - * New upstream release - * scripts/drbd: patch for LSB compliance, submitted the patch upstream. - * debian/rules: the documentation/Makefile 'clean' target as been - renamed to 'doc-clean' + * New upstream release. - -- Cyril Bouthors Mon, 2 Oct 2006 13:48:33 +0300 + -- Philipp Reisner Fri, 2 Nov 2007 13:10:27 +0200 -drbd8 (8.0-pre4-3) unstable; urgency=low +drbd8 (8.2.0-0) unstable; urgency=low - * debian/rules: applyied patch to fix building for x86_64 on i386 from - Guido Guenther and Philipp Hug . 
+ * New upstream release. - -- Cyril Bouthors Wed, 23 Aug 2006 00:13:22 +0300 + -- Philipp Reisner Fri, 28 Sep 2007 12:15:07 +0200 -drbd8 (8.0-pre4-2) unstable; urgency=low +drbd8 (8.0.6-0) unstable; urgency=low - * debian/drbd8-module-_KVERS_.postinst.modules.in: take care of chroot - environments when calling depmod (closes: 381767). + * New upstream release. - -- Cyril Bouthors Sun, 20 Aug 2006 22:13:01 +0300 + -- Philipp Reisner Mon, 3 Sep 2007 10:00:00 +0200 -drbd8 (8.0-pre4-1) unstable; urgency=low +drbd8 (8.0.5-0) unstable; urgency=low - * New upstream release - * debian/control: updated standards version from 3.6.2.1 to 3.7.2 - * debian/drbd8-utils.prerm: use invoke-rc.d + * New upstream release. - -- Cyril Bouthors Mon, 31 Jul 2006 18:01:22 +0300 + -- Philipp Reisner Fri, 3 Aug 2007 09:34:49 +0200 -drbd8 (8.0-pre3-1) unstable; urgency=low +drbd8 (8.0.4-0) unstable; urgency=low - * New upstream release - * debian/control.modules.in: fixed "Source" field thanks to Guido - Guenther (closes #361957). + * New upstream release. - -- Cyril Bouthors Wed, 26 Apr 2006 11:04:54 +0200 + -- Philipp Reisner Wed, 27 Jun 2007 10:00:00 +0200 -drbd8 (8.0-pre2-1) unstable; urgency=low +drbd8 (8.0.3-0) unstable; urgency=low - * New upstream release + * New upstream release. - -- Cyril Bouthors Thu, 6 Apr 2006 19:08:52 +0200 + -- Philipp Reisner Fri, 7 May 2007 17:10:14 +0200 -drbd8 (8.0-pre1-2) unstable; urgency=low +drbd8 (8.0.2-0) unstable; urgency=low - * Renamed source from drbd to drbd8 + * New upstream release. - -- Cyril Bouthors Mon, 27 Mar 2006 00:14:03 +0200 + -- Philipp Reisner Fri, 6 Apr 2007 21:32:39 +0200 -drbd (8.0-pre1-1) unstable; urgency=low +drbd8 (8.0.1-0) unstable; urgency=low - * New upstream release + * New upstream release. - -- Cyril Bouthors Wed, 22 Mar 2006 12:15:03 +0300 + -- Philipp Reisner Mon, 3 Mar 2007 10:10:26 +0200 -drbd (0.7.17-1) unstable; urgency=low +drbd8 (8.0.0-0) unstable; urgency=low - * New upstream release + * New upstream release. 
- -- Cyril Bouthors Wed, 8 Mar 2006 17:26:36 +0300 + -- Philipp Reisner Wed, 24 Jan 2007 16:10:09 +0200 -drbd (0.7.16-1) unstable; urgency=low +drbd8 (8.0rc2-0) unstable; urgency=low - * New upstream release + * New upstream release. - -- Cyril Bouthors Tue, 14 Feb 2006 15:13:49 +0300 + -- Philipp Reisner Wed, 17 Jan 2007 17:30:23 +0200 -drbd (0.7.15-2) unstable; urgency=low +drbd8 (8.0rc1-0) unstable; urgency=low - * debian/control: removed hard-coded dependency on libc6 thanks to - Adeodato Simó (closes: #349927). + * New upstream release. - -- Cyril Bouthors Tue, 31 Jan 2006 09:13:32 +0300 + -- Philipp Reisner Fri, 22 Dec 2006 15:19:10 +0200 -drbd (0.7.15-1) unstable; urgency=low +drbd8 (8.0pre6-0) unstable; urgency=low - * New upstream release + * New upstream release. - -- Cyril Bouthors Tue, 20 Dec 2005 17:55:40 +0300 + -- Philipp Reisner Fri, 3 Nov 2006 15:20:54 +0200 -drbd (0.7.14-3) unstable; urgency=low +drbd8 (8.0pre4-0) unstable; urgency=low - * debian/control: added explicit dependency on libc6 >= 2.3.5 + * New upstream release. - -- Cyril Bouthors Sun, 18 Dec 2005 10:15:14 +0300 + -- Philipp Reisner Mon, 31 Jul 2006 12:04:41 +0200 -drbd (0.7.14-2) unstable; urgency=low +drbd8 (8.0pre3-0) unstable; urgency=low - * debian/control: depends on debconf or debconf-2.0 (closes: #331806). + * New upstream release. - -- Cyril Bouthors Sat, 17 Dec 2005 10:43:22 +0300 + -- Philipp Reisner Thu, 20 Apr 2006 13:46:18 +0200 -drbd (0.7.14-1) unstable; urgency=low +drbd8 (8.0-pre2-0) unstable; urgency=low - * New upstream release (closes: #310993, #338994). - * debian/control: added dependency to dpatch (closes: #338994). + * New upstream release. - -- Cyril Bouthors Fri, 16 Dec 2005 13:10:25 +0300 + -- Philipp Reisner Thu, 6 Apr 2006 17:53:56 +0200 -drbd (0.7.12-1) unstable; urgency=low +drbd8 (8.0_pre1-0) unstable; urgency=low - * New upstream release + * New major release. 
- -- Cyril Bouthors Sat, 27 Aug 2005 18:25:47 +0300 + -- Philipp Reisner Thu, 14 Mar 2006 11:37:56 +0200 -drbd (0.7.11-1) unstable; urgency=low +drbd (0.7.13-0) unstable; urgency=low * New upstream release - -- Cyril Bouthors Mon, 20 Jun 2005 15:49:40 +0300 - -drbd (0.7.10-4) unstable; urgency=low - - * debian/control: added missing dependency to dpatch for - drbd0.7-module-source (closes: #308295). - * debian/control: updated Maintainer and Uploaders fields to match - reality. - - -- Cyril Bouthors Mon, 30 May 2005 11:22:46 +0300 - -drbd (0.7.10-3) unstable; urgency=low - - * (Cyril Bouthors) - - scripts/drbd: explicit modprobe and rmmod pathnames - (initscript_explicit_pathname.patch) (closes: #303060, #302556). - - -- Cyril Bouthors Sun, 17 Apr 2005 18:08:30 +0300 - -drbd (0.7.10-2) unstable; urgency=low - - * (Cyril Bouthors) - - debian/drbd0.7-utils.prerm: silently ignore the initscript return - code if we remove or deconfigure the package or carefully pay - attention to it if we upgrade the package. (closes: #295533). - - debian/control: fixed drbd0.7-module-source description. - - -- Cyril Bouthors Wed, 16 Feb 2005 21:05:51 +0100 - -drbd (0.7.10-1) unstable; urgency=low - - * (Cyril Bouthors) - - New upstream release - - -- Cyril Bouthors Mon, 31 Jan 2005 17:29:27 +0300 - -drbd (0.7.9-2) unstable; urgency=low - - * (Cyril Bouthors) - - Applied patch from Lars Marowsky-Bree that fixes a - "severe [...] memory corruption bug [...]". 
- - -- Cyril Bouthors Thu, 27 Jan 2005 13:55:00 +0300 - -drbd (0.7.9-1) unstable; urgency=low - - * (Cyril Bouthors) - - New upstream release - - -- Cyril Bouthors Thu, 27 Jan 2005 11:35:19 +0300 - -drbd (0.7.8-1) unstable; urgency=low - - * (Cyril Bouthors) - - New upstream release - - -- Cyril Bouthors Mon, 17 Jan 2005 18:50:49 +0400 - -drbd (0.7.7-1) unstable; urgency=low - - * (Cyril Bouthors) - - New upstream release - - -- Cyril Bouthors Wed, 15 Dec 2004 17:15:35 +0300 - -drbd (0.7.6-2) unstable; urgency=low - - * (Cyril Bouthors) - - drbd0.7-module-source: Moved debhelper from Recommends to Depends - - -- Cyril Bouthors Thu, 9 Dec 2004 20:37:41 +0300 - -drbd (0.7.6-1) unstable; urgency=low - - * (Cyril Bouthors) - - New upstream release - - debian/control.modules.in: updated description - - debian/TODO: removed - - -- Cyril Bouthors Tue, 30 Nov 2004 19:43:27 +0300 - -drbd (0.7.5-2) unstable; urgency=low - - * (Philipp Hug) - - debian/control: Added Conflict with drbd-util and drbd-module-source - - debian/control.in.modules: Fixed description for kernel module - - debian/control.in.modules: Added Conflict line in kernel module package - - debian/control.in.modules: Change depends to drbd0.7-util - - debian/rules: don't use top-level makefile, to prevent re-creation of - drbd_buildtag.c - - call depmod in postinst of kernel module - - debian/rules: remove obsolete upstream ./debian/ files in clean target - - added module-assistant override file - - this version is ready for sarge (Closes: #277669) - - -- Philipp Hug Tue, 19 Oct 2004 20:50:49 +0200 - -drbd (0.7.5-1) unstable; urgency=low - - * (Cyril Bouthors) - - New upstream release (closes: #276640). - - debian/drbd0.7.dirs: removed usr/bin (closes: #276643). - - debian/control: changed Section from misc to admin. - - scripts/drbd: prevent the "stop" target from failing if drbd is not running - - debian/rules: call upstream Makefile targets instead of doing hardcoded stuff, clean. 
- - debian/drbd0.7.docs: added upgrade_0.6.x_to_0.7.0.txt and upgrade_0.7.0_to_0.7.1.txt. - - The package drbd as been renamed to drbd0.7-utils and drbd-source as - been renamed to drbd0.7-module-source. We'll introduce drbd0.6-* - and drbd*-module soon. - - Added myself as Uploader. - - * (Philipp Hug) - - Conflict with drbd and drbd-source - - Fixed description for kernel modules - - Added Conflict line in kernel-module package - - Added bison, flex to Build-Depends - - Call 'make clean' in drbd directory - - Upgraded to debian-policy 3.6.1 - - Rewritten debian/rules using module-assistant - - Change binary package name to drbd0.7 - - Added myself as Uploader - - -- Cyril Bouthors Sat, 16 Oct 2004 23:43:27 +0200 - -drbd (0.7.4-1) unstable; urgency=low - - * Fixed a critical bug with Linux-2.4.x and HIGHMEM! - * Fixed a bug that only showed up with the HIGHMEM problem on - Linux-2.4.x -> It caused the resync process to starve. - * The drbd.spec file now creates /dev/drbd in the post-install stage. - * Fixed support for more than 2TB storage. Now DRBD supports up to - 3.99TB storage. It will also tell you, that it is not supported if - you try to set up a bigger device. - * Debian's build rules file now knows about the adjust_drbd_config_h.sh - file. - * DRBD_DISABLE_SENDPAGE available in drbd_config.h - - -- Philipp Reisner Thu, 9 Sep 2004 19:50:00 +0200 - -drbd (0.7.3-2) unstable; urgency=low - - * Fixed debian/rules: Include adjust_drbd_config_h.sh in drbd-source - - -- Philipp Hug Tue, 31 Aug 2004 15:37:38 +0000 - -drbd (0.7.3-1) unstable; urgency=low - - * Fixed minor bugs in the handling of the generation counters. - * prevent possible in-kernel buffer overflow in drbd_proc.c - * Fixed debian's postinst script to create /dev/drbd? instead of /dev/nb? - * drbd status: - be nice to heartbeat, include "OK" in output. 
- * added FullSync meta data flag to read/write gc.pl - * make the RHEL3 happy (page_count no longer in mm.h, but in mm_inline.h) - * [Patch by Pavel Semerad]. Also use the drbd_devfs_name on Linux-2.4.x - * fix missing dependencies on drbd_config.h - - -- Philipp Reisner Fri, 27 Aug 2004 15:02:00 +0200 - -drbd (0.7.2-1) unstable; urgency=low - - * Proper handling of backing storage devices that occasionally fail - READA (=read ahead) requests. (E.g. LVM and MD) - * DRBD now fails READA requests itself, if a resynchronisation is running - and it would need to fetch the block from its peer. - * "drbdadm adjust" had a race, which caused random errors. ( Missing - waitpid() ). Fixed now. - * Proper subtract SyncPause times from the syncer performance numbers. - * Fix to the syncer progress bar in /proc/drbd. - * Fix to debian build rules. - - -- Philipp Reisner Fri, 6 Aug 2004 14:44:31 +0200 - -drbd (0.7.1-1) unstable; urgency=low - - * Upgrade instructions for 0.6.x -> 0.7.0 and 0.7.0 -> 0.7.1 - * Workaround for XFS' IO requests with page count of zero. - * Handle the human and the timeout count correctly in the new init script. - * The implementation of the incon-degr-cmd was missing, added. - * Fix for integer overflow in /proc/drbd syncer progress display - * Longer timeouts in drbdadm for drbdsetup commands witch operate on - meta data. - * New major number 147 (officially registered at lanana.org). - * Added a missing w_resume_next_wg() in case we stop syncing because - of connection loss. - * Fixed a Linux-2.2-ismus in recieve_data_tail(). Should considerably - speed up protocols A and B. 
- * Some work on vendor kernel compatibility - - -- Philipp Reisner Fri, 30 Jul 2004 13:50:33 +0200 - -drbd (0.7.0-1) unstable; urgency=low - - * s/WriteHint/UnplugRemote/g - * new module parameter major_nr to allow "arbitrary" major numbers - * adjusted CTH to cope with that - * fix copy'n'paste and conversion errors in initial bitmap handshake - * warning "please upgrade me" if peer speaks (PRO_VERSION+1) - * drbd_set_in_sync and drbd_set_out_of_sync are now macros - calling to __*, giving file and line information, - to be able to easily track causes of "strange state"s there. - * rs_total is now != 0 only if we actually ARE syncing. - it is reset - * when sync is done - * when connection is lost - * when storage is lost on either node - this way we can optimize and call drbd_set_in_sync only if rs_total != 0 - (and it feels somewhat more clean, too) - * makefile adjusted to recognize svn revision and date tags - * updates and fixes to the test helpers and bash test cases - - -- Philipp Reisner Fri, 16 Jul 2004 10:13:33 +0200 - -drbd (0.7_pre10-1) unstable; urgency=low - - * A fix to a generic bug in the bitmap code introduced with the -pre9 - release (with the 64 bit work) - * A fix to a bug in the bitmap code only relevant for 64 bit platforms. - * Better 2.4.x compatibility and compatibility to 2.4.x vendor kernels. - * Improvements in the way to deal with incompatible protocol releases. - * Added the "dialog-refresh" config option. - - changes up to -pre9: - * Re-enabled zero copy IO for protocols B and C. (Zero copy IO is not - used with protocol A) - * Implemented the unpopular user dialog in the boot process. - * Some fixes for Linux-2.4.x compatibility. - * drbd.conf man page updated - * Bugfixes for 64bit architectures - * Ensured protocol compatibility between hosts of different word sizes - (Tested with i386 and alpha) - * Support for meta-data on block devices with hardsect size != 512 Byte - (e.g. 
dasd on s390x) - * New debian subdir - - -- Lars Ellenberg Fri, 09 Jul 2004 20:00:19 +0200 - -drbd (0.7_pre8-2) unstable; urgency=low - - * fix up the modules source package - - -- Bernd Schubert Mon, 05 Jul 2004 00:57:38 -0100 - -drbd (0.7_pre8-1) unstable; urgency=low - - * initial 0.7 debian package - - -- Bernd Schubert Mon, 21 Jun 2004 19:57:38 -0400 - -drbd (0.6.12-5) unstable; urgency=low - - * Changed default drbd.conf file to set a negative inittimeout value and - updated the README.Debian file to reflect this change. - (Closes Bug#221751) - - -- David Krovich Tue, 25 May 2004 12:51:15 -0400 - -drbd (0.6.12-4) unstable; urgency=low - - * Refactored rules file in an attempt to use binary-arch and binary-indep - targets more wisely. This is an attempt to fix Bug#244392. - * Listed /etc/ha.d/resource.d/drbd in debian/conffiles. (Closes Bug#247606) - * Moved drbdsetup from /usr/bin/ to /usr/sbin. I think I introduced this - when I overhauled the debian directory in the 0.6.12-1 release. - (Closes Bug#247607) - - -- David Krovich Sun, 16 May 2004 15:20:59 -0400 - -drbd (0.6.12-3) unstable; urgency=low - - * After discussing with upstream, tweak /etc/init.d/drbd script so the - stop target works if the module is not loaded. (Closes: Bug#243417) - * Put the drbd script in the /etc/ha.d/resource.d directory. (Closes: Bug#245219) - - -- David Krovich Thu, 22 Apr 2004 18:12:47 -0400 - -drbd (0.6.12-2) unstable; urgency=low - - * Create /dev/nb[0-7] devices in postinst script. (Closes: Bug#221545) - - -- David Krovich Sat, 17 Apr 2004 15:18:29 -0400 - -drbd (0.6.12-1) unstable; urgency=low - - * new upstream release. (Closes: Bug#239804) - * Completely overhauled the debian/ directory. - * Changed sequence number in the runlevel to start at 70 and stop - at 08. drbd should start after things like ssh, but before - heartbeat. 
- - -- David Krovich Mon, 22 Mar 2004 00:04:35 -0500 - -drbd (0.6.10-3) unstable; urgency=low - * Added back the drbd.postinst, drbd.postrm, and drbd.prerm scripts until - I figure out why they aren't being handled by dh_installinit. - * As of drbd-0.6.9, The drbd module no longer builds against just the - kernel-headers package and now needs a full kernel-source tree. - - -- David Krovich Mon, 26 Jan 2004 00:32:49 -0500 - -drbd (0.6.10-2) unstable; urgency=low - - * noel: fixed lintian warning: - W: drbd: package-contains-CVS-dir usr/share/doc/drbd/HOWTO/CVS/ - W: drbd: script-in-etc-init.d-not-registered-via-update-rc.d /etc/init.d/drbd - - * Lintian/Linda fixes. - - * Tweaked the drbd-0.6.10.orig.tar.gz to not have a debian/ directory in it. - * Stopped tweaking the copyright notice on drbd_fs.c and drbd_receiver.c. - I'm not sure how that got there in the first place. - * Removed mystery report_to_html.pl.debdiff file. - * Put the datadisk in the correct location. (Closes: Bug#221544) - - * Removed drbd.postinst, drbd.postrm, and drbd.prerm as they are - being generated by dh_installinit during the build process and do not - need to part of the source package. - * removed dependancy on automake and autoconf - * Changed control.modules to require debhelper >= 4. - * Stop settting $KSRC in the rules file. - - * Removed conffiles, files, kernel-patch-wup.substvars as they are - unneccessary. - * Tightened the build dependancy on debhelper. >=4 - * Updated Debian packages up to newest upstream version. (Closes: Bug#197906) - * Updated Package descriptions. (Closes: Bug#209462) - * Verified support for devfs. (Closes: Bug#203552) - * I'd like to become a Debian Developer and take over maintenance for - this package. I'm working with Debian Devolpers on making this happen. - - -- David Krovich Tue, 20 Jan 2004 01:36:58 -0500 - -drbd (0.6.10-1) unstable; urgency=low - - * With 0.6.9 there was a bug introduced which prevented the sending - of ACK packets during resync. 
Fixed. - * A fix to drbdsetup's wait_connect command. - * Replaced all invocations of the sleep_on() family functions with the - invocations of the wait_event() macros. This removes lost wakup events - and race conditions. - * New implementation of drbd_wait_ee(). This makes the - "(BUG?) Moving bh=%p to done_ee" go away. - * Handle the case if vmalloc() of the bitmap fails. - - -- Philipp Reisner Thu, 12 Dec 2003 15:10:44 +0200 - -drbd (0.6.9-1) unstable; urgency=low - - * New module build system (using kernel source tree build system) - * New net section option 'ko-count'. It allows you to kick out a - secondary node which does no longer process data in acceptable time. - Its default value is 0 which disables this feature. - * Changing syncgroups while resync runs has shows now the correct behaviour. - * In case thread creations fails DRBD would deadlock on its own - semaphore. Fixed now. - * BKL is no longer used on Linux-2.4.x. - * Now you can stack mapping block devices like LVM2 (and maybe md) on - top of drbd (a one character fix). - * drbdsetup wait_connect on a StandAlone node looked like a timeout and - forced primary. fixed. - * if drbdsetup wait_* in fact did timeout this looked like a failed ioctl. - this bug was newly introduced in 0.6.8. fixed. - * A fix to a race in _drbd_alloc_ee(). You could trigger this race if - your filesystem uses a blocksize < 4K and your machine has multiple CPUs. - By Eric W. Biederman. - * A maybe bugfix regarding calls to free_page() by Eric W. Biederman. - * A cleanup patch to drbd_process_done_ee() by Eric W. Biederman. - - -- Philipp Reisner Thu, 27 Nov 2003 08:21:34 +0200 - -drbd (0.6.8-1) unstable; urgency=low - - * Two fixes to the sync-group functionality. - - -- Philipp Reisner Mon, 20 Oct 2003 11:45:33 +0200 - -drbd (0.6.7-1) unstable; urgency=low - - * A fix to a bug that could cause data corruption if you use a - other blocksize than 4k to access the DRBD device. - * A fix to a SMP race in the syncer code. 
The problem was tirggered - when using DRBD on QLogic fiber channel adapters. - * Replaced various calls to sleep_on() variants with the wait_event() - macros. -- This removes potential (, non-critical) SMP races. - * This release includes the sync-group option. - - -- Philipp Reisner Thu, 13 Oct 2003 11:17:27 +0200 - -drbd (0.6.6-1) unstable; urgency=low - - * In the 0.6.5 release the secondary_remote command was badly broken, - it succeeded when it should fail silently. This is fixed now. - * Probabely in all previous releases, the resyncer thread did not - exit properly if the secondary node goes away during resync. - This was not fatal sind the resyncher thread did exit at soon - as it gets a network error. This is fixed now. - * Some new switches to the drbd script. - - -- Philipp Reisner Mon, 28 Jul 2003 14:40:43 +0200 - -drbd (0.6.5-1) unstable; urgency=low - - * Improvements to the build system - * Now it is possible to tune the socket send buffer size via drbdsetup/ - drbd.conf. This is especially usefull for WAN mirroring / using - protocol A. - * Compatibility code to compile DRBD under RedHat 9.0 (RH's version of - Linux-2.4.20) - * Improved sample drbd.conf file - - -- Philipp Reisner Sun, 06 Jul 2003 13:35:00 +0100 - -drbd (0.6.4-1) unstable; urgency=low - - * Reworked build system (i.e. better Makefiles) - * SyncAll works forward instead of backwards. Improves performance on - some storage controlers. - * Reworked /etc/init.d/drbd script (i.e. better support of - different bash releases) - - -- Philipp Reisner Thu, 01 May 2003 21:00:00 +0100 - -drbd (0.6.3-1) unstable; urgency=low - - * Lockup of primary if secondary fails during resync. Fixed. (Stupid!) - * Probabely SMP only deadlock in the drop-conection code path. - * Improved connect code. (The old code could trap into a distributed - deadlock, resulting in an endless connect/disconnect loop.) - * The 'BitMap too small bug' was actually caused by a patch in - SuSE's distribution kernel. 
This patch makes DRBD 'more' compatible - with SuSE's kernel. - * Improved code to allocate buffers for the rsynchronisation process. - The old code allocated physical adjacent pages although the syncer - does not need them! The old code could fail under high memory pressure. - - -- Philipp Reisner Thu, 20 Mar 2003 20:23:40 +0100 - -drbd (0.6.2-1) unstable; urgency=low - - * SMP fix in drbd_dio_end_sec() - * /etc/init.d/drbd knows about returncodes of fsck - * SUSE style rcdrbd - * Fixes for uninstall Target of the Makefiles. - - -- Philipp Reisner Tue, 11 Feb 2003 15:58:49 +0100 - -drbd (0.6.1-1) unstable; urgency=low - - * Stable release - - -- Philipp Reisner Mon, 25 Nov 2002 14:51:39 +0100 - -drbd (0.6-1.pre16-0cvs20020909.1) unstable; urgency=low - - * changed the maintainer to jan@debian.org in agreement with - Ard who currently doesn't work on drbd. - * changed name of generated drbd-module-... package to include - the full version number of the kernel package - * place generated drbd-module-... package in $(KSRC)/.. - - -- Jan Niehusmann Fri, 13 Sep 2002 15:57:01 +0200 - -drbd (0.6-1.pre16-0cvs20020909) unstable; urgency=low - - * updated version - * strange version number because debian versioning doesn't handle - -pre versions sanely - * uploading to unstable. 
(Closes: Bug#130031) - - -- Jan Niehusmann Wed, 11 Sep 2002 13:10:03 +0200 - -drbd (cvs20010511-1) unstable; urgency=low - - * First deb-anized version - - -- Ard van Breemen Fri, 11 May 2001 11:59:53 +0200 + -- Philipp Reisner Thu, 1 Sep 2005 10:00:00 +0200 diff -Nru drbd8-8.3.7/debian/compat drbd8-8.4.1+git55a81dc~cmd1/debian/compat --- drbd8-8.3.7/debian/compat 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/compat 2012-02-02 14:09:14.000000000 +0000 @@ -1 +1 @@ -5 +4 diff -Nru drbd8-8.3.7/debian/control drbd8-8.4.1+git55a81dc~cmd1/debian/control --- drbd8-8.3.7/debian/control 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/control 2012-02-02 14:09:14.000000000 +0000 @@ -1,45 +1,47 @@ Source: drbd8 Section: admin Priority: extra -Maintainer: Ubuntu Developers -XSBC-Original-Maintainer: Debian DRBD Maintainers -Uploaders: Norbert Tretkowski , Martin Loschwitz , Philipp Hug -Build-Depends: debhelper (>= 5), debconf-utils, sp, docbook-utils, bison, flex, dpatch, bzip2, dpatch -Standards-Version: 3.8.3 -Homepage: http://www.drbd.org/ -Vcs-Browser: http://svn.debian.org/wsvn/debian-ha/drbd8/ -Vcs-Svn: svn://svn.debian.org/svn/debian-ha/drbd8/ +Maintainer: DRBD dev +Uploaders: Philipp Reisner , Lars Ellenberg +Build-Depends: debhelper (>= 4), debconf-utils, docbook-xml, docbook-xsl, dpatch, flex, xsltproc +Standards-Version: 3.6.2.1 Package: drbd8-utils Architecture: any Section: admin -Depends: debconf | debconf-2.0, ${shlibs:Depends}, drbd8-source +Depends: debconf | debconf-2.0, ${shlibs:Depends} Conflicts: drbd-utils Provides: drbd-utils Replaces: drbd-utils, drbd Suggests: heartbeat Description: RAID 1 over tcp/ip for Linux utilities - Drbd is a block device which is designed to build high availability + DRBD is a block device which is designed to build high availability clusters by providing a virtual shared device which keeps disks in nodes synchronised using TCP/IP. 
This simulates RAID 1 but avoiding the use of uncommon hardware (shared SCSI buses or Fibre Channel). It is currently limited to fail-over HA clusters. . - This package contains the programs that will control the drbd kernel + This package contains the programs that will control the DRBD kernel module provided in drbd-source. You will need a clustering service (such as heartbeat) to fully implement it. + . + Homepage: http://www.drbd.org -Package: drbd8-source +Package: drbd8-module-source Architecture: all -Section: kernel -Depends: debhelper (>= 5), dpatch, bzip2, dkms, linux-headers-server | linux-headers-generic | linux-headers -Provides: drbd-module-source, drbd-source, drbd8-module-source -Recommends: dpkg-dev, debconf-utils +Section: admin +Depends: module-assistant, debhelper (>= 4), dpatch +Conflicts: drbd-module-source, drbd-source +Provides: drbd-module-source +Replaces: drbd-module-source, drbd-source +Recommends: dpkg-dev, kernel-package, debconf-utils Description: RAID 1 over tcp/ip for Linux module source - Drbd is a block device which is designed to build high availability + DRBD is a block device which is designed to build high availability clusters by providing a virtual shared device which keeps disks in nodes synchronised using TCP/IP. This simulates RAID 1 but avoiding the use of uncommon hardware (shared SCSI buses or Fibre Channel). It is currently limited to fail-over HA clusters. . - This package contains the source code for the drbd kernel module. + This package contains the source code for the DRBD kernel module. + . 
+ Homepage: http://www.drbd.org diff -Nru drbd8-8.3.7/debian/control.modules.in drbd8-8.4.1+git55a81dc~cmd1/debian/control.modules.in --- drbd8-8.3.7/debian/control.modules.in 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/control.modules.in 2012-02-02 14:09:14.000000000 +0000 @@ -1,16 +1,17 @@ Source: drbd8 Section: misc Priority: extra -Maintainer: Philipp Hug -Build-Depends: debhelper (>= 4), drbd8-source, bzip2 +Maintainer: DRBD dev +Uploaders: Philipp Reisner , Lars Ellenberg +Build-Depends: debhelper (>= 4), drbd8-source Standards-Version: 3.6.1 -Package: drbd8-_KVERS_ +Package: drbd8-module-_KVERS_ Architecture: any Depends: drbd8-utils -Conflicts: drbd-_KVERS_ -Provides: drbd-_KVERS_ -Replaces: drbd-_KVERS_ +Conflicts: drbd-module-_KVERS_ +Provides: drbd-module-_KVERS_ +Replaces: drbd-module-_KVERS_ Section: misc Recommends: kernel-image-_KVERS_ Description: RAID 1 over tcp/ip for Linux kernel module diff -Nru drbd8-8.3.7/debian/copyright drbd8-8.4.1+git55a81dc~cmd1/debian/copyright --- drbd8-8.3.7/debian/copyright 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/copyright 2012-02-02 14:09:14.000000000 +0000 @@ -3,18 +3,19 @@ upstream. Debianization of this package was started by Ard van Breemen . -Later, Jan Niehusmann finished the packaging and made the +Later, Jan Niehusmann finished the packaging and made the initial upload. 
-It was downloaded from http://www.linbit.com/en/article/articleview/34/1/11/ +It was downloaded from http://oss.linbit.com/drbd/ More information can be found at http://www.drbd.org/ -Drbd is free software; you can redistribute them and/or modify them under -the terms of the GNU General Public License as published by the Free Software -Foundation; either version 2 of the License, or (at your option) any later +DRBD was written by Philipp Reisner and Lars Ellenberg +for LINBIT Information Technologies, http://www.linbit.com + +Drbd is free software; you can redistribute them and/or modify them under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later version. On Debian systems, the complete text of the GNU General Public License can be found in /usr/share/common-licenses/GPL file. - - diff -Nru drbd8-8.3.7/debian/dkms.conf drbd8-8.4.1+git55a81dc~cmd1/debian/dkms.conf --- drbd8-8.3.7/debian/dkms.conf 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/dkms.conf 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -PACKAGE_NAME="drbd" -PACKAGE_VERSION="xxxVERSIONxxx" -AUTOINSTALL=yes -CLEAN="make -C drbd clean KERNELDIR=$kernel_source_dir" -MAKE="make -C drbd KERNELDIR=$kernel_source_dir" -BUILT_MODULE_NAME[0]="drbd" -BUILT_MODULE_LOCATION[0]="drbd" -DEST_MODULE_LOCATION[0]="/kernel/updates" -MODULES_CONF[0]="options drbd cn_idx=7" diff -Nru drbd8-8.3.7/debian/drbd8-module-_KVERS_.postrm.modules.in drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-module-_KVERS_.postrm.modules.in --- drbd8-8.3.7/debian/drbd8-module-_KVERS_.postrm.modules.in 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-module-_KVERS_.postrm.modules.in 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,14 @@ +#!/bin/sh +set -e + +SYSTEMMAP=/boot/System.map-_KVERS_ + +if [ -f $SYSTEMMAP ] +then + depmod -ae -F $SYSTEMMAP _KVERS_ +elif [ "`uname -r`" = 
"_KVERS_" ] +then + depmod -a & +fi + +#DEBHELPER# diff -Nru drbd8-8.3.7/debian/drbd8-module-source.dirs drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-module-source.dirs --- drbd8-8.3.7/debian/drbd8-module-source.dirs 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-module-source.dirs 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,3 @@ +usr/src/modules/drbd/debian +usr/src/modules/drbd/drbd +usr/share/modass/overrides diff -Nru drbd8-8.3.7/debian/drbd8-source.Makefile drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.Makefile --- drbd8-8.3.7/debian/drbd8-source.Makefile 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -obj-m = drbd/ diff -Nru drbd8-8.3.7/debian/drbd8-source.README.Debian drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.README.Debian --- drbd8-8.3.7/debian/drbd8-source.README.Debian 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.README.Debian 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -drbd for Debian ---------------- - -The Debian drbd-source package can be used in several ways, - - - Using module-assistant(1) commands provided by the module-assistant Debian - package: - - # module-assistant auto-install drbd8 - - - Using the make-kpkg(1) command provided by the kernel-package Debian - package. See the "modules_image" section of the make-kpkg(1) man page. - - - Unpacking /usr/src/drbd*.tar.bz2 and installing the module on your own. 
- - -- Philipp Hug Thu, 21 Jun 2007 18:08:00 +0000 - diff -Nru drbd8-8.3.7/debian/drbd8-source.dirs drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.dirs --- drbd8-8.3.7/debian/drbd8-source.dirs 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.dirs 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -usr/src/modules/drbd8/debian -usr/src/modules/drbd8/drbd -usr/src/modules/drbd8/scripts/ diff -Nru drbd8-8.3.7/debian/drbd8-source.drbd-Makefile drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.drbd-Makefile --- drbd8-8.3.7/debian/drbd8-source.drbd-Makefile 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.drbd-Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -#CFLAGS_drbd_sizeof_sanity_check.o = -Wpadded # -Werror - -EXTRA_CFLAGS += -I$(src) - -obj-m := drbd.o - -drbd-objs := drbd_buildtag.o drbd_bitmap.o drbd_proc.o \ - drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o \ - lru_cache.o drbd_main.o drbd_strings.o drbd_nl.o - -ifndef CONFIG_CONNECTOR - drbd-objs += connector.o cn_queue.o -endif - diff -Nru drbd8-8.3.7/debian/drbd8-source.postinst drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.postinst --- drbd8-8.3.7/debian/drbd8-source.postinst 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.postinst 1970-01-01 00:00:00.000000000 +0000 @@ -1,48 +0,0 @@ -#!/bin/sh -# Copyright (C) 2002-2005 Flavio Stanchina -# Copyright (C) 2005-2006 Aric Cyr -# Copyright (C) 2007 Mario Limonciello -# Copyright (C) 2009 Alberto Milone - -set -e - -NAME=drbd8 -PACKAGE_NAME=$NAME-source -CVERSION=`dpkg-query -W -f='${Version}' $PACKAGE_NAME | awk -F "-" '{print $1}' | cut -d\: -f2` -ARCH=`dpkg --print-architecture` - -dkms_configure () { - for POSTINST in /usr/lib/dkms/common.postinst "/usr/share/$PACKAGE_NAME/postinst"; do - if [ -f "$POSTINST" ]; then - "$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2" - return $? 
- fi - echo "WARNING: $POSTINST does not exist." >&2 - done - echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2 - echo "built with legacy DKMS support." >&2 - echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2 - echo "support or upgrade DKMS to a more current version." >&2 - return 1 -} - -case "$1" in - configure) - dkms_configure - ;; - - abort-upgrade|abort-remove|abort-deconfigure) - ;; - - *) - echo "postinst called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - -# dh_installdeb will replace this with shell code automatically -# generated by other debhelper scripts. - -#DEBHELPER# - -exit 0 diff -Nru drbd8-8.3.7/debian/drbd8-source.prerm drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.prerm --- drbd8-8.3.7/debian/drbd8-source.prerm 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-source.prerm 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ -#!/bin/sh - -set -e - -PKG="drbd8" -PKGVER=`dpkg-query -W -f='${Version}' "$PKG"-source | cut -d: -f2 | cut -f1 -d-` - -case "$1" in - remove|upgrade) - echo "Removing all [$PKG-$PKGVER] DKMS Modules" - dkms remove -m $PKG -v $PKGVER --all >/dev/null || true - echo "Done." 
- ;; -esac - -#DEBHELPER# diff -Nru drbd8-8.3.7/debian/drbd8-utils.dirs drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.dirs --- drbd8-8.3.7/debian/drbd8-utils.dirs 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.dirs 2012-02-02 14:09:14.000000000 +0000 @@ -1,4 +1,3 @@ etc etc/init.d etc/ha.d/resource.d -etc/udev/rules.d diff -Nru drbd8-8.3.7/debian/drbd8-utils.postinst drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.postinst --- drbd8-8.3.7/debian/drbd8-utils.postinst 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.postinst 2012-02-02 14:09:14.000000000 +0000 @@ -1,7 +1,5 @@ #!/bin/sh -set -e - if [ -x "/etc/init.d/drbd" ]; then update-rc.d drbd defaults 70 8 >/dev/null fi @@ -9,8 +7,12 @@ # Make sure /dev/nb[0-7] devices exist # cd /dev; for a in `seq 0 7`; do MAKEDEV nb$a; done -for i in `seq 0 15` ; do - test -b /dev/drbd$i || mknod -m 0660 /dev/drbd$i b 147 $i; -done +if [ -d /etc/udev/rules.d ]; then + echo "Udev found. Not creating device nodes." 
+else + for i in `seq 0 15` ; do + test -b /dev/drbd$i || mknod -m 0660 /dev/drbd$i b 147 $i; + done +fi #DEBHELPER# diff -Nru drbd8-8.3.7/debian/drbd8-utils.postrm drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.postrm --- drbd8-8.3.7/debian/drbd8-utils.postrm 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.postrm 2012-02-02 14:09:14.000000000 +0000 @@ -1,7 +1,5 @@ #!/bin/sh -set -e - if [ "$1" = "purge" ] ; then update-rc.d drbd remove >/dev/null fi diff -Nru drbd8-8.3.7/debian/drbd8-utils.prerm drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.prerm --- drbd8-8.3.7/debian/drbd8-utils.prerm 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/drbd8-utils.prerm 2012-02-02 14:09:14.000000000 +0000 @@ -13,9 +13,11 @@ then case "$1" in remove|deconfigure) - # we don't stop the drbd resources as it's not absolutely needed during - # a utils upgrade and it would mean that unattended upgrades cause - # outages + /usr/sbin/invoke-rc.d drbd stop || true + ;; + + upgrade|failed-upgrade) + /usr/sbin/invoke-rc.d drbd stop ;; *) diff -Nru drbd8-8.3.7/debian/modass.drbd8-module-source drbd8-8.4.1+git55a81dc~cmd1/debian/modass.drbd8-module-source --- drbd8-8.3.7/debian/modass.drbd8-module-source 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/modass.drbd8-module-source 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,16 @@ +#!/bin/sh +# +# (c) Eduard Bloch , 2003 +# generic maintainer script for module-assistant controled packages +# to be sourced or copied as example code + +# autodetecting values. They may be overriden by the caller. + +MA_DIR=${MA_DIR:-/usr/share/modass} + +TARBALL=/usr/src/drbd8.tar.gz +BUILDDIR=${MODULE_LOC:-/usr/src/modules}/drbd + +. 
$MA_DIR/packages/generic.sh + +$1 "$@" diff -Nru drbd8-8.3.7/debian/modass.drbd8-source drbd8-8.4.1+git55a81dc~cmd1/debian/modass.drbd8-source --- drbd8-8.3.7/debian/modass.drbd8-source 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/modass.drbd8-source 1970-01-01 00:00:00.000000000 +0000 @@ -1,247 +0,0 @@ -#!/bin/sh -# -# (c) Eduard Bloch , 2003 -# generic maintainer script for module-assistant controled packages -# to be sourced or copied as example code - -# autodetecting values. They may be overriden by the caller. - -MA_DIR=${MA_DIR:-/usr/share/modass} - -TARBALL=/usr/src/drbd8.tar.bz2 -BUILDDIR=${MODULE_LOC:-/usr/src/modules}/drbd8 - -guess_source=${MA_SOURCE_PKG:-`basename $0`} -export guess_source - -guess_package=${guess_source%-src} -guess_package=${guess_package%-source} -guess_package=${guess_package%-modules} -guess_package=${guess_package%-driver} -guess_package=${guess_package%-kernel} -export guess_package - -topdir=${MOD_TOPDIR:-/usr/src} - -if test -n "$MA_DEBUG" ; then - set -x -fi - -if [ "$TARBALL" ] ; then - tarball="$TARBALL" -else - for suf in .tar.bz2 .tar.gz .tgz ; do - for presuf in "" -module -modules -driver -drivers -source -src -kernel-source -kernel-src ; do - if [ -r "$MOD_SRCDIR" -a -e "$MOD_SRCDIR/$guess_package$presuf$suf" ] ; then - tarball=$MOD_SRCDIR/$guess_package$presuf$suf - break 2; - fi - if [ -e /usr/src/$guess_package$presuf$suf ] ; then - tarball=/usr/src/$guess_package$presuf$suf - break 2; - fi - done - done -fi - -MODULE_LOC=${MODULE_LOC:-/usr/src/modules} -builddir_base=${BUILDDIR:-$MODULE_LOC/$guess_package} -pkgprefix=${PKGPREFIX:-$guess_package} # target base name something like sl-modules -sourcepkg=${SOURCEPKG:-$guess_source} # installed package that provides the source - -MA_VARDIR=${MA_VARDIR:-/var/cache/modass} - -if [ `id -u` != 0 ] ; then - if test -n "$ROOT_CMD" ; then - : - else - if which fakeroot >/dev/null 2>&1 ; then - ROOT_CMD=fakeroot - else - clear - echo - echo 
Warning, you are not root and fakeroot is not installed - sleep 3 - fi - fi -fi - -# and better not export ROOT_CMD, the targets in debian/rules do not -# need to run fakeroot inside fakeroot - -action () { - if [ "$VERBOSE" ] ; then - echo " $@" >&2 - "$@" - elif [ "$DRYRUN" ] ; then - echo " $@" >&2 - else - "$@" - fi -} - -locate_dir () { - for suf in "" -module -modules -driver -drivers -source -src -kernel-source -kernel-src -module-source -module-src -kernel; do - if [ -d "$builddir_base$suf/" ] ; then - builddir=$builddir_base$suf - return 0; - fi - done - return 1; -} - -locate_dir - -update () { - export sourcepkg - # action $dpkg -s $sourcepkg 2>/dev/null | grep ^Version: | cut -f2 -d\ > \ - # $MA_VARDIR/cache/$pkgprefix.cur_version|| rm $MA_VARDIR/cache/$pkgprefix.cur_version - - if test -e $MA_VARDIR/$sourcepkg.apt_policy ; then - newinfo=`cat $MA_VARDIR/$sourcepkg.apt_policy` - else - newinfo=`LANG=C apt-cache policy $sourcepkg 2>/dev/null` - fi - IFS='' - if test "$newinfo" ; then - export newinfo - echo -n $newinfo |tr -s " " | grep Candidate: | cut -f3 -d\ | tr -d '\n' > \ - $MA_VARDIR/$sourcepkg.avail_version - instvers=$(echo -n $newinfo |tr -s " " | grep Installed: | cut -f3 -d\ | tr -d '\n') - if [ "$instvers" = "(none)" ] ; then - rm -f $MA_VARDIR/$sourcepkg.cur_version - else - echo -n $instvers > $MA_VARDIR/$sourcepkg.cur_version - fi - else - rm -f $MA_VARDIR/$sourcepkg.avail_version $MA_VARDIR/$sourcepkg.cur_version - fi -} - -cur_version() { - cat $MA_VARDIR/$sourcepkg.cur_version 2>/dev/null -} - -avail_version() { - cat $MA_VARDIR/$sourcepkg.avail_version -} - -build() { - shift - eval `echo "$@" | tr ' ' '\n' | grep "KVERS\|KSRC\|KDREV"` 2>&1 - logfile=$MA_VARDIR/$sourcepkg.buildlog.$KVERS.`date +%s` - flag=$MA_VARDIR/$sourcepkg.flag.$KVERS.`date +%s` - export KVERS KDREV KSRC MA_VARDIR logfile flag - - ( echo Build log starting, file: $logfile ; - echo Date: `date -R` ; - echo ; - ) > $logfile - - if test -z "$builddir" || ! 
test -d $builddir ; then - if ! unpack || ! locate_dir ; then - echo "Source not found. Run: module-assistant auto-install" | tee $logfile - exit 1; - fi - fi - cd $builddir || exit 1 - - action $ROOT_CMD debian/rules kdist_clean | tee $logfile || true - - # bash cannot evaluate the return codes of the command in pipe, so - # make this groovy workaround. I have tried flag process and tail - # constructs, they all suck - - ( touch $flag && action $ROOT_CMD debian/rules "$@" 2>&1 || rm $flag - ) | tee -a $logfile - - # if flag has survived, okay, otherwise sth. failed - if test -f $flag ; then - file=`action $ROOT_CMD debian/rules echo-debfile 2>/dev/null` - if test -n "$file" && test -r "$file" ; then - echo "$file" >> $MA_VARDIR/$sourcepkg.buildstate.$KVERS - else - perl -mCwd -pe 'if (/^dpkg-deb/) { s,\.deb.*\n,.deb,; s,.*\p{Zs}[^\w./]+,,; s,//,/,g; $_=Cwd::abs_path($_)."\n";} else {undef $_}' $logfile >> $MA_VARDIR/$sourcepkg.buildstate.$KVERS - fi - # extra stuff - begin=`ls -l --time-style=+%s $flag | tr -s ' ' | cut -f6 -d\ ` - echo Build time: $(expr $(date +%s) - $begin) seconds >> $logfile - rm -f $flag - else - tput smso ; echo BUILD FAILED! ; - tput rmso ; echo See $logfile for details. - exit 1; - fi -} - -lastpkg() { -# assume that KVERS is in the environment - action tail -n1 $MA_VARDIR/$sourcepkg.buildstate.$KVERS 2>/dev/null -} - -unpack() { - - if test -n "$MA_NOTUNP" ; then return 0 ; fi - - cd $topdir || exit 1 -# test -r $target || return 1 - if test -e "$tarball" ; then - if ! test -r "$tarball" ; then - echo "Could not read $tarball!" - exit 1 - fi - else - echo "The source tarball could not be found!" - echo "Package $sourcepkg not installed?" - echo "Running \"m-a -f get $sourcepkg\" may help." - exit 1 - fi - echo Extracting the package tarball, $tarball, please wait... 
- if [ ${tarball%gz} != $tarball ] ; then - action tar --gzip -x -f $tarball - elif [ ${tarball%bz2} != $tarball ] ; then - action action tar --bzip2 -x -f $tarball - else - echo Unknown compression method, $tarball - exit 1 - fi - - cd /usr/src/modules/drbd8/drbd/ - ../scripts/adjust_drbd_config_h.sh - -} - -download() { - action $ROOT_CMD apt-get $REINSTALL install $sourcepkg -} - -# wipes the builddir -clean() { - rm -rf $builddir -} - -purge() { - action rm -f `cat $MA_VARDIR/$sourcepkg.buildstate.*` - action rm -rf $builddir $MA_VARDIR/$sourcepkg.* -} - -installed() { -# action test "`cat $MA_VARDIR/$pkgprefix.cur_version` 2>/dev/null" -# exit $? - test -s $MA_VARDIR/$sourcepkg.cur_version && test -e $tarball - exit $? -} - -prefix() { - echo $pkgprefix -} - -echodebfile() { - eval `echo "$@" | tr ' ' '\n' | grep "KVERS\|KDREV\|KSRC"` - logfile=$MA_VARDIR/$sourcepkg.buildlog.$KVERS.`date +%s` - export KVERS KDREV KSRC MA_VARDIR - cd $builddir 2>/dev/null || exit 1 - $ROOT_CMD debian/rules echo-debfile 2>/dev/null -} - -$1 "$@" diff -Nru drbd8-8.3.7/debian/patches/00list drbd8-8.4.1+git55a81dc~cmd1/debian/patches/00list --- drbd8-8.3.7/debian/patches/00list 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/patches/00list 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -#10_different-kernels.dpatch diff -Nru drbd8-8.3.7/debian/patches/10_different-kernels.dpatch drbd8-8.4.1+git55a81dc~cmd1/debian/patches/10_different-kernels.dpatch --- drbd8-8.3.7/debian/patches/10_different-kernels.dpatch 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/patches/10_different-kernels.dpatch 1970-01-01 00:00:00.000000000 +0000 @@ -1,20 +0,0 @@ -#! /bin/sh /usr/share/dpatch/dpatch-run -## 10_different-kernels.dpatch by Michael Prokop -## -## All lines beginning with `## DP:' are a description of the patch. 
-## DP: Fix build with older kernels - -@DPATCH@ ---- drbd8-8.3.2~rc2.orig/drbd/linux/drbd_config.h 2009-06-25 15:13:04.000000000 +0200 -+++ drbd8-8.3.2~rc2/drbd/linux/drbd_config.h 2009-06-26 09:59:46.000000000 +0200 -@@ -79,6 +79,10 @@ - - /* 2.6.29 and up no longer have swabb.h */ - //#define HAVE_LINUX_BYTEORDER_SWABB_H -+#include -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) -+ #define HAVE_LINUX_BYTEORDER_SWABB_H -+#endif - - /* Some vendor kernels < 2.6.7 might define msleep in one or - * another way .. */ diff -Nru drbd8-8.3.7/debian/rules drbd8-8.4.1+git55a81dc~cmd1/debian/rules --- drbd8-8.3.7/debian/rules 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/rules 2012-02-02 14:09:14.000000000 +0000 @@ -14,7 +14,7 @@ # This has to be exported to make some magic below work. export DH_OPTIONS -PACKAGE=drbd8 +PACKAGE=drbd8-module MA_DIR ?= /usr/share/modass -include $(MA_DIR)/include/generic.make -include $(MA_DIR)/include/common-rules.make @@ -30,26 +30,9 @@ INSTALL_PROGRAM += -s endif -DEB_VERSION ?= $(shell dpkg-parsechangelog | egrep '^Version:' | cut -f 2 -d ' ') -DEB_NOEPOCH_VERSION ?= $(shell echo $(DEB_VERSION) | cut -d: -f2-) -DEB_UPSTREAM_VERSION ?= $(shell echo $(DEB_NOEPOCH_VERSION) | cut -d- -f1) - -# module-assistant stuff -MAJOR=$(shell echo $(KVERS) | sed -e 's/\(...\).*/\1/') -ifeq ($(MAJOR),2.6) -KO=k -endif - -ifeq ($(DEB_BUILD_ARCH),i386) -ifeq ($(KPKG_ARCH),amd64) -KBUILD_PARAMS := "CROSS_COMPILE=amd64-linux- ARCH=x86_64" -CC=amd64-linux-gcc -endif -endif - -kdist_clean: +kdist_clean: unpatch dh_clean - $(MAKE) -C $(KSRC) M=$(CURDIR)/drbd/ clean + -$(MAKE) -C drbd clean # prep-deb-files rewrites the debian/ files as needed. 
See RATIONALE for # details @@ -62,31 +45,30 @@ dh_testdir dh_testroot dh_clean -k - make -C $(KSRC) M=$(CURDIR)/drbd/ modules - #install -m644 -b -D drbd/drbd.$(KO)o $(CURDIR)/debian/$(PKGNAME)/lib/modules/$(KVERS)/kernel/extra/drbd.$(KO)o - # this is broken, dunno why: - #make -C $(KSRC) M=$(CURDIR)/drbd/ modules_install INSTALL_MOD_PATH=$(CURDIR)/debian/$(PKGNAME) INSTALL_MOD_DIR=extra/ + $(MAKE) -C drbd KERNEL_SOURCES=$(KSRC) MODVERSIONS=detect KERNEL=linux-$(KVERS) KDIR=$(KSRC) + install -m644 -b -D drbd/drbd.ko $(CURDIR)/debian/$(PKGNAME)/lib/modules/$(KVERS)/updates/drbd.ko dh_installdocs dh_installchangelogs dh_compress dh_fixperms - dh_installmodules dh_installdeb dh_gencontrol -- -v$(VERSION) dh_md5sums dh_builddeb --destdir=$(DEB_DESTDIR) #Architecture -configure: patch - ./configure --prefix=/usr --localstatedir=/var --sysconfdir=/etc --with-utils --with-udev --with-xen --with-pacemaker --with-rgmanager --with-bashcompletion - -build: configure build-arch build-indep - +build: patch build-arch build-indep build-arch: build-arch-stamp -build-arch-stamp: patch +build-arch-stamp: dh_testdir - make +# build this first, so user/drbd_buildtag.c does not use the stale thing from the tgz + [ -f configure ] || ( autoheader && autoconf ) + ./configure --prefix=/usr --localstatedir=/var --sysconfdir=/etc + $(MAKE) drbd/drbd_buildtag.c + $(MAKE) -C user + $(MAKE) -C scripts + $(MAKE) -C documentation doc touch build-arch-stamp build-indep: build-indep-stamp @@ -99,11 +81,10 @@ rm -f build-arch-stamp build-indep-stamp #CONFIGURE-STAMP# # remove these files from upstream tgz rm -f debian/drbd8.* - #rm -f debian/drbd8-source.dirs + rm -f debian/drbd8-source.dirs rm -f debian/kernel-patch-wup.kpatches debian/kernel-patch-wup.README.Debian debian/kernel-export-wup.patch -$(MAKE) -C user clean -$(MAKE) -C scripts clean - -$(MAKE) -C benchmark clean -$(MAKE) -C documentation clean -$(MAKE) -C drbd clean dh_clean @@ -114,28 +95,18 @@ dh_testroot dh_clean -k -i 
dh_installdirs -i - cp -a drbd/* debian/$(PACKAGE)-source/usr/src/modules/drbd8/drbd/ - rm debian/$(PACKAGE)-source/usr/src/modules/drbd8/drbd/Makefile \ - debian/$(PACKAGE)-source/usr/src/modules/drbd8/drbd/Makefile-2.6 - cp debian/drbd8-source.Makefile debian/$(PACKAGE)-source/usr/src/modules/drbd8/Makefile - cp debian/drbd8-source.drbd-Makefile debian/$(PACKAGE)-source/usr/src/modules/drbd8/drbd/Makefile - cp drbd/linux/drbd_config.h debian/$(PACKAGE)-source/usr/src/modules/drbd8 - cp -a scripts/adjust_drbd_config_h.sh debian/$(PACKAGE)-source/usr/src/modules/drbd8/scripts/ - + cp -a drbd/* debian/$(PACKAGE)-source/usr/src/modules/drbd/drbd + cp Makefile debian/$(PACKAGE)-source/usr/src/modules/drbd + cp drbd/linux/drbd_config.h debian/$(PACKAGE)-source/usr/src/modules/drbd + mkdir debian/$(PACKAGE)-source/usr/src/modules/drbd/scripts/ + # install debian/ files - mkdir -p debian/$(PACKAGE)-source/usr/src/modules/drbd8/debian/ - cd debian ; cp changelog control compat *.modules.in rules copyright $(PACKAGE)-source/usr/src/modules/drbd8/debian - + cd debian ; cp changelog control compat *.modules.in rules copyright $(PACKAGE)-source/usr/src/modules/drbd/debian + # tar the stuff - cd debian/$(PACKAGE)-source/usr/src/ ; tar cjpvf drbd8.tar.bz2 modules ; rm -rf modules - - #install -m 755 $(CURDIR)/debian/modass.drbd8-source $(CURDIR)/debian/$(PACKAGE)-source/usr/share/modass/overrides/drbd8-source - - rm debian/$(PACKAGE)-source/usr/src/drbd8.tar.bz2 - mkdir -p debian/$(PACKAGE)-source/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION)/ - cp -a drbd debian/$(PACKAGE)-source/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION)/ - cp debian/dkms.conf debian/$(PACKAGE)-source/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION)/ - sed -i -e 's/xxxVERSIONxxx/$(DEB_UPSTREAM_VERSION)/g' debian/$(PACKAGE)-source/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION)/dkms.conf + cd debian/$(PACKAGE)-source/usr/src/ ; tar pzfvc drbd8.tar.gz modules ; rm -rf modules + + install -m 755 
$(CURDIR)/debian/modass.drbd8-module-source $(CURDIR)/debian/$(PACKAGE)-source/usr/share/modass/overrides/drbd8-module-source dh_install -i install-arch: @@ -143,10 +114,11 @@ dh_testroot dh_clean -k -s dh_installdirs -s + $(MAKE) DESTDIR=$(CURDIR)/debian/drbd8-utils/ -C user install + $(MAKE) DESTDIR=$(CURDIR)/debian/drbd8-utils/ -C scripts install + $(MAKE) DESTDIR=$(CURDIR)/debian/drbd8-utils/ -C documentation install - $(MAKE) DESTDIR=$(CURDIR)/debian/drbd8-utils/ install - dh_install --source=debian/drbd8-utils --list-missing -s - + dh_install -s # Must not depend on anything. This is to be called by # binary-arch/binary-indep # in another 'make' thread. @@ -156,22 +128,12 @@ dh_installchangelogs ChangeLog dh_installdocs dh_installexamples -# dh_installmenu -# dh_installdebconf -# dh_installlogrotate -# dh_installemacsen -# dh_installpam -# dh_installmime # dh_installinit -# dh_installcron -# dh_installinfo dh_installman dh_link dh_strip dh_compress dh_fixperms -# dh_perl -# dh_python dh_makeshlibs dh_installdeb dh_shlibdeps @@ -187,4 +149,4 @@ $(MAKE) -f debian/rules DH_OPTIONS=-a binary-common binary: binary-arch binary-indep -.PHONY: build clean binary-indep binary-arch binary install install-indep install-arch unpatch +.PHONY: build clean binary-indep binary-arch binary install install-indep install-arch patch unpatch diff -Nru drbd8-8.3.7/debian/substvars drbd8-8.4.1+git55a81dc~cmd1/debian/substvars --- drbd8-8.3.7/debian/substvars 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/substvars 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1 @@ +shlibs:Depends=libc6 (>= 2.3.2.ds1-4) diff -Nru drbd8-8.3.7/debian/watch drbd8-8.4.1+git55a81dc~cmd1/debian/watch --- drbd8-8.3.7/debian/watch 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/debian/watch 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -version=3 -http://oss.linbit.com/drbd/ 8.3/drbd-(.*).tar.gz diff -Nru drbd8-8.3.7/documentation/Makefile 
drbd8-8.4.1+git55a81dc~cmd1/documentation/Makefile --- drbd8-8.3.7/documentation/Makefile 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/Makefile 2012-09-03 22:37:14.000000000 +0000 @@ -29,7 +29,6 @@ BASH_COMPLETION_SUFFIX = UDEV_RULE_SUFFIX = INITDIR = /etc/init.d -KDIR ?= LIBDIR = /usr/lib/drbd CC = gcc CFLAGS = -Wall -g -O2 @@ -42,7 +41,7 @@ WITH_XEN = yes WITH_PACEMAKER = yes WITH_HEARTBEAT = yes -WITH_RGMANAGER = yes +WITH_RGMANAGER = no WITH_BASHCOMPLETION = yes # variables meant to be overridden from the make command line @@ -66,6 +65,15 @@ XSLTPROC_HTML_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_FO_OPTIONS ?= $(XSLTPROC_OPTIONS) +DRBDSETUP_CMDS = new-resource new-minor del-resource del-minor +DRBDSETUP_CMDS += attach connect disk-options net-options resource-options +DRBDSETUP_CMDS += disconnect detach primary secondary verify invalidate invalidate-remote +DRBDSETUP_CMDS += down wait-connect wait-sync role cstate dstate +DRBDSETUP_CMDS += resize check-resize pause-sync resume-sync +DRBDSETUP_CMDS += outdate show-gi get-gi show events +DRBDSETUP_CMDS += suspend-io resume-io new-current-uuid + + all: @echo "To (re)make the documentation: make doc" @@ -102,9 +110,19 @@ $(FO_STYLESHEET) $< endif +../user/drbdsetup: + (cd ../user; make drbdsetup) + +drbdsetup_xml-help_%.xml: ../user/drbdsetup + ../user/drbdsetup xml-help $* > $@ + +drbdsetup_%.xml: drbdsetup_xml-help_%.xml xml-usage-to-docbook.xsl + $(XSLTPROC) -o $@ xml-usage-to-docbook.xsl $< + distclean: rm -f *.[58] manpage.links manpage.refs *~ manpage.log rm -f *.ps.gz *.pdf *.ps *.html pod2htm* + rm -f drbdsetup_*.xml ####### @@ -131,5 +149,4 @@ ps: $(SOURCES:.xml=.ps) - - +drbdsetup.8: drbdsetup.xml $(patsubst %,drbdsetup_%.xml,$(DRBDSETUP_CMDS)) diff -Nru drbd8-8.3.7/documentation/Makefile.in drbd8-8.4.1+git55a81dc~cmd1/documentation/Makefile.in --- drbd8-8.3.7/documentation/Makefile.in 2010-01-07 09:09:33.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/documentation/Makefile.in 2012-02-02 14:09:14.000000000 +0000 @@ -29,7 +29,6 @@ BASH_COMPLETION_SUFFIX = @BASH_COMPLETION_SUFFIX@ UDEV_RULE_SUFFIX = @UDEV_RULE_SUFFIX@ INITDIR = @INITDIR@ -KDIR ?= @KDIR@ LIBDIR = @prefix@/lib/@PACKAGE_TARNAME@ CC = @CC@ CFLAGS = @CFLAGS@ @@ -66,6 +65,15 @@ XSLTPROC_HTML_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_FO_OPTIONS ?= $(XSLTPROC_OPTIONS) +DRBDSETUP_CMDS = new-resource new-minor del-resource del-minor +DRBDSETUP_CMDS += attach connect disk-options net-options resource-options +DRBDSETUP_CMDS += disconnect detach primary secondary verify invalidate invalidate-remote +DRBDSETUP_CMDS += down wait-connect wait-sync role cstate dstate +DRBDSETUP_CMDS += resize check-resize pause-sync resume-sync +DRBDSETUP_CMDS += outdate show-gi get-gi show events +DRBDSETUP_CMDS += suspend-io resume-io new-current-uuid + + all: @echo "To (re)make the documentation: make doc" @@ -102,9 +110,19 @@ $(FO_STYLESHEET) $< endif +../user/drbdsetup: + (cd ../user; make drbdsetup) + +drbdsetup_xml-help_%.xml: ../user/drbdsetup + ../user/drbdsetup xml-help $* > $@ + +drbdsetup_%.xml: drbdsetup_xml-help_%.xml xml-usage-to-docbook.xsl + $(XSLTPROC) -o $@ xml-usage-to-docbook.xsl $< + distclean: rm -f *.[58] manpage.links manpage.refs *~ manpage.log rm -f *.ps.gz *.pdf *.ps *.html pod2htm* + rm -f drbdsetup_*.xml ####### @@ -131,5 +149,4 @@ ps: $(SOURCES:.xml=.ps) - - +drbdsetup.8: drbdsetup.xml $(patsubst %,drbdsetup_%.xml,$(DRBDSETUP_CMDS)) diff -Nru drbd8-8.3.7/documentation/drbd.8 drbd8-8.4.1+git55a81dc~cmd1/documentation/drbd.8 --- drbd8-8.3.7/documentation/drbd.8 2010-01-13 16:17:24.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbd.8 2012-02-02 14:09:57.000000000 +0000 @@ -1,7 +1,7 @@ '\" t .\" Title: drbd .\" Author: [see the "Author" section] -.\" Generator: DocBook XSL Stylesheets v1.75.1 +.\" Generator: DocBook XSL Stylesheets v1.75.2 .\" Date: 15 Oct 2008 .\" Manual: System Administration .\" Source: DRBD 
8.3.2 @@ -9,6 +9,15 @@ .\" .TH "DRBD" "8" "15 Oct 2008" "DRBD 8.3.2" "System Administration" .\" ----------------------------------------------------------------- +.\" * Define some portability stuff +.\" ----------------------------------------------------------------- +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.\" http://bugs.debian.org/507673 +.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" ----------------------------------------------------------------- .\" * set default formatting .\" ----------------------------------------------------------------- .\" disable hyphenation diff -Nru drbd8-8.3.7/documentation/drbd.conf.5 drbd8-8.4.1+git55a81dc~cmd1/documentation/drbd.conf.5 --- drbd8-8.3.7/documentation/drbd.conf.5 2010-01-13 16:17:23.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbd.conf.5 2012-02-02 14:09:56.000000000 +0000 @@ -1,13 +1,22 @@ '\" t .\" Title: drbd.conf .\" Author: [see the "Author" section] -.\" Generator: DocBook XSL Stylesheets v1.75.1 -.\" Date: 5 Dec 2008 +.\" Generator: DocBook XSL Stylesheets v1.75.2 +.\" Date: 6 May 2011 .\" Manual: Configuration Files -.\" Source: DRBD 8.3.2 +.\" Source: DRBD 8.4.0 .\" Language: English .\" -.TH "DRBD\&.CONF" "5" "5 Dec 2008" "DRBD 8.3.2" "Configuration Files" +.TH "DRBD\&.CONF" "5" "6 May 2011" "DRBD 8.4.0" "Configuration Files" +.\" ----------------------------------------------------------------- +.\" * Define some portability stuff +.\" ----------------------------------------------------------------- +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.\" http://bugs.debian.org/507673 +.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' .\" 
----------------------------------------------------------------- .\" * set default formatting .\" ----------------------------------------------------------------- @@ -19,7 +28,7 @@ .\" * MAIN CONTENT STARTS HERE * .\" ----------------------------------------------------------------- .SH "NAME" -drbd.conf \- Configuration file for DRBD\'s devices .\" drbd.conf +drbd.conf \- Configuration file for DRBD\*(Aqs devices .\" drbd.conf .SH "INTRODUCTION" .PP The file @@ -32,24 +41,29 @@ should be the same on both nodes of the cluster\&. Changes to \fB/etc/drbd\&.conf\fR do not apply immediately\&. -.PP \fBExample\ \&1.\ \&A small drbd.conf file\fR .sp .if n \{\ .RS 4 .\} .nf global { usage\-count yes; } common { syncer { rate 10M; } } resource r0 { protocol C; net { cram\-hmac\-alg sha1; shared\-secret "FooFunFactory"; } on alice { device minor 1; disk /dev/sda7; address 10\&.1\&.1\&.31:7789; meta\-disk internal; } on bob { device minor 1; disk /dev/sda7; address 10\&.1\&.1\&.32:7789; meta\-disk internal; } } .fi .if n \{\ .RE .\} -In this example, there is a single DRBD resource (called r0) which uses protocol C for the connection between its devices\&. The device which runs on host +.PP +By convention the main config contains two include statements\&. The first one includes the file +\fB/etc/drbd\&.d/global_common\&.conf\fR, the second one all file with a +\fB\&.res\fR +suffix\&. +.PP +.PP \fBExample\ \&1.\ \&A small example.res file\fR .sp .if n \{\ .RS 4 .\} .nf resource r0 { net { protocol C; cram\-hmac\-alg sha1; shared\-secret "FooFunFactory"; } disk { resync\-rate 10M; } on alice { volume 0 { device minor 1; disk /dev/sda7; meta\-disk internal; } address 10\&.1\&.1\&.31:7789; } on bob { volume 0 { device minor 1; disk /dev/sda7; meta\-disk internal; } address 10\&.1\&.1\&.32:7789; } } .fi .if n \{\ .RE .\}In this example, there is a single DRBD resource (called r0) which uses protocol C for the connection between its devices\&. 
It contains a single volume which runs on host \fIalice\fR uses \fI/dev/drbd1\fR as devices for its application, and \fI/dev/sda7\fR -as low\-level storage for the data\&. The IP addresses are used to specify the networking interfaces to be used\&. An eventually running resync process should use about 10MByte/second of IO bandwidth\&. +as low\-level storage for the data\&. The IP addresses are used to specify the networking interfaces to be used\&. An eventually running resync process should use about 10MByte/second of IO bandwidth\&. This sync\-rate statement is valid for volume 0, but would also be valid for further volumes\&. In this example it assigns full 10MByte/second to each volume\&. .PP There may be multiple resource sections in a single drbd\&.conf file\&. For more examples, please have a look at the -\m[blue]\fBDRBD User\'s Guide\fR\m[]\&\s-2\u[1]\d\s+2\&. +\m[blue]\fBDRBD User\*(Aqs Guide\fR\m[]\&\s-2\u[1]\d\s+2\&. .SH "FILE FORMAT" .PP The file consists of sections and parameters\&. A section begins with a keyword, sometimes an additional name, and an opening brace (\(lq{\(rq)\&. A section ends with a closing brace (\(lq}\(rq\&. The braces enclose the parameters\&. .PP section [name] { parameter value; [\&.\&.\&.] } .PP -A parameter starts with the identifier of the parameter followed by whitespace\&. Every subsequent character is considered as part of the parameter\'s value\&. A special case are Boolean parameters which only consist of the identifier\&. Parameters are terminated by a semicolon (\(lq;\(rq)\&. +A parameter starts with the identifier of the parameter followed by whitespace\&. Every subsequent character is considered as part of the parameter\*(Aqs value\&. A special case are Boolean parameters which consist only of the identifier\&. Parameters are terminated by a semicolon (\(lq;\(rq)\&. .PP Some parameter values have default units which might be overruled by K, M or G\&. 
These units are defined in the usual way (K = 2^10 = 1024, M = 1024 K, G = 1024 M)\&. .PP @@ -61,7 +75,7 @@ .\" drbd.conf: skip Comments out chunks of text, even spanning more than one line\&. Characters between the keyword \fBskip\fR -and the opening brace (\(lq{\(rq) are ignored\&. Everything enclosed by the braces is skipped\&. This comes in handy, if you just want to comment out some \'resource [name] {\&.\&.\&.}\' section: just precede it with \'skip\'\&. +and the opening brace (\(lq{\(rq) are ignored\&. Everything enclosed by the braces is skipped\&. This comes in handy, if you just want to comment out some \*(Aq\fBresource [name] {\&.\&.\&.}\fR\*(Aq section: just precede it with \*(Aq\(lqskip\(rq\*(Aq\&. .RE .PP \fBglobal\fR @@ -81,7 +95,7 @@ .\" drbd.conf: common All resources inherit the options set in this section\&. The common section might have a \fBstartup\fR, a -\fBsyncer\fR, a +\fBoptions\fR, a \fBhandlers\fR, a \fBnet\fR and a @@ -96,13 +110,13 @@ \fBon \fR\fB\fIhost\fR\fR sections and may have a \fBstartup\fR, a -\fBsyncer\fR, a +\fBoptions\fR, a \fBhandlers\fR, a \fBnet\fR and a \fBdisk\fR -section\&. Required parameter in this section: -\fBprotocol\fR\&. +section\&. It might contain +\fBvolume\fRs sections\&. .RE .PP \fBon \fR\fB\fIhost\-name\fR\fR @@ -110,7 +124,7 @@ .\" drbd.conf: on Carries the necessary configuration parameters for a DRBD device of the enclosing resource\&. \fIhost\-name\fR -is mandatory and must match the Linux host name (uname \-n) of one of the nodes\&. You may list more than one host name here, in case you want to use the same parameters on several hosts (you\'d have to move the IP around usually)\&. Or you may list more than two such sections\&. +is mandatory and must match the Linux host name (uname \-n) of one of the nodes\&. You may list more than one host name here, in case you want to use the same parameters on several hosts (you\*(Aqd have to move the IP around usually)\&. 
Or you may list more than two such sections\&. .sp .if n \{\ .RS 4 @@ -142,12 +156,19 @@ .sp See also the \fBfloating\fR -section keyword\&. Required parameters in this section: +section keyword\&. Required statements in this section: +\fBaddress\fR +and +\fBvolume\fR\&. Note for backward compatibility and convenience it is valid to embed the statements of a single volume directly into the host section\&. +.RE +.PP +\fBvolume \fR\fB\fIvnr\fR\fR +.RS 4 +.\" drbd.conf: volume +Defines a volume within a connection\&. The minor numbers of a replicated volume might be different on different hosts, the volume number (\fIvnr\fR) is what groups them together\&. Required parameters in this section: \fBdevice\fR, \fBdisk\fR, -\fBaddress\fR, -\fBmeta\-disk\fR, -\fBflexible\-meta\-disk\fR\&. +\fBmeta\-disk\fR\&. .RE .PP \fBstacked\-on\-top\-of \fR\fB\fIresource\fR\fR @@ -173,8 +194,7 @@ section is that the matching of the host sections to machines is done by the IP\-address instead of the node name\&. Required parameters in this section: \fBdevice\fR, \fBdisk\fR, -\fBmeta\-disk\fR, -\fBflexible\-meta\-disk\fR, all of which +\fBmeta\-disk\fR, all of which \fImay\fR be inherited from the resource section, in which case you may shorten this section down to just the address identifier\&. .sp @@ -202,32 +222,41 @@ .if n \{\ .RE .\} -.sp .RE .PP \fBdisk\fR .RS 4 .\" drbd.conf: disk -This section is used to fine tune DRBD\'s properties in respect to the low level storage\&. Please refer to +This section is used to fine tune DRBD\*(Aqs properties in respect to the low level storage\&. Please refer to \fBdrbdsetup\fR(8) -for detailed description of the parameters\&. Optional parameter: +for detailed description of the parameters\&. Optional parameters: \fBon\-io\-error\fR, \fBsize\fR, \fBfencing\fR, -\fBuse\-bmbv\fR, -\fBno\-disk\-barrier\fR, -\fBno\-disk\-flushes\fR, -\fBno\-disk\-drain\fR, -\fBno\-md\-flushes\fR, -\fBmax\-bio\-bvecs\fR\&. 
+\fBdisk\-barrier\fR, +\fBdisk\-flushes\fR, +\fBdisk\-drain\fR, +\fBmd\-flushes\fR, +\fBmax\-bio\-bvecs\fR, +\fBresync\-rate\fR, +\fBresync\-after\fR, +\fBal\-extents\fR, +\fBc\-plan\-ahead\fR, +\fBc\-fill\-target\fR, +\fBc\-delay\-target\fR, +\fBc\-max\-rate\fR, +\fBc\-min\-rate\fR, +\fBdisk\-timeout\fR, +\fBread\-balancing\fR\&. .RE .PP \fBnet\fR .RS 4 .\" drbd.conf: net -This section is used to fine tune DRBD\'s properties\&. Please refer to +This section is used to fine tune DRBD\*(Aqs properties\&. Please refer to \fBdrbdsetup\fR(8) -for a detailed description of this section\'s parameters\&. Optional parameters: +for a detailed description of this section\*(Aqs parameters\&. Optional parameters: +\fBprotocol\fR, \fBsndbuf\-size\fR, \fBrcvbuf\-size\fR, \fBtimeout\fR, @@ -244,15 +273,21 @@ \fBafter\-sb\-1pri\fR, \fBafter\-sb\-2pri\fR, \fBdata\-integrity\-alg\fR, -\fBno\-tcp\-cork\fR +\fBno\-tcp\-cork\fR, +\fBon\-congestion\fR, +\fBcongestion\-fill\fR, +\fBcongestion\-extents\fR, +\fBverify\-alg\fR, +\fBuse\-rle\fR, +\fBcsums\-alg\fR\&. .RE .PP \fBstartup\fR .RS 4 .\" drbd.conf: startup -This section is used to fine tune DRBD\'s properties\&. Please refer to +This section is used to fine tune DRBD\*(Aqs properties\&. Please refer to \fBdrbdsetup\fR(8) -for a detailed description of this section\'s parameters\&. Optional parameters: +for a detailed description of this section\*(Aqs parameters\&. Optional parameters: \fBwfc\-timeout\fR, \fBdegr\-wfc\-timeout\fR, \fBoutdated\-wfc\-timeout\fR, @@ -262,46 +297,104 @@ \fBbecome\-primary\-on\fR\&. .RE .PP -\fBsyncer\fR +\fBoptions\fR .RS 4 -.\" drbd.conf: syncer -This section is used to fine tune the synchronization daemon for the device\&. Please refer to +.\" drbd.conf: options +This section is used to fine tune the behaviour of the resource object\&. Please refer to \fBdrbdsetup\fR(8) -for a detailed description of this section\'s parameters\&. 
Optional parameters: -\fBrate\fR, -\fBafter\fR, -\fBal\-extents\fR, -\fBuse\-rle\fR, -\fBcpu\-mask\fR, -\fBverify\-alg\fR -and -\fBcsums\-alg\fR\&. +for a detailed description of this section\*(Aqs parameters\&. Optional parameters: +\fBcpu\-mask\fR, and +\fBon\-no\-data\-accessible\fR\&. .RE .PP \fBhandlers\fR .RS 4 .\" drbd.conf: handlers -In this section you can define handlers (executables) that are executed by the DRBD system in response to certain events\&. Optional parameters: +In this section you can define handlers (executables) that are started by the DRBD system in response to certain events\&. Optional parameters: \fBpri\-on\-incon\-degr\fR, \fBpri\-lost\-after\-sb\fR, \fBpri\-lost\fR, \fBfence\-peer\fR (formerly oudate\-peer), \fBlocal\-io\-error\fR, +\fBinitial\-split\-brain\fR, \fBsplit\-brain\fR, \fBbefore\-resync\-target\fR, \fBafter\-resync\-target\fR\&. +.sp +The interface is done via environment variables: +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +.sp -1 +.IP \(bu 2.3 +.\} +\fBDRBD_RESOURCE\fR +is the name of the resource +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +.sp -1 +.IP \(bu 2.3 +.\} +\fBDRBD_MINOR\fR +is the minor number of the DRBD device, in decimal\&. +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +.sp -1 +.IP \(bu 2.3 +.\} +\fBDRBD_CONF\fR +is the path to the primary configuration file; if you split your configuration into multiple files (e\&.g\&. in +\fB/etc/drbd\&.conf\&.d/\fR), this will not be helpful\&. +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +.sp -1 +.IP \(bu 2.3 +.\} +\fBDRBD_PEER_AF\fR +, +\fBDRBD_PEER_ADDRESS\fR +, +\fBDRBD_PEERS\fR +are the address family (e\&.g\&. +\fBipv6\fR), the peer\*(Aqs address and hostnames\&. +.RE +.sp + +\fBDRBD_PEER\fR +is deprecated\&. +.sp +Please note that not all of these might be set for all handlers, and that some values might not be useable for a +\fBfloating\fR +definition\&. 
.RE .SS "Parameters" .PP \fBminor\-count \fR\fB\fIcount\fR\fR .RS 4 .\" drbd.conf: minor-count\fIcount\fR -may be a number from 1 to 255\&. +may be a number from 1 to FIXME\&. .sp -Use -\fIminor\-count\fR -if you want to define massively more resources later without reloading the DRBD kernel module\&. Per default the module loads with 11 more resources than you have currently in your config but at least 32\&. +\fIMinor\-count\fR +is a sizing hint for DRBD\&. It helps to right\-size various memory pools\&. It should be set in the in the same order of magnitude than the actual number of minors you use\&. Per default the module loads with 11 more resources than you have currently in your config but at least 32\&. .RE .PP \fBdialog\-refresh \fR\fB\fItime\fR\fR @@ -321,14 +414,18 @@ .\" drbd.conf: disable-ip-verification Use \fIdisable\-ip\-verification\fR -if, for some obscure reasons, drbdadm can/might not use ip or ifconfig to do a sanity check for the IP address\&. You can disable the IP verification with this option\&. +if, for some obscure reasons, drbdadm can/might not use +\fBip\fR +or +\fBifconfig\fR +to do a sanity check for the IP address\&. You can disable the IP verification with this option\&. .RE .PP \fBusage\-count \fR\fB\fIval\fR\fR .RS 4 .\" drbd.conf: usage-count Please participate in -\m[blue]\fBDRBD\'s online usage counter\fR\m[]\&\s-2\u[2]\d\s+2\&. The most convenient way to do so is to set this option to +\m[blue]\fBDRBD\*(Aqs online usage counter\fR\m[]\&\s-2\u[2]\d\s+2\&. The most convenient way to do so is to set this option to \fByes\fR\&. Valid options are: \fByes\fR, \fBno\fR @@ -357,12 +454,12 @@ \fBdisk\fR parameter\&. .sp -One can ether ommit the +One can ether omit the \fIname\fR or \fBminor\fR and the -\fIminor number\fR\&. If you ommit the +\fIminor number\fR\&. If you omit the \fIname\fR a default of /dev/drbd\fIminor\fR will be used\&. 
@@ -373,7 +470,7 @@ \fBdisk \fR\fB\fIname\fR\fR .RS 4 .\" drbd.conf: disk -DRBD uses this block device to actually store and retrieve the data\&. Never access such a device while DRBD is running on top of it\&. This holds also true for +DRBD uses this block device to actually store and retrieve the data\&. Never access such a device while DRBD is running on top of it\&. This also holds true for \fBdumpe2fs\fR(8) and similar commands\&. .RE @@ -390,61 +487,62 @@ \fBipv6\fR, \fBssocks\fR or -\fBsdp\fR\&. (For compatibility reasons +\fBsdp\fR +(for compatibility reasons \fBsci\fR is an alias for -\fBssocks\fR) It may be ommited for IPv4 addresses\&. The actual IPv6 address that follows the +\fBssocks\fR)\&. It may be omited for IPv4 addresses\&. The actual IPv6 address that follows the \fBipv6\fR keyword must be placed inside brackets: ipv6 [fd01:2345:6789:abcd::1]:7800\&. .sp Each DRBD resource needs a TCP \fIport\fR -which is used to connect to the node\'s partner device\&. Two different DRBD resources may not use the same +which is used to connect to the node\*(Aqs partner device\&. Two different DRBD resources may not use the same \fIaddr:port\fR combination on the same node\&. .RE .PP -\fBmeta\-disk \fR\fB\fIinternal\fR\fR, \fBflexible\-meta\-disk \fR\fB\fIinternal\fR\fR, \fBmeta\-disk \fR\fB\fIdevice [index]\fR\fR, \fBflexible\-meta\-disk \fR\fB\fIdevice \fR\fR +\fBmeta\-disk internal\fR, \fBmeta\-disk \fR\fB\fIdevice\fR\fR, \fBmeta\-disk \fR\fB\fIdevice\fR\fR\fB [\fR\fB\fIindex\fR\fR\fB]\fR .RS 4 -.\" drbd.conf: meta-disk.\" drbd.conf: flexible-meta-disk -Internal means that the last part of the backing device is used to store the meta\-data\&. You must not use -\fI[index]\fR -with internal\&. Note: Regardless of whether you use the -\fBmeta\-disk\fR -or the -\fBflexible\-meta\-disk\fR -keyword, it will always be of the size needed for the remaining storage size\&. 
+.\" drbd.conf: meta-disk +Internal means that the last part of the backing device is used to store the meta\-data\&. The size of the meta\-data is computed based on the size of the device\&. .sp -You can use a single block +When a \fIdevice\fR -to store meta\-data of multiple DRBD devices\&. E\&.g\&. use meta\-disk /dev/sde6[0]; and meta\-disk /dev/sde6[1]; for two different resources\&. In this case the meta\-disk would need to be at least 256 MB in size\&. -.sp -With the -\fBflexible\-meta\-disk\fR -keyword you specify a block device as meta\-data storage\&. You usually use this with LVM, which allows you to have many variable sized block devices\&. The required size of the meta\-disk block device is 36kB + Backing\-Storage\-size / 32k\&. Round this number to the next 4kb boundary up and you have the exact size\&. Rule of the thumb: 32kByte per 1GByte of storage, round up to the next MB\&. +is specified, either with or without an +\fIindex\fR, DRBD stores the meta\-data on this device\&. Without +\fIindex\fR, the size of the meta\-data is determined by the size of the data device\&. This is usually used with LVM, which allows to have many variable sized block devices\&. The meta\-data size is 36kB + Backing\-Storage\-size / 32k, rounded up to the next 4kb boundary\&. (Rule of the thumb: 32kByte per 1GByte of storage, rounded up to the next MB\&.) +.sp +When an +\fIindex\fR +is specified, each index number refers to a fixed slot of meta\-data of 128 MB, which allows a maximum data size of 4 GB\&. This way, multiple DBRD devices can share the same meta\-data device\&. For example, if /dev/sde6[0] and /dev/sde6[1] are used, /dev/sde6 must be at least 256 MB big\&. Because of the hard size limit, use of meta\-disk indexes is discouraged\&. .RE .PP \fBon\-io\-error \fR\fB\fIhandler\fR\fR .RS 4 .\" drbd.conf: on-io-error\fIhandler\fR -is taken, if the lower level device reports io\-error to the upper layers\&. 
+is taken, if the lower level device reports io\-errors to the upper layers\&. .sp \fIhandler\fR -may be pass_on, call\-local\-io\-error or detach\&. +may be +\fBpass_on\fR, +\fBcall\-local\-io\-error\fR +or +\fBdetach\&.\fR .sp -pass_on: Report the io\-error to the upper layers\&. On Primary report it to the mounted file system\&. On Secondary ignore it\&. +\fBpass_on\fR: The node downgrades the disk status to inconsistent, marks the erroneous block as inconsistent in the bitmap and retries the IO on the remote node\&. .sp -call\-local\-io\-error: Call the handler script +\fBcall\-local\-io\-error\fR: Call the handler script \fBlocal\-io\-error\fR\&. .sp -detach: The node drops its low level device, and continues in diskless mode\&. +\fBdetach\fR: The node drops its low level device, and continues in diskless mode\&. .RE .PP \fBfencing \fR\fB\fIfencing_policy\fR\fR .RS 4 .\" drbd.conf: fencing -Under +By \fBfencing\fR we understand preventive measures to avoid situations where both nodes are primary and disconnected (AKA split brain)\&. .sp @@ -452,94 +550,135 @@ .PP \fBdont\-care\fR .RS 4 -This is the default policy\&. No fencing actions are undertaken\&. +This is the default policy\&. No fencing actions are taken\&. .RE .PP \fBresource\-only\fR .RS 4 -If a node becomes a disconnected primary, it tries to fence the peer\'s disk\&. This is done by calling the fence\-peer handler\&. The handler is supposed to reach the other node over alternative communication paths and call \'drbdadm outdate res\' there\&. +If a node becomes a disconnected primary, it tries to fence the peer\*(Aqs disk\&. This is done by calling the +\fBfence\-peer\fR +handler\&. The handler is supposed to reach the other node over alternative communication paths and call \*(Aq\fBdrbdadm outdate res\fR\*(Aq there\&. .RE .PP \fBresource\-and\-stonith\fR .RS 4 -If a node becomes a disconnected primary, it freezes all its IO operations and calls its fence\-peer handler\&. 
The fence\-peer handler is supposed to reach the peer over alternative communication paths and call \'drbdadm outdate res\' there\&. In case it cannot reach the peer it should stonith the peer\&. IO is resumed as soon as the situation is resolved\&. In case your handler fails, you can resume IO with the +If a node becomes a disconnected primary, it freezes all its IO operations and calls its fence\-peer handler\&. The fence\-peer handler is supposed to reach the peer over alternative communication paths and call \*(Aqdrbdadm outdate res\*(Aq there\&. In case it cannot reach the peer it should stonith the peer\&. IO is resumed as soon as the situation is resolved\&. In case your handler fails, you can resume IO with the \fBresume\-io\fR command\&. .RE .RE .PP -\fBuse\-bmbv\fR -.RS 4 -.\" drbd.conf: use-bmbv -In case the backing storage\'s driver has a merge_bvec_fn() function, DRBD has to pretend that it can only process IO requests in units not lager than 4kByte\&. (At time of writing the only known drivers which have such a function are: md (software raid driver), dm (device mapper \- LVM) and DRBD itself) -.sp -To get best performance out of DRBD on top of software RAID (or any other driver with a merge_bvec_fn() function) you might enable this function, if you know for sure that the merge_bvec_fn() function will deliver the same results on all nodes of your cluster\&. I\&.e\&. the physical disks of the software RAID are of exactly the same type\&. -\fIUse this option only if you know what you are doing\&.\fR -.RE -.PP -\fBno\-disk\-barrier\fR, \fBno\-disk\-flushes\fR, \fBno\-disk\-drain\fR +\fBdisk\-barrier\fR, \fBdisk\-flushes\fR, \fBdisk\-drain\fR .RS 4 -.\" drbd.conf: no-disk-flushes -.\" drbd.conf: no-disk-flushes -.\" drbd.conf: no-disk-flushes -DRBD has four implementations to express write\-after\-write dependencies to its backing storage device\&. 
DRBD will use the first method that is supported by the backing storage device and that is not disabled by the user\&. -.sp -When selecting the method you should not only base your decision on the measurable performance\&. In case your backing storage device has a volatile write cache (plain disks, RAID of plain disks) you should use one of the first two\&. In case your backing storage device has battery\-backed write cache you may go with option 3 or 4\&. Option 4 will deliver the best performance such devices\&. -.sp -Unfortunately device mapper (LVM) does not support barriers\&. -.sp -The letter after "wo:" in /proc/drbd indicates with method is currently in use for a device: b, f, d, n\&. The implementations: +.\" drbd.conf: disk-barrier +.\" drbd.conf: disk-flushes +.\" drbd.conf: disk-drain +DRBD has four implementations to express write\-after\-write dependencies to its backing storage device\&. DRBD will use the first method that is supported by the backing storage device and that is not disabled by the user\&. By default all three methods are enabled\&. +.sp +When selecting the method you should not only base your decision on the measurable performance\&. In case your backing storage device has a volatile write cache (plain disks, RAID of plain disks) you should use one of the first two\&. In case your backing storage device has battery\-backed write cache you may go with option 3\&. Option 4 (disable everything, use "none") +\fIis dangerous\fR +on most IO stacks, may result in write\-reordering, and if so, can theoretically be the reason for data corruption, or disturb the DRBD protocol, causing spurious disconnect/reconnect cycles\&. +\fIDo not use\fR +\fBno\-disk\-drain\fR\&. +.sp +Unfortunately device mapper (LVM) might not support barriers\&. +.sp +The letter after "wo:" in /proc/drbd indicates with method is currently in use for a device: +\fBb\fR, +\fBf\fR, +\fBd\fR, +\fBn\fR\&. 
The implementations are: .PP barrier .RS 4 -The first requirs that the driver of the backing storage device support barriers (called \'tagged command queuing\' in SCSI and \'native command queuing\' in SATA speak)\&. The use of this method can be disabled by the we -\fBno\-disk\-barrier\fR -option\&. +The first requires that the driver of the backing storage device support barriers (called \*(Aqtagged command queuing\*(Aq in SCSI and \*(Aqnative command queuing\*(Aq in SATA speak)\&. The use of this method can be disabled by setting the +\fBdisk\-barrier\fR +options to +\fBno\fR\&. .RE .PP flush .RS 4 -The second requires that the backing device support disk flushes (called \'force unit access\' in the drive vendors speak)\&. The use of this method can be disabled using the -\fBno\-disk\-flushes\fR -option\&. +The second requires that the backing device support disk flushes (called \*(Aqforce unit access\*(Aq in the drive vendors speak)\&. The use of this method can be disabled setting +\fBdisk\-flushes\fR +to +\fBno\fR\&. .RE .PP drain .RS 4 -The third method is simply to let write requests drain before write requests of a new reordering domain are issued\&. That was the only implementation before 8\&.0\&.9\&. You can prevent to use of this method by using the -\fBno\-disk\-drain\fR -option\&. +The third method is simply to let write requests drain before write requests of a new reordering domain are issued\&. This was the only implementation before 8\&.0\&.9\&. .RE .PP none .RS 4 -The fourth method is to not express write\-after\-write dependencies to the backing store at all\&. +The fourth method is to not express write\-after\-write dependencies to the backing store at all, by also specifying +\fBno\-disk\-drain\fR\&. This +\fIis dangerous\fR +on most IO stacks, may result in write\-reordering, and if so, can theoretically be the reason for data corruption, or disturb the DRBD protocol, causing spurious disconnect/reconnect cycles\&. 
+\fIDo not use\fR +\fBno\-disk\-drain\fR\&. .RE .RE .PP -\fBno\-md\-flushes\fR +\fBmd\-flushes\fR .RS 4 -.\" drbd.conf: no-md-flushes +.\" drbd.conf: md-flushes Disables the use of disk flushes and barrier BIOs when accessing the meta data device\&. See the notes on -\fBno\-disk\-flushes\fR\&. +\fBdisk\-flushes\fR\&. .RE .PP \fBmax\-bio\-bvecs\fR .RS 4 .\" drbd.conf: max-bio-bvecs -In some special circumstances the device mapper stack manages to pass BIOs to DRBD that violate the constraints that are set forth by DRBD\'s merge_bvec() function and which have more than one bvec\&. A known example is: phys\-disk \-> DRBD \-> LVM \-> Xen \-> missaligned partition (63) \-> DomU FS\&. Then you might see "bio would need to, but cannot, be split:" in the Dom0\'s kernel log\&. +In some special circumstances the device mapper stack manages to pass BIOs to DRBD that violate the constraints that are set forth by DRBD\*(Aqs merge_bvec() function and which have more than one bvec\&. A known example is: phys\-disk \-> DRBD \-> LVM \-> Xen \-> misaligned partition (63) \-> DomU FS\&. Then you might see "bio would need to, but cannot, be split:" in the Dom0\*(Aqs kernel log\&. .sp -The best workaround is to proper align the partition within the VM (E\&.g\&. start it at sector 1024)\&. Costs 480 KiByte of storage\&. Unfortunately the default of most Linux partitioning tools is to start the first partition at an odd number (63)\&. Therefore most distribution\'s install helpers for virtual linux machines will end up with missaligned partitions\&. The second best workaround is to limit DRBD\'s max bvecs per BIO (= max\-bio\-bvecs) to 1\&. Might cost performance\&. +The best workaround is to proper align the partition within the VM (E\&.g\&. start it at sector 1024)\&. This costs 480 KiB of storage\&. Unfortunately the default of most Linux partitioning tools is to start the first partition at an odd number (63)\&. 
Therefore most distribution\*(Aqs install helpers for virtual linux machines will end up with misaligned partitions\&. The second best workaround is to limit DRBD\*(Aqs max bvecs per BIO (= +\fBmax\-bio\-bvecs\fR) to 1, but that might cost performance\&. .sp The default value of \fBmax\-bio\-bvecs\fR is 0, which means that there is no user imposed limitation\&. .RE .PP +\fBdisk\-timeout\fR +.RS 4 +.\" drbd.conf: disk-timeout +If the driver of the +\fIlower_device\fR +does not finish an IO request within +\fIdisk_timeout\fR, DRBD considers the disk as failed\&. If DRBD is connected to a remote host, it will reissue local pending IO requests to the peer, and ship all new IO requests to the peer only\&. The disk state advances to diskless, as soon as the backing block device has finished all IO requests\&. +.sp +The default value of is 0, which means that no timeout is enforced\&. The default unit is 100ms\&. This option is available since 8\&.3\&.12\&. +.RE +.PP +\fBread\-balancing \fR\fB\fImethod\fR\fR +.RS 4 +.\" drbd.conf: read-balancing +The supported +\fImethods\fR +for load balancing of read requests are +\fBprefer\-local\fR, +\fBprefer\-remote\fR, +\fBround\-robin\fR, +\fBleast\-pending\fR +\fBwhen\-congested\-remote\fR, +\fB32K\-striping\fR, +\fB64K\-striping\fR, +\fB128K\-striping\fR, +\fB256K\-striping\fR, +\fB512K\-striping\fR +and +\fB1M\-striping\fR\&. +.sp +The default value of is +\fBprefer\-local\fR\&. This option is available since 8\&.4\&.1\&. +.RE +.PP \fBsndbuf\-size \fR\fB\fIsize\fR\fR .RS 4 .\" drbd.conf: sndbuf-size\fIsize\fR -is the size of the TCP socket send buffer\&. The default value is 0, i\&.e\&. autotune\&. You can specify smaller or larger values\&. Larger values are appropriate for reasonable write throughput with protocol A over high latency networks\&. Very large values like 1M may cause problems\&. Also values below 32K do not make much sense\&. Since 8\&.0\&.13 resp\&. 
8\&.2\&.7, setting the +is the size of the TCP socket send buffer\&. The default value is 0, i\&.e\&. autotune\&. You can specify smaller or larger values\&. Larger values are appropriate for reasonable write throughput with protocol A over high latency networks\&. Values below 32K do not make sense\&. Since 8\&.0\&.13 resp\&. 8\&.2\&.7, setting the \fIsize\fR value to 0 means that the kernel should autotune this\&. .RE @@ -557,8 +696,7 @@ .\" drbd.conf: timeout If the partner node fails to send an expected response packet within \fItime\fR -10ths -of a second, the partner node is considered dead and therefore the TCP/IP connection is abandoned\&. This must be lower than +tenths of a second, the partner node is considered dead and therefore the TCP/IP connection is abandoned\&. This must be lower than \fIconnect\-int\fR and \fIping\-int\fR\&. The default value is 60 = 6 seconds, the unit 0\&.1 seconds\&. @@ -567,7 +705,7 @@ \fBconnect\-int \fR\fB\fItime\fR\fR .RS 4 .\" drbd.conf: connect-int -In case it is not possible to connect to the remote DRBD device immediately, DRBD keeps on trying to connect\&. With this option you can set the time between two tries\&. The default value is 10 seconds, the unit is 1 second\&. +In case it is not possible to connect to the remote DRBD device immediately, DRBD keeps on trying to connect\&. With this option you can set the time between two retries\&. The default value is 10 seconds, the unit is 1 second\&. .RE .PP \fBping\-int \fR\fB\fItime\fR\fR @@ -581,13 +719,13 @@ \fBping\-timeout \fR\fB\fItime\fR\fR .RS 4 .\" drbd.conf: ping-timeout -The time the peer has time to answer to a keep\-alive packet\&. In case the peer\'s reply is not received within this time period, it is considered as dead\&. The default value is 500ms, the default unit is 100ms\&. +The time the peer has time to answer to a keep\-alive packet\&. In case the peer\*(Aqs reply is not received within this time period, it is considered as dead\&. 
The default value is 500ms, the default unit are tenths of a second\&. .RE .PP \fBmax\-buffers \fR\fB\fInumber\fR\fR .RS 4 .\" drbd.conf: max-buffers -Maximum number of requests to be allocated by DRBD\&. Unit is PAGE_SIZE, which is 4 KB on most systems\&. The minimum is hard coded to 32 (=128 KB)\&. For high\-performance installations it might help, if you increase that number\&. These buffers are used to hold data blocks while they are written to disk\&. +Maximum number of requests to be allocated by DRBD\&. Unit is PAGE_SIZE, which is 4 KiB on most systems\&. The minimum is hard coded to 32 (=128 KiB)\&. For high\-performance installations it might help if you increase that number\&. These buffers are used to hold data blocks while they are written to disk\&. .RE .PP \fBko\-count \fR\fB\fInumber\fR\fR @@ -596,7 +734,9 @@ In case the secondary node fails to complete a single write request for \fIcount\fR times the -\fItimeout\fR, it is expelled from the cluster\&. (I\&.e\&. the primary node goes into StandAlone mode\&.) The default value is 0, which disables this feature\&. +\fItimeout\fR, it is expelled from the cluster\&. (I\&.e\&. the primary node goes into +\fBStandAlone\fR +mode\&.) The default value is 0, which disables this feature\&. .RE .PP \fBmax\-epoch\-size \fR\fB\fInumber\fR\fR @@ -608,25 +748,29 @@ \fBallow\-two\-primaries\fR .RS 4 .\" drbd.conf: allow-two-primaries -With this option set you may assign primary role to both nodes\&. You only should use this option if you use a shared storage file system on top of DRBD\&. At the time of writing the only ones are: OCFS2 and GFS\&. If you use this option with any other file system, you are going to crash your nodes and to corrupt your data! +With this option set you may assign the primary role to both nodes\&. You only should use this option if you use a shared storage file system on top of DRBD\&. At the time of writing the only ones are: OCFS2 and GFS\&. 
If you use this option with any other file system, you are going to crash your nodes and to corrupt your data! .RE .PP \fBunplug\-watermark \fR\fB\fInumber\fR\fR .RS 4 .\" drbd.conf: unplug-watermark -When the number of pending write requests on the standby (secondary) node exceeds the unplug\-watermark, we trigger the request processing of our backing storage device\&. Some storage controllers deliver better performance with small values, others deliver best performance when the value is set to the same value as max\-buffers\&. Minimum 16, default 128, maximum 131072\&. +When the number of pending write requests on the standby (secondary) node exceeds the +\fBunplug\-watermark\fR, we trigger the request processing of our backing storage device\&. Some storage controllers deliver better performance with small values, others deliver best performance when the value is set to the same value as max\-buffers\&. Minimum 16, default 128, maximum 131072\&. .RE .PP \fBcram\-hmac\-alg\fR .RS 4 .\" drbd.conf: cram-hmac-alg -You need to specify the HMAC algorithm to enable peer authentication at all\&. You are strongly encouraged to use peer authentication\&. The HMAC algorithm will be used for the challenge response authentication of the peer\&. You may specify any digest algorithm that is named in /proc/crypto\&. +You need to specify the HMAC algorithm to enable peer authentication at all\&. You are strongly encouraged to use peer authentication\&. The HMAC algorithm will be used for the challenge response authentication of the peer\&. You may specify any digest algorithm that is named in +\fB/proc/crypto\fR\&. .RE .PP \fBshared\-secret\fR .RS 4 .\" drbd.conf: shared-secret -The shared secret used in peer authentication\&. May be up to 64 characters\&. Note that peer authentication is disabled as long as no cram\-hmac\-alg (see above) is specified\&. +The shared secret used in peer authentication\&. May be up to 64 characters\&. 
Note that peer authentication is disabled as long as no +\fBcram\-hmac\-alg\fR +(see above) is specified\&. .RE .PP \fBafter\-sb\-0pri \fR \fIpolicy\fR @@ -679,19 +823,25 @@ .RS 4 Discard the version of the secondary if the outcome of the \fBafter\-sb\-0pri\fR -algorithm would also destroy the current secondary\'s data\&. Otherwise disconnect\&. +algorithm would also destroy the current secondary\*(Aqs data\&. Otherwise disconnect\&. .RE .PP \fBviolently\-as0p\fR .RS 4 Always take the decision of the \fBafter\-sb\-0pri\fR -algorithm\&. Even if that causes an erratic change of the primary\'s view of the data\&. This is only useful if you use a 1node FS (i\&.e\&. not OCFS2 or GFS) with the allow\-two\-primaries flag, _AND_ if you really know what you are doing\&. This is DANGEROUS and MAY CRASH YOUR MACHINE if you have an FS mounted on the primary node\&. +algorithm, even if that causes an erratic change of the primary\*(Aqs view of the data\&. This is only useful if you use a one\-node FS (i\&.e\&. not OCFS2 or GFS) with the +\fBallow\-two\-primaries\fR +flag, +\fIAND\fR +if you really know what you are doing\&. This is +\fIDANGEROUS and MAY CRASH YOUR MACHINE\fR +if you have an FS mounted on the primary node\&. .RE .PP \fBdiscard\-secondary\fR .RS 4 -Discard the secondary\'s version\&. +Discard the secondary\*(Aqs version\&. .RE .PP \fBcall\-pri\-lost\-after\-sb\fR @@ -716,7 +866,13 @@ .RS 4 Always take the decision of the \fBafter\-sb\-0pri\fR -algorithm\&. Even if that causes an erratic change of the primary\'s view of the data\&. This is only useful if you use a 1node FS (i\&.e\&. not OCFS2 or GFS) with the allow\-two\-primaries flag, _AND_ if you really know what you are doing\&. This is DANGEROUS and MAY CRASH YOUR MACHINE if you have an FS mounted on the primary node\&. +algorithm, even if that causes an erratic change of the primary\*(Aqs view of the data\&. This is only useful if you use a one\-node FS (i\&.e\&. 
not OCFS2 or GFS) with the +\fBallow\-two\-primaries\fR +flag, +\fIAND\fR +if you really know what you are doing\&. This is +\fIDANGEROUS and MAY CRASH YOUR MACHINE\fR +if you have an FS mounted on the primary node\&. .RE .PP \fBcall\-pri\-lost\-after\-sb\fR @@ -735,7 +891,7 @@ \fBrr\-conflict \fR \fIpolicy\fR .RS 4 .\" drbd.conf: rr-conflict -To solve the cases when the outcome of the resync decision is incompatible with the current role assignment in the cluster\&. +This option helps to solve the cases when the outcome of the resync decision is incompatible with the current role assignment in the cluster\&. .PP \fBdisconnect\fR .RS 4 @@ -757,9 +913,9 @@ \fBdata\-integrity\-alg \fR \fIalg\fR .RS 4 .\" drbd.conf: data-integrity-alg -DRBD can ensure the data integrity of the user\'s data on the network by comparing hash values\&. Normally this is ensured by the 16 bit checksums in the headers of TCP/IP packets\&. +DRBD can ensure the data integrity of the user\*(Aqs data on the network by comparing hash values\&. Normally this is ensured by the 16 bit checksums in the headers of TCP/IP packets\&. .sp -This option can be set to any of the kernel\'s data digest algorithms\&. In a typical kernel configuration you should have at least one of +This option can be set to any of the kernel\*(Aqs data digest algorithms\&. In a typical kernel configuration you should have at least one of \fBmd5\fR, \fBsha1\fR, and \fBcrc32c\fR @@ -768,10 +924,37 @@ See also the notes on data integrity\&. .RE .PP -\fBno\-tcp\-cork\fR +\fBtcp\-cork\fR .RS 4 -.\" drbd.conf: no-tcp-cork -DRBD usually uses the TCP socket option TCP_CORK to hint to the network stack when it can expect more data, and when it should flush out what it has in its send queue\&. It turned out that there is at lease one network stack that performs worse when one uses this hinting method\&. Therefore we introducted this option, which disable the setting and clearing of the TCP_CORK socket option by DRBD\&. 
+.\" drbd.conf: tcp-cork +DRBD usually uses the TCP socket option TCP_CORK to hint to the network stack when it can expect more data, and when it should flush out what it has in its send queue\&. It turned out that there is at least one network stack that performs worse when one uses this hinting method\&. Therefore we introducted this option\&. By setting +\fBtcp\-cork\fR +to +\fBno\fR +you can disable the setting and clearing of the TCP_CORK socket option by DRBD\&. +.RE +.PP +\fBon\-congestion \fR\fB\fIcongestion_policy\fR\fR, \fBcongestion\-fill \fR\fB\fIfill_threshold\fR\fR, \fBcongestion\-extents \fR\fB\fIactive_extents_threshold\fR\fR +.RS 4 +By default DRBD blocks when the available TCP send queue becomes full\&. That means it will slow down the application that generates the write requests that cause DRBD to send more data down that TCP connection\&. +.sp +When DRBD is deployed with DRBD\-proxy it might be more desirable that DRBD goes into AHEAD/BEHIND mode shortly before the send queue becomes full\&. In AHEAD/BEHIND mode DRBD does no longer replicate data, but still keeps the connection open\&. +.sp +The advantage of the AHEAD/BEHIND mode is that the application is not slowed down, even if DRBD\-proxy\*(Aqs buffer is not sufficient to buffer all write requests\&. The downside is that the peer node falls behind, and that a resync will be necessary to bring it back into sync\&. During that resync the peer node will have an inconsistent disk\&. +.sp +Available +\fIcongestion_policy\fRs are +\fBblock\fR +and +\fBpull\-ahead\fR\&. The default is +\fBblock\fR\&. +\fIFill_threshold\fR +might be in the range of 0 to 10GiBytes\&. The default is 0 which disables the check\&. +\fIActive_extents_threshold\fR +has the same limits as +\fBal\-extents\fR\&. +.sp +The AHEAD/BEHIND mode and its settings are available since DRBD 8\&.3\&.10\&. 
.RE .PP \fBwfc\-timeout \fR\fB\fItime\fR\fR @@ -804,7 +987,7 @@ .RS 4 Sets on which node the device should be promoted to primary role by the init script\&. The \fInode\-name\fR -might either be a host name or the key word +might either be a host name or the keyword \fBboth\fR\&. When this option is not set the devices stay in secondary role on both nodes\&. Usually one delegates the role assignment to a cluster manager (e\&.g\&. heartbeat)\&. .RE .PP @@ -822,21 +1005,31 @@ \fBwfc\-timeout\fR and \fBdegr\-wfc\-timeout\fR -statements\&. Only do that if the peer of the stacked resource is usually not available or will not become primary usually\&. By using this option incorrectly, you run the risk of causing unexpected split brain\&. +statements\&. Only do that if the peer of the stacked resource is usually not available or will usually not become primary\&. By using this option incorrectly, you run the risk of causing unexpected split brain\&. .RE .PP -\fBrate \fR\fB\fIrate\fR\fR +\fBresync\-rate \fR\fB\fIrate\fR\fR .RS 4 -.\" drbd.conf: rate +.\" drbd.conf: resync-rate To ensure a smooth operation of the application on top of DRBD, it is possible to limit the bandwidth which may be used by background synchronizations\&. The default is 250 KB/sec, the default unit is KB/sec\&. Optional suffixes K, M, G are allowed\&. .RE .PP -\fBafter \fR\fB\fIres\-name\fR\fR +\fBuse\-rle\fR +.RS 4 +.\" drbd.conf: use-rle +During resync\-handshake, the dirty\-bitmaps of the nodes are exchanged and merged (using bit\-or), so the nodes will have the same understanding of which blocks are dirty\&. On large devices, the fine grained dirty\-bitmap can become large as well, and the bitmap exchange can take quite some time on low\-bandwidth links\&. +.sp +Because the bitmap typically contains compact areas where all bits are unset (clean) or set (dirty), a simple run\-length encoding scheme can considerably reduce the network traffic necessary for the bitmap exchange\&. 
+.sp +For backward compatibilty reasons, and because on fast links this possibly does not improve transfer time but consumes cpu cycles, this defaults to off\&. +.RE +.PP +\fBresync\-after \fR\fB\fIres\-name\fR\fR .RS 4 -.\" drbd.conf: after -By default, resynchronization of all devices would run in parallel\&. By defining a sync\-after dependency, the resynchronization of this resource will start only if the resource +.\" drbd.conf: resync-after +By default, resynchronization of all devices would run in parallel\&. By defining a resync\-after dependency, the resynchronization of this resource will start only if the resource \fIres\-name\fR -is already in connected state (= finished its resynchronization)\&. +is already in connected state (i\&.e\&., has finished its resynchronization)\&. .RE .PP \fBal\-extents \fR\fB\fIextents\fR\fR @@ -851,7 +1044,7 @@ .RS 4 During online verification (as initiated by the \fBverify\fR -sub\-command), rather than doing a bit\-wise comparison, DRBD applies a hash function to the contents of every block being verified, and compares that hash with the peer\&. This option defines the hash algorithm being used for that purpose\&. It can be set to any of the kernel\'s data digest algorithms\&. In a typical kernel configuration you should have at least one of +sub\-command), rather than doing a bit\-wise comparison, DRBD applies a hash function to the contents of every block being verified, and compares that hash with the peer\&. This option defines the hash algorithm being used for that purpose\&. It can be set to any of the kernel\*(Aqs data digest algorithms\&. 
In a typical kernel configuration you should have at least one of \fBmd5\fR, \fBsha1\fR, and \fBcrc32c\fR @@ -862,21 +1055,92 @@ .PP \fBcsums\-alg \fR\fB\fIhash\-alg\fR\fR .RS 4 -A resync process sends all marked data blocks form the source to the destination node, as long as no +A resync process sends all marked data blocks from the source to the destination node, as long as no \fBcsums\-alg\fR -is given\&. When one is specified the resync process exchanges hash values of all marked blocks first, and sends only those data blocks over, that have different hash values\&. +is given\&. When one is specified the resync process exchanges hash values of all marked blocks first, and sends only those data blocks that have different hash values\&. .sp This setting is useful for DRBD setups with low bandwidth links\&. During the restart of a crashed primary node, all blocks covered by the activity log are marked for resync\&. But a large part of those will actually be still in sync, therefore using \fBcsums\-alg\fR will lower the required bandwidth in exchange for CPU cycles\&. .RE .PP +\fBc\-plan\-ahead \fR\fB\fIplan_time\fR\fR, \fBc\-fill\-target \fR\fB\fIfill_target\fR\fR, \fBc\-delay\-target \fR\fB\fIdelay_target\fR\fR, \fBc\-max\-rate \fR\fB\fImax_rate\fR\fR +.RS 4 +The dynamic resync speed controller gets enabled with setting +\fIplan_time\fR +to a positive value\&. It aims to fill the buffers along the data path with either a constant amount of data +\fIfill_target\fR, or aims to have a constant delay time of +\fIdelay_target\fR +along the path\&. The controller has an upper bound of +\fImax_rate\fR\&. +.sp +By +\fIplan_time\fR +the agility of the controller is configured\&. Higher values yield for slower/lower responses of the controller to deviation from the target value\&. It should be at least 5 times RTT\&. For regular data paths a +\fIfill_target\fR +in the area of 4k to 100k is appropriate\&. 
For a setup that contains drbd\-proxy it is advisable to use +\fIdelay_target\fR +instead\&. Only when +\fIfill_target\fR +is set to 0 the controller will use +\fIdelay_target\fR\&. 5 times RTT is a reasonable starting value\&. +\fIMax_rate\fR +should be set to the bandwidth available between the DRBD\-hosts and the machines hosting DRBD\-proxy, or to the available disk\-bandwidth\&. +.sp +The default value of +\fIplan_time\fR +is 0, the default unit is 0\&.1 seconds\&. +\fIFill_target\fR +has 0 and sectors as default unit\&. +\fIDelay_target\fR +has 1 (100ms) and 0\&.1 as default unit\&. +\fIMax_rate\fR +has 10240 (100MiB/s) and KiB/s as default unit\&. +.sp +The dynamic resync speed controller and its settings are available since DRBD 8\&.3\&.9\&. +.RE +.PP +\fBc\-min\-rate \fR\fB\fImin_rate\fR\fR +.RS 4 +A node that is primary and sync\-source has to schedule application IO requests and resync IO requests\&. The +\fImin_rate\fR +tells DRBD use only up to min_rate for resync IO and to dedicate all other available IO bandwidth to application requests\&. +.sp +Note: The value 0 has a special meaning\&. It disables the limitation of resync IO completely, which might slow down application IO considerably\&. Set it to a value of 1, if you prefer that resync IO never slows down application IO\&. +.sp +Note: Although the name might suggest that it is a lower bound for the dynamic resync speed controller, it is not\&. If the DRBD\-proxy buffer is full, the dynamic resync speed controller is free to lower the resync speed down to 0, completely independent of the +\fBc\-min\-rate\fR +setting\&. +.sp +\fIMin_rate\fR +has 4096 (4MiB/s) and KiB/s as default unit\&. +.RE +.PP +\fBon\-no\-data\-accessible \fR\fB\fIond\-policy\fR\fR +.RS 4 +This setting controls what happens to IO requests on a degraded, disk less node (I\&.e\&. no data store is reachable)\&. The available policies are +\fBio\-error\fR +and +\fBsuspend\-io\fR\&. 
+.sp +If +\fIond\-policy\fR +is set to +\fBsuspend\-io\fR +you can either resume IO by attaching/connecting the last lost data storage, or by the +\fBdrbdadm resume\-io \fR\fB\fIres\fR\fR +command\&. The latter will result in IO errors of course\&. +.sp +The default is +\fBio\-error\fR\&. This setting is available since DRBD 8\&.3\&.9\&. +.RE +.PP \fBcpu\-mask \fR\fB\fIcpu\-mask\fR\fR .RS 4 .\" drbd.conf: cpu-mask -Sets the cpu\-affinity\-mask for DRBD\'s kernel threads of this device\&. The default value of +Sets the cpu\-affinity\-mask for DRBD\*(Aqs kernel threads of this device\&. The default value of \fIcpu\-mask\fR -is 0, which means that DRBD\'s kernel threads should be spread over all CPUs of the machine\&. This value must be given in hexadecimal notation\&. If it is too big it will be truncated\&. +is 0, which means that DRBD\*(Aqs kernel threads should be spread over all CPUs of the machine\&. This value must be given in hexadecimal notation\&. If it is too big it will be truncated\&. .RE .PP \fBpri\-on\-incon\-degr \fR\fB\fIcmd\fR\fR @@ -888,13 +1152,13 @@ \fBpri\-lost\-after\-sb \fR\fB\fIcmd\fR\fR .RS 4 .\" drbd.conf: pri-lost-after-sb -The node is currently primary, but lost the after split brain auto recovery procedure\&. As as consequence, it should be abandoned\&. +The node is currently primary, but lost the after\-split\-brain auto recovery procedure\&. As as consequence, it should be abandoned\&. .RE .PP \fBpri\-lost \fR\fB\fIcmd\fR\fR .RS 4 .\" drbd.conf: pri-lost -The node is currently primary, but DRBD\'s algorithm thinks that it should become sync target\&. As a consequence it should give up its primary role\&. +The node is currently primary, but DRBD\*(Aqs algorithm thinks that it should become sync target\&. As a consequence it should give up its primary role\&. .RE .PP \fBfence\-peer \fR\fB\fIcmd\fR\fR @@ -902,7 +1166,7 @@ .\" drbd.conf: fence-peer The handler is part of the \fBfencing\fR -mechanism\&. 
This handler is called in case the node needs to fence the peer\'s disk\&. It should use other communication paths than DRBD\'s network link\&. +mechanism\&. This handler is called in case the node needs to fence the peer\*(Aqs disk\&. It should use other communication paths than DRBD\*(Aqs network link\&. .RE .PP \fBlocal\-io\-error \fR\fB\fIcmd\fR\fR @@ -911,22 +1175,28 @@ DRBD got an IO error from the local IO subsystem\&. .RE .PP +\fBinitial\-split\-brain \fR\fB\fIcmd\fR\fR +.RS 4 +.\" drbd.conf: initial-split-brain +DRBD has connected and detected a split brain situation\&. This handler can alert someone in all cases of split brain, not just those that go unresolved\&. +.RE +.PP \fBsplit\-brain \fR\fB\fIcmd\fR\fR .RS 4 .\" drbd.conf: split-brain -DRBD detected a split brain situation\&. Manual recovery is necessary\&. This handler should alert someone on duty\&. +DRBD detected a split brain situation but remains unresolved\&. Manual recovery is necessary\&. This handler should alert someone on duty\&. .RE .PP \fBbefore\-resync\-target \fR\fB\fIcmd\fR\fR .RS 4 .\" drbd.conf: before-resync-target -DRBD calls this handler just before a resync beginns on the node that becomes resync target\&. It might be used to take a snapshot of the backing block device\&. +DRBD calls this handler just before a resync begins on the node that becomes resync target\&. It might be used to take a snapshot of the backing block device\&. .RE .PP \fBafter\-resync\-target \fR\fB\fIcmd\fR\fR .RS 4 .\" drbd.conf: after-resync-target -DRBD calls this handler just after a resync operation finished on the node which\'s disk just became consistent after beeing inconsistent for the duration of the resync\&. It might be used to remove a snapshot of the backing device that was created by the +DRBD calls this handler just after a resync operation finished on the node whose disk just became consistent after being inconsistent for the duration of the resync\&. 
It might be used to remove a snapshot of the backing device that was created by the \fBbefore\-resync\-target\fR handler\&. .RE @@ -948,20 +1218,25 @@ \fBnetwork\fR section\&. .PP -Both mechanisms might deliver false positives if the user of DRBD modifies the data which gets written to disk while the transfer goes on\&. Currently the swap code and ReiserFS are known to do so\&. In both cases this is not a problem, because when the initiator of the data transfer does this it already knows that that data block will not be part of an on disk data structure\&. +Both mechanisms might deliver false positives if the user of DRBD modifies the data which gets written to disk while the transfer goes on\&. This may happen for swap, or for certain append while global sync, or truncate/rewrite workloads, and not necessarily poses a problem for the integrity of the data\&. Usually when the initiator of the data transfer does this, it already knows that that data block will not be part of an on disk data structure, or will be resubmitted with correct data soon enough\&. .PP -The most recent (2007) example of systematically corruption was an issue with the TCP offloading engine and the driver of a certain type of GBit NIC\&. The actual corruption happened on the DMA transfer from core memory to the card\&. Since the TCP checksum gets calculated on the card this type of corruption stays undetected as long as you do not use either the online +The +\fBdata\-integrity\-alg\fR +causes the receiving side to log an error about "Digest integrity check FAILED: Ns +x\en", where N is the sector offset, and x is the size of the request in bytes\&. It will then disconnect, and reconnect, thus causing a quick resync\&. If the sending side at the same time detected a modification, it warns about "Digest mismatch, buffer modified by upper layers during write: Ns +x\en", which shows that this was a false positive\&. 
The sending side may detect these buffer modifications immediately after the unmodified data has been copied to the tcp buffers, in which case the receiving side won\*(Aqt notice it\&. +.PP +The most recent (2007) example of systematic corruption was an issue with the TCP offloading engine and the driver of a certain type of GBit NIC\&. The actual corruption happened on the DMA transfer from core memory to the card\&. Since the TCP checksum gets calculated on the card, this type of corruption stays undetected as long as you do not use either the online \fBverify\fR -or the data\-integrity\-alg\&. +or the +\fBdata\-integrity\-alg\fR\&. .PP We suggest to use the \fBdata\-integrity\-alg\fR only during a pre\-production phase due to its CPU costs\&. Further we suggest to do online \fBverify\fR -runs regularly e\&.g\&. once a month during low load period\&. +runs regularly e\&.g\&. once a month during a low load period\&. .SH "VERSION" .sp -This document was revised for version 8\&.3\&.2 of the DRBD distribution\&. +This document was revised for version 8\&.4\&.0 of the DRBD distribution\&. .SH "AUTHOR" .sp Written by Philipp Reisner philipp\&.reisner@linbit\&.com and Lars Ellenberg lars\&.ellenberg@linbit\&.com\&. @@ -977,7 +1252,7 @@ \fBdrbddisk\fR(8), \fBdrbdsetup\fR(8), \fBdrbdadm\fR(8), -\m[blue]\fBDRBD User\'s Guide\fR\m[]\&\s-2\u[1]\d\s+2, +\m[blue]\fBDRBD User\*(Aqs Guide\fR\m[]\&\s-2\u[1]\d\s+2, \m[blue]\fBDRBD web site\fR\m[]\&\s-2\u[3]\d\s+2 .SH "NOTES" .IP " 1." 
4 diff -Nru drbd8-8.3.7/documentation/drbd.conf.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbd.conf.xml --- drbd8-8.3.7/documentation/drbd.conf.xml 2010-01-07 09:09:33.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbd.conf.xml 2012-02-02 14:09:14.000000000 +0000 @@ -1,174 +1,189 @@ - - + + - 5 Dec 2008 + 6 May 2011 + DRBD - 8.3.2 + + 8.4.0 + drbd.conf + 5 + Configuration Files + drbd.conf - Configuration file for DRBD's devices - - drbd.conf - - + + Configuration file for DRBD's devices + drbd.conf + + Introduction - The file is read by - . - - The file format was designed as to allow to have - a verbatim copy of the file on both nodes of the cluster. - It is highly recommended to do so in order to keep your configuration - manageable. The file should be the same on both nodes of the cluster. Changes to do not apply - immediately. -A small drbd.conf fileglobal { usage-count yes; } -common { syncer { rate 10M; } } -resource r0 { - protocol C; + + The file is read by . + + The file format was designed as to allow to have a verbatim copy of the file on both + nodes of the cluster. It is highly recommended to do so in order to keep your configuration + manageable. The file should be the same on both nodes of the + cluster. Changes to do not apply immediately. + + By convention the main config contains two include statements. The first one includes + the file , the second one all file with a + suffix. 
+ + + A small example.res file + + resource r0 { net { + protocol C; cram-hmac-alg sha1; shared-secret "FooFunFactory"; } + disk { + resync-rate 10M; + } on alice { - device minor 1; - disk /dev/sda7; + volume 0 { + device minor 1; + disk /dev/sda7; + meta-disk internal; + } address 10.1.1.31:7789; - meta-disk internal; } on bob { - device minor 1; - disk /dev/sda7; + volume 0 { + device minor 1; + disk /dev/sda7; + meta-disk internal; + } address 10.1.1.32:7789; - meta-disk internal; } -} - In this example, there is a single DRBD resource (called r0) which uses - protocol C for the connection between its devices. - The device which runs - on host alice uses - /dev/drbd1 as devices for its application, and - /dev/sda7 as low-level storage for the data. - The IP addresses are used to specify the networking interfaces to be used. - An eventually running resync process should use about 10MByte/second of IO - bandwidth. - - There may be multiple resource sections in a single drbd.conf file. - For more examples, please have a look at the - DRBD User's Guide. - +} + In this example, there is a single DRBD resource (called r0) which uses protocol C + for the connection between its devices. It contains a single volume which runs on host + alice uses /dev/drbd1 as devices for its + application, and /dev/sda7 as low-level storage for the data. The + IP addresses are used to specify the networking interfaces to be used. An eventually running + resync process should use about 10MByte/second of IO bandwidth. This sync-rate statement is + valid for volume 0, but would also be valid for further volumes. In this example it assigns + full 10MByte/second to each volume. + + There may be multiple resource sections in a single drbd.conf file. For more examples, + please have a look at the + DRBD User's Guide. + + File Format - The file consists of sections and parameters. - A section begins with a keyword, sometimes an additional name, and an - opening brace ({). 
- A section ends with a closing brace (}. - The braces enclose the parameters. - - section [name] { parameter value; [...] } - - A parameter starts with the identifier of the parameter followed - by whitespace. Every subsequent character - is considered - as part of the parameter's value. A special case are Boolean - parameters which only consist of the identifier. - Parameters are terminated by a semicolon (;). - - Some parameter values have default units which might be overruled - by K, M or G. These units are defined in the usual way (K = 2^10 = 1024, - M = 1024 K, G = 1024 M). - - Comments may be placed into the configuration file and must - begin with a hash sign (#). Subsequent characters are ignored - until the end of the line. - + + The file consists of sections and parameters. A section begins with a keyword, sometimes + an additional name, and an opening brace ({). A section ends with a closing + brace (}. The braces enclose the parameters. + + section [name] { parameter value; [...] } + + A parameter starts with the identifier of the parameter followed by whitespace. Every + subsequent character is considered as part of the parameter's value. A special case are + Boolean parameters which consist only of the identifier. Parameters are terminated by a + semicolon (;). + + Some parameter values have default units which might be overruled by K, M or G. These + units are defined in the usual way (K = 2^10 = 1024, M = 1024 K, G = 1024 M). + + Comments may be placed into the configuration file and must begin with a hash sign + (#). Subsequent characters are ignored until the end of the line. + Sections + - - - + + - drbd.confskip - Comments out chunks of text, even spanning more than one line. - Characters between the keyword and the opening - brace ({) are ignored. Everything enclosed by the braces - is skipped. - This comes in handy, if you just want to comment out - some 'resource [name] {...}' section: just precede it with 'skip'. 
- + + drbd.conf + + skip + Comments out chunks of text, even spanning more than one line. + Characters between the keyword and the opening brace + ({) are ignored. Everything enclosed by the braces is skipped. This + comes in handy, if you just want to comment out some '' section: just precede it with 'skip'. + - - - + + - drbd.confglobal - Configures some global parameters. Currently only - , , - and - are allowed here. You may only have one global section, preferably - as the first section. - + + drbd.conf + + global + Configures some global parameters. Currently only + , , + and are allowed + here. You may only have one global section, preferably as the first section. + - - - + + - drbd.confcommon - All resources inherit the options set in this section. - The common section might have - a , - a , - a , - a and a section. - + + drbd.conf + + common + All resources inherit the options set in this section. The common + section might have a , a , a + , a and a + section. + - - - + + - drbd.confresource - Configures a DRBD resource. - Each resource section needs to have two (or more) - sections - and may have - a , - a , - a , - a and a section. - Required parameter in this section: . - + + drbd.conf + + resource + Configures a DRBD resource. Each resource section needs to have two (or + more) sections and may have a + , a , a , a + and a section. It might contain + s sections. + - - - + + - drbd.confon - Carries the necessary configuration parameters for a DRBD - device of the enclosing resource. - host-name is mandatory and must match the - Linux host name (uname -n) of one of the nodes. - You may list more than one host name here, in case you want to use the same - parameters on several hosts (you'd have to move the IP around usually). - Or you may list more than two such sections. - resource r1 { + + drbd.conf + + on + Carries the necessary configuration parameters for a DRBD device of the + enclosing resource. 
host-name is mandatory and must match + the Linux host name (uname -n) of one of the nodes. You may list more than one host + name here, in case you want to use the same parameters on several hosts (you'd have to + move the IP around usually). Or you may list more than two such sections. + resource r1 { protocol C; device minor 1; meta-disk internal; @@ -186,45 +201,61 @@ disk /dev/mapper/other-san-as-seen-from-daisy; } } - - See also the section keyword. - Required parameters in this section: , - , , , - . - + See also the section keyword. Required statements in + this section: and . Note for backward + compatibility and convenience it is valid to embed the statements of a single volume + directly into the host section. + - - - + + - drbd.confstacked-on-top-of - For a stacked DRBD setup (3 or 4 nodes), a is used - instead of an section. - Required parameters in this section: and - . - + + drbd.conf + + volume + Defines a volume within a connection. The minor numbers of a replicated + volume might be different on different hosts, the volume number + (vnr) is what groups them together. Required parameters in + this section: , , + . + - - - + + - drbd.confon - Carries the necessary configuration parameters for a DRBD - device of the enclosing resource. - This section is very similar to the section. - The difference to the section is that - the matching of the host sections to machines is done by the IP-address - instead of the node name. - Required parameters in this section: , - , , - , all of which may be - inherited from the resource section, in which case you may shorten this section - down to just the address identifier. - resource r2 { + + drbd.conf + + stacked-on-top-of + For a stacked DRBD setup (3 or 4 nodes), a + is used instead of an section. + Required parameters in this section: and + . + + + + + + + + + drbd.conf + + on + Carries the necessary configuration parameters for a DRBD device of the + enclosing resource. 
This section is very similar to the section. + The difference to the section is that the matching of the host + sections to machines is done by the IP-address instead of the node name. Required + parameters in this section: , , + , all of which + may be inherited from the resource section, in which case you may + shorten this section down to just the address identifier. resource r2 { protocol C; device minor 2; disk /dev/sda7; @@ -239,1308 +270,1628 @@ meta-disk /dev/sdc8; } } - - + + - - - + + - drbd.confdisk - This section is used to fine tune DRBD's properties - in respect to the low level storage. Please - refer to drbdsetup8 for detailed description of - the parameters. - Optional parameter: , - , , , - , , - , , - . - + + drbd.conf + + disk + This section is used to fine tune DRBD's properties in respect to the + low level storage. Please refer to + drbdsetup + + 8 + for detailed description of the parameters. Optional parameters: + , , , + , , + , , + , , + , , + , , + , , + , , + . + - - - + + - drbd.confnet - This section is used to fine tune DRBD's properties. Please - refer to drbdsetup8 for a detailed description - of this section's parameters. - Optional parameters: - , , - , - , , - , - , , - , , - , , - , , - , , - - + + drbd.conf + + net + This section is used to fine tune DRBD's properties. Please refer to + + drbdsetup + + 8 + for a detailed description of this section's parameters. Optional + parameters: , , + , , , + , , + , , + , , + , , + , , + , , + , , + , , + , , + . + - - - + + - drbd.confstartup - This section is used to fine tune DRBD's properties. Please - refer to drbdsetup8 for a detailed description - of this section's parameters. - Optional parameters: - , , - , - , - and . - + + drbd.conf + + startup + This section is used to fine tune DRBD's properties. Please refer to + + drbdsetup + + 8 + for a detailed description of this section's parameters. Optional + parameters: , , + , , + and . 
+ - - - + + - drbd.confsyncer - This section is used to fine tune the synchronization daemon - for the device. Please - refer to drbdsetup8 for a detailed description - of this section's parameters. - Optional parameters: - , , , - , - , and . - + + drbd.conf + + options + This section is used to fine tune the behaviour of the resource object. + Please refer to + drbdsetup + + 8 + for a detailed description of this section's parameters. Optional + parameters: , and + . + - - - + + - drbd.confhandlers - In this section you can define handlers (executables) that are executed - by the DRBD system in response to certain events. - Optional parameters: - , , - , (formerly oudate-peer), - , , - , . - + + drbd.conf + + handlers + In this section you can define handlers (executables) that are started + by the DRBD system in response to certain events. Optional parameters: + , , + , (formerly oudate-peer), + , , + , , + . + + The interface is done via environment variables: + + is the name of the resource + + + + is the minor number of the DRBD device, in + decimal. + + + + is the path to the primary configuration file; + if you split your configuration into multiple files (e.g. in + ), this will not be helpful. + + + + , , + are the address family (e.g. ), + the peer's address and hostnames. + + is deprecated. + + Please note that not all of these might be set for all handlers, and that some + values might not be useable for a definition. + Parameters + - - - + + - drbd.confminor-count count may be a number from 1 to 255. - - Use minor-count - if you want to define massively more resources later without reloading - the DRBD kernel - module. Per default the module loads with 11 more resources than you have currently - in your config but at least 32. + + drbd.conf + + minor-count + count may be a number from 1 to FIXME. + + Minor-count is a sizing hint for DRBD. It helps to + right-size various memory pools. 
It should be set in the in the same order of + magnitude than the actual number of minors you use. Per default the module loads with + 11 more resources than you have currently in your config but at least 32. + - - - + + - drbd.confdialog-refresh time may be 0 or a positive number. - - The user dialog redraws the second count every - time seconds (or does no redraws if - time is 0). The default value is 1. + + drbd.conf + + dialog-refresh + time may be 0 or a positive number. + + The user dialog redraws the second count every time + seconds (or does no redraws if time is 0). The default + value is 1. + - - - + + drbd.conf + disable-ip-verification - Use disable-ip-verification - if, for some obscure reasons, drbdadm can/might not use ip or ifconfig - to do a sanity check for the IP address. You can disable the IP verification with - this option. - + + Use disable-ip-verification if, for some obscure + reasons, drbdadm can/might not use or to + do a sanity check for the IP address. You can disable the IP verification with this + option. + - - - + + drbd.conf - usage-count + + usage-count + Please participate in - DRBD's online usage counter. - The most convenient way to do so - is to set this option to . Valid options are: - , and . - + DRBD's online usage counter. + The most convenient way to do so is to set + this option to . Valid options are: , + and . + - - - + + drbd.conf + protocol - On the TCP/IP link the specified protocol - is used. Valid protocol specifiers are A, B, and C. - Protocol A: write IO is reported as completed, if it has - reached local disk and local TCP send buffer. - Protocol B: write IO is reported as completed, if it has reached - local disk and remote buffer cache. - Protocol C: write IO is reported as completed, if it has - reached both local and remote disk. + + On the TCP/IP link the specified protocol is used. + Valid protocol specifiers are A, B, and C. 
+ + Protocol A: write IO is reported as completed, if it has reached local disk and + local TCP send buffer. + + Protocol B: write IO is reported as completed, if it has reached local disk and + remote buffer cache. + + Protocol C: write IO is reported as completed, if it has reached both local and + remote disk. + - - - + + - drbd.confdevice - The name of the block device node of the resource being described. - You must use this device with your application (file system) and - you must not use the low level block device which is specified with the - parameter. - - One can ether ommit the name or - and the minor number. If you ommit the name - a default of /dev/drbdminor will be used. - - Udev will create additional symlinks in /dev/drbd/by-res and /dev/drbd/by-disk. - + + drbd.conf + + device + The name of the block device node of the resource being described. You + must use this device with your application (file system) and you must not use the low + level block device which is specified with the parameter. + + One can ether omit the name or + and the minor number. If you omit the + name a default of /dev/drbdminor + will be used. + + Udev will create additional symlinks in /dev/drbd/by-res and + /dev/drbd/by-disk. + - - - + + - drbd.confdisk - DRBD uses this block device to actually store and retrieve the data. - Never access such a device while DRBD is running on top of it. This - holds also true for dumpe2fs8 and similar commands. - + + drbd.conf + + disk + DRBD uses this block device to actually store and retrieve the data. + Never access such a device while DRBD is running on top of it. This also holds true + for + dumpe2fs + + 8 + and similar commands. + - - - + + - drbd.confaddress - A resource needs one IP address per device, - which is used to wait for incoming connections from the partner device - respectively to reach the partner device. AF - must be one of , , - or . 
- (For compatibility reasons is an alias for ) - It may be ommited for IPv4 addresses. The actual IPv6 address that follows - the keyword must be placed inside brackets: - ipv6 [fd01:2345:6789:abcd::1]:7800. - - Each DRBD resource needs a TCP port - which is used to connect to the node's partner device. - Two different DRBD resources may not use the same - addr:port combination on the same node. - + + drbd.conf + + address + A resource needs one IP address per device, + which is used to wait for incoming connections from the partner device respectively to + reach the partner device. AF must be one of + , , or + (for compatibility reasons is an alias for + ). It may be omited for IPv4 addresses. The actual IPv6 address + that follows the keyword must be placed inside brackets: + ipv6 [fd01:2345:6789:abcd::1]:7800. + + Each DRBD resource needs a TCP port which is used to + connect to the node's partner device. Two different DRBD resources may not use the + same addr:port combination on the same node. + - - - - - - - - - - - - + + + + + + - drbd.confmeta-diskdrbd.confflexible-meta-disk - Internal means that the last part of the backing device is used to store - the meta-data. You must not use [index] with - internal. Note: Regardless of whether you use the or - the keyword, it will always be of - the size needed for the remaining storage size. - - You can use a single block device to store - meta-data of multiple DRBD devices. - E.g. use meta-disk /dev/sde6[0]; and meta-disk /dev/sde6[1]; - for two different resources. In this case the meta-disk - would need to be at least 256 MB in size. - - With the keyword you specify - a block device as meta-data storage. You usually use this with LVM, - which allows you to have many variable sized block devices. - The required size of the meta-disk block device is - 36kB + Backing-Storage-size / 32k. Round this number to the next 4kb - boundary up and you have the exact size. 
- Rule of the thumb: 32kByte per 1GByte of storage, round up to the next - MB. + + drbd.conf + + meta-disk + Internal means that the last part of the backing device is used to + store the meta-data. The size of the meta-data is computed based on the size of the + device. + + When a device is specified, either with or without an + index, DRBD stores the meta-data on this device. Without + index, the size of the meta-data is determined by the size + of the data device. This is usually used with LVM, which allows to have many variable + sized block devices. The meta-data size is 36kB + Backing-Storage-size / 32k, rounded up + to the next 4kb boundary. (Rule of the thumb: 32kByte per 1GByte of storage, rounded up + to the next MB.) + + When an index is specified, each index number refers to + a fixed slot of meta-data of 128 MB, which allows a maximum data size of 4 GB. This way, + multiple DBRD devices can share the same meta-data device. For example, if /dev/sde6[0] + and /dev/sde6[1] are used, /dev/sde6 must be at least 256 MB big. Because of the hard size + limit, use of meta-disk indexes is discouraged. + - - - + + - drbd.confon-io-errorhandler is taken, if the lower level - device reports io-error to the upper layers. - - handler may be pass_on, call-local-io-error - or detach. - - pass_on: Report the io-error to the upper layers. On Primary report - it to the mounted file system. On Secondary ignore it. - call-local-io-error: Call the handler script - . - detach: The node drops its low level device, and continues in diskless mode. + + drbd.conf + + on-io-error + handler is taken, if the lower level device + reports io-errors to the upper layers. + + handler may be , + or + + : The node downgrades the disk status to inconsistent, marks the + erroneous block as inconsistent in the bitmap and retries the IO on the remote node. + + : Call the handler script + . + + : The node drops its low level device, and continues in + diskless mode. 
+ - - - + + - drbd.conffencing - Under we understand preventive - measures to avoid situations where both nodes are primary - and disconnected (AKA split brain). - + + drbd.conf + + fencing + By we understand preventive measures to avoid + situations where both nodes are primary and disconnected (AKA split brain). + Valid fencing policies are: + - - - + + - This is the default policy. No fencing actions are undertaken. - + This is the default policy. No fencing actions are taken. + - - - + + - If a node becomes a disconnected primary, it tries to fence - the peer's disk. This is done by calling the fence-peer - handler. The handler is supposed to reach the other node over - alternative communication paths and call 'drbdadm outdate - res' there. - + If a node becomes a disconnected primary, it tries to fence the peer's + disk. This is done by calling the handler. The + handler is supposed to reach the other node over alternative communication paths + and call '' there. + - - - + + - If a node becomes a disconnected primary, it freezes all - its IO operations and calls its fence-peer handler. The - fence-peer handler is supposed to reach the peer over - alternative communication paths and call 'drbdadm outdate - res' there. In case it cannot reach the peer it should - stonith the peer. IO is resumed as soon as the situation - is resolved. In case your handler fails, you can resume - IO with the command. - + If a node becomes a disconnected primary, it freezes all its IO operations + and calls its fence-peer handler. The fence-peer handler is supposed to reach + the peer over alternative communication paths and call 'drbdadm outdate res' + there. In case it cannot reach the peer it should stonith the peer. IO is + resumed as soon as the situation is resolved. In case your handler fails, you + can resume IO with the command. 
+ - - - - - - drbd.conf - use-bmbv - - In case the backing storage's driver has a merge_bvec_fn() function, - DRBD has to pretend that it can only process IO requests in - units not lager than 4kByte. (At time of writing the only known drivers which have such a function - are: md (software raid driver), dm (device mapper - LVM) and DRBD - itself) - To get best performance out of DRBD on top of software RAID (or any - other driver with a merge_bvec_fn() function) you might enable this - function, if you know for sure that the merge_bvec_fn() function will - deliver the same results on all nodes of your cluster. I.e. the - physical disks of the software RAID are of exactly the same - type. Use this option only if you know what you are - doing. - - - - - - - - - - - - - + + + + + + drbd.conf - no-disk-flushes + disk-barrier + drbd.conf - no-disk-flushes + + disk-flushes + drbd.conf - no-disk-flushes + disk-drain - DRBD has four implementations to express write-after-write dependencies to - its backing storage device. DRBD will use the first method that is - supported by the backing storage device and that is not disabled by the user. - - When selecting the method you should not only base your decision on the - measurable performance. In case your backing storage device has a volatile - write cache (plain disks, RAID of plain disks) you should use one - of the first two. In case your backing storage device has battery-backed - write cache you may go with option 3 or 4. Option 4 will deliver the - best performance such devices. - - Unfortunately device mapper (LVM) does not support barriers. - - The letter after "wo:" in /proc/drbd indicates with method is currently in - use for a device: b, f, d, n. The implementations: - + + DRBD has four implementations to express write-after-write dependencies to its + backing storage device. DRBD will use the first method that is supported by the + backing storage device and that is not disabled by the user. 
By default all three + methods are enabled. + + When selecting the method you should not only base your decision on the + measurable performance. In case your backing storage device has a volatile write cache + (plain disks, RAID of plain disks) you should use one of the first two. In case your + backing storage device has battery-backed write cache you may go with option 3. + Option 4 (disable everything, use "none") is dangerous + on most IO stacks, may result in write-reordering, and if so, + can theoretically be the reason for data corruption, or disturb + the DRBD protocol, causing spurious disconnect/reconnect cycles. + Do not use . + + Unfortunately device mapper (LVM) might not support barriers. + + The letter after "wo:" in /proc/drbd indicates with method is currently in use + for a device: , , , + . The implementations are: + barrier + - The first requirs that the driver of the - backing storage device support barriers (called 'tagged command queuing' in - SCSI and 'native command queuing' in SATA speak). The use of this - method can be disabled by the we option. - + The first requires that the driver of the backing storage device support + barriers (called 'tagged command queuing' in SCSI and 'native command queuing' + in SATA speak). The use of this method can be disabled by setting the + options to . + flush + - The second requires that the backing device support disk flushes (called - 'force unit access' in the drive vendors speak). The use of this method - can be disabled using the option. - + The second requires that the backing device support disk flushes (called + 'force unit access' in the drive vendors speak). The use of this method can be + disabled setting to . + drain + - The third method is simply to let write requests drain before - write requests of a new reordering domain are issued. That was the - only implementation before 8.0.9. You can prevent to use of this - method by using the option. 
- + The third method is simply to let write requests drain before write + requests of a new reordering domain are issued. This was the only implementation + before 8.0.9. + none + - The fourth method is to not express write-after-write dependencies to - the backing store at all. - + The fourth method is to not express write-after-write dependencies to + the backing store at all, by also specifying . + This is dangerous + on most IO stacks, may result in write-reordering, and if so, + can theoretically be the reason for data corruption, or disturb + the DRBD protocol, causing spurious disconnect/reconnect cycles. + Do not use . + - - - + + drbd.conf - no-md-flushes + + md-flushes - Disables the use of disk flushes and barrier BIOs when accessing - the meta data device. See the notes on . - + + Disables the use of disk flushes and barrier BIOs when accessing the meta data + device. See the notes on . + - - - + + drbd.conf + max-bio-bvecs - In some special circumstances the device mapper stack manages to - pass BIOs to DRBD that violate the constraints that are set forth - by DRBD's merge_bvec() function and which have more than one bvec. - A known example is: - phys-disk -> DRBD -> LVM -> Xen -> missaligned partition (63) -> DomU FS. - Then you might see "bio would need to, but cannot, be split:" in - the Dom0's kernel log. - The best workaround is to proper align the partition within - the VM (E.g. start it at sector 1024). Costs 480 KiByte of storage. - Unfortunately the default of most Linux partitioning tools is - to start the first partition at an odd number (63). Therefore - most distribution's install helpers for virtual linux machines will - end up with missaligned partitions. - The second best workaround is to limit DRBD's max bvecs per BIO - (= max-bio-bvecs) to 1. Might cost performance. - The default value of is 0, which means that - there is no user imposed limitation. 
- + + In some special circumstances the device mapper stack manages to pass BIOs to + DRBD that violate the constraints that are set forth by DRBD's merge_bvec() function + and which have more than one bvec. A known example is: phys-disk -> DRBD -> LVM + -> Xen -> misaligned partition (63) -> DomU FS. Then you might see "bio would + need to, but cannot, be split:" in the Dom0's kernel log. + + The best workaround is to proper align the partition within the VM (E.g. start + it at sector 1024). This costs 480 KiB of storage. Unfortunately the default of most + Linux partitioning tools is to start the first partition at an odd number (63). + Therefore most distribution's install helpers for virtual linux machines will end up + with misaligned partitions. The second best workaround is to limit DRBD's max bvecs + per BIO (= ) to 1, but that might cost + performance. + + The default value of is 0, which means that there + is no user imposed limitation. + - + - drbd.confsndbuf-size size is the size of the TCP socket send buffer. - The default value is 0, i.e. autotune. You can specify smaller or larger values. Larger values - are appropriate for reasonable write throughput with protocol A over high - latency networks. Very large values like 1M may cause problems. Also values - below 32K do not make much sense. Since 8.0.13 resp. 8.2.7, setting the size - value to 0 means that the kernel should autotune this. - + + drbd.conf + disk-timeout + + + If the driver of the lower_device + does not finish an IO request within disk_timeout, + DRBD considers the disk as failed. If DRBD is connected to a remote host, + it will reissue local pending IO requests to the peer, and ship all new + IO requests to the peer only. The disk state advances to diskless, as soon + as the backing block device has finished all IO requests. + The default value of is 0, which means that no timeout is enforced. + The default unit is 100ms. This option is available since 8.3.12. 
+ + - + - drbd.confrcvbuf-size size is the size of the TCP socket receive buffer. - The default value is 0, i.e. autotune. You can specify smaller or larger values. - Usually this should be left at its default. Setting the size - value to 0 means that the kernel should autotune this. - + + drbd.conf + read-balancing + + + The supported methods for load balancing of + read requests are , , + , + , , + , , + , + and . + The default value of is . + This option is available since 8.4.1. + + - - - + + - drbd.conftimeout -If the partner node fails to send an expected response packet within -time 10ths -of a second, the partner node -is considered dead and therefore the TCP/IP connection is abandoned. This must be lower than connect-int and ping-int. -The default value is 60 = 6 seconds, the unit 0.1 seconds. - + + drbd.conf + + sndbuf-size + size is the size of the TCP socket send + buffer. The default value is 0, i.e. autotune. You can specify smaller or larger + values. Larger values are appropriate for reasonable write throughput with protocol A + over high latency networks. Values below 32K do not make sense. Since 8.0.13 resp. + 8.2.7, setting the size value to 0 means that the kernel + should autotune this. + - - - + + - drbd.confconnect-int -In case it is not possible to connect to the remote DRBD device immediately, -DRBD keeps on trying to connect. With this option you can set the time -between two tries. The default value is 10 seconds, the unit is 1 second. - + + drbd.conf + + rcvbuf-size + size is the size of the TCP socket receive + buffer. The default value is 0, i.e. autotune. You can specify smaller or larger + values. Usually this should be left at its default. Setting the + size value to 0 means that the kernel should autotune + this. + - - - + + - drbd.confping-int -If the TCP/IP connection linking a DRBD device pair is idle for more than -time seconds, DRBD will generate a keep-alive -packet to check if its partner is still alive. 
The default is 10 seconds, -the unit is 1 second. - + + drbd.conf + + timeout + If the partner node fails to send an expected response packet within + time tenths of a second, the partner node is considered + dead and therefore the TCP/IP connection is abandoned. This must be lower than + connect-int and ping-int. The + default value is 60 = 6 seconds, the unit 0.1 seconds. + - - - + + - drbd.confping-timeout - The time the peer has time to answer to a keep-alive packet. In case - the peer's reply is not received within this time period, it is - considered as dead. The default value is 500ms, the default unit is 100ms. - + + drbd.conf + + connect-int + In case it is not possible to connect to the remote DRBD device + immediately, DRBD keeps on trying to connect. With this option you can set the time + between two retries. The default value is 10 seconds, the unit is 1 second. + - - - + + - drbd.confmax-buffers - Maximum number of requests to be allocated by DRBD. Unit is PAGE_SIZE, - which is 4 KB on most systems. - The minimum is hard coded to 32 (=128 KB). - For high-performance installations it might help, if you - increase that number. These buffers are used to hold - data blocks while they are written to disk. - + + drbd.conf + + ping-int + If the TCP/IP connection linking a DRBD device pair is idle for more + than time seconds, DRBD will generate a keep-alive packet + to check if its partner is still alive. The default is 10 seconds, the unit is 1 + second. + - - - + + - drbd.confko-count - In case the secondary node fails to complete a single write - request for count times the - timeout, it is expelled from the - cluster. (I.e. the primary node goes into StandAlone mode.) - The default value is 0, which disables this feature. - + + drbd.conf + + ping-timeout + The time the peer has time to answer to a keep-alive packet. In case + the peer's reply is not received within this time period, it is considered as dead. 
+ The default value is 500ms, the default unit are tenths of a second. + - - - + + - drbd.confmax-epoch-size - The highest number of data blocks between two write barriers. - If you set this smaller than 10, you might decrease your performance. - + + drbd.conf + + max-buffers + Maximum number of requests to be allocated by DRBD. Unit is PAGE_SIZE, + which is 4 KiB on most systems. The minimum is hard coded to 32 (=128 KiB). For + high-performance installations it might help if you increase that number. These + buffers are used to hold data blocks while they are written to disk. + - - - + + - drbd.confallow-two-primaries - With this option set you may assign primary role to both nodes. You only should - use this option if you use a shared storage file system on top of - DRBD. At the time of writing the only ones are: OCFS2 and GFS. If you - use this option with any other file system, you are going to crash your - nodes and to corrupt your data! - + + drbd.conf + + ko-count + In case the secondary node fails to complete a single write request for + count times the timeout, it is + expelled from the cluster. (I.e. the primary node goes into + mode.) The default value is 0, which disables this + feature. + - - - + + + + + drbd.conf + + max-epoch-size + The highest number of data blocks between two write barriers. If you + set this smaller than 10, you might decrease your performance. + + + + + + + + + drbd.conf + + allow-two-primaries + With this option set you may assign the primary role to both nodes. You + only should use this option if you use a shared storage file system on top of DRBD. At + the time of writing the only ones are: OCFS2 and GFS. If you use this option with any + other file system, you are going to crash your nodes and to corrupt your data! 
+ + + + + + drbd.conf - unplug-watermark + + unplug-watermark - When the number of pending write requests on the standby - (secondary) node exceeds the unplug-watermark, we trigger - the request processing of our backing storage device. - Some storage controllers deliver better performance with small - values, others deliver best performance when the value is set to - the same value as max-buffers. Minimum 16, default 128, maximum - 131072. - + + When the number of pending write requests on the standby (secondary) node + exceeds the , we trigger the request processing of + our backing storage device. Some storage controllers deliver better performance with + small values, others deliver best performance when the value is set to the same value + as max-buffers. Minimum 16, default 128, maximum 131072. + - - - + + - drbd.confcram-hmac-alg - You need to specify the HMAC algorithm to enable peer authentication - at all. You are strongly encouraged to use peer authentication. The HMAC - algorithm will be used for the challenge response authentication - of the peer. You may specify any digest algorithm that is named in - /proc/crypto. - + + drbd.conf + + cram-hmac-alg + You need to specify the HMAC algorithm to enable peer authentication at + all. You are strongly encouraged to use peer authentication. The HMAC algorithm will + be used for the challenge response authentication of the peer. You may specify any + digest algorithm that is named in . + - - - + + - drbd.confshared-secret - The shared secret used in peer authentication. May be up to 64 characters. - Note that peer authentication is disabled as long as no cram-hmac-alg - (see above) is specified. - + + drbd.conf + + shared-secret + The shared secret used in peer authentication. May be up to 64 + characters. Note that peer authentication is disabled as long as no + (see above) is specified. 
+ - - - policy - + policy + drbd.conf - after-sb-0pri + + after-sb-0pri - possible policies are: - + + possible policies are: + - - - + + - No automatic resynchronization, simply disconnect. - + No automatic resynchronization, simply disconnect. + - - - + + - Auto sync from the node that was primary before the split-brain situation happened. - + Auto sync from the node that was primary before the split-brain situation + happened. + - - - + + - Auto sync from the node that became primary as second during - the split-brain situation. - + Auto sync from the node that became primary as second during the + split-brain situation. + - - - + + - In case one node did not write anything since the split - brain became evident, sync from the node that wrote something - to the node that did not write anything. In case none wrote - anything this policy uses a random decision to perform - a "resync" of 0 blocks. In case both have written something - this policy disconnects the nodes. - + In case one node did not write anything since the split brain became + evident, sync from the node that wrote something to the node that did not write + anything. In case none wrote anything this policy uses a random decision to + perform a "resync" of 0 blocks. In case both have written something this policy + disconnects the nodes. + - - - + + - Auto sync from the node that touched more blocks during the - split brain situation. - + Auto sync from the node that touched more blocks during the split brain + situation. + - - - + + - Auto sync to the named node. - + Auto sync to the named node. + - - - policy - + policy + drbd.conf - after-sb-1pri + + after-sb-1pri - possible policies are: - + + possible policies are: + - - - + + - No automatic resynchronization, simply disconnect. - + No automatic resynchronization, simply disconnect. + - - - + + - Discard the version of the secondary if the outcome - of the algorithm would also - destroy the current secondary's data. Otherwise disconnect. 
- + Discard the version of the secondary if the outcome of the + algorithm would also destroy the current + secondary's data. Otherwise disconnect. + - - - + + - Always take the decision of the - algorithm. Even if that causes an erratic change of - the primary's view of the data. This is only useful if - you use a 1node FS (i.e. not OCFS2 or GFS) with the - allow-two-primaries flag, _AND_ if you really know what you - are doing. This is DANGEROUS and MAY CRASH YOUR MACHINE - if you have an FS mounted on the primary node. - + Always take the decision of the algorithm, + even if that causes an erratic change of the primary's view of the data. This is + only useful if you use a one-node FS (i.e. not OCFS2 or GFS) with the + flag, AND if you + really know what you are doing. This is DANGEROUS and MAY CRASH YOUR + MACHINE if you have an FS mounted on the primary node. + - - - + + - Discard the secondary's version. - + Discard the secondary's version. + - - - + + - Always honor the outcome of the algorithm. In case it decides the current - secondary has the right data, it calls the "pri-lost-after-sb" - handler on the current primary. - + Always honor the outcome of the algorithm. + In case it decides the current secondary has the right data, it calls the + "pri-lost-after-sb" handler on the current primary. + - - - policy - + policy + drbd.conf - after-sb-2pri + + after-sb-2pri - possible policies are: - + + possible policies are: + - - - + + - No automatic resynchronization, simply disconnect. - + No automatic resynchronization, simply disconnect. + - - - + + - Always take the decision of the - algorithm. Even if that causes an erratic change of - the primary's view of the data. This is only useful if - you use a 1node FS (i.e. not OCFS2 or GFS) with the - allow-two-primaries flag, _AND_ if you really know what you - are doing. This is DANGEROUS and MAY CRASH YOUR MACHINE - if you have an FS mounted on the primary node. 
- + Always take the decision of the algorithm, + even if that causes an erratic change of the primary's view of the data. This is + only useful if you use a one-node FS (i.e. not OCFS2 or GFS) with the + flag, AND if you + really know what you are doing. This is DANGEROUS and MAY CRASH YOUR + MACHINE if you have an FS mounted on the primary node. + - - - + + - Call the "pri-lost-after-sb" helper program on one of the - machines. This program is expected to reboot the - machine, i.e. make it secondary. - + Call the "pri-lost-after-sb" helper program on one of the machines. This + program is expected to reboot the machine, i.e. make it secondary. + - - - + + - Normally the automatic after-split-brain policies are only - used if current states of the UUIDs do not indicate the - presence of a third node. - - With this option you request that the automatic - after-split-brain policies are used as long as the data - sets of the nodes are somehow related. This might cause - a full sync, if the UUIDs indicate the presence of a third - node. (Or double faults led to strange UUID sets.) - + Normally the automatic after-split-brain policies are only used if current + states of the UUIDs do not indicate the presence of a third node. + + With this option you request that the automatic after-split-brain policies are + used as long as the data sets of the nodes are somehow related. This might cause a + full sync, if the UUIDs indicate the presence of a third node. (Or double faults led + to strange UUID sets.) + - - - policy - + policy + drbd.conf - rr-conflict + + rr-conflict - To solve the cases when the outcome of the resync decision is - incompatible with the current role assignment in the cluster. - + + This option helps to solve the cases when the outcome of the resync decision is + incompatible with the current role assignment in the cluster. + - - - + + - No automatic resynchronization, simply disconnect. - + No automatic resynchronization, simply disconnect. 
+ - - - + + - Sync to the primary node is allowed, violating the - assumption that data on a block device are stable for one - of the nodes. Dangerous, do not use. - + Sync to the primary node is allowed, violating the assumption that data on + a block device are stable for one of the nodes. Dangerous, do not + use. + - - - + + - Call the "pri-lost" helper program on one of the - machines. This program is expected to reboot the - machine, i.e. make it secondary. - + Call the "pri-lost" helper program on one of the machines. This program is + expected to reboot the machine, i.e. make it secondary. + - - - alg - + alg + drbd.conf + data-integrity-alg - DRBD can ensure the data integrity of the user's data on the network - by comparing hash values. Normally this is ensured by the 16 bit checksums - in the headers of TCP/IP packets. - This option can be set to any of the kernel's data digest algorithms. - In a typical kernel configuration you should have - at least one of , , and - available. By default this is not enabled. + + DRBD can ensure the data integrity of the user's data on the network by + comparing hash values. Normally this is ensured by the 16 bit checksums in the headers + of TCP/IP packets. + + This option can be set to any of the kernel's data digest algorithms. In a + typical kernel configuration you should have at least one of , + , and available. By default this is not + enabled. + See also the notes on data integrity. + - - - + + drbd.conf - no-tcp-cork + + tcp-cork - DRBD usually uses the TCP socket option TCP_CORK to hint to the network - stack when it can expect more data, and when it should flush out what it - has in its send queue. It turned out that there is at lease one network - stack that performs worse when one uses this hinting method. Therefore - we introducted this option, which disable the setting and clearing of - the TCP_CORK socket option by DRBD. 
+ + DRBD usually uses the TCP socket option TCP_CORK to hint to the network stack + when it can expect more data, and when it should flush out what it has in its send + queue. It turned out that there is at least one network stack that performs worse when + one uses this hinting method. Therefore we introducted this option. By setting + to you can disable the setting and + clearing of the TCP_CORK socket option by DRBD. + - - - + + + + + + - Wait for connection timeout. - drbd.confwfc-timeout - The init script drbd8 blocks the boot process - until the DRBD resources are connected. - When the cluster manager starts later, - it does not see a resource with internal split-brain. - In case you want to limit the wait time, do it here. - Default is 0, which means unlimited. The unit is seconds. - + By default DRBD blocks when the available TCP send queue becomes full. That + means it will slow down the application that generates the write requests that cause + DRBD to send more data down that TCP connection. + + When DRBD is deployed with DRBD-proxy it might be more desirable that DRBD goes + into AHEAD/BEHIND mode shortly before the send queue becomes full. In AHEAD/BEHIND + mode DRBD does no longer replicate data, but still keeps the connection open. + + The advantage of the AHEAD/BEHIND mode is that the application is not slowed + down, even if DRBD-proxy's buffer is not sufficient to buffer all write requests. The + downside is that the peer node falls behind, and that a resync will be necessary to + bring it back into sync. During that resync the peer node will have an inconsistent + disk. + + Available congestion_policys are + and . The default is + . Fill_threshold might be in the + range of 0 to 10GiBytes. The default is 0 which disables the check. + Active_extents_threshold has the same limits as + . + + The AHEAD/BEHIND mode and its settings are available since DRBD 8.3.10. + - - - + + + + Wait for connection timeout. 
+ drbd.conf + + wfc-timeout + The init script + drbd + + 8 + blocks the boot process until the DRBD resources are connected. When + the cluster manager starts later, it does not see a resource with internal + split-brain. In case you want to limit the wait time, do it here. Default is 0, which + means unlimited. The unit is seconds. + + + + + + - drbd.confdegr-wfc-timeout - Wait for connection timeout, if this node was a degraded cluster. - In case a degraded cluster (= cluster with only one node left) - is rebooted, this timeout value is used instead of wfc-timeout, - because the peer is less likely to show up in time, - if it had been dead before. Value 0 means unlimited. - + + drbd.conf + + degr-wfc-timeout + Wait for connection timeout, if this node was a degraded cluster. In + case a degraded cluster (= cluster with only one node left) is rebooted, this timeout + value is used instead of wfc-timeout, because the peer is less likely to show up in + time, if it had been dead before. Value 0 means unlimited. + - - - + + - drbd.confoutdated-wfc-timeout - Wait for connection timeout, if the peer was outdated. - In case a degraded cluster (= cluster with only one node left) - with an outdated peer disk is rebooted, this timeout value is used instead of wfc-timeout, - because the peer is not allowed to become primary in the meantime. - Value 0 means unlimited. - + + drbd.conf + + outdated-wfc-timeout + Wait for connection timeout, if the peer was outdated. In case a + degraded cluster (= cluster with only one node left) with an outdated peer disk is + rebooted, this timeout value is used instead of wfc-timeout, because the peer is not + allowed to become primary in the meantime. Value 0 means unlimited. + - - - + + - By setting this option you can make the init script to continue - to wait even if the device pair had a split brain situation - and therefore refuses to connect. 
- + By setting this option you can make the init script to continue to wait even if + the device pair had a split brain situation and therefore refuses to connect. + - - - + + - Sets on which node the device should be promoted to primary role by - the init script. The node-name might either - be a host name or the key word . When this option is - not set the devices stay in secondary role on both nodes. Usually - one delegates the role assignment to a cluster manager (e.g. heartbeat). - + Sets on which node the device should be promoted to primary role by the init + script. The node-name might either be a host name or the + keyword . When this option is not set the devices stay in + secondary role on both nodes. Usually one delegates the role assignment to a cluster + manager (e.g. heartbeat). + - - - + + - Usually and are - ignored for stacked devices, instead twice the amount of - is used for the connection timeouts. - With the keyword you disable this, and force - DRBD to mind the and - statements. Only do that if the peer of the stacked resource is usually not - available or will not become primary usually. - By using this option incorrectly, you run the risk of causing unexpected split brain. - + Usually and are + ignored for stacked devices, instead twice the amount of + is used for the connection timeouts. With the + keyword you disable this, and force DRBD to mind the and + statements. Only do that if the peer of the stacked + resource is usually not available or will usually not become primary. By using this + option incorrectly, you run the risk of causing unexpected split brain. + - - - + + - drbd.confrate - To ensure a smooth operation of the application on top of DRBD, - it is possible to limit the bandwidth which may be used by - background synchronizations. The default is 250 KB/sec, the - default unit is KB/sec. Optional suffixes K, M, G are allowed. 
- + + drbd.conf + + resync-rate + To ensure a smooth operation of the application on top of DRBD, it is + possible to limit the bandwidth which may be used by background synchronizations. The + default is 250 KB/sec, the default unit is KB/sec. Optional suffixes K, M, G are + allowed. + - - - + + - drbd.confafter - By default, resynchronization of all devices would run in parallel. - By defining a sync-after dependency, the resynchronization of this - resource will start only if the resource res-name - is already in connected state (= finished its resynchronization). - + + drbd.conf + + use-rle + During resync-handshake, the dirty-bitmaps of the nodes are exchanged + and merged (using bit-or), so the nodes will have the same understanding of which + blocks are dirty. On large devices, the fine grained dirty-bitmap can become large as + well, and the bitmap exchange can take quite some time on low-bandwidth links. + + Because the bitmap typically contains compact areas where all bits are unset + (clean) or set (dirty), a simple run-length encoding scheme can considerably reduce + the network traffic necessary for the bitmap exchange. + + For backward compatibilty reasons, and because on fast links this possibly does + not improve transfer time but consumes cpu cycles, this defaults to off. + - - - + + - drbd.confal-extents - DRBD automatically performs hot area detection. With this - parameter you control how big the hot area (= active set) can - get. Each extent marks 4M of the backing storage (= low-level device). - In case a primary node leaves the cluster unexpectedly, the areas covered - by the active set must be resynced upon rejoining of the failed - node. The data structure is stored in the meta-data area, therefore each - change of the active set is a write operation - to the meta-data device. A higher number of extents gives - longer resync times but less updates to the meta-data. The - default number of extents is - 127. 
(Minimum: 7, Maximum: 3843) - + + drbd.conf + + resync-after + By default, resynchronization of all devices would run in parallel. By + defining a resync-after dependency, the resynchronization of this resource will start + only if the resource res-name is already in connected state + (i.e., has finished its resynchronization). + - - - + + - During online verification (as initiated by the - verify sub-command), - rather than doing a bit-wise comparison, DRBD applies a hash function - to the contents of every block being verified, and compares that - hash with the peer. This option defines the hash algorithm being - used for that purpose. It can be set to any of the kernel's data - digest algorithms. In a typical kernel configuration you should have - at least one of , , and - available. By default this is not enabled; you must set this - option explicitly in order to be able to use on-line device verification. + + drbd.conf + + al-extents + DRBD automatically performs hot area detection. With this parameter you + control how big the hot area (= active set) can get. Each extent marks 4M of the + backing storage (= low-level device). In case a primary node leaves the cluster + unexpectedly, the areas covered by the active set must be resynced upon rejoining of + the failed node. The data structure is stored in the meta-data area, therefore each + change of the active set is a write operation to the meta-data device. A higher number + of extents gives longer resync times but less updates to the meta-data. The default + number of extents is 127. (Minimum: 7, Maximum: + 3843) + + + + + + + + During online verification (as initiated by the verify sub-command), rather than doing a bit-wise + comparison, DRBD applies a hash function to the contents of every block being + verified, and compares that hash with the peer. This option defines the hash algorithm + being used for that purpose. It can be set to any of the kernel's data digest + algorithms. 
In a typical kernel configuration you should have at least one of + , , and available. By + default this is not enabled; you must set this option explicitly in order to be able + to use on-line device verification. + See also the notes on data integrity. + - - - + + - A resync process sends all marked data blocks form the source to - the destination node, as long as no is - given. When one is specified the resync process exchanges hash values of all - marked blocks first, and sends only those data blocks over, that have different - hash values. - This setting is useful for DRBD setups with low bandwidth links. - During the restart of a crashed primary node, all blocks covered by the - activity log are marked for resync. But a large part of those will actually - be still in sync, therefore using will lower - the required bandwidth in exchange for CPU cycles. + A resync process sends all marked data blocks from the source to the destination + node, as long as no is given. When one is specified the + resync process exchanges hash values of all marked blocks first, and sends only those + data blocks that have different hash values. + + This setting is useful for DRBD setups with low bandwidth links. During the + restart of a crashed primary node, all blocks covered by the activity log are marked + for resync. But a large part of those will actually be still in sync, therefore using + will lower the required bandwidth in exchange for CPU + cycles. + - - - + + + + + + + + - drbd.confcpu-mask - Sets the cpu-affinity-mask for DRBD's kernel threads of this device. The - default value of cpu-mask is 0, which means - that DRBD's kernel threads should be spread over all CPUs of the machine. - This value must be given in hexadecimal notation. If it is too big it will - be truncated. - + The dynamic resync speed controller gets enabled with setting + plan_time to a positive value. 
It aims to fill the buffers + along the data path with either a constant amount of data + fill_target, or aims to have a constant delay time of + delay_target along the path. The controller has an upper + bound of max_rate. + + By plan_time the agility of the controller is + configured. Higher values yield for slower/lower responses of the controller to + deviation from the target value. It should be at least 5 times RTT. For regular data + paths a fill_target in the area of 4k to 100k is + appropriate. For a setup that contains drbd-proxy it is advisable to use + delay_target instead. Only when + fill_target is set to 0 the controller will use + delay_target. 5 times RTT is a reasonable starting value. + Max_rate should be set to the bandwidth available between + the DRBD-hosts and the machines hosting DRBD-proxy, or to the available + disk-bandwidth. + + The default value of plan_time is 0, the default unit + is 0.1 seconds. Fill_target has 0 and sectors as default + unit. Delay_target has 1 (100ms) and 0.1 as default unit. + Max_rate has 10240 (100MiB/s) and KiB/s as default + unit. + + The dynamic resync speed controller and its settings are available since DRBD + 8.3.9. + - - - + + - drbd.confpri-on-incon-degr - This handler is called if the node is primary, degraded - and if the local copy of the data is inconsistent. + A node that is primary and sync-source has to schedule application IO requests + and resync IO requests. The min_rate tells DRBD use only up + to min_rate for resync IO and to dedicate all other available IO bandwidth to + application requests. + + Note: The value 0 has a special meaning. It disables the limitation of resync IO + completely, which might slow down application IO considerably. Set it to a value of 1, + if you prefer that resync IO never slows down application IO. + + Note: Although the name might suggest that it is a lower bound for the dynamic + resync speed controller, it is not. 
If the DRBD-proxy buffer is full, the dynamic + resync speed controller is free to lower the resync speed down to 0, completely + independent of the setting. + + Min_rate has 4096 (4MiB/s) and KiB/s as default + unit. + - - - + + - drbd.confpri-lost-after-sb - The node is currently primary, but lost the after split - brain auto recovery procedure. As as consequence, it should be abandoned. - + This setting controls what happens to IO requests on a degraded, disk less node + (I.e. no data store is reachable). The available policies are + and . + + If ond-policy is set to + you can either resume IO by attaching/connecting the last lost data storage, or by the + drbdadm resume-io res + command. The latter will result in IO errors of course. + + The default is . This setting is available since DRBD + 8.3.9. + - - - + + - drbd.confpri-lost - The node is currently primary, but DRBD's algorithm - thinks that it should become sync target. As a consequence it should - give up its primary role. - + + drbd.conf + + cpu-mask + Sets the cpu-affinity-mask for DRBD's kernel threads of this device. + The default value of cpu-mask is 0, which means that DRBD's + kernel threads should be spread over all CPUs of the machine. This value must be given + in hexadecimal notation. If it is too big it will be truncated. + - - - + + - drbd.conffence-peer - The handler is part of the - mechanism. This handler is called in case the node needs to fence the - peer's disk. It should use other communication paths than DRBD's network - link. + + drbd.conf + + pri-on-incon-degr + This handler is called if the node is primary, degraded and if the + local copy of the data is inconsistent. + - - - + + - drbd.conflocal-io-error - DRBD got an IO error from the local IO subsystem. - + + drbd.conf + + pri-lost-after-sb + The node is currently primary, but lost the after-split-brain auto + recovery procedure. As as consequence, it should be abandoned. 
+ - - - + + - drbd.confsplit-brain - DRBD detected a split brain situation. Manual recovery is necessary. - This handler should alert someone on duty. - + + drbd.conf + + pri-lost + The node is currently primary, but DRBD's algorithm thinks that it + should become sync target. As a consequence it should give up its primary role. + - - - + + - drbd.confbefore-resync-target - DRBD calls this handler just before a resync beginns on the node - that becomes resync target. It might be used to take a snapshot of the - backing block device. - + + drbd.conf + + fence-peer + The handler is part of the mechanism. This + handler is called in case the node needs to fence the peer's disk. It should use other + communication paths than DRBD's network link. + - - - + + - drbd.confafter-resync-target - DRBD calls this handler just after a resync operation finished on the - node which's disk just became consistent after beeing inconsistent for the - duration of the resync. It might be used to remove a snapshot of the backing device - that was created by the handler. - + + drbd.conf + + local-io-error + DRBD got an IO error from the local IO subsystem. + + + + + + + + + drbd.conf + + initial-split-brain + DRBD has connected and detected a split brain situation. This handler + can alert someone in all cases of split brain, not just those that go + unresolved. + + + + + + + + + drbd.conf + + split-brain + DRBD detected a split brain situation but remains unresolved. Manual + recovery is necessary. This handler should alert someone on duty. + + + + + + + + + drbd.conf + + before-resync-target + DRBD calls this handler just before a resync begins on the node that + becomes resync target. It might be used to take a snapshot of the backing block + device. + + + + + + + + + drbd.conf + + after-resync-target + DRBD calls this handler just after a resync operation finished on the + node whose disk just became consistent after being inconsistent for the duration of + the resync. 
It might be used to remove a snapshot of the backing device that was + created by the handler. + Other Keywords + - - - + + - drbd.confinclude - Include all files matching the wildcard pattern file-pattern. - The statement - is only allowed on the top level, i.e. it is not allowed inside any section. - + + drbd.conf + + include + Include all files matching the wildcard pattern + file-pattern. The statement is + only allowed on the top level, i.e. it is not allowed inside any section. + Notes on data integrity - There are two independent methods in DRBD to ensure the integrity of -the mirrored data. The online-verify mechanism and the -of the section. - Both mechanisms might deliver false positives if the user of DRBD modifies the -data which gets written to disk while the transfer goes on. Currently the swap code and -ReiserFS are known to do so. In both cases this is not a problem, because when the -initiator of the data transfer does this it already knows that that data block will -not be part of an on disk data structure. - The most recent (2007) example of systematically corruption was an -issue with the TCP offloading engine and the driver of a certain type -of GBit NIC. The actual corruption happened on the DMA transfer from -core memory to the card. Since the TCP checksum gets calculated on the card -this type of corruption stays undetected as long as you do not use -either the online or the data-integrity-alg. - We suggest to use the only during a -pre-production phase due to its CPU costs. Further we suggest to do online - runs regularly e.g. once a month during low load period. + + There are two independent methods in DRBD to ensure the integrity of the mirrored data. + The online-verify mechanism and the of the + section. + + Both mechanisms might deliver false positives if the user of DRBD modifies the data + which gets written to disk while the transfer goes on. 
This may happen for swap, or for + certain append while global sync, or truncate/rewrite workloads, and not necessarily poses a + problem for the integrity of the data. Usually when the initiator of the data transfer does + this, it already knows that that data block will not be part of an on disk data structure, or + will be resubmitted with correct data soon enough. + + The causes the receiving side to log an error about + "Digest integrity check FAILED: Ns +x\n", where N is the sector offset, and x is the size of + the request in bytes. It will then disconnect, and reconnect, thus causing a quick resync. If + the sending side at the same time detected a modification, it warns about "Digest mismatch, + buffer modified by upper layers during write: Ns +x\n", which shows that this was a false + positive. The sending side may detect these buffer modifications immediately after the + unmodified data has been copied to the tcp buffers, in which case the receiving side won't + notice it. + + The most recent (2007) example of systematic corruption was an issue with the TCP + offloading engine and the driver of a certain type of GBit NIC. The actual corruption happened + on the DMA transfer from core memory to the card. Since the TCP checksum gets calculated on + the card, this type of corruption stays undetected as long as you do not use either the online + or the . + + We suggest to use the only during a pre-production + phase due to its CPU costs. Further we suggest to do online runs + regularly e.g. once a month during a low load period. + Version - This document was revised for version 8.3.2 of the DRBD distribution. + + This document was revised for version 8.4.0 of the DRBD distribution. + Author - Written by Philipp Reisner philipp.reisner@linbit.com - and Lars Ellenberg lars.ellenberg@linbit.com. + + Written by Philipp Reisner philipp.reisner@linbit.com and Lars + Ellenberg lars.ellenberg@linbit.com. + Reporting Bugs + Report bugs to drbd-user@lists.linbit.com. 
+ Copyright - Copyright 2001-2008 LINBIT Information Technologies, -Philipp Reisner, Lars Ellenberg. This is free software; -see the source for copying conditions. There is NO warranty; -not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + + Copyright 2001-2008 LINBIT Information Technologies, Philipp Reisner, Lars Ellenberg. + This is free software; see the source for copying conditions. There is NO warranty; not even + for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See Also - drbd8, - drbddisk8, - drbdsetup8, - drbdadm8, - DRBD User's Guide, - DRBD web site + + + drbd + + 8 + , + drbddisk + + 8 + , + drbdsetup + + 8 + , + drbdadm + + 8 + , + DRBD User's Guide, + DRBD web site + diff -Nru drbd8-8.3.7/documentation/drbdadm.8 drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdadm.8 --- drbd8-8.3.7/documentation/drbdadm.8 2010-01-13 16:17:24.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdadm.8 2012-02-02 14:09:58.000000000 +0000 @@ -1,13 +1,22 @@ '\" t .\" Title: drbdadm .\" Author: [see the "Author" section] -.\" Generator: DocBook XSL Stylesheets v1.75.1 -.\" Date: 5 Dec 2008 +.\" Generator: DocBook XSL Stylesheets v1.75.2 +.\" Date: 6 May 2011 .\" Manual: System Administration -.\" Source: DRBD 8.3.2 +.\" Source: DRBD 8.4.0 .\" Language: English .\" -.TH "DRBDADM" "8" "5 Dec 2008" "DRBD 8.3.2" "System Administration" +.TH "DRBDADM" "8" "6 May 2011" "DRBD 8.4.0" "System Administration" +.\" ----------------------------------------------------------------- +.\" * Define some portability stuff +.\" ----------------------------------------------------------------- +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.\" http://bugs.debian.org/507673 +.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' .\" ----------------------------------------------------------------- .\" * set default 
formatting .\" ----------------------------------------------------------------- @@ -22,42 +31,97 @@ drbdadm \- Administration tool for DRBD .\" drbdadm .SH "SYNOPSIS" .HP \w'\fBdrbdadm\fR\ 'u -\fBdrbdadm\fR [\-d] [\-c\ {\fIfile\fR}] [\-t\ {\fIfile\fR}] [\-s\ {\fIcmd\fR}] [\-m\ {\fIcmd\fR}] [\-S] [\-h\ {\fIhost\fR}] {\fIcommand\fR} [all | \fIresource\fR...] +\fBdrbdadm\fR [\-d] [\-c\ {\fIfile\fR}] [\-t\ {\fIfile\fR}] [\-s\ {\fIcmd\fR}] [\-m\ {\fIcmd\fR}] [\-S] [\-h\ {\fIhost\fR}] [\-\-\ {\fIbackend\-options\fR}] {\fIcommand\fR} [{all} | {\fIresource\fR\fI[/volume>]\fR...}] .SH "DESCRIPTION" .PP -Drbdadm is the high level tool of the DRBD program suite\&. Drbdadm is to drbdsetup and drbdmeta what ifup/ifdown is to ifconfig\&. Drbdadm reads its configuration file and performs the specified commands by calling the drbdsetup and/or the drbdmeta program\&. +\fBDrbdadm\fR +is the high level tool of the DRBD program suite\&. +\fBDrbdadm\fR +is to +\fBdrbdsetup\fR +and +\fBdrbdmeta\fR +what +\fBifup\fR/\fBifdown\fR +is to +\fBifconfig\fR\&. +\fBDrbdadm\fR +reads its configuration file and performs the specified commands by calling the +\fBdrbdsetup\fR +and/or the +\fBdrbdmeta\fR +program\&. +.PP +\fBDrbdadm\fR +can operate on whole resources or on individual volumes in a resource\&. The sub commands: +\fBattach\fR, +\fBdetach\fR, +\fBprimary\fR, +\fBsecondary\fR, +\fBinvalidate\fR, +\fBinvalidate\-remote\fR, +\fBoutdate\fR, +\fBresize\fR, +\fBverify\fR, +\fBpause\-sync\fR, +\fBresume\-sync\fR, +\fBrole\fR, +\fBcsytate\fR, +\fBdstate\fR, +\fBcreate\-md\fR, +\fBshow\-gi\fR, +\fBget\-gi\fR, +\fBdump\-md\fR, +\fBwipe\-md\fR +work on whole resources and on individual volumes\&. +.PP +Resource level only commands are: +\fBconnect\fR, +\fBdisconnect\fR, +\fBup\fR, +\fBdown\fR, +\fBwait\-connect\fR +and +\fBdump\fR\&. .SH "OPTIONS" .PP \fB\-d\fR, \fB\-\-dry\-run\fR .RS 4 -Just prints the calls of drbdsetup to stdout, but does not run the commands\&. 
+Just prints the calls of +\fBdrbdsetup\fR +to stdout, but does not run the commands\&. .RE .PP -\fB\-c\fR, \fB\-\-config\-file\fR\fIfile\fR +\fB\-c\fR, \fB\-\-config\-file\fR \fIfile\fR .RS 4 Specifies the configuration file drbdadm will use\&. If this parameter is not specified, drbdadm will look for +\fB/etc/drbd\-84\&.conf\fR, \fB/etc/drbd\-83\&.conf\fR, \fB/etc/drbd\-08\&.conf\fR and \fB/etc/drbd\&.conf\fR\&. .RE .PP -\fB\-t\fR, \fB\-\-config\-to\-test\fR\fIfile\fR +\fB\-t\fR, \fB\-\-config\-to\-test\fR \fIfile\fR .RS 4 Specifies an additional configuration file drbdadm to check\&. This option is only allowed with the dump and the sh\-nop commands\&. .RE .PP -\fB\-s\fR, \fB\-\-drbdsetup\fR\fIfile\fR +\fB\-s\fR, \fB\-\-drbdsetup\fR \fIfile\fR .RS 4 -Specifies the full path to the drbdsetup program\&. If this option is omitted, drbdadm will look for +Specifies the full path to the +\fBdrbdsetup\fR +program\&. If this option is omitted, drbdadm will look for \fB/sbin/drbdsetup\fR and \fB\&./drbdsetup\fR\&. .RE .PP -\fB\-m\fR, \fB\-\-drbdmeta\fR\fIfile\fR +\fB\-m\fR, \fB\-\-drbdmeta\fR \fIfile\fR .RS 4 -Specifies the full path to the drbdmeta program\&. If this option is omitted, drbdadm will look for +Specifies the full path to the +\fBdrbdmeta\fR +program\&. If this option is omitted, drbdadm will look for \fB/sbin/drbdmeta\fR and \fB\&./drbdmeta\fR\&. @@ -72,23 +136,33 @@ .RS 4 Specifies to which peer node to connect\&. Only necessary if there are more than two host sections in the resource you are working on\&. .RE +.PP +\fB\-\-\fR \fIbackend\-options\fR +.RS 4 +All options following the double hyphen are considered +\fIbackend\-options\fR\&. These are passed through to the backend command\&. I\&.e\&. to +\fBdrbdsetup\fR, +\fBdrbdmeta\fR +or +\fBdrbd\-proxy\-ctl\fR\&. +.RE .SH "COMMANDS" .PP attach .RS 4 -Attaches a local backing block device to the DRBD resource\'s device\&. +Attaches a local backing block device to the DRBD resource\*(Aqs device\&.
.RE .PP detach .RS 4 .\" drbdadm: detach -Removes the backing storage device from a DRBD resource\'s device\&. +Removes the backing storage device from a DRBD resource\*(Aqs device\&. .RE .PP connect .RS 4 .\" drbdadm: connect -Sets up the network configuration of the resource\'s device\&. If the peer device is already configured, the two DRBD devices will connect\&. If there are more than two host sections in the resource you need to use the +Sets up the network configuration of the resource\*(Aqs device\&. If the peer device is already configured, the two DRBD devices will connect\&. If there are more than two host sections in the resource you need to use the \fB\-\-peer\fR option to select the peer you want to connect to\&. .RE @@ -120,13 +194,15 @@ primary .RS 4 .\" drbdadm: primary -Promote the resource\'s device into primary role\&. You need to do this before any access to the device, such as creating or mounting a file system\&. +Promote the resource\*(Aqs device into primary role\&. You need to do this before any access to the device, such as creating or mounting a file system\&. .RE .PP secondary .RS 4 .\" drbdadm: secondary -Brings the device back into secondary role\&. This is needed since in a connected DRBD device pair, only one of the two peers may have primary role (except if allow\-two\-primaries is explicitly set in the configuration file)\&. +Brings the device back into secondary role\&. This is needed since in a connected DRBD device pair, only one of the two peers may have primary role (except if +\fBallow\-two\-primaries\fR +is explicitly set in the configuration file)\&. .RE .PP invalidate @@ -134,31 +210,40 @@ .\" drbdadm: invalidate Forces DRBD to consider the data on the \fIlocal\fR -backing storage device as out\-of\-sync\&. Therefore DRBD will copy each and every block over from its peer, to bring the local storage device back in sync\&. +backing storage device as out\-of\-sync\&. 
Therefore DRBD will copy each and every block from its peer, to bring the local storage device back in sync\&. .RE .PP invalidate\-remote .RS 4 .\" drbdadm: invalidate-remote This command is similar to the invalidate command, however, the -\fIpeer\'s\fR +\fIpeer\*(Aqs\fR backing storage is invalidated and hence rewritten with the data of the local node\&. .RE .PP resize .RS 4 .\" drbdadm: resize -Causes DRBD to re\-examine all sizing constraints, and resize the resource\'s device accordingly\&. For example in case you increased the size of your backing storage devices (on both nodes of course), then DRBD will adopt to the new size after you called this command on one of your nodes\&. Since new storage space must be synchronised this command only works if there is at least one primary node present\&. +Causes DRBD to re\-examine all sizing constraints, and resize the resource\*(Aqs device accordingly\&. For example, if you increased the size of your backing storage devices (on both nodes, of course), then DRBD will adopt the new size after you called this command on one of your nodes\&. Since new storage space must be synchronised this command only works if there is at least one primary node present\&. .sp The \fB\-\-assume\-peer\-has\-space\fR -allows you to resize a device which is currently not connected to the peer\&. Use with care, since if you do not resize the peer\'s disk as well, further connect attempts of the two will fail\&. +allows you to resize a device which is currently not connected to the peer\&. Use with care, since if you do not resize the peer\*(Aqs disk as well, further connect attempts of the two will fail\&. +.RE +.PP +check\-resize +.RS 4 +.\" drbdadm: check-resize +Calls drbdmeta to eventually move internal meta data\&. If the backing device was resized, while DRBD was not running, meta data has to be moved to the end of the device, so that the next +\fBattach\fR +command can succeed\&. 
.RE .PP create\-md .RS 4 .\" drbdadm: create-md -Initializes the meta data storage\&. This needs to be done before a DRBD resource can be taken online for the first time\&. +Initializes the meta data storage\&. This needs to be done before a DRBD resource can be taken online for the first time\&. In case of issues with that command have a look at +\fBdrbdmeta\fR(8) .RE .PP get\-gi @@ -215,48 +300,22 @@ Shows the current connection state of the devices\&. .RE .PP -status -.RS 4 -.\" drbdadm: status -Shows the current status of all devices defined in the current config file, in xml\-like format\&. Example output: -.sp -.if n \{\ -.RS 4 -.\} -.nf - - - - - - - - -.fi -.if n \{\ -.RE -.\} -.sp -.RE -.PP dump .RS 4 .\" drbdadm: dump -Just parse the configuration file and dump it to stdout\&. May be used to check the configuration file for syntactical correctness\&. +Just parse the configuration file and dump it to stdout\&. May be used to check the configuration file for syntactic correctness\&. .RE .PP outdate .RS 4 .\" drbdadm: outdate -Used to mark the node\'s data as outdated\&. Usually used by the peer\'s fence\-peer handler\&. +Used to mark the node\*(Aqs data as outdated\&. Usually used by the peer\*(Aqs fence\-peer handler\&. .RE .PP verify .RS 4 .\" drbdadm: verify -Starts online verify\&. During online verify, data on both nodes is compared for inconsistency\&. See +Starts online verify\&. During online verify, data on both nodes is compared for equality\&. See /proc/drbd for online verify progress\&. If out\-of\-sync blocks are found, they are \fInot\fR @@ -272,7 +331,7 @@ pause\-sync .RS 4 .\" drbdadm: pause-sync -Temporarily suspend an ongoing resynchronization by setting the local pause flag\&. Resync only progresses if neither the local nor the remote pause flag is set\&. It might be desirable to postpone DRBD\'s resynchronization until after any resynchronization of the backing storage\'s RAID setup\&. 
+Temporarily suspend an ongoing resynchronization by setting the local pause flag\&. Resync only progresses if neither the local nor the remote pause flag is set\&. It might be desirable to postpone DRBD\*(Aqs resynchronization until after any resynchronization of the backing storage\*(Aqs RAID setup\&. .RE .PP resume\-sync @@ -286,7 +345,9 @@ .\" drbdadm: new-current-uuid Generates a new currend UUID and rotates all other UUID values\&. .sp -This can be used to shorten the initial resync of a cluster\&. See the drbdsetup manpage for a more details\&. +This can be used to shorten the initial resync of a cluster\&. See the +\fBdrbdsetup\fR +manpage for a more details\&. .RE .PP dstate @@ -301,7 +362,7 @@ .RE .SH "VERSION" .sp -This document was revised for version 8\&.3\&.2 of the DRBD distribution\&. +This document was revised for version 8\&.4\&.0 of the DRBD distribution\&. .SH "AUTHOR" .sp Written by Philipp Reisner philipp\&.reisner@linbit\&.com and Lars Ellenberg lars\&.ellenberg@linbit\&.com @@ -310,12 +371,16 @@ Report bugs to drbd\-user@lists\&.linbit\&.com\&. .SH "COPYRIGHT" .sp -Copyright 2001\-2008 LINBIT Information Technologies, Philipp Reisner, Lars Ellenberg\&. This is free software; see the source for copying conditions\&. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE\&. +Copyright 2001\-2011 LINBIT Information Technologies, Philipp Reisner, Lars Ellenberg\&. This is free software; see the source for copying conditions\&. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE\&. .SH "SEE ALSO" .PP \fBdrbd.conf\fR(5), \fBdrbd\fR(8), -\fBdrbddisk\fR(8)\fBdrbdsetup\fR(8)\fBdrbdmeta\fR(8)\m[blue]\fBDRBD project web site\fR\m[]\&\s-2\u[1]\d\s+2 +\fBdrbddisk\fR(8), +\fBdrbdsetup\fR(8), +\fBdrbdmeta\fR(8) +and the +\m[blue]\fBDRBD project web site\fR\m[]\&\s-2\u[1]\d\s+2 .SH "NOTES" .IP " 1." 
4 DRBD project web site diff -Nru drbd8-8.3.7/documentation/drbdadm.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdadm.xml --- drbd8-8.3.7/documentation/drbdadm.xml 2010-01-07 09:09:58.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdadm.xml 2012-02-02 14:09:14.000000000 +0000 @@ -1,454 +1,651 @@ - - + + - 5 Dec 2008 + 6 May 2011 + DRBD - 8.3.2 + + 8.4.0 + drbdadm + 8 + System Administration + drbdadm - Administration tool for DRBD - - drbdadm - - + + Administration tool for DRBD + drbdadm + + drbdadm + -d - -cfile - -tfile - -scmd - -mcmd + + -cfile + + -tfile + + -scmd + + -mcmd + -S - -hhost - - command - + + -hhost + + --backend-options + + command + - all - - resource - + all + + resource/volume> + Description - Drbdadm is the high level tool of the DRBD program suite. Drbdadm is to - drbdsetup and drbdmeta what ifup/ifdown is to ifconfig. Drbdadm reads its - configuration file and performs the specified commands by calling the - drbdsetup and/or the drbdmeta program. - + + is the high level tool of the DRBD program suite. + is to and what + / is to . + reads its configuration file and performs the specified commands by + calling the and/or the program. + + can operate on whole resources or on individual volumes in a + resource. The sub commands: , , + , , , + , , , + , , , + , , , + , , , + , work on whole resources and on + individual volumes. + + Resource level only commands are: , , + , , and + . + Options + , + - Just prints the calls of drbdsetup to stdout, but does not run - the commands. - + Just prints the calls of to stdout, but does not run + the commands. + - , file + , + file + - Specifies the configuration file drbdadm will use. If this parameter - is not specified, drbdadm will look for , - and . - + Specifies the configuration file drbdadm will use. If this parameter is not + specified, drbdadm will look for , + , and + . - , file + , + file + - Specifies an additional configuration file drbdadm to check. 
This option - is only allowed with the dump and the sh-nop commands. - + Specifies an additional configuration file drbdadm to check. This option is only + allowed with the dump and the sh-nop commands. - , file + , + file + - Specifies the full path to the drbdsetup program. If this option is - omitted, drbdadm will look for and . - + Specifies the full path to the program. If this option + is omitted, drbdadm will look for and + . + - , file + , + file + - Specifies the full path to the drbdmeta program. If this option is - omitted, drbdadm will look for and - . - + Specifies the full path to the program. If this option + is omitted, drbdadm will look for and + . + , + - Specifies that this command should be performed on a stacked resource. - + Specifies that this command should be performed on a stacked resource. + , + - Specifies to which peer node to connect. Only necessary if there are - more than two host sections in the resource you are working on. - + Specifies to which peer node to connect. Only necessary if there are more than two + host sections in the resource you are working on. + + + + + backend-options + + + All options following the doubly hyphen are considered + backend-options. These are passed through to the backend + command. I.e. to , or + . + Commands + attach + - Attaches a local backing block device to the DRBD resource's device. - + Attaches a local backing block device to the DRBD resource's device. + detach + - drbdadmdetach - Removes the backing storage device from a DRBD resource's device. - + + drbdadm + + detach + Removes the backing storage device from a DRBD resource's device. + connect + - drbdadmconnect - Sets up the network configuration of the resource's device. If the - peer device is already configured, the two DRBD devices will connect. - If there are more than two host sections in the resource you need - to use the option to select the peer you want to - connect to. 
- + + drbdadm + + connect + Sets up the network configuration of the resource's device. If the peer + device is already configured, the two DRBD devices will connect. If there are more than + two host sections in the resource you need to use the option to + select the peer you want to connect to. + disconnect + - drbdadmdisconnect - Removes the network configuration from the resource. The device - will then go into StandAlone state. - + + drbdadm + + disconnect + Removes the network configuration from the resource. The device will then + go into StandAlone state. + syncer + - drbdadmsyncer - Loads the resynchronization parameters into the device. - + + drbdadm + + syncer + Loads the resynchronization parameters into the device. + up + - drbdadmup - Is a shortcut for attach and connect. - + + drbdadm + + up + Is a shortcut for attach and connect. + down + - drbdadmdown - Is a shortcut for disconnect and detach. - + + drbdadm + + down + Is a shortcut for disconnect and detach. + primary + - drbdadmprimary - Promote the resource's device into primary role. You need to do - this before any access to the device, such as creating or mounting a file system. - + + drbdadm + + primary + Promote the resource's device into primary role. You need to do this + before any access to the device, such as creating or mounting a file system. + secondary + - drbdadmsecondary - Brings the device back into secondary role. This is needed since in - a connected DRBD device pair, only one of the two peers may have - primary role (except if allow-two-primaries is explicitly set in - the configuration file). - + + drbdadm + + secondary + Brings the device back into secondary role. This is needed since in a + connected DRBD device pair, only one of the two peers may have primary role (except if + is explicitly set in the configuration + file). + invalidate + - drbdadminvalidate - Forces DRBD to consider the data on the local backing - storage device as out-of-sync. 
Therefore DRBD will copy each - and every block over from its peer, to bring the local storage - device back in sync. - + + drbdadm + + invalidate + Forces DRBD to consider the data on the local + backing storage device as out-of-sync. Therefore DRBD will copy each and every block + from its peer, to bring the local storage device back in sync. + invalidate-remote + - drbdadminvalidate-remote - This command is similar to the invalidate command, however, the - peer's backing storage is invalidated and hence rewritten - with the data of the local node. - + + drbdadm + + invalidate-remote + This command is similar to the invalidate command, however, the + peer's backing storage is invalidated and hence rewritten with the + data of the local node. + resize + - drbdadmresize - Causes DRBD to re-examine all sizing constraints, and resize the - resource's device accordingly. For example in case you increased the - size of your backing storage devices (on both nodes of course), - then DRBD will adopt to the new size after you called - this command on one of your nodes. Since new storage space must be - synchronised this command only works if there is at least one primary - node present. - - The allows you to - resize a device which is currently not connected to the peer. - Use with care, since if you do not resize the peer's disk as well, - further connect attempts of the two will fail. - + + drbdadm + + resize + Causes DRBD to re-examine all sizing constraints, and resize the + resource's device accordingly. For example, if you increased the size of your backing + storage devices (on both nodes, of course), then DRBD will adopt the new size after you + called this command on one of your nodes. Since new storage space must be synchronised + this command only works if there is at least one primary node present. + + The allows you to resize a device which + is currently not connected to the peer. 
Use with care, since if you do not resize the + peer's disk as well, further connect attempts of the two will fail. + + + check-resize + + + + drbdadm + + check-resize + Calls drbdmeta to eventually move internal meta data. If the backing + device was resized, while DRBD was not running, meta data has to be moved to the end of + the device, so that the next command can succeed. + + + create-md + - drbdadmcreate-md - Initializes the meta data storage. This needs to be - done before a DRBD resource can be taken online for the first - time. - + + drbdadm + + create-md + Initializes the meta data storage. This needs to be done before a DRBD + resource can be taken online for the first time. In case of issues with that command + have a look at + drbdmeta + + 8 + + get-gi + - drbdadmget-gi - Shows a short textual representation of the data generation - identifiers. - + + drbdadm + + get-gi + Shows a short textual representation of the data generation + identifiers. + show-gi + - drbdadmshow-gi - Prints a textual representation of the data generation - identifiers including explanatory information. - + + drbdadm + + show-gi + Prints a textual representation of the data generation identifiers + including explanatory information. + dump-md + - drbdadmdump-md - Dumps the whole contents of the meta data storage, including - the stored bit-map and activity-log, in a textual representation. - + + drbdadm + + dump-md + Dumps the whole contents of the meta data storage, including the stored + bit-map and activity-log, in a textual representation. + outdate + - drbdadmoutdate - Sets the outdated flag in the meta data. - + + drbdadm + + outdate + Sets the outdated flag in the meta data. + adjust + - drbdadmadjust - Synchronizes the configuration of the device with your configuration - file. You should always examine the output of the dry-run - mode before actually executing this command. 
- + + drbdadm + + adjust + Synchronizes the configuration of the device with your configuration + file. You should always examine the output of the dry-run mode before actually executing + this command. + wait-connect + - drbdadmwait-connect - Waits until the device is connected to its peer device. - + + drbdadm + + wait-connect + Waits until the device is connected to its peer device. + role + - drbdadmrole - Shows the current roles of the devices (local/peer). - E.g. Primary/Secondary - + + drbdadm + + role + Shows the current roles of the devices (local/peer). E.g. + Primary/Secondary + state + - drbdadmstate - Deprecated alias for "role", see above. - + + drbdadm + + state + Deprecated alias for "role", see above. + cstate + - drbdadmcstate - Shows the current connection state of the devices. - - - - - status - - drbdadmstatus - Shows the current status of all devices defined in the current config file, - in xml-like format. Example output: -<drbd-status version="8.3.2" api="88"> -<resources config_file="/etc/drbd.conf"> -<resource minor="0" name="s0" cs="SyncTarget" st1="Secondary" st2="Secondary" - ds1="Inconsistent" ds2="UpToDate" resynced_precent ="5.9" /> -<resource minor="1" name="s1" cs="WFConnection" st1="Secondary" - st2="Unknown" ds1="Inconsistent" ds2="Outdated" /> -<resource minor="3" name="dummy" cs="Unconfigured" /> -<!-- resource minor="4" name="scratch" not available or not yet created --> -</resources> -</drbd-status> - + + drbdadm + + cstate + Shows the current connection state of the devices. + dump + - drbdadmdump - Just parse the configuration file and dump it to stdout. May - be used to check the configuration file for syntactical correctness. - + + drbdadm + + dump + Just parse the configuration file and dump it to stdout. May be used to + check the configuration file for syntactic correctness. + outdate + - drbdadmoutdate - Used to mark the node's data as outdated. Usually used by the - peer's fence-peer handler. 
- + + drbdadm + + outdate + Used to mark the node's data as outdated. Usually used by the peer's + fence-peer handler. + verify + - drbdadmverify - Starts online verify. During online verify, data on - both nodes is compared for inconsistency. See - /proc/drbd for online verify progress. If out-of-sync - blocks are found, they are not resynchronized - automatically. To do that, disconnect and - connect the resource when verification has completed. - + + drbdadm + + verify + Starts online verify. During online verify, data on both nodes is + compared for equality. See /proc/drbd for online + verify progress. If out-of-sync blocks are found, they are not + resynchronized automatically. To do that, disconnect + and connect the resource when verification has + completed. + See also the notes on data integrity on the drbd.conf manpage. + pause-sync + - drbdadmpause-sync - Temporarily suspend an ongoing resynchronization by setting the - local pause flag. Resync only progresses if neither the local - nor the remote pause flag is set. It might be desirable to - postpone DRBD's resynchronization until after any - resynchronization of the backing storage's RAID setup. - + + drbdadm + + pause-sync + Temporarily suspend an ongoing resynchronization by setting the local + pause flag. Resync only progresses if neither the local nor the remote pause flag is + set. It might be desirable to postpone DRBD's resynchronization until after any + resynchronization of the backing storage's RAID setup. + resume-sync + - drbdadmresume-sync - Unset the local sync pause flag. - + + drbdadm + + resume-sync + Unset the local sync pause flag. + new-current-uuid + - drbdadmnew-current-uuid - Generates a new currend UUID and rotates all other UUID values. - - This can be used to shorten the initial resync of a cluster. - See the drbdsetup manpage for a more details. - + + drbdadm + + new-current-uuid + Generates a new currend UUID and rotates all other UUID values. 
+ + This can be used to shorten the initial resync of a cluster. See the + manpage for a more details. + dstate + - drbdadmdstate - Show the current state of the backing storage devices. (local/peer) - + + drbdadm + + dstate + Show the current state of the backing storage devices. + (local/peer) + hidden-commands + - Shows all commands undocumented on purpose. - + Shows all commands undocumented on purpose. + Version - This document was revised for version 8.3.2 of the DRBD distribution. + + This document was revised for version 8.4.0 of the DRBD distribution. + Author - Written by Philipp Reisner philipp.reisner@linbit.com - and Lars Ellenberg lars.ellenberg@linbit.com - + + Written by Philipp Reisner philipp.reisner@linbit.com and Lars + Ellenberg lars.ellenberg@linbit.com + Reporting Bugs - Report bugs to drbd-user@lists.linbit.com. - + + Report bugs to drbd-user@lists.linbit.com. + Copyright - Copyright 2001-2008 LINBIT Information Technologies, -Philipp Reisner, Lars Ellenberg. This is free software; -see the source for copying conditions. There is NO warranty; -not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - + + Copyright 2001-2011 LINBIT Information Technologies, Philipp Reisner, Lars Ellenberg. + This is free software; see the source for copying conditions. There is NO warranty; not even + for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ See Also - drbd.conf5, - drbd8, - drbddisk8drbdsetup8drbdmeta8DRBD project web site + + + drbd.conf + + 5 + , + drbd + + 8 + , + drbddisk + + 8 + , + drbdsetup + + 8 + , + drbdmeta + + 8 + and the + DRBD project web site + diff -Nru drbd8-8.3.7/documentation/drbddisk.8 drbd8-8.4.1+git55a81dc~cmd1/documentation/drbddisk.8 --- drbd8-8.3.7/documentation/drbddisk.8 2010-01-13 16:17:26.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbddisk.8 2012-02-02 14:09:59.000000000 +0000 @@ -1,7 +1,7 @@ '\" t .\" Title: drbddisk .\" Author: [see the "Author" section] -.\" Generator: DocBook XSL Stylesheets v1.75.1 +.\" Generator: DocBook XSL Stylesheets v1.75.2 .\" Date: 15 Oct 2008 .\" Manual: System Administration .\" Source: DRBD 8.3.2 @@ -9,6 +9,15 @@ .\" .TH "DRBDDISK" "8" "15 Oct 2008" "DRBD 8.3.2" "System Administration" .\" ----------------------------------------------------------------- +.\" * Define some portability stuff +.\" ----------------------------------------------------------------- +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.\" http://bugs.debian.org/507673 +.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" ----------------------------------------------------------------- .\" * set default formatting .\" ----------------------------------------------------------------- .\" disable hyphenation diff -Nru drbd8-8.3.7/documentation/drbdmeta.8 drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdmeta.8 --- drbd8-8.3.7/documentation/drbdmeta.8 2010-01-13 16:17:25.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdmeta.8 2012-02-02 14:09:59.000000000 +0000 @@ -1,7 +1,7 @@ '\" t .\" Title: drbdmeta .\" Author: [see the "Author" section] -.\" Generator: DocBook XSL Stylesheets v1.75.1 +.\" Generator: DocBook XSL Stylesheets v1.75.2 .\" Date: 15 Oct 2008 .\" Manual: System Administration 
.\" Source: DRBD 8.3.2 @@ -9,6 +9,15 @@ .\" .TH "DRBDMETA" "8" "15 Oct 2008" "DRBD 8.3.2" "System Administration" .\" ----------------------------------------------------------------- +.\" * Define some portability stuff +.\" ----------------------------------------------------------------- +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.\" http://bugs.debian.org/507673 +.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' +.\" ----------------------------------------------------------------- .\" * set default formatting .\" ----------------------------------------------------------------- .\" disable hyphenation @@ -19,15 +28,29 @@ .\" * MAIN CONTENT STARTS HERE * .\" ----------------------------------------------------------------- .SH "NAME" -drbdmeta \- DRBD\'s meta data management tool .\" drbdmeta +drbdmeta \- DRBD\*(Aqs meta data management tool .\" drbdmeta .SH "SYNOPSIS" .HP \w'\fBdrbdmeta\fR\ 'u -\fBdrbdmeta\fR {\fIdevice\fR} [v06\ \fIminor\fR | v07\ \fImeta_dev\ index\fR | v08\ \fImeta_dev\ index\fR] {\fIcommand\fR} [\fIcmd\ args\fR...] +\fBdrbdmeta\fR [\-\-force] [\-\-ignore\-sanity\-checks] {\fIdevice\fR} {v06\ \fIminor\fR | v07\ \fImeta_dev\ index\fR | v08\ \fImeta_dev\ index\fR} {\fIcommand\fR} [\fIcmd\ args\fR...] .SH "DESCRIPTION" .PP -Drbdmeta is used to create, display and modify the contents of DRBD\'s meta data storage\&. Usually you do not want to use this command directly, but use it through the front end drbdadm\&. +Drbdmeta is used to create, display and modify the contents of DRBD\*(Aqs meta data storage\&. Usually you do not want to use this command directly, but start it via the frontend +\fBdrbdadm\fR(8)\&. +.PP +This command only works if the DRBD resource is currently down, or at least detached from its backing storage\&. The first parameter is the device node associated to the resource\&. 
With the second parameter you can select the version of the meta data\&. Currently all major DRBD releases (0\&.6, 0\&.7 and 8) are supported\&. +.SH "OPTIONS" +.PP +\-\-force +.RS 4 +.\" drbdmeta: --force +All questions that get asked by drbdmeta are treated as if the user answered \*(Aqyes\*(Aq\&. +.RE .PP -This command only works if the DRBD resource is currently down, or at least detached from its backing storage\&. The first parameter is the device node associated to the resource\&. With the second parameter you select which the version of the meta data\&. Currently all major DRBD releases (0\&.6, 0\&.7 and 8) are supported\&. +\-\-ignore\-sanity\-checks +.RS 4 +.\" drbdmeta: --ignore-sanity-checks +Some sanity checks cause drbdmeta to terminate\&. E\&.g\&. if a file system image would get destroyed by creating the meta data\&. By using that option you can force drbdmeta to ignore these checks\&. +.RE .SH "COMMANDS" .PP create\-md @@ -51,7 +74,7 @@ dump\-md .RS 4 .\" drbdmeta: dump-md -Dumps the whole contents of the meta data storage including the stored bit\-map and activity\-log, in a textual representation\&. +Dumps the whole contents of the meta data storage including the stored bit\-map and activity\-log in a textual representation\&. .RE .PP outdate @@ -63,11 +86,17 @@ dstate .RS 4 .\" drbdmeta: dstate -Prints the state of the data on the backing storage\&. The output is always followed by \'/DUnknown\' since drbdmeta only looks at the local meta data\&. +Prints the state of the data on the backing storage\&. The output is always followed by \*(Aq/DUnknown\*(Aq since drbdmeta only looks at the local meta data\&. +.RE +.PP +check\-resize +.RS 4 +.\" drbdmeta: check-resize +Examines the device size of a backing device, and it\*(Aqs last known device size, recorded in a file /var/lib/drbd/drbd\-minor\-??\&.lkbd\&. 
In case the size of the backing device changed, and the meta data can be found at the old position, it moves the meta data to the right position at the end of the block device\&. .RE .SH "EXPERT'S COMMANDS" .PP -Drbdmeta allows you to modify the meta data as well\&. This is intentionally omitted for the command\'s usage output, since you should only use it if you really know what you are doing\&. By setting the generation identifiers to wrong values, you risk to overwrite your up\-to\-data data with an older version of your data\&. +Drbdmeta allows you to modify the meta data as well\&. This is intentionally omitted for the command\*(Aqs usage output, since you should only use it if you really know what you are doing\&. By setting the generation identifiers to wrong values, you risk to overwrite your up\-to\-data data with an older version of your data\&. .PP set\-gi \fIgi\fR .RS 4 diff -Nru drbd8-8.3.7/documentation/drbdmeta.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdmeta.xml --- drbd8-8.3.7/documentation/drbdmeta.xml 2010-01-07 09:09:33.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdmeta.xml 2012-02-02 14:09:14.000000000 +0000 @@ -22,10 +22,12 @@ drbdmeta + --force + --ignore-sanity-checks device - + v06 minor v07 meta_dev index v08 meta_dev index @@ -42,16 +44,39 @@ Description Drbdmeta is used to create, display and modify the contents of DRBD's meta data storage. Usually you do not want to use this - command directly, but use it through the front end drbdadm. + command directly, but start it via the frontend + drbdadm8. This command only works if the DRBD resource is currently down, or at least detached from its backing storage. The first parameter is the device node associated to the resource. With the second - parameter you select which the version of the meta data. Currently + parameter you can select the version of the meta data. Currently all major DRBD releases (0.6, 0.7 and 8) are supported. 
+ Options + + + --force + + drbdmeta--force + All questions that get asked by drbdmeta are treated as if + the user answered 'yes'. + + + + --ignore-sanity-checks + + drbdmeta--ignore-sanity-checks + Some sanity checks cause drbdmeta to terminate. E.g. if a file system image would get + destroyed by creating the meta data. By using that option you can force drbdmeta + to ignore these checks. + + + + + Commands @@ -90,7 +115,7 @@ drbdmetadump-md Dumps the whole contents of the meta data storage including - the stored bit-map and activity-log, in a textual representation. + the stored bit-map and activity-log in a textual representation. @@ -114,6 +139,19 @@ + + + check-resize + + drbdmetacheck-resize + Examines the device size of a backing device, and it's last known device size, + recorded in a file /var/lib/drbd/drbd-minor-??.lkbd. In case the size of the + backing device changed, and the meta data can be found at the old position, + it moves the meta data to the right position at the end of the block device. 
+ + + + diff -Nru drbd8-8.3.7/documentation/drbdsetup.8 drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup.8 --- drbd8-8.3.7/documentation/drbdsetup.8 2010-01-13 16:17:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup.8 2012-09-03 22:37:27.000000000 +0000 @@ -1,13 +1,22 @@ '\" t .\" Title: drbdsetup .\" Author: [see the "Author" section] -.\" Generator: DocBook XSL Stylesheets v1.75.1 -.\" Date: 5 Dec 2008 +.\" Generator: DocBook XSL Stylesheets v1.76.1 +.\" Date: 6 May 2011 .\" Manual: System Administration -.\" Source: DRBD 8.3.2 +.\" Source: DRBD 8.4.0 .\" Language: English .\" -.TH "DRBDSETUP" "8" "5 Dec 2008" "DRBD 8.3.2" "System Administration" +.TH "DRBDSETUP" "8" "6 May 2011" "DRBD 8.4.0" "System Administration" +.\" ----------------------------------------------------------------- +.\" * Define some portability stuff +.\" ----------------------------------------------------------------- +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.\" http://bugs.debian.org/507673 +.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html +.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.ie \n(.g .ds Aq \(aq +.el .ds Aq ' .\" ----------------------------------------------------------------- .\" * set default formatting .\" ----------------------------------------------------------------- @@ -22,61 +31,73 @@ drbdsetup \- Setup tool for DRBD .\" drbdsetup .SH "SYNOPSIS" .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {disk} {\fIlower_dev\fR} {\fImeta_data_dev\fR} {\fImeta_data_index\fR} [\-d\ {\fIsize\fR}] [\-e\ {\fIerr_handler\fR}] [\-f\ {\fIfencing_policy\fR}] [\-b] +\fBdrbdsetup\fR new\-resource \fIresource\fR [\-\-cpu\-mask\ {\fIval\fR}] [\-\-on\-no\-data\-accessible\ {io\-error\ |\ suspend\-io}] +.HP \w'\fBdrbdsetup\fR\ 'u +\fBdrbdsetup\fR new\-minor \fIresource\fR \fIminor\fR \fIvolume\fR +.HP \w'\fBdrbdsetup\fR\ 'u +\fBdrbdsetup\fR del\-resource \fIresource\fR +.HP 
\w'\fBdrbdsetup\fR\ 'u +\fBdrbdsetup\fR del\-minor \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {net} [\fIaf:\fR] {\fIlocal_addr\fR} [\fI:port\fR] [\fIaf:\fR] {\fIremote_addr\fR} [\fI:port\fR] {\fIprotocol\fR} [\-c\ {\fItime\fR}] [\-i\ {\fItime\fR}] [\-t\ {\fIval\fR}] [\-S\ {\fIsize\fR}] [\-r\ {\fIsize\fR}] [\-k\ {\fIcount\fR}] [\-e\ {\fImax_epoch_size\fR}] [\-b\ {\fImax_buffers\fR}] [\-m] [\-a\ {\fIhash_alg\fR}] [\-x\ {\fIshared_secret\fR}] [\-A\ {\fIasb\-0p\-policy\fR}] [\-B\ {\fIasb\-1p\-policy\fR}] [\-C\ {\fIasb\-2p\-policy\fR}] [\-D] [\-R\ {\fIrole\-resync\-conflict\-policy\fR}] [\-p\ {\fIping_timeout\fR}] [\-u\ {\fIval\fR}] [\-d\ {\fIhash_alg\fR}] [\-o] +\fBdrbdsetup\fR attach \fIminor\fR \fIlower_dev\fR \fImeta_data_dev\fR \fImeta_data_index\fR [\-\-size\ {\fIval\fR}] [\-\-max\-bio\-bvecs\ {\fIval\fR}] [\-\-on\-io\-error\ {pass_on\ |\ call\-local\-io\-error\ |\ detach}] [\-\-fencing\ {dont\-care\ |\ resource\-only\ |\ resource\-and\-stonith}] [\-\-disk\-barrier] [\-\-disk\-flushes] [\-\-disk\-drain] [\-\-md\-flushes] [\-\-resync\-rate\ {\fIval\fR}] [\-\-resync\-after\ {\fIval\fR}] [\-\-al\-extents\ {\fIval\fR}] [\-\-c\-plan\-ahead\ {\fIval\fR}] [\-\-c\-delay\-target\ {\fIval\fR}] [\-\-c\-fill\-target\ {\fIval\fR}] [\-\-c\-max\-rate\ {\fIval\fR}] [\-\-c\-min\-rate\ {\fIval\fR}] [\-\-disk\-timeout\ {\fIval\fR}] [\-\-read\-balancing\ {prefer\-local\ |\ prefer\-remote\ |\ round\-robin\ |\ least\-pending\ |\ when\-congested\-remote\ |\ 32K\-striping\ |\ 64K\-striping\ |\ 128K\-striping\ |\ 256K\-striping\ |\ 512K\-striping\ |\ 1M\-striping}] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {syncer} [\-a\ {\fIdev_minor\fR}] [\-r\ {\fIrate\fR}] [\-e\ {\fIextents\fR}] [\-v\ {\fIverify\-hash\-alg\fR}] [\-c\ {\fIcpu\-mask\fR}] [\-C\ {\fIcsums\-hash\-alg\fR}] [\-R\ {\fIuse\-rle\fR}] +\fBdrbdsetup\fR connect \fIresource\fR \fIlocal_addr\fR \fIremote_addr\fR [\-\-tentative] [\-\-discard\-my\-data] [\-\-protocol\ {A\ |\ B\ |\ C}] 
[\-\-timeout\ {\fIval\fR}] [\-\-max\-epoch\-size\ {\fIval\fR}] [\-\-max\-buffers\ {\fIval\fR}] [\-\-unplug\-watermark\ {\fIval\fR}] [\-\-connect\-int\ {\fIval\fR}] [\-\-ping\-int\ {\fIval\fR}] [\-\-sndbuf\-size\ {\fIval\fR}] [\-\-rcvbuf\-size\ {\fIval\fR}] [\-\-ko\-count\ {\fIval\fR}] [\-\-allow\-two\-primaries] [\-\-cram\-hmac\-alg\ {\fIval\fR}] [\-\-shared\-secret\ {\fIval\fR}] [\-\-after\-sb\-0pri\ {disconnect\ |\ discard\-younger\-primary\ |\ discard\-older\-primary\ |\ discard\-zero\-changes\ |\ discard\-least\-changes\ |\ discard\-local\ |\ discard\-remote}] [\-\-after\-sb\-1pri\ {disconnect\ |\ consensus\ |\ discard\-secondary\ |\ call\-pri\-lost\-after\-sb\ |\ violently\-as0p}] [\-\-after\-sb\-2pri\ {disconnect\ |\ call\-pri\-lost\-after\-sb\ |\ violently\-as0p}] [\-\-always\-asbp] [\-\-rr\-conflict\ {disconnect\ |\ call\-pri\-lost\ |\ violently}] [\-\-ping\-timeout\ {\fIval\fR}] [\-\-data\-integrity\-alg\ {\fIval\fR}] [\-\-tcp\-cork] [\-\-on\-congestion\ {block\ |\ pull\-ahead\ |\ disconnect}] [\-\-congestion\-fill\ {\fIval\fR}] [\-\-congestion\-extents\ {\fIval\fR}] [\-\-csums\-alg\ {\fIval\fR}] [\-\-verify\-alg\ {\fIval\fR}] [\-\-use\-rle] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {disconnect} +\fBdrbdsetup\fR disk\-options \fIminor\fR [\-\-on\-io\-error\ {pass_on\ |\ call\-local\-io\-error\ |\ detach}] [\-\-fencing\ {dont\-care\ |\ resource\-only\ |\ resource\-and\-stonith}] [\-\-disk\-barrier] [\-\-disk\-flushes] [\-\-disk\-drain] [\-\-md\-flushes] [\-\-resync\-rate\ {\fIval\fR}] [\-\-resync\-after\ {\fIval\fR}] [\-\-al\-extents\ {\fIval\fR}] [\-\-c\-plan\-ahead\ {\fIval\fR}] [\-\-c\-delay\-target\ {\fIval\fR}] [\-\-c\-fill\-target\ {\fIval\fR}] [\-\-c\-max\-rate\ {\fIval\fR}] [\-\-c\-min\-rate\ {\fIval\fR}] [\-\-disk\-timeout\ {\fIval\fR}] [\-\-read\-balancing\ {prefer\-local\ |\ prefer\-remote\ |\ round\-robin\ |\ least\-pending\ |\ when\-congested\-remote\ |\ 32K\-striping\ |\ 64K\-striping\ |\ 128K\-striping\ |\ 256K\-striping\ |\ 
512K\-striping\ |\ 1M\-striping}] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {detach} +\fBdrbdsetup\fR net\-options \fIlocal_addr\fR \fIremote_addr\fR [\-\-protocol\ {A\ |\ B\ |\ C}] [\-\-timeout\ {\fIval\fR}] [\-\-max\-epoch\-size\ {\fIval\fR}] [\-\-max\-buffers\ {\fIval\fR}] [\-\-unplug\-watermark\ {\fIval\fR}] [\-\-connect\-int\ {\fIval\fR}] [\-\-ping\-int\ {\fIval\fR}] [\-\-sndbuf\-size\ {\fIval\fR}] [\-\-rcvbuf\-size\ {\fIval\fR}] [\-\-ko\-count\ {\fIval\fR}] [\-\-allow\-two\-primaries] [\-\-cram\-hmac\-alg\ {\fIval\fR}] [\-\-shared\-secret\ {\fIval\fR}] [\-\-after\-sb\-0pri\ {disconnect\ |\ discard\-younger\-primary\ |\ discard\-older\-primary\ |\ discard\-zero\-changes\ |\ discard\-least\-changes\ |\ discard\-local\ |\ discard\-remote}] [\-\-after\-sb\-1pri\ {disconnect\ |\ consensus\ |\ discard\-secondary\ |\ call\-pri\-lost\-after\-sb\ |\ violently\-as0p}] [\-\-after\-sb\-2pri\ {disconnect\ |\ call\-pri\-lost\-after\-sb\ |\ violently\-as0p}] [\-\-always\-asbp] [\-\-rr\-conflict\ {disconnect\ |\ call\-pri\-lost\ |\ violently}] [\-\-ping\-timeout\ {\fIval\fR}] [\-\-data\-integrity\-alg\ {\fIval\fR}] [\-\-tcp\-cork] [\-\-on\-congestion\ {block\ |\ pull\-ahead\ |\ disconnect}] [\-\-congestion\-fill\ {\fIval\fR}] [\-\-congestion\-extents\ {\fIval\fR}] [\-\-csums\-alg\ {\fIval\fR}] [\-\-verify\-alg\ {\fIval\fR}] [\-\-use\-rle] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {down} +\fBdrbdsetup\fR resource\-options \fIresource\fR [\-\-cpu\-mask\ {\fIval\fR}] [\-\-on\-no\-data\-accessible\ {io\-error\ |\ suspend\-io}] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {primary} [\-o] +\fBdrbdsetup\fR disconnect \fIlocal_addr\fR \fIremote_addr\fR [\-\-force] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {secondary} +\fBdrbdsetup\fR detach \fIminor\fR [\-\-force] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {verify} [\-s\ {\fIstart\-position\fR}] +\fBdrbdsetup\fR primary \fIminor\fR [\-\-force] .HP 
\w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {invalidate} +\fBdrbdsetup\fR secondary \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {invalidate\-remote} +\fBdrbdsetup\fR down \fIresource\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {wait\-connect} [\-t\ {\fIwfc_timeout\fR}] [\-d\ {\fIdegr_wfc_timeout\fR}] [\-o\ {\fIoutdated_wfc_timeout\fR}] [\-w] +\fBdrbdsetup\fR verify \fIminor\fR [\-\-start\ {\fIval\fR}] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {wait\-sync} [\-t\ {\fIwfc_timeout\fR}] [\-d\ {\fIdegr_wfc_timeout\fR}] [\-o\ {\fIoutdated_wfc_timeout\fR}] [\-w] +\fBdrbdsetup\fR invalidate \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {role} +\fBdrbdsetup\fR invalidate\-remote \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {cstate} +\fBdrbdsetup\fR wait\-connect \fIminor\fR [\-\-wfc\-timeout\ {\fIval\fR}] [\-\-degr\-wfc\-timeout\ {\fIval\fR}] [\-\-outdated\-wfc\-timeout\ {\fIval\fR}] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {dstate} +\fBdrbdsetup\fR wait\-sync \fIminor\fR [\-\-wfc\-timeout\ {\fIval\fR}] [\-\-degr\-wfc\-timeout\ {\fIval\fR}] [\-\-outdated\-wfc\-timeout\ {\fIval\fR}] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {status} +\fBdrbdsetup\fR role \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {resize} [\-d\ {\fIsize\fR}] [\-f\ {\fIassume\-peer\-has\-space\fR}] +\fBdrbdsetup\fR cstate \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {pause\-sync} +\fBdrbdsetup\fR dstate \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {resume\-sync} +\fBdrbdsetup\fR resize \fIminor\fR [\-\-size\ {\fIval\fR}] [\-\-assume\-peer\-has\-space] [\-\-assume\-clean] .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {outdate} +\fBdrbdsetup\fR check\-resize \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {show\-gi} +\fBdrbdsetup\fR 
pause\-sync \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {get\-gi} +\fBdrbdsetup\fR resume\-sync \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {show} +\fBdrbdsetup\fR outdate \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {suspend\-io} +\fBdrbdsetup\fR show\-gi \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {resume\-io} +\fBdrbdsetup\fR get\-gi \fIminor\fR .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {events} [\-u] [\-a] +\fBdrbdsetup\fR show {\fIresource\fR | \fIminor\fR | \fIall\fR} .HP \w'\fBdrbdsetup\fR\ 'u -\fBdrbdsetup\fR {\fIdevice\fR} {new\-current\-uuid} [\-c] +\fBdrbdsetup\fR suspend\-io \fIminor\fR +.HP \w'\fBdrbdsetup\fR\ 'u +\fBdrbdsetup\fR resume\-io \fIminor\fR +.HP \w'\fBdrbdsetup\fR\ 'u +\fBdrbdsetup\fR events {\fIresource\fR | \fIminor\fR | \fIall\fR} +.HP \w'\fBdrbdsetup\fR\ 'u +\fBdrbdsetup\fR new\-current\-uuid \fIminor\fR [\-\-clear\-bitmap] .SH "DESCRIPTION" .PP drbdsetup is used to associate DRBD devices with their backing block devices, to set up DRBD device pairs to mirror their backing block devices, and to inspect the configuration of running DRBD devices\&. @@ -94,17 +115,29 @@ .RS 4 In case the specified DRBD device (minor number) does not exist yet, create it implicitly\&. .RE +.SS "new\-resource" .PP -\fB\-\-set\-defaults\fR -.RS 4 -When -\fB\-\-set\-defaults\fR -is given on the command line, all options of the invoked sub\-command that are not explicitly set are reset to their default values\&. -.RE -.SS "disk" +Resources are the primary objects of any DRBD configuration\&. A resource must be created with the +\fBnew\-resource\fR +command before any volumes or minor devices can be created\&. Connections are referenced by name\&. +.SS "new\-minor" +.PP +A +\fIminor\fR +is used as a synonym for replicated block device\&. It is represented in the /dev/ directory by a block device\&. 
It is the application\*(Aqs interface to the DRBD\-replicated block devices\&. These block devices get addressed by their minor numbers on the drbdsetup commandline\&. +.PP +A pair of replicated block devices may have different minor numbers on the two machines\&. They are associated by a common +\fIvolume\-number\fR\&. Volume numbers are local to each connection\&. Minor numbers are global on one node\&. +.SS "del\-resource" +.PP +Destroys a resource object\&. This is only possible if the resource has no volumes\&. +.SS "del\-minor" +.PP +Minors can only be destroyed if its disk is detached\&. +.SS "attach, disk\-options" .\" drbdsetup: disk .PP -Associates +Attach associates \fIdevice\fR with \fIlower_device\fR @@ -118,9 +151,11 @@ \fBnet\fR command\&.) .PP -\fB\-d\fR, \fB\-\-disk\-size \fR\fB\fIsize\fR\fR +With the disk\-options command it is possible to change the options of a minor while it is attached\&. +.PP +\fB\-\-disk\-size \fR\fB\fIsize\fR\fR .RS 4 -You can override DRBD\'s size determination method with this option\&. If you need to use the device before it was ever connected to its peer, use this option to pass the +You can override DRBD\*(Aqs size determination method with this option\&. If you need to use the device before it was ever connected to its peer, use this option to pass the \fIsize\fR of the DRBD device to the driver\&. Default unit is sectors (1s = 512 bytes)\&. .sp @@ -129,11 +164,11 @@ parameter in drbd\&.conf, we strongly recommend to add an explicit unit postfix\&. drbdadm and drbdsetup used to have mismatching default units\&. .RE .PP -\fB\-e\fR, \fB\-\-on\-io\-error \fR\fB\fIerr_handler\fR\fR +\fB\-\-on\-io\-error \fR\fB\fIerr_handler\fR\fR .RS 4 If the driver of the \fIlower_device\fR -reports an error to DRBD, DRBD will either pass the error to the upper layers of the operating system, call a helper program, or detach the device from its backing storage and perform all further IO by requesting it from the peer\&. 
The valid +reports an error to DRBD, DRBD will mark the disk as inconsistent, call a helper program, or detach the device from its backing storage and perform all further IO by requesting it from the peer\&. The valid \fIerr_handlers\fR are: \fBpass_on\fR, @@ -142,96 +177,196 @@ \fBdetach\fR\&. .RE .PP -\fB\-f\fR, \fB\-\-fencing \fR\fB\fIfencing_policy\fR\fR +\fB\-\-fencing \fR\fB\fIfencing_policy\fR\fR .RS 4 Under \fBfencing\fR -we understand preventative measures to avoid situations where both nodes are primary and disconnected (AKA split brain)\&. +we understand preventive measures to avoid situations where both nodes are primary and disconnected (AKA split brain)\&. .sp Valid fencing policies are: .PP \fBdont\-care\fR .RS 4 -This is the default policy\&. No fencing actions are undertaken\&. +This is the default policy\&. No fencing actions are done\&. .RE .PP \fBresource\-only\fR .RS 4 -If a node becomes a disconnected primary\&. it tries to outdate the peer\'s disk\&. This is done by calling the fence\-peer handler\&. The handler is supposed to reach the other node over alternative communication paths and call \'drbdadm outdate res\' there\&. +If a node becomes a disconnected primary, it tries to outdate the peer\*(Aqs disk\&. This is done by calling the fence\-peer handler\&. The handler is supposed to reach the other node over alternative communication paths and call \*(Aqdrbdadm outdate res\*(Aq there\&. .RE .PP \fBresource\-and\-stonith\fR .RS 4 -If a node becomes a disconnected primary, it freezes all its IO operations and calls its fence\-peer handler\&. The fence\-peer handler is supposed to reach the peer over alternative communication paths and call \'drbdadm outdate res\' there\&. In case it cannot reach the peer, it should stonith the peer\&. IO is resumed as soon as the situation is resolved\&. 
In case your handler fails, you can resume IO with the +If a node becomes a disconnected primary, it freezes all its IO operations and calls its fence\-peer handler\&. The fence\-peer handler is supposed to reach the peer over alternative communication paths and call \*(Aqdrbdadm outdate res\*(Aq there\&. In case it cannot reach the peer, it should stonith the peer\&. IO is resumed as soon as the situation is resolved\&. In case your handler fails, you can resume IO with the \fBresume\-io\fR command\&. .RE .RE .PP -\fB\-b\fR, \fB\-\-use\-bmbv\fR +\fB\-\-disk\-barrier\fR, \fB\-\-disk\-flushes\fR, \fB\-\-disk\-drain\fR .RS 4 -In case the backing storage\'s driver has a merge_bvec_fn() function, DRBD has to pretend that it can only process IO requests in units not lager than 4kByte\&. (At time of writing the only known drivers which have such a function are: md (software raid driver), dm (device mapper \- LVM) and DRBD itself) +DRBD has four implementations to express write\-after\-write dependencies to its backing storage device\&. DRBD will use the first method that is supported by the backing storage device and that is not disabled by the user\&. By default all three options are enabled\&. .sp -To get best performance out of DRBD on top of software raid (or any other driver with a merge_bvec_fn() function) you might enable this function, if you know for sure that the merge_bvec_fn() function will deliver the same results on all nodes of your cluster\&. I\&.e\&. the physical disks of the software raid are exactly of the same type\&. USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING\&. -.RE -.PP -\fB\-a\fR, \fB\-\-no\-disk\-barrier\fR, \fB\-i\fR, \fB\-\-no\-disk\-flushes\fR, \fB\-D\fR, \fB\-\-no\-disk\-drain\fR -.RS 4 -DRBD has four implementations to express write\-after\-write dependencies to its backing storage device\&. DRBD will use the first method that is supported by the backing storage device and that is not disabled by the user\&. 
+When selecting the method you should not only base your decision on the measurable performance\&. In case your backing storage device has a volatile write cache (plain disks, RAID of plain disks) you should use one of the first two\&. In case your backing storage device has battery\-backed write cache you may go with option 3\&. Option 4 (disable everything, use "none") +\fIis dangerous\fR +on most IO stacks, may result in write\-reordering, and if so, can theoretically be the reason for data corruption, or disturb the DRBD protocol, causing spurious disconnect/reconnect cycles\&. +\fIDo not use\fR +\fBno\-disk\-drain\fR\&. .sp -When selecting the method you should not only base your decision on the measurable performance\&. In case your backing storage device has a volatile write cache (plain disks, RAID of plain disks) you should use one of the first two\&. In case your backing storage device has battery\-backed write cache you may go with option 3 or 4\&. Option 4 will deliver the best performance such devices\&. -.sp -Unfortunately device mapper (LVM) does not support barriers\&. +Unfortunately device mapper (LVM) might not support barriers\&. .sp The letter after "wo:" in /proc/drbd indicates with method is currently in use for a device: b, f, d, n\&. The implementations: .PP barrier .RS 4 -The first requirs that the driver of the backing storage device support barriers (called \'tagged command queuing\' in SCSI and \'native command queuing\' in SATA speak)\&. The use of this method can be disabled by the we -\fB\-\-no\-disk\-barrier\fR -option\&. +The first requires that the driver of the backing storage device support barriers (called \*(Aqtagged command queuing\*(Aq in SCSI and \*(Aqnative command queuing\*(Aq in SATA speak)\&. The use of this method can be disabled by setting the +\fBdisk\-barrier\fR +options to +\fBno\fR\&. 
.RE .PP flush .RS 4 -The second requires that the backing device support disk flushes (called \'force unit access\' in the drive vendors speak)\&. The use of this method can be disabled using the -\fB\-\-no\-disk\-flushes\fR -option\&. +The second requires that the backing device support disk flushes (called \*(Aqforce unit access\*(Aq in the drive vendors speak)\&. The use of this method can be disabled setting +\fBdisk\-flushes\fR +to +\fBno\fR\&. .RE .PP drain .RS 4 -The third method is simply to let write requests drain before write requests of a new reordering domain are issued\&. That was the only implementation before 8\&.0\&.9\&. You can prevent to use of this method by using the -\fB\-\-no\-disk\-drain\fR -option\&. +The third method is simply to let write requests drain before write requests of a new reordering domain are issued\&. That was the only implementation before 8\&.0\&.9\&. .RE .PP none .RS 4 -The fourth method is to not express write\-after\-write dependencies to the backing store at all\&. +The fourth method is to not express write\-after\-write dependencies to the backing store at all, by also specifying +\fB\-\-no\-disk\-drain\fR\&. This +\fIis dangerous\fR +on most IO stacks, may result in write\-reordering, and if so, can theoretically be the reason for data corruption, or disturb the DRBD protocol, causing spurious disconnect/reconnect cycles\&. +\fIDo not use\fR +\fB\-\-no\-disk\-drain\fR\&. .RE .RE .PP -\fB\-m\fR, \fB\-\-no\-md\-flushes\fR +\fB\-\-md\-flushes\fR .RS 4 Disables the use of disk flushes and barrier BIOs when accessing the meta data device\&. See the notes on -\fB\-\-no\-disk\-flushes\fR\&. +\fB\-\-disk\-flushes\fR\&. .RE .PP -\fB\-s\fR, \fB\-\-max\-bio\-bvecs\fR +\fB\-\-max\-bio\-bvecs\fR .RS 4 -In some special circumstances the device mapper stack manages to pass BIOs to DRBD that violate the constraints that are set forth by DRBD\'s merge_bvec() function and which have more than one bvec\&. 
A known example is: phys\-disk \-> DRBD \-> LVM \-> Xen \-> missaligned partition (63) \-> DomU FS\&. Then you might see "bio would need to, but cannot, be split:" in the Dom0\'s kernel log\&. +In some special circumstances the device mapper stack manages to pass BIOs to DRBD that violate the constraints that are set forth by DRBD\*(Aqs merge_bvec() function and which have more than one bvec\&. A known example is: phys\-disk \-> DRBD \-> LVM \-> Xen \-> missaligned partition (63) \-> DomU FS\&. Then you might see "bio would need to, but cannot, be split:" in the Dom0\*(Aqs kernel log\&. .sp -The best workaround is to proper align the partition within the VM (E\&.g\&. start it at sector 1024)\&. Costs 480 KiByte of storage\&. Unfortunately the default of most Linux partitioning tools is to start the first partition at an odd number (63)\&. Therefore most distribution\'s install helpers for virtual linux machines will end up with missaligned partitions\&. The second best workaround is to limit DRBD\'s max bvecs per BIO (= max\-bio\-bvecs) to 1\&. Might cost performance\&. +The best workaround is to proper align the partition within the VM (E\&.g\&. start it at sector 1024)\&. That costs 480 KiB of storage\&. Unfortunately the default of most Linux partitioning tools is to start the first partition at an odd number (63)\&. Therefore most distributions install helpers for virtual linux machines will end up with missaligned partitions\&. The second best workaround is to limit DRBD\*(Aqs max bvecs per BIO (i\&.e\&., the +\fBmax\-bio\-bvecs\fR +option) to 1, but that might cost performance\&. .sp The default value of \fBmax\-bio\-bvecs\fR is 0, which means that there is no user imposed limitation\&. .RE -.SS "net" +.PP +\fB\-\-resync\-rate \fR\fB\fIrate\fR\fR +.RS 4 +To ensure smooth operation of the application on top of DRBD, it is possible to limit the bandwidth that may be used by background synchronization\&. The default is 250 KiB/sec, the default unit is KiB/sec\&. 
+.RE +.PP +\fB\-\-resync\-after \fR\fB\fIminor\fR\fR +.RS 4 +Start resync on this device only if the device with +\fIminor\fR +is already in connected state\&. Otherwise this device waits in SyncPause state\&. +.RE +.PP +\fB\-\-al\-extents \fR\fB\fIextents\fR\fR +.RS 4 +DRBD automatically performs hot area detection\&. With this parameter you control how big the hot area (=active set) can get\&. Each extent marks 4M of the backing storage\&. In case a primary node leaves the cluster unexpectedly, the areas covered by the active set must be resynced upon rejoining of the failed node\&. The data structure is stored in the meta\-data area, therefore each change of the active set is a write operation to the meta\-data device\&. A higher number of extents gives longer resync times but less updates to the meta\-data\&. The default number of +\fIextents\fR +is 127\&. (Minimum: 7, Maximum: 3843) +.RE +.PP +\fB\-\-c\-plan\-ahead \fR\fB\fIplan_time\fR\fR, \fB\-\-c\-fill\-target \fR\fB\fIfill_target\fR\fR, \fB\-\-c\-delay\-target \fR\fB\fIdelay_target\fR\fR, \fB\-\-c\-max\-rate \fR\fB\fImax_rate\fR\fR +.RS 4 +The dynamic resync speed controller gets enabled with setting +\fIplan_time\fR +to a positive value\&. It aims to fill the buffers along the data path with either a constant amount of data +\fIfill_target\fR, or aims to have a constant delay time of +\fIdelay_target\fR +along the path\&. The controller has an upper bound of +\fImax_rate\fR\&. +.sp +By +\fIplan_time\fR +the agility of the controller is configured\&. Higher values yield for slower/lower responses of the controller to deviation from the target value\&. It should be at least 5 times RTT\&. For regular data paths a +\fIfill_target\fR +in the area of 4k to 100k is appropriate\&. For a setup that contains drbd\-proxy it is advisable to use +\fIdelay_target\fR +instead\&. Only when +\fIfill_target\fR +is set to 0 the controller will use +\fIdelay_target\fR\&. 5 times RTT is a reasonable starting value\&. 
+\fIMax_rate\fR +should be set to the bandwidth available between the DRBD\-hosts and the machines hosting DRBD\-proxy, or to the available disk\-bandwidth\&. +.sp +The default value of +\fIplan_time\fR +is 0, the default unit is 0\&.1 seconds\&. +\fIFill_target\fR +has 0 and sectors as default unit\&. +\fIDelay_target\fR +has 1 (100ms) and 0\&.1 as default unit\&. +\fIMax_rate\fR +has 10240 (100MiB/s) and KiB/s as default unit\&. +.RE +.PP +\fB\-\-c\-min\-rate \fR\fB\fImin_rate\fR\fR +.RS 4 +We track the disk IO rate caused by the resync, so we can detect non\-resync IO on the lower level device\&. If the lower level device seems to be busy, and the current resync rate is above +\fImin_rate\fR, we throttle the resync\&. +.sp +The default value of +\fImin_rate\fR +is 4M, the default unit is k\&. If you want to not throttle at all, set it to zero, if you want to throttle always, set it to one\&. +.RE +.PP +\fB\-t\fR, \fB\-\-disk\-timeout \fR\fB\fIdisk_timeout\fR\fR +.RS 4 +If the driver of the +\fIlower_device\fR +does not finish an IO request within +\fIdisk_timeout\fR, DRBD considers the disk as failed\&. If DRBD is connected to a remote host, it will reissue local pending IO requests to the peer, and ship all new IO requests to the peer only\&. The disk state advances to diskless, as soon as the backing block device has finished all IO requests\&. +.sp +The default value is 0, which means that no timeout is enforced\&. The default unit is 100ms\&. This option is available since 8\&.3\&.12\&. +.RE +.PP +\fB\-\-read\-balancing \fR\fB\fImethod\fR\fR +.RS 4 +The supported +\fImethods\fR +for load balancing of read requests are +\fBprefer\-local\fR, +\fBprefer\-remote\fR, +\fBround\-robin\fR, +\fBleast\-pending\fR +and +\fBwhen\-congested\-remote\fR, +\fB32K\-striping\fR, +\fB64K\-striping\fR, +\fB128K\-striping\fR, +\fB256K\-striping\fR, +\fB512K\-striping\fR +and +\fB1M\-striping\fR\&. +.sp +The default value is +\fBprefer\-local\fR\&. 
This option is available since 8\&.4\&.1\&. +.RE +.SS "connect, net\-options" .\" drbdsetup: net .PP -Sets up the +Connect sets up the \fIdevice\fR to listen on \fIaf:local_addr:port\fR @@ -245,41 +380,45 @@ gets used\&. Other supported address families are \fBipv6\fR, \fBssocks\fR -for Dolphin Interconnect Solutions\' "super sockets" and +for Dolphin Interconnect Solutions\*(Aq "super sockets" and \fBsdp\fR for Sockets Direct Protocol (Infiniband)\&. .PP +The net\-options command allows you to change options while the connection is established\&. +.PP +\fB\-\-protocol \fR\fB\fIprotocol\fR\fR +.RS 4 On the TCP/IP link the specified \fIprotocol\fR is used\&. Valid protocol specifiers are A, B, and C\&. -.PP +.sp Protocol A: write IO is reported as completed, if it has reached local disk and local TCP send buffer\&. -.PP +.sp Protocol B: write IO is reported as completed, if it has reached local disk and remote buffer cache\&. -.PP +.sp Protocol C: write IO is reported as completed, if it has reached both local and remote disk\&. +.RE .PP -\fB\-c\fR, \fB\-\-connect\-int \fR\fB\fItime\fR\fR +\fB\-\-connect\-int \fR\fB\fItime\fR\fR .RS 4 -In case it is not possible to connect to the remote DRBD device immediately, DRBD keeps on trying to connect\&. With this option you can set the time between two tries\&. The default value is 10 seconds, the unit is 1 second\&. +In case it is not possible to connect to the remote DRBD device immediately, DRBD keeps on trying to connect\&. With this option you can set the time between two retries\&. The default value is 10\&. The unit is seconds\&. .RE .PP -\fB\-i\fR, \fB\-\-ping\-int \fR\fB\fItime\fR\fR +\fB\-\-ping\-int \fR\fB\fItime\fR\fR .RS 4 If the TCP/IP connection linking a DRBD device pair is idle for more than \fItime\fR -seconds, DRBD will generate a keep\-alive packet to check if its partner is still alive\&. The default value is 10 seconds, the unit is 1 second\&. 
+seconds, DRBD will generate a keep\-alive packet to check if its partner is still alive\&. The default value is 10\&. The unit is seconds\&. .RE .PP -\fB\-t\fR, \fB\-\-timeout \fR\fB\fIval\fR\fR +\fB\-\-timeout \fR\fB\fIval\fR\fR .RS 4 If the partner node fails to send an expected response packet within \fIval\fR -10ths -of a second, the partner node is considered dead and therefore the TCP/IP connection is abandoned\&. The default value is 60 (= 6 seconds)\&. +tenths of a second, the partner node is considered dead and therefore the TCP/IP connection is abandoned\&. The default value is 60 (= 6 seconds)\&. .RE .PP -\fB\-S\fR, \fB\-\-sndbuf\-size \fR\fB\fIsize\fR\fR +\fB\-\-sndbuf\-size \fR\fB\fIsize\fR\fR .RS 4 The socket send buffer is used to store packets sent to the secondary node, which are not yet acknowledged (from a network point of view) by the secondary node\&. When using protocol A, it might be necessary to increase the size of this data structure in order to increase asynchronicity between primary and secondary nodes\&. But keep in mind that more asynchronicity is synonymous with more data loss in the case of a primary node failure\&. Since 8\&.0\&.13 resp\&. 8\&.2\&.7 setting the \fIsize\fR @@ -288,54 +427,54 @@ is 0, i\&.e\&. autotune\&. .RE .PP -\fB\-r\fR, \fB\-\-rcvbuf\-size \fR\fB\fIsize\fR\fR +\fB\-\-rcvbuf\-size \fR\fB\fIsize\fR\fR .RS 4 -Packets received from the network are stored in the socket receive buffer first\&. From there they are consumed by DRBD\&. Before 8\&.3\&.2 the receive buffer\'s size was always set to the size of the socket send buffer\&. Since 8\&.3\&.2 they can be tuned independently\&. A value of 0 means that the kernel should autotune this\&. The default +Packets received from the network are stored in the socket receive buffer first\&. From there they are consumed by DRBD\&. Before 8\&.3\&.2 the receive buffer\*(Aqs size was always set to the size of the socket send buffer\&. 
Since 8\&.3\&.2 they can be tuned independently\&. A value of 0 means that the kernel should autotune this\&. The default \fIsize\fR is 0, i\&.e\&. autotune\&. .RE .PP -\fB\-k\fR, \fB\-\-ko\-count \fR\fB\fIcount\fR\fR +\fB\-\-ko\-count \fR\fB\fIcount\fR\fR .RS 4 In case the secondary node fails to complete a single write request for \fIcount\fR times the -\fItimeout\fR, it is expelled from the cluster\&. (I\&.e\&. the primary node goes into StandAlone mode\&.) The default is 0, which disables this feature\&. +\fItimeout\fR, it is expelled from the cluster, i\&.e\&. the primary node goes into StandAlone mode\&. The default is 0, which disables this feature\&. .RE .PP -\fB\-e\fR, \fB\-\-max\-epoch\-size \fR\fB\fIval\fR\fR +\fB\-\-max\-epoch\-size \fR\fB\fIval\fR\fR .RS 4 With this option the maximal number of write requests between two barriers is limited\&. Should be set to the same as -\fB\-\-max\-buffers \fR\&. Values smaller than 100 can lead to degraded performance\&. The default value is 2048\&. +\fB\-\-max\-buffers\fR\&. Values smaller than 10 can lead to degraded performance\&. The default value is 2048\&. .RE .PP -\fB\-b\fR, \fB\-\-max\-buffers \fR\fB\fIval\fR\fR +\fB\-\-max\-buffers \fR\fB\fIval\fR\fR .RS 4 -With this option the maximal number of buffer pages allocated by DRBD\'s receiver thread is limited\&. Should be set to the same as -\fB\-\-max\-epoch\-size \fR\&. Small values could lead to degraded performance\&. (Minimum 32) The default value is 2048\&. +With this option the maximal number of buffer pages allocated by DRBD\*(Aqs receiver thread is limited\&. Should be set to the same as +\fB\-\-max\-epoch\-size\fR\&. Small values could lead to degraded performance\&. The default value is 2048, the minimum 32\&. 
.RE .PP -\fB\-u\fR, \fB\-\-unplug\-watermark \fR\fB\fIval\fR\fR +\fB\-\-unplug\-watermark \fR\fB\fIval\fR\fR .RS 4 When the number of pending write requests on the standby (secondary) node exceeds the unplug\-watermark, we trigger the request processing of our backing storage device\&. Some storage controllers deliver better performance with small values, others deliver best performance when the value is set to the same value as max\-buffers\&. Minimum 16, default 128, maximum 131072\&. .RE .PP -\fB\-m\fR, \fB\-\-allow\-two\-primaries \fR +\fB\-\-allow\-two\-primaries \fR .RS 4 With this option set you may assign primary role to both nodes\&. You only should use this option if you use a shared storage file system on top of DRBD\&. At the time of writing the only ones are: OCFS2 and GFS\&. If you use this option with any other file system, you are going to crash your nodes and to corrupt your data! .RE .PP -\fB\-a\fR, \fB\-\-cram\-hmac\-alg \fR\fIalg\fR +\fB\-\-cram\-hmac\-alg \fR\fB\fIalg\fR\fR .RS 4 You need to specify the HMAC algorithm to enable peer authentication at all\&. You are strongly encouraged to use peer authentication\&. The HMAC algorithm will be used for the challenge response authentication of the peer\&. You may specify any digest algorithm that is named in /proc/crypto\&. .RE .PP -\fB\-x\fR, \fB\-\-shared\-secret \fR\fIsecret\fR +\fB\-\-shared\-secret \fR\fB\fIsecret\fR\fR .RS 4 The shared secret used in peer authentication\&. May be up to 64 characters\&. 
.RE .PP -\fB\-A\fR, \fB\-\-after\-sb\-0pri \fR\fIasb\-0p\-policy\fR +\fB\-\-after\-sb\-0pri \fR\fB\fIasb\-0p\-policy\fR\fR .RS 4 possible policies are: .PP @@ -370,7 +509,7 @@ .RE .RE .PP -\fB\-B\fR, \fB\-\-after\-sb\-1pri \fR\fIasb\-1p\-policy\fR +\fB\-\-after\-sb\-1pri \fR\fB\fIasb\-1p\-policy\fR\fR .RS 4 possible policies are: .PP @@ -383,19 +522,19 @@ .RS 4 Discard the version of the secondary if the outcome of the \fBafter\-sb\-0pri\fR -algorithm would also destroy the current secondary\'s data\&. Otherwise disconnect\&. +algorithm would also destroy the current secondary\*(Aqs data\&. Otherwise disconnect\&. .RE .PP \fBdiscard\-secondary\fR .RS 4 -Discard the secondary\'s version\&. +Discard the secondary\*(Aqs version\&. .RE .PP \fBcall\-pri\-lost\-after\-sb\fR .RS 4 Always honor the outcome of the \fBafter\-sb\-0pri \fR -algorithm\&. In case it decides the current secondary has the right data, call the +algorithm\&. In case it decides the current secondary has the correct data, call the \fBpri\-lost\-after\-sb\fR on the current primary\&. .RE @@ -404,11 +543,11 @@ .RS 4 Always honor the outcome of the \fBafter\-sb\-0pri \fR -algorithm\&. In case it decides the current secondary has the right data, accept a possible instantaneous change of the primary\'s data\&. +algorithm\&. In case it decides the current secondary has the correct data, accept a possible instantaneous change of the primary\*(Aqs data\&. .RE .RE .PP -\fB\-C\fR, \fB\-\-after\-sb\-2pri \fR\fIasb\-2p\-policy\fR +\fB\-\-after\-sb\-2pri \fR\fB\fIasb\-2p\-policy\fR\fR .RS 4 possible policies are: .PP @@ -430,20 +569,20 @@ .RS 4 Always honor the outcome of the \fBafter\-sb\-0pri \fR -algorithm\&. In case it decides the current secondary has the right data, accept a possible instantaneous change of the primary\'s data\&. +algorithm\&. In case it decides the current secondary has the right data, accept a possible instantaneous change of the primary\*(Aqs data\&. 
.RE .RE .PP -\fB\-P\fR, \fB\-\-always\-asbp\fR +\fB\-\-always\-asbp\fR .RS 4 Normally the automatic after\-split\-brain policies are only used if current states of the UUIDs do not indicate the presence of a third node\&. .sp With this option you request that the automatic after\-split\-brain policies are used as long as the data sets of the nodes are somehow related\&. This might cause a full sync, if the UUIDs indicate the presence of a third node\&. (Or double faults have led to strange UUID sets\&.) .RE .PP -\fB\-R\fR, \fB\-\-rr\-conflict \fR\fIrole\-resync\-conflict\-policy\fR +\fB\-\-rr\-conflict \fR\fB\fIrole\-resync\-conflict\-policy\fR\fR .RS 4 -This option sets DRBD\'s behavior when DRBD deduces from its meta data that a resynchronization is needed, and the SyncTarget node is already primary\&. The possible settings are: +This option sets DRBD\*(Aqs behavior when DRBD deduces from its meta data that a resynchronization is needed, and the SyncTarget node is already primary\&. The possible settings are: \fBdisconnect\fR, \fBcall\-pri\-lost\fR and @@ -458,12 +597,12 @@ .sp With the \fBviolently\fR -setting you allow DRBD to force a primary node into SyncTarget state\&. This means that with that action the data exposed by DRBD change to the SyncSource\'s version of the data instantaneously\&. USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING\&. +setting you allow DRBD to force a primary node into SyncTarget state\&. This means that the data exposed by DRBD changes to the SyncSource\*(Aqs version of the data instantaneously\&. USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING\&. .RE .PP -\fB\-d\fR, \fB\-\-data\-integrity\-alg \fR\fIhash_alg\fR +\fB\-\-data\-integrity\-alg \fR\fB\fIhash_alg\fR\fR .RS 4 -DRBD can ensure the data integrity of the user\'s data on the network by comparing hash values\&. Normally this is ensured by the 16 bit checksums in the headers of TCP/IP packets\&. This option can be set to any of the kernel\'s data digest algorithms\&. 
In a typical kernel configuration you should have at least one of +DRBD can ensure the data integrity of the user\*(Aqs data on the network by comparing hash values\&. Normally this is ensured by the 16 bit checksums in the headers of TCP/IP packets\&. This option can be set to any of the kernel\*(Aqs data digest algorithms\&. In a typical kernel configuration you should have at least one of \fBmd5\fR, \fBsha1\fR, and \fBcrc32c\fR @@ -472,51 +611,54 @@ See also the notes on data integrity on the drbd\&.conf manpage\&. .RE .PP -\fB\-o\fR, \fB\-\-no\-tcp\-cork \fR +\fB\-\-no\-tcp\-cork\fR .RS 4 -DRBD usually uses the TCP socket option TCP_CORK to hint to the network stack when it can expect more data, and when it should flush out what it has in its send queue\&. It turned out that there is at lease one network stack that performs worse when one uses this hinting method\&. Therefore we introducted this option, which disable the setting and clearing of the TCP_CORK socket option by DRBD\&. +DRBD usually uses the TCP socket option TCP_CORK to hint to the network stack when it can expect more data, and when it should flush out what it has in its send queue\&. There is at least one network stack that performs worse when one uses this hinting method\&. Therefore we introduced this option, which disables the setting and clearing of the TCP_CORK socket option by DRBD\&. .RE .PP -\fB\-p\fR, \fB\-\-ping\-timeout \fR\fIping_timeout\fR +\fB\-\-ping\-timeout \fR\fB\fIping_timeout\fR\fR .RS 4 -The time the peer has to answer to a keep\-alive packet\&. In case the peer\'s reply is not received within this time period, it is considered as dead\&. The default value is 500ms, the default unit is 100ms\&. +The time the peer has to answer to a keep\-alive packet\&. In case the peer\*(Aqs reply is not received within this time period, it is considered dead\&. The default unit is tenths of a second, the default value is 5 (for half a second)\&. 
.RE .PP -\fB\-D\fR, \fB\-\-discard\-my\-data \fR +\fB\-\-discard\-my\-data\fR .RS 4 -Use this option to manually recover from a split\-brain situation\&. In case you do not have any automatic after\-split\-brain policies selected, the nodes refuse to connect\&. By passing this option you make a node to sync target immediately after successful connect\&. +Use this option to manually recover from a split\-brain situation\&. In case you do not have any automatic after\-split\-brain policies selected, the nodes refuse to connect\&. By passing this option you make this node a sync target immediately after successful connect\&. .RE -.SS "syncer" -.\" drbdsetup: syncer .PP -Changes the synchronization daemon parameters of -\fIdevice\fR -at runtime\&. -.PP -\fB\-r\fR, \fB\-\-rate \fR\fB\fIrate\fR\fR +\fB\-\-tentative\fR .RS 4 -To ensure smooth operation of the application on top of DRBD, it is possible to limit the bandwidth that may be used by background synchronization\&. The default is 250 KB/sec, the default unit is KB/sec\&. +Causes DRBD to abort the connection process after the resync handshake, i\&.e\&. no resync gets performed\&. You can find out which resync DRBD would perform by looking at the kernel\*(Aqs log file\&. .RE .PP -\fB\-a\fR, \fB\-\-after \fR\fB\fIminor\fR\fR +\fB\-\-on\-congestion \fR\fB\fIcongestion_policy\fR\fR, \fB\-\-congestion\-fill \fR\fB\fIfill_threshold\fR\fR, \fB\-\-congestion\-extents \fR\fB\fIactive_extents_threshold\fR\fR .RS 4 -Start resync on this device only if the device with -\fIminor\fR -is already in connected state\&. Otherwise this device waits in SyncPause state\&. -.RE -.PP -\fB\-e\fR, \fB\-\-al\-extents \fR\fB\fIextents\fR\fR -.RS 4 -DRBD automatically performs hot area detection\&. With this parameter you control how big the hot area (=active set) can get\&. Each extent marks 4M of the backing storage\&. 
In case a primary node leaves the cluster unexpectedly, the areas covered by the active set must be resynced upon rejoining of the failed node\&. The data structure is stored in the meta\-data area, therefore each change of the active set is a write operation to the meta\-data device\&. A higher number of extents gives longer resync times but less updates to the meta\-data\&. The default number of -\fIextents\fR -is 127\&. (Minimum: 7, Maximum: 3843) +By default DRBD blocks when the available TCP send queue becomes full\&. That means it will slow down the application that generates the write requests that cause DRBD to send more data down that TCP connection\&. +.sp +When DRBD is deployed with DRBD\-proxy it might be more desirable that DRBD goes into AHEAD/BEHIND mode shortly before the send queue becomes full\&. In AHEAD/BEHIND mode DRBD does no longer replicate data, but still keeps the connection open\&. +.sp +The advantage of the AHEAD/BEHIND mode is that the application is not slowed down, even if DRBD\-proxy\*(Aqs buffer is not sufficient to buffer all write requests\&. The downside is that the peer node falls behind, and that a resync will be necessary to bring it back into sync\&. During that resync the peer node will have an inconsistent disk\&. +.sp +Available +\fIcongestion_policy\fRs are +\fBblock\fR +and +\fBpull\-ahead\fR\&. The default is +\fBblock\fR\&. +\fIFill_threshold\fR +might be in the range of 0 to 10GiBytes\&. The default is 0 which disables the check\&. +\fIActive_extents_threshold\fR +has the same limits as +\fBal\-extents\fR\&. +.sp +The AHEAD/BEHIND mode and its settings are available since DRBD 8\&.3\&.10\&. 
.RE .PP -\fB\-v\fR, \fB\-\-verify\-alg \fR\fB\fIhash\-alg\fR\fR +\fB\-\-verify\-alg \fR\fB\fIhash\-alg\fR\fR .RS 4 During online verification (as initiated by the \fBverify\fR -sub\-command), rather than doing a bit\-wise comparison, DRBD applies a hash function to the contents of every block being verified, and compares that hash with the peer\&. This option defines the hash algorithm being used for that purpose\&. It can be set to any of the kernel\'s data digest algorithms\&. In a typical kernel configuration you should have at least one of +sub\-command), rather than doing a bit\-wise comparison, DRBD applies a hash function to the contents of every block being verified, and compares that hash with the peer\&. This option defines the hash algorithm being used for that purpose\&. It can be set to any of the kernel\*(Aqs data digest algorithms\&. In a typical kernel configuration you should have at least one of \fBmd5\fR, \fBsha1\fR, and \fBcrc32c\fR @@ -525,14 +667,7 @@ See also the notes on data integrity on the drbd\&.conf manpage\&. .RE .PP -\fB\-c\fR, \fB\-\-cpu\-mask \fR\fB\fIcpu\-mask\fR\fR -.RS 4 -Sets the cpu\-affinity\-mask for DRBD\'s kernel threads of this device\&. The default value of -\fIcpu\-mask\fR -is 0, which means that DRBD\'s kernel threads should be spread over all CPUs of the machine\&. This value must be given in hexadecimal notation\&. If it is too big it will be truncated\&. -.RE -.PP -\fB\-C\fR, \fB\-\-csums\-alg \fR\fB\fIhash\-alg\fR\fR +\fB\-\-csums\-alg \fR\fB\fIhash\-alg\fR\fR .RS 4 A resync process sends all marked data blocks form the source to the destination node, as long as no \fBcsums\-alg\fR @@ -543,16 +678,46 @@ will lower the required bandwidth in exchange for CPU cycles\&. .RE .PP -\fB\-R\fR, \fB\-\-use\-rle\fR +\fB\-\-use\-rle\fR .RS 4 During resync\-handshake, the dirty\-bitmaps of the nodes are exchanged and merged (using bit\-or), so the nodes will have the same understanding of which blocks are dirty\&. 
On large devices, the fine grained dirty\-bitmap can become large as well, and the bitmap exchange can take quite some time on low\-bandwidth links\&. .sp Because the bitmap typically contains compact areas where all bits are unset (clean) or set (dirty), a simple run\-length encoding scheme can considerably reduce the network traffic necessary for the bitmap exchange\&. .sp -For backward compatibilty reasons, and because on fast links this possibly does not improve transfer time but consumes cpu cycles, this defaults to off\&. +For backward compatibility reasons, and because on fast links this possibly does not improve transfer time but consumes cpu cycles, this defaults to off\&. .sp Introduced in 8\&.3\&.2\&. .RE +.SS "resource\-options" +.\" drbdsetup: resource-options +.PP +Changes the options of the resource at runtime\&. +.PP +\fB\-\-cpu\-mask \fR\fB\fIcpu\-mask\fR\fR +.RS 4 +Sets the cpu\-affinity\-mask for DRBD\*(Aqs kernel threads of this device\&. The default value of +\fIcpu\-mask\fR +is 0, which means that DRBD\*(Aqs kernel threads should be spread over all CPUs of the machine\&. This value must be given in hexadecimal notation\&. If it is too big it will be truncated\&. +.RE +.PP +\fB\-\-on\-no\-data\-accessible \fR\fB\fIond\-policy\fR\fR +.RS 4 +This setting controls what happens to IO requests on a degraded, disk less node (I\&.e\&. no data store is reachable)\&. The available policies are +\fBio\-error\fR +and +\fBsuspend\-io\fR\&. +.sp +If +\fIond\-policy\fR +is set to +\fBsuspend\-io\fR +you can either resume IO by attaching/connecting the last lost data storage, or by the +\fBdrbdadm resume\-io \fR\fB\fIres\fR\fR +command\&. The latter will result in IO errors of course\&. +.sp +The default is +\fBio\-error\fR\&. This setting is available since DRBD 8\&.3\&.9\&. +.RE .SS "primary" .\" drbdsetup: primary .PP @@ -568,9 +733,14 @@ \fB\-\-allow\-two\-primaries\fR option, you override this behavior and instruct DRBD to allow two primaries\&. 
.PP -\fB\-o\fR, \fB\-\-overwrite\-data\-of\-peer\fR +\fB\-\-overwrite\-data\-of\-peer\fR .RS 4 -Becoming primary fails if the local replica is inconsistent\&. By using this option you can force it into primary role anyway\&. USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING\&. +Alias for \-\-force\&. +.RE +.PP +\fB\-\-force\fR +.RS 4 +Becoming primary fails if the local replica is not up\-to\-date\&. I\&.e\&. when it is inconsistent, outdated or consistent\&. By using this option you can force it into primary role anyway\&. USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING\&. .RE .SS "secondary" .\" drbdsetup: secondary @@ -584,7 +754,7 @@ .\" drbdsetup: verify .PP This initiates on\-line device verification\&. During on\-line verification, the contents of every block on the local node are compared to those on the peer node\&. Device verification progress can be monitored via -/proc/drbd\&. Any blocks whose content differs from that of the corresponding block on the peer node will be marked out\-of\-sync in DRBD\'s on\-disk bitmap; they are +/proc/drbd\&. Any blocks whose content differs from that of the corresponding block on the peer node will be marked out\-of\-sync in DRBD\*(Aqs on\-disk bitmap; they are \fInot\fR brought back in sync automatically\&. To do that, simply disconnect and reconnect the resource\&. .PP @@ -596,11 +766,13 @@ .PP See also the notes on data integrity on the drbd\&.conf manpage\&. .PP -\fB\-s\fR, \fB\-\-start \fR\fB\fIstart\-sector\fR\fR +\fB\-\-start \fR\fB\fIstart\-sector\fR\fR .RS 4 Since version 8\&.3\&.2, on\-line verification should resume from the last position after connection loss\&. It may also be started from an arbitrary position by setting this option\&. .sp -Default unit is sectors\&. You may also specify a unit explicitly\&. The start\-sector will be rounded down to a multiple of 8 sectors (4kB)\&. +Default unit is sectors\&. You may also specify a unit explicitly\&. 
The +\fBstart\-sector\fR +will be rounded down to a multiple of 8 sectors (4kB)\&. .RE .SS "invalidate" .\" drbdsetup: invalidate @@ -614,6 +786,8 @@ .\" drbdsetup: invalidate-remote .PP This forces the local device of a pair of connected DRBD devices into SyncSource state, which means that all data blocks of the device are copied to the peer\&. +.PP +On a disconnected device, this will set all bits in the out of sync bitmap\&. As a side effect, this suspends updates to the on disk activity log\&. Updates to the on disk activity log will be resumed automatically when necessary\&. .SS "wait\-connect" .\" drbdsetup: wait-connect .PP @@ -621,7 +795,7 @@ \fIdevice\fR can communicate with its partner device\&. .PP -\fB\-t\fR, \fB\-\-wfc\-timeout \fR\fB\fIwfc_timeout\fR\fR, \fB\-d\fR, \fB\-\-degr\-wfc\-timeout \fR\fB\fIdegr_wfc_timeout\fR\fR, \fB\-o\fR, \fB\-\-outdated\-wfc\-timeout \fR\fB\fIoutdated_wfc_timeout\fR\fR, \fB\-w\fR, \fB\-\-wait\-after\-sb\fR +\fB\-\-wfc\-timeout \fR\fB\fIwfc_timeout\fR\fR, \fB\-\-degr\-wfc\-timeout \fR\fB\fIdegr_wfc_timeout\fR\fR, \fB\-\-outdated\-wfc\-timeout \fR\fB\fIoutdated_wfc_timeout\fR\fR, \fB\-\-wait\-after\-sb\fR .RS 4 This command will fail if the \fIdevice\fR @@ -631,9 +805,9 @@ \fIwfc_timeout\fR is used\&. If the peer was already down before this node was rebooted, the \fIdegr_wfc_timeout\fR -is used\&. If the peer was sucessfully outdated before this node was rebooted the +is used\&. If the peer was successfully outdated before this node was rebooted the \fIoutdated_wfc_timeout\fR -is used\&. The default value for all those timeout values is 0 which means to wait forever\&. In case the connection status goes down to StandAlone because the peer appeared but the devices had a split brain situation, the default for the command is to terminate\&. You can change this behavior with the +is used\&. The default value for all those timeout values is 0 which means to wait forever\&. The unit is seconds\&. 
In case the connection status goes down to StandAlone because the peer appeared but the devices had a split brain situation, the default for the command is to terminate\&. You can change this behavior with the \fB\-\-wait\-after\-sb\fR option\&. .RE @@ -663,6 +837,13 @@ \fIdevice\fR\&. This means that the \fIdevice\fR is detached from its backing storage device\&. +.PP +\fB\-f\fR, \fB\-\-force\fR +.RS 4 +A regular detach returns after the disk state finally reached diskless\&. As a consequence detaching from a frozen backing block device never terminates\&. +.sp +On the other hand, a forced detach returns immediately\&. It allows you to detach DRBD from a frozen backing block device\&. Please note that the disk will be marked as failed until all pending IO requests were finished by the backing block device\&. +.RE .SS "down" .\" drbdsetup: down .PP @@ -674,7 +855,8 @@ .PP Shows the current roles of the \fIdevice\fR -and its peer\&. (local/peer)\&. +and its peer, as +\fIlocal\fR/\fIpeer\fR\&. .SS "state" .\" drbdsetup: state .PP @@ -687,39 +869,37 @@ .SS "dstate" .\" drbdsetup: dstate .PP -Shows the current states of the backing storage devices\&. (local/peer) -.SS "status" -.\" drbdsetup: status -.PP -Shows the current status of the device in xml\-like format\&. Example output: -.sp -.if n \{\ -.RS 4 -.\} -.nf - - -.fi -.if n \{\ -.RE -.\} -.sp +Shows the current states of the backing storage devices, as +\fIlocal\fR/\fIpeer\fR\&. .SS "resize" .\" drbdsetup: resize .PP This causes DRBD to reexamine the size of the -\fIdevice\fR\'s backing storage device\&. To actually do online growing you need to extend the backing storages on both devices and call the +\fIdevice\fR\*(Aqs backing storage device\&. To actually do online growing you need to extend the backing storages on both devices and call the \fBresize\fR -command one of your nodes\&. +command on one of your nodes\&. 
.PP The \fB\-\-assume\-peer\-has\-space\fR -allows you to resize a device which is currently not connected to the peer\&. Use with care, since if you do not resize the peer\'s disk as well, further connect attempts of the two will fail\&. +allows you to resize a device which is currently not connected to the peer\&. Use with care, since if you do not resize the peer\*(Aqs disk as well, further connect attempts of the two will fail\&. +.PP +When the +\fB\-\-assume\-clean\fR +option is given DRBD will skip the resync of the new storage\&. Only do this if you know that the new storage was initialized to the same content by other means\&. +.SS "check\-resize" +.\" drbdsetup: check-resize +.PP +To enable DRBD to detect offline resizing of backing devices this command may be used to record the current size of backing devices\&. The size is stored in files in /var/lib/drbd/ named drbd\-minor\-??\&.lkbd +.PP +This command is called by +\fBdrbdadm resize \fR\fB\fIres\fR\fR +after +\fBdrbdsetup \fR\fB\fIdevice\fR\fR\fB resize\fR +returned\&. .SS "pause\-sync" .\" drbdsetup: pause-sync .PP -Temporarily suspend an ongoing resynchronization by setting the local pause flag\&. Resync only progresses if neither the local nor the remote pause flag is set\&. It might be desirable to postpone DRBD\'s resynchronization after eventual resynchronization of the backing storage\'s RAID setup\&. +Temporarily suspend an ongoing resynchronization by setting the local pause flag\&. Resync only progresses if neither the local nor the remote pause flag is set\&. It might be desirable to postpone DRBD\*(Aqs resynchronization after eventual resynchronization of the backing storage\*(Aqs RAID setup\&. .SS "resume\-sync" .\" drbdsetup: resume-sync .PP @@ -729,15 +909,17 @@ .PP Mark the data on the local backing storage as outdated\&. An outdated device refuses to become primary\&. This is used in conjunction with \fBfencing\fR -and by the peer\'s fence\-peer handler\&. 
+and by the peer\*(Aqs +\fBfence\-peer\fR +handler\&. .SS "show\-gi" .\" drbdsetup: show-gi .PP -Displays the device\'s data generation identifiers verbosely\&. +Displays the device\*(Aqs data generation identifiers verbosely\&. .SS "get\-gi" .\" drbdsetup: get-gi .PP -Displays the device\'s data generation identifiers\&. +Displays the device\*(Aqs data generation identifiers\&. .SS "show" .\" drbdsetup: show .PP @@ -756,25 +938,25 @@ .SS "events" .\" drbdsetup: events .PP -Displays every state change of DRBD and all calls to helper programs\&. This might be used to get notified of DRBD\'s state changes by piping the output to another program\&. +Displays every state change of DRBD and all calls to helper programs\&. This might be used to get notified of DRBD\*(Aqs state changes by piping the output to another program\&. .PP -\fB\-a\fR, \fB\-\-all\-devices\fR +\fB\-\-all\-devices\fR .RS 4 Display the events of all DRBD minors\&. .RE .PP -\fB\-u\fR, \fB\-\-unfiltered\fR +\fB\-\-unfiltered\fR .RS 4 This is a debugging aid that displays the content of all received netlink messages\&. .RE .SS "new\-current\-uuid" .\" drbdsetup: new-current-uuid .PP -Generates a new currend UUID and rotates all other UUID values\&. This has at least two use cases, namely to skip the initial sync, and to reduce network bandwidth when starting in a single node configuration and then later (re\-)integrating a remote site\&. +Generates a new current UUID and rotates all other UUID values\&. This has at least two use cases, namely to skip the initial sync, and to reduce network bandwidth when starting in a single node configuration and then later (re\-)integrating a remote site\&. .PP Available option: .PP -\fB\-c\fR, \fB\-\-clear\-bitmap\fR +\fB\-\-clear\-bitmap\fR .RS 4 Clears the sync bitmap in addition to generating a new current UUID\&. .RE @@ -793,7 +975,7 @@ \fIboth\fR nodes, initialize meta data and configure the device\&. 
.sp -\fBdrbdadm \-\- \-\-force create\-md \fR\fB\fIres\fR\fR +\fBdrbdadm create\-md \-\-force \fR\fB\fIres\fR\fR .RE .sp .RS 4 @@ -819,7 +1001,7 @@ .\} They are now Connected Secondary/Secondary Inconsistent/Inconsistent\&. Generate a new current\-uuid and clear the dirty bitmap\&. .sp -\fBdrbdadm \-\- \-\-clear\-bitmap new\-current\-uuid \fR\fB\fIres\fR\fR +\fBdrbdadm new\-current\-uuid \-\-clear\-bitmap \fR\fB\fIres\fR\fR .RE .sp .RS 4 @@ -836,17 +1018,17 @@ .sp \fBmkfs \-t \fR\fB\fIfs\-type\fR\fR\fB $(drbdadm sh\-dev \fR\fB\fIres\fR\fR\fB)\fR .RE -.sp -.RE .PP -One obvious side\-effect is that the replica are full of old garbage (unless you made them identical using other means), so any online\-verify is expected to find any number of out\-of\-sync blocks\&. +One obvious side\-effect is that the replica is full of old garbage (unless you made them identical using other means), so any online\-verify is expected to find any number of out\-of\-sync blocks\&. .PP \fIYou must not use this on pre\-existing data!\fR -Even though it may appear to work at first glance, once you switch to the other node, your data is toast, as it never got replicated\&. So do +Even though it may appear to work at first glance, once you switch to the other node, your data is toast, as it never got replicated\&. So \fIdo not leave out the mkfs\fR (or equivalent)\&. .PP -This can also be used to shorten the initial resync of a cluster where the second node is added after the first node is gone into production, by means of disk shipping\&. This use\-case works on disconnected devices only, the device may be in primary or secondary role\&. The necessary steps are: +This can also be used to shorten the initial resync of a cluster where the second node is added after the first node is gone into production, by means of disk shipping\&. This use\-case works on disconnected devices only, the device may be in primary or secondary role\&. 
+.PP +The necessary steps on the current active server are: .sp .RS 4 .ie n \{\ @@ -856,7 +1038,7 @@ .sp -1 .IP " 1." 4.2 .\} -\fBdrbdsetup \fR\fB\fIdevice\fR\fR\fB new\-current\-uuid \-\-clear\-bitmap\fR +\fBdrbdsetup new\-current\-uuid \-\-clear\-bitmap \fR\fB\fIminor\fR\fR\fB \fR .RE .sp .RS 4 @@ -878,27 +1060,16 @@ .sp -1 .IP " 3." 4.2 .\} -\fBdrbdsetup \fR\fB\fIdevice\fR\fR\fB new\-current\-uuid\fR +\fBdrbdsetup new\-current\-uuid \fR\fB\fIminor\fR\fR\fB \fR .RE .sp -.RS 4 -.ie n \{\ -\h'-04' 4.\h'+01'\c -.\} -.el \{\ -.sp -1 -.IP " 4." 4.2 -.\} -Add the disk to the new secondary node, and join it to the cluster\&. You will get a resync of that parts that where changed since the first call to +Now add the disk to the new secondary node, and join it to the cluster\&. You will get a resync of that parts that were changed since the first call to \fBdrbdsetup\fR in step 1\&. -.RE -.sp -.RE .SH "EXAMPLES" .PP For examples, please have a look at the -\m[blue]\fBDRBD User\'s Guide\fR\m[]\&\s-2\u[1]\d\s+2\&. +\m[blue]\fBDRBD User\*(Aqs Guide\fR\m[]\&\s-2\u[1]\d\s+2\&. .SH "VERSION" .sp This document was revised for version 8\&.3\&.2 of the DRBD distribution\&. @@ -915,7 +1086,9 @@ .PP \fBdrbd.conf\fR(5), \fBdrbd\fR(8), -\fBdrbddisk\fR(8)\fBdrbdadm\fR(8)\m[blue]\fBDRBD User\'s Guide\fR\m[]\&\s-2\u[1]\d\s+2, +\fBdrbddisk\fR(8), +\fBdrbdadm\fR(8), +\m[blue]\fBDRBD User\*(Aqs Guide\fR\m[]\&\s-2\u[1]\d\s+2, \m[blue]\fBDRBD web site\fR\m[]\&\s-2\u[2]\d\s+2 .SH "NOTES" .IP " 1." 
4 diff -Nru drbd8-8.3.7/documentation/drbdsetup.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup.xml --- drbd8-8.3.7/documentation/drbdsetup.xml 2010-01-07 09:09:33.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup.xml 2012-02-02 14:09:14.000000000 +0000 @@ -1,1558 +1,1716 @@ - - + + - 5 Dec 2008 + 6 May 2011 + DRBD - 8.3.2 + + 8.4.0 + drbdsetup + 8 + System Administration + drbdsetup - Setup tool for DRBD - - drbdsetup - - + + Setup tool for DRBD + drbdsetup + + - - drbdsetup - - device - - disk - - lower_dev - - - meta_data_dev - - - meta_data_index - - -dsize - -eerr_handler - -ffencing_policy - -b - - - drbdsetup - - device - - net - - af: - - - local_addr - - - :port - - - af: - - - remote_addr - - - :port - - - protocol - - -ctime - -itime - -tval - -Ssize - -rsize - -kcount - -emax_epoch_size - -bmax_buffers - -m - -ahash_alg - -xshared_secret - -Aasb-0p-policy - -Basb-1p-policy - -Casb-2p-policy - -D - -Rrole-resync-conflict-policy - -pping_timeout - -uval - -dhash_alg - -o - - - drbdsetup - - device - - syncer - -adev_minor - -rrate - -eextents - -vverify-hash-alg - -ccpu-mask - -Ccsums-hash-alg - -Ruse-rle - - - drbdsetup - - device - - disconnect - - - drbdsetup - - device - - detach - - - drbdsetup - - device - - down - - - drbdsetup - - device - - primary - -o - - - drbdsetup - - device - - secondary - - - drbdsetup - - device - - verify - -sstart-position - - - drbdsetup - - device - - invalidate - - - drbdsetup - - device - - invalidate-remote - - - drbdsetup - - device - - wait-connect - -twfc_timeout - -ddegr_wfc_timeout - -ooutdated_wfc_timeout - -w - - - drbdsetup - - device - - wait-sync - -twfc_timeout - -ddegr_wfc_timeout - -ooutdated_wfc_timeout - -w - - - drbdsetup - - device - - role - - - drbdsetup - - device - - cstate - - - drbdsetup - - device - - dstate - - - drbdsetup - - device - - status - - - drbdsetup - - device - - resize - -dsize - -fassume-peer-has-space - - - drbdsetup - - device - - pause-sync - - 
- drbdsetup - - device - - resume-sync - - - drbdsetup - - device - - outdate - - - drbdsetup - - device - - show-gi - - - drbdsetup - - device - - get-gi - - - drbdsetup - - device - - show - - - drbdsetup - - device - - suspend-io - - - drbdsetup - - device - - resume-io - - - drbdsetup - - device - - events - -u - -a - - - drbdsetup - - device - - new-current-uuid - -c - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Description - drbdsetup is used to associate DRBD devices with their backing - block devices, to set up DRBD device pairs to mirror their - backing block devices, and to inspect the configuration of - running DRBD devices. - + + drbdsetup is used to associate DRBD devices with their backing block devices, to set up + DRBD device pairs to mirror their backing block devices, and to inspect the configuration of + running DRBD devices. + Note - drbdsetup is a low level tool of the DRBD program suite. It is - used by the data disk and drbd scripts to communicate with - the device driver. - + + drbdsetup is a low level tool of the DRBD program suite. It is used by the data disk and + drbd scripts to communicate with the device driver. + Commands - Each drbdsetup sub-command might require arguments and bring its own - set of options. All values have default units which might be overruled - by K, M or G. These units are defined in the usual way (e.g. K = 2^10 = 1024). - + + Each drbdsetup sub-command might require arguments and bring its own set of options. All + values have default units which might be overruled by K, M or G. These units are defined in + the usual way (e.g. K = 2^10 = 1024). + Common options - All drbdsetup sub-commands accept these two options - In case the specified DRBD device (minor number) does not - exist yet, create it implicitly. 
- When is given on the - command line, all options of the invoked sub-command that - are not explicitly set are reset to their default values. - + All drbdsetup sub-commands accept these two options + + - + + In case the specified DRBD device (minor number) does not exist yet, create it + implicitly. + + + + + + + new-resource + + Resources are the primary objects of any DRBD configuration. A resource must be created + with the command before any volumes or minor devices can be created. + Connections are referenced by name. + - disk + new-minor + + A minor is used as a synonym for replicated block device. It is + represented in the /dev/ directory by a block device. It is the application's interface to + the DRBD-replicated block devices. These block devices get addressed by their minor numbers + on the drbdsetup commandline. + + A pair of replicated block devices may have different minor numbers on the two + machines. They are associated by a common volume-number. Volume numbers + are local to each connection. Minor numbers are global on one node. + + + + del-resource + + Destroys a resource object. This is only possible if the resource has no + volumes. + + + + del-minor + + Minors can only be destroyed if its disk is detached. + + + + attach, disk-options + drbdsetup + disk - Associates device with - lower_device to store its data blocks on. - The (or ) should - only be used if you wish not to use as much as possible from the - backing block devices. - If you do not use , the device - is only ready for use as soon as it was connected to its peer once. - (See the command.) - + + Attach associates device with + lower_device to store its data blocks on. The + (or ) should only be used if you wish not to use as much as + possible from the backing block devices. If you do not use , the + device is only ready for use as soon as it was connected to its + peer once. (See the command.) 
+ + With the disk-options command it is possible to change the options of a minor while it + is attached. + - , - + + - You can override DRBD's size determination method with this - option. If you need to use the device before it was ever - connected to its peer, use this option to pass the - size of the DRBD device to the - driver. Default unit is sectors (1s = 512 bytes). - - If you use the size parameter in drbd.conf, - we strongly recommend to add an explicit unit postfix. - drbdadm and drbdsetup used to have mismatching default units. - + You can override DRBD's size determination method with this option. If you need + to use the device before it was ever connected to its peer, use this option to pass + the size of the DRBD device to the driver. Default unit is + sectors (1s = 512 bytes). + + If you use the size parameter in drbd.conf, we + strongly recommend to add an explicit unit postfix. drbdadm and drbdsetup used to have + mismatching default units. + - , - + + - If the driver of the lower_device - reports an error to DRBD, DRBD will either pass the error - to the upper layers of the operating system, call a helper - program, or detach the device from its backing storage and - perform all further IO by requesting it from the peer. The - valid err_handlers are: - , - and . - + If the driver of the lower_device + reports an error to DRBD, DRBD will mark the disk as inconsistent, + call a helper program, or detach the device from its backing storage and perform all + further IO by requesting it from the peer. The valid + err_handlers are: , + and . + - , - + + - Under we understand preventative - measures to avoid situations where both nodes are primary - and disconnected (AKA split brain). - - Valid fencing policies are: - + Under we understand preventive measures to avoid + situations where both nodes are primary and disconnected (AKA split brain). + + Valid fencing policies are: + - - - + + - This is the default policy. No fencing actions are undertaken. 
- + This is the default policy. No fencing actions are done. + - - - + + - If a node becomes a disconnected primary. it tries to outdate - the peer's disk. This is done by calling the fence-peer - handler. The handler is supposed to reach the other node over - alternative communication paths and call 'drbdadm outdate - res' there. - + If a node becomes a disconnected primary, it tries to outdate the peer's + disk. This is done by calling the fence-peer handler. The handler is supposed to + reach the other node over alternative communication paths and call 'drbdadm + outdate res' there. + - - - + + - If a node becomes a disconnected primary, it freezes all - its IO operations and calls its fence-peer handler. The - fence-peer handler is supposed to reach the peer over - alternative communication paths and call 'drbdadm outdate - res' there. In case it cannot reach the peer, it should - stonith the peer. IO is resumed as soon as the situation - is resolved. In case your handler fails, you can resume - IO with the command. - + If a node becomes a disconnected primary, it freezes all its IO operations + and calls its fence-peer handler. The fence-peer handler is supposed to reach + the peer over alternative communication paths and call 'drbdadm outdate res' + there. In case it cannot reach the peer, it should stonith the peer. IO is + resumed as soon as the situation is resolved. In case your handler fails, you + can resume IO with the command. + - , - + + + + + + - In case the backing storage's driver has a merge_bvec_fn() - function, DRBD has to - pretend that it can only process IO requests in units - not lager than 4kByte. 
(At time of writing the only known - drivers which - have such a function are: md (software raid driver), - dm (device mapper - LVM) and DRBD itself) - To get best performance out of DRBD on top of software - raid (or any other driver with a merge_bvec_fn() function) - you might enable this function, if you know for sure - that the merge_bvec_fn() function will deliver the same - results on all nodes of your cluster. I.e. the physical - disks of the software raid are exactly of the same type. - USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING. - - - - , - , - , - - DRBD has four implementations to express write-after-write dependencies to - its backing storage device. DRBD will use the first method that is - supported by the backing storage device and that is not disabled by the user. - - When selecting the method you should not only base your decision on the - measurable performance. In case your backing storage device has a volatile - write cache (plain disks, RAID of plain disks) you should use one - of the first two. In case your backing storage device has battery-backed - write cache you may go with option 3 or 4. Option 4 will deliver the - best performance such devices. - - Unfortunately device mapper (LVM) does not support barriers. - - The letter after "wo:" in /proc/drbd indicates with method is currently in - use for a device: b, f, d, n. The implementations: - + DRBD has four implementations to express write-after-write dependencies to its + backing storage device. DRBD will use the first method that is supported by the + backing storage device and that is not disabled by the user. By default all three + options are enabled. + + When selecting the method you should not only base your decision on the + measurable performance. In case your backing storage device has a volatile write cache + (plain disks, RAID of plain disks) you should use one of the first two. 
In case your + backing storage device has battery-backed write cache you may go with option 3. + Option 4 (disable everything, use "none") is dangerous + on most IO stacks, may result in write-reordering, and if so, + can theoretically be the reason for data corruption, or disturb + the DRBD protocol, causing spurious disconnect/reconnect cycles. + Do not use . + + Unfortunately device mapper (LVM) might not support barriers. + + The letter after "wo:" in /proc/drbd indicates with method is currently in use + for a device: b, f, d, n. The implementations: barrier + - The first requirs that the driver of the - backing storage device support barriers (called 'tagged command queuing' in - SCSI and 'native command queuing' in SATA speak). The use of this - method can be disabled by the we option. - + The first requires that the driver of the backing storage device support + barriers (called 'tagged command queuing' in SCSI and 'native command queuing' + in SATA speak). The use of this method can be disabled by setting the + options to . + flush + - The second requires that the backing device support disk flushes (called - 'force unit access' in the drive vendors speak). The use of this method - can be disabled using the option. - + The second requires that the backing device support disk flushes (called + 'force unit access' in the drive vendors speak). The use of this method can be + disabled setting to . + drain + - The third method is simply to let write requests drain before - write requests of a new reordering domain are issued. That was the - only implementation before 8.0.9. You can prevent to use of this - method by using the option. - + The third method is simply to let write requests drain before write + requests of a new reordering domain are issued. That was the only implementation + before 8.0.9. + none + - The fourth method is to not express write-after-write dependencies to - the backing store at all. 
- + The fourth method is to not express write-after-write dependencies to + the backing store at all, by also specifying . + This is dangerous + on most IO stacks, may result in write-reordering, and if so, + can theoretically be the reason for data corruption, or disturb + the DRBD protocol, causing spurious disconnect/reconnect cycles. + Do not use . + - , - + + - Disables the use of disk flushes and barrier BIOs when - accessing the meta data device. See the notes - on . - + Disables the use of disk flushes and barrier BIOs when accessing the meta data + device. See the notes on . + - , - - - In some special circumstances the device mapper stack manages to - pass BIOs to DRBD that violate the constraints that are set forth - by DRBD's merge_bvec() function and which have more than one bvec. - A known example is: - phys-disk -> DRBD -> LVM -> Xen -> missaligned partition (63) -> DomU FS. - Then you might see "bio would need to, but cannot, be split:" in - the Dom0's kernel log. - The best workaround is to proper align the partition within - the VM (E.g. start it at sector 1024). Costs 480 KiByte of storage. - Unfortunately the default of most Linux partitioning tools is - to start the first partition at an odd number (63). Therefore - most distribution's install helpers for virtual linux machines will - end up with missaligned partitions. - The second best workaround is to limit DRBD's max bvecs per BIO - (= max-bio-bvecs) to 1. Might cost performance. - The default value of is 0, which means that - there is no user imposed limitation. - + + + + In some special circumstances the device mapper stack manages to pass BIOs to + DRBD that violate the constraints that are set forth by DRBD's merge_bvec() function + and which have more than one bvec. A known example is: phys-disk -> DRBD -> LVM + -> Xen -> missaligned partition (63) -> DomU FS. Then you might see "bio + would need to, but cannot, be split:" in the Dom0's kernel log. 
+ + The best workaround is to proper align the partition within the VM (E.g. start + it at sector 1024). That costs 480 KiB of storage. Unfortunately the default of most + Linux partitioning tools is to start the first partition at an odd number (63). + Therefore most distributions install helpers for virtual linux machines will end up + with missaligned partitions. The second best workaround is to limit DRBD's max bvecs + per BIO (i.e., the option) to 1, but that might cost + performance. + + The default value of is 0, which means that there + is no user imposed limitation. + + + + + + + + To ensure smooth operation of the application on top of DRBD, it is possible to + limit the bandwidth that may be used by background synchronization. The default is 250 + KiB/sec, the default unit is KiB/sec. + + + + + + + + Start resync on this device only if the device with + minor is already in connected state. Otherwise this device + waits in SyncPause state. + + + + + + + + DRBD automatically performs hot area detection. With this parameter you control + how big the hot area (=active set) can get. Each extent marks 4M of the backing + storage. In case a primary node leaves the cluster unexpectedly, the areas covered by + the active set must be resynced upon rejoining of the failed node. The data structure + is stored in the meta-data area, therefore each change of the active set is a write + operation to the meta-data device. A higher number of extents gives longer resync + times but less updates to the meta-data. The default number of + extents is 127. (Minimum: 7, Maximum: 3843) + + + + + + + + + + + + The dynamic resync speed controller gets enabled with setting + plan_time to a positive value. It aims to fill the buffers + along the data path with either a constant amount of data + fill_target, or aims to have a constant delay time of + delay_target along the path. The controller has an upper + bound of max_rate. 
+ + By plan_time the agility of the controller is + configured. Higher values yield for slower/lower responses of the controller to + deviation from the target value. It should be at least 5 times RTT. For regular data + paths a fill_target in the area of 4k to 100k is + appropriate. For a setup that contains drbd-proxy it is advisable to use + delay_target instead. Only when + fill_target is set to 0 the controller will use + delay_target. 5 times RTT is a reasonable starting value. + Max_rate should be set to the bandwidth available between + the DRBD-hosts and the machines hosting DRBD-proxy, or to the available + disk-bandwidth. + + The default value of plan_time is 0, the default unit + is 0.1 seconds. Fill_target has 0 and sectors as default + unit. Delay_target has 1 (100ms) and 0.1 as default unit. + Max_rate has 10240 (100MiB/s) and KiB/s as default + unit. + + + + + + + + We track the disk IO rate caused by the resync, so we can detect non-resync IO + on the lower level device. If the lower level device seems to be busy, and the current + resync rate is above min_rate, we throttle the + resync. + + The default value of min_rate is 4M, the default unit + is k. If you want to not throttle at all, set it to zero, if you want to throttle + always, set it to one. + + + + , + + + If the driver of the lower_device + does not finish an IO request within disk_timeout, + DRBD considers the disk as failed. If DRBD is connected to a remote host, + it will reissue local pending IO requests to the peer, and ship all new + IO requests to the peer only. The disk state advances to diskless, as soon + as the backing block device has finished all IO requests. + The default value of is 0, which means that no timeout is enforced. + The default unit is 100ms. This option is available since 8.3.12. + + + + + + + + + + + The supported methods for load balancing of + read requests are , , + , and + , , + , , + , + and . + The default value of is . 
+ This option is available since 8.4.1. + + + + + - net + connect, net-options + drbdsetup + net - Sets up the device to listen on - af:local_addr:port for incoming connections - and to try to connect to af:remote_addr:port. - If port is omitted, 7788 is used as default. - If af is omitted gets - used. Other supported address families are , - for Dolphin Interconnect Solutions' "super sockets" - and for Sockets Direct Protocol (Infiniband). - - On the TCP/IP link the specified protocol - is used. Valid protocol specifiers are A, B, and C. - Protocol A: write IO is reported as completed, if it has reached - local disk and local TCP send buffer. - Protocol B: write IO is reported as completed, if it has reached - local disk and remote buffer cache. - Protocol C: write IO is reported as completed, if it has - reached both local and remote disk. + + Connect sets up the device to listen on + af:local_addr:port for incoming connections and to try to connect + to af:remote_addr:port. If port is + omitted, 7788 is used as default. If af is omitted + gets used. Other supported address families are , + for Dolphin Interconnect Solutions' "super sockets" and + for Sockets Direct Protocol (Infiniband). + + The net-options command allows you to change options while the connection is + established. + - , - + + - In case it is not possible to connect to the remote DRBD - device immediately, DRBD keeps on trying to connect. With - this option you can set the time between two tries. The - default value is 10 seconds, the unit is 1 second. - + On the TCP/IP link the specified protocol is used. + Valid protocol specifiers are A, B, and C. + + Protocol A: write IO is reported as completed, if it has reached local disk and + local TCP send buffer. + + Protocol B: write IO is reported as completed, if it has reached local disk and + remote buffer cache. + + Protocol C: write IO is reported as completed, if it has reached both local and + remote disk. 
+ - , - + + - If the TCP/IP connection linking a DRBD device pair is idle - for more than time seconds, DRBD - will generate a keep-alive packet to check if its partner is - still alive. The default value is 10 seconds, the unit is 1 second. - + In case it is not possible to connect to the remote DRBD device immediately, + DRBD keeps on trying to connect. With this option you can set the time between two + retries. The default value is 10. The unit is seconds. + - , - + + - If the partner node fails to send an expected response packet - within val - 10ths of a second, the partner node - is considered dead and therefore the TCP/IP connection is - abandoned. The default value is 60 (= 6 seconds). - + If the TCP/IP connection linking a DRBD device pair is idle for more than + time seconds, DRBD will generate a keep-alive packet to + check if its partner is still alive. The default value is 10. The unit is + seconds. + - , - + + - The socket send buffer is used to store packets sent to the - secondary node, which are not yet acknowledged (from a network - point of view) by the secondary node. When using protocol A, - it might be necessary to increase the size of this data - structure in order to increase asynchronicity between primary - and secondary nodes. But keep in mind that more asynchronicity - is synonymous with more data loss in the case of a primary - node failure. Since 8.0.13 resp. 8.2.7 setting the size - value to 0 means that the kernel should autotune this. - The default size is - 0, i.e. autotune. - + If the partner node fails to send an expected response packet within + val tenths of a second, the partner node is considered dead + and therefore the TCP/IP connection is abandoned. The default value is 60 (= 6 + seconds). + - , - + + - Packets received from the network are stored in the socket receive - buffer first. From there they are consumed by DRBD. Before 8.3.2 the - receive buffer's size was always set to the size of the socket - send buffer. 
Since 8.3.2 they can be tuned independently. - A value of 0 means that the kernel should autotune this. - The default size is - 0, i.e. autotune. - + The socket send buffer is used to store packets sent to the secondary node, + which are not yet acknowledged (from a network point of view) by the secondary node. + When using protocol A, it might be necessary to increase the size of this data + structure in order to increase asynchronicity between primary and secondary nodes. But + keep in mind that more asynchronicity is synonymous with more data loss in the case of + a primary node failure. Since 8.0.13 resp. 8.2.7 setting the + size value to 0 means that the kernel should autotune this. + The default size is 0, i.e. autotune. + - , - + + - In case the secondary node fails to complete a single write - request for count times the - timeout, it is expelled from the - cluster. (I.e. the primary node goes into StandAlone mode.) - The default is 0, which disables this feature. - + Packets received from the network are stored in the socket receive buffer first. + From there they are consumed by DRBD. Before 8.3.2 the receive buffer's size was + always set to the size of the socket send buffer. Since 8.3.2 they can be tuned + independently. A value of 0 means that the kernel should autotune this. The default + size is 0, i.e. autotune. + - , + + - With this option the maximal number of write requests between - two barriers is limited. Should be set to the same as - . Values smaller than 100 can - lead to degraded performance. The default value is 2048. - + In case the secondary node fails to complete a single write request for + count times the timeout, it is + expelled from the cluster, i.e. the primary node goes into StandAlone mode. The + default is 0, which disables this feature. + - , - + + - With this option the maximal number of buffer pages allocated - by DRBD's receiver thread is limited. Should be set to the - same as . 
Small values - could lead to degraded performance. (Minimum 32) The default value is - 2048. - + With this option the maximal number of write requests between two barriers is + limited. Should be set to the same as . Values smaller + than 10 can lead to degraded performance. The default value is 2048. + - , - + + - When the number of pending write requests on the standby - (secondary) node exceeds the unplug-watermark, we trigger - the request processing of our backing storage device. - Some storage controllers deliver better performance with small - values, others deliver best performance when the value is set to - the same value as max-buffers. Minimum 16, default 128, maximum - 131072. - + With this option the maximal number of buffer pages allocated by DRBD's receiver + thread is limited. Should be set to the same as . + Small values could lead to degraded performance. The default value is 2048, the + minimum 32. + - , - + + - With this option set you may assign primary role to both nodes. You - only should use this option if you use a shared storage - file system on top of DRBD. At the time of writing the only - ones are: OCFS2 and GFS. If you use this option with any - other file system, you are going to crash your nodes and to - corrupt your data! - + When the number of pending write requests on the standby (secondary) node + exceeds the unplug-watermark, we trigger the request processing of our backing storage + device. Some storage controllers deliver better performance with small values, others + deliver best performance when the value is set to the same value as max-buffers. + Minimum 16, default 128, maximum 131072. + - , - alg + + - You need to specify the HMAC algorithm to enable peer - authentication at all. You are strongly encouraged to use - peer authentication. - The HMAC algorithm will be used for the challenge - response authentication of the peer. You may specify any - digest algorithm that is named in /proc/crypto. 
- + With this option set you may assign primary role to both nodes. You only should + use this option if you use a shared storage file system on top of DRBD. At the time of + writing the only ones are: OCFS2 and GFS. If you use this option with any other file + system, you are going to crash your nodes and to corrupt your data! + - , - secret + + - The shared secret used in peer authentication. May be up to - 64 characters. - + You need to specify the HMAC algorithm to enable peer authentication at all. You + are strongly encouraged to use peer authentication. The HMAC algorithm will be used + for the challenge response authentication of the peer. You may specify any digest + algorithm that is named in /proc/crypto. + - , - asb-0p-policy + + - possible policies are: - + The shared secret used in peer authentication. May be up to 64 + characters. + + + + + + + + possible policies are: + - - - + + - No automatic resynchronization, simply disconnect. - + No automatic resynchronization, simply disconnect. + - - - + + - Auto sync from the node that was primary before the split-brain situation occurred. - + Auto sync from the node that was primary before the split-brain situation + occurred. + - - - + + - Auto sync from the node that became primary as second during - the split-brain situation. - + Auto sync from the node that became primary as second during the + split-brain situation. + - - - + + - In case one node did not write anything since the split - brain became evident, sync from the node that wrote something - to the node that did not write anything. In case none wrote - anything this policy uses a random decision to perform - a "resync" of 0 blocks. In case both have written something - this policy disconnects the nodes. - + In case one node did not write anything since the split brain became + evident, sync from the node that wrote something to the node that did not write + anything. 
In case none wrote anything this policy uses a random decision to + perform a "resync" of 0 blocks. In case both have written something this policy + disconnects the nodes. + - - - + + - Auto sync from the node that touched more blocks during the - split brain situation. - + Auto sync from the node that touched more blocks during the split brain + situation. + - - - + + - Auto sync to the named node. - + Auto sync to the named node. + - , - asb-1p-policy + + - possible policies are: - + possible policies are: + - - - + + - No automatic resynchronization, simply disconnect. - + No automatic resynchronization, simply disconnect. + - - - + + - Discard the version of the secondary if the outcome - of the algorithm would also - destroy the current secondary's data. Otherwise disconnect. - + Discard the version of the secondary if the outcome of the + algorithm would also destroy the current + secondary's data. Otherwise disconnect. + - - - + + - Discard the secondary's version. - + Discard the secondary's version. + - - - + + - Always honor the outcome of the algorithm. In case it decides the current - secondary has the right data, call the - on the current primary. - + Always honor the outcome of the algorithm. + In case it decides the current secondary has the correct data, call the + on the current primary. + - - - + + - Always honor the outcome of the algorithm. In case it decides the current - secondary has the right data, accept a possible instantaneous - change of the primary's data. - + Always honor the outcome of the algorithm. + In case it decides the current secondary has the correct data, accept a possible + instantaneous change of the primary's data. + - , - asb-2p-policy + + - possible policies are: - + possible policies are: + - - - + + - No automatic resynchronization, simply disconnect. - + No automatic resynchronization, simply disconnect. + - - - + + - Always honor the outcome of the algorithm. 
In case it decides the current - secondary has the right data, call the - on the current primary. - + Always honor the outcome of the algorithm. + In case it decides the current secondary has the right data, call the + on the current primary. + - - - + + - Always honor the outcome of the algorithm. In case it decides the current - secondary has the right data, accept a possible instantaneous - change of the primary's data. - + Always honor the outcome of the algorithm. + In case it decides the current secondary has the right data, accept a possible + instantaneous change of the primary's data. + - , - + + - Normally the automatic after-split-brain policies are only - used if current states of the UUIDs do not indicate the - presence of a third node. - - With this option you request that the automatic - after-split-brain policies are used as long as the data - sets of the nodes are somehow related. This might cause - a full sync, if the UUIDs indicate the presence of a third - node. (Or double faults have led to strange UUID sets.) - + Normally the automatic after-split-brain policies are only used if current + states of the UUIDs do not indicate the presence of a third node. + + With this option you request that the automatic after-split-brain policies are + used as long as the data sets of the nodes are somehow related. This might cause a + full sync, if the UUIDs indicate the presence of a third node. (Or double faults have + led to strange UUID sets.) + - , - role-resync-conflict-policy + + - This option sets DRBD's behavior when DRBD deduces from its - meta data that a resynchronization is needed, and the SyncTarget - node is already primary. The possible settings are: - , - and - . While - speaks for itself, with the - setting the handler is called - which is expected to either change the role of the node to - secondary, or remove the node from the cluster. - The default is . - With the setting you allow DRBD - to force a primary node into SyncTarget state. 
This means - that with that action the data exposed by DRBD change to - the SyncSource's version of the data instantaneously. - USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING. - + This option sets DRBD's behavior when DRBD deduces from its meta data that a + resynchronization is needed, and the SyncTarget node is already primary. The possible + settings are: , and + . While speaks for itself, with + the setting the handler is + called which is expected to either change the role of the node to secondary, or remove + the node from the cluster. The default is . + + With the setting you allow DRBD to force a primary + node into SyncTarget state. This means that the data exposed by DRBD changes to the + SyncSource's version of the data instantaneously. USE THIS OPTION ONLY IF YOU KNOW + WHAT YOU ARE DOING. + - , - hash_alg + + - DRBD can ensure the data integrity of the user's data on the network - by comparing hash values. Normally this is ensured by the 16 bit checksums - in the headers of TCP/IP packets. This option - can be set to any of the kernel's data digest algorithms. - In a typical kernel configuration you should have - at least one of , , and - available. By default this is not enabled. - + DRBD can ensure the data integrity of the user's data on the network by + comparing hash values. Normally this is ensured by the 16 bit checksums in the headers + of TCP/IP packets. This option can be set to any of the kernel's data digest + algorithms. In a typical kernel configuration you should have at least one of + , , and available. By + default this is not enabled. + See also the notes on data integrity on the drbd.conf manpage. + - , - + + - DRBD usually uses the TCP socket option TCP_CORK to hint to the network - stack when it can expect more data, and when it should flush out what it - has in its send queue. It turned out that there is at lease one network - stack that performs worse when one uses this hinting method. 
Therefore - we introducted this option, which disable the setting and clearing of - the TCP_CORK socket option by DRBD. - + DRBD usually uses the TCP socket option TCP_CORK to hint to the network stack + when it can expect more data, and when it should flush out what it has in its send + queue. There is at least one network stack that performs worse when one uses this + hinting method. Therefore we introduced this option, which disable the setting and + clearing of the TCP_CORK socket option by DRBD. + - , - ping_timeout + + - The time the peer has to answer to a keep-alive packet. In case the peer's reply is not received within this - time period, it is considered as dead. The default value is 500ms, - the default unit is 100ms. - + The time the peer has to answer to a keep-alive packet. In case the peer's reply + is not received within this time period, it is considered dead. The default unit is + tenths of a second, the default value is 5 (for half a second). + - , - + + - Use this option to manually recover from a split-brain - situation. In case you do not have any automatic after-split-brain policies selected, the nodes refuse to - connect. By passing this option you make a node to - sync target immediately after successful connect. - + Use this option to manually recover from a split-brain situation. In case you do + not have any automatic after-split-brain policies selected, the nodes refuse to + connect. By passing this option you make this node a sync target immediately after + successful connect. - - - - syncer - - drbdsetup - syncer - - Changes the synchronization daemon parameters of - device at runtime. - - + - , - + + - To ensure smooth operation of the application on top of DRBD, - it is possible to limit the bandwidth that may be used by - background synchronization. The default is 250 KB/sec, the - default unit is KB/sec. - + Causes DRBD to abort the connection process after the resync handshake, i.e. no + resync gets performed. 
You can find out which resync DRBD would perform by looking at + the kernel's log file. + - , - + + + + + + - Start resync on this device only if the device with - minor is already in connected - state. Otherwise this device waits in SyncPause state. - + By default DRBD blocks when the available TCP send queue becomes full. That + means it will slow down the application that generates the write requests that cause + DRBD to send more data down that TCP connection. + + When DRBD is deployed with DRBD-proxy it might be more desirable that DRBD goes + into AHEAD/BEHIND mode shortly before the send queue becomes full. In AHEAD/BEHIND + mode DRBD does no longer replicate data, but still keeps the connection open. + + The advantage of the AHEAD/BEHIND mode is that the application is not slowed + down, even if DRBD-proxy's buffer is not sufficient to buffer all write requests. The + downside is that the peer node falls behind, and that a resync will be necessary to + bring it back into sync. During that resync the peer node will have an inconsistent + disk. + + Available congestion_policys are + and . The default is + . Fill_threshold might be in the + range of 0 to 10GiBytes. The default is 0 which disables the check. + Active_extents_threshold has the same limits as + . + + The AHEAD/BEHIND mode and its settings are available since DRBD 8.3.10. + - , - - - DRBD automatically performs hot area detection. With this - parameter you control how big the hot area (=active set) can - get. Each extent marks 4M of the backing storage. In case a - primary node leaves the cluster unexpectedly, the areas covered - by the active set must be resynced upon rejoining of the failed - node. The data structure is stored in the meta-data area, - therefore each change of the active set is a write operation - to the meta-data device. A higher number of extents gives - longer resync times but less updates to the meta-data. The - default number of extents is - 127. 
(Minimum: 7, Maximum: 3843) - + + + + During online verification (as initiated by the verify sub-command), rather than doing a bit-wise + comparison, DRBD applies a hash function to the contents of every block being + verified, and compares that hash with the peer. This option defines the hash algorithm + being used for that purpose. It can be set to any of the kernel's data digest + algorithms. In a typical kernel configuration you should have at least one of + , , and available. By + default this is not enabled; you must set this option explicitly in order to be able + to use on-line device verification. + + See also the notes on data integrity on the drbd.conf manpage. + - , - + + - During online verification (as initiated by the - verify sub-command), - rather than doing a bit-wise comparison, DRBD applies a hash function - to the contents of every block being verified, and compares that - hash with the peer. This option defines the hash algorithm being - used for that purpose. It can be set to any of the kernel's data - digest algorithms. In a typical kernel configuration you should have - at least one of , , and - available. By default this is not enabled; you must set this - option explicitly in order to be able to use on-line device verification. - - See also the notes on data integrity on the drbd.conf manpage. + A resync process sends all marked data blocks form the source to the destination + node, as long as no is given. When one is specified the + resync process exchanges hash values of all marked blocks first, and sends only those + data blocks over, that have different hash values. + + This setting is useful for DRBD setups with low bandwidth links. During the + restart of a crashed primary node, all blocks covered by the activity log are marked + for resync. But a large part of those will actually be still in sync, therefore using + will lower the required bandwidth in exchange for CPU + cycles. 
+ - , - + + - Sets the cpu-affinity-mask for DRBD's kernel threads of this - device. The default value of cpu-mask is - 0, which means that DRBD's kernel threads should be spread over - all CPUs of the machine. This value must be given in hexadecimal - notation. If it is too big it will be truncated. - + During resync-handshake, the dirty-bitmaps of the nodes are exchanged and merged + (using bit-or), so the nodes will have the same understanding of which blocks are + dirty. On large devices, the fine grained dirty-bitmap can become large as well, and + the bitmap exchange can take quite some time on low-bandwidth links. + + Because the bitmap typically contains compact areas where all bits are unset + (clean) or set (dirty), a simple run-length encoding scheme can considerably reduce + the network traffic necessary for the bitmap exchange. + + For backward compatibility reasons, and because on fast links this possibly does + not improve transfer time but consumes cpu cycles, this defaults to off. + + Introduced in 8.3.2. + + + + + resource-options + + + drbdsetup + + resource-options + + + Changes the options of the resource at runtime. + + - , - + + - A resync process sends all marked data blocks form the source to - the destination node, as long as no is - given. When one is specified the resync process exchanges hash values of all - marked blocks first, and sends only those data blocks over, that have different - hash values. - This setting is useful for DRBD setups with low bandwidth links. - During the restart of a crashed primary node, all blocks covered by the - activity log are marked for resync. But a large part of those will actually - be still in sync, therefore using will lower - the required bandwidth in exchange for CPU cycles. - + Sets the cpu-affinity-mask for DRBD's kernel threads of this device. The default + value of cpu-mask is 0, which means that DRBD's kernel + threads should be spread over all CPUs of the machine. 
This value must be given in + hexadecimal notation. If it is too big it will be truncated. + - , - + + - During resync-handshake, the dirty-bitmaps of the nodes are - exchanged and merged (using bit-or), so the nodes will have the - same understanding of which blocks are dirty. On large devices, - the fine grained dirty-bitmap can become large as well, and the - bitmap exchange can take quite some time on low-bandwidth links. - - Because the bitmap typically contains compact areas where all - bits are unset (clean) or set (dirty), a simple run-length - encoding scheme can considerably reduce the network traffic - necessary for the bitmap exchange. - - For backward compatibilty reasons, and because on fast links this - possibly does not improve transfer time but consumes cpu cycles, - this defaults to off. - - Introduced in 8.3.2. - + This setting controls what happens to IO requests on a degraded, disk less node + (I.e. no data store is reachable). The available policies are + and . + + If ond-policy is set to + you can either resume IO by attaching/connecting the last lost data storage, or by the + drbdadm resume-io res + command. The latter will result in IO errors of course. + + The default is . This setting is available since DRBD + 8.3.9. + primary + drbdsetup + primary - Sets the device into primary role. This - means that applications (e.g. a file system) may open the - device for read and write access. Data - written to the device in primary role are - mirrored to the device in secondary role. - - Normally it is not possible to set both devices of a connected DRBD device - pair to primary role. By using the - option, you override this behavior and instruct DRBD to allow two - primaries. - + + Sets the device into primary role. This means that + applications (e.g. a file system) may open the device for read + and write access. Data written to the device in primary role are + mirrored to the device in secondary role. 
+ + Normally it is not possible to set both devices of a connected DRBD device pair to + primary role. By using the option, you override this + behavior and instruct DRBD to allow two primaries. + - , - + + - Becoming primary fails if the local replica is - inconsistent. By using this option you can force it into - primary role anyway. USE THIS OPTION ONLY IF YOU KNOW WHAT - YOU ARE DOING. - + Alias for --force. + + + + + + + + + + Becoming primary fails if the local replica is not up-to-date. I.e. when it is + inconsistent, outdated of consistent. By using this option you can force it into + primary role anyway. USE THIS OPTION ONLY IF YOU KNOW WHAT YOU ARE DOING. + secondary + drbdsetup + secondary - Brings the device into secondary role. - This operation fails as long as at least one application (or file - system) has opened the device. - - It is possible that both devices of a connected DRBD device pair are secondary. - + + Brings the device into secondary role. This operation fails + as long as at least one application (or file system) has opened the device. + + It is possible that both devices of a connected DRBD device pair are secondary. + verify + drbdsetup + verify - This initiates on-line device verification. During on-line verification, - the contents of every block on the local node are compared to those on - the peer node. Device verification progress can be monitored via - /proc/drbd. - Any blocks whose content differs from that of the corresponding block - on the peer node will be marked out-of-sync in DRBD's on-disk bitmap; they - are not brought back in sync automatically. To - do that, simply disconnect and reconnect the resource. - - If on-line verification is already in progress, this command - silently does nothing. - - This command will fail if the device is - not part of a connected device pair. - + + This initiates on-line device verification. 
During on-line verification, the contents + of every block on the local node are compared to those on the peer node. Device verification + progress can be monitored via /proc/drbd. Any blocks + whose content differs from that of the corresponding block on the peer node will be marked + out-of-sync in DRBD's on-disk bitmap; they are not brought back in sync + automatically. To do that, simply disconnect and reconnect the resource. + + If on-line verification is already in progress, this command silently does + nothing. + + This command will fail if the device is not part of a + connected device pair. + See also the notes on data integrity on the drbd.conf manpage. + - , - + + - Since version 8.3.2, on-line verification should resume from the - last position after connection loss. It may also be started from - an arbitrary position by setting this option. - - Default unit is sectors. You may also specify a unit explicitly. - The start-sector will be rounded down to a multiple of 8 sectors (4kB). - + Since version 8.3.2, on-line verification should resume from the last position + after connection loss. It may also be started from an arbitrary position by setting + this option. + + Default unit is sectors. You may also specify a unit explicitly. The + will be rounded down to a multiple of 8 sectors + (4kB). + invalidate + drbdsetup + invalidate - This forces the local device of a pair of connected DRBD devices - into SyncTarget state, which means that all data blocks of the - device are copied over from the peer. - - This command will fail if the device is - not part of a connected device pair. - + + This forces the local device of a pair of connected DRBD devices into SyncTarget + state, which means that all data blocks of the device are copied over from the peer. + + This command will fail if the device is not part of a + connected device pair. 
+ invalidate-remote + drbdsetup + invalidate-remote - This forces the local device of a pair of connected DRBD devices - into SyncSource state, which means that all data blocks of the - device are copied to the peer. - + + This forces the local device of a pair of connected DRBD devices into SyncSource + state, which means that all data blocks of the device are copied to the peer. + + On a disconnected device, this will set all bits in the out of sync bitmap. As a side + effect, this suspends updates to the on disk activity log. Updates to the on disk activity log + will be resumed automatically when necessary. + wait-connect + drbdsetup + wait-connect - Returns as soon as the device can - communicate with its partner device. - + + Returns as soon as the device can communicate with its + partner device. + - , - - , - - , - - , - - This command will fail if the - device cannot communicate with its - partner for timeout - seconds. If the peer was working before this node was - rebooted, the wfc_timeout is used. If the peer was already - down before this node was rebooted, the degr_wfc_timeout - is used. If the peer was sucessfully outdated before this - node was rebooted the outdated_wfc_timeout is used. - The default value for all those timeout values - is 0 which means to wait forever. - In case the connection status goes down to StandAlone because - the peer appeared but the devices had a split brain situation, - the default for the command is to terminate. You can change this - behavior with the option. - + + + + + + + + + + This command will fail if the device cannot + communicate with its partner for timeout seconds. If the + peer was working before this node was rebooted, the + wfc_timeout is used. If the peer was already down before + this node was rebooted, the degr_wfc_timeout is used. If + the peer was successfully outdated before this node was rebooted the + outdated_wfc_timeout is used. 
The default value for all + those timeout values is 0 which means to wait forever. The unit is seconds. In case + the connection status goes down to StandAlone because the peer appeared but the + devices had a split brain situation, the default for the command is to terminate. You + can change this behavior with the option. + wait-sync + drbdsetup + wait-sync - Returns as soon as the device leaves any - synchronization into connected state. The options - are the same as with the wait-connect - command. - + + Returns as soon as the device leaves any synchronization + into connected state. The options are the same as with the + wait-connect command. + disconnect + drbdsetup + disconnect - Removes the information set by the command - from the device. This means - that the device goes into unconnected - state and will no longer listen for incoming connections. - + + Removes the information set by the command from the + device. This means that the device + goes into unconnected state and will no longer listen for incoming connections. + detach + drbdsetup + detach - Removes the information set by the command - from the device. This means - that the device is detached from its - backing storage device. + Removes the information set by the command from the + device. This means that the device is + detached from its backing storage device. + + + , + + + A regular detach returns after the disk state finally reached + diskless. As a consequence detaching from a frozen backing block device + never terminates. + On the other hand A forced detach returns immediately. It allows + you to detach DRBD from a frozen backing block device. Please note that + the disk will be marked as failed until all pending IO requests where + finished by the backing block device. + + + + + down + drbdsetup + down - Removes all configuration information from the - device and forces it back to - unconfigured state. 
- + + Removes all configuration information from the device and + forces it back to unconfigured state. + role + drbdsetup + role - Shows the current roles of the device and - its peer. (local/peer). - + + Shows the current roles of the device and its peer, as + local/peer. + state + drbdsetup + state + Deprecated alias for "role" + cstate + drbdsetup + cstate - Shows the current connection state of the - device. - + + Shows the current connection state of the device. + dstate + drbdsetup + dstate - Shows the current states of the backing storage devices. (local/peer) - + + Shows the current states of the backing storage devices, as + local/peer. + - status + resize + drbdsetup - status + + resize - Shows the current status of the device in xml-like format. Example output: - <resource minor="0" name="s0" cs="SyncTarget" st1="Secondary" st2="Secondary" - ds1="Inconsistent" ds2="UpToDate" resynced_precent="5.9" /> - - + + This causes DRBD to reexamine the size of the device's + backing storage device. To actually do online growing you need to extend the backing + storages on both devices and call the command on one of your + nodes. + + The allows you to resize a device which is + currently not connected to the peer. Use with care, since if you do not resize the peer's + disk as well, further connect attempts of the two will fail. + + When the option is given DRBD will skip the resync of + the new storage. Only do this if you know that the new storage was initialized to the same + content by other means. + - resize + check-resize + drbdsetup - resize + + check-resize - This causes DRBD to reexamine the size of the - device's backing storage device. To - actually do online growing you need to extend the backing storages - on both devices and call the command one of - your nodes. - - The allows you to - resize a device which is currently not connected to the peer. 
- Use with care, since if you do not resize the peer's disk as well, - further connect attempts of the two will fail. - + + To enable DRBD to detect offline resizing of backing devices this command may be used + to record the current size of backing devices. The size is stored in files in /var/lib/drbd/ + named drbd-minor-??.lkbd + + This command is called by drbdadm resize + res after drbdsetup + device resize returned. + pause-sync + drbdsetup + pause-sync - Temporarily suspend an ongoing resynchronization by setting the local - pause flag. Resync only progresses if neither the local nor the - remote pause flag is set. It might be desirable to postpone DRBD's - resynchronization after eventual resynchronization of the backing - storage's RAID setup. - + + Temporarily suspend an ongoing resynchronization by setting the local pause flag. + Resync only progresses if neither the local nor the remote pause flag is set. It might be + desirable to postpone DRBD's resynchronization after eventual resynchronization of the + backing storage's RAID setup. + resume-sync + drbdsetup + resume-sync - Unset the local sync pause flag. - + + Unset the local sync pause flag. + outdate + drbdsetup + outdate - Mark the data on the local backing storage as outdated. An outdated - device refuses to become primary. This is used in conjunction with - and by the peer's fence-peer handler. - + + Mark the data on the local backing storage as outdated. An outdated device refuses to + become primary. This is used in conjunction with and by the peer's + handler. + show-gi + drbdsetup + show-gi - Displays the device's data generation identifiers verbosely. - + + Displays the device's data generation identifiers verbosely. + get-gi + drbdsetup + get-gi - Displays the device's data generation identifiers. - + + Displays the device's data generation identifiers. + show + drbdsetup + show - Shows all available configuration information of the - device. 
- + + Shows all available configuration information of the + device. + suspend-io + drbdsetup + suspend-io - This command is of no apparent use and just provided for the sake - of completeness. - + + This command is of no apparent use and just provided for the sake of + completeness. + resume-io + drbdsetup + resume-io - If the fence-peer handler fails to stonith the peer node, - and your policy is set to - resource-and-stonith, you can unfreeze IO operations with this - command. - + + If the fence-peer handler fails to stonith the peer node, and your + policy is set to resource-and-stonith, you can unfreeze IO + operations with this command. + events + drbdsetup + events - Displays every state change of DRBD and all calls to helper - programs. This might be used to get notified of DRBD's state - changes by piping the output to another program. - - , - Display the events of all DRBD minors. - , - This is a debugging aid that displays the content of - all received netlink messages. - - + + Displays every state change of DRBD and all calls to helper programs. This might be + used to get notified of DRBD's state changes by piping the output to another program. + + + + + + Display the events of all DRBD minors. + + + + + + + + This is a debugging aid that displays the content of all received netlink + messages. + + + + new-current-uuid + drbdsetup + new-current-uuid - Generates a new currend UUID and rotates all other UUID values. This - has at least two use cases, namely to skip the initial sync, and to - reduce network bandwidth when starting in a single node configuration - and then later (re-)integrating a remote site. - - Available option: - , - Clears the sync bitmap in addition to generating a new current UUID. - - - This can be used to skip the initial sync, if you want to start from scratch. - This use-case does only work on "Just Created" meta data. - Necessary steps: - On both nodes, initialize meta data and configure the device. 
- drbdadm -- --force create-md res They need to do the initial handshake, so they know their sizes. - drbdadm up res They are now Connected Secondary/Secondary Inconsistent/Inconsistent. - Generate a new current-uuid and clear the dirty bitmap. - drbdadm -- --clear-bitmap new-current-uuid res They are now Connected Secondary/Secondary UpToDate/UpToDate. - Make one side primary and create a file system. - drbdadm primary resmkfs -t fs-type $(drbdadm sh-dev res) - - One obvious side-effect is that the replica are full of old garbage - (unless you made them identical using other means), so any - online-verify is expected to find any number of out-of-sync blocks. - - You must not use this on pre-existing data! - Even though it may appear to work at first glance, once you switch to - the other node, your data is toast, as it never got replicated. - So do do not leave out the mkfs (or equivalent). - - This can also be used to shorten the initial resync of a cluster where the second node - is added after the first node is gone into production, by means of disk shipping. - This use-case works on disconnected devices only, the device may be in - primary or secondary role. - The necessary steps are: - drbdsetup device new-current-uuid --clear-bitmap Take the copy of the current active server. E.g. by pulling a disk out of - the RAID1 controller, or by copying with dd. You need to copy the actual - data, and the meta data. - drbdsetup device new-current-uuid Add the disk to the new secondary node, and join it to the cluster. You will - get a resync of that parts that where changed since the first call to - drbdsetup in step 1. - - + Generates a new current UUID and rotates all other UUID values. This has at least two + use cases, namely to skip the initial sync, and to reduce network bandwidth when starting in + a single node configuration and then later (re-)integrating a remote site. 
+ + Available option: + + + + + Clears the sync bitmap in addition to generating a new current UUID. + + + + + This can be used to skip the initial sync, if you want to start from scratch. This + use-case does only work on "Just Created" meta data. Necessary steps: + + On both nodes, initialize meta data and configure the + device. + + drbdadm create-md --force + res + + + + They need to do the initial handshake, so they know their sizes. + + drbdadm up + res + + + + They are now Connected Secondary/Secondary Inconsistent/Inconsistent. + Generate a new current-uuid and clear the dirty bitmap. + + drbdadm new-current-uuid --clear-bitmap + res + + + + They are now Connected Secondary/Secondary UpToDate/UpToDate. Make one side + primary and create a file system. + + drbdadm primary + res + + mkfs -t fs-type $(drbdadm + sh-dev res) + + + + One obvious side-effect is that the replica is full of old garbage (unless you made + them identical using other means), so any online-verify is expected to find any number of + out-of-sync blocks. + + You must not use this on pre-existing data! Even though it may + appear to work at first glance, once you switch to the other node, your data is toast, as it + never got replicated. So do not leave out the mkfs (or + equivalent). + + This can also be used to shorten the initial resync of a cluster where the second node + is added after the first node is gone into production, by means of disk shipping. This + use-case works on disconnected devices only, the device may be in primary or secondary + role. + + The necessary steps on the current active server are: + + drbdsetup new-current-uuid --clear-bitmap minor + + + + + Take the copy of the current active server. E.g. by pulling a disk out of the + RAID1 controller, or by copying with dd. You need to copy the actual data, and the + meta data. + + + + drbdsetup new-current-uuid minor + + + Now add the disk to the new secondary node, and join it to the cluster. 
You + will get a resync of that parts that were changed since the first call to drbdsetup in step 1. + Examples - For examples, please have a look at the - DRBD User's Guide. - + + For examples, please have a look at the + DRBD User's Guide. + + Version - This document was revised for version 8.3.2 of the DRBD distribution. - + + This document was revised for version 8.3.2 of the DRBD distribution. + Author - Written by Philipp Reisner philipp.reisner@linbit.com - and Lars Ellenberg lars.ellenberg@linbit.com - + + Written by Philipp Reisner philipp.reisner@linbit.com and Lars + Ellenberg lars.ellenberg@linbit.com + Reporting Bugs - Report bugs to drbd-user@lists.linbit.com. - + + Report bugs to drbd-user@lists.linbit.com. + Copyright - Copyright 2001-2008 LINBIT Information Technologies, -Philipp Reisner, Lars Ellenberg. This is free software; -see the source for copying conditions. There is NO warranty; -not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - + + Copyright 2001-2008 LINBIT Information Technologies, Philipp Reisner, Lars Ellenberg. + This is free software; see the source for copying conditions. There is NO warranty; not even + for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ See Also - drbd.conf5, - drbd8, - drbddisk8drbdadm8DRBD User's Guide, - DRBD web site + + + drbd.conf + + 5 + , + drbd + + 8 + , + drbddisk + + 8 + , + drbdadm + + 8 + , + DRBD User's Guide, + DRBD web site + diff -Nru drbd8-8.3.7/documentation/drbdsetup_attach.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_attach.xml --- drbd8-8.3.7/documentation/drbdsetup_attach.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_attach.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupattachminorlower_devmeta_data_devmeta_data_index--size val--max-bio-bvecs val--on-io-errorpass_oncall-local-io-errordetach--fencingdont-careresource-onlyresource-and-stonith--disk-barrier--disk-flushes--disk-drain--md-flushes--resync-rate val--resync-after val--al-extents val--c-plan-ahead val--c-delay-target val--c-fill-target val--c-max-rate val--c-min-rate val--disk-timeout val--read-balancingprefer-localprefer-remoteround-robinleast-pendingwhen-congested-remote32K-striping64K-striping128K-striping256K-striping512K-striping1M-striping diff -Nru drbd8-8.3.7/documentation/drbdsetup_check-resize.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_check-resize.xml --- drbd8-8.3.7/documentation/drbdsetup_check-resize.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_check-resize.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupcheck-resizeminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_connect.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_connect.xml --- drbd8-8.3.7/documentation/drbdsetup_connect.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_connect.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupconnectresourcelocal_addrremote_addr--tentative--discard-my-data--protocolABC--timeout val--max-epoch-size val--max-buffers val--unplug-watermark val--connect-int val--ping-int 
val--sndbuf-size val--rcvbuf-size val--ko-count val--allow-two-primaries--cram-hmac-alg val--shared-secret val--after-sb-0pridisconnectdiscard-younger-primarydiscard-older-primarydiscard-zero-changesdiscard-least-changesdiscard-localdiscard-remote--after-sb-1pridisconnectconsensusdiscard-secondarycall-pri-lost-after-sbviolently-as0p--after-sb-2pridisconnectcall-pri-lost-after-sbviolently-as0p--always-asbp--rr-conflictdisconnectcall-pri-lostviolently--ping-timeout val--data-integrity-alg val--tcp-cork--on-congestionblockpull-aheaddisconnect--congestion-fill val--congestion-extents val--csums-alg val--verify-alg val--use-rle diff -Nru drbd8-8.3.7/documentation/drbdsetup_cstate.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_cstate.xml --- drbd8-8.3.7/documentation/drbdsetup_cstate.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_cstate.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupcstateminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_del-minor.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_del-minor.xml --- drbd8-8.3.7/documentation/drbdsetup_del-minor.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_del-minor.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupdel-minorminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_del-resource.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_del-resource.xml --- drbd8-8.3.7/documentation/drbdsetup_del-resource.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_del-resource.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupdel-resourceresource diff -Nru drbd8-8.3.7/documentation/drbdsetup_detach.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_detach.xml --- drbd8-8.3.7/documentation/drbdsetup_detach.xml 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_detach.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupdetachminor--force diff -Nru drbd8-8.3.7/documentation/drbdsetup_disconnect.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_disconnect.xml --- drbd8-8.3.7/documentation/drbdsetup_disconnect.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_disconnect.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupdisconnectlocal_addrremote_addr--force diff -Nru drbd8-8.3.7/documentation/drbdsetup_disk-options.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_disk-options.xml --- drbd8-8.3.7/documentation/drbdsetup_disk-options.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_disk-options.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,3 @@ + +drbdsetupdisk-optionsminor + --on-io-errorpass_oncall-local-io-errordetach--fencingdont-careresource-onlyresource-and-stonith--disk-barrier--disk-flushes--disk-drain--md-flushes--resync-rate val--resync-after val--al-extents val--c-plan-ahead val--c-delay-target val--c-fill-target val--c-max-rate val--c-min-rate val--disk-timeout val--read-balancingprefer-localprefer-remoteround-robinleast-pendingwhen-congested-remote32K-striping64K-striping128K-striping256K-striping512K-striping1M-striping diff -Nru drbd8-8.3.7/documentation/drbdsetup_down.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_down.xml --- drbd8-8.3.7/documentation/drbdsetup_down.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_down.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupdownresource diff -Nru drbd8-8.3.7/documentation/drbdsetup_dstate.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_dstate.xml --- drbd8-8.3.7/documentation/drbdsetup_dstate.xml 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_dstate.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupdstateminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_events.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_events.xml --- drbd8-8.3.7/documentation/drbdsetup_events.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_events.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,6 @@ + +drbdsetupevents + resource + minor + all + diff -Nru drbd8-8.3.7/documentation/drbdsetup_get-gi.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_get-gi.xml --- drbd8-8.3.7/documentation/drbdsetup_get-gi.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_get-gi.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupget-giminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_invalidate-remote.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_invalidate-remote.xml --- drbd8-8.3.7/documentation/drbdsetup_invalidate-remote.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_invalidate-remote.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupinvalidate-remoteminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_invalidate.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_invalidate.xml --- drbd8-8.3.7/documentation/drbdsetup_invalidate.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_invalidate.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupinvalidateminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_net-options.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_net-options.xml --- drbd8-8.3.7/documentation/drbdsetup_net-options.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_net-options.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,3 @@ + 
+drbdsetupnet-optionslocal_addrremote_addr + --protocolABC--timeout val--max-epoch-size val--max-buffers val--unplug-watermark val--connect-int val--ping-int val--sndbuf-size val--rcvbuf-size val--ko-count val--allow-two-primaries--cram-hmac-alg val--shared-secret val--after-sb-0pridisconnectdiscard-younger-primarydiscard-older-primarydiscard-zero-changesdiscard-least-changesdiscard-localdiscard-remote--after-sb-1pridisconnectconsensusdiscard-secondarycall-pri-lost-after-sbviolently-as0p--after-sb-2pridisconnectcall-pri-lost-after-sbviolently-as0p--always-asbp--rr-conflictdisconnectcall-pri-lostviolently--ping-timeout val--data-integrity-alg val--tcp-cork--on-congestionblockpull-aheaddisconnect--congestion-fill val--congestion-extents val--csums-alg val--verify-alg val--use-rle diff -Nru drbd8-8.3.7/documentation/drbdsetup_new-current-uuid.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_new-current-uuid.xml --- drbd8-8.3.7/documentation/drbdsetup_new-current-uuid.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_new-current-uuid.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupnew-current-uuidminor--clear-bitmap diff -Nru drbd8-8.3.7/documentation/drbdsetup_new-minor.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_new-minor.xml --- drbd8-8.3.7/documentation/drbdsetup_new-minor.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_new-minor.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupnew-minorresourceminorvolume diff -Nru drbd8-8.3.7/documentation/drbdsetup_new-resource.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_new-resource.xml --- drbd8-8.3.7/documentation/drbdsetup_new-resource.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_new-resource.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupnew-resourceresource--cpu-mask 
val--on-no-data-accessibleio-errorsuspend-io diff -Nru drbd8-8.3.7/documentation/drbdsetup_outdate.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_outdate.xml --- drbd8-8.3.7/documentation/drbdsetup_outdate.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_outdate.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupoutdateminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_pause-sync.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_pause-sync.xml --- drbd8-8.3.7/documentation/drbdsetup_pause-sync.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_pause-sync.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetuppause-syncminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_primary.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_primary.xml --- drbd8-8.3.7/documentation/drbdsetup_primary.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_primary.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupprimaryminor--force diff -Nru drbd8-8.3.7/documentation/drbdsetup_resize.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resize.xml --- drbd8-8.3.7/documentation/drbdsetup_resize.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resize.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupresizeminor--size val--assume-peer-has-space--assume-clean diff -Nru drbd8-8.3.7/documentation/drbdsetup_resource-options.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resource-options.xml --- drbd8-8.3.7/documentation/drbdsetup_resource-options.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resource-options.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,3 @@ + +drbdsetupresource-optionsresource + --cpu-mask val--on-no-data-accessibleio-errorsuspend-io diff -Nru 
drbd8-8.3.7/documentation/drbdsetup_resume-io.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resume-io.xml --- drbd8-8.3.7/documentation/drbdsetup_resume-io.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resume-io.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupresume-iominor diff -Nru drbd8-8.3.7/documentation/drbdsetup_resume-sync.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resume-sync.xml --- drbd8-8.3.7/documentation/drbdsetup_resume-sync.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_resume-sync.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupresume-syncminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_role.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_role.xml --- drbd8-8.3.7/documentation/drbdsetup_role.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_role.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetuproleminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_secondary.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_secondary.xml --- drbd8-8.3.7/documentation/drbdsetup_secondary.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_secondary.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupsecondaryminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_show-gi.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_show-gi.xml --- drbd8-8.3.7/documentation/drbdsetup_show-gi.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_show-gi.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupshow-giminor diff -Nru drbd8-8.3.7/documentation/drbdsetup_show.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_show.xml --- drbd8-8.3.7/documentation/drbdsetup_show.xml 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_show.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,7 @@ + +drbdsetupshow + resource + minor + all + + diff -Nru drbd8-8.3.7/documentation/drbdsetup_suspend-io.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_suspend-io.xml --- drbd8-8.3.7/documentation/drbdsetup_suspend-io.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_suspend-io.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupsuspend-iominor diff -Nru drbd8-8.3.7/documentation/drbdsetup_verify.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_verify.xml --- drbd8-8.3.7/documentation/drbdsetup_verify.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_verify.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,2 @@ + +drbdsetupverifyminor--start val diff -Nru drbd8-8.3.7/documentation/drbdsetup_wait-connect.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_wait-connect.xml --- drbd8-8.3.7/documentation/drbdsetup_wait-connect.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_wait-connect.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,3 @@ + +drbdsetupwait-connectminor--wfc-timeout val--degr-wfc-timeout val--outdated-wfc-timeout val + diff -Nru drbd8-8.3.7/documentation/drbdsetup_wait-sync.xml drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_wait-sync.xml --- drbd8-8.3.7/documentation/drbdsetup_wait-sync.xml 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/documentation/drbdsetup_wait-sync.xml 2012-09-03 22:37:26.000000000 +0000 @@ -0,0 +1,3 @@ + +drbdsetupwait-syncminor--wfc-timeout val--degr-wfc-timeout val--outdated-wfc-timeout val + diff -Nru drbd8-8.3.7/documentation/xml-usage-to-docbook.xsl drbd8-8.4.1+git55a81dc~cmd1/documentation/xml-usage-to-docbook.xsl --- drbd8-8.3.7/documentation/xml-usage-to-docbook.xsl 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/documentation/xml-usage-to-docbook.xsl 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,58 @@ + + + + + + drbdsetup + + + + + + + + + + + + + + + + + + + + + + + + + + + -- + + + val + + + + + -- + + + + + + + + -- + + + + + + + + + diff -Nru drbd8-8.3.7/drbd/Kbuild drbd8-8.4.1+git55a81dc~cmd1/drbd/Kbuild --- drbd8-8.3.7/drbd/Kbuild 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/Kbuild 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,94 @@ +obj-m := drbd.o + +clean-files := compat.h .config.timestamp + +LINUXINCLUDE := -I$(src) $(LINUXINCLUDE) + +# Files in the standard include directories take precendence over files +# in the compat directory. +# +# Add -I$(src) to EXTRA_CFLAGS again: some (rhel5, maybe other) kbuild does not +# yet use LINUXINCLUDE like we expect it to ;( fortunately it does not contain +# in-tree drbd either yet, so precedence of include files is not important. +# +# override: we absolutely need this, even if EXTRA_CFLAGS originates from make +# command line or environment +override EXTRA_CFLAGS += -I$(src) -I$(src)/compat + +# The augmented rbtree helper functions are not exported at least until kernel +# version 2.6.38-rc2. 
+ifeq ($(shell grep -e '\' \ + -e '\' \ + -e '\' \ + $(objtree)/Module.symvers | wc -l),3) +EXTRA_CFLAGS += -DAUGMENTED_RBTREE_SYMBOLS_EXPORTED +endif + +ifeq ($(shell grep -e '\' \ + $(objtree)/Module.symvers | wc -l),1) +EXTRA_CFLAGS += -DIDR_GET_NEXT_EXPORTED +else +compat_objs += compat/idr.o +endif + +drbd-y := drbd_buildtag.o drbd_bitmap.o drbd_proc.o +drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o +drbd-y += lru_cache.o drbd_main.o drbd_strings.o drbd_nl.o +drbd-y += drbd_interval.o drbd_state.o $(compat_objs) +drbd-y += drbd_nla.o + +ifndef CONFIG_CONNECTOR + drbd-y += connector.o cn_queue.o +endif + +$(patsubst %,$(obj)/%,$(drbd-y)): $(obj)/compat.h + +obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o + +# ====================================================================== + +# remember KERNELRELEASE for install target +# .kernelversion can be included in Makefile as well as +# sourced from shell +$(shell echo -e "VERSION=$(VERSION)\n" \ + "PATCHLEVEL=$(PATCHLEVEL)\n" \ + "SUBLEVEL=$(SUBLEVEL)\n" \ + "EXTRAVERSION=$(EXTRAVERSION)\n" \ + "LOCALVERSION=$(LOCALVERSION)\n" \ + "KERNELRELEASE=$(KERNELRELEASE)\n" \ + "KERNELVERSION=$(KERNELVERSION)" \ + > $(src)/.drbd_kernelrelease.new \ +) + +# Are we in stage 2 of the build (modpost)? 
+ +KBUILD_STAGE ?= $(if $(filter $(srctree)/scripts/Makefile.modpost,$(MAKEFILE_LIST)),modpost) + +ifneq ($(shell date -r $(objtree)/.config),$(shell date -r $(obj)/.config.timestamp 2> /dev/null)) +COMPAT_FORCE := FORCE +endif + +ifneq ($(KBUILD_STAGE),modpost) +$(obj)/compat.h: $(wildcard $(src)/compat/tests/*.c) $(COMPAT_FORCE) + $(call filechk,compat.h) + $(Q)touch $@ + $(Q)touch -r $(objtree)/.config $(obj)/.config.timestamp +endif + +filechk_compat.h = \ + for cfg in $(sort $(filter-out FORCE,$^)); do \ + var=`echo $$cfg | \ + sed -e "s,.*/,COMPAT_," -e "s,\.c,," | \ + tr -- -a-z _A-Z | \ + tr -dc A-Z0-9_`; \ + if $(CC) $(c_flags) $(COMPAT_CFLAGS) -c -o $(obj)/dummy.o $$cfg \ + > /dev/null $(if $(quiet),2>&1); then \ + echo "\#define $$var"; \ + rm -f $(obj)/dummy.{o,gcda,gcno}; \ + else \ + echo "/* \#undef $$var */"; \ + fi; \ + done + + + diff -Nru drbd8-8.3.7/drbd/Kconfig drbd8-8.4.1+git55a81dc~cmd1/drbd/Kconfig --- drbd8-8.3.7/drbd/Kconfig 2010-01-07 09:09:33.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/Kconfig 2012-02-02 14:09:14.000000000 +0000 @@ -36,13 +36,3 @@ See also: http://www.drbd.org/, http://www.linux-ha.org If unsure, say N. - -config DRBD_TRACE - tristate "DRBD tracing" - depends on BLK_DEV_DRBD - select TRACEPOINTS - help - - Say Y here if you want to be able to trace various events in DRBD. - - If unsure, say N. diff -Nru drbd8-8.3.7/drbd/Makefile drbd8-8.4.1+git55a81dc~cmd1/drbd/Makefile --- drbd8-8.3.7/drbd/Makefile 2010-01-07 09:09:33.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/Makefile 2012-02-02 14:09:14.000000000 +0000 @@ -36,31 +36,19 @@ # Lets test on PATCHLEVEL, that won't change too soon... 
ifneq ($(PATCHLEVEL),) + ifneq ($(VERSION),3) ifneq ($(VERSION),2) $(error "won't compile with this kernel version") endif ifneq ($(PATCHLEVEL),6) $(error "won't compile with this kernel version") endif + endif CONFIG_BLK_DEV_DRBD := m - CONFIG_DRBD_TRACE := $(shell test $${SUBLEVEL} -ge 30 && echo m || echo n) - include $(DRBDSRC)/Makefile-2.6 + include $(src)/Kbuild - override EXTRA_CFLAGS += -I$(DRBDSRC) - # remember KERNELRELEASE for install target - # .kernelversion can be included in Makefile as well as - # sourced from shell - $(shell echo -e "VERSION=$(VERSION)\n" \ - "PATCHLEVEL=$(PATCHLEVEL)\n" \ - "SUBLEVEL=$(SUBLEVEL)\n" \ - "EXTRAVERSION=$(EXTRAVERSION)\n" \ - "LOCALVERSION=$(LOCALVERSION)\n" \ - "KERNELRELEASE=$(KERNELRELEASE)\n" \ - "KERNELVERSION=$(KERNELVERSION)" \ - > $(DRBDSRC)/.drbd_kernelrelease.new \ - ) else # called from command line in current directory @@ -69,7 +57,6 @@ SHELL=/bin/bash DRBDSRC := $(shell pwd) - export DRBDSRC # to be overridden on command line: PREFIX := / @@ -90,11 +77,6 @@ endif endif - KDIR_Makefile_PATCHLEVEL = $(shell test -e $(KDIR)/Makefile && grep "^PATCHLEVEL = " $(KDIR)/Makefile | cut -d " " -f 3) - ifneq ($(findstring $(KDIR_Makefile_PATCHLEVEL),12345),) - $(error "won't compile with this kernel version") - endif - .PHONY: drbd.o default all greeting clean kbuild install dep tags drbd.o: greeting kbuild @@ -114,30 +96,31 @@ .PHONY: drbd_buildtag.c drbd_buildtag.c: - @is_tarball=`test -e ../.git/. && echo false || echo true`;\ - set -e; exec > $@.new; \ + @set -e; exec > $@.new; \ echo -e "/* automatically generated. DO NOT EDIT. */"; \ echo -e "#include "; \ echo -e "const char *drbd_buildtag(void)\n{"; \ - if $$is_tarball; then \ - if ! test -e $@ ; then \ - echo >&2 "your DRBD source tree is broken. unpack again."; \ + if test -e ../.git && GITHEAD=$$(git rev-parse HEAD); then \ + GITDIFF=$$(cd .. 
&& git diff --name-only HEAD | \ + tr -s '\t\n' ' ' | \ + sed -e 's/^/ /;s/ *$$//'); \ + echo -e "\treturn \"GIT-hash: $$GITHEAD$$GITDIFF\""; \ + elif ! test -e $@ ; then \ + echo >&2 "$@ not found."; \ + test -e ../.git && \ + >&2 printf "%s\n" \ + "git did not work, but this looks like a git checkout?" \ + "Install git and try again." || \ + echo >&2 "Your DRBD source tree is broken. Unpack again."; \ exit 1; \ - fi; \ - grep return $@ ; \ else \ - GITHEAD=$$(git rev-parse HEAD); \ - GITDIFF=$$(cd .. && git diff --name-only HEAD | tr -s '\t\n' ' ' | \ - sed -e 's/^/ /;s/ *$$//'); \ - echo -e "\treturn \"GIT-hash: $$GITHEAD$$GITDIFF\""; \ + grep return $@ ; \ fi ; \ echo -e "\t\t\" build by $$USER@$$HOSTNAME, `date "+%F %T"`\";\n}"; \ mv --force $@.new $@ kbuild: drbd_buildtag.c @rm -f .drbd_kernelrelease* - -test -f ../scripts/adjust_drbd_config_h.sh && \ - KDIR=$(KDIR) O=$(O) $(SHELL) ../scripts/adjust_drbd_config_h.sh # previous to 2.6.6 (suse: 2.6.5-dunno), this should be: $(MAKE) -C $(KDIR) $(if $(O),O=$(O),) SUBDIRS=$(DRBDSRC) $(ARCH_UM) modules # $(MAKE) -C $(KDIR) M=$(DRBDSRC) $(ARCH_UM) modules @@ -159,8 +142,9 @@ @echo "done." 
clean: - rm -rf .tmp_versions + rm -rf .tmp_versions Module.markers Module.symvers modules.order rm -f *.[oas] *.ko .*.cmd .*.d .*.tmp *.mod.c .*.flags .depend .kernel* + rm -f compat/*.[oas] compat/.*.cmd distclean: clean @if git show HEAD:drbd/linux/drbd_config.h > linux/drbd_config.h.tmp \ @@ -179,7 +163,7 @@ # for VERSION, PATCHLEVEL, SUBLEVEL, EXTRAVERSION, KERNELRELEASE include .drbd_kernelrelease MODOBJ := drbd.ko - MODSUBDIR := kernel/drivers/block + MODSUBDIR := updates LINUX := $(wildcard /lib/modules/$(KERNELRELEASE)/build) install: diff -Nru drbd8-8.3.7/drbd/Makefile-2.6 drbd8-8.4.1+git55a81dc~cmd1/drbd/Makefile-2.6 --- drbd8-8.3.7/drbd/Makefile-2.6 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/Makefile-2.6 1970-01-01 00:00:00.000000000 +0000 @@ -1,12 +0,0 @@ -drbd-y := drbd_buildtag.o drbd_bitmap.o drbd_proc.o -drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o -drbd-y += lru_cache.o drbd_main.o drbd_strings.o drbd_nl.o - -ifndef CONFIG_CONNECTOR - drbd-y += connector.o cn_queue.o -endif - -drbd_trace-y := drbd_tracing.o - -obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o -obj-$(CONFIG_DRBD_TRACE) += drbd_trace.o diff -Nru drbd8-8.3.7/drbd/cn_queue.c drbd8-8.4.1+git55a81dc~cmd1/drbd/cn_queue.c --- drbd8-8.3.7/drbd/cn_queue.c 2009-07-27 08:47:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/cn_queue.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,212 +0,0 @@ -/* - * cn_queue.c - * - * 2004-2005 Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * Modified by Philipp Reiser to work on older 2.6.x kernels. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* In case kzalloc() is missing. */ - -#ifdef NEED_BACKPORT_OF_KZALLOC -static inline void *kzalloc(size_t size, int flags) -{ - void *rv = kmalloc(size, flags); - if (rv) - memset(rv, 0, size); - - return rv; -} -#endif - - -#ifndef KERNEL_HAS_MSLEEP -/** - * msleep - sleep safely even with waitqueue interruptions - * @msecs: Time in milliseconds to sleep for - */ -static inline void msleep(unsigned int msecs) -{ - unsigned long timeout = (msecs * HZ + 999) / 1000; - - while (timeout) { - set_current_state(TASK_UNINTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } -} - -#endif - -void cn_queue_wrapper(void *data) -{ - struct cn_callback_data *d = data; - - d->callback(d->callback_priv); - - d->destruct_data(d->ddata); - d->ddata = NULL; - - kfree(d->free); -} - -static struct cn_callback_entry *cn_queue_alloc_callback_entry(char *name, struct cb_id *id, void (*callback)(void *)) -{ - struct cn_callback_entry *cbq; - - cbq = kzalloc(sizeof(*cbq), GFP_KERNEL); - if (!cbq) { - printk(KERN_ERR "Failed to create new callback queue.\n"); - return NULL; - } - - snprintf(cbq->id.name, sizeof(cbq->id.name), "%s", name); - memcpy(&cbq->id.id, id, sizeof(struct cb_id)); - cbq->data.callback = callback; - - INIT_WORK(&cbq->work, &cn_queue_wrapper, &cbq->data); - return cbq; -} - -static void cn_queue_free_callback(struct cn_callback_entry *cbq) -{ - cancel_delayed_work(&cbq->work); - 
flush_workqueue(cbq->pdev->cn_queue); - - kfree(cbq); -} - -int cn_cb_equal(struct cb_id *i1, struct cb_id *i2) -{ - return ((i1->idx == i2->idx) && (i1->val == i2->val)); -} - -int cn_queue_add_callback(struct cn_queue_dev *dev, char *name, struct cb_id *id, void (*callback)(void *)) -{ - struct cn_callback_entry *cbq, *__cbq; - int found = 0; - - cbq = cn_queue_alloc_callback_entry(name, id, callback); - if (!cbq) - return -ENOMEM; - - atomic_inc(&dev->refcnt); - cbq->pdev = dev; - - spin_lock_bh(&dev->queue_lock); - list_for_each_entry(__cbq, &dev->queue_list, callback_entry) { - if (cn_cb_equal(&__cbq->id.id, id)) { - found = 1; - break; - } - } - if (!found) - list_add_tail(&cbq->callback_entry, &dev->queue_list); - spin_unlock_bh(&dev->queue_lock); - - if (found) { - atomic_dec(&dev->refcnt); - cn_queue_free_callback(cbq); - return -EINVAL; - } - - cbq->nls = dev->nls; - cbq->seq = 0; - cbq->group = cbq->id.id.idx; - - return 0; -} - -void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id) -{ - struct cn_callback_entry *cbq, *n; - int found = 0; - - spin_lock_bh(&dev->queue_lock); - list_for_each_entry_safe(cbq, n, &dev->queue_list, callback_entry) { - if (cn_cb_equal(&cbq->id.id, id)) { - list_del(&cbq->callback_entry); - found = 1; - break; - } - } - spin_unlock_bh(&dev->queue_lock); - - if (found) { - cn_queue_free_callback(cbq); - atomic_dec_and_test(&dev->refcnt); - } -} - -struct cn_queue_dev *cn_queue_alloc_dev(char *name, struct sock *nls) -{ - struct cn_queue_dev *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return NULL; - - snprintf(dev->name, sizeof(dev->name), "%s", name); - atomic_set(&dev->refcnt, 0); - INIT_LIST_HEAD(&dev->queue_list); - spin_lock_init(&dev->queue_lock); - - dev->nls = nls; - dev->netlink_groups = 0; - - dev->cn_queue = create_workqueue(dev->name); - if (!dev->cn_queue) { - kfree(dev); - return NULL; - } - - return dev; -} - -void cn_queue_free_dev(struct cn_queue_dev *dev) -{ - struct 
cn_callback_entry *cbq, *n; - - flush_workqueue(dev->cn_queue); - destroy_workqueue(dev->cn_queue); - - spin_lock_bh(&dev->queue_lock); - list_for_each_entry_safe(cbq, n, &dev->queue_list, callback_entry) - list_del(&cbq->callback_entry); - spin_unlock_bh(&dev->queue_lock); - - while (atomic_read(&dev->refcnt)) { - printk(KERN_INFO "Waiting for %s to become free: refcnt=%d.\n", - dev->name, atomic_read(&dev->refcnt)); - msleep(1000); - } - - kfree(dev); - dev = NULL; -} diff -Nru drbd8-8.3.7/drbd/compat/asm-generic/bitops/le.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/asm-generic/bitops/le.h --- drbd8-8.3.7/drbd/compat/asm-generic/bitops/le.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/asm-generic/bitops/le.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,80 @@ +#ifndef _ASM_GENERIC_BITOPS_LE_H_ +#define _ASM_GENERIC_BITOPS_LE_H_ + +#include +#include + +#if defined(__LITTLE_ENDIAN) + +#define BITOP_LE_SWIZZLE 0 + +static inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_zero_bit(addr, size, offset); +} + +static inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) +{ + return find_first_zero_bit(addr, size); +} + +#elif defined(__BIG_ENDIAN) + +#define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) + +extern unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset); +extern unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset); + +#define find_first_zero_bit_le(addr, size) \ + find_next_zero_bit_le((addr), (size), 0) + +#else +#error "Please fix " +#endif + +static inline int test_bit_le(int nr, const void *addr) +{ + return test_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} + +static inline void 
__set_bit_le(int nr, void *addr) +{ + __set_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} + +static inline void __clear_bit_le(int nr, void *addr) +{ + __clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} + +static inline int test_and_set_bit_le(int nr, void *addr) +{ + return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} + +static inline int test_and_clear_bit_le(int nr, void *addr) +{ + return test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} + +static inline int __test_and_set_bit_le(int nr, void *addr) +{ + return __test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} + +static inline int __test_and_clear_bit_le(int nr, void *addr) +{ + return __test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} + +#endif /* _ASM_GENERIC_BITOPS_LE_H_ */ diff -Nru drbd8-8.3.7/drbd/compat/bitops.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/bitops.h --- drbd8-8.3.7/drbd/compat/bitops.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/bitops.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,81 @@ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) +/* did not yet include generic_find_next_le_bit() {{{ */ + +#if defined(__LITTLE_ENDIAN) + +#define generic_find_next_le_bit(addr, size, offset) \ + find_next_bit(addr, size, offset) + +#elif defined(__BIG_ENDIAN) +/* from 2.6.33 lib/find_bit.c */ + +/* include/linux/byteorder does not support "unsigned long" type */ +static inline unsigned long ext2_swabp(const unsigned long * x) +{ +#if BITS_PER_LONG == 64 + return (unsigned long) __swab64p((u64 *) x); +#elif BITS_PER_LONG == 32 + return (unsigned long) __swab32p((u32 *) x); +#else +#error BITS_PER_LONG not defined +#endif +} + +/* include/linux/byteorder doesn't support "unsigned long" type */ +static inline unsigned long ext2_swab(const unsigned long y) +{ +#if BITS_PER_LONG == 64 + return (unsigned long) __swab64((u64) y); +#elif BITS_PER_LONG == 32 + return (unsigned long) __swab32((u32) y); +#else +#error BITS_PER_LONG not defined +#endif +} + +unsigned long 
generic_find_next_le_bit(const unsigned long *addr, unsigned + long size, unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= (BITS_PER_LONG - 1UL); + if (offset) { + tmp = ext2_swabp(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (tmp) + goto found_middle_swap; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = ext2_swabp(p); +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); + +found_middle_swap: + return result + __ffs(ext2_swab(tmp)); +} +#else +#error "unknown byte order" +#endif +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) */ diff -Nru drbd8-8.3.7/drbd/compat/idr.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/idr.c --- drbd8-8.3.7/drbd/compat/idr.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/idr.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include + +/* The idr_get_next() function exists since 2009-04-02 Linux-2.6.29 (commit 38460b48) + but is exported for use in modules since 2010-01-29 Linux-2.6.35 (commit 4d1ee80f) */ +#ifndef IDR_GET_NEXT_EXPORTED +#ifndef rcu_dereference_raw +/* see c26d34a rcu: Add lockdep-enabled variants of rcu_dereference() */ +#define rcu_dereference_raw(p) rcu_dereference(p) +#endif +void *idr_get_next(struct idr *idp, int *nextidp) +{ + struct idr_layer *p, *pa[MAX_LEVEL]; + struct idr_layer **paa = &pa[0]; + int id = *nextidp; + int n, max; + + /* find first ent */ + n = idp->layers * IDR_BITS; + max = 1 << n; + p = 
rcu_dereference_raw(idp->top); + if (!p) + return NULL; + + while (id < max) { + while (n > 0 && p) { + n -= IDR_BITS; + *paa++ = p; + p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); + } + + if (p) { + *nextidp = id; + return p; + } + + id += 1 << n; + while (n < fls(id)) { + n += IDR_BITS; + p = *--paa; + } + } + return NULL; +} +#endif diff -Nru drbd8-8.3.7/drbd/compat/linux/autoconf.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/autoconf.h --- drbd8-8.3.7/drbd/compat/linux/autoconf.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/autoconf.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1 @@ +/* empty file, for compat reasons */ diff -Nru drbd8-8.3.7/drbd/compat/linux/dynamic_debug.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/dynamic_debug.h --- drbd8-8.3.7/drbd/compat/linux/dynamic_debug.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/dynamic_debug.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,8 @@ +#ifndef _DYNAMIC_DEBUG_H +#define _DYNAMIC_DEBUG_H + +#ifndef dynamic_dev_dbg +#define dynamic_dev_dbg(dev, fmt, ...) +#endif + +#endif diff -Nru drbd8-8.3.7/drbd/compat/linux/hardirq.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/hardirq.h --- drbd8-8.3.7/drbd/compat/linux/hardirq.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/hardirq.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1 @@ +/* Just an empty file. */ diff -Nru drbd8-8.3.7/drbd/compat/linux/memcontrol.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/memcontrol.h --- drbd8-8.3.7/drbd/compat/linux/memcontrol.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/memcontrol.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,3 @@ +/* just an empty file + * memcontrol.h did not exist prior to 2.6.25. + * but it needs more recent kernels for mm_inline.h to work. 
*/ diff -Nru drbd8-8.3.7/drbd/compat/linux/mutex.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/mutex.h --- drbd8-8.3.7/drbd/compat/linux/mutex.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/mutex.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,41 @@ +/* "Backport" of the mutex to older Linux-2.6.x kernels. + */ +#ifndef __LINUX_MUTEX_H +#define __LINUX_MUTEX_H + +#include + +struct mutex { + struct semaphore sem; +}; + +static inline void mutex_init(struct mutex *m) +{ + sema_init(&m->sem, 1); +} + +static inline void mutex_lock(struct mutex *m) +{ + down(&m->sem); +} + +static inline int mutex_lock_interruptible(struct mutex *m) +{ + return down_interruptible(&m->sem); +} + +static inline void mutex_unlock(struct mutex *m) +{ + up(&m->sem); +} + +static inline int mutex_is_locked(struct mutex *lock) +{ + return atomic_read(&lock->sem.count) != 1; +} + +static inline int mutex_trylock(struct mutex *lock) +{ + return !down_trylock(&lock->sem); +} +#endif diff -Nru drbd8-8.3.7/drbd/compat/linux/tracepoint.h drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/tracepoint.h --- drbd8-8.3.7/drbd/compat/linux/tracepoint.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/linux/tracepoint.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1 @@ +struct tracepoint; diff -Nru drbd8-8.3.7/drbd/compat/tests/bio_split_has_bio_split_pool_parameter.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/bio_split_has_bio_split_pool_parameter.c --- drbd8-8.3.7/drbd/compat/tests/bio_split_has_bio_split_pool_parameter.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/bio_split_has_bio_split_pool_parameter.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,12 @@ +#include + +/* + * bio_split() had a memory pool parameter until commit 6feef53 (2.6.28-rc1). 
+ */ +void test(void) +{ + struct bio *bio = NULL; + struct bio_pair *bio_pair; + + bio_pair = bio_split(bio, bio_split_pool, 0); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/bioset_create_has_three_parameters.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/bioset_create_has_three_parameters.c --- drbd8-8.3.7/drbd/compat/tests/bioset_create_has_three_parameters.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/bioset_create_has_three_parameters.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,10 @@ +#include + +/* + * Note that up until 2.6.21 inclusive, it was + * struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale) + */ +void dummy(void) +{ + bioset_create(16, 16, 4); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_IS_ERR_OR_NULL.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_IS_ERR_OR_NULL.c --- drbd8-8.3.7/drbd/compat/tests/have_IS_ERR_OR_NULL.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_IS_ERR_OR_NULL.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,8 @@ +#include + +int foo() +{ + void *x = 0; + + return IS_ERR_OR_NULL(x); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_bioset_create_front_pad.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_bioset_create_front_pad.c --- drbd8-8.3.7/drbd/compat/tests/have_bioset_create_front_pad.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_bioset_create_front_pad.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,30 @@ +#include + +/* + * upstream commit (included in 2.6.29) + * commit bb799ca0202a360fa74d5f17039b9100caebdde7 + * Author: Jens Axboe + * Date: Wed Dec 10 15:35:05 2008 +0100 + * + * bio: allow individual slabs in the bio_set + * + * does + * -struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) + * +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) + * + * Note that up until 2.6.21 inclusive, it was + * 
struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale) + * so if we want to support old kernels (RHEL5), we will need an additional compat check. + * + * This also means that we must not use the front_pad trick as long as we want + * to keep compatibility with < 2.6.29. + */ +extern struct bio_set *compat_check_bioset_create(unsigned int, unsigned int); + +#ifndef __same_type +# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#endif +void dummy(void) +{ + BUILD_BUG_ON(!__same_type(&compat_check_bioset_create, &bioset_create)); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_blk_queue_max_hw_sectors.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_blk_queue_max_hw_sectors.c --- drbd8-8.3.7/drbd/compat/tests/have_blk_queue_max_hw_sectors.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_blk_queue_max_hw_sectors.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,5 @@ +#include + +#ifndef blk_queue_max_hw_sectors +void *p = blk_queue_max_hw_sectors; +#endif diff -Nru drbd8-8.3.7/drbd/compat/tests/have_blk_queue_max_segments.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_blk_queue_max_segments.c --- drbd8-8.3.7/drbd/compat/tests/have_blk_queue_max_segments.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_blk_queue_max_segments.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,5 @@ +#include + +#ifndef blk_queue_max_segments +void *p = blk_queue_max_segments; +#endif diff -Nru drbd8-8.3.7/drbd/compat/tests/have_blkdev_get_by_path.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_blkdev_get_by_path.c --- drbd8-8.3.7/drbd/compat/tests/have_blkdev_get_by_path.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_blkdev_get_by_path.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +/* + * In kernel version 2.6.38-rc1, open_bdev_exclusive() was replaced by + * 
blkdev_get_by_path(); see commits e525fd89 and d4d77629. + */ +void foo(void) { + struct block_device *blkdev; + + blkdev = blkdev_get_by_path("", (fmode_t) 0, (void *) 0); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_bool_type.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_bool_type.c --- drbd8-8.3.7/drbd/compat/tests/have_bool_type.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_bool_type.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,3 @@ +#include + +bool x; diff -Nru drbd8-8.3.7/drbd/compat/tests/have_clear_bit_unlock.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_clear_bit_unlock.c --- drbd8-8.3.7/drbd/compat/tests/have_clear_bit_unlock.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_clear_bit_unlock.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,18 @@ +#include +/* Including asm/system.h is necessary for s390. + + They define smp_mb__before_clear_bit() in asm/system.h + From asm/bitops.h they include asm-generic/bitops/lock.h + The macro defining clear_bit_unlock() in + asm-generic/bitops/lock.h needs smp_mb__before_clear_bit(). 
+ + They fail to include asm/system.h from asm/bitops.h +*/ +#include + +void foo() +{ + unsigned long bar; + + clear_bit_unlock(0, &bar); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_cn_netlink_skb_parms.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_cn_netlink_skb_parms.c --- drbd8-8.3.7/drbd/compat/tests/have_cn_netlink_skb_parms.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_cn_netlink_skb_parms.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,13 @@ +#include +#include + +#ifndef __same_type +# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#endif + +void dummy(void) +{ + void (*cb) (struct cn_msg *, struct netlink_skb_parms *) = NULL; + struct cn_callback_data ccb; + BUILD_BUG_ON(!(__same_type(ccb.callback, cb))); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_ctrl_attr_mcast_groups.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_ctrl_attr_mcast_groups.c --- drbd8-8.3.7/drbd/compat/tests/have_ctrl_attr_mcast_groups.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_ctrl_attr_mcast_groups.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,6 @@ +#include + +void f(void) +{ + int i = CTRL_ATTR_MCAST_GROUPS; +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_dst_groups.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_dst_groups.c --- drbd8-8.3.7/drbd/compat/tests/have_dst_groups.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_dst_groups.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,8 @@ +#include +#include + +void dummy(void) +{ + static struct netlink_skb_parms p; + p.dst_groups = 0; +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_find_next_zero_bit_le.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_find_next_zero_bit_le.c --- drbd8-8.3.7/drbd/compat/tests/have_find_next_zero_bit_le.c 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_find_next_zero_bit_le.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,13 @@ +#include +#include + +unsigned long func(void) +{ + void *addr; + unsigned long size, offset; + + addr = NULL; + size = 0; + offset = 0; + return find_next_zero_bit_le(addr, size, offset); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_fmode_t.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_fmode_t.c --- drbd8-8.3.7/drbd/compat/tests/have_fmode_t.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_fmode_t.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,6 @@ +#include + +void foo() +{ + fmode_t mode; +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_genlmsg_msg_size.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_msg_size.c --- drbd8-8.3.7/drbd/compat/tests/have_genlmsg_msg_size.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_msg_size.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +void f(void) +{ + int dummy; + + dummy = genlmsg_msg_size(0); + dummy = genlmsg_total_size(0); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_genlmsg_new.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_new.c --- drbd8-8.3.7/drbd/compat/tests/have_genlmsg_new.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_new.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,8 @@ +#include + +void f(void) +{ + struct sk_buff *skb; + + skb = genlmsg_new(123, GFP_KERNEL); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_genlmsg_put_reply.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_put_reply.c --- drbd8-8.3.7/drbd/compat/tests/have_genlmsg_put_reply.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_put_reply.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +void f(void) +{ + struct sk_buff *skb = NULL; + 
struct genl_info *info = NULL; + struct genl_family *family = NULL; + void *ret; + + ret = genlmsg_put_reply(skb, info, family, 0, 0); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_genlmsg_reply.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_reply.c --- drbd8-8.3.7/drbd/compat/tests/have_genlmsg_reply.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_genlmsg_reply.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,8 @@ +#include + +void f(void) +{ + struct sk_buff *skb = NULL; + struct genl_info *info = NULL; + int ret = genlmsg_reply(skb, info); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_linux_byteorder_swabb_h.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_linux_byteorder_swabb_h.c --- drbd8-8.3.7/drbd/compat/tests/have_linux_byteorder_swabb_h.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_linux_byteorder_swabb_h.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,2 @@ +#include +#include diff -Nru drbd8-8.3.7/drbd/compat/tests/have_nlmsg_hdr.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_nlmsg_hdr.c --- drbd8-8.3.7/drbd/compat/tests/have_nlmsg_hdr.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_nlmsg_hdr.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,8 @@ +#include +#include + +void f(void) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *hdr = nlmsg_hdr(skb); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_nr_cpu_ids.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_nr_cpu_ids.c --- drbd8-8.3.7/drbd/compat/tests/have_nr_cpu_ids.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_nr_cpu_ids.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,6 @@ +#include + +void foo() +{ + int x = nr_cpu_ids; +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_open_bdev_exclusive.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_open_bdev_exclusive.c --- 
drbd8-8.3.7/drbd/compat/tests/have_open_bdev_exclusive.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_open_bdev_exclusive.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,13 @@ +#include +#include + +/* + * In kernel version v2.6.28-rc1, open_bdev_excl() was replaced by + * open_bdev_exclusive(); see commit 30c40d2. + */ +void foo(void) +{ + struct block_device *blkdev; + + blkdev = open_bdev_exclusive("", (fmode_t) 0, (void *) 0); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_proc_create_data.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_proc_create_data.c --- drbd8-8.3.7/drbd/compat/tests/have_proc_create_data.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_proc_create_data.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,5 @@ +#include + +#ifndef proc_create_data +void *p = proc_create_data; +#endif diff -Nru drbd8-8.3.7/drbd/compat/tests/have_rb_augment_functions.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_rb_augment_functions.c --- drbd8-8.3.7/drbd/compat/tests/have_rb_augment_functions.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_rb_augment_functions.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,14 @@ +#include + +/* introduced with commit b945d6b2, Linux 2.6.35-rc5 */ + +void foo(void) { + struct rb_node *n; + + rb_augment_insert((struct rb_node *) NULL, + (rb_augment_f) NULL, + NULL); + + n = rb_augment_erase_begin((struct rb_node *)NULL); + rb_augment_erase_end((struct rb_node *) NULL, (rb_augment_f) NULL, NULL); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/have_sock_shutdown.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_sock_shutdown.c --- drbd8-8.3.7/drbd/compat/tests/have_sock_shutdown.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_sock_shutdown.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,5 @@ +#include + +#ifndef kernel_sock_shutdown +void 
*p = kernel_sock_shutdown; +#endif diff -Nru drbd8-8.3.7/drbd/compat/tests/have_void_make_request.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_void_make_request.c --- drbd8-8.3.7/drbd/compat/tests/have_void_make_request.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/have_void_make_request.c 2012-09-03 21:31:23.000000000 +0000 @@ -0,0 +1,19 @@ +#include + +/* hm. sometimes this pragma is ignored :( + * use BUILD_BUG_ON instead. +#pragma GCC diagnostic warning "-Werror" + */ + +/* in Commit 5a7bbad27a410350e64a2d7f5ec18fc73836c14f (between Linux-3.1 and 3.2) + make_request() becomes type void. Before it had type int. + */ + +void drbd_make_request(struct request_queue *q, struct bio *bio) +{ +} + +void foo(void) +{ + BUILD_BUG_ON(!(__same_type(&drbd_make_request, make_request_fn))); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/need_genlmsg_multicast_wrapper.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/need_genlmsg_multicast_wrapper.c --- drbd8-8.3.7/drbd/compat/tests/need_genlmsg_multicast_wrapper.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/need_genlmsg_multicast_wrapper.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +void f(void) +{ + struct sk_buff *skb = NULL; + int ret; + + ret = genlmsg_multicast(skb, 0, 0); +} diff -Nru drbd8-8.3.7/drbd/compat/tests/use_blk_queue_max_sectors_anyways.c drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/use_blk_queue_max_sectors_anyways.c --- drbd8-8.3.7/drbd/compat/tests/use_blk_queue_max_sectors_anyways.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/compat/tests/use_blk_queue_max_sectors_anyways.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,15 @@ +#include + +#ifndef blk_queue_max_hw_sectors +void *p = blk_queue_max_hw_sectors; +#endif + +/* For kernel versions 2.6.31 to 2.6.33 inclusive, even though + * blk_queue_max_hw_sectors is present, we actually need to use + * blk_queue_max_sectors 
to set max_hw_sectors. :-( + * RHEL6 2.6.32 chose to be different and already has eliminated + * blk_queue_max_sectors as upstream 2.6.34 did. + */ +#ifndef blk_queue_max_sectors +void *q = blk_queue_max_sectors; +#endif diff -Nru drbd8-8.3.7/drbd/connector.c drbd8-8.4.1+git55a81dc~cmd1/drbd/connector.c --- drbd8-8.3.7/drbd/connector.c 2009-10-06 11:32:41.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/connector.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,513 +0,0 @@ -/* - * connector.c - * - * 2004-2005 Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * Modified by Philipp Reiser to work on older 2.6.x kernels. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef DRBD_CONNECTOR_BACKPORT_HEADER -#error "drbd backported connector.c compiled against kernel connector.h will not work" -#error "enable CONFIG_CONNECTOR in your kernel and try again" -#endif - -#include - -#ifdef DRBD_NL_DST_GROUPS - /* pre 2.6.16 */ -# define NETLINK_GROUP(skb) NETLINK_CB(skb).dst_groups -#else -# define NETLINK_GROUP(skb) NETLINK_CB(skb).dst_group -#endif - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Evgeniy Polyakov "); -MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector."); - -static u32 cn_idx = CN_IDX_CONNECTOR; -static u32 cn_val = CN_VAL_CONNECTOR; - -module_param(cn_idx, uint, 0); -module_param(cn_val, uint, 0); -MODULE_PARM_DESC(cn_idx, "Connector's main device idx."); -MODULE_PARM_DESC(cn_val, "Connector's main device val."); - -static DECLARE_MUTEX(notify_lock); -static LIST_HEAD(notify_list); - -static struct cn_dev cdev; - -int cn_already_initialized = 0; - -/* - * msg->seq and msg->ack are used to determine message genealogy. - * When someone sends message it puts there locally unique sequence - * and random acknowledge numbers. Sequence number may be copied into - * nlmsghdr->nlmsg_seq too. - * - * Sequence number is incremented with each message to be sent. - * - * If we expect reply to our message then the sequence number in - * received message MUST be the same as in original message, and - * acknowledge number MUST be the same + 1. - * - * If we receive a message and its sequence number is not equal to the - * one we are expecting then it is a new message. 
- * - * If we receive a message and its sequence number is the same as one - * we are expecting but it's acknowledgement number is not equal to - * the acknowledgement number in the original message + 1, then it is - * a new message. - * - */ -int cn_netlink_send(struct cn_msg *msg, u32 __group, gfp_t gfp_mask) -{ - struct cn_callback_entry *__cbq; - unsigned int size; - struct sk_buff *skb; - struct nlmsghdr *nlh; - struct cn_msg *data; - struct cn_dev *dev = &cdev; - u32 group = 0; - int found = 0; - - if (!__group) { - spin_lock_bh(&dev->cbdev->queue_lock); - list_for_each_entry(__cbq, &dev->cbdev->queue_list, - callback_entry) { - if (cn_cb_equal(&__cbq->id.id, &msg->id)) { - found = 1; - group = __cbq->group; - } - } - spin_unlock_bh(&dev->cbdev->queue_lock); - - if (!found) - return -ENODEV; - } else { - group = __group; - } - - size = NLMSG_SPACE(sizeof(*msg) + msg->len); - - skb = alloc_skb(size, gfp_mask); - if (!skb) - return -ENOMEM; - - nlh = NLMSG_PUT(skb, 0, msg->seq, NLMSG_DONE, size - sizeof(*nlh)); - - data = NLMSG_DATA(nlh); - - memcpy(data, msg, sizeof(*data) + msg->len); - - NETLINK_GROUP(skb) = group; - - netlink_broadcast(dev->nls, skb, 0, group, gfp_mask); - - return 0; - -nlmsg_failure: - kfree_skb(skb); - return -EINVAL; -} - -/* - * Callback helper - queues work and setup destructor for given data. 
- */ -static int cn_call_callback(struct cn_msg *msg, void (*destruct_data)(void *), void *data) -{ - struct cn_callback_entry *__cbq; - struct cn_dev *dev = &cdev; - int err = -ENODEV; - - spin_lock_bh(&dev->cbdev->queue_lock); - list_for_each_entry(__cbq, &dev->cbdev->queue_list, callback_entry) { - if (cn_cb_equal(&__cbq->id.id, &msg->id)) { - if (likely(!test_bit(0, &__cbq->work.pending) && - __cbq->data.ddata == NULL)) { - __cbq->data.callback_priv = msg; - - __cbq->data.ddata = data; - __cbq->data.destruct_data = destruct_data; - - if (queue_work(dev->cbdev->cn_queue, - &__cbq->work)) - err = 0; - } else { - struct work_struct *w; - struct cn_callback_data *d; - - w = kmalloc(sizeof(*w) + sizeof(*d), GFP_ATOMIC); - if (w) { - memset(w,0,sizeof(*w) + sizeof(*d)); - d = (struct cn_callback_data *)(w+1); - - d->callback_priv = msg; - d->callback = __cbq->data.callback; - d->ddata = data; - d->destruct_data = destruct_data; - d->free = w; - - INIT_LIST_HEAD(&w->entry); - w->pending = 0; - w->func = &cn_queue_wrapper; - w->data = d; - init_timer(&w->timer); - - if (queue_work(dev->cbdev->cn_queue, w)) - err = 0; - else { - kfree(w); - err = -EINVAL; - } - } else - err = -ENOMEM; - } - break; - } - } - spin_unlock_bh(&dev->cbdev->queue_lock); - - return err; -} - -/* - * Skb receive helper - checks skb and msg size and calls callback - * helper. - */ -static int __cn_rx_skb(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - u32 pid, uid, seq, group; - struct cn_msg *msg; - - pid = NETLINK_CREDS(skb)->pid; - uid = NETLINK_CREDS(skb)->uid; - seq = nlh->nlmsg_seq; - group = NETLINK_GROUP(skb); - msg = NLMSG_DATA(nlh); - - /* DRBD specific change: Only allow packets from ROOT */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - return cn_call_callback(msg, (void (*)(void *))kfree_skb, skb); -} - -/* - * Main netlink receiving function. - * - * It checks skb and netlink header sizes and calls the skb receive - * helper with a shared skb. 
- */ -static void cn_rx_skb(struct sk_buff *__skb) -{ - struct nlmsghdr *nlh; - u32 len; - int err; - struct sk_buff *skb; - - skb = skb_get(__skb); - - if (skb->len >= NLMSG_SPACE(0)) { - nlh = (struct nlmsghdr *)skb->data; - - if (nlh->nlmsg_len < sizeof(struct cn_msg) || - skb->len < nlh->nlmsg_len || - nlh->nlmsg_len > CONNECTOR_MAX_MSG_SIZE) { - kfree_skb(skb); - goto out; - } - - len = NLMSG_ALIGN(nlh->nlmsg_len); - if (len > skb->len) - len = skb->len; - - err = __cn_rx_skb(skb, nlh); - if (err < 0) - kfree_skb(skb); - } - -out: - kfree_skb(__skb); -} - -/* - * Netlink socket input callback - dequeues the skbs and calls the - * main netlink receiving function. - */ -static void cn_input(struct sock *sk, int len) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) - cn_rx_skb(skb); -} - -/* - * Notification routing. - * - * Gets id and checks if there are notification request for it's idx - * and val. If there are such requests notify the listeners with the - * given notify event. - * - */ -static void cn_notify(struct cb_id *id, u32 notify_event) -{ - struct cn_ctl_entry *ent; - - down(¬ify_lock); - list_for_each_entry(ent, ¬ify_list, notify_entry) { - int i; - struct cn_notify_req *req; - struct cn_ctl_msg *ctl = ent->msg; - int idx_found, val_found; - - idx_found = val_found = 0; - - req = (struct cn_notify_req *)ctl->data; - for (i = 0; i < ctl->idx_notify_num; ++i, ++req) { - if (id->idx >= req->first && - id->idx < req->first + req->range) { - idx_found = 1; - break; - } - } - - for (i = 0; i < ctl->val_notify_num; ++i, ++req) { - if (id->val >= req->first && - id->val < req->first + req->range) { - val_found = 1; - break; - } - } - - if (idx_found && val_found) { - struct cn_msg m = { .ack = notify_event, }; - - memcpy(&m.id, id, sizeof(m.id)); - cn_netlink_send(&m, ctl->group, GFP_KERNEL); - } - } - up(¬ify_lock); -} - -/* - * Callback add routing - adds callback with given ID and name. 
- * If there is registered callback with the same ID it will not be added. - * - * May sleep. - */ -int cn_add_callback(struct cb_id *id, char *name, void (*callback)(void *)) -{ - int err; - struct cn_dev *dev = &cdev; - - err = cn_queue_add_callback(dev->cbdev, name, id, callback); - if (err) - return err; - - cn_notify(id, 0); - - return 0; -} - -/* - * Callback remove routing - removes callback - * with given ID. - * If there is no registered callback with given - * ID nothing happens. - * - * May sleep while waiting for reference counter to become zero. - */ -void cn_del_callback(struct cb_id *id) -{ - struct cn_dev *dev = &cdev; - - cn_queue_del_callback(dev->cbdev, id); - cn_notify(id, 1); -} - -/* - * Checks two connector's control messages to be the same. - * Returns 1 if they are the same or if the first one is corrupted. - */ -static int cn_ctl_msg_equals(struct cn_ctl_msg *m1, struct cn_ctl_msg *m2) -{ - int i; - struct cn_notify_req *req1, *req2; - - if (m1->idx_notify_num != m2->idx_notify_num) - return 0; - - if (m1->val_notify_num != m2->val_notify_num) - return 0; - - if (m1->len != m2->len) - return 0; - - if ((m1->idx_notify_num + m1->val_notify_num) * sizeof(*req1) != - m1->len) - return 1; - - req1 = (struct cn_notify_req *)m1->data; - req2 = (struct cn_notify_req *)m2->data; - - for (i = 0; i < m1->idx_notify_num; ++i) { - if (req1->first != req2->first || req1->range != req2->range) - return 0; - req1++; - req2++; - } - - for (i = 0; i < m1->val_notify_num; ++i) { - if (req1->first != req2->first || req1->range != req2->range) - return 0; - req1++; - req2++; - } - - return 1; -} - -/* - * Main connector device's callback. - * - * Used for notification of a request's processing. 
- */ -static void cn_callback(void *data) -{ - struct cn_msg *msg = data; - struct cn_ctl_msg *ctl; - struct cn_ctl_entry *ent; - u32 size; - - if (msg->len < sizeof(*ctl)) - return; - - ctl = (struct cn_ctl_msg *)msg->data; - - size = (sizeof(*ctl) + ((ctl->idx_notify_num + - ctl->val_notify_num) * - sizeof(struct cn_notify_req))); - - if (msg->len != size) - return; - - if (ctl->len + sizeof(*ctl) != msg->len) - return; - - /* - * Remove notification. - */ - if (ctl->group == 0) { - struct cn_ctl_entry *n; - - down(¬ify_lock); - list_for_each_entry_safe(ent, n, ¬ify_list, notify_entry) { - if (cn_ctl_msg_equals(ent->msg, ctl)) { - list_del(&ent->notify_entry); - kfree(ent); - } - } - up(¬ify_lock); - - return; - } - - size += sizeof(*ent); - - ent = kmalloc(size, GFP_KERNEL); - if (!ent) - return; - - memset(ent,0,size); - ent->msg = (struct cn_ctl_msg *)(ent + 1); - - memcpy(ent->msg, ctl, size - sizeof(*ent)); - - down(¬ify_lock); - list_add(&ent->notify_entry, ¬ify_list); - up(¬ify_lock); -} - -int __init cn_init(void) -{ - struct cn_dev *dev = &cdev; - int err; - - dev->input = cn_input; - dev->id.idx = cn_idx; - dev->id.val = cn_val; - -#ifdef DRBD_NL_DST_GROUPS - /* history of upstream commits between kernel.org 2.6.13 and 2.6.14-rc1: - * 4fdb3bb723db469717c6d38fda667d8b0fa86ebd 2005-08-10 adds module parameter - * d629b836d151d43332492651dd841d32e57ebe3b 2005-08-15 renames dst_groups to dst_group - * 066286071d3542243baa68166acb779187c848b3 2005-08-15 adds groups parameter - * so it is not exactly correct to trigger on the rename dst_groups to dst_group, - * but sufficiently close. 
- */ - dev->nls = netlink_kernel_create(NETLINK_CONNECTOR,dev->input); -#else - dev->nls = netlink_kernel_create(NETLINK_CONNECTOR, - CN_NETLINK_USERS + 0xf, - dev->input, THIS_MODULE); -#endif - if (!dev->nls) - return -EIO; - - dev->cbdev = cn_queue_alloc_dev("cqueue", dev->nls); - if (!dev->cbdev) { - if (dev->nls->sk_socket) - sock_release(dev->nls->sk_socket); - return -EINVAL; - } - - err = cn_add_callback(&dev->id, "connector", &cn_callback); - if (err) { - cn_queue_free_dev(dev->cbdev); - if (dev->nls->sk_socket) - sock_release(dev->nls->sk_socket); - return -EINVAL; - } - - cn_already_initialized = 1; - - return 0; -} - -void __exit cn_fini(void) -{ - struct cn_dev *dev = &cdev; - - cn_already_initialized = 0; - - cn_del_callback(&dev->id); - cn_queue_free_dev(dev->cbdev); - if (dev->nls->sk_socket) - sock_release(dev->nls->sk_socket); -} diff -Nru drbd8-8.3.7/drbd/drbd_actlog.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_actlog.c --- drbd8-8.3.7/drbd/drbd_actlog.c 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_actlog.c 2012-02-02 14:09:14.000000000 +0000 @@ -24,22 +24,73 @@ */ #include +#include #include +#include +#include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_wrappers.h" -/* We maintain a trivial check sum in our on disk activity log. - * With that we can ensure correct operation even when the storage - * device might do a partial (last) sector write while loosing power. 
- */ -struct __packed al_transaction { - u32 magic; - u32 tr_number; - struct __packed { - u32 pos; - u32 extent; } updates[1 + AL_EXTENTS_PT]; - u32 xor_sum; + +enum al_transaction_types { + AL_TR_UPDATE = 0, + AL_TR_INITIALIZED = 0xffff +}; +/* all fields on disc in big endian */ +struct __packed al_transaction_on_disk { + /* don't we all like magic */ + __be32 magic; + + /* to identify the most recent transaction block + * in the on disk ring buffer */ + __be32 tr_number; + + /* checksum on the full 4k block, with this field set to 0. */ + __be32 crc32c; + + /* type of transaction, special transaction types like: + * purge-all, set-all-idle, set-all-active, ... to-be-defined + * see also enum al_transaction_types */ + __be16 transaction_type; + + /* we currently allow only a few thousand extents, + * so 16bit will be enough for the slot number. */ + + /* how many updates in this transaction */ + __be16 n_updates; + + /* maximum slot number, "al-extents" in drbd.conf speak. + * Having this in each transaction should make reconfiguration + * of that parameter easier. */ + __be16 context_size; + + /* slot number the context starts with */ + __be16 context_start_slot_nr; + + /* Some reserved bytes. Expected usage is a 64bit counter of + * sectors-written since device creation, and other data generation tag + * supporting usage */ + __be32 __reserved[4]; + + /* --- 36 byte used --- */ + + /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes + * in one transaction, then use the remaining byte in the 4k block for + * context information. "Flexible" number of updates per transaction + * does not help, as we have to account for the case when all update + * slots are used anyways, so it would only complicate code without + * additional benefit. 
+ */ + __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; + + /* but the extent number is 32bit, which at an extent size of 4 MiB + * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ + __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; + + /* --- 420 bytes used (36 + 64*6) --- */ + + /* 4096 - 420 = 3676 = 919 * 4 */ + __be32 context[AL_CONTEXT_PER_TRANSACTION]; }; struct update_odbm_work { @@ -49,25 +100,39 @@ struct update_al_work { struct drbd_work w; - struct lc_element *al_ext; struct completion event; - unsigned int enr; - /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ - unsigned int old_enr; + int err; }; -int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); +static int al_write_transaction(struct drbd_conf *mdev); -/* The actual tracepoint needs to have constant number of known arguments... - */ -void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...) +void *drbd_md_get_buffer(struct drbd_conf *mdev) +{ + int r; + + wait_event(mdev->misc_wait, + (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || + mdev->state.disk <= D_FAILED); + + return r ? 
NULL : page_address(mdev->md_io_page); +} + +void drbd_md_put_buffer(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->md_io_in_use)) + wake_up(&mdev->misc_wait); +} + +static bool md_io_allowed(struct drbd_conf *mdev) { - va_list ap; + enum drbd_disk_state ds = mdev->state.disk; + return ds >= D_NEGOTIATING || ds == D_ATTACHING; +} - va_start(ap, fmt); - trace__drbd_resync(mdev, level, fmt, ap); - va_end(ap); +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, unsigned int *done) +{ + wait_event(mdev->misc_wait, *done || !md_io_allowed(mdev)); } STATIC int _drbd_md_sync_page_io(struct drbd_conf *mdev, @@ -76,138 +141,95 @@ int rw, int size) { struct bio *bio; - struct drbd_md_io md_io; - int ok; - - md_io.mdev = mdev; - init_completion(&md_io.event); - md_io.error = 0; + int err; if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) - rw |= (1UL << BIO_RW_BARRIER); -#ifdef BIO_RW_SYNC - rw |= (1<md_io.done = 0; + mdev->md_io.error = -ENODEV; + + bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; - ok = (bio_add_page(bio, page, size, 0) == size); - if (!ok) + err = -EIO; + if (bio_add_page(bio, page, size, 0) != size) goto out; - bio->bi_private = &md_io; + bio->bi_private = &mdev->md_io; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; - trace_drbd_bio(mdev, "Md", bio, 0, NULL); + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + err = -ENODEV; + goto out; + } - if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ + if (drbd_insert_fault(mdev, (rw & WRITE) ? 
DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_for_completion(&md_io.event); - ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + wait_until_done_or_disk_failure(mdev, &mdev->md_io.done); + if (bio_flagged(bio, BIO_UPTODATE)) + err = mdev->md_io.error; +#ifndef REQ_FLUSH /* check for unsupported barrier op. * would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 */ - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) { + if (err && mdev->md_io.done && (bio->bi_rw & DRBD_REQ_HARDBARRIER)) { /* Try again with no barrier */ dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); set_bit(MD_NO_BARRIER, &mdev->flags); - rw &= ~(1 << BIO_RW_BARRIER); + rw &= ~DRBD_REQ_HARDBARRIER; bio_put(bio); goto retry; } +#endif out: bio_put(bio); - return ok; + return err; } int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t sector, int rw) { - int logical_block_size, mask, ok; - int offset = 0; + int err; struct page *iop = mdev->md_io_page; - D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); if (!bdev->md_bdev) { if (DRBD_ratelimit(5*HZ, 5)) { dev_err(DEV, "bdev->md_bdev==NULL\n"); dump_stack(); } - return 0; - } - - logical_block_size = bdev_logical_block_size(bdev->md_bdev); - if (logical_block_size == 0) - logical_block_size = MD_SECTOR_SIZE; - - /* in case logical_block_size != 512 [ s390 only? 
] */ - if (logical_block_size != MD_SECTOR_SIZE) { - mask = (logical_block_size / MD_SECTOR_SIZE) - 1; - D_ASSERT(mask == 1 || mask == 3 || mask == 7); - D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); - offset = sector & mask; - sector = sector & ~mask; - iop = mdev->md_io_tmpp; - - if (rw & WRITE) { - /* these are GFP_KERNEL pages, pre-allocated - * on device initialization */ - void *p = page_address(mdev->md_io_page); - void *hp = page_address(mdev->md_io_tmpp); - - ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, - READ, logical_block_size); - - if (unlikely(!ok)) { - dev_err(DEV, "drbd_md_sync_page_io(,%llus," - "READ [logical_block_size!=512]) failed!\n", - (unsigned long long)sector); - return 0; - } - - memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); - } + return -EIO; } -#if DUMP_MD >= 3 - dev_info(DEV, "%s [%d]:%s(,%llus,%s)\n", + dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); -#endif if (sector < drbd_md_first_sector(bdev) || - sector > drbd_md_last_sector(bdev)) + sector + 7 > drbd_md_last_sector(bdev)) dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); - ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); - if (unlikely(!ok)) { - dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); - return 0; - } - - if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { - void *p = page_address(mdev->md_io_page); - void *hp = page_address(mdev->md_io_tmpp); - - memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); + err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); + if (err) { + dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", + (unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ", err); } - - return ok; + return err; } static @@ -215,583 +237,263 @@ { struct lc_element *al_ext; struct lc_element *tmp; - unsigned long al_flags = 0; + int wake; spin_lock_irq(&mdev->al_lock); tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); if (unlikely(tmp != NULL)) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); spin_unlock_irq(&mdev->al_lock); + if (wake) + wake_up(&mdev->al_wait); return NULL; } } - al_ext = lc_get(mdev->act_log, enr); - al_flags = mdev->act_log->flags; + al_ext = lc_get(mdev->act_log, enr); spin_unlock_irq(&mdev->al_lock); - - /* - if (!al_ext) { - if (al_flags & LC_STARVING) - dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); - if (al_flags & LC_DIRTY) - dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); - } - */ - return al_ext; } -void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) +void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) { - unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); - struct lc_element *al_ext; - struct update_al_work al_work; + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + bool locked = false; + D_ASSERT(atomic_read(&mdev->local_cnt) > 0); - trace_drbd_actlog(mdev, sector, "al_begin_io"); + for (enr = first; enr <= last; enr++) + wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); - wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); + /* Serialize multiple transactions. + * This uses test_and_set_bit, memory barrier is implicit. 
+ */ + wait_event(mdev->al_wait, + mdev->act_log->pending_changes == 0 || + (locked = lc_try_lock_for_transaction(mdev->act_log))); - if (al_ext->lc_number != enr) { + if (locked) { /* drbd_al_write_transaction(mdev,al_ext,enr); * recurses into generic_make_request(), which * disallows recursion, bios being serialized on the * current->bio_tail list now. * we have to delegate updates to the activity log * to the worker thread. */ - init_completion(&al_work.event); - al_work.al_ext = al_ext; - al_work.enr = enr; - al_work.old_enr = al_ext->lc_number; - al_work.w.cb = w_al_write_transaction; - drbd_queue_work_front(&mdev->data.work, &al_work.w); - wait_for_completion(&al_work.event); - - mdev->al_writ_cnt++; - - /* - DUMPI(al_ext->lc_number); - DUMPI(mdev->act_log->new_number); - */ - spin_lock_irq(&mdev->al_lock); - lc_changed(mdev->act_log, al_ext); - spin_unlock_irq(&mdev->al_lock); + + /* Double check: it may have been committed by someone else, + * while we have been waiting for the lock. 
*/ + if (mdev->act_log->pending_changes) { + al_write_transaction(mdev); + mdev->al_writ_cnt++; + + spin_lock_irq(&mdev->al_lock); + /* FIXME + if (err) + we need an "lc_cancel" here; + */ + lc_committed(mdev->act_log); + spin_unlock_irq(&mdev->al_lock); + } + lc_unlock(mdev->act_log); wake_up(&mdev->al_wait); } } -void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) +void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) { - unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; struct lc_element *extent; unsigned long flags; - trace_drbd_actlog(mdev, sector, "al_complete_io"); - spin_lock_irqsave(&mdev->al_lock, flags); - extent = lc_find(mdev->act_log, enr); - - if (!extent) { - spin_unlock_irqrestore(&mdev->al_lock, flags); - dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); - return; + for (enr = first; enr <= last; enr++) { + extent = lc_find(mdev->act_log, enr); + if (!extent) { + dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); + continue; + } + lc_put(mdev->act_log, extent); } - - if (lc_put(mdev->act_log, extent) == 0) - wake_up(&mdev->al_wait); - spin_unlock_irqrestore(&mdev->al_lock, flags); + wake_up(&mdev->al_wait); } -int -w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) -{ - struct update_al_work *aw = container_of(w, struct update_al_work, w); - struct lc_element *updated = aw->al_ext; - const unsigned int new_enr = aw->enr; - const unsigned int evicted = aw->old_enr; - struct al_transaction *buffer; - sector_t sector; - int i, n, mx; - unsigned int extent_nr; - u32 xor_sum = 0; - - if (!get_ldev(mdev)) { - dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); - complete(&((struct 
update_al_work *)w)->event); - return 1; - } - /* do we have to do a bitmap write, first? - * TODO reduce maximum latency: - * submit both bios, then wait for both, - * instead of doing two synchronous sector writes. */ - if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) - drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); - - mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = (struct al_transaction *)page_address(mdev->md_io_page); - - buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); - buffer->tr_number = cpu_to_be32(mdev->al_tr_number); - - n = lc_index_of(mdev->act_log, updated); - - buffer->updates[0].pos = cpu_to_be32(n); - buffer->updates[0].extent = cpu_to_be32(new_enr); - - xor_sum ^= new_enr; - - mx = min_t(int, AL_EXTENTS_PT, - mdev->act_log->nr_elements - mdev->al_tr_cycle); - for (i = 0; i < mx; i++) { - unsigned idx = mdev->al_tr_cycle + i; - extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; - buffer->updates[i+1].pos = cpu_to_be32(idx); - buffer->updates[i+1].extent = cpu_to_be32(extent_nr); - xor_sum ^= extent_nr; - } - for (; i < AL_EXTENTS_PT; i++) { - buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); - buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); - xor_sum ^= LC_FREE; - } - mdev->al_tr_cycle += AL_EXTENTS_PT; - if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) - mdev->al_tr_cycle = 0; - - buffer->xor_sum = cpu_to_be32(xor_sum); - - sector = mdev->ldev->md.md_offset - + mdev->ldev->md.al_offset + mdev->al_tr_pos; - - if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) - drbd_chk_io_error(mdev, 1, TRUE); - - if (++mdev->al_tr_pos > - div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) - mdev->al_tr_pos = 0; - - D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); - mdev->al_tr_number++; - - mutex_unlock(&mdev->md_io_mutex); - - complete(&((struct update_al_work *)w)->event); - put_ldev(mdev); +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* 
Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif - return 1; +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); } -/** - * drbd_al_read_tr() - Read a single transaction from the on disk activity log - * @mdev: DRBD device. - * @bdev: Block device to read form. - * @b: pointer to an al_transaction. - * @index: On disk slot of the transaction to read. - * - * Returns -1 on IO error, 0 on checksum error and 1 upon success. - */ -STATIC int drbd_al_read_tr(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, - struct al_transaction *b, - int index) +static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) { - sector_t sector; - int rv, i; - u32 xor_sum = 0; - - sector = bdev->md.md_offset + bdev->md.al_offset + index; - - /* Dont process error normally, - * as this is done before disk is attached! */ - if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) - return -1; - - rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); - - for (i = 0; i < AL_EXTENTS_PT + 1; i++) - xor_sum ^= be32_to_cpu(b->updates[i].extent); - rv &= (xor_sum == be32_to_cpu(b->xor_sum)); - - return rv; + return rs_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* resync extent number to bit */ + (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); } -/** - * drbd_al_read_log() - Restores the activity log from its on disk representation. - * @mdev: DRBD device. - * @bdev: Block device to read form. - * - * Returns 1 on success, returns 0 when reading the log failed due to IO errors. 
- */ -int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +static int +_al_write_transaction(struct drbd_conf *mdev) { - struct al_transaction *buffer; - int i; - int rv; - int mx; - int active_extents = 0; - int transactions = 0; - int found_valid = 0; - int from = 0; - int to = 0; - u32 from_tnr = 0; - u32 to_tnr = 0; - u32 cnr; - - mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); - - /* lock out all other meta data io for now, - * and make sure the page is mapped. - */ - mutex_lock(&mdev->md_io_mutex); - buffer = page_address(mdev->md_io_page); - - /* Find the valid transaction in the log */ - for (i = 0; i <= mx; i++) { - rv = drbd_al_read_tr(mdev, bdev, buffer, i); - if (rv == 0) - continue; - if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); - return 0; - } - cnr = be32_to_cpu(buffer->tr_number); + struct al_transaction_on_disk *buffer; + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; - if (++found_valid == 1) { - from = i; - to = i; - from_tnr = cnr; - to_tnr = cnr; - continue; - } - if ((int)cnr - (int)from_tnr < 0) { - D_ASSERT(from_tnr - cnr + i - from == mx+1); - from = i; - from_tnr = cnr; - } - if ((int)cnr - (int)to_tnr > 0) { - D_ASSERT(cnr - to_tnr == i - to); - to = i; - to_tnr = cnr; - } + if (!get_ldev(mdev)) { + dev_err(DEV, "disk is %s, cannot start al transaction\n", + drbd_disk_str(mdev->state.disk)); + return -EIO; } - if (!found_valid) { - dev_warn(DEV, "No usable activity log found.\n"); - mutex_unlock(&mdev->md_io_mutex); - return 1; - } - - /* Read the valid transactions. 
- * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ - i = from; - while (1) { - int j, pos; - unsigned int extent_nr; - unsigned int trn; - - rv = drbd_al_read_tr(mdev, bdev, buffer, i); - ERR_IF(rv == 0) goto cancel; - if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); - return 0; - } - - trn = be32_to_cpu(buffer->tr_number); - - spin_lock_irq(&mdev->al_lock); - - /* This loop runs backwards because in the cyclic - elements there might be an old version of the - updated element (in slot 0). So the element in slot 0 - can overwrite old versions. */ - for (j = AL_EXTENTS_PT; j >= 0; j--) { - pos = be32_to_cpu(buffer->updates[j].pos); - extent_nr = be32_to_cpu(buffer->updates[j].extent); + /* The bitmap write may have failed, causing a state change. */ + if (mdev->state.disk < D_INCONSISTENT) { + dev_err(DEV, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(mdev->state.disk)); + put_ldev(mdev); + return -EIO; + } - if (extent_nr == LC_FREE) - continue; + buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ + if (!buffer) { + dev_err(DEV, "disk failed while waiting for md_io buffer\n"); + put_ldev(mdev); + return -ENODEV; + } - lc_set(mdev->act_log, extent_nr, pos); - active_extents++; - } - spin_unlock_irq(&mdev->al_lock); + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(mdev->al_tr_number); - transactions++; + i = 0; -cancel: - if (i == to) + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. 
*/ + spin_lock_irq(&mdev->al_lock); + list_for_each_entry(e, &mdev->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(mdev, + al_extent_to_bm_page(e->lc_number)); i++; - if (i > mx) - i = 0; } + spin_unlock_irq(&mdev->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); - mdev->al_tr_number = to_tnr+1; - mdev->al_tr_pos = to; - if (++mdev->al_tr_pos > - div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) - mdev->al_tr_pos = 0; - - /* ok, we are done with it */ - mutex_unlock(&mdev->md_io_mutex); - - dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", - transactions, active_extents); - - return 1; -} - -struct drbd_atodb_wait { - atomic_t count; - struct completion io_done; - struct drbd_conf *mdev; - int error; -}; + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } -STATIC BIO_ENDIO_TYPE atodb_endio BIO_ENDIO_ARGS(struct bio *bio, int error) -{ - struct drbd_atodb_wait *wc = bio->bi_private; - struct drbd_conf *mdev = wc->mdev; - struct page *page; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - BIO_ENDIO_FN_START; - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! 
*/ - if (!error && !uptodate) - error = -EIO; - - drbd_chk_io_error(mdev, error, TRUE); - if (error && wc->error == 0) - wc->error = error; + buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle); - if (atomic_dec_and_test(&wc->count)) - complete(&wc->io_done); + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + mdev->act_log->nr_elements - mdev->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = mdev->al_tr_cycle + i; + extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); - page = bio->bi_io_vec[0].bv_page; - put_page(page); - bio_put(bio); - mdev->bm_writ_cnt++; - put_ldev(mdev); + mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) + mdev->al_tr_cycle = 0; - BIO_ENDIO_FN_RETURN; -} + sector = mdev->ldev->md.md_offset + + mdev->ldev->md.al_offset + + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); -#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) -/* activity log to on disk bitmap -- prepare bio unless that sector - * is already covered by previously prepared bios */ -STATIC int atodb_prepare_unless_covered(struct drbd_conf *mdev, - struct bio **bios, - unsigned int enr, - struct drbd_atodb_wait *wc) __must_hold(local) -{ - struct bio *bio; - struct page *page; - sector_t on_disk_sector = enr + mdev->ldev->md.md_offset - + mdev->ldev->md.bm_offset; - unsigned int page_offset = PAGE_SIZE; - int offset; - int i = 0; - int err = -ENOMEM; - - /* Check if that enr is already covered by an already created bio. - * Caution, bios[] is not NULL terminated, - * but only initialized to all NULL. - * For completely scattered activity log, - * the last invocation iterates over all bios, - * and finds the last NULL entry. 
- */ - while ((bio = bios[i])) { - if (bio->bi_sector == on_disk_sector) - return 0; - i++; - } - /* bios[i] == NULL, the next not yet used slot */ + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); - /* GFP_KERNEL, we are not in the write-out path */ - bio = bio_alloc(GFP_KERNEL, 1); - if (bio == NULL) - return -ENOMEM; - - if (i > 0) { - const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; - page_offset = prev_bv->bv_offset + prev_bv->bv_len; - page = prev_bv->bv_page; - } - if (page_offset == PAGE_SIZE) { - page = alloc_page(__GFP_HIGHMEM); - if (page == NULL) - goto out_bio_put; - page_offset = 0; + if (drbd_bm_write_hinted(mdev)) + err = -EIO; + /* drbd_chk_io_error done already */ + else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(mdev, 1, true); } else { - get_page(page); + /* advance ringbuffer position and transaction counter */ + mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); + mdev->al_tr_number++; } - offset = S2W(enr); - drbd_bm_get_lel(mdev, offset, - min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), - kmap(page) + page_offset); - kunmap(page); - - bio->bi_private = wc; - bio->bi_end_io = atodb_endio; - bio->bi_bdev = mdev->ldev->md_bdev; - bio->bi_sector = on_disk_sector; - - if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE) - goto out_put_page; - - atomic_inc(&wc->count); - /* we already know that we may do this... - * get_ldev_if_state(mdev,D_ATTACHING); - * just get the extra reference, so that the local_cnt reflects - * the number of pending IO requests DRBD at its backing device. - */ - atomic_inc(&mdev->local_cnt); - - bios[i] = bio; - - return 0; + drbd_md_put_buffer(mdev); + put_ldev(mdev); -out_put_page: - err = -EINVAL; - put_page(page); -out_bio_put: - bio_put(bio); return err; } -/** - * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents - * @mdev: DRBD device. 
- * - * Called when we detach (unconfigure) local storage, - * or when we go from R_PRIMARY to R_SECONDARY role. - */ -void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) -{ - int i, nr_elements; - unsigned int enr; - struct bio **bios; - struct drbd_atodb_wait wc; - - ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) - return; /* sorry, I don't have any act_log etc... */ - - wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); - - nr_elements = mdev->act_log->nr_elements; - - /* GFP_KERNEL, we are not in anyone's write-out path */ - bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); - if (!bios) - goto submit_one_by_one; - - atomic_set(&wc.count, 0); - init_completion(&wc.io_done); - wc.mdev = mdev; - wc.error = 0; - - for (i = 0; i < nr_elements; i++) { - enr = lc_element_by_index(mdev->act_log, i)->lc_number; - if (enr == LC_FREE) - continue; - /* next statement also does atomic_inc wc.count and local_cnt */ - if (atodb_prepare_unless_covered(mdev, bios, - enr/AL_EXT_PER_BM_SECT, - &wc)) - goto free_bios_submit_one_by_one; - } - - /* unnecessary optimization? */ - lc_unlock(mdev->act_log); - wake_up(&mdev->al_wait); - - /* all prepared, submit them */ - for (i = 0; i < nr_elements; i++) { - if (bios[i] == NULL) - break; - if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { - bios[i]->bi_rw = WRITE; - bio_endio(bios[i], -EIO); - } else { - submit_bio(WRITE, bios[i]); - } - } - - drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); - - /* always (try to) flush bitmap to stable storage */ - drbd_md_flush(mdev); - - /* In case we did not submit a single IO do not wait for - * them to complete. ( Because we would wait forever here. ) - * - * In case we had IOs and they are already complete, there - * is not point in waiting anyways. - * Therefore this if () ... 
*/ - if (atomic_read(&wc.count)) - wait_for_completion(&wc.io_done); - put_ldev(mdev); - - kfree(bios); - return; - - free_bios_submit_one_by_one: - /* free everything by calling the endio callback directly. */ - for (i = 0; i < nr_elements && bios[i]; i++) - bio_endio(bios[i], 0); - - kfree(bios); +static int w_al_write_transaction(struct drbd_work *w, int unused) +{ + struct update_al_work *aw = container_of(w, struct update_al_work, w); + struct drbd_conf *mdev = w->mdev; + int err; - submit_one_by_one: - dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); + err = _al_write_transaction(mdev); + aw->err = err; + complete(&aw->event); - for (i = 0; i < mdev->act_log->nr_elements; i++) { - enr = lc_element_by_index(mdev->act_log, i)->lc_number; - if (enr == LC_FREE) - continue; - /* Really slow: if we have al-extents 16..19 active, - * sector 4 will be written four times! Synchronous! */ - drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); - } - - lc_unlock(mdev->act_log); - wake_up(&mdev->al_wait); - put_ldev(mdev); + return err != -EIO ? err : 0; } -/** - * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents - * @mdev: DRBD device. - */ -void drbd_al_apply_to_bm(struct drbd_conf *mdev) +/* Calls from worker context (see w_restart_disk_io()) need to write the + transaction directly. Others came through generic_make_request(), + those need to delegate it to the worker. 
*/ +static int al_write_transaction(struct drbd_conf *mdev) { - unsigned int enr; - unsigned long add = 0; - char ppb[10]; - int i; - - wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + struct update_al_work al_work; - for (i = 0; i < mdev->act_log->nr_elements; i++) { - enr = lc_element_by_index(mdev->act_log, i)->lc_number; - if (enr == LC_FREE) - continue; - add += drbd_bm_ALe_set_all(mdev, enr); - } + if (current == mdev->tconn->worker.task) + return _al_write_transaction(mdev); - lc_unlock(mdev->act_log); - wake_up(&mdev->al_wait); + init_completion(&al_work.event); + al_work.w.cb = w_al_write_transaction; + al_work.w.mdev = mdev; + drbd_queue_work_front(&mdev->tconn->data.work, &al_work.w); + wait_for_completion(&al_work.event); - dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", - ppsize(ppb, Bit2KB(add))); + return al_work.err; } static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) @@ -821,7 +523,7 @@ struct lc_element *al_ext; int i; - D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); + D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags)); for (i = 0; i < mdev->act_log->nr_elements; i++) { al_ext = lc_element_by_index(mdev->act_log, i); @@ -833,18 +535,20 @@ wake_up(&mdev->al_wait); } -STATIC int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) +STATIC int w_update_odbm(struct drbd_work *w, int unused) { struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); + struct drbd_conf *mdev = w->mdev; + struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; if (!get_ldev(mdev)) { if (DRBD_ratelimit(5*HZ, 5)) dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); kfree(udw); - return 1; + return 0; } - drbd_bm_write_sect(mdev, udw->enr); + drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); put_ldev(mdev); kfree(udw); @@ -859,9 +563,9 @@ break; } } - drbd_bcast_sync_progress(mdev); + drbd_bcast_event(mdev, &sib); - return 1; + return 0; } @@ 
-894,16 +598,20 @@ else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " - "rs_failed=%d count=%d\n", + dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, - ext->rs_failed, count); - dump_stack(); + ext->rs_failed, count, + drbd_conn_str(mdev->state.conn)); - lc_put(mdev->resync, &ext->lce); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return; + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. + * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(mdev, enr); } } else { /* Normally this element should be in the cache, @@ -924,11 +632,12 @@ dev_warn(DEV, "Kicking resync_lru element enr=%u " "out with rs_failed=%d\n", ext->lce.lc_number, ext->rs_failed); - set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); } ext->rs_left = rs_left; ext->rs_failed = success ? 0 : count; - lc_changed(mdev->resync, &ext->lce); + /* we don't keep a persistent log of the resync lru, + * we can commit any change right away. */ + lc_committed(mdev->resync); } lc_put(mdev->resync, &ext->lce); /* no race, we are within the al_lock! 
*/ @@ -940,10 +649,10 @@ if (udw) { udw->enr = ext->lce.lc_number; udw->w.cb = w_update_odbm; - drbd_queue_work_front(&mdev->data.work, &udw->w); + udw->w.mdev = mdev; + drbd_queue_work_front(&mdev->tconn->data.work, &udw->w); } else { dev_warn(DEV, "Could not kmalloc an udw\n"); - set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); } } } else { @@ -954,6 +663,22 @@ } } +void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go) +{ + unsigned long now = jiffies; + unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; + int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go && + mdev->state.conn != C_PAUSED_SYNC_T && + mdev->state.conn != C_PAUSED_SYNC_S) { + mdev->rs_mark_time[next] = now; + mdev->rs_mark_left[next] = still_to_go; + mdev->rs_last_mark = next; + } + } +} + /* clear the bit corresponding to the piece of storage in question: * size byte of data starting from sector. Only clear a bits of the affected * one ore more _aligned_ BM_BLOCK_SIZE blocks. 
@@ -971,7 +696,7 @@ int wake_up = 0; unsigned long flags; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -979,8 +704,10 @@ nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; - ERR_IF(sector >= nr_sectors) return; - ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + if (!expect(sector < nr_sectors)) + return; + if (!expect(esector < nr_sectors)) + esector = nr_sectors - 1; lbnr = BM_SECT_TO_BIT(nr_sectors-1); @@ -995,10 +722,6 @@ ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); - trace_drbd_resync(mdev, TRACE_LVL_METRICS, - "drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", - (unsigned long long)sector, size, sbnr, ebnr); - if (sbnr > ebnr) return; @@ -1006,29 +729,18 @@ * ok, (capacity & 7) != 0 sometimes, but who cares... * we count rs_{total,left} in bits, not sectors. */ - spin_lock_irqsave(&mdev->al_lock, flags); count = drbd_bm_clear_bits(mdev, sbnr, ebnr); - if (count) { - /* we need the lock for drbd_try_clear_on_disk_bm */ - if (jiffies - mdev->rs_mark_time > HZ*10) { - /* should be rolling marks, - * but we estimate only anyways. 
*/ - if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && - mdev->state.conn != C_PAUSED_SYNC_T && - mdev->state.conn != C_PAUSED_SYNC_S) { - mdev->rs_mark_time = jiffies; - mdev->rs_mark_left = drbd_bm_total_weight(mdev); - } - } - if (get_ldev(mdev)) { - drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); - put_ldev(mdev); - } + if (count && get_ldev(mdev)) { + drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); + spin_lock_irqsave(&mdev->al_lock, flags); + drbd_try_clear_on_disk_bm(mdev, sector, count, true); + spin_unlock_irqrestore(&mdev->al_lock, flags); + /* just wake_up unconditional now, various lc_chaged(), * lc_put() in drbd_try_clear_on_disk_bm(). */ wake_up = 1; + put_ldev(mdev); } - spin_unlock_irqrestore(&mdev->al_lock, flags); if (wake_up) wake_up(&mdev->al_wait); } @@ -1036,47 +748,41 @@ /* * this is intended to set one request worth of data out of sync. * affects at least 1 bit, - * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. + * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. * * called by tl_clear and drbd_send_dblock (==drbd_make_request). * so this can be _any_ process. 
*/ -void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, +int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line) { - unsigned long sbnr, ebnr, lbnr, flags; + unsigned long sbnr, ebnr, flags; sector_t esector, nr_sectors; - unsigned int enr, count; + unsigned int enr, count = 0; struct lc_element *e; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "sector: %llus, size: %d\n", (unsigned long long)sector, size); - return; + return 0; } if (!get_ldev(mdev)) - return; /* no disk, no metadata, no bitmap to set bits in */ + return 0; /* no disk, no metadata, no bitmap to set bits in */ nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; - ERR_IF(sector >= nr_sectors) + if (!expect(sector < nr_sectors)) goto out; - ERR_IF(esector >= nr_sectors) - esector = (nr_sectors-1); - - lbnr = BM_SECT_TO_BIT(nr_sectors-1); + if (!expect(esector < nr_sectors)) + esector = nr_sectors - 1; /* we set it out of sync, * we do not need to round anything here */ sbnr = BM_SECT_TO_BIT(sector); ebnr = BM_SECT_TO_BIT(esector); - trace_drbd_resync(mdev, TRACE_LVL_METRICS, - "drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", - (unsigned long long)sector, size, sbnr, ebnr); - /* ok, (capacity & 7) != 0 sometimes, but who cares... * we count rs_{total,left} in bits, not sectors. 
*/ spin_lock_irqsave(&mdev->al_lock, flags); @@ -1090,6 +796,8 @@ out: put_ldev(mdev); + + return count; } static @@ -1111,7 +819,7 @@ if (bm_ext->lce.lc_number != enr) { bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); bm_ext->rs_failed = 0; - lc_changed(mdev->resync, &bm_ext->lce); + lc_committed(mdev->resync); wakeup = 1; } if (bm_ext->lce.refcnt == 1) @@ -1127,7 +835,7 @@ if (rs_flags & LC_STARVING) dev_warn(DEV, "Have to wait for element" " (resync LRU too small?)\n"); - BUG_ON(rs_flags & LC_DIRTY); + BUG_ON(rs_flags & LC_LOCKED); } return bm_ext; @@ -1135,26 +843,12 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) { - struct lc_element *al_ext; - int rv = 0; + int rv; spin_lock_irq(&mdev->al_lock); - if (unlikely(enr == mdev->act_log->new_number)) - rv = 1; - else { - al_ext = lc_find(mdev->act_log, enr); - if (al_ext) { - if (al_ext->refcnt) - rv = 1; - } - } + rv = lc_is_used(mdev->act_log, enr); spin_unlock_irq(&mdev->al_lock); - /* - if (unlikely(rv)) { - dev_info(DEV, "Delaying sync read until app's write is done\n"); - } - */ return rv; } @@ -1163,44 +857,50 @@ * @mdev: DRBD device. * @sector: The sector number. * - * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. + * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. */ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) { unsigned int enr = BM_SECT_TO_EXT(sector); struct bm_extent *bm_ext; int i, sig; + int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. + 200 times -> 20 seconds. 
*/ - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "drbd_rs_begin_io: sector=%llus (rs_end=%d)\n", - (unsigned long long)sector, enr); - +retry: sig = wait_event_interruptible(mdev->al_wait, (bm_ext = _bme_get(mdev, enr))); if (sig) - return 0; + return -EINTR; if (test_bit(BME_LOCKED, &bm_ext->flags)) - return 1; + return 0; for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(mdev->al_wait, - !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); - if (sig) { + !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) || + test_bit(BME_PRIORITY, &bm_ext->flags)); + + if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { spin_lock_irq(&mdev->al_lock); if (lc_put(mdev->resync, &bm_ext->lce) == 0) { - clear_bit(BME_NO_WRITES, &bm_ext->flags); + bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ mdev->resync_locked--; wake_up(&mdev->al_wait); } spin_unlock_irq(&mdev->al_lock); - return 0; + if (sig) + return -EINTR; + if (schedule_timeout_interruptible(HZ/10)) + return -EINTR; + if (sa && --sa == 0) + dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec." + "Resync stalled?\n"); + goto retry; } } - set_bit(BME_LOCKED, &bm_ext->flags); - - return 1; + return 0; } /** @@ -1220,9 +920,6 @@ struct bm_extent *bm_ext; int i; - trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n", - (unsigned long long)sector); - spin_lock_irq(&mdev->al_lock); if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { /* in case you have very heavy scattered io, it may @@ -1239,10 +936,6 @@ * we also have to wake_up */ - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "dropping %u, apparently got 'synced' by application io\n", - mdev->resync_wenr); - e = lc_find(mdev->resync, mdev->resync_wenr); bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; if (bm_ext) { @@ -1270,21 +963,14 @@ * but then could not set BME_LOCKED, * so we tried again. * drop the extra reference. 
*/ - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "dropping extra reference on %u\n", enr); - bm_ext->lce.refcnt--; D_ASSERT(bm_ext->lce.refcnt > 0); } goto check_al; } else { /* do we rather want to try later? */ - if (mdev->resync_locked > mdev->resync->nr_elements-3) { - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "resync_locked = %u!\n", mdev->resync_locked); - + if (mdev->resync_locked > mdev->resync->nr_elements-3) goto try_again; - } /* Do or do not. There is no try. -- Yoda */ e = lc_get(mdev->resync, enr); bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; @@ -1293,13 +979,13 @@ if (rs_flags & LC_STARVING) dev_warn(DEV, "Have to wait for element" " (resync LRU too small?)\n"); - BUG_ON(rs_flags & LC_DIRTY); + BUG_ON(rs_flags & LC_LOCKED); goto try_again; } if (bm_ext->lce.lc_number != enr) { bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); bm_ext->rs_failed = 0; - lc_changed(mdev->resync, &bm_ext->lce); + lc_committed(mdev->resync); wake_up(&mdev->al_wait); D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); } @@ -1309,11 +995,7 @@ goto check_al; } check_al: - trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr); - for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { - if (unlikely(al_enr+i == mdev->act_log->new_number)) - goto try_again; if (lc_is_used(mdev->act_log, al_enr+i)) goto try_again; } @@ -1324,7 +1006,6 @@ return 0; try_again: - trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr); if (bm_ext) mdev->resync_wenr = enr; spin_unlock_irq(&mdev->al_lock); @@ -1338,10 +1019,6 @@ struct bm_extent *bm_ext; unsigned long flags; - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n", - (long long)sector, enr); - spin_lock_irqsave(&mdev->al_lock, flags); e = lc_find(mdev->resync, enr); bm_ext = e ? 
lc_entry(e, struct bm_extent, lce) : NULL; @@ -1361,8 +1038,7 @@ } if (lc_put(mdev->resync, &bm_ext->lce) == 0) { - clear_bit(BME_LOCKED, &bm_ext->flags); - clear_bit(BME_NO_WRITES, &bm_ext->flags); + bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ mdev->resync_locked--; wake_up(&mdev->al_wait); } @@ -1376,8 +1052,6 @@ */ void drbd_rs_cancel_all(struct drbd_conf *mdev) { - trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n"); - spin_lock_irq(&mdev->al_lock); if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ @@ -1403,15 +1077,13 @@ struct bm_extent *bm_ext; int i; - trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n"); - spin_lock_irq(&mdev->al_lock); if (get_ldev_if_state(mdev, D_FAILED)) { /* ok, ->resync is there. */ for (i = 0; i < mdev->resync->nr_elements; i++) { e = lc_element_by_index(mdev->resync, i); - bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + bm_ext = lc_entry(e, struct bm_extent, lce); if (bm_ext->lce.lc_number == LC_FREE) continue; if (bm_ext->lce.lc_number == mdev->resync_wenr) { @@ -1457,11 +1129,7 @@ sector_t esector, nr_sectors; int wake_up = 0; - trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, - "drbd_rs_failed_io: sector=%llus, size=%u\n", - (unsigned long long)sector, size); - - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -1469,8 +1137,10 @@ nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; - ERR_IF(sector >= nr_sectors) return; - ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + if (!expect(sector < nr_sectors)) + return; + if (!expect(esector < nr_sectors)) + esector = nr_sectors - 1; lbnr = BM_SECT_TO_BIT(nr_sectors-1); @@ -1498,7 +1168,7 @@ mdev->rs_failed += count; if (get_ldev(mdev)) { - 
drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); + drbd_try_clear_on_disk_bm(mdev, sector, count, false); put_ldev(mdev); } diff -Nru drbd8-8.3.7/drbd/drbd_bitmap.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_bitmap.c --- drbd8-8.3.7/drbd/drbd_bitmap.c 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_bitmap.c 2012-02-02 14:09:14.000000000 +0000 @@ -26,19 +26,64 @@ #include #include #include +#include +#include #include + +#include + #include "drbd_int.h" +/* See the ifdefs and comments inside that header file. + * On recent kernels this is not needed. */ +#include "compat/bitops.h" + /* OPAQUE outside this file! * interface defined in drbd_int.h * convention: * function name drbd_bm_... => used elsewhere, "public". * function name bm_... => internal to implementation, "private". + */ - * Note that since find_first_bit returns int, at the current granularity of - * the bitmap (4KB per byte), this implementation "only" supports up to - * 1<<(32+12) == 16 TB... + +/* + * LIMITATIONS: + * We want to support >= peta byte of backend storage, while for now still using + * a granularity of one bit per 4KiB of storage. + * 1 << 50 bytes backend storage (1 PiB) + * 1 << (50 - 12) bits needed + * 38 --> we need u64 to index and count bits + * 1 << (38 - 3) bitmap bytes needed + * 35 --> we still need u64 to index and count bytes + * (that's 32 GiB of bitmap for 1 PiB storage) + * 1 << (35 - 2) 32bit longs needed + * 33 --> we'd even need u64 to index and count 32bit long words. + * 1 << (35 - 3) 64bit longs needed + * 32 --> we could get away with a 32bit unsigned int to index and count + * 64bit long words, but I rather stay with unsigned long for now. + * We probably should neither count nor point to bytes or long words + * directly, but either by bitnumber, or by page index and offset. + * 1 << (35 - 12) + * 22 --> we need that much 4KiB pages of bitmap. 
+ * 1 << (22 + 3) --> on a 64bit arch, + * we need 32 MiB to store the array of page pointers. + * + * Because I'm lazy, and because the resulting patch was too large, too ugly + * and still incomplete, on 32bit we still "only" support 16 TiB (minus some), + * (1 << 32) bits * 4k storage. + * + + * bitmap storage and IO: + * Bitmap is stored little endian on disk, and is kept little endian in + * core memory. Currently we still hold the full bitmap in core as long + * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage + * seems excessive. + * + * We plan to reduce the amount of in-core bitmap pages by paging them in + * and out against their on-disk location as necessary, but need to make + * sure we don't cause too much meta data IO, and must not deadlock in + * tight memory situations. This needs some more work. */ /* @@ -54,40 +99,25 @@ struct drbd_bitmap { struct page **bm_pages; spinlock_t bm_lock; - /* WARNING unsigned long bm_*: - * 32bit number of bit offset is just enough for 512 MB bitmap. - * it will blow up if we make the bitmap bigger... - * not that it makes much sense to have a bitmap that large, - * rather change the granularity to 16k or 64k or something. - * (that implies other problems, however...) - */ + + /* see LIMITATIONS: above */ + unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? 
*/ unsigned long bm_bits; size_t bm_words; size_t bm_number_of_pages; sector_t bm_dev_capacity; - struct semaphore bm_change; /* serializes resize operations */ + struct mutex bm_change; /* serializes resize operations */ - atomic_t bm_async_io; - wait_queue_head_t bm_io_wait; + wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */ - unsigned long bm_flags; + enum bm_flag bm_flags; /* debugging aid, in case we are still racy somewhere */ char *bm_why; struct task_struct *bm_task; }; -/* definition of bits in bm_flags */ -#define BM_LOCKED 0 -#define BM_MD_IO_ERROR 1 -#define BM_P_VMALLOCED 2 - -static int bm_is_locked(struct drbd_bitmap *b) -{ - return test_bit(BM_LOCKED, &b->bm_flags); -} - #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) { @@ -95,16 +125,12 @@ if (!DRBD_ratelimit(5*HZ, 5)) return; dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", - current == mdev->receiver.task ? "receiver" : - current == mdev->asender.task ? "asender" : - current == mdev->worker.task ? "worker" : current->comm, - func, b->bm_why ?: "?", - b->bm_task == mdev->receiver.task ? "receiver" : - b->bm_task == mdev->asender.task ? "asender" : - b->bm_task == mdev->worker.task ? "worker" : "?"); + drbd_task_to_thread_name(mdev->tconn, current), + func, b->bm_why ?: "?", + drbd_task_to_thread_name(mdev->tconn, b->bm_task)); } -void drbd_bm_lock(struct drbd_conf *mdev, char *why) +void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags) { struct drbd_bitmap *b = mdev->bitmap; int trylock_failed; @@ -114,21 +140,18 @@ return; } - trylock_failed = down_trylock(&b->bm_change); + trylock_failed = !mutex_trylock(&b->bm_change); if (trylock_failed) { dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", - current == mdev->receiver.task ? "receiver" : - current == mdev->asender.task ? "asender" : - current == mdev->worker.task ? 
"worker" : current->comm, - why, b->bm_why ?: "?", - b->bm_task == mdev->receiver.task ? "receiver" : - b->bm_task == mdev->asender.task ? "asender" : - b->bm_task == mdev->worker.task ? "worker" : "?"); - down(&b->bm_change); + drbd_task_to_thread_name(mdev->tconn, current), + why, b->bm_why ?: "?", + drbd_task_to_thread_name(mdev->tconn, b->bm_task)); + mutex_lock(&b->bm_change); } - if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) + if (BM_LOCKED_MASK & b->bm_flags) dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); + b->bm_flags |= flags & BM_LOCKED_MASK; b->bm_why = why; b->bm_task = current; @@ -142,48 +165,160 @@ return; } - if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) + if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags)) dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); + b->bm_flags &= ~BM_LOCKED_MASK; b->bm_why = NULL; b->bm_task = NULL; - up(&b->bm_change); + mutex_unlock(&b->bm_change); +} + +/* we store some "meta" info about our pages in page->private */ +/* at a granularity of 4k storage per bitmap bit: + * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks + * 1<<38 bits, + * 1<<23 4k bitmap pages. + * Use 24 bits as page index, covers 2 peta byte storage + * at a granularity of 4k per bit. + * Used to report the failed page idx on io error from the endio handlers. + */ +#define BM_PAGE_IDX_MASK ((1UL<<24)-1) +/* this page is currently read in, or written back */ +#define BM_PAGE_IO_LOCK 31 +/* if there has been an IO error for this page */ +#define BM_PAGE_IO_ERROR 30 +/* this is to be able to intelligently skip disk IO, + * set if bits have been set since last IO. */ +#define BM_PAGE_NEED_WRITEOUT 29 +/* to mark for lazy writeout once syncer cleared all clearable bits, + * we if bits have been cleared since last IO. 
*/ +#define BM_PAGE_LAZY_WRITEOUT 28 +/* pages marked with this "HINT" will be considered for writeout + * on activity log transactions */ +#define BM_PAGE_HINT_WRITEOUT 27 + +/* store_page_idx uses non-atomic assignment. It is only used directly after + * allocating the page. All other bm_set_page_* and bm_clear_page_* need to + * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap + * changes) may happen from various contexts, and wait_on_bit/wake_up_bit + * requires it all to be atomic as well. */ +static void bm_store_page_idx(struct page *page, unsigned long idx) +{ + BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); + page_private(page) |= idx; } -#define bm_end_info(ignored...) ((void)(0)) +static unsigned long bm_page_to_idx(struct page *page) +{ + return page_private(page) & BM_PAGE_IDX_MASK; +} -#if 0 -#define catch_oob_access_start() do { \ - do { \ - if ((bm-p_addr) >= PAGE_SIZE/sizeof(long)) { \ - printk(KERN_ALERT "drbd_bitmap.c:%u %s: p_addr:%p bm:%p %d\n", \ - __LINE__ , __func__ , p_addr, bm, (bm-p_addr)); \ - break; \ - } -#define catch_oob_access_end() \ - } while (0); } while (0) -#else -#define catch_oob_access_start() do { -#define catch_oob_access_end() } while (0) -#endif +/* As is very unlikely that the same page is under IO from more than one + * context, we can get away with a bit per page and one wait queue per bitmap. 
+ */ +static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr) +{ + struct drbd_bitmap *b = mdev->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr)); +} -/* word offset to long pointer */ -STATIC unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) +static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr) +{ + struct drbd_bitmap *b = mdev->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + clear_bit_unlock(BM_PAGE_IO_LOCK, addr); + wake_up(&mdev->bitmap->bm_io_wait); +} + +/* set _before_ submit_io, so it may be reset due to being changed + * while this page is in flight... will get submitted later again */ +static void bm_set_page_unchanged(struct page *page) +{ + /* use cmpxchg? */ + clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); + clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +static void bm_set_page_need_writeout(struct page *page) +{ + set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); +} + +/** + * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout + * @mdev: DRBD device. + * @page_nr: the bitmap page to mark with the "hint" flag + * + * From within an activity log transaction, we mark a few pages with these + * hints, then call drbd_bm_write_hinted(), which will only write out changed + * pages which are flagged with this mark. 
+ */ +void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr) { struct page *page; - unsigned long page_nr; + if (page_nr >= mdev->bitmap->bm_number_of_pages) { + dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n", + page_nr, (int)mdev->bitmap->bm_number_of_pages); + return; + } + page = mdev->bitmap->bm_pages[page_nr]; + set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); +} + +static int bm_test_page_unchanged(struct page *page) +{ + volatile const unsigned long *addr = &page_private(page); + return (*addr & ((1UL<> PAGE_SHIFT; */ - page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); + unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3); BUG_ON(page_nr >= b->bm_number_of_pages); - page = b->bm_pages[page_nr]; + return page_nr; +} +static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr) +{ + /* page_nr = (bitnr/8) >> PAGE_SHIFT; */ + unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + return page_nr; +} + +static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km) +{ + struct page *page = b->bm_pages[idx]; return (unsigned long *) kmap_atomic(page, km); } -static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) +static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) { - return __bm_map_paddr(b, offset, KM_IRQ1); + return __bm_map_pidx(b, idx, KM_IRQ1); } static void __bm_unmap(unsigned long *p_addr, const enum km_type km) @@ -201,7 +336,7 @@ /* word offset from start of bitmap to word number _in_page_ * modulo longs per page #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) - hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) + hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) so do it explicitly: */ #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) @@ -215,6 +350,7 @@ * to be able to report device specific. 
*/ + STATIC void bm_free_pages(struct page **pages, unsigned long number) { unsigned long i; @@ -259,8 +395,8 @@ /* Trying kmalloc first, falling back to vmalloc. * GFP_KERNEL is ok, as this is done when a lower level disk is - * "attached" to the drbd. Context is receiver thread or cqueue - * thread. As we have no disk yet, we are not in the IO path, + * "attached" to the drbd. Context is receiver thread or drbdsetup / + * netlink process. As we have no disk yet, we are not in the IO path, * not even the IO path of the peer. */ bytes = sizeof(struct page *)*want; new_pages = kmalloc(bytes, GFP_KERNEL); @@ -282,6 +418,9 @@ bm_vk_free(new_pages, vmalloced); return NULL; } + /* we want to know which page it is + * from the endio handlers */ + bm_store_page_idx(page, i); new_pages[i] = page; } } else { @@ -293,9 +432,9 @@ } if (vmalloced) - set_bit(BM_P_VMALLOCED, &b->bm_flags); + b->bm_flags |= BM_P_VMALLOCED; else - clear_bit(BM_P_VMALLOCED, &b->bm_flags); + b->bm_flags &= ~BM_P_VMALLOCED; return new_pages; } @@ -312,7 +451,7 @@ if (!b) return -ENOMEM; spin_lock_init(&b->bm_lock); - init_MUTEX(&b->bm_change); + mutex_init(&b->bm_change); init_waitqueue_head(&b->bm_io_wait); mdev->bitmap = b; @@ -322,7 +461,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev) { - ERR_IF(!mdev->bitmap) return 0; + if (!expect(mdev->bitmap)) + return 0; return mdev->bitmap->bm_dev_capacity; } @@ -330,9 +470,10 @@ */ void drbd_bm_cleanup(struct drbd_conf *mdev) { - ERR_IF (!mdev->bitmap) return; + if (!expect(mdev->bitmap)) + return; bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); - bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); + bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags)); kfree(mdev->bitmap); mdev->bitmap = NULL; } @@ -342,28 +483,41 @@ * this masks out the remaining bits. * Returns the number of bits cleared. 
*/ +#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) STATIC int bm_clear_surplus(struct drbd_bitmap *b) { - const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; - size_t w = b->bm_bits >> LN2_BPL; - int cleared = 0; + unsigned long mask; unsigned long *p_addr, *bm; + int tmp; + int cleared = 0; - p_addr = bm_map_paddr(b, w); - bm = p_addr + MLPP(w); - if (w < b->bm_words) { - catch_oob_access_start(); + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. 
*/ cleared = hweight_long(*bm & ~mask); *bm &= mask; - catch_oob_access_end(); - w++; bm++; + bm++; } - if (w < b->bm_words) { - catch_oob_access_start(); + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ cleared += hweight_long(*bm); *bm = 0; - catch_oob_access_end(); } bm_unmap(p_addr); return cleared; @@ -371,72 +525,75 @@ STATIC void bm_set_surplus(struct drbd_bitmap *b) { - const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; - size_t w = b->bm_bits >> LN2_BPL; + unsigned long mask; unsigned long *p_addr, *bm; + int tmp; - p_addr = bm_map_paddr(b, w); - bm = p_addr + MLPP(w); - if (w < b->bm_words) { - catch_oob_access_start(); + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. 
*/ *bm |= ~mask; - bm++; w++; - catch_oob_access_end(); + bm++; } - if (w < b->bm_words) { - catch_oob_access_start(); - *bm = ~(0UL); - catch_oob_access_end(); + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ + *bm = ~0UL; } bm_unmap(p_addr); } -STATIC unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) +/* you better not modify the bitmap while this is running, + * or its results will be stale */ +STATIC unsigned long bm_count_bits(struct drbd_bitmap *b) { - unsigned long *p_addr, *bm, offset = 0; + unsigned long *p_addr; unsigned long bits = 0; - unsigned long i, do_now; + unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; + int idx, i, last_word; - while (offset < b->bm_words) { - i = do_now = min_t(size_t, b->bm_words-offset, LWPP); - p_addr = __bm_map_paddr(b, offset, KM_USER0); - bm = p_addr + MLPP(offset); - while (i--) { - catch_oob_access_start(); -#ifndef __LITTLE_ENDIAN - if (swap_endian) - *bm = lel_to_cpu(*bm); -#endif - bits += hweight_long(*bm++); - catch_oob_access_end(); - } + /* all but last page */ + for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { + p_addr = __bm_map_pidx(b, idx, KM_USER0); + for (i = 0; i < LWPP; i++) + bits += hweight_long(p_addr[i]); __bm_unmap(p_addr, KM_USER0); - offset += do_now; cond_resched(); } - + /* last (or only) page */ + last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; + p_addr = __bm_map_pidx(b, idx, KM_USER0); + for (i = 0; i < last_word; i++) + bits += hweight_long(p_addr[i]); + p_addr[last_word] &= cpu_to_lel(mask); + bits += hweight_long(p_addr[last_word]); + /* 32bit arch, may have an unused padding long */ + if (BITS_PER_LONG == 32 && (last_word & 1) == 0) + p_addr[last_word+1] = 0; + __bm_unmap(p_addr, KM_USER0); return bits; } -static unsigned long bm_count_bits(struct drbd_bitmap *b) -{ - return __bm_count_bits(b, 0); -} - -static unsigned 
long bm_count_bits_swap_endian(struct drbd_bitmap *b) -{ - return __bm_count_bits(b, 1); -} - /* offset and len in long words.*/ STATIC void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) { unsigned long *p_addr, *bm; + unsigned int idx; size_t do_now, end; -#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) - end = offset + len; if (end > b->bm_words) { @@ -446,17 +603,16 @@ while (offset < end) { do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; - p_addr = bm_map_paddr(b, offset); + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); bm = p_addr + MLPP(offset); - catch_oob_access_start(); if (bm+do_now > p_addr + LWPP) { printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", p_addr, bm, (int)do_now); - break; /* breaks to after catch_oob_access_end() only! */ - } - memset(bm, c, do_now * sizeof(long)); - catch_oob_access_end(); + } else + memset(bm, c, do_now * sizeof(long)); bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); offset += do_now; } } @@ -469,18 +625,19 @@ * In case this is actually a resize, we copy the old bitmap into the new one. * Otherwise, the bitmap is initialized to all bits set. 
*/ -int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) +int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) { struct drbd_bitmap *b = mdev->bitmap; - unsigned long bits, words, owords, obits, *p_addr, *bm; + unsigned long bits, words, owords, obits; unsigned long want, have, onpages; /* number of pages */ struct page **npages, **opages = NULL; int err = 0, growing; int opages_vmalloced; - ERR_IF(!b) return -ENOMEM; + if (!expect(b)) + return -ENOMEM; - drbd_bm_lock(mdev, "resize"); + drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK); dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", (unsigned long long)capacity); @@ -488,7 +645,7 @@ if (capacity == b->bm_dev_capacity) goto out; - opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); + opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); if (capacity == 0) { spin_lock_irq(&b->bm_lock); @@ -516,18 +673,23 @@ words = ALIGN(bits, 64) >> LN2_BPL; if (get_ldev(mdev)) { - D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); + u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; put_ldev(mdev); + if (bits > bits_on_disk) { + dev_err(DEV, "Not enough space for bitmap: %lu > %lu\n", + (unsigned long)bits, (unsigned long)bits_on_disk); + err = -ENOSPC; + goto out; + } } - /* one extra long to catch off by one errors */ - want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; have = b->bm_number_of_pages; if (want == have) { D_ASSERT(b->bm_pages != NULL); npages = b->bm_pages; } else { - if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) + if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC)) npages = NULL; else npages = bm_realloc_pages(b, want); @@ -544,7 +706,7 @@ obits = b->bm_bits; growing = bits > obits; - if (opages) + if (opages && growing && set_new_bits) bm_set_surplus(b); b->bm_pages = npages; @@ -554,8 +716,12 @@ b->bm_dev_capacity = capacity; if (growing) { - 
bm_memset(b, owords, 0xff, words-owords); - b->bm_set += bits - obits; + if (set_new_bits) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } else + bm_memset(b, owords, 0x00, words-owords); + } if (want < have) { @@ -563,22 +729,14 @@ bm_free_pages(opages + want, have - want); } - p_addr = bm_map_paddr(b, words); - bm = p_addr + MLPP(words); - catch_oob_access_start(); - *bm = DRBD_MAGIC; - catch_oob_access_end(); - bm_unmap(p_addr); - (void)bm_clear_surplus(b); - bm_end_info(mdev, __func__); spin_unlock_irq(&b->bm_lock); if (opages != npages) bm_vk_free(opages, opages_vmalloced); if (!growing) b->bm_set = bm_count_bits(b); - dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); + dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want); out: drbd_bm_unlock(mdev); @@ -593,14 +751,16 @@ * * maybe bm_set should be atomic_t ? */ -static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) +unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; unsigned long s; unsigned long flags; - ERR_IF(!b) return 0; - ERR_IF(!b->bm_pages) return 0; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; spin_lock_irqsave(&b->bm_lock, flags); s = b->bm_set; @@ -623,8 +783,10 @@ size_t drbd_bm_words(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - ERR_IF(!b) return 0; - ERR_IF(!b->bm_pages) return 0; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; return b->bm_words; } @@ -632,7 +794,8 @@ unsigned long drbd_bm_bits(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - ERR_IF(!b) return 0; + if (!expect(b)) + return 0; return b->bm_bits; } @@ -648,12 +811,15 @@ struct drbd_bitmap *b = mdev->bitmap; unsigned long *p_addr, *bm; unsigned long word, bits; + unsigned int idx; size_t end, do_now; end = offset + number; - ERR_IF(!b) return; - ERR_IF(!b->bm_pages) return; + if (!expect(b)) + return; + if 
(!expect(b->bm_pages)) + return; if (number == 0) return; WARN_ON(offset >= b->bm_words); @@ -662,28 +828,26 @@ spin_lock_irq(&b->bm_lock); while (offset < end) { do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; - p_addr = bm_map_paddr(b, offset); + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); bm = p_addr + MLPP(offset); offset += do_now; while (do_now--) { - catch_oob_access_start(); bits = hweight_long(*bm); - word = *bm | lel_to_cpu(*buffer++); + word = *bm | *buffer++; *bm++ = word; b->bm_set += hweight_long(word) - bits; - catch_oob_access_end(); } bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); } /* with 32bit <-> 64bit cross-platform connect * this is only correct for current usage, * where we _know_ that we are 64 bit aligned, * and know that this function is used in this way, too... */ - if (end == b->bm_words) { + if (end == b->bm_words) b->bm_set -= bm_clear_surplus(b); - bm_end_info(mdev, __func__); - } spin_unlock_irq(&b->bm_lock); } @@ -699,8 +863,10 @@ end = offset + number; - ERR_IF(!b) return; - ERR_IF(!b->bm_pages) return; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; spin_lock_irq(&b->bm_lock); if ((offset >= b->bm_words) || @@ -713,14 +879,11 @@ else { while (offset < end) { do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; - p_addr = bm_map_paddr(b, offset); + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset)); bm = p_addr + MLPP(offset); offset += do_now; - while (do_now--) { - catch_oob_access_start(); - *buffer++ = cpu_to_lel(*bm++); - catch_oob_access_end(); - } + while (do_now--) + *buffer++ = *bm++; bm_unmap(p_addr); } } @@ -731,8 +894,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - ERR_IF(!b) return; - ERR_IF(!b->bm_pages) return; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; spin_lock_irq(&b->bm_lock); bm_memset(b, 0, 0xff, b->bm_words); @@ -745,8 +910,10 @@ void 
drbd_bm_clear_all(struct drbd_conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; - ERR_IF(!b) return; - ERR_IF(!b->bm_pages) return; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; spin_lock_irq(&b->bm_lock); bm_memset(b, 0, 0, b->bm_words); @@ -754,9 +921,32 @@ spin_unlock_irq(&b->bm_lock); } +struct bm_aio_ctx { + struct drbd_conf *mdev; + atomic_t in_flight; + unsigned int done; + unsigned flags; +#define BM_AIO_COPY_PAGES 1 +#define BM_AIO_WRITE_HINTED 2 + int error; + struct kref kref; +}; + +static void bm_aio_ctx_destroy(struct kref *kref) +{ + struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + + put_ldev(ctx->mdev); + kfree(ctx); +} + +/* bv_page may be a copy, or may be the original */ static BIO_ENDIO_TYPE bm_async_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error) { - struct drbd_bitmap *b = bio->bi_private; + struct bm_aio_ctx *ctx = bio->bi_private; + struct drbd_conf *mdev = ctx->mdev; + struct drbd_bitmap *b = mdev->bitmap; + unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); int uptodate = bio_flagged(bio, BIO_UPTODATE); BIO_ENDIO_FN_START; @@ -768,140 +958,215 @@ if (!error && !uptodate) error = -EIO; + if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && + !bm_test_page_unchanged(b->bm_pages[idx])) + dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx); + if (error) { - /* doh. what now? - * for now, set all bits, and flag MD_IO_ERROR */ - __set_bit(BM_MD_IO_ERROR, &b->bm_flags); + /* ctx error will hold the completed-last non-zero error code, + * in case error codes differ. */ + ctx->error = error; + bm_set_page_io_err(b->bm_pages[idx]); + /* Not identical to on disk version of it. + * Is BM_PAGE_IO_ERROR enough? 
*/ + if (DRBD_ratelimit(5*HZ, 5)) + dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n", + error, idx); + } else { + bm_clear_page_io_err(b->bm_pages[idx]); + dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx); } - if (atomic_dec_and_test(&b->bm_async_io)) - wake_up(&b->bm_io_wait); + + bm_page_unlock_io(mdev, idx); + + if (ctx->flags & BM_AIO_COPY_PAGES) + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); bio_put(bio); + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&mdev->misc_wait); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + } + BIO_ENDIO_FN_RETURN; } -STATIC void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) +STATIC void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) { - /* we are process context. we always get a bio */ - struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct bio *bio = bio_alloc_drbd(GFP_NOIO); + struct drbd_conf *mdev = ctx->mdev; + struct drbd_bitmap *b = mdev->bitmap; + struct page *page; unsigned int len; + sector_t on_disk_sector = mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); /* this might happen with very small - * flexible external meta data device */ + * flexible external meta data device, + * or with PAGE_SIZE > 4k */ len = min_t(unsigned int, PAGE_SIZE, (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); - D_DUMPLU(on_disk_sector); - D_DUMPI(len); + /* serialize IO on this page */ + bm_page_lock_io(mdev, page_nr); + /* before memcpy and submit, + * so it can be redirtied any time */ + bm_set_page_unchanged(b->bm_pages[page_nr]); + + if (ctx->flags & BM_AIO_COPY_PAGES) { + void *src, *dest; + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + dest = kmap_atomic(page, KM_USER0); + src = kmap_atomic(b->bm_pages[page_nr], KM_USER1); + memcpy(dest, src, PAGE_SIZE); + kunmap_atomic(src, KM_USER1); + 
kunmap_atomic(dest, KM_USER0); + bm_store_page_idx(page, page_nr); + } else + page = b->bm_pages[page_nr]; bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; - bio_add_page(bio, b->bm_pages[page_nr], len, 0); - bio->bi_private = b; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ + bio_add_page(bio, page, len, 0); + bio->bi_private = ctx; bio->bi_end_io = bm_async_io_complete; - if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { + if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { bio->bi_rw |= rw; bio_endio(bio, -EIO); } else { submit_bio(rw, bio); + /* this should not count as user activity and cause the + * resync to throttle -- see drbd_rs_should_slow_down(). */ + atomic_add(len >> 9, &mdev->rs_sect_ev); } } -# if defined(__LITTLE_ENDIAN) - /* nothing to do, on disk == in memory */ -# define bm_cpu_to_lel(x) ((void)0) -# else -void bm_cpu_to_lel(struct drbd_bitmap *b) -{ - /* need to cpu_to_lel all the pages ... - * this may be optimized by using - * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; - * the following is still not optimal, but better than nothing */ - unsigned int i; - unsigned long *p_addr, *bm; - if (b->bm_set == 0) { - /* no page at all; avoid swap if all is 0 */ - i = b->bm_number_of_pages; - } else if (b->bm_set == b->bm_bits) { - /* only the last page */ - i = b->bm_number_of_pages - 1; - } else { - /* all pages */ - i = 0; - } - for (; i < b->bm_number_of_pages; i++) { - p_addr = kmap_atomic(b->bm_pages[i], KM_USER0); - for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) - *bm = cpu_to_lel(*bm); - kunmap_atomic(p_addr, KM_USER0); - } -} -# endif -/* lel_to_cpu == cpu_to_lel */ -# define bm_lel_to_cpu(x) bm_cpu_to_lel(x) - /* * bm_rw: read/write the whole bitmap from/to its on disk location. 
*/ -STATIC int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) +STATIC int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { + struct bm_aio_ctx *ctx; struct drbd_bitmap *b = mdev->bitmap; - /* sector_t sector; */ - int bm_words, num_pages, i; + int num_pages, i, count = 0; unsigned long now; char ppb[10]; int err = 0; - WARN_ON(!bm_is_locked(b)); + /* + * We are protected against bitmap disappearing/resizing by holding an + * ldev reference (caller must have called get_ldev()). + * For read/write, we are protected against changes to the bitmap by + * the bitmap lock (see drbd_bitmap_io). + * For lazy writeout, we don't care for ongoing changes to the bitmap, + * as we submit copies of pages anyways. + */ + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; - /* no spinlock here, the drbd_bm_lock should be enough! */ + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + err = -ENODEV; + goto out; + } - bm_words = drbd_bm_words(mdev); - num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; + if (!ctx->flags) + WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); - /* on disk bitmap is little endian */ - if (rw == WRITE) - bm_cpu_to_lel(b); + num_pages = b->bm_number_of_pages; now = jiffies; - atomic_set(&b->bm_async_io, num_pages); - __clear_bit(BM_MD_IO_ERROR, &b->bm_flags); /* let the layers below us try to merge these bios... 
*/ - for (i = 0; i < num_pages; i++) - bm_page_io_async(mdev, b, i, rw); + for (i = 0; i < num_pages; i++) { + /* ignore completely unchanged pages */ + if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) + break; + if (rw & WRITE) { + if ((flags & BM_AIO_WRITE_HINTED) && + !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, + &page_private(b->bm_pages[i]))) + continue; + if (bm_test_page_unchanged(b->bm_pages[i])) { + dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); + continue; + } + /* during lazy writeout, + * ignore those pages not marked for lazy writeout. */ + if (lazy_writeout_upper_idx && + !bm_test_page_lazy_writeout(b->bm_pages[i])) { + dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i); + continue; + } + } + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i, rw); + ++count; + cond_resched(); + } + + /* + * We initialize ctx->in_flight to one to make sure bm_async_io_complete + * will not set ctx->done early, and decrement / test it here. If there + * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. + */ + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_disk_failure(mdev, &ctx->done); + else + kref_put(&ctx->kref, &bm_aio_ctx_destroy); - drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); - wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); + /* summary for global bitmap IO */ + if (flags == 0) + dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", + rw == WRITE ? "WRITE" : "READ", + count, jiffies - now); - if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { + if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); - drbd_chk_io_error(mdev, 1, TRUE); - err = -EIO; + drbd_chk_io_error(mdev, 1, true); + err = -EIO; /* ctx->error ? 
*/ } + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk failed during IO... */ + now = jiffies; if (rw == WRITE) { - /* swap back endianness */ - bm_lel_to_cpu(b); - /* flush bitmap to stable storage */ drbd_md_flush(mdev); } else /* rw == READ */ { - /* just read, if necessary adjust endianness */ - b->bm_set = bm_count_bits_swap_endian(b); + b->bm_set = bm_count_bits(b); dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", jiffies - now); } now = b->bm_set; - dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", - ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + if (flags == 0) + dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); +out: + kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; } @@ -911,116 +1176,156 @@ */ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, READ); + return bm_rw(mdev, READ, 0, 0); } /** * drbd_bm_write() - Write the whole bitmap to its on disk location. * @mdev: DRBD device. + * + * Will only write pages that have changed since last IO. */ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, WRITE); + return bm_rw(mdev, WRITE, 0, 0); } /** - * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap + * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed. * @mdev: DRBD device. - * @enr: Extent number in the resync lru (happens to be sector offset) + * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages + */ +int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. + * @mdev: DRBD device. 
+ */ +int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); +} + +/** + * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap + * @mdev: DRBD device. + * @idx: bitmap page index * - * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered - * by a single sector write. Therefore enr == sector offset from the - * start of the bitmap. - */ -int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) -{ - sector_t on_disk_sector = enr + mdev->ldev->md.md_offset - + mdev->ldev->md.bm_offset; - int bm_words, num_words, offset; - int err = 0; + * We don't want to special case on logical_block_size of the backend device, + * so we submit PAGE_SIZE aligned pieces. + * Note that on "most" systems, PAGE_SIZE is 4k. + * + * In case this becomes an issue on systems with larger PAGE_SIZE, + * we may want to change this again to write 4k aligned 4k pieces. + */ +int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) +{ + struct bm_aio_ctx *ctx; + int err; - mutex_lock(&mdev->md_io_mutex); - bm_words = drbd_bm_words(mdev); - offset = S2W(enr); /* word offset into bitmap */ - num_words = min(S2W(1), bm_words - offset); -#if DUMP_MD >= 3 - dev_info(DEV, "write_sect: sector=%lu offset=%u num_words=%u\n", - enr, offset, num_words); -#endif - if (num_words < S2W(1)) - memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); - drbd_bm_get_lel(mdev, offset, num_words, - page_address(mdev->md_io_page)); - if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { - int i; - err = -EIO; - dev_err(DEV, "IO ERROR writing bitmap sector %lu " - "(meta-disk sector %llus)\n", - enr, (unsigned long long)on_disk_sector); - drbd_chk_io_error(mdev, 1, TRUE); - for (i = 0; i < AL_EXT_PER_BM_SECT; i++) - drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); + if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { + 
dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); + return 0; + } + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = BM_AIO_COPY_PAGES, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); + err = -ENODEV; + goto out; } + + bm_page_io_async(ctx, idx, WRITE_SYNC); + wait_until_done_or_disk_failure(mdev, &ctx->done); + + if (ctx->error) + drbd_chk_io_error(mdev, 1, true); + /* that should force detach, so the in memory bitmap will be + * gone in a moment as well. */ + mdev->bm_writ_cnt++; - mutex_unlock(&mdev->md_io_mutex); + err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error; + out: + kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; } /* NOTE * find_first_bit returns int, we return unsigned long. - * should not make much difference anyways, but ... + * For this to work on 32bit arch with bitnumbers > (1<<32), + * we'd need to return u64, and get a whole lot of other places + * fixed where we still use unsigned long. * * this returns a bit number, NOT a sector! */ -#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1) static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, const int find_zero_bit, const enum km_type km) { struct drbd_bitmap *b = mdev->bitmap; - unsigned long i = -1UL; unsigned long *p_addr; - unsigned long bit_offset; /* bit offset of the mapped page. 
*/ + unsigned long bit_offset; + unsigned i; + if (bm_fo > b->bm_bits) { dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); + bm_fo = DRBD_END_OF_BITMAP; } else { while (bm_fo < b->bm_bits) { - unsigned long offset; - bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ - offset = bit_offset >> LN2_BPL; /* word offset of the page */ - p_addr = __bm_map_paddr(b, offset, km); + /* bit offset of the first bit in the page */ + bit_offset = bm_fo & ~BITS_PER_PAGE_MASK; + p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km); if (find_zero_bit) - i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); + i = find_next_zero_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); else - i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); + i = find_next_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); __bm_unmap(p_addr, km); if (i < PAGE_SIZE*8) { - i = bit_offset + i; - if (i >= b->bm_bits) + bm_fo = bit_offset + i; + if (bm_fo >= b->bm_bits) break; goto found; } bm_fo = bit_offset + PAGE_SIZE*8; } - i = -1UL; + bm_fo = DRBD_END_OF_BITMAP; } found: - return i; + return bm_fo; } static unsigned long bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, const int find_zero_bit) { struct drbd_bitmap *b = mdev->bitmap; - unsigned long i = -1UL; + unsigned long i = DRBD_END_OF_BITMAP; - ERR_IF(!b) return i; - ERR_IF(!b->bm_pages) return i; + if (!expect(b)) + return i; + if (!expect(b->bm_pages)) + return i; spin_lock_irq(&b->bm_lock); - if (bm_is_locked(b)) + if (BM_DONT_TEST & b->bm_flags) bm_print_lock_info(mdev); i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); @@ -1046,13 +1351,13 @@ * you must take drbd_bm_lock() first */ unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) { - /* WARN_ON(!bm_is_locked(mdev)); */ + /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */ return __bm_find_next(mdev, bm_fo, 0, KM_USER1); } unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned 
long bm_fo) { - /* WARN_ON(!bm_is_locked(mdev)); */ + /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */ return __bm_find_next(mdev, bm_fo, 1, KM_USER1); } @@ -1062,14 +1367,15 @@ * wants bitnr, not sector. * expected to be called for only a few bits (e - s about BITS_PER_LONG). * Must hold bitmap lock already. */ -int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, - unsigned long e, int val, const enum km_type km) +STATIC int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, + unsigned long e, int val) { struct drbd_bitmap *b = mdev->bitmap; unsigned long *p_addr = NULL; unsigned long bitnr; - unsigned long last_page_nr = -1UL; + unsigned int last_page_nr = -1U; int c = 0; + int changed_total = 0; if (e >= b->bm_bits) { dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", @@ -1077,44 +1383,56 @@ e = b->bm_bits ? b->bm_bits -1 : 0; } for (bitnr = s; bitnr <= e; bitnr++) { - unsigned long offset = bitnr>>LN2_BPL; - unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); + unsigned int page_nr = bm_bit_to_page_idx(b, bitnr); if (page_nr != last_page_nr) { if (p_addr) - __bm_unmap(p_addr, km); - p_addr = __bm_map_paddr(b, offset, km); + __bm_unmap(p_addr, KM_IRQ1); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + c = 0; + p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1); last_page_nr = page_nr; } if (val) - c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); + c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); else - c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); + c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); } if (p_addr) - __bm_unmap(p_addr, km); - b->bm_set += c; - return c; + __bm_unmap(p_addr, KM_IRQ1); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + 
bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + b->bm_set += changed_total; + return changed_total; } /* returns number of bits actually changed. * for val != 0, we change 0 -> 1, return code positive * for val == 0, we change 1 -> 0, return code negative * wants bitnr, not sector */ -int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, +STATIC int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, const unsigned long e, int val) { unsigned long flags; struct drbd_bitmap *b = mdev->bitmap; int c = 0; - ERR_IF(!b) return 1; - ERR_IF(!b->bm_pages) return 0; + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 0; spin_lock_irqsave(&b->bm_lock, flags); - if (bm_is_locked(b)) + if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) bm_print_lock_info(mdev); - c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); + c = __bm_change_bits_to(mdev, s, e, val); spin_unlock_irqrestore(&b->bm_lock, flags); return c; @@ -1139,16 +1457,25 @@ { int i; int bits; - unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); + int changed = 0; + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1); for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; - b->bm_set += BITS_PER_LONG - bits; + changed += BITS_PER_LONG - bits; + } + kunmap_atomic(paddr, KM_IRQ1); + if (changed) { + /* We only need lazy writeout, the information is still in the + * remote bitmap as well, and is reconstructed during the next + * bitmap exchange, if lost locally due to a crash. */ + bm_set_page_lazy_writeout(b->bm_pages[page_nr]); + b->bm_set += changed; } - kunmap_atomic(paddr, KM_USER0); } -/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. +/* Same thing as drbd_bm_set_bits, + * but more efficient for a large bit range. * You must first drbd_bm_lock(). * Can be called to set the whole bitmap in one go. * Sets bits from s to e _inclusive_. 
*/ @@ -1162,6 +1489,7 @@ * Do not use memset, because we must account for changes, * so we need to loop over the words with hweight() anyways. */ + struct drbd_bitmap *b = mdev->bitmap; unsigned long sl = ALIGN(s,BITS_PER_LONG); unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); int first_page; @@ -1172,15 +1500,19 @@ if (e - s <= 3*BITS_PER_LONG) { /* don't bother; el and sl may even be wrong. */ - __bm_change_bits_to(mdev, s, e, 1, KM_USER0); + spin_lock_irq(&b->bm_lock); + __bm_change_bits_to(mdev, s, e, 1); + spin_unlock_irq(&b->bm_lock); return; } /* difference is large enough that we can trust sl and el */ + spin_lock_irq(&b->bm_lock); + /* bits filling the current long */ if (sl) - __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); + __bm_change_bits_to(mdev, s, sl-1, 1); first_page = sl >> (3 + PAGE_SHIFT); last_page = el >> (3 + PAGE_SHIFT); @@ -1193,8 +1525,10 @@ /* first and full pages, unless first page == last page */ for (page_nr = first_page; page_nr < last_page; page_nr++) { bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); + spin_unlock_irq(&b->bm_lock); cond_resched(); first_word = 0; + spin_lock_irq(&b->bm_lock); } /* last page (respectively only page, for first page == last page) */ @@ -1207,7 +1541,8 @@ * it would trigger an assert in __bm_change_bits_to() */ if (el <= e) - __bm_change_bits_to(mdev, el, e, 1, KM_USER0); + __bm_change_bits_to(mdev, el, e, 1); + spin_unlock_irq(&b->bm_lock); } /* returns bit state @@ -1224,16 +1559,17 @@ unsigned long *p_addr; int i; - ERR_IF(!b) return 0; - ERR_IF(!b->bm_pages) return 0; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; spin_lock_irqsave(&b->bm_lock, flags); - if (bm_is_locked(b)) + if (BM_DONT_TEST & b->bm_flags) bm_print_lock_info(mdev); if (bitnr < b->bm_bits) { - unsigned long offset = bitnr>>LN2_BPL; - p_addr = bm_map_paddr(b, offset); - i = test_bit(bitnr & BPP_MASK, p_addr) ? 
1 : 0; + p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr)); + i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0; bm_unmap(p_addr); } else if (bitnr == b->bm_bits) { i = -1; @@ -1251,34 +1587,35 @@ { unsigned long flags; struct drbd_bitmap *b = mdev->bitmap; - unsigned long *p_addr = NULL, page_nr = -1; + unsigned long *p_addr = NULL; unsigned long bitnr; + unsigned int page_nr = -1U; int c = 0; - size_t w; /* If this is called without a bitmap, that is a bug. But just to be * robust in case we screwed up elsewhere, in that case pretend there * was one dirty bit in the requested area, so we won't try to do a * local read there (no bitmap probably implies no disk) */ - ERR_IF(!b) return 1; - ERR_IF(!b->bm_pages) return 1; + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 1; spin_lock_irqsave(&b->bm_lock, flags); - if (bm_is_locked(b)) + if (BM_DONT_TEST & b->bm_flags) bm_print_lock_info(mdev); for (bitnr = s; bitnr <= e; bitnr++) { - w = bitnr >> LN2_BPL; - if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { - page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); + unsigned int idx = bm_bit_to_page_idx(b, bitnr); + if (page_nr != idx) { + page_nr = idx; if (p_addr) bm_unmap(p_addr); - p_addr = bm_map_paddr(b, w); + p_addr = bm_map_pidx(b, idx); } - ERR_IF (bitnr >= b->bm_bits) { + if (expect(bitnr < b->bm_bits)) + c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); + else dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); - } else { - c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); - } } if (p_addr) bm_unmap(p_addr); @@ -1308,11 +1645,13 @@ unsigned long flags; unsigned long *p_addr, *bm; - ERR_IF(!b) return 0; - ERR_IF(!b->bm_pages) return 0; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; spin_lock_irqsave(&b->bm_lock, flags); - if (bm_is_locked(b)) + if (BM_DONT_TEST & b->bm_flags) bm_print_lock_info(mdev); s = S2W(enr); @@ -1320,13 +1659,10 @@ count = 0; if (s < b->bm_words) { 
int n = e-s; - p_addr = bm_map_paddr(b, s); + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); bm = p_addr + MLPP(s); - while (n--) { - catch_oob_access_start(); + while (n--) count += hweight_long(*bm++); - catch_oob_access_end(); - } bm_unmap(p_addr); } else { dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s); @@ -1338,18 +1674,22 @@ return count; } -/* set all bits covered by the AL-extent al_enr */ +/* Set all bits covered by the AL-extent al_enr. + * Returns number of bits changed. */ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) { struct drbd_bitmap *b = mdev->bitmap; unsigned long *p_addr, *bm; unsigned long weight; - int count, s, e, i, do_now; - ERR_IF(!b) return 0; - ERR_IF(!b->bm_pages) return 0; + unsigned long s, e; + int count, i, do_now; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; spin_lock_irq(&b->bm_lock); - if (bm_is_locked(b)) + if (BM_DONT_SET & b->bm_flags) bm_print_lock_info(mdev); weight = b->bm_set; @@ -1361,13 +1701,11 @@ count = 0; if (s < b->bm_words) { i = do_now = e-s; - p_addr = bm_map_paddr(b, s); + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); bm = p_addr + MLPP(s); while (i--) { - catch_oob_access_start(); count += hweight_long(*bm); *bm = -1UL; - catch_oob_access_end(); bm++; } bm_unmap(p_addr); @@ -1375,7 +1713,7 @@ if (e == b->bm_words) b->bm_set -= bm_clear_surplus(b); } else { - dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); + dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s); } weight = b->bm_set - weight; spin_unlock_irq(&b->bm_lock); diff -Nru drbd8-8.3.7/drbd/drbd_buildtag.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_buildtag.c --- drbd8-8.3.7/drbd/drbd_buildtag.c 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_buildtag.c 2012-09-03 22:37:15.000000000 +0000 @@ -2,6 +2,6 @@ #include const char *drbd_buildtag(void) { - return "GIT-hash: 
ea9e28dbff98e331a62bcbcc63a6135808fe2917" - " build by ivoks@lucid, 2010-02-19 17:53:12"; + return "GIT-hash: e3169387b068d825dd433287f7fd7ba48ed07919 debian/changelog" + " build by ildefonso@rexy, 2012-09-03 18:07:15"; } diff -Nru drbd8-8.3.7/drbd/drbd_int.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_int.h --- drbd8-8.3.7/drbd/drbd_int.h 2010-01-07 09:09:58.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_int.h 2012-02-02 14:09:14.000000000 +0000 @@ -37,8 +37,16 @@ #include #include #include +#include #include #include +#include +#include +#include +#include + +#include "compat.h" +#include "drbd_state.h" #ifdef __CHECKER__ # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) @@ -75,7 +83,6 @@ extern unsigned int minor_count; extern int disable_sendpage; extern int allow_oos; -extern unsigned int cn_idx; #ifdef DRBD_ENABLE_FAULTS extern int enable_faults; @@ -93,14 +100,6 @@ #include #include -/* XXX do we need this? */ -#ifndef TRUE -#define TRUE 1 -#endif -#ifndef FALSE -#define FALSE 0 -#endif - /* I don't remember why XCPU ... 
* This is used to wake the asender, * and to interrupt sending the sending task @@ -117,19 +116,14 @@ */ #define DRBD_SIGKILL SIGHUP -/* All EEs on the free list should have ID_VACANT (== 0) - * freshly allocated EEs get !ID_VACANT (== 1) - * so if it says "cannot dereference null pointer at adress 0x00000001", - * it is most likely one of these :( */ - #define ID_IN_SYNC (4711ULL) #define ID_OUT_OF_SYNC (4712ULL) - #define ID_SYNCER (-1ULL) -#define ID_VACANT 0 -#define is_syncer_block_id(id) ((id) == ID_SYNCER) + +#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) struct drbd_conf; +struct drbd_tconn; #ifdef DBG_ALL_SYMBOLS # define STATIC @@ -137,44 +131,11 @@ # define STATIC static #endif -#ifdef PARANOIA -# define PARANOIA_BUG_ON(x) BUG_ON(x) -#else -# define PARANOIA_BUG_ON(x) -#endif - -/* - * Some Message Macros - *************************/ - -/* handy macro: DUMPP(somepointer) */ -#define DUMPP(A) dev_err(DEV, #A " = %p in %s:%d\n", (A), __FILE__, __LINE__); -#define DUMPLU(A) dev_err(DEV, #A " = %lu in %s:%d\n", (unsigned long)(A), __FILE__, __LINE__); -#define DUMPLLU(A) dev_err(DEV, #A " = %llu in %s:%d\n", (unsigned long long)(A), __FILE__, __LINE__); -#define DUMPLX(A) dev_err(DEV, #A " = %lx in %s:%d\n", (A), __FILE__, __LINE__); -#define DUMPI(A) dev_err(DEV, #A " = %d in %s:%d\n", (int)(A), __FILE__, __LINE__); - -#define DUMPST(A) DUMPLLU((unsigned long long)(A)) - -#if 0 -#define D_DUMPP(A) DUMPP(A) -#define D_DUMPLU(A) DUMPLU(A) -#define D_DUMPLLU(A) DUMPLLU(A) -#define D_DUMPLX(A) DUMPLX(A) -#define D_DUMPI(A) DUMPI(A) -#else -#define D_DUMPP(A) -#define D_DUMPLU(A) -#define D_DUMPLLU(A) -#define D_DUMPLX(A) -#define D_DUMPI(A) -#endif - /* upstream kernel wants us to use dev_warn(), ... * dev_printk() expects to be presented a struct device *; * in older kernels, (<= 2.6.24), there is nothing suitable there. * "backport" hack: redefine dev_printk. 
- * Trigger is definition of dev_to_disk marcro, introduced with the + * Trigger is definition of dev_to_disk macro, introduced with the * commit edfaa7c36574f1bf09c65ad602412db9da5f96bf * Driver core: convert block from raw kobjects to core devices */ @@ -197,10 +158,18 @@ dev_printk(KERN_CRIT , dev , format , ## arg) #endif - +#define conn_printk(LEVEL, TCONN, FMT, ARGS...) \ + printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS) +#define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS) +#define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS) +#define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS) +#define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS) +#define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS) +#define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS) +#define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS) /* see kernel/printk.c:printk_ratelimit - * macro, so it is easy do have independend rate limits at different locations + * macro, so it is easy do have independent rate limits at different locations * "initializer element not constant ..." with kernel 2.4 :( * so I initialize toks to something large */ @@ -239,12 +208,19 @@ # define D_ASSERT(exp) if (!(exp)) \ dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) #endif -#define ERR_IF(exp) if (({ \ - int _b = (exp) != 0; \ - if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ - __func__, #exp, __FILE__, __LINE__); \ - _b; \ - })) + +/** + * expect - Make an assertion + * + * Unlike the assert macro, this macro returns a boolean result. 
+ */ +#define expect(exp) ({ \ + bool _bool = (exp); \ + if (!_bool) \ + dev_err(DEV, "ASSERTION %s FAILED in %s\n", \ + #exp, __func__); \ + _bool; \ + }) /* Defines to control fault insertion */ enum { @@ -257,37 +233,24 @@ DRBD_FAULT_DT_RA = 6, /* data read ahead */ DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ DRBD_FAULT_AL_EE = 8, /* alloc ee */ + DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */ DRBD_FAULT_MAX, }; -extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...); - -#ifdef DRBD_ENABLE_FAULTS extern unsigned int _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); + static inline int drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { +#ifdef DRBD_ENABLE_FAULTS return fault_rate && (enable_faults & (1<= KERNEL_VERSION(2,6,8) -# define HAVE_KERNEL_SENDMSG 1 #else -# define HAVE_KERNEL_SENDMSG 0 + return 0; #endif +} /* * our structs @@ -300,14 +263,11 @@ (typecheck(struct drbd_conf*, x) && \ ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)) : 0)) -/* drbd_meta-data.c (still in drbd_main.c) */ -/* 4th incarnation of the disk layout. */ -#define DRBD_MD_MAGIC (DRBD_MAGIC+4) - -extern struct drbd_conf **minor_table; +extern struct idr minors; /* RCU, updates: genl_lock() */ +extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */ /* on the wire */ -enum drbd_packets { +enum drbd_packet { /* receiver (data socket) */ P_DATA = 0x00, P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ @@ -335,7 +295,7 @@ P_RECV_ACK = 0x15, /* Used in protocol B */ P_WRITE_ACK = 0x16, /* Used in protocol C */ P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ - P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */ + P_DISCARD_WRITE = 0x18, /* Used in proto C, two-primaries conflict detection */ P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ P_NEG_DREPLY = 0x1a, /* Local disk is broken... 
*/ P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ @@ -351,72 +311,28 @@ P_RS_IS_IN_SYNC = 0x22, /* meta socket */ P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ + /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ + P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ + P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ + P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ + P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ + P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ + P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ + P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ - P_MAX_CMD = 0x25, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, /* special command ids for handshake */ - P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ - P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ + P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ + P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ - P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ + P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! 
*/ }; -static inline const char *cmdname(enum drbd_packets cmd) -{ - /* THINK may need to become several global tables - * when we want to support more than - * one PRO_VERSION */ - static const char *cmdnames[] = { - [P_DATA] = "Data", - [P_DATA_REPLY] = "DataReply", - [P_RS_DATA_REPLY] = "RSDataReply", - [P_BARRIER] = "Barrier", - [P_BITMAP] = "ReportBitMap", - [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", - [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", - [P_UNPLUG_REMOTE] = "UnplugRemote", - [P_DATA_REQUEST] = "DataRequest", - [P_RS_DATA_REQUEST] = "RSDataRequest", - [P_SYNC_PARAM] = "SyncParam", - [P_SYNC_PARAM89] = "SyncParam89", - [P_PROTOCOL] = "ReportProtocol", - [P_UUIDS] = "ReportUUIDs", - [P_SIZES] = "ReportSizes", - [P_STATE] = "ReportState", - [P_SYNC_UUID] = "ReportSyncUUID", - [P_AUTH_CHALLENGE] = "AuthChallenge", - [P_AUTH_RESPONSE] = "AuthResponse", - [P_PING] = "Ping", - [P_PING_ACK] = "PingAck", - [P_RECV_ACK] = "RecvAck", - [P_WRITE_ACK] = "WriteAck", - [P_RS_WRITE_ACK] = "RSWriteAck", - [P_DISCARD_ACK] = "DiscardAck", - [P_NEG_ACK] = "NegAck", - [P_NEG_DREPLY] = "NegDReply", - [P_NEG_RS_DREPLY] = "NegRSDReply", - [P_BARRIER_ACK] = "BarrierAck", - [P_STATE_CHG_REQ] = "StateChgRequest", - [P_STATE_CHG_REPLY] = "StateChgReply", - [P_OV_REQUEST] = "OVRequest", - [P_OV_REPLY] = "OVReply", - [P_OV_RESULT] = "OVResult", - [P_MAX_CMD] = NULL, - }; - - if (cmd == P_HAND_SHAKE_M) - return "HandShakeM"; - if (cmd == P_HAND_SHAKE_S) - return "HandShakeS"; - if (cmd == P_HAND_SHAKE) - return "HandShake"; - if (cmd >= P_MAX_CMD) - return "Unknown"; - return cmdnames[cmd]; -} +extern const char *cmdname(enum drbd_packet cmd); /* for sending/receiving the bitmap, * possibly in some encoding scheme */ @@ -472,37 +388,41 @@ * NOTE that the payload starts at a long aligned offset, * regardless of 32 or 64 bit arch! 
*/ -struct p_header { +struct p_header80 { u32 magic; u16 command; u16 length; /* bytes of data after this header */ - u8 payload[0]; } __packed; -/* 8 bytes. packet FIXED for the next century! */ -/* - * short commands, packets without payload, plain p_header: - * P_PING - * P_PING_ACK - * P_BECOME_SYNC_TARGET - * P_BECOME_SYNC_SOURCE - * P_UNPLUG_REMOTE - */ +/* Header for big packets, Used for data packets exceeding 64kB */ +struct p_header95 { + u16 magic; /* use DRBD_MAGIC_BIG here */ + u16 command; + u32 length; +} __packed; -/* - * commands with out-of-struct payload: - * P_BITMAP (no additional fields) - * P_DATA, P_DATA_REPLY (see p_data) - * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) - */ +struct p_header100 { + u32 magic; + u16 volume; + u16 command; + u32 length; + u32 pad; +} __packed; + +extern unsigned int drbd_header_size(struct drbd_tconn *tconn); /* these defines must not be changed without changing the protocol version */ -#define DP_HARDBARRIER 1 -#define DP_RW_SYNC 2 +#define DP_HARDBARRIER 1 /* no longer used */ +#define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 +#define DP_UNPLUG 8 /* not used anymore */ +#define DP_FUA 16 /* equals REQ_FUA */ +#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_DISCARD 64 /* equals REQ_DISCARD */ +#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ +#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ struct p_data { - struct p_header head; u64 sector; /* 64 bits sector number */ u64 block_id; /* to identify the request in protocol B&C */ u32 seq_num; @@ -513,21 +433,18 @@ * commands which share a struct: * p_block_ack: * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), - * P_DISCARD_ACK (proto C, two-primaries conflict detection) + * P_DISCARD_WRITE (proto C, two-primaries conflict detection) * p_block_req: * P_DATA_REQUEST, P_RS_DATA_REQUEST */ struct p_block_ack { - struct p_header head; u64 sector; u64 block_id; u32 blksize; u32 seq_num; } 
__packed; - struct p_block_req { - struct p_header head; u64 sector; u64 block_id; u32 blksize; @@ -536,63 +453,71 @@ /* * commands with their own struct for additional fields: - * P_HAND_SHAKE + * P_CONNECTION_FEATURES * P_BARRIER * P_BARRIER_ACK * P_SYNC_PARAM * ReportParams */ -struct p_handshake { - struct p_header head; /* 8 bytes */ +struct p_connection_features { u32 protocol_min; u32 feature_flags; u32 protocol_max; /* should be more than enough for future enhancements - * for now, feature_flags and the reserverd array shall be zero. + * for now, feature_flags and the reserved array shall be zero. */ u32 _pad; - u64 reserverd[7]; + u64 reserved[7]; } __packed; -/* 80 bytes, FIXED for the next century */ struct p_barrier { - struct p_header head; u32 barrier; /* barrier number _handle_ only */ u32 pad; /* to multiple of 8 Byte */ } __packed; struct p_barrier_ack { - struct p_header head; u32 barrier; u32 set_size; } __packed; struct p_rs_param { - struct p_header head; - u32 rate; + u32 resync_rate; /* Since protocol version 88 and higher. */ char verify_alg[0]; } __packed; struct p_rs_param_89 { - struct p_header head; - u32 rate; + u32 resync_rate; /* protocol version 89: */ char verify_alg[SHARED_SECRET_MAX]; char csums_alg[SHARED_SECRET_MAX]; } __packed; +struct p_rs_param_95 { + u32 resync_rate; + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + u32 c_plan_ahead; + u32 c_delay_target; + u32 c_fill_target; + u32 c_max_rate; +} __packed; + +enum drbd_conn_flags { + CF_DISCARD_MY_DATA = 1, + CF_DRY_RUN = 2, +}; + struct p_protocol { - struct p_header head; u32 protocol; u32 after_sb_0p; u32 after_sb_1p; u32 after_sb_2p; - u32 want_lose; + u32 conn_flags; u32 two_primaries; /* Since protocol version 87 and higher. 
*/ @@ -601,37 +526,32 @@ } __packed; struct p_uuids { - struct p_header head; u64 uuid[UI_EXTENDED_SIZE]; } __packed; struct p_rs_uuid { - struct p_header head; u64 uuid; } __packed; struct p_sizes { - struct p_header head; u64 d_size; /* size of disk */ u64 u_size; /* user requested size */ u64 c_size; /* current exported size */ - u32 max_segment_size; /* Maximal size of a BIO */ - u32 queue_order_type; + u32 max_bio_size; /* Maximal size of a BIO */ + u16 queue_order_type; /* not yet implemented in DRBD*/ + u16 dds_flags; /* use enum dds_flags here. */ } __packed; struct p_state { - struct p_header head; u32 state; } __packed; struct p_req_state { - struct p_header head; u32 mask; u32 val; } __packed; struct p_req_state_reply { - struct p_header head; u32 retcode; } __packed; @@ -646,12 +566,17 @@ } __packed; struct p_discard { - struct p_header head; u64 block_id; u32 seq_num; u32 pad; } __packed; +struct p_block_desc { + u64 sector; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + /* Valid values for the encoding field. * Bump proto version when changing this. */ enum drbd_bitmap_code { @@ -662,7 +587,6 @@ }; struct p_compressed_bm { - struct p_header head; /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code * (encoding & 0x80): polarity (set/unset) of first runlength * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits @@ -673,81 +597,23 @@ u8 code[0]; } __packed; -/* DCBP: Drbd Compressed Bitmap Packet ... 
*/ -static inline enum drbd_bitmap_code -DCBP_get_code(struct p_compressed_bm *p) -{ - return (enum drbd_bitmap_code)(p->encoding & 0x0f); -} - -static inline void -DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) -{ - BUG_ON(code & ~0xf); - p->encoding = (p->encoding & ~0xf) | code; -} - -static inline int -DCBP_get_start(struct p_compressed_bm *p) -{ - return (p->encoding & 0x80) != 0; -} - -static inline void -DCBP_set_start(struct p_compressed_bm *p, int set) -{ - p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); -} - -static inline int -DCBP_get_pad_bits(struct p_compressed_bm *p) -{ - return (p->encoding >> 4) & 0x7; -} - -static inline void -DCBP_set_pad_bits(struct p_compressed_bm *p, int n) -{ - BUG_ON(n & ~0x7); - p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); -} - -/* one bitmap packet, including the p_header, - * should fit within one _architecture independend_ page. - * so we need to use the fixed size 4KiB page size - * most architechtures have used for a long time. 
- */ -#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) -#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) -#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) -#if (PAGE_SIZE < 4096) -/* drbd_send_bitmap / receive_bitmap would break horribly */ -#error "PAGE_SIZE too small" -#endif - -union p_polymorph { - struct p_header header; - struct p_handshake handshake; - struct p_data data; - struct p_block_ack block_ack; - struct p_barrier barrier; - struct p_barrier_ack barrier_ack; - struct p_rs_param_89 rs_param_89; - struct p_protocol protocol; - struct p_sizes sizes; - struct p_uuids uuids; - struct p_state state; - struct p_req_state req_state; - struct p_req_state_reply req_state_reply; - struct p_block_req block_req; +struct p_delay_probe93 { + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ } __packed; +/* + * Bitmap packets need to fit within a single page on the sender and receiver, + * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). + */ +#define DRBD_SOCKET_BUFFER_SIZE 4096 + /**********************************************************************/ enum drbd_thread_state { - None, - Running, - Exiting, - Restarting + NONE, + RUNNING, + EXITING, + RESTARTING }; struct drbd_thread { @@ -756,8 +622,9 @@ struct completion startstop; enum drbd_thread_state t_state; int (*function) (struct drbd_thread *); - struct drbd_conf *mdev; + struct drbd_tconn *tconn; int reset_cpu_mask; + char name[9]; }; static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) @@ -770,34 +637,29 @@ return thi->t_state; } - -/* - * Having this as the first member of a struct provides sort of "inheritance". - * "derived" structs can be "drbd_queue_work()"ed. - * The callback should know and cast back to the descendant struct. - * drbd_request and drbd_epoch_entry are descendants of drbd_work. 
- */ -struct drbd_work; -typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); struct drbd_work { struct list_head list; - drbd_work_cb cb; + int (*cb)(struct drbd_work *, int cancel); + union { + struct drbd_conf *mdev; + struct drbd_tconn *tconn; + }; }; -struct drbd_tl_epoch; +#include "drbd_interval.h" + +extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *); + struct drbd_request { struct drbd_work w; - struct drbd_conf *mdev; /* if local IO is not allowed, will be NULL. * if local IO _is_ allowed, holds the locally submitted bio clone, * or, after local IO completion, the ERR_PTR(error). - * see drbd_endio_pri(). */ + * see drbd_request_endio(). */ struct bio *private_bio; - struct hlist_node colision; - sector_t sector; - unsigned int size; + struct drbd_interval i; unsigned int epoch; /* barrier_nr */ /* barrier_nr: used to check on "completion" whether this req was in @@ -805,9 +667,6 @@ * starting a new epoch... */ - /* up to here, the struct layout is identical to drbd_epoch_entry; - * we might be able to use that to our advantage... */ - struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ @@ -820,19 +679,11 @@ struct list_head requests; /* requests before */ struct drbd_tl_epoch *next; /* pointer to the next barrier */ unsigned int br_number; /* the barriers identifier. */ - int n_req; /* number of requests attached before this barrier */ + int n_writes; /* number of requests attached before this barrier */ }; -struct drbd_request; - -/* These Tl_epoch_entries may be in one of 6 lists: - active_ee .. data packet being written - sync_ee .. syncer block being written - done_ee .. block written, need to send P_WRITE_ACK - read_ee .. 
[RS]P_DATA_REQUEST being read -*/ - struct drbd_epoch { + struct drbd_conf *mdev; struct list_head list; unsigned int barrier_nr; atomic_t epoch_size; /* increased on every request added. */ @@ -854,30 +705,9 @@ EV_GOT_BARRIER_NR, EV_BARRIER_DONE, EV_BECAME_LAST, - EV_TRACE_FLUSH, /* TRACE_ are not real events, only used for tracing */ - EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */ - EV_TRACE_SETTING_BI, /* Barrier is expressed with the first write of the next epoch */ - EV_TRACE_ALLOC, - EV_TRACE_FREE, EV_CLEANUP = 32, /* used as flag */ }; -struct drbd_epoch_entry { - struct drbd_work w; - struct drbd_conf *mdev; - struct bio *private_bio; - struct hlist_node colision; - sector_t sector; - unsigned int size; - struct drbd_epoch *epoch; - - /* up to here, the struct layout is identical to drbd_request; - * we might be able to use that to our advantage... */ - - unsigned int flags; - u64 block_id; -}; - struct drbd_wq_barrier { struct drbd_work w; struct completion done; @@ -888,37 +718,77 @@ void *digest; }; -/* ee flag bits */ +struct drbd_peer_request { + struct drbd_work w; + struct drbd_epoch *epoch; /* for writes */ + struct page *pages; + atomic_t pending_bios; + struct drbd_interval i; + /* see comments on ee flag bits below */ + unsigned long flags; + union { + u64 block_id; + struct digest_info *digest; + }; +}; + +/* ee flag bits. + * While corresponding bios are in flight, the only modification will be + * set_bit WAS_ERROR, which has to be atomic. + * If no bios are in flight yet, or all have been completed, + * non-atomic modification to ee->flags is ok. + */ enum { __EE_CALL_AL_COMPLETE_IO, - __EE_CONFLICT_PENDING, __EE_MAY_SET_IN_SYNC, + + /* This peer request closes an epoch using a barrier. + * On successful completion, the epoch is released, + * and the P_BARRIER_ACK send. */ __EE_IS_BARRIER, + + /* In case a barrier failed, + * we need to resubmit without the barrier flag. 
*/ + __EE_RESUBMITTED, + + /* we may have several bios per peer request. + * if any of those fail, we set this flag atomically + * from the endio callback */ + __EE_WAS_ERROR, + + /* This ee has a pointer to a digest instead of a block id */ + __EE_HAS_DIGEST, + + /* Conflicting local requests need to be restarted after this request */ + __EE_RESTART_REQUESTS, + + /* The peer wants a write ACK for this (wire proto C) */ + __EE_SEND_WRITE_ACK, + + /* Is set when net_conf had two_primaries set while creating this peer_req */ + __EE_IN_INTERVAL_TREE, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) -#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) #define EE_IS_BARRIER (1<<__EE_IS_BARRIER) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) +#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) +#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) +#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) +#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) -/* global flag bits */ +/* flag bits per mdev */ enum { - CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */ - SIGNAL_ASENDER, /* whether asender wants to be interrupted */ - SEND_PING, /* whether asender should send a ping asap */ - - STOP_SYNC_TIMER, /* tell timer to cancel itself */ UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ - DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ - CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ CL_ST_CHG_SUCCESS, CL_ST_CHG_FAIL, CRASHED_PRIMARY, /* This node was a crashed primary. * Gets cleared when the state.conn * goes into C_CONNECTED state. 
*/ - WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ CONSIDER_RESYNC, @@ -928,30 +798,52 @@ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ + GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ + WAS_IO_ERROR, /* Local disk failed returned IO error */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ - NET_CONGESTED, /* The data socket is congested */ - - CONFIG_PENDING, /* serialization of (re)configuration requests. - * if set, also prevents the device from dying */ - DEVICE_DYING, /* device became unconfigured, - * but worker thread is still handling the cleanup. - * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, - * while this is set. */ RESIZE_PENDING, /* Size change detected locally, waiting for the response from * the peer, if it changed there as well. */ + NEW_CUR_UUID, /* Create new current UUID when thawing IO */ + AL_SUSPENDED, /* Activity logging is currently suspended. */ + AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + B_RS_H_DONE, /* Before resync handler done (already executed) */ + DISCARD_MY_DATA, /* discard_my_data flag per volume */ + READ_BALANCE_RR, }; struct drbd_bitmap; /* opaque for drbd_conf */ +/* definition of bits in bm_flags to be used in drbd_bm_lock + * and drbd_bitmap_io and friends. */ +enum bm_flag { + /* do we need to kfree, or vfree bm_pages? */ + BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ + + /* currently locked for bulk operation */ + BM_LOCKED_MASK = 0x7, + + /* in detail, that is: */ + BM_DONT_CLEAR = 0x1, + BM_DONT_SET = 0x2, + BM_DONT_TEST = 0x4, + + /* (test bit, count bit) allowed (common case) */ + BM_LOCKED_TEST_ALLOWED = 0x3, + + /* testing bits, as well as setting new bits allowed, but clearing bits + * would be unexpected. Used during bitmap receive. 
Setting new bits + * requires sending of "out-of-sync" information, though. */ + BM_LOCKED_SET_ALLOWED = 0x1, + + /* clear is not expected while bitmap is locked for bulk operation */ +}; + + /* TODO sort members for performance * MAYBE group them further */ /* THINK maybe we actually want to use the default "event/%s" worker threads * or similar in linux 2.6, which uses per cpu data and threads. - * - * To be general, this might need a spin_lock member. - * For now, please use the mdev->req_lock to protect list_head, - * see drbd_queue_work below. */ struct drbd_work_queue { struct list_head q; @@ -970,8 +862,8 @@ struct socket *socket; /* this way we get our * send/receive buffers off the stack */ - union p_polymorph sbuf; - union p_polymorph rbuf; + void *sbuf; + void *rbuf; }; struct drbd_md { @@ -987,38 +879,28 @@ s32 bm_offset; /* signed relative sector offset to bitmap */ /* u32 al_nr_extents; important for restoring the AL - * is stored into sync_conf.al_extents, which in turn + * is stored into ldev->dc.al_extents, which in turn * gets applied to act_log->nr_elements */ }; -/* for sync_conf and other types... */ -#define NL_PACKET(name, number, fields) struct name { fields }; -#define NL_INTEGER(pn,pr,member) int member; -#define NL_INT64(pn,pr,member) __u64 member; -#define NL_BIT(pn,pr,member) unsigned member:1; -#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; -#include "linux/drbd_nl.h" - struct drbd_backing_dev { struct block_device *backing_bdev; struct block_device *md_bdev; - struct file *lo_file; - struct file *md_file; struct drbd_md md; - struct disk_conf dc; /* The user provided config... 
*/ + struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */ sector_t known_size; /* last known size of that backing device */ }; struct drbd_md_io { - struct drbd_conf *mdev; - struct completion event; + unsigned int done; int error; }; struct bm_io_work { struct drbd_work w; char *why; + enum bm_flag flags; int (*io_fn)(struct drbd_conf *mdev); void (*done)(struct drbd_conf *mdev, int rv); }; @@ -1030,16 +912,97 @@ WO_bio_barrier }; +struct fifo_buffer { + unsigned int head_index; + unsigned int size; + int total; /* sum of all values */ + int values[0]; +}; +extern struct fifo_buffer *fifo_alloc(int fifo_size); + +/* flag bits per tconn */ +enum { + NET_CONGESTED, /* The data socket is congested */ + DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ + SEND_PING, /* whether asender should send a ping asap */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ + CONN_WD_ST_CHG_OKAY, + CONN_WD_ST_CHG_FAIL, + CONN_DRY_RUN, /* Expect disconnect after resync handshake. 
*/ + CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ +}; + +struct drbd_tconn { /* is a resource from the config file */ + char *name; /* Resource name */ + struct list_head all_tconn; /* linked on global drbd_tconns */ + struct kref kref; + struct idr volumes; /* to mdev mapping */ + enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ + unsigned susp:1; /* IO suspended by user */ + unsigned susp_nod:1; /* IO suspended because no data */ + unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ + struct mutex cstate_mutex; /* Protects graceful disconnects */ + + unsigned long flags; + struct net_conf *net_conf; /* content protected by rcu */ + struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ + struct res_opts res_opts; + + struct sockaddr_storage my_addr; + int my_addr_len; + struct sockaddr_storage peer_addr; + int peer_addr_len; + + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; + + spinlock_t req_lock; + struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ + struct drbd_tl_epoch *newest_tle; + struct drbd_tl_epoch *oldest_tle; + struct list_head out_of_sequence_requests; + struct list_head barrier_acked_requests; + + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */ + struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + void *int_dig_in; + void *int_dig_vv; + + struct drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + enum 
write_ordering_e write_ordering; + + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) && !defined(cpumask_bits) + cpumask_t cpu_mask[1]; +#else + cpumask_var_t cpu_mask; +#endif +}; + struct drbd_conf { #ifdef PARANOIA long magic; #endif + struct drbd_tconn *tconn; + int vnr; /* volume number within the connection */ + struct kref kref; + /* things that are stored as / read from meta data on disk */ unsigned long flags; /* configured by drbdsetup */ - struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ - struct syncer_conf sync_conf; struct drbd_backing_dev *ldev __protected_by(local); sector_t p_size; /* partner's disk size */ @@ -1047,21 +1010,26 @@ struct block_device *this_bdev; struct gendisk *vdisk; - struct drbd_socket data; /* data/barrier/cstate/parameter packets */ - struct drbd_socket meta; /* ping/ack (metadata) packets */ - int agreed_pro_version; /* actually used protocol version */ - unsigned long last_received; /* in jiffies, either socket */ - unsigned int ko_count; struct drbd_work resync_work, unplug_work, - md_sync_work; + go_diskless, + md_sync_work, + start_resync_work; struct timer_list resync_timer; struct timer_list md_sync_timer; + struct timer_list start_resync_timer; + struct timer_list request_timer; +#ifdef DRBD_DEBUG_MD_SYNC + struct { + unsigned int line; + const char* func; + } last_md_mark_dirty; +#endif /* Used after attach while negotiating new disk state. */ union drbd_state new_state_tmp; - union drbd_state state; + union drbd_dev_state state; wait_queue_head_t misc_wait; wait_queue_head_t state_wait; /* upon each state change. 
*/ unsigned int send_cnt; @@ -1073,31 +1041,31 @@ atomic_t ap_bio_cnt; /* Requests we need to complete */ atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ - atomic_t unacked_cnt; /* Need to send replys for */ + atomic_t unacked_cnt; /* Need to send replies for */ atomic_t local_cnt; /* Waiting for local completion */ - atomic_t net_cnt; /* Users of net_conf */ - spinlock_t req_lock; - struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ - struct drbd_tl_epoch *newest_tle; - struct drbd_tl_epoch *oldest_tle; - struct list_head out_of_sequence_requests; - struct hlist_head *tl_hash; - unsigned int tl_hash_s; - /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ + /* Interval tree of pending local write requests */ + struct rb_root read_requests; + struct rb_root write_requests; + + /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ unsigned long rs_total; - /* number of sync IOs that failed in this run */ + /* number of resync blocks that failed in this run */ unsigned long rs_failed; /* Syncer's start time [unit jiffies] */ unsigned long rs_start; /* cumulated time in PausedSyncX state [unit jiffies] */ unsigned long rs_paused; + /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; +#define DRBD_SYNC_MARKS 8 +#define DRBD_SYNC_MARK_STEP (3*HZ) /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ - unsigned long rs_mark_left; + unsigned long rs_mark_left[DRBD_SYNC_MARKS]; /* marks's time [unit jiffies] */ - unsigned long rs_mark_time; - /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ - unsigned long rs_same_csum; + unsigned long rs_mark_time[DRBD_SYNC_MARKS]; + /* current index into rs_mark_{left,time} */ + int rs_last_mark; /* where does the admin want us to start? (sector) */ sector_t ov_start_sector; @@ -1108,12 +1076,7 @@ /* size of out-of-sync range in sectors. 
*/ sector_t ov_last_oos_size; unsigned long ov_left; /* in bits */ - struct crypto_hash *csums_tfm; - struct crypto_hash *verify_tfm; - struct drbd_thread receiver; - struct drbd_thread worker; - struct drbd_thread asender; struct drbd_bitmap *bitmap; unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ @@ -1128,65 +1091,54 @@ u64 *p_uuid; /* FIXME clean comments, restructure so it is more obvious which * members are protected by what */ - struct drbd_epoch *current_epoch; - spinlock_t epoch_lock; - unsigned int epochs; - enum write_ordering_e write_ordering; - struct list_head active_ee; /* IO in progress */ - struct list_head sync_ee; /* IO in progress */ - struct list_head done_ee; /* send ack */ - struct list_head read_ee; /* IO in progress */ - struct list_head net_ee; /* zero-copy network send in progress */ - struct hlist_head *ee_hash; /* is proteced by req_lock! */ - unsigned int ee_hash_s; - /* this one is protected by ee_lock, single thread */ - struct drbd_epoch_entry *last_write_w_barrier; + struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ + struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ + struct list_head done_ee; /* need to send P_WRITE_ACK */ + struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ + struct list_head net_ee; /* zero-copy network send in progress */ int next_barrier_nr; - struct hlist_head *app_reads_hash; /* is proteced by req_lock */ struct list_head resync_reads; - atomic_t pp_in_use; + atomic_t pp_in_use; /* allocated from page pool */ + atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ - struct page *md_io_tmpp; /* for logical_block_size != 512 */ - struct mutex md_io_mutex; /* protects the md_io_buffer */ + struct drbd_md_io md_io; + atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; 
wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ unsigned int al_tr_number; int al_tr_cycle; int al_tr_pos; /* position of the next transaction in the journal */ - struct crypto_hash *cram_hmac_tfm; - struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ - struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ - void *int_dig_out; - void *int_dig_in; - void *int_dig_vv; wait_queue_head_t seq_wait; atomic_t packet_seq; unsigned int peer_seq; spinlock_t peer_seq_lock; unsigned int minor; unsigned long comm_bm_set; /* communicated number of set bits. */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) && !defined(cpumask_bits) - cpumask_t cpu_mask[1]; -#else - cpumask_var_t cpu_mask; -#endif struct bm_io_work bm_io_work; u64 ed_uuid; /* UUID of the exposed data */ - struct mutex state_mutex; + struct mutex own_state_mutex; + struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */ char congestion_reason; /* Why we where congested... */ + atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ + atomic_t rs_sect_ev; /* for submitted resync data rate, both */ + int rs_last_sect_ev; /* counter to compare with */ + int rs_last_events; /* counter of read or write "events" (unit sectors) + * on the lower level device when we last looked. */ + int c_sync_rate; /* current resync rate after syncer throttle magic */ + struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, tconn->conn_update) */ + int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ + atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ + int peer_max_bio_size; + int local_max_bio_size; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) { - struct drbd_conf *mdev; - - mdev = minor < minor_count ? 
minor_table[minor] : NULL; - - return mdev; + return (struct drbd_conf *)idr_find(&minors, minor); } static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) @@ -1194,29 +1146,9 @@ return mdev->minor; } -/* returns 1 if it was successfull, - * returns 0 if there was no data socket. - * so wherever you are going to use the data.socket, e.g. do - * if (!drbd_get_data_sock(mdev)) - * return 0; - * CODE(); - * drbd_put_data_sock(mdev); - */ -static inline int drbd_get_data_sock(struct drbd_conf *mdev) -{ - mutex_lock(&mdev->data.mutex); - /* drbd_disconnect() could have called drbd_free_sock() - * while we were waiting in down()... */ - if (unlikely(mdev->data.socket == NULL)) { - mutex_unlock(&mdev->data.mutex); - return 0; - } - return 1; -} - -static inline void drbd_put_data_sock(struct drbd_conf *mdev) +static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr) { - mutex_unlock(&mdev->data.mutex); + return (struct drbd_conf *)idr_find(&tconn->volumes, vnr); } /* @@ -1225,93 +1157,76 @@ /* drbd_main.c */ -enum chg_state_flags { - CS_HARD = 1, - CS_VERBOSE = 2, - CS_WAIT_COMPLETE = 4, - CS_SERIALIZE = 8, - CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, +enum dds_flags { + DDSF_FORCED = 1, + DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ }; extern void drbd_init_set_defaults(struct drbd_conf *mdev); -extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, - union drbd_state mask, union drbd_state val); -extern void drbd_force_state(struct drbd_conf *, union drbd_state, - union drbd_state); -extern int _drbd_request_state(struct drbd_conf *, union drbd_state, - union drbd_state, enum chg_state_flags); -extern int __drbd_set_state(struct drbd_conf *, union drbd_state, - enum chg_state_flags, struct completion *done); -extern void print_st_err(struct drbd_conf *, union drbd_state, - union drbd_state, int); extern int drbd_thread_start(struct drbd_thread *thi); extern void _drbd_thread_stop(struct 
drbd_thread *thi, int restart, int wait); +extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task); #ifdef CONFIG_SMP -extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); -extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); +extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); +extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn); #else #define drbd_thread_current_set_cpu(A) ({}) #define drbd_calc_cpu_mask(A) ({}) #endif -extern void drbd_free_resources(struct drbd_conf *mdev); -extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, +extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr, unsigned int set_size); -extern void tl_clear(struct drbd_conf *mdev); -extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); -extern void drbd_free_sock(struct drbd_conf *mdev); -extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, - void *buf, size_t size, unsigned msg_flags); -extern int drbd_send_protocol(struct drbd_conf *mdev); +extern void tl_clear(struct drbd_tconn *); +extern void _tl_add_barrier(struct drbd_tconn *, struct drbd_tl_epoch *); +extern void drbd_free_sock(struct drbd_tconn *tconn); +extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t, + unsigned); + +extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd); +extern int drbd_send_protocol(struct drbd_tconn *tconn); extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); -extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); -extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); -extern int _drbd_send_state(struct drbd_conf *mdev); -extern int drbd_send_state(struct drbd_conf *mdev); -extern int _drbd_send_cmd(struct drbd_conf 
*mdev, struct socket *sock, - enum drbd_packets cmd, struct p_header *h, - size_t size, unsigned msg_flags); -#define USE_DATA_SOCKET 1 -#define USE_META_SOCKET 0 -extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, - enum drbd_packets cmd, struct p_header *h, - size_t size); -extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, - char *data, size_t size); -extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); -extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, - u32 set_size); -extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, - struct drbd_epoch_entry *e); -extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_block_req *rp); -extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_data *dp); -extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, +extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); +extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); +#define drbd_send_state(m, s) drbd_send_state_(m, s, __func__ , __LINE__ ) +#define drbd_send_current_state(m) drbd_send_current_state_(m, __func__ , __LINE__ ) +extern int drbd_send_state_(struct drbd_conf *mdev, + union drbd_state s, + const char *func, unsigned int line); +extern int drbd_send_current_state_(struct drbd_conf *mdev, + const char *func, unsigned int line); +extern int drbd_send_sync_param(struct drbd_conf *mdev); +extern void drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet, + struct drbd_peer_request *); +extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, + struct p_block_req *rp); +extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, + struct p_data *dp, int data_size); +extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum 
drbd_packet cmd, sector_t sector, int blksize, u64 block_id); -extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, - struct drbd_epoch_entry *e); +extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *); +extern int drbd_send_block(struct drbd_conf *, enum drbd_packet, + struct drbd_peer_request *); extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); -extern int _drbd_send_barrier(struct drbd_conf *mdev, - struct drbd_tl_epoch *barrier); extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, sector_t sector, int size, u64 block_id); -extern int drbd_send_drequest_csum(struct drbd_conf *mdev, - sector_t sector,int size, - void *digest, int digest_size, - enum drbd_packets cmd); +extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, + int size, void *digest, int digest_size, + enum drbd_packet cmd); extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); extern int drbd_send_bitmap(struct drbd_conf *mdev); -extern int _drbd_send_bitmap(struct drbd_conf *mdev); -extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); +extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode); +extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode); extern void drbd_free_bc(struct drbd_backing_dev *ldev); extern void drbd_mdev_cleanup(struct drbd_conf *mdev); +void drbd_print_uuids(struct drbd_conf *mdev, const char *text); -/* drbd_meta-data.c (still in drbd_main.c) */ extern void drbd_md_sync(struct drbd_conf *mdev); extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); -/* maybe define them below as inline? 
*/ extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); @@ -1320,37 +1235,63 @@ extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +#ifndef DRBD_DEBUG_MD_SYNC extern void drbd_md_mark_dirty(struct drbd_conf *mdev); +#else +#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ ) +extern void drbd_md_mark_dirty_(struct drbd_conf *mdev, + unsigned int line, const char *func); +#endif extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), void (*done)(struct drbd_conf *, int), - char *why); + char *why, enum bm_flag flags); +extern int drbd_bitmap_io(struct drbd_conf *mdev, + int (*io_fn)(struct drbd_conf *), + char *why, enum bm_flag flags); extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); -extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); - +extern void drbd_go_diskless(struct drbd_conf *mdev); +extern void drbd_ldev_destroy(struct drbd_conf *mdev); /* Meta data layout We reserve a 128MB Block (4k aligned) * either at the end of the backing device - * or on a seperate meta data device. */ + * or on a separate meta data device. 
*/ -#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ /* The following numbers are sectors */ -#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ -#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ -/* Allows up to about 3.8TB */ -#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) - -/* Since the smalles IO unit is usually 512 byte */ -#define MD_SECTOR_SHIFT 9 -#define MD_SECTOR_SIZE (1< local node thinks this block needs to be synced. */ -#define BM_BLOCK_SHIFT 12 /* 4k per bit */ +#define SLEEP_TIME (HZ/10) + +/* We do bitmap IO in units of 4k blocks. + * We also still have a hardcoded 4k per bit relation. */ +#define BM_BLOCK_SHIFT 12 /* 4k per bit */ #define BM_BLOCK_SIZE (1< BIO_MAX_SIZE +#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#endif +#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */ -/* Number of elements in the app_reads_hash */ -#define APP_R_HSIZE 15 +#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* Header 80 only allows packets up to 32KiB data */ +#define DRBD_MAX_BIO_SIZE_P95 (1 << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ extern int drbd_bm_init(struct drbd_conf *mdev); -extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors); +extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits); extern void drbd_bm_cleanup(struct drbd_conf *mdev); extern void drbd_bm_set_all(struct drbd_conf *mdev); extern void drbd_bm_clear_all(struct drbd_conf *mdev); +/* set/clear/test only a few bits at a time */ extern int drbd_bm_set_bits( struct drbd_conf *mdev, unsigned long s, unsigned long e); extern int drbd_bm_clear_bits( struct drbd_conf *mdev, unsigned long s, unsigned long e); -/* bm_set_bits variant for use while holding drbd_bm_lock */ +extern int drbd_bm_count_bits( + struct drbd_conf *mdev, const unsigned long s, const unsigned long e); +/* bm_set_bits variant for use while holding drbd_bm_lock, + * may process 
the whole bitmap in one go */ extern void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e); extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr); extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); -extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local); +extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); +extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); extern unsigned long drbd_bm_bits(struct drbd_conf *mdev); extern sector_t drbd_bm_capacity(struct drbd_conf *mdev); + +#define DRBD_END_OF_BITMAP (~(unsigned long)0) extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); /* bm_find_next variants for use while you hold drbd_bm_lock() */ extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo); extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo); +extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev); extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev); extern int drbd_bm_rs_done(struct drbd_conf *mdev); /* for receive_bitmap */ extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, unsigned long *buffer); -/* for _drbd_send_bitmap and drbd_bm_write_sect */ +/* for _drbd_send_bitmap */ extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, unsigned long *buffer); -extern void drbd_bm_lock(struct drbd_conf *mdev, char *why); +extern void drbd_bm_lock(struct 
drbd_conf *mdev, char *why, enum bm_flag flags); extern void drbd_bm_unlock(struct drbd_conf *mdev); - -extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e); /* drbd_main.c */ /* needs to be included here, @@ -1508,26 +1457,63 @@ #include "drbd_wrappers.h" extern struct kmem_cache *drbd_request_cache; -extern struct kmem_cache *drbd_ee_cache; /* epoch entries */ +extern struct kmem_cache *drbd_ee_cache; /* peer requests */ extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; extern mempool_t *drbd_ee_mempool; -extern struct page *drbd_pp_pool; /* drbd's page pool */ +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. + * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; extern spinlock_t drbd_pp_lock; extern int drbd_pp_vacant; extern wait_queue_head_t drbd_pp_wait; +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. 
+ */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + extern rwlock_t global_state_lock; -extern struct drbd_conf *drbd_new_device(unsigned int minor); -extern void drbd_free_mdev(struct drbd_conf *mdev); +extern int conn_lowest_minor(struct drbd_tconn *tconn); +enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr); +extern void drbd_minor_destroy(struct kref *kref); + +extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts); +extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts); +extern void conn_destroy(struct kref *kref); +struct drbd_tconn *conn_get_by_name(const char *name); +extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len); +extern void conn_free_crypto(struct drbd_tconn *tconn); extern int proc_details; /* drbd_req */ -extern int drbd_make_request_26(struct request_queue *q, struct bio *bio); +extern int __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); +extern MAKE_REQUEST_TYPE drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); extern int drbd_merge_bvec(struct request_queue *q, #ifdef HAVE_bvec_merge_data @@ -1540,32 +1526,40 @@ /* drbd_nl.c */ +extern int drbd_msg_put_info(const char *info); extern void drbd_suspend_io(struct drbd_conf *mdev); extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); -extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); +extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); enum 
determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local); +extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); -extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); -extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, - int force); -enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev); +extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); +extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, + enum drbd_role new_role, + int force); +extern bool conn_try_outdate_peer(struct drbd_tconn *tconn); +extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn); extern int drbd_khelper(struct drbd_conf *mdev, char *cmd); /* drbd_worker.c */ extern int drbd_worker(struct drbd_thread *thi); -extern int drbd_alter_sa(struct drbd_conf *mdev, int na); +enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor); +void drbd_resync_after_changed(struct drbd_conf *mdev); extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side); extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); extern int drbd_resync_finished(struct drbd_conf *mdev); /* maybe rather drbd_main.c ? 
*/ +extern void *drbd_md_get_buffer(struct drbd_conf *mdev); +extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t sector, int rw); -extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); +extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, unsigned int *done); +extern void drbd_rs_controller_reset(struct drbd_conf *mdev); -static inline void ov_oos_print(struct drbd_conf *mdev) +static inline void ov_out_of_sync_print(struct drbd_conf *mdev) { if (mdev->ov_last_oos_size) { dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", @@ -1576,88 +1570,105 @@ } -extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, + struct drbd_peer_request *, void *); /* worker callbacks */ -extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); -extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); -extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); -extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); -extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); -extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); -extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); -extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); -extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); -extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); -extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); -extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); -extern int 
w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); -extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); -extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); -extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); -extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); -extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); +extern int w_read_retry_remote(struct drbd_work *, int); +extern int w_e_end_data_req(struct drbd_work *, int); +extern int w_e_end_rsdata_req(struct drbd_work *, int); +extern int w_e_end_csum_rs_req(struct drbd_work *, int); +extern int w_e_end_ov_reply(struct drbd_work *, int); +extern int w_e_end_ov_req(struct drbd_work *, int); +extern int w_ov_finished(struct drbd_work *, int); +extern int w_resync_timer(struct drbd_work *, int); +extern int w_send_write_hint(struct drbd_work *, int); +extern int w_make_resync_request(struct drbd_work *, int); +extern int w_send_dblock(struct drbd_work *, int); +extern int w_send_barrier(struct drbd_work *, int); +extern int w_send_read_req(struct drbd_work *, int); +extern int w_prev_work_done(struct drbd_work *, int); +extern int w_e_reissue(struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_work *, int); +extern int w_send_out_of_sync(struct drbd_work *, int); +extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); +extern void start_resync_timer_fn(unsigned long data); /* drbd_receiver.c */ -extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); -extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, - u64 id, - sector_t sector, - unsigned int data_size, - gfp_t gfp_mask) __must_hold(local); -extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); -extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, - struct list_head *head); -extern void _drbd_wait_ee_list_empty(struct 
drbd_conf *mdev, - struct list_head *head); +extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector); +extern int drbd_submit_peer_request(struct drbd_conf *, + struct drbd_peer_request *, const unsigned, + const int); +extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *); +extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64, + sector_t, unsigned int, + gfp_t) __must_hold(local); +extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *, + int); +#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) +#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) +extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool); extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); -extern void drbd_flush_workqueue(struct drbd_conf *mdev); +extern void conn_flush_workqueue(struct drbd_tconn *tconn); +extern int drbd_connected(struct drbd_conf *mdev); +static inline void drbd_flush_workqueue(struct drbd_conf *mdev) +{ + conn_flush_workqueue(mdev->tconn); +} -/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to - * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ +/* Yes, there is kernel_setsockopt, but only since 2.6.18. + * So we have our own copy of it here. 
*/ static inline int drbd_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, int optlen) + char *optval, int optlen) { + mm_segment_t oldfs = get_fs(); + char __user *uoptval; int err; + + uoptval = (char __user __force *)optval; + + set_fs(KERNEL_DS); if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, optval, optlen); + err = sock_setsockopt(sock, level, optname, uoptval, optlen); else - err = sock->ops->setsockopt(sock, level, optname, optval, + err = sock->ops->setsockopt(sock, level, optname, uoptval, optlen); + set_fs(oldfs); return err; } static inline void drbd_tcp_cork(struct socket *sock) { - int __user val = 1; + int val = 1; (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, - (char __user *)&val, sizeof(val)); + (char*)&val, sizeof(val)); } static inline void drbd_tcp_uncork(struct socket *sock) { - int __user val = 0; + int val = 0; (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, - (char __user *)&val, sizeof(val)); + (char*)&val, sizeof(val)); } static inline void drbd_tcp_nodelay(struct socket *sock) { - int __user val = 1; + int val = 1; (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); + (char*)&val, sizeof(val)); } static inline void drbd_tcp_quickack(struct socket *sock) { - int __user val = 1; + int val = 2; (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, - (char __user *)&val, sizeof(val)); + (char*)&val, sizeof(val)); } -void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); +void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo); /* drbd_proc.c */ extern struct proc_dir_entry *drbd_proc; @@ -1666,8 +1677,8 @@ extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ -extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); -extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); 
+extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); @@ -1675,110 +1686,78 @@ extern int drbd_rs_del_all(struct drbd_conf *mdev); extern void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size); -extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); +extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go); extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line); #define drbd_set_in_sync(mdev, sector, size) \ __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) -extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, +extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line); #define drbd_set_out_of_sync(mdev, sector, size) \ __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) -extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); -extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev); extern void drbd_al_shrink(struct drbd_conf *mdev); - /* drbd_nl.c */ +/* state info broadcast */ +struct sib_info { + enum drbd_state_info_bcast_reason sib_reason; + union { + struct { + char *helper_name; + unsigned helper_exit_code; + }; + struct { + union drbd_state os; + union drbd_state ns; + }; + }; +}; +void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib); -void drbd_nl_cleanup(void); -int __init drbd_nl_init(void); -void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); -void drbd_bcast_sync_progress(struct drbd_conf *mdev); -void drbd_bcast_ee(struct drbd_conf *mdev, - const char *reason, const int dgs, - const char* seen_hash, const char* calc_hash, 
- const struct drbd_epoch_entry* e); - - -/** - * DOC: DRBD State macros - * - * These macros are used to express state changes in easily readable form. - * - * The NS macros expand to a mask and a value, that can be bit ored onto the - * current state as soon as the spinlock (req_lock) was taken. - * - * The _NS macros are used for state functions that get called with the - * spinlock. These macros expand directly to the new state value. - * - * Besides the basic forms NS() and _NS() additional _?NS[23] are defined - * to express state changes that affect more than one aspect of the state. - * - * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) - * Means that the network connection was established and that the peer - * is in secondary role. - */ -#define role_MASK R_MASK -#define peer_MASK R_MASK -#define disk_MASK D_MASK -#define pdsk_MASK D_MASK -#define conn_MASK C_MASK -#define susp_MASK 1 -#define user_isp_MASK 1 -#define aftr_isp_MASK 1 - -/* drbd state debug */ -#if DRBD_DEBUG_STATE_CHANGES -#define DRBD_STATE_DEBUG_INIT_VAL(s) ({ (s).line = __LINE__; (s).func = __func__; }) -#else -#define DRBD_STATE_DEBUG_INIT_VAL(s) do { } while (0) -#endif - -#define NS(T, S) \ - ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ - ({ union drbd_state val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T = (S); val; }) -#define NS2(T1, S1, T2, S2) \ - ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ - mask.T2 = T2##_MASK; mask; }), \ - ({ union drbd_state val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T1 = (S1); \ - val.T2 = (S2); val; }) -#define NS3(T1, S1, T2, S2, T3, S3) \ - ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ - mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ - ({ union drbd_state val; DRBD_STATE_DEBUG_INIT_VAL(val); val.i = 0; val.T1 = (S1); \ - val.T2 = (S2); val.T3 = (S3); val; }) - -#define _NS(D, T, S) \ - D, ({ union drbd_state __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T = 
(S); __ns; }) -#define _NS2(D, T1, S1, T2, S2) \ - D, ({ union drbd_state __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T1 = (S1); \ - __ns.T2 = (S2); __ns; }) -#define _NS3(D, T1, S1, T2, S2, T3, S3) \ - D, ({ union drbd_state __ns; DRBD_STATE_DEBUG_INIT_VAL(__ns); __ns.i = D->state.i; __ns.T1 = (S1); \ - __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) /* * inline helper functions *************************/ -static inline void drbd_state_lock(struct drbd_conf *mdev) +/* see also page_chain_add and friends in drbd_receiver.c */ +static inline struct page *page_chain_next(struct page *page) { - wait_event(mdev->misc_wait, - !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); + return (struct page *)page_private(page); } +#define page_chain_for_each(page) \ + for (; page && ({ prefetch(page_chain_next(page)); 1; }); \ + page = page_chain_next(page)) +#define page_chain_for_each_safe(page, n) \ + for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline void drbd_state_unlock(struct drbd_conf *mdev) +static inline int drbd_bio_has_active_page(struct bio *bio) { - clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); - wake_up(&mdev->misc_wait); + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) + return 1; + } + + return 0; } -static inline int _drbd_set_state(struct drbd_conf *mdev, - union drbd_state ns, enum chg_state_flags flags, - struct completion *done) +static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) { - int rv; + struct page *page = peer_req->pages; + page_chain_for_each(page) { + if (page_count(page) > 1) + return 1; + } + return 0; +} + +static inline enum drbd_state_rv +_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + enum drbd_state_rv rv; read_lock(&global_state_lock); rv = __drbd_set_state(mdev, ns, flags, done); @@ -1787,41 +1766,43 @@ return rv; } -/** - * 
drbd_request_state() - Reqest a state change - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * - * This is the most graceful way of requesting a state change. It is verbose - * quite verbose in case the state change is not possible, and all those - * state changes are globally serialized. - */ -static inline int drbd_request_state(struct drbd_conf *mdev, - union drbd_state mask, - union drbd_state val) +static inline union drbd_state drbd_read_state(struct drbd_conf *mdev) { - return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); + union drbd_state rv; + + rv.i = mdev->state.i; + rv.susp = mdev->tconn->susp; + rv.susp_nod = mdev->tconn->susp_nod; + rv.susp_fen = mdev->tconn->susp_fen; + + return rv; } #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) { - switch (mdev->ldev->dc.on_io_error) { + enum drbd_io_error_p ep; + + rcu_read_lock(); + ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + switch (ep) { case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ if (!forcedetach) { - if (printk_ratelimit()) - dev_err(DEV, "Local IO failed in %s." - "Passing error on...\n", where); + if (DRBD_ratelimit(5*HZ, 5)) + dev_err(DEV, "Local IO failed in %s.\n", where); + if (mdev->state.disk > D_INCONSISTENT) + _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); break; } /* NOTE fall through to detach case if forcedetach set */ case EP_DETACH: case EP_CALL_HELPER: + set_bit(WAS_IO_ERROR, &mdev->flags); if (mdev->state.disk > D_FAILED) { _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); - dev_err(DEV, "Local IO failed in %s." - "Detaching...\n", where); + dev_err(DEV, + "Local IO failed in %s. 
Detaching...\n", where); } break; } @@ -1841,9 +1822,9 @@ { if (error) { unsigned long flags; - spin_lock_irqsave(&mdev->req_lock, flags); + spin_lock_irqsave(&mdev->tconn->req_lock, flags); __drbd_chk_io_error_(mdev, forcedetach, where); - spin_unlock_irqrestore(&mdev->req_lock, flags); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); } } @@ -1855,9 +1836,9 @@ * BTW, for internal meta data, this happens to be the maximum capacity * we could agree upon with our peer node. */ -static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) { - switch (bdev->dc.meta_dev_idx) { + switch (meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + bdev->md.bm_offset; @@ -1867,13 +1848,30 @@ } } +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + int meta_dev_idx; + + rcu_read_lock(); + meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + rcu_read_unlock(); + + return _drbd_md_first_sector(meta_dev_idx, bdev); +} + /** * drbd_md_last_sector() - Return the last sector number of the meta data area * @bdev: Meta data block device. 
*/ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) { - switch (bdev->dc.meta_dev_idx) { + int meta_dev_idx; + + rcu_read_lock(); + meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + rcu_read_unlock(); + + switch (meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + MD_AL_OFFSET - 1; @@ -1894,12 +1892,18 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) { sector_t s; - switch (bdev->dc.meta_dev_idx) { + int meta_dev_idx; + + rcu_read_lock(); + meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + rcu_read_unlock(); + + switch (meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: s = drbd_get_capacity(bdev->backing_bdev) ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, - drbd_md_first_sector(bdev)) + _drbd_md_first_sector(meta_dev_idx, bdev)) : 0; break; case DRBD_MD_INDEX_FLEX_EXT: @@ -1925,9 +1929,15 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { - switch (bdev->dc.meta_dev_idx) { + int meta_dev_idx; + + rcu_read_lock(); + meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + rcu_read_unlock(); + + switch (meta_dev_idx) { default: /* external, some index */ - return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; + return MD_RESERVED_SECT * meta_dev_idx; case DRBD_MD_INDEX_INTERNAL: /* with drbd08, internal meta data is always "flexible" */ case DRBD_MD_INDEX_FLEX_INT: @@ -1948,13 +1958,6 @@ } static inline void -_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) -{ - list_add_tail(&w->list, &q->q); - up(&q->s); -} - -static inline void drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) { unsigned long flags; @@ -1976,50 +1979,45 @@ spin_unlock_irqrestore(&q->q_lock, flags); } -static inline void wake_asender(struct drbd_conf *mdev) -{ - if (test_bit(SIGNAL_ASENDER, &mdev->flags)) - force_sig(DRBD_SIG, mdev->asender.task); -} - -static 
inline void request_ping(struct drbd_conf *mdev) +static inline void wake_asender(struct drbd_tconn *tconn) { - set_bit(SEND_PING, &mdev->flags); - wake_asender(mdev); + if (test_bit(SIGNAL_ASENDER, &tconn->flags)) + force_sig(DRBD_SIG, tconn->asender.task); } -static inline int drbd_send_short_cmd(struct drbd_conf *mdev, - enum drbd_packets cmd) +static inline void request_ping(struct drbd_tconn *tconn) { - struct p_header h; - return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); + set_bit(SEND_PING, &tconn->flags); + wake_asender(tconn); } -static inline int drbd_send_ping(struct drbd_conf *mdev) -{ - struct p_header h; - return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); -} +extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *); +extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *); +extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); +extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); -static inline int drbd_send_ping_ack(struct drbd_conf *mdev) -{ - struct p_header h; - return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); -} +extern int drbd_send_ping(struct drbd_tconn *tconn); +extern int drbd_send_ping_ack(struct drbd_tconn *tconn); +extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); +extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state); static inline void drbd_thread_stop(struct drbd_thread *thi) { - _drbd_thread_stop(thi, FALSE, TRUE); + _drbd_thread_stop(thi, false, true); } static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) { - _drbd_thread_stop(thi, FALSE, FALSE); + _drbd_thread_stop(thi, false, false); } static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) { - _drbd_thread_stop(thi, TRUE, FALSE); + 
_drbd_thread_stop(thi, true, false); } /* counts how many answer packets packets we expect from our peer, @@ -2027,22 +2025,22 @@ * or implicit barrier packets as necessary. * increased: * w_send_barrier - * _req_mod(req, queue_for_net_write or queue_for_net_read); + * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); * it is much easier and equally valid to count what we queue for the * worker, even before it actually was queued or send. * (drbd_make_request_common; recovery path on read io-error) * decreased: * got_BarrierAck (respective tl_clear, tl_clear_barrier) - * _req_mod(req, data_received) + * _req_mod(req, DATA_RECEIVED) * [from receive_DataReply] - * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) + * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] * FIXME * for some reason it is NOT decreased in got_NegAck, * but in the resulting cleanup code from report_params. * we should try to remember the reason for that... 
- * _req_mod(req, send_failed or send_canceled) - * _req_mod(req, connection_lost_while_pending) + * _req_mod(req, SEND_FAILED or SEND_CANCELED) + * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) * [from tl_clear_barrier] */ static inline void inc_ap_pending(struct drbd_conf *mdev) @@ -2050,22 +2048,24 @@ atomic_inc(&mdev->ap_pending_cnt); } -#define ERR_IF_CNT_IS_NEGATIVE(which) \ - if (atomic_read(&mdev->which) < 0) \ +#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ + if (atomic_read(&mdev->which) < 0) \ dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ - __func__ , __LINE__ , \ - atomic_read(&mdev->which)) + func, line, \ + atomic_read(&mdev->which)) -#define dec_ap_pending(mdev) do { \ - typecheck(struct drbd_conf *, mdev); \ - if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ - wake_up(&mdev->misc_wait); \ - ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) +#define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__) +static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line) +{ + if (atomic_dec_and_test(&mdev->ap_pending_cnt)) + wake_up(&mdev->misc_wait); + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); +} /* counts how many resync-related answers we still expect from the peer * increase decrease * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) - * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER) + * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) * (or P_NEG_ACK with ID_SYNCER) */ static inline void inc_rs_pending(struct drbd_conf *mdev) @@ -2073,10 +2073,12 @@ atomic_inc(&mdev->rs_pending_cnt); } -#define dec_rs_pending(mdev) do { \ - typecheck(struct drbd_conf *, mdev); \ - atomic_dec(&mdev->rs_pending_cnt); \ - ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) +#define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__) +static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line) 
+{ + atomic_dec(&mdev->rs_pending_cnt); + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); +} /* counts how many answers we still need to send to the peer. * increased on @@ -2092,38 +2094,18 @@ atomic_inc(&mdev->unacked_cnt); } -#define dec_unacked(mdev) do { \ - typecheck(struct drbd_conf *, mdev); \ - atomic_dec(&mdev->unacked_cnt); \ - ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) - -#define sub_unacked(mdev, n) do { \ - typecheck(struct drbd_conf *, mdev); \ - atomic_sub(n, &mdev->unacked_cnt); \ - ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) - - -static inline void put_net_conf(struct drbd_conf *mdev) +#define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__) +static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line) { - if (atomic_dec_and_test(&mdev->net_cnt)) - wake_up(&mdev->misc_wait); + atomic_dec(&mdev->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); } -/** - * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there - * @mdev: DRBD device. - * - * You have to call put_net_conf() when finished working with mdev->net_conf. - */ -static inline int get_net_conf(struct drbd_conf *mdev) +#define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__) +static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line) { - int have_net_conf; - - atomic_inc(&mdev->net_cnt); - have_net_conf = mdev->state.conn >= C_UNCONNECTED; - if (!have_net_conf) - put_net_conf(mdev); - return have_net_conf; + atomic_sub(n, &mdev->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); } /** @@ -2137,10 +2119,22 @@ static inline void put_ldev(struct drbd_conf *mdev) { + int i = atomic_dec_return(&mdev->local_cnt); + + /* This may be called from some endio handler, + * so we must not sleep here. 
*/ + __release(local); - if (atomic_dec_and_test(&mdev->local_cnt)) + D_ASSERT(i >= 0); + if (i == 0) { + if (mdev->state.disk == D_DISKLESS) + /* even internal references gone, safe to destroy */ + drbd_ldev_destroy(mdev); + if (mdev->state.disk == D_FAILED) + /* all application IO references gone. */ + drbd_go_diskless(mdev); wake_up(&mdev->misc_wait); - D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); + } } #ifndef __CHECKER__ @@ -2148,6 +2142,10 @@ { int io_allowed; + /* never get a reference while D_DISKLESS */ + if (mdev->state.disk == D_DISKLESS) + return 0; + atomic_inc(&mdev->local_cnt); io_allowed = (mdev->state.disk >= mins); if (!io_allowed) @@ -2162,17 +2160,18 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, unsigned long *bits_left, unsigned int *per_mil_done) { - /* - * this is to break it at compile time when we change that - * (we may feel 4TB maximum storage per drbd is not enough) - */ + /* this is to break it at compile time when we change that, in case we + * want to support more than (1<<32) bits on a 32bit arch. */ typecheck(unsigned long, mdev->rs_total); /* note: both rs_total and rs_left are in bits, i.e. in * units of BM_BLOCK_SIZE. * for the percentage, we don't care. */ - *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) + *bits_left = mdev->ov_left; + else + *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; /* >> 10 to prevent overflow, * +1 to prevent division by zero */ if (*bits_left > mdev->rs_total) { @@ -2187,10 +2186,19 @@ *bits_left, mdev->rs_total, mdev->rs_failed); *per_mil_done = 0; } else { - /* make sure the calculation happens in long context */ - unsigned long tmp = 1000UL - - (*bits_left >> 10)*1000UL - / ((mdev->rs_total >> 10) + 1UL); + /* Make sure the division happens in long context. + * We allow up to one petabyte storage right now, + * at a granularity of 4k per bit that is 2**38 bits. 
+ * After shift right and multiplication by 1000, + * this should still fit easily into a 32bit long, + * so we don't need a 64bit division on 32bit arch. + * Note: currently we don't support such large bitmaps on 32bit + * arch anyways, but no harm done to be prepared for it here. + */ + unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10; + unsigned long left = *bits_left >> shift; + unsigned long total = 1UL + (mdev->rs_total >> shift); + unsigned long tmp = 1000UL - left * 1000UL/total; *per_mil_done = tmp; } } @@ -2201,16 +2209,20 @@ * maybe re-implement using semaphores? */ static inline int drbd_get_max_buffers(struct drbd_conf *mdev) { - int mxb = 1000000; /* arbitrary limit on open requests */ - if (get_net_conf(mdev)) { - mxb = mdev->net_conf->max_buffers; - put_net_conf(mdev); - } + struct net_conf *nc; + int mxb; + + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */ + rcu_read_unlock(); + return mxb; } -static inline int drbd_state_is_stable(union drbd_state s) +static inline int drbd_state_is_stable(struct drbd_conf *mdev) { + union drbd_dev_state s = mdev->state; /* DO NOT add a default clause, we want the compiler to warn us * for any newly introduced state we may have forgotten to add here */ @@ -2227,11 +2239,9 @@ case C_VERIFY_T: case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: - /* maybe stable, look at the disk state */ - break; - - /* no new io accepted during tansitional states - * like handshake or teardown */ + case C_AHEAD: + case C_BEHIND: + /* transitional states, IO allowed */ case C_DISCONNECTING: case C_UNCONNECTED: case C_TIMEOUT: @@ -2242,7 +2252,15 @@ case C_WF_REPORT_PARAMS: case C_STARTING_SYNC_S: case C_STARTING_SYNC_T: + break; + + /* Allow IO in BM exchange states with new protocols */ case C_WF_BITMAP_S: + if (mdev->tconn->agreed_pro_version < 96) + return 0; + break; + + /* no new io accepted in these states */ case C_WF_BITMAP_T: 
case C_WF_SYNC_UUID: case C_MASK: @@ -2256,12 +2274,12 @@ case D_OUTDATED: case D_CONSISTENT: case D_UP_TO_DATE: + case D_FAILED: /* disk state is stable as well. */ break; - /* no new io accepted during tansitional states */ + /* no new io accepted during transitional states */ case D_ATTACHING: - case D_FAILED: case D_NEGOTIATING: case D_UNKNOWN: case D_MASK: @@ -2272,59 +2290,63 @@ return 1; } -static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) +static inline int drbd_suspended(struct drbd_conf *mdev) +{ + struct drbd_tconn *tconn = mdev->tconn; + + return tconn->susp || tconn->susp_fen || tconn->susp_nod; +} + +static inline bool may_inc_ap_bio(struct drbd_conf *mdev) { int mxb = drbd_get_max_buffers(mdev); - if (mdev->state.susp) - return 0; + if (drbd_suspended(mdev)) + return false; if (test_bit(SUSPEND_IO, &mdev->flags)) - return 0; + return false; /* to avoid potential deadlock or bitmap corruption, * in various places, we only allow new application io * to start during "stable" states. */ /* no new io accepted when attaching or detaching the disk */ - if (!drbd_state_is_stable(mdev->state)) - return 0; + if (!drbd_state_is_stable(mdev)) + return false; /* since some older kernels don't have atomic_add_unless, * and we are within the spinlock anyways, we have this workaround. 
*/ if (atomic_read(&mdev->ap_bio_cnt) > mxb) - return 0; + return false; if (test_bit(BITMAP_IO, &mdev->flags)) - return 0; - return 1; + return false; + return true; } -/* I'd like to use wait_event_lock_irq, - * but I'm not sure when it got introduced, - * and not sure when it has 3 or 4 arguments */ -static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) -{ - /* compare with after_state_ch, - * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ - DEFINE_WAIT(wait); +static inline bool inc_ap_bio_cond(struct drbd_conf *mdev) +{ + bool rv = false; + + spin_lock_irq(&mdev->tconn->req_lock); + rv = may_inc_ap_bio(mdev); + if (rv) + atomic_inc(&mdev->ap_bio_cnt); + spin_unlock_irq(&mdev->tconn->req_lock); + + return rv; +} +static inline void inc_ap_bio(struct drbd_conf *mdev) +{ /* we wait here * as long as the device is suspended * until the bitmap is no longer on the fly during connection - * handshake as long as we would exeed the max_buffer limit. + * handshake as long as we would exceed the max_buffer limit. * * to avoid races with the reconnect code, * we need to atomic_inc within the spinlock. 
*/ - spin_lock_irq(&mdev->req_lock); - while (!__inc_ap_bio_cond(mdev)) { - prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mdev->req_lock); - schedule(); - finish_wait(&mdev->misc_wait, &wait); - spin_lock_irq(&mdev->req_lock); - } - atomic_add(one_or_two, &mdev->ap_bio_cnt); - spin_unlock_irq(&mdev->req_lock); + wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev)); } static inline void dec_ap_bio(struct drbd_conf *mdev) @@ -2340,47 +2362,15 @@ wake_up(&mdev->misc_wait); if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) - drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w); } } -static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) +static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) { + int changed = mdev->ed_uuid != val; mdev->ed_uuid = val; -} - -static inline int seq_cmp(u32 a, u32 b) -{ - /* we assume wrap around at 32bit. - * for wrap around at 24bit (old atomic_t), - * we'd have to - * a <<= 8; b <<= 8; - */ - return (s32)(a) - (s32)(b); -} -#define seq_lt(a, b) (seq_cmp((a), (b)) < 0) -#define seq_gt(a, b) (seq_cmp((a), (b)) > 0) -#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) -#define seq_le(a, b) (seq_cmp((a), (b)) <= 0) -/* CAUTION: please no side effects in arguments! */ -#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? 
(a) : (b))) - -static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) -{ - unsigned int m; - spin_lock(&mdev->peer_seq_lock); - m = seq_max(mdev->peer_seq, new_seq); - mdev->peer_seq = m; - spin_unlock(&mdev->peer_seq_lock); - if (m == new_seq) - wake_up(&mdev->seq_wait); -} - -static inline void drbd_update_congested(struct drbd_conf *mdev) -{ - struct sock *sk = mdev->data.socket->sk; - if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) - set_bit(NET_CONGESTED, &mdev->flags); + return changed; } static inline int drbd_queue_order_type(struct drbd_conf *mdev) @@ -2393,34 +2383,6 @@ return QUEUE_ORDERED_NONE; } -/* - * FIXME investigate what makes most sense: - * a) blk_run_queue(q); - * - * b) struct backing_dev_info *bdi; - * b1) bdi = &q->backing_dev_info; - * b2) bdi = mdev->ldev->backing_bdev->bd_inode->i_mapping->backing_dev_info; - * blk_run_backing_dev(bdi,NULL); - * - * c) generic_unplug(q) ? __generic_unplug(q) ? - * - * d) q->unplug_fn(q), which is what all the drivers/md/ stuff uses... - * - */ -static inline void drbd_blk_run_queue(struct request_queue *q) -{ - if (q && q->unplug_fn) - q->unplug_fn(q); -} - -static inline void drbd_kick_lo(struct drbd_conf *mdev) -{ - if (get_ldev(mdev)) { - drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev)); - put_ldev(mdev); - } -} - static inline void drbd_md_flush(struct drbd_conf *mdev) { int r; @@ -2428,11 +2390,36 @@ if (test_bit(MD_NO_BARRIER, &mdev->flags)) return; - r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); + r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL); if (r) { set_bit(MD_NO_BARRIER, &mdev->flags); dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); } } +/* resync bitmap */ +/* 16MB sized 'bitmap extent' to track syncer usage */ +struct bm_extent { + int rs_left; /* number of bits set (out of sync) in this extent. */ + int rs_failed; /* number of failed resync requests in this extent. 
*/ + unsigned long flags; + struct lc_element lce; +}; + +#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */ +#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */ +#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */ + +/* should be moved to idr.h */ +/** + * idr_for_each_entry - iterate over an idr's elements of a given type + * @idp: idr handle + * @entry: the type * to use as cursor + * @id: id entry's key + */ +#define idr_for_each_entry(idp, entry, id) \ + for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ + entry != NULL; \ + ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) + #endif diff -Nru drbd8-8.3.7/drbd/drbd_interval.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_interval.c --- drbd8-8.3.7/drbd/drbd_interval.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_interval.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,178 @@ +#include "drbd_interval.h" +#include "drbd_wrappers.h" + +/** + * interval_end - return end of @node + */ +static inline +sector_t interval_end(struct rb_node *node) +{ + struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); + return this->end; +} + +/** + * update_interval_end - recompute end of @node + * + * The end of an interval is the highest (start + (size >> 9)) value of this + * node and of its children. Called for @node and its parents whenever the end + * may have changed. 
+ */ +static void +update_interval_end(struct rb_node *node, void *__unused) +{ + struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); + sector_t end; + + end = this->sector + (this->size >> 9); + if (node->rb_left) { + sector_t left = interval_end(node->rb_left); + if (left > end) + end = left; + } + if (node->rb_right) { + sector_t right = interval_end(node->rb_right); + if (right > end) + end = right; + } + this->end = end; +} + +/** + * drbd_insert_interval - insert a new interval into a tree + */ +bool +drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + + BUG_ON(!IS_ALIGNED(this->size, 512)); + + while (*new) { + struct drbd_interval *here = + rb_entry(*new, struct drbd_interval, rb); + + parent = *new; + if (this->sector < here->sector) + new = &(*new)->rb_left; + else if (this->sector > here->sector) + new = &(*new)->rb_right; + else if (this < here) + new = &(*new)->rb_left; + else if (this > here) + new = &(*new)->rb_right; + else + return false; + } + + rb_link_node(&this->rb, parent, new); + rb_insert_color(&this->rb, root); + rb_augment_insert(&this->rb, update_interval_end, NULL); + return true; +} + +/** + * drbd_contains_interval - check if a tree contains a given interval + * @sector: start sector of @interval + * @interval: may not be a valid pointer + * + * Returns if the tree contains the node @interval with start sector @start. + * Does not dereference @interval until @interval is known to be a valid object + * in @tree. Returns %false if @interval is in the tree but with a different + * sector number. 
+ */ +bool +drbd_contains_interval(struct rb_root *root, sector_t sector, + struct drbd_interval *interval) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (sector < here->sector) + node = node->rb_left; + else if (sector > here->sector) + node = node->rb_right; + else if (interval < here) + node = node->rb_left; + else if (interval > here) + node = node->rb_right; + else + return true; + } + return false; +} + +/** + * drbd_remove_interval - remove an interval from a tree + */ +void +drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) +{ + struct rb_node *deepest; + + deepest = rb_augment_erase_begin(&this->rb); + rb_erase(&this->rb, root); + rb_augment_erase_end(deepest, update_interval_end, NULL); +} + +/** + * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) + * @sector: start sector + * @size: size, aligned to 512 bytes + * + * Returns an interval overlapping with [sector, sector + size), or NULL if + * there is none. When there is more than one overlapping interval in the + * tree, the interval with the lowest start sector is returned, and all other + * overlapping intervals will be on the right side of the tree, reachable with + * rb_next(). 
+ */ +struct drbd_interval * +drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) +{ + struct rb_node *node = root->rb_node; + struct drbd_interval *overlap = NULL; + sector_t end = sector + (size >> 9); + + BUG_ON(!IS_ALIGNED(size, 512)); + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (node->rb_left && + sector < interval_end(node->rb_left)) { + /* Overlap if any must be on left side */ + node = node->rb_left; + } else if (here->sector < end && + sector < here->sector + (here->size >> 9)) { + overlap = here; + break; + } else if (sector >= here->sector) { + /* Overlap if any must be on right side */ + node = node->rb_right; + } else + break; + } + return overlap; +} + +struct drbd_interval * +drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) +{ + sector_t end = sector + (size >> 9); + struct rb_node *node; + + for (;;) { + node = rb_next(&i->rb); + if (!node) + return NULL; + i = rb_entry(node, struct drbd_interval, rb); + if (i->sector >= end) + return NULL; + if (sector < i->sector + (i->size >> 9)) + return i; + } +} diff -Nru drbd8-8.3.7/drbd/drbd_interval.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_interval.h --- drbd8-8.3.7/drbd/drbd_interval.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_interval.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,68 @@ +#ifndef __DRBD_INTERVAL_H +#define __DRBD_INTERVAL_H + +#include +#include +#include + +/* Compatibility code for 2.6.16 (SLES10) */ +#ifndef rb_parent +#define rb_parent(r) ((r)->rb_parent) +#endif + +/* + * Kernels between mainline commit dd67d051 (v2.6.18-rc1) and 10fd48f2 + * (v2.6.19-rc1) have a broken version of RB_EMPTY_NODE(). + * + * RHEL5 kernels until at least 2.6.18-238.12.1.el5 have the broken definition. 
+ */ +#if !defined(RB_EMPTY_NODE) || LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,19) + +#undef RB_EMPTY_NODE +#define RB_EMPTY_NODE(node) (rb_parent(node) == node) + +#endif + +#ifndef RB_CLEAR_NODE +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->rb_parent = p; +} +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#endif +/* /Compatibility code */ + +struct drbd_interval { + struct rb_node rb; + sector_t sector; /* start sector of the interval */ + unsigned int size; /* size in bytes */ + sector_t end; /* highest interval end in subtree */ + int local:1 /* local or remote request? */; + int waiting:1; +}; + +static inline void drbd_clear_interval(struct drbd_interval *i) +{ + RB_CLEAR_NODE(&i->rb); +} + +static inline bool drbd_interval_empty(struct drbd_interval *i) +{ + return RB_EMPTY_NODE(&i->rb); +} + +extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); +extern bool drbd_contains_interval(struct rb_root *, sector_t, + struct drbd_interval *); +extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); +extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, + unsigned int); +extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, + unsigned int); + +#define drbd_for_each_overlap(i, root, sector, size) \ + for (i = drbd_find_overlap(root, sector, size); \ + i; \ + i = drbd_next_overlap(i, sector, size)) + +#endif /* __DRBD_INTERVAL_H */ diff -Nru drbd8-8.3.7/drbd/drbd_main.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_main.c --- drbd8-8.3.7/drbd/drbd_main.c 2010-01-07 09:09:33.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_main.c 2012-02-02 14:09:14.000000000 +0000 @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -45,30 +44,22 @@ #include #include #include -#ifdef HAVE_LINUX_BYTEORDER_SWABB_H -#include -#else -#include -#endif - #define __KERNEL_SYSCALLS__ #include #include +#include +#include 
#include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ - #include "drbd_vli.h" -struct after_state_chg_work { - struct drbd_work w; - union drbd_state os; - union drbd_state ns; - enum chg_state_flags flags; - struct completion *done; -}; +#ifdef COMPAT_HAVE_LINUX_BYTEORDER_SWABB_H +#include +#else +#include +#endif int drbdd_init(struct drbd_thread *); int drbd_worker(struct drbd_thread *); @@ -82,31 +73,18 @@ static int drbd_open(struct inode *inode, struct file *file); static int drbd_release(struct inode *inode, struct file *file); #endif -STATIC int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); -STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags); -STATIC int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); +STATIC int w_md_sync(struct drbd_work *w, int unused); STATIC void md_sync_timer_fn(unsigned long data); -STATIC int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); - -DEFINE_TRACE(drbd_unplug); -DEFINE_TRACE(drbd_uuid); -DEFINE_TRACE(drbd_ee); -DEFINE_TRACE(drbd_packet); -DEFINE_TRACE(drbd_md_io); -DEFINE_TRACE(drbd_epoch); -DEFINE_TRACE(drbd_netlink); -DEFINE_TRACE(drbd_actlog); -DEFINE_TRACE(drbd_bio); -DEFINE_TRACE(_drbd_resync); -DEFINE_TRACE(drbd_req); +STATIC int w_bitmap_io(struct drbd_work *w, int unused); +STATIC int w_go_diskless(struct drbd_work *w, int unused); MODULE_AUTHOR("Philipp Reisner , " "Lars Ellenberg "); MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); MODULE_VERSION(REL_VERSION); MODULE_LICENSE("GPL"); -MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" + __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); #include @@ -117,7 +95,6 
@@ module_param(minor_count, uint, 0444); module_param(disable_sendpage, bool, 0644); module_param(allow_oos, bool, 0); -module_param(cn_idx, uint, 0444); module_param(proc_details, int, 0644); #ifdef DRBD_ENABLE_FAULTS @@ -136,10 +113,9 @@ #endif /* module parameter, defined */ -unsigned int minor_count = 32; +unsigned int minor_count = DRBD_MINOR_COUNT_DEF; int disable_sendpage; int allow_oos; -unsigned int cn_idx = CN_IDX_DRBD; int proc_details; /* Detail level in proc drbd*/ /* Module parameter for setting the user mode helper program @@ -151,14 +127,17 @@ /* in 2.6.x, our device mapping and config info contains our virtual gendisks * as member "struct gendisk *vdisk;" */ -struct drbd_conf **minor_table; +struct idr minors; +struct list_head drbd_tconns; /* list of struct drbd_tconn */ struct kmem_cache *drbd_request_cache; -struct kmem_cache *drbd_ee_cache; /* epoch entries */ +struct kmem_cache *drbd_ee_cache; /* peer requests */ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. @@ -177,7 +156,24 @@ .release = drbd_release, }; -#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) +static void bio_destructor_drbd(struct bio *bio) +{ + bio_free(bio, drbd_md_io_bio_set); +} + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + bio->bi_destructor = bio_destructor_drbd; + return bio; +} #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will @@ -202,13 +198,13 @@ * DOC: The transfer log * * The transfer log is a single linked list of &struct drbd_tl_epoch objects. 
- * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail + * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail * of the list. There is always at least one &struct drbd_tl_epoch object. * * Each &struct drbd_tl_epoch has a circular double linked list of requests * attached. */ -STATIC int tl_init(struct drbd_conf *mdev) +STATIC int tl_init(struct drbd_tconn *tconn) { struct drbd_tl_epoch *b; @@ -220,30 +216,27 @@ INIT_LIST_HEAD(&b->w.list); b->next = NULL; b->br_number = 4711; - b->n_req = 0; + b->n_writes = 0; b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ - mdev->oldest_tle = b; - mdev->newest_tle = b; - INIT_LIST_HEAD(&mdev->out_of_sequence_requests); - - mdev->tl_hash = NULL; - mdev->tl_hash_s = 0; + tconn->oldest_tle = b; + tconn->newest_tle = b; + INIT_LIST_HEAD(&tconn->out_of_sequence_requests); + INIT_LIST_HEAD(&tconn->barrier_acked_requests); return 1; } -STATIC void tl_cleanup(struct drbd_conf *mdev) +STATIC void tl_cleanup(struct drbd_tconn *tconn) { - D_ASSERT(mdev->oldest_tle == mdev->newest_tle); - D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); - kfree(mdev->oldest_tle); - mdev->oldest_tle = NULL; - kfree(mdev->unused_spare_tle); - mdev->unused_spare_tle = NULL; - kfree(mdev->tl_hash); - mdev->tl_hash = NULL; - mdev->tl_hash_s = 0; + if (tconn->oldest_tle != tconn->newest_tle) + conn_err(tconn, "ASSERT FAILED: oldest_tle == newest_tle\n"); + if (!list_empty(&tconn->out_of_sequence_requests)) + conn_err(tconn, "ASSERT FAILED: list_empty(out_of_sequence_requests)\n"); + kfree(tconn->oldest_tle); + tconn->oldest_tle = NULL; + kfree(tconn->unused_spare_tle); + tconn->unused_spare_tle = NULL; } /** @@ -253,7 +246,7 @@ * * The caller must hold the req_lock. 
*/ -void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) +void _tl_add_barrier(struct drbd_tconn *tconn, struct drbd_tl_epoch *new) { struct drbd_tl_epoch *newest_before; @@ -261,15 +254,15 @@ INIT_LIST_HEAD(&new->w.list); new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ new->next = NULL; - new->n_req = 0; + new->n_writes = 0; - newest_before = mdev->newest_tle; + newest_before = tconn->newest_tle; /* never send a barrier number == 0, because that is special-cased * when using TCQ for our write ordering code */ new->br_number = (newest_before->br_number+1) ?: 1; - if (mdev->newest_tle != new) { - mdev->newest_tle->next = new; - mdev->newest_tle = new; + if (tconn->newest_tle != new) { + tconn->newest_tle->next = new; + tconn->newest_tle = new; } } @@ -283,38 +276,39 @@ * &struct drbd_tl_epoch objects this function will cause a termination * of the connection. */ -void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, - unsigned int set_size) +void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr, + unsigned int set_size) { + struct drbd_conf *mdev; struct drbd_tl_epoch *b, *nob; /* next old barrier */ struct list_head *le, *tle; struct drbd_request *r; - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&tconn->req_lock); - b = mdev->oldest_tle; + b = tconn->oldest_tle; /* first some paranoia code */ if (b == NULL) { - dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", - barrier_nr); + conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); goto bail; } if (b->br_number != barrier_nr) { - dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", - barrier_nr, b->br_number); + conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n", + barrier_nr, b->br_number); goto bail; } - if (b->n_req != set_size) { - dev_err(DEV, "BAD! 
BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", - barrier_nr, set_size, b->n_req); + if (b->n_writes != set_size) { + conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, b->n_writes); goto bail; } /* Clean up list of requests processed during current epoch */ list_for_each_safe(le, tle, &b->requests) { r = list_entry(le, struct drbd_request, tl_requests); - _req_mod(r, barrier_acked); + _req_mod(r, BARRIER_ACKED); } /* There could be requests on the list waiting for completion of the write to the local disk. To avoid corruptions of @@ -324,1158 +318,219 @@ the write acks - which would be a bug and violating write ordering. To not deadlock in case we lose connection while such requests are still pending, we need some way to find them for the - _req_mode(connection_lost_while_pending). + _req_mode(CONNECTION_LOST_WHILE_PENDING). These have been list_move'd to the out_of_sequence_requests list in - _req_mod(, barrier_acked) above. + _req_mod(, BARRIER_ACKED) above. */ - list_del_init(&b->requests); + list_splice_init(&b->requests, &tconn->barrier_acked_requests); + mdev = b->w.mdev; nob = b->next; - if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { - _tl_add_barrier(mdev, b); + if (test_and_clear_bit(CREATE_BARRIER, &tconn->flags)) { + _tl_add_barrier(tconn, b); if (nob) - mdev->oldest_tle = nob; + tconn->oldest_tle = nob; /* if nob == NULL b was the only barrier, and becomes the new - barrier. Therefore mdev->oldest_tle points already to b */ + barrier. 
Therefore tconn->oldest_tle points already to b */ } else { D_ASSERT(nob != NULL); - mdev->oldest_tle = nob; + tconn->oldest_tle = nob; kfree(b); } - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&tconn->req_lock); dec_ap_pending(mdev); return; bail: - spin_unlock_irq(&mdev->req_lock); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + spin_unlock_irq(&tconn->req_lock); + conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); } /** - * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * _tl_restart() - Walks the transfer log, and applies an action to all requests * @mdev: DRBD device. + * @what: The action/event to perform with all request objects * - * This is called after the connection to the peer was lost. The storage covered - * by the requests on the transfer gets marked as our of sync. Called from the - * receiver thread and the worker thread. + * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, + * RESTART_FROZEN_DISK_IO. */ -void tl_clear(struct drbd_conf *mdev) +void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) { - struct drbd_tl_epoch *b, *tmp; - struct list_head *le, *tle; - struct drbd_request *r; - int new_initial_bnr = net_random(); - - spin_lock_irq(&mdev->req_lock); + struct drbd_tl_epoch *b, *tmp, **pn; + struct list_head *le, *tle, carry_reads; + struct drbd_request *req; + int rv, n_writes, n_reads; - b = mdev->oldest_tle; + b = tconn->oldest_tle; + pn = &tconn->oldest_tle; while (b) { + n_writes = 0; + n_reads = 0; + INIT_LIST_HEAD(&carry_reads); list_for_each_safe(le, tle, &b->requests) { - r = list_entry(le, struct drbd_request, tl_requests); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. 
*/ - _req_mod(r, connection_lost_while_pending); - } - tmp = b->next; - - /* there could still be requests on that ring list, - * in case local io is still pending */ - list_del(&b->requests); - - /* dec_ap_pending corresponding to queue_barrier. - * the newest barrier may not have been queued yet, - * in which case w.cb is still NULL. */ - if (b->w.cb != NULL) - dec_ap_pending(mdev); - - if (b == mdev->newest_tle) { - /* recycle, but reinit! */ - D_ASSERT(tmp == NULL); - INIT_LIST_HEAD(&b->requests); - INIT_LIST_HEAD(&b->w.list); - b->w.cb = NULL; - b->br_number = new_initial_bnr; - b->n_req = 0; + req = list_entry(le, struct drbd_request, tl_requests); + rv = _req_mod(req, what); - mdev->oldest_tle = b; - break; + if (rv & MR_WRITE) + n_writes++; + if (rv & MR_READ) + n_reads++; } - kfree(b); - b = tmp; - } - - /* we expect this list to be empty. */ - D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); - - /* but just in case, clean it up anyways! */ - list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { - r = list_entry(le, struct drbd_request, tl_requests); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. */ - _req_mod(r, connection_lost_while_pending); - } - - /* ensure bit indicating barrier is required is clear */ - clear_bit(CREATE_BARRIER, &mdev->flags); - - spin_unlock_irq(&mdev->req_lock); -} - -#if DRBD_DEBUG_STATE_CHANGES -static void trace_st(struct drbd_conf *mdev, const unsigned long long seq, - const char *func, unsigned int line, - const char *name, union drbd_state s); -#endif - -/** - * cl_wide_st_chg() - TRUE if the state change is a cluster wide one - * @mdev: DRBD device. - * @os: old (current) state. - * @ns: new (wanted) state. 
- */ -STATIC int cl_wide_st_chg(struct drbd_conf *mdev, - union drbd_state os, union drbd_state ns) -{ - return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && - ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || - (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || - (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || - (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); -} - -int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, - union drbd_state mask, union drbd_state val) -{ -#if DRBD_DEBUG_STATE_CHANGES - static unsigned long long sseq = 0xf0000000LLU; - unsigned long seq; - unsigned int line = val.line; - const char *func = val.func; -#endif - - unsigned long flags; - union drbd_state os, ns; - int rv; - - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; -#if DRBD_DEBUG_STATE_CHANGES - seq = ++sseq; - trace_st(mdev, seq, func, line, "!os", os); - trace_st(mdev, seq, func, line, "!ns", ns); - ns.func = NULL; -#endif - rv = _drbd_set_state(mdev, ns, f, NULL); - ns = mdev->state; -#if DRBD_DEBUG_STATE_CHANGES - trace_st(mdev, seq, func, line, "=ns", ns); -#endif - spin_unlock_irqrestore(&mdev->req_lock, flags); - - return rv; -} - -/** - * drbd_force_state() - Impose a change which happens outside our control on our state - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. 
- */ -void drbd_force_state(struct drbd_conf *mdev, - union drbd_state mask, union drbd_state val) -{ - drbd_change_state(mdev, CS_HARD, mask, val); -} - -STATIC int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); -STATIC int is_valid_state_transition(struct drbd_conf *, - union drbd_state, union drbd_state); -STATIC union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort); -int drbd_send_state_req(struct drbd_conf *, - union drbd_state, union drbd_state); - -STATIC enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, - union drbd_state mask, union drbd_state val) -{ - union drbd_state os, ns; - unsigned long flags; - int rv; + tmp = b->next; - if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) - return SS_CW_SUCCESS; + if (n_writes) { + if (what == RESEND) { + b->n_writes = n_writes; + if (b->w.cb == NULL) { + b->w.cb = w_send_barrier; + inc_ap_pending(b->w.mdev); + set_bit(CREATE_BARRIER, &tconn->flags); + } - if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) - return SS_CW_FAILED_BY_PEER; + drbd_queue_work(&tconn->data.work, &b->w); + } + pn = &b->next; + } else { + if (n_reads) + list_add(&carry_reads, &b->requests); + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + /* dec_ap_pending corresponding to queue_barrier. + * the newest barrier may not have been queued yet, + * in which case w.cb is still NULL. */ + if (b->w.cb != NULL) + dec_ap_pending(b->w.mdev); + + if (b == tconn->newest_tle) { + /* recycle, but reinit! 
*/ + if (tmp != NULL) + conn_err(tconn, "ASSERT FAILED tmp == NULL"); + INIT_LIST_HEAD(&b->requests); + list_splice(&carry_reads, &b->requests); + INIT_LIST_HEAD(&b->w.list); + b->w.cb = NULL; + b->br_number = net_random(); + b->n_writes = 0; - rv = 0; - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; - ns = sanitize_state(mdev, os, ns, NULL); - - if (!cl_wide_st_chg(mdev, os, ns)) - rv = SS_CW_NO_NEED; - if (!rv) { - rv = is_valid_state(mdev, ns); - if (rv == SS_SUCCESS) { - rv = is_valid_state_transition(mdev, ns, os); - if (rv == SS_SUCCESS) - rv = 0; /* cont waiting, otherwise fail. */ + *pn = b; + break; + } + *pn = tmp; + kfree(b); } + b = tmp; + list_splice(&carry_reads, &b->requests); } - spin_unlock_irqrestore(&mdev->req_lock, flags); - return rv; -} - -/** - * drbd_req_state() - Perform an eventually cluster wide state change - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Should not be called directly, use drbd_request_state() or - * _drbd_request_state(). 
- */ -STATIC int drbd_req_state(struct drbd_conf *mdev, - union drbd_state mask, union drbd_state val, - enum chg_state_flags f) -{ -#if DRBD_DEBUG_STATE_CHANGES - static unsigned long long sseq = 0; - unsigned long seq; - unsigned int line = val.line; - const char *func = val.func; -#endif - - struct completion done; - unsigned long flags; - union drbd_state os, ns; - int rv; - - init_completion(&done); - - if (f & CS_SERIALIZE) - mutex_lock(&mdev->state_mutex); - - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; - ns = sanitize_state(mdev, os, ns, NULL); - -#if DRBD_DEBUG_STATE_CHANGES - seq = ++sseq; - trace_st(mdev, seq, func, line, "?os", os); - trace_st(mdev, seq, func, line, "?ns", ns); - ns.func = NULL; -#endif - - if (cl_wide_st_chg(mdev, os, ns)) { - rv = is_valid_state(mdev, ns); - if (rv == SS_SUCCESS) - rv = is_valid_state_transition(mdev, ns, os); - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (rv < SS_SUCCESS) { - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - - drbd_state_lock(mdev); - if (!drbd_send_state_req(mdev, mask, val)) { - drbd_state_unlock(mdev); - rv = SS_CW_FAILED_BY_PEER; - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; + /* Actions operating on the disk state, also want to work on + requests that got barrier acked. 
*/ + switch (what) { + case FAIL_FROZEN_DISK_IO: + case RESTART_FROZEN_DISK_IO: + list_for_each_safe(le, tle, &tconn->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); } - - wait_event(mdev->state_wait, - (rv = _req_st_cond(mdev, mask, val))); - - if (rv < SS_SUCCESS) { - drbd_state_unlock(mdev); - if (f & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - goto abort; - } - spin_lock_irqsave(&mdev->req_lock, flags); - os = mdev->state; - ns.i = (os.i & ~mask.i) | val.i; - rv = _drbd_set_state(mdev, ns, f, &done); - drbd_state_unlock(mdev); - } else { - rv = _drbd_set_state(mdev, ns, f, &done); - } - - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { - D_ASSERT(current != mdev->worker.task); - wait_for_completion(&done); - } - -abort: -#if DRBD_DEBUG_STATE_CHANGES - trace_st(mdev, seq, func, line, ":os", os); - trace_st(mdev, seq, func, line, ":ns", ns); -#endif - - if (f & CS_SERIALIZE) - mutex_unlock(&mdev->state_mutex); - - return rv; -} - -/** - * _drbd_request_state() - Request a state change (with flags) - * @mdev: DRBD device. - * @mask: mask of state bits to change. - * @val: value of new state bits. - * @f: flags - * - * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE - * flag, or when logging of failed state change requests is not desired. - */ -int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, - union drbd_state val, enum chg_state_flags f) -{ - int rv; - - wait_event(mdev->state_wait, - (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); - - return rv; -} - -#if DRBD_DEBUG_STATE_CHANGES -static void trace_st(struct drbd_conf *mdev, const unsigned long long seq, - const char *func, unsigned int line, - const char *name, union drbd_state s) -{ - - const struct task_struct *c = current; - const char *context = - c == mdev->worker.task ? "worker" : - c == mdev->receiver.task ? 
"receiver" : - c == mdev->asender.task ? "asender" : "other"; - - dev_info(DEV, " %8llx [%s] %s:%u %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", - seq, context, func, line, - name, - drbd_conn_str(s.conn), - drbd_role_str(s.role), - drbd_role_str(s.peer), - drbd_disk_str(s.disk), - drbd_disk_str(s.pdsk), - s.susp ? 's' : 'r', - s.aftr_isp ? 'a' : '-', - s.peer_isp ? 'p' : '-', - s.user_isp ? 'u' : '-' - ); -} -#else -#define trace_st(...) do { } while (0) -#endif - -STATIC void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) -{ - dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", - name, - drbd_conn_str(ns.conn), - drbd_role_str(ns.role), - drbd_role_str(ns.peer), - drbd_disk_str(ns.disk), - drbd_disk_str(ns.pdsk), - ns.susp ? 's' : 'r', - ns.aftr_isp ? 'a' : '-', - ns.peer_isp ? 'p' : '-', - ns.user_isp ? 'u' : '-' - ); -} - -void print_st_err(struct drbd_conf *mdev, - union drbd_state os, union drbd_state ns, int err) -{ - if (err == SS_IN_TRANSIENT_STATE) - return; - dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); - print_st(mdev, " state", os); - print_st(mdev, "wanted", ns); -} - - -#define drbd_peer_str drbd_role_str -#define drbd_pdsk_str drbd_disk_str - -#define drbd_susp_str(A) ((A) ? "1" : "0") -#define drbd_aftr_isp_str(A) ((A) ? "1" : "0") -#define drbd_peer_isp_str(A) ((A) ? "1" : "0") -#define drbd_user_isp_str(A) ((A) ? "1" : "0") - -#define PSC(A) \ - ({ if (ns.A != os.A) { \ - pbp += sprintf(pbp, #A "( %s -> %s ) ", \ - drbd_##A##_str(os.A), \ - drbd_##A##_str(ns.A)); \ - } }) - -/** - * is_valid_state() - Returns an SS_ error code if ns is not valid - * @mdev: DRBD device. - * @ns: State to consider. 
- */ -STATIC int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) -{ - /* See drbd_state_sw_errors in drbd_strings.c */ - - enum drbd_fencing_p fp; - int rv = SS_SUCCESS; - - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); + case CONNECTION_LOST_WHILE_PENDING: + case RESEND: + break; + default: + conn_err(tconn, "what = %d in _tl_restart()\n", what); } - - if (get_net_conf(mdev)) { - if (!mdev->net_conf->two_primaries && - ns.role == R_PRIMARY && ns.peer == R_PRIMARY) - rv = SS_TWO_PRIMARIES; - put_net_conf(mdev); - } - - if (rv <= 0) - /* already found a reason to abort */; - else if (ns.role == R_SECONDARY && mdev->open_cnt) - rv = SS_DEVICE_IN_USE; - - else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) - rv = SS_NO_UP_TO_DATE_DISK; - - else if (fp >= FP_RESOURCE && - ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) - rv = SS_PRIMARY_NOP; - - else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) - rv = SS_NO_UP_TO_DATE_DISK; - - else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) - rv = SS_NO_LOCAL_DISK; - - else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) - rv = SS_NO_REMOTE_DISK; - - else if ((ns.conn == C_CONNECTED || - ns.conn == C_WF_BITMAP_S || - ns.conn == C_SYNC_SOURCE || - ns.conn == C_PAUSED_SYNC_S) && - ns.disk == D_OUTDATED) - rv = SS_CONNECTED_OUTDATES; - - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - (mdev->sync_conf.verify_alg[0] == 0)) - rv = SS_NO_VERIFY_ALG; - - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - mdev->agreed_pro_version < 88) - rv = SS_NOT_SUPPORTED; - - return rv; } -/** - * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible - * @mdev: DRBD device. - * @ns: new state. - * @os: old state. 
- */ -STATIC int is_valid_state_transition(struct drbd_conf *mdev, - union drbd_state ns, union drbd_state os) -{ - int rv = SS_SUCCESS; - - if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && - os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; - - if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) - rv = SS_ALREADY_STANDALONE; - - if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) - rv = SS_IS_DISKLESS; - - if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) - rv = SS_NO_NET_CONFIG; - - if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) - rv = SS_LOWER_THAN_OUTDATED; - - if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) - rv = SS_IN_TRANSIENT_STATE; - - if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) - rv = SS_IN_TRANSIENT_STATE; - - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; - - if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && - ns.conn != os.conn && os.conn > C_CONNECTED) - rv = SS_RESYNC_RUNNING; - - if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && - os.conn < C_CONNECTED) - rv = SS_NEED_CONNECTION; - - return rv; -} /** - * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL * @mdev: DRBD device. - * @os: old state. - * @ns: new state. - * @warn_sync_abort: * - * When we loose connection, we have to set the state of the peers disk (pdsk) - * to D_UNKNOWN. This rule and many more along those lines are in this function. + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. 
*/ -STATIC union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, int *warn_sync_abort) +void tl_clear(struct drbd_tconn *tconn) { - enum drbd_fencing_p fp; - - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); - } - - /* Disallow Network errors to configure a device's network part */ - if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && - os.conn <= C_DISCONNECTING) - ns.conn = os.conn; - - /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ - if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) - ns.conn = os.conn; - - /* After C_DISCONNECTING only C_STANDALONE may follow */ - if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) - ns.conn = os.conn; - - if (ns.conn < C_CONNECTED) { - ns.peer_isp = 0; - ns.peer = R_UNKNOWN; - if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) - ns.pdsk = D_UNKNOWN; - } - - /* Clear the aftr_isp when becoming unconfigured */ - if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) - ns.aftr_isp = 0; - - if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) - ns.pdsk = D_UNKNOWN; - - /* Abort resync if a disk fails/detaches */ - if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && - (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn_sync_abort) - *warn_sync_abort = 1; - ns.conn = C_CONNECTED; - } - - if (ns.conn >= C_CONNECTED && - ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || - (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) { - switch (ns.conn) { - case C_WF_BITMAP_T: - case C_PAUSED_SYNC_T: - ns.disk = D_OUTDATED; - break; - case C_CONNECTED: - case C_WF_BITMAP_S: - case C_SYNC_SOURCE: - case C_PAUSED_SYNC_S: - ns.disk = D_UP_TO_DATE; - break; - case C_SYNC_TARGET: - ns.disk = D_INCONSISTENT; - dev_warn(DEV, "Implicitly set disk state Inconsistent!\n"); - break; - } - if (os.disk 
== D_OUTDATED && ns.disk == D_UP_TO_DATE) - dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n"); - } - - if (ns.conn >= C_CONNECTED && - (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { - switch (ns.conn) { - case C_CONNECTED: - case C_WF_BITMAP_T: - case C_PAUSED_SYNC_T: - case C_SYNC_TARGET: - ns.pdsk = D_UP_TO_DATE; - break; - case C_WF_BITMAP_S: - case C_PAUSED_SYNC_S: - ns.pdsk = D_OUTDATED; - break; - case C_SYNC_SOURCE: - ns.pdsk = D_INCONSISTENT; - dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n"); - break; - } - if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE) - dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n"); - } - - /* Connection breaks down before we finished "Negotiating" */ - if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && - get_ldev_if_state(mdev, D_NEGOTIATING)) { - if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { - ns.disk = mdev->new_state_tmp.disk; - ns.pdsk = mdev->new_state_tmp.pdsk; - } else { - dev_alert(DEV, "Connection lost while negotiating, no data!\n"); - ns.disk = D_DISKLESS; - ns.pdsk = D_UNKNOWN; - } - put_ldev(mdev); - } - - if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && - !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) - ns.susp = 1; - - if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { - if (ns.conn == C_SYNC_SOURCE) - ns.conn = C_PAUSED_SYNC_S; - if (ns.conn == C_SYNC_TARGET) - ns.conn = C_PAUSED_SYNC_T; - } else { - if (ns.conn == C_PAUSED_SYNC_S) - ns.conn = C_SYNC_SOURCE; - if (ns.conn == C_PAUSED_SYNC_T) - ns.conn = C_SYNC_TARGET; - } - - return ns; -} - -/* helper for __drbd_set_state */ -static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) -{ - if (cs == C_VERIFY_T) { - /* starting online verify from an arbitrary position - * does not fit well into the existing protocol. 
- * on C_VERIFY_T, we initialize ov_left and friends - * implicitly in receive_DataRequest once the - * first P_OV_REQUEST is received */ - mdev->ov_start_sector = ~(sector_t)0; - } else { - unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); - if (bit >= mdev->rs_total) - mdev->ov_start_sector = - BM_BIT_TO_SECT(mdev->rs_total - 1); - mdev->ov_position = mdev->ov_start_sector; - } -} - -/** - * __drbd_set_state() - Set a new DRBD state - * @mdev: DRBD device. - * @ns: new state. - * @flags: Flags - * @done: Optional completion, that will get completed after the after_state_ch() finished - * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. - */ -int __drbd_set_state(struct drbd_conf *mdev, - union drbd_state ns, enum chg_state_flags flags, - struct completion *done) -{ -#if DRBD_DEBUG_STATE_CHANGES - static unsigned long long sseq = 0xff000000LLU; - unsigned long long seq = 0; -#endif - union drbd_state os; - int rv = SS_SUCCESS; - int warn_sync_abort = 0; - struct after_state_chg_work *ascw; - - - os = mdev->state; - -#if DRBD_DEBUG_STATE_CHANGES - if (ns.func) { - seq = ++sseq; - trace_st(mdev, seq, ns.func, ns.line, "==os", os); - trace_st(mdev, seq, ns.func, ns.line, "==ns", ns); - } -#endif - - ns = sanitize_state(mdev, os, ns, &warn_sync_abort); - -#if DRBD_DEBUG_STATE_CHANGES - if (ns.func) - trace_st(mdev, seq, ns.func, ns.line, "==ns", ns); -#endif - - if (ns.i == os.i) - return SS_NOTHING_TO_DO; - - if (!(flags & CS_HARD)) { - /* pre-state-change checks ; only look at ns */ - /* See drbd_state_sw_errors in drbd_strings.c */ - - rv = is_valid_state(mdev, ns); - if (rv < SS_SUCCESS) { - /* If the old state was illegal as well, then let - this happen...*/ - - if (is_valid_state(mdev, os) == rv) { - dev_err(DEV, "Considering state change from bad state. 
" - "Error would be: '%s'\n", - drbd_set_st_err_str(rv)); - print_st(mdev, "old", os); - print_st(mdev, "new", ns); - rv = is_valid_state_transition(mdev, ns, os); - } - } else - rv = is_valid_state_transition(mdev, ns, os); - } - - if (rv < SS_SUCCESS) { - if (flags & CS_VERBOSE) - print_st_err(mdev, os, ns, rv); - return rv; - } - - if (warn_sync_abort) - dev_warn(DEV, "Resync aborted.\n"); - -#if DUMP_MD >= 2 - { - char *pbp, pb[300]; - pbp = pb; - *pbp = 0; - PSC(role); - PSC(peer); - PSC(conn); - PSC(disk); - PSC(pdsk); - PSC(susp); - PSC(aftr_isp); - PSC(peer_isp); - PSC(user_isp); - dev_info(DEV, "%s\n", pb); - } -#endif - -#if DRBD_DEBUG_STATE_CHANGES - if (ns.func) - trace_st(mdev, seq, ns.func, ns.line, ":=ns", ns); -#endif - - /* solve the race between becoming unconfigured, - * worker doing the cleanup, and - * admin reconfiguring us: - * on (re)configure, first set CONFIG_PENDING, - * then wait for a potentially exiting worker, - * start the worker, and schedule one no_op. - * then proceed with configuration. - */ - if (ns.disk == D_DISKLESS && - ns.conn == C_STANDALONE && - ns.role == R_SECONDARY && - !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) - set_bit(DEVICE_DYING, &mdev->flags); - - mdev->state.i = ns.i; - wake_up(&mdev->misc_wait); - wake_up(&mdev->state_wait); - - /* post-state-change actions */ - if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { - set_bit(STOP_SYNC_TIMER, &mdev->flags); - mod_timer(&mdev->resync_timer, jiffies); - } - - /* aborted verify run. 
log the last position */ - if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && - ns.conn < C_CONNECTED) { - mdev->ov_start_sector = - BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); - dev_info(DEV, "Online Verify reached sector %llu\n", - (unsigned long long)mdev->ov_start_sector); - } - - if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && - (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { - dev_info(DEV, "Syncer continues.\n"); - mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; - if (ns.conn == C_SYNC_TARGET) { - if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) - mod_timer(&mdev->resync_timer, jiffies); - /* This if (!test_bit) is only needed for the case - that a device that has ceased to used its timer, - i.e. it is already in drbd_resync_finished() gets - paused and resumed. */ - } - } + struct list_head *le, *tle; + struct drbd_request *r; - if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && - (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { - dev_info(DEV, "Resync suspended\n"); - mdev->rs_mark_time = jiffies; - if (ns.conn == C_PAUSED_SYNC_T) - set_bit(STOP_SYNC_TIMER, &mdev->flags); - } + spin_lock_irq(&tconn->req_lock); - if (os.conn == C_CONNECTED && - (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { - mdev->ov_position = 0; - mdev->rs_total = - mdev->rs_mark_left = drbd_bm_bits(mdev); - if (mdev->agreed_pro_version >= 90) - set_ov_position(mdev, ns.conn); - else - mdev->ov_start_sector = 0; - mdev->ov_left = mdev->rs_total - - BM_SECT_TO_BIT(mdev->ov_position); - mdev->rs_start = - mdev->rs_mark_time = jiffies; - mdev->ov_last_oos_size = 0; - mdev->ov_last_oos_start = 0; - - if (ns.conn == C_VERIFY_S) { - dev_info(DEV, "Starting Online Verify from sector %llu\n", - (unsigned long long)mdev->ov_position); - mod_timer(&mdev->resync_timer, jiffies); - } - } + _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING); - if (get_ldev(mdev)) { - u32 mdf = mdev->ldev->md.flags & 
~(MDF_CONSISTENT|MDF_PRIMARY_IND| - MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| - MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); - - if (test_bit(CRASHED_PRIMARY, &mdev->flags)) - mdf |= MDF_CRASHED_PRIMARY; - if (mdev->state.role == R_PRIMARY || - (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) - mdf |= MDF_PRIMARY_IND; - if (mdev->state.conn > C_WF_REPORT_PARAMS) - mdf |= MDF_CONNECTED_IND; - if (mdev->state.disk > D_INCONSISTENT) - mdf |= MDF_CONSISTENT; - if (mdev->state.disk > D_OUTDATED) - mdf |= MDF_WAS_UP_TO_DATE; - if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) - mdf |= MDF_PEER_OUT_DATED; - if (mdf != mdev->ldev->md.flags) { - mdev->ldev->md.flags = mdf; - drbd_md_mark_dirty(mdev); - } - if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) - drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); - put_ldev(mdev); - } + /* we expect this list to be empty. */ + if (!list_empty(&tconn->out_of_sequence_requests)) + conn_err(tconn, "ASSERT FAILED list_empty(&out_of_sequence_requests)\n"); - /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ - if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && - os.peer == R_SECONDARY && ns.peer == R_PRIMARY) - set_bit(CONSIDER_RESYNC, &mdev->flags); - - /* Receiver should clean up itself */ - if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) - drbd_thread_stop_nowait(&mdev->receiver); - - /* Now the receiver finished cleaning up itself, it should die */ - if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) - drbd_thread_stop_nowait(&mdev->receiver); - - /* Upon network failure, we need to restart the receiver. 
*/ - if (os.conn > C_TEAR_DOWN && - ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) - drbd_thread_restart_nowait(&mdev->receiver); - - ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); - if (ascw) { - ascw->os = os; - ascw->ns = ns; - ascw->flags = flags; - ascw->w.cb = w_after_state_ch; - ascw->done = done; - drbd_queue_work(&mdev->data.work, &ascw->w); - } else { - dev_warn(DEV, "Could not kmalloc an ascw\n"); + /* but just in case, clean it up anyways! */ + list_for_each_safe(le, tle, &tconn->out_of_sequence_requests) { + r = list_entry(le, struct drbd_request, tl_requests); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(r, CONNECTION_LOST_WHILE_PENDING); } - return rv; -} - -STATIC int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) -{ - struct after_state_chg_work *ascw = - container_of(w, struct after_state_chg_work, w); - after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); - if (ascw->flags & CS_WAIT_COMPLETE) { - D_ASSERT(ascw->done != NULL); - complete(ascw->done); - } - kfree(ascw); + /* ensure bit indicating barrier is required is clear */ + clear_bit(CREATE_BARRIER, &tconn->flags); - return 1; + spin_unlock_irq(&tconn->req_lock); } -static void abw_start_sync(struct drbd_conf *mdev, int rv) +void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what) { - if (rv) { - dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); - _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); - return; - } - - switch (mdev->state.conn) { - case C_STARTING_SYNC_T: - _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); - break; - case C_STARTING_SYNC_S: - drbd_start_resync(mdev, C_SYNC_SOURCE); - break; - } + spin_lock_irq(&tconn->req_lock); + _tl_restart(tconn, what); + spin_unlock_irq(&tconn->req_lock); } /** - * after_state_ch() - Perform after state change actions that may sleep + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in 
the TL * @mdev: DRBD device. - * @os: old state. - * @ns: new state. - * @flags: Flags */ -STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) +void tl_abort_disk_io(struct drbd_conf *mdev) { - enum drbd_fencing_p fp; - - if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { - clear_bit(CRASHED_PRIMARY, &mdev->flags); - if (mdev->p_uuid) - mdev->p_uuid[UI_FLAGS] &= ~((u64)2); - } - - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); - } - - /* Inform userspace about the change... */ - drbd_bcast_state(mdev, ns); + struct drbd_tconn *tconn = mdev->tconn; + struct drbd_tl_epoch *b; + struct list_head *le, *tle; + struct drbd_request *req; - if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) - drbd_khelper(mdev, "pri-on-incon-degr"); - - /* Here we have the actions that are performed after a - state change. This function might sleep */ - - if (fp == FP_STONITH && ns.susp) { - /* case1: The outdate peer handler is successful: - * case2: The connection was established again: */ - if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || - (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { - tl_clear(mdev); - spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); - } - } - /* Do not change the order of the if above and the two below... 
*/ - if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ - drbd_send_uuids(mdev); - drbd_send_state(mdev); - } - if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) - drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); - - /* Lost contact to peer's copy of the data */ - if ((os.pdsk >= D_INCONSISTENT && - os.pdsk != D_UNKNOWN && - os.pdsk != D_OUTDATED) - && (ns.pdsk < D_INCONSISTENT || - ns.pdsk == D_UNKNOWN || - ns.pdsk == D_OUTDATED)) { - kfree(mdev->p_uuid); - mdev->p_uuid = NULL; - if (get_ldev(mdev)) { - if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { - drbd_uuid_new_current(mdev); - drbd_send_uuids(mdev); - } - put_ldev(mdev); + spin_lock_irq(&tconn->req_lock); + b = tconn->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + if (req->w.mdev == mdev) + _req_mod(req, ABORT_DISK_IO); } + b = b->next; } - if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) - drbd_uuid_new_current(mdev); - - /* D_DISKLESS Peer becomes secondary */ - if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) - drbd_al_to_on_disk_bm(mdev); - put_ldev(mdev); - } - - /* Last part of the attaching process ... */ - if (ns.conn >= C_CONNECTED && - os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { - kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ - mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ - drbd_send_sizes(mdev, 0); /* to start sync... */ - drbd_send_uuids(mdev); - drbd_send_state(mdev); - } - - /* We want to pause/continue resync, tell peer. 
*/ - if (ns.conn >= C_CONNECTED && - ((os.aftr_isp != ns.aftr_isp) || - (os.user_isp != ns.user_isp))) - drbd_send_state(mdev); - - /* In case one of the isp bits got set, suspend other devices. */ - if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && - (ns.aftr_isp || ns.peer_isp || ns.user_isp)) - suspend_other_sg(mdev); - - /* Make sure the peer gets informed about eventual state - changes (ISP bits) while we were in WFReportParams. */ - if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev); - - /* We are in the progress to start a full sync... */ - if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || - (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) - drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); - - /* We are invalidating our self... */ - if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && - os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) - drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); - - if (os.disk > D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; - - eh = EP_PASS_ON; - if (get_ldev_if_state(mdev, D_FAILED)) { - eh = mdev->ldev->dc.on_io_error; - put_ldev(mdev); - } - - drbd_rs_cancel_all(mdev); - /* since get_ldev() only works as long as disk>=D_INCONSISTENT, - and it is D_DISKLESS here, local_cnt can only go down, it can - not increase... 
It will reach zero */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); - mdev->rs_total = 0; - mdev->rs_failed = 0; - atomic_set(&mdev->rs_pending_cnt, 0); - - spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); - spin_unlock_irq(&mdev->req_lock); - - if (eh == EP_CALL_HELPER) - drbd_khelper(mdev, "local-io-error"); - } - - if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { - - if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that my disk is broken.\n"); - else - dev_err(DEV, "Sending state in drbd_io_error() failed\n"); - } - - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); - lc_destroy(mdev->resync); - mdev->resync = NULL; - lc_destroy(mdev->act_log); - mdev->act_log = NULL; - __no_warn(local, - drbd_free_bc(mdev->ldev); - mdev->ldev = NULL;); - - if (mdev->md_io_tmpp) - __free_page(mdev->md_io_tmpp); - } - - /* Disks got bigger while they were detached */ - if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && - test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { - if (ns.conn == C_CONNECTED) - resync_after_online_grow(mdev); - } - - /* A resync finished or aborted, wake paused devices... */ - if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || - (os.peer_isp && !ns.peer_isp) || - (os.user_isp && !ns.user_isp)) - resume_next_sg(mdev); - - /* Upon network connection, we need to start the receiver */ - if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) - drbd_thread_start(&mdev->receiver); - - /* Terminate worker thread if we are unconfigured - it will be - restarted as needed... 
*/ - if (ns.disk == D_DISKLESS && - ns.conn == C_STANDALONE && - ns.role == R_SECONDARY) { - if (os.aftr_isp != ns.aftr_isp) - resume_next_sg(mdev); - /* set in __drbd_set_state, unless CONFIG_PENDING was set */ - if (test_bit(DEVICE_DYING, &mdev->flags)) - drbd_thread_stop_nowait(&mdev->worker); + list_for_each_safe(le, tle, &tconn->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + if (req->w.mdev == mdev) + _req_mod(req, ABORT_DISK_IO); } - drbd_md_sync(mdev); + spin_unlock_irq(&tconn->req_lock); } - STATIC int drbd_thread_setup(void *arg) { struct drbd_thread *thi = (struct drbd_thread *) arg; - struct drbd_conf *mdev = thi->mdev; + struct drbd_tconn *tconn = thi->tconn; unsigned long flags; - long timeout; int retval; - const char *me = - thi == &mdev->receiver ? "receiver" : - thi == &mdev->asender ? "asender" : - thi == &mdev->worker ? "worker" : "NONSENSE"; daemonize("drbd_thread"); - D_ASSERT(get_t_state(thi) == Running); - D_ASSERT(thi->task == NULL); /* state engine takes this lock (in drbd_thread_stop_nowait) * while holding the req_lock irqsave */ spin_lock_irqsave(&thi->t_lock, flags); @@ -1485,139 +540,137 @@ __set_current_state(TASK_UNINTERRUPTIBLE); complete(&thi->startstop); /* notify: thi->task is set. */ - timeout = schedule_timeout(10*HZ); - D_ASSERT(timeout != 0); + schedule_timeout(10*HZ); + snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", + thi->name[0], thi->tconn->name); restart: retval = thi->function(thi); spin_lock_irqsave(&thi->t_lock, flags); - /* if the receiver has been "Exiting", the last thing it did + /* if the receiver has been "EXITING", the last thing it did * was set the conn state to "StandAlone", * if now a re-connect request comes in, conn state goes C_UNCONNECTED, * and receiver thread will be "started". - * drbd_thread_start needs to set "Restarting" in that case. 
+ * drbd_thread_start needs to set "RESTARTING" in that case. * t_state check and assignment needs to be within the same spinlock, - * so either thread_start sees Exiting, and can remap to Restarting, - * or thread_start see None, and can proceed as normal. + * so either thread_start sees EXITING, and can remap to RESTARTING, + * or thread_start see NONE, and can proceed as normal. */ - if (thi->t_state == Restarting) { - dev_info(DEV, "Restarting %s thread\n", me); - thi->t_state = Running; + if (thi->t_state == RESTARTING) { + conn_info(tconn, "Restarting %s thread\n", thi->name); + thi->t_state = RUNNING; spin_unlock_irqrestore(&thi->t_lock, flags); goto restart; } thi->task = NULL; - thi->t_state = None; + thi->t_state = NONE; smp_mb(); /* THINK maybe two different completions? */ - complete(&thi->startstop); /* notify: thi->task unset. */ - dev_info(DEV, "Terminating %s thread\n", me); + complete_all(&thi->startstop); /* notify: thi->task unset. */ + conn_info(tconn, "Terminating %s thread\n", thi->name); spin_unlock_irqrestore(&thi->t_lock, flags); /* Release mod reference taken when thread was started */ + + kref_put(&tconn->kref, &conn_destroy); module_put(THIS_MODULE); return retval; } -STATIC void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, - int (*func) (struct drbd_thread *)) +STATIC void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi, + int (*func) (struct drbd_thread *), char *name) { spin_lock_init(&thi->t_lock); thi->task = NULL; - thi->t_state = None; + thi->t_state = NONE; thi->function = func; - thi->mdev = mdev; + thi->tconn = tconn; + strncpy(thi->name, name, ARRAY_SIZE(thi->name)); } int drbd_thread_start(struct drbd_thread *thi) { - int pid; - struct drbd_conf *mdev = thi->mdev; + struct drbd_tconn *tconn = thi->tconn; unsigned long flags; - const char *me = - thi == &mdev->receiver ? "receiver" : - thi == &mdev->asender ? "asender" : - thi == &mdev->worker ? 
"worker" : "NONSENSE"; + int pid; /* is used from state engine doing drbd_thread_stop_nowait, * while holding the req lock irqsave */ spin_lock_irqsave(&thi->t_lock, flags); switch (thi->t_state) { - case None: - dev_info(DEV, "Starting %s thread (from %s [%d])\n", - me, current->comm, current->pid); + case NONE: + conn_info(tconn, "Starting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); /* Get ref on module for thread - this is released when thread exits */ if (!try_module_get(THIS_MODULE)) { - dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); + conn_err(tconn, "Failed to get module reference in drbd_thread_start\n"); spin_unlock_irqrestore(&thi->t_lock, flags); - return FALSE; + return false; } + kref_get(&thi->tconn->kref); + init_completion(&thi->startstop); - D_ASSERT(thi->task == NULL); thi->reset_cpu_mask = 1; - thi->t_state = Running; + thi->t_state = RUNNING; spin_unlock_irqrestore(&thi->t_lock, flags); flush_signals(current); /* otherw. 
may get -ERESTARTNOINTR */ pid = kernel_thread(drbd_thread_setup, (void *) thi, CLONE_FS); if (pid < 0) { - dev_err(DEV, "Couldn't start thread (%d)\n", pid); + conn_err(tconn, "Couldn't start thread (%d)\n", pid); + kref_put(&tconn->kref, &conn_destroy); module_put(THIS_MODULE); - return FALSE; + return false; } /* waits until thi->task is set */ wait_for_completion(&thi->startstop); - if (thi->t_state != Running) - dev_err(DEV, "ASSERT FAILED: %s t_state == %d expected %d.\n", - me, thi->t_state, Running); + if (thi->t_state != RUNNING) + conn_err(tconn, "ASSERT FAILED: %s t_state == %d expected %d.\n", + thi->name, thi->t_state, RUNNING); if (thi->task) wake_up_process(thi->task); else - dev_err(DEV, "ASSERT FAILED thi->task is NULL where it should be set!?\n"); + conn_err(tconn, "ASSERT FAILED thi->task is NULL where it should be set!?\n"); break; - case Exiting: - thi->t_state = Restarting; - dev_info(DEV, "Restarting %s thread (from %s [%d])\n", - me, current->comm, current->pid); + case EXITING: + thi->t_state = RESTARTING; + conn_info(tconn, "Restarting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); /* fall through */ - case Running: - case Restarting: + case RUNNING: + case RESTARTING: default: spin_unlock_irqrestore(&thi->t_lock, flags); break; } - return TRUE; + return true; } void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) { - struct drbd_conf *mdev = thi->mdev; + struct drbd_tconn *tconn = thi->tconn; unsigned long flags; - enum drbd_thread_state ns = restart ? Restarting : Exiting; - const char *me = - thi == &mdev->receiver ? "receiver" : - thi == &mdev->asender ? "asender" : - thi == &mdev->worker ? "worker" : "NONSENSE"; + enum drbd_thread_state ns = restart ? 
RESTARTING : EXITING; /* may be called from state engine, holding the req lock irqsave */ spin_lock_irqsave(&thi->t_lock, flags); - /* dev_info(DEV, "drbd_thread_stop: %s [%d]: %s %d -> %d; %d\n", + /* conn_err(tconn, "drbd_thread_stop: %s [%d]: %s %d -> %d; %d\n", current->comm, current->pid, thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */ - if (thi->t_state == None) { + if (thi->t_state == NONE) { spin_unlock_irqrestore(&thi->t_lock, flags); if (restart) drbd_thread_start(thi); @@ -1635,23 +688,54 @@ init_completion(&thi->startstop); if (thi->task != current) force_sig(DRBD_SIGKILL, thi->task); - else - D_ASSERT(!wait); + else if (wait) + conn_err(tconn, "ASSERT FAILED: wait=%d\n", wait); } spin_unlock_irqrestore(&thi->t_lock, flags); if (wait) { - D_ASSERT(thi->task != current); + if (thi->task == current) { + conn_err(tconn, "ASSERT FAILED: Trying to wait for current task!\n"); + return; + } wait_for_completion(&thi->startstop); spin_lock_irqsave(&thi->t_lock, flags); - D_ASSERT(thi->task == NULL); - if (thi->t_state != None) - dev_err(DEV, "ASSERT FAILED: %s t_state == %d expected %d.\n", - me, thi->t_state, None); + if (thi->t_state != NONE) + conn_err(tconn, "ASSERT FAILED: %s t_state == %d expected %d.\n", + thi->name, thi->t_state, NONE); spin_unlock_irqrestore(&thi->t_lock, flags); } } +static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task) +{ + struct drbd_thread *thi = + task == tconn->receiver.task ? &tconn->receiver : + task == tconn->asender.task ? &tconn->asender : + task == tconn->worker.task ? &tconn->worker : NULL; + + return thi; +} + +char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task) +{ + struct drbd_thread *thi = drbd_task_to_thread(tconn, task); + return thi ? 
thi->name : task->comm; +} + +int conn_lowest_minor(struct drbd_tconn *tconn) +{ + struct drbd_conf *mdev; + int vnr = 0, m; + + rcu_read_lock(); + mdev = idr_get_next(&tconn->volumes, &vnr); + m = mdev ? mdev_to_minor(mdev) : -1; + rcu_read_unlock(); + + return m; +} + #ifdef CONFIG_SMP /** * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs @@ -1660,223 +744,343 @@ * Forces all threads of a device onto the same CPU. This is beneficial for * DRBD's performance. May be overwritten by user's configuration. */ -void drbd_calc_cpu_mask(struct drbd_conf *mdev) +void drbd_calc_cpu_mask(struct drbd_tconn *tconn) { int ord, cpu; /* user override. */ - if (cpumask_weight(mdev->cpu_mask)) + if (cpumask_weight(tconn->cpu_mask)) return; - ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); + ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask); for_each_online_cpu(cpu) { if (ord-- == 0) { - cpumask_set_cpu(cpu, mdev->cpu_mask); + cpumask_set_cpu(cpu, tconn->cpu_mask); return; } } /* should not be reached */ - cpumask_setall(mdev->cpu_mask); + cpumask_setall(tconn->cpu_mask); } /** * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread * @mdev: DRBD device. + * @thi: drbd_thread object * * call in the "main loop" of _all_ threads, no need for any mutex, current won't die * prematurely. */ -void drbd_thread_current_set_cpu(struct drbd_conf *mdev) +void drbd_thread_current_set_cpu(struct drbd_thread *thi) { struct task_struct *p = current; - struct drbd_thread *thi = - p == mdev->asender.task ? &mdev->asender : - p == mdev->receiver.task ? &mdev->receiver : - p == mdev->worker.task ? 
&mdev->worker : - NULL; - ERR_IF(thi == NULL) - return; + if (!thi->reset_cpu_mask) return; thi->reset_cpu_mask = 0; - set_cpus_allowed_ptr(p, mdev->cpu_mask); + set_cpus_allowed_ptr(p, thi->tconn->cpu_mask); } #endif -/* the appropriate socket mutex must be held already */ -int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, - enum drbd_packets cmd, struct p_header *h, - size_t size, unsigned msg_flags) +/** + * drbd_header_size - size of a packet header + * + * The header size is a multiple of 8, so any payload following the header is + * word aligned on 64-bit architectures. (The bitmap send and receive code + * relies on this.) + */ +unsigned int drbd_header_size(struct drbd_tconn *tconn) { - int sent, ok; + if (tconn->agreed_pro_version >= 100) { + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); + return sizeof(struct p_header100); + } else { + BUILD_BUG_ON(sizeof(struct p_header80) != + sizeof(struct p_header95)); + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); + return sizeof(struct p_header80); + } +} - ERR_IF(!h) return FALSE; - ERR_IF(!size) return FALSE; +static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be32(DRBD_MAGIC); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size); + return sizeof(struct p_header80); +} - h->magic = BE_DRBD_MAGIC; +static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be16(DRBD_MAGIC_BIG); h->command = cpu_to_be16(cmd); - h->length = cpu_to_be16(size-sizeof(struct p_header)); + h->length = cpu_to_be32(size); + return sizeof(struct p_header95); +} - trace_drbd_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__); - sent = drbd_send(mdev, sock, h, size, msg_flags); +static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, + int size, int vnr) +{ + h->magic = cpu_to_be32(DRBD_MAGIC_100); + h->volume = cpu_to_be16(vnr); + h->command = 
cpu_to_be16(cmd); + h->length = cpu_to_be32(size); + h->pad = 0; + return sizeof(struct p_header100); +} - ok = (sent == size); - if (!ok) - dev_err(DEV, "short sent %s size=%d sent=%d\n", - cmdname(cmd), (int)size, sent); - return ok; +static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr, + void *buffer, enum drbd_packet cmd, int size) +{ + if (tconn->agreed_pro_version >= 100) + return prepare_header100(buffer, cmd, size, vnr); + else if (tconn->agreed_pro_version >= 95 && + size > DRBD_MAX_SIZE_H80_PACKET) + return prepare_header95(buffer, cmd, size); + else + return prepare_header80(buffer, cmd, size); } -/* don't pass the socket. we may only look at it - * when we hold the appropriate socket mutex. - */ -int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, - enum drbd_packets cmd, struct p_header *h, size_t size) +static void *__conn_prepare_command(struct drbd_tconn *tconn, + struct drbd_socket *sock) { - int ok = 0; - struct socket *sock; + if (!sock->socket) + return NULL; + return sock->sbuf + drbd_header_size(tconn); +} - if (use_data_socket) { - mutex_lock(&mdev->data.mutex); - sock = mdev->data.socket; - } else { - mutex_lock(&mdev->meta.mutex); - sock = mdev->meta.socket; - } +void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock) +{ + void *p; - /* drbd_disconnect() could have called drbd_free_sock() - * while we were waiting in down()... 
*/ - if (likely(sock != NULL)) - ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); + mutex_lock(&sock->mutex); + p = __conn_prepare_command(tconn, sock); + if (!p) + mutex_unlock(&sock->mutex); - if (use_data_socket) - mutex_unlock(&mdev->data.mutex); - else - mutex_unlock(&mdev->meta.mutex); - return ok; + return p; } -int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, - size_t size) +void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock) { - struct p_header h; - int ok; + return conn_prepare_command(mdev->tconn, sock); +} - h.magic = BE_DRBD_MAGIC; - h.command = cpu_to_be16(cmd); - h.length = cpu_to_be16(size); +static int __send_command(struct drbd_tconn *tconn, int vnr, + struct drbd_socket *sock, enum drbd_packet cmd, + unsigned int header_size, void *data, + unsigned int size) +{ + int msg_flags; + int err; - if (!drbd_get_data_sock(mdev)) - return 0; + /* + * Called with @data == NULL and the size of the data blocks in @size + * for commands that send data blocks. For those commands, omit the + * MSG_MORE flag: this will increase the likelihood that data blocks + * which are page aligned on the sender will end up page aligned on the + * receiver. + */ + msg_flags = data ? 
MSG_MORE : 0; - trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__); + header_size += prepare_header(tconn, vnr, sock->sbuf, cmd, + header_size + size); + err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size, + msg_flags); + if (data && !err) + err = drbd_send_all(tconn, sock->socket, data, size, 0); + return err; +} - ok = (sizeof(h) == - drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); - ok = ok && (size == - drbd_send(mdev, mdev->data.socket, data, size, 0)); +static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + return __send_command(tconn, 0, sock, cmd, header_size, data, size); +} - drbd_put_data_sock(mdev); +int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + int err; - return ok; + err = __conn_send_command(tconn, sock, cmd, header_size, data, size); + mutex_unlock(&sock->mutex); + return err; } -int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) +int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) { - struct p_rs_param_89 *p; - struct socket *sock; - int size, rv; - const int apv = mdev->agreed_pro_version; + int err; - size = apv <= 87 ? sizeof(struct p_rs_param) - : apv == 88 ? 
sizeof(struct p_rs_param) - + strlen(mdev->sync_conf.verify_alg) + 1 - : /* 89 */ sizeof(struct p_rs_param_89); + err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size, + data, size); + mutex_unlock(&sock->mutex); + return err; +} + +int drbd_send_ping(struct drbd_tconn *tconn) +{ + struct drbd_socket *sock; + + sock = &tconn->meta; + if (!conn_prepare_command(tconn, sock)) + return -EIO; + return conn_send_command(tconn, sock, P_PING, 0, NULL, 0); +} + +int drbd_send_ping_ack(struct drbd_tconn *tconn) +{ + struct drbd_socket *sock; - /* used from admin command context and receiver/worker context. - * to avoid kmalloc, grab the socket right here, - * then use the pre-allocated sbuf there */ - mutex_lock(&mdev->data.mutex); - sock = mdev->data.socket; + sock = &tconn->meta; + if (!conn_prepare_command(tconn, sock)) + return -EIO; + return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0); +} - if (likely(sock != NULL)) { - enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; +int drbd_send_sync_param(struct drbd_conf *mdev) +{ + struct drbd_socket *sock; + struct p_rs_param_95 *p; + int size; + const int apv = mdev->tconn->agreed_pro_version; + enum drbd_packet cmd; + struct net_conf *nc; + struct disk_conf *dc; - p = &mdev->data.sbuf.rs_param_89; + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; - /* initialize verify_alg and csums_alg */ - memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); - p->rate = cpu_to_be32(sc->rate); + size = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + strlen(nc->verify_alg) + 1 + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); - if (apv >= 88) - strcpy(p->verify_alg, mdev->sync_conf.verify_alg); - if (apv >= 89) - strcpy(p->csums_alg, mdev->sync_conf.csums_alg); + cmd = apv >= 89 ? 
P_SYNC_PARAM89 : P_SYNC_PARAM; - rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); - } else - rv = 0; /* not ok */ + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); - mutex_unlock(&mdev->data.mutex); + if (get_ldev(mdev)) { + dc = rcu_dereference(mdev->ldev->disk_conf); + p->resync_rate = cpu_to_be32(dc->resync_rate); + p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); + p->c_delay_target = cpu_to_be32(dc->c_delay_target); + p->c_fill_target = cpu_to_be32(dc->c_fill_target); + p->c_max_rate = cpu_to_be32(dc->c_max_rate); + put_ldev(mdev); + } else { + p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); + p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); + p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); + p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); + p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); + } - return rv; + if (apv >= 88) + strcpy(p->verify_alg, nc->verify_alg); + if (apv >= 89) + strcpy(p->csums_alg, nc->csums_alg); + rcu_read_unlock(); + + return drbd_send_command(mdev, sock, cmd, size, NULL, 0); } -int drbd_send_protocol(struct drbd_conf *mdev) +int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd) { + struct drbd_socket *sock; struct p_protocol *p; - int size, rv; + struct net_conf *nc; + int size, cf; - size = sizeof(struct p_protocol); + sock = &tconn->data; + p = __conn_prepare_command(tconn, sock); + if (!p) + return -EIO; + + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + + if (nc->tentative && tconn->agreed_pro_version < 92) { + rcu_read_unlock(); + mutex_unlock(&sock->mutex); + conn_err(tconn, "--dry-run is not supported by peer"); + return -EOPNOTSUPP; + } + + size = sizeof(*p); + if (tconn->agreed_pro_version >= 87) + size += strlen(nc->integrity_alg) + 1; + + p->protocol = cpu_to_be32(nc->wire_protocol); + p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); + p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); + p->after_sb_2p = 
cpu_to_be32(nc->after_sb_2p); + p->two_primaries = cpu_to_be32(nc->two_primaries); + cf = 0; + if (nc->discard_my_data) + cf |= CF_DISCARD_MY_DATA; + if (nc->tentative) + cf |= CF_DRY_RUN; + p->conn_flags = cpu_to_be32(cf); + + if (tconn->agreed_pro_version >= 87) + strcpy(p->integrity_alg, nc->integrity_alg); + rcu_read_unlock(); - if (mdev->agreed_pro_version >= 87) - size += strlen(mdev->net_conf->integrity_alg) + 1; + return __conn_send_command(tconn, sock, cmd, size, NULL, 0); +} - /* we must not recurse into our own queue, - * as that is blocked during handshake */ - p = kmalloc(size, GFP_NOIO); - if (p == NULL) - return 0; +int drbd_send_protocol(struct drbd_tconn *tconn) +{ + int err; - p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); - p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); - p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); - p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); - p->want_lose = cpu_to_be32(mdev->net_conf->want_lose); - p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); - - if (mdev->agreed_pro_version >= 87) - strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); - - rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, - (struct p_header *)p, size); - kfree(p); - return rv; + mutex_lock(&tconn->data.mutex); + err = __drbd_send_protocol(tconn, P_PROTOCOL); + mutex_unlock(&tconn->data.mutex); + + return err; } int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) { - struct p_uuids p; + struct drbd_socket *sock; + struct p_uuids *p; int i; if (!get_ldev_if_state(mdev, D_NEGOTIATING)) - return 1; + return 0; + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) { + put_ldev(mdev); + return -EIO; + } for (i = UI_CURRENT; i < UI_SIZE; i++) - p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; + p->uuid[i] = mdev->ldev ? 
cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; mdev->comm_bm_set = drbd_bm_total_weight(mdev); - p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); - uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; + p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); + rcu_read_lock(); + uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0; + rcu_read_unlock(); uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; - p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); + p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); put_ldev(mdev); - - return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, - (struct p_header *)&p, sizeof(p)); + return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0); } int drbd_send_uuids(struct drbd_conf *mdev) @@ -1889,103 +1093,210 @@ return _drbd_send_uuids(mdev, 8); } +void drbd_print_uuids(struct drbd_conf *mdev, const char *text) +{ + if (get_ldev_if_state(mdev, D_NEGOTIATING)) { + u64 *uuid = mdev->ldev->md.uuid; + dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END]); + put_ldev(mdev); + } else { + dev_info(DEV, "%s effective data uuid: %016llX\n", + text, + (unsigned long long)mdev->ed_uuid); + } +} -int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) +void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) { - struct p_rs_uuid p; + struct drbd_socket *sock; + struct p_rs_uuid *p; + u64 uuid; + + D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - p.uuid = cpu_to_be64(val); + uuid = mdev->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(mdev, UI_BITMAP, uuid); + drbd_print_uuids(mdev, "updated sync UUID"); + drbd_md_sync(mdev); - return drbd_send_cmd(mdev, USE_DATA_SOCKET, 
P_SYNC_UUID, - (struct p_header *)&p, sizeof(p)); + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (p) { + p->uuid = cpu_to_be64(uuid); + drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); + } } -int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) +int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags) { - struct p_sizes p; + struct drbd_socket *sock; + struct p_sizes *p; sector_t d_size, u_size; - int q_order_type; - int ok; + int q_order_type, max_bio_size; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { D_ASSERT(mdev->ldev->backing_bdev); d_size = drbd_get_max_capacity(mdev->ldev); - u_size = mdev->ldev->dc.disk_size; + rcu_read_lock(); + u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; + rcu_read_unlock(); q_order_type = drbd_queue_order_type(mdev); - p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); + max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE); put_ldev(mdev); } else { d_size = 0; u_size = 0; q_order_type = QUEUE_ORDERED_NONE; + max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } - p.d_size = cpu_to_be64(d_size); - p.u_size = cpu_to_be64(u_size); - p.c_size = cpu_to_be64(trigger_reply ? 
0 : drbd_get_capacity(mdev->this_bdev)); - p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); - p.queue_order_type = cpu_to_be32(q_order_type); + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; - ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, - (struct p_header *)&p, sizeof(p)); - return ok; + if (mdev->tconn->agreed_pro_version <= 94) + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + else if (mdev->tconn->agreed_pro_version < 100) + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE_P95); + + p->d_size = cpu_to_be64(d_size); + p->u_size = cpu_to_be64(u_size); + p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); + p->max_bio_size = cpu_to_be32(max_bio_size); + p->queue_order_type = cpu_to_be16(q_order_type); + p->dds_flags = cpu_to_be16(flags); + return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0); } /** - * drbd_send_state() - Sends the drbd state to the peer + * drbd_send_current_state() - Sends the drbd state to the peer * @mdev: DRBD device. */ -int drbd_send_state(struct drbd_conf *mdev) +int drbd_send_current_state_(struct drbd_conf *mdev, const char *func, unsigned int line) { - struct socket *sock; - struct p_state p; - int ok = 0; + struct drbd_socket *sock; + struct p_state *p; - /* Grab state lock so we wont send state if we're in the middle - * of a cluster wide state change on another thread */ - drbd_state_lock(mdev); + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ + return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); +} - mutex_lock(&mdev->data.mutex); +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @mdev: DRBD device. + * @state: the state to send, not necessarily the current state. 
+ * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state_(struct drbd_conf *mdev, union drbd_state state, const char *func, unsigned int line) +{ + struct drbd_socket *sock; + struct p_state *p; - p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ - sock = mdev->data.socket; + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(state.i); /* Within the send mutex */ + return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0); +} - if (likely(sock != NULL)) { - ok = _drbd_send_cmd(mdev, sock, P_STATE, - (struct p_header *)&p, sizeof(p), 0); - } +int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) +{ + struct drbd_socket *sock; + struct p_req_state *p; - mutex_unlock(&mdev->data.mutex); + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); +} - drbd_state_unlock(mdev); - return ok; +int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) +{ + enum drbd_packet cmd; + struct drbd_socket *sock; + struct p_req_state *p; + + cmd = tconn->agreed_pro_version < 100 ? 
P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; + sock = &tconn->data; + p = conn_prepare_command(tconn, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); } -int drbd_send_state_req(struct drbd_conf *mdev, - union drbd_state mask, union drbd_state val) +void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode) { - struct p_req_state p; + struct drbd_socket *sock; + struct p_req_state_reply *p; + + sock = &mdev->tconn->meta; + p = drbd_prepare_command(mdev, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); + } +} - p.mask = cpu_to_be32(mask.i); - p.val = cpu_to_be32(val.i); +void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode) +{ + struct drbd_socket *sock; + struct p_req_state_reply *p; + enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; - return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, - (struct p_header *)&p, sizeof(p)); + sock = &tconn->meta; + p = conn_prepare_command(tconn, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0); + } } -int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) +static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) { - struct p_req_state_reply p; + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} - p.retcode = cpu_to_be32(retcode); +static void dcbp_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 
0x80 : 0); +} - return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, - (struct p_header *)&p, sizeof(p)); +static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); } int fill_bitmap_rle_bits(struct drbd_conf *mdev, - struct p_compressed_bm *p, - struct bm_xfer_ctx *c) + struct p_compressed_bm *p, + unsigned int size, + struct bm_xfer_ctx *c) { struct bitstream bs; unsigned long plain_bits; @@ -1993,19 +1304,21 @@ unsigned long rl; unsigned len; unsigned toggle; - int bits; + int bits, use_rle; /* may we use this feature? */ - if ((mdev->sync_conf.use_rle == 0) || - (mdev->agreed_pro_version < 90)) - return 0; + rcu_read_lock(); + use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle; + rcu_read_unlock(); + if (!use_rle || mdev->tconn->agreed_pro_version < 90) + return 0; if (c->bit_offset >= c->bm_bits) return 0; /* nothing to do. */ /* use at most thus many bytes */ - bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); - memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); + bitstream_init(&bs, p->code, size, 0); + memset(p->code, 0, size); /* plain bits covered in this code string */ plain_bits = 0; @@ -2027,12 +1340,12 @@ if (rl == 0) { /* the first checked bit was set, * store start value, */ - DCBP_set_start(p, 1); + dcbp_set_start(p, 1); /* but skip encoding of zero run length */ toggle = !toggle; continue; } - DCBP_set_start(p, 0); + dcbp_set_start(p, 0); } /* paranoia: catch zero runlength. 
@@ -2072,76 +1385,81 @@ bm_xfer_ctx_bit_to_word_offset(c); /* store pad_bits */ - DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); + dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); return len; } -enum { OK, FAILED, DONE } -send_bitmap_rle_or_plain(struct drbd_conf *mdev, - struct p_header *h, struct bm_xfer_ctx *c) -{ - struct p_compressed_bm *p = (void*)h; - unsigned long num_words; - int len; - int ok; - - len = fill_bitmap_rle_bits(mdev, p, c); +/** + * send_bitmap_rle_or_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +STATIC int +send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c) +{ + struct drbd_socket *sock = &mdev->tconn->data; + unsigned int header_size = drbd_header_size(mdev->tconn); + struct p_compressed_bm *p = sock->sbuf + header_size; + int len, err; + len = fill_bitmap_rle_bits(mdev, p, + DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); if (len < 0) - return FAILED; + return -EIO; if (len) { - DCBP_set_code(p, RLE_VLI_Bits); - ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, - sizeof(*p) + len, 0); - + dcbp_set_code(p, RLE_VLI_Bits); + err = __send_command(mdev->tconn, mdev->vnr, sock, + P_COMPRESSED_BITMAP, sizeof(*p) + len, + NULL, 0); c->packets[0]++; - c->bytes[0] += sizeof(*p) + len; + c->bytes[0] += header_size + sizeof(*p) + len; if (c->bit_offset >= c->bm_bits) len = 0; /* DONE */ } else { /* was not compressible. * send a buffer full of plain text bits instead. 
*/ - num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); - len = num_words * sizeof(long); + unsigned int data_size; + unsigned long num_words; + unsigned long *p = sock->sbuf + header_size; + + data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + len = num_words * sizeof(*p); if (len) - drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); - ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, - h, sizeof(struct p_header) + len, 0); + drbd_bm_get_lel(mdev, c->word_offset, num_words, p); + err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0); c->word_offset += num_words; c->bit_offset = c->word_offset * BITS_PER_LONG; c->packets[1]++; - c->bytes[1] += sizeof(struct p_header) + len; + c->bytes[1] += header_size + len; if (c->bit_offset > c->bm_bits) c->bit_offset = c->bm_bits; } - ok = ok ? ((len == 0) ? DONE : OK) : FAILED; - - if (ok == DONE) - INFO_bm_xfer_stats(mdev, "send", c); - return ok; + if (!err) { + if (len == 0) { + INFO_bm_xfer_stats(mdev, "send", c); + return 0; + } else + return 1; + } + return -EIO; } /* See the comment at receive_bitmap() */ -int _drbd_send_bitmap(struct drbd_conf *mdev) +static int _drbd_send_bitmap(struct drbd_conf *mdev) { struct bm_xfer_ctx c; - struct p_header *p; - int ret; + int err; - ERR_IF(!mdev->bitmap) return FALSE; - - /* maybe we should use some per thread scratch page, - * and allocate that during initial device creation? 
*/ - p = (struct p_header *) __get_free_page(GFP_NOIO); - if (!p) { - dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); - return FALSE; - } + if (!expect(mdev->bitmap)) + return false; if (get_ldev(mdev)) { if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { @@ -2166,37 +1484,39 @@ }; do { - ret = send_bitmap_rle_or_plain(mdev, p, &c); - } while (ret == OK); + err = send_bitmap_rle_or_plain(mdev, &c); + } while (err > 0); - free_page((unsigned long) p); - return (ret == DONE); + return err == 0; } int drbd_send_bitmap(struct drbd_conf *mdev) { - int err; + struct drbd_socket *sock = &mdev->tconn->data; + int err = -1; - if (!drbd_get_data_sock(mdev)) - return -1; - err = !_drbd_send_bitmap(mdev); - drbd_put_data_sock(mdev); + mutex_lock(&sock->mutex); + if (sock->socket) + err = !_drbd_send_bitmap(mdev); + mutex_unlock(&sock->mutex); return err; } -int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) +void drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) { - int ok; - struct p_barrier_ack p; - - p.barrier = barrier_nr; - p.set_size = cpu_to_be32(set_size); + struct drbd_socket *sock; + struct p_barrier_ack *p; if (mdev->state.conn < C_CONNECTED) - return FALSE; - ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, - (struct p_header *)&p, sizeof(p)); - return ok; + return; + + sock = &mdev->tconn->meta; + p = drbd_prepare_command(mdev, sock); + if (!p) + return; + p->barrier = barrier_nr; + p->set_size = cpu_to_be32(set_size); + drbd_send_command(mdev, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); } /** @@ -2207,61 +1527,62 @@ * @blksize: size in byte, needs to be in big endian byte order * @block_id: Id, big endian byte order */ -STATIC int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, - u64 sector, - u32 blksize, - u64 block_id) +STATIC int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, + u64 sector, u32 blksize, u64 block_id) { - int ok; - struct p_block_ack p; - - 
p.sector = sector; - p.block_id = block_id; - p.blksize = blksize; - p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); + struct drbd_socket *sock; + struct p_block_ack *p; - if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) - return FALSE; - ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, - (struct p_header *)&p, sizeof(p)); - return ok; -} - -int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_data *dp) -{ - const int header_size = sizeof(struct p_data) - - sizeof(struct p_header); - int data_size = ((struct p_header *)dp)->length - header_size; + if (mdev->state.conn < C_CONNECTED) + return -EIO; - return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), - dp->block_id); + sock = &mdev->tconn->meta; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->sector = sector; + p->block_id = block_id; + p->blksize = blksize; + p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq)); + return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); +} + +/* dp->sector and dp->block_id already/still in network byte order, + * data_size is payload size according to dp->head, + * and may need to be corrected for digest size. */ +void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd, + struct p_data *dp, int data_size) +{ + if (mdev->tconn->peer_integrity_tfm) + data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); + _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), + dp->block_id); } -int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, - struct p_block_req *rp) +void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd, + struct p_block_req *rp) { - return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); + _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); } /** * drbd_send_ack() - Sends an ack packet - * @mdev: DRBD device. - * @cmd: Packet command code. - * @e: Epoch entry. 
+ * @mdev: DRBD device + * @cmd: packet command code + * @peer_req: peer request */ -int drbd_send_ack(struct drbd_conf *mdev, - enum drbd_packets cmd, struct drbd_epoch_entry *e) +int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) { return _drbd_send_ack(mdev, cmd, - cpu_to_be64(e->sector), - cpu_to_be32(e->size), - e->block_id); + cpu_to_be64(peer_req->i.sector), + cpu_to_be32(peer_req->i.size), + peer_req->block_id); } /* This function misuses the block_id field to signal if the blocks * are is sync or not. */ -int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, +int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd, sector_t sector, int blksize, u64 block_id) { return _drbd_send_ack(mdev, cmd, @@ -2273,86 +1594,86 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd, sector_t sector, int size, u64 block_id) { - int ok; - struct p_block_req p; + struct drbd_socket *sock; + struct p_block_req *p; - p.sector = cpu_to_be64(sector); - p.block_id = block_id; - p.blksize = cpu_to_be32(size); - - ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, - (struct p_header *)&p, sizeof(p)); - return ok; -} - -int drbd_send_drequest_csum(struct drbd_conf *mdev, - sector_t sector, int size, - void *digest, int digest_size, - enum drbd_packets cmd) -{ - int ok; - struct p_block_req p; - - p.sector = cpu_to_be64(sector); - p.block_id = BE_DRBD_MAGIC + 0xbeef; - p.blksize = cpu_to_be32(size); - - p.head.magic = BE_DRBD_MAGIC; - p.head.command = cpu_to_be16(cmd); - p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); - - mutex_lock(&mdev->data.mutex); - - ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); - ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); - - mutex_unlock(&mdev->data.mutex); - - return ok; + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->sector = 
cpu_to_be64(sector); + p->block_id = block_id; + p->blksize = cpu_to_be32(size); + return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0); +} + +int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size, + void *digest, int digest_size, enum drbd_packet cmd) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + /* FIXME: Put the digest into the preallocated socket buffer. */ + + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(mdev, sock, cmd, sizeof(*p), + digest, digest_size); } int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) { - int ok; - struct p_block_req p; + struct drbd_socket *sock; + struct p_block_req *p; - p.sector = cpu_to_be64(sector); - p.block_id = BE_DRBD_MAGIC + 0xbabe; - p.blksize = cpu_to_be32(size); - - ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, - (struct p_header *)&p, sizeof(p)); - return ok; + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); } /* called on sndtimeo - * returns FALSE if we should retry, - * TRUE if we think connection is dead + * returns false if we should retry, + * true if we think connection is dead */ -STATIC int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) +STATIC int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock) { int drop_it; - /* long elapsed = (long)(jiffies - mdev->last_received); */ - /* DUMPLU(elapsed); // elapsed ignored for now. 
*/ - drop_it = mdev->meta.socket == sock - || !mdev->asender.task - || get_t_state(&mdev->asender) != Running - || mdev->state.conn < C_CONNECTED; + drop_it = tconn->meta.socket == sock + || !tconn->asender.task + || get_t_state(&tconn->asender) != RUNNING + || tconn->cstate < C_WF_REPORT_PARAMS; if (drop_it) - return TRUE; + return true; - drop_it = !--mdev->ko_count; + drop_it = !--tconn->ko_count; if (!drop_it) { - dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", - current->comm, current->pid, mdev->ko_count); - request_ping(mdev); + conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, tconn->ko_count); + request_ping(tconn); } return drop_it; /* && (mdev->state == R_PRIMARY) */; } +static void drbd_update_congested(struct drbd_tconn *tconn) +{ + struct sock *sk = tconn->data.socket->sk; + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) + set_bit(NET_CONGESTED, &tconn->flags); +} + /* The idea of sendpage seems to be to put some kind of reference * to the page into the skb, and to hand it over to the NIC. In * this process get_page() gets called. @@ -2375,21 +1696,28 @@ * with page_count == 0 or PageSlab. 
*/ STATIC int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, - int offset, size_t size) + int offset, size_t size, unsigned msg_flags) { - int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + struct socket *socket; + void *addr; + int err; + + socket = mdev->tconn->data.socket; + addr = kmap(page) + offset; + err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags); kunmap(page); - if (sent == size) - mdev->send_cnt += size>>9; - return sent == size; + if (!err) + mdev->send_cnt += size >> 9; + return err; } STATIC int _drbd_send_page(struct drbd_conf *mdev, struct page *page, - int offset, size_t size) + int offset, size_t size, unsigned msg_flags) { + struct socket *socket = mdev->tconn->data.socket; mm_segment_t oldfs = get_fs(); - int sent, ok; int len = size; + int err = -EIO; /* e.g. XFS meta- & log-data is in slab pages, which have a * page_count of 0 and/or have PageSlab() set. @@ -2398,61 +1726,107 @@ * __page_cache_release a page that would actually still be referenced * by someone, leading to some obscure delayed Oops somewhere else. 
*/ if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) - return _drbd_no_send_page(mdev, page, offset, size); + return _drbd_no_send_page(mdev, page, offset, size, msg_flags); - drbd_update_congested(mdev); + msg_flags |= MSG_NOSIGNAL; + drbd_update_congested(mdev->tconn); set_fs(KERNEL_DS); do { - sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, - offset, len, - MSG_NOSIGNAL); - if (sent == -EAGAIN) { - if (we_should_drop_the_connection(mdev, - mdev->data.socket)) - break; - else - continue; - } + int sent; + + sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); if (sent <= 0) { + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev->tconn, socket)) + break; + continue; + } dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", __func__, (int)size, len, sent); + if (sent < 0) + err = sent; break; } len -= sent; offset += sent; } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); set_fs(oldfs); - clear_bit(NET_CONGESTED, &mdev->flags); + clear_bit(NET_CONGESTED, &mdev->tconn->flags); - ok = (len == 0); - if (likely(ok)) - mdev->send_cnt += size>>9; - return ok; + if (len == 0) { + err = 0; + mdev->send_cnt += size >> 9; + } + return err; } static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) { struct bio_vec *bvec; int i; + /* hint all but last page with MSG_MORE */ __bio_for_each_segment(bvec, bio, i, 0) { - if (!_drbd_no_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len)) - return 0; + int err; + + err = _drbd_no_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len, + i == bio->bi_vcnt - 1 ? 0 : MSG_MORE); + if (err) + return err; } - return 1; + return 0; +} + +static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) +{ + struct bio_vec *bvec; + int i; + /* hint all but last page with MSG_MORE */ + __bio_for_each_segment(bvec, bio, i, 0) { + int err; + + err = _drbd_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len, + i == bio->bi_vcnt - 1 ? 
0 : MSG_MORE); + if (err) + return err; + } + return 0; } -static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) +static int _drbd_send_zc_ee(struct drbd_conf *mdev, + struct drbd_peer_request *peer_req) { - struct bio_vec *bvec; - int i; - __bio_for_each_segment(bvec, bio, i, 0) { - if (!_drbd_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len)) - return 0; + struct page *page = peer_req->pages; + unsigned len = peer_req->i.size; + int err; + + /* hint all but last page with MSG_MORE */ + page_chain_for_each(page) { + unsigned l = min_t(unsigned, len, PAGE_SIZE); + + err = _drbd_send_page(mdev, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + if (err) + return err; + len -= l; } + return 0; +} - return 1; +/* see also wire_flags_to_bio() + * DRBD_REQ_*, because we need to semantically map the flags to data packet + * flags and back. We may replicate to other kernel versions. */ +static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) +{ + if (mdev->tconn->agreed_pro_version >= 95) + return (bi_rw & DRBD_REQ_SYNC ? DP_RW_SYNC : 0) | + (bi_rw & DRBD_REQ_FUA ? DP_FUA : 0) | + (bi_rw & DRBD_REQ_FLUSH ? DP_FLUSH : 0) | + (bi_rw & DRBD_REQ_DISCARD ? DP_DISCARD : 0); + + /* else: we used to communicate one bit only in older DRBD */ + return bi_rw & DRBD_REQ_SYNC ? DP_RW_SYNC : 0; } /* Used to send write requests @@ -2460,115 +1834,116 @@ */ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) { - int ok = 1; - struct p_data p; + struct drbd_socket *sock; + struct p_data *p; unsigned int dp_flags = 0; - void *dgb; int dgs; + int err; - if (!drbd_get_data_sock(mdev)) - return 0; - - dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? 
- crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; - - p.head.magic = BE_DRBD_MAGIC; - p.head.command = cpu_to_be16(P_DATA); - p.head.length = - cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); - - p.sector = cpu_to_be64(req->sector); - p.block_id = (unsigned long)req; - p.seq_num = cpu_to_be32(req->seq_num = - atomic_add_return(1, &mdev->packet_seq)); - dp_flags = 0; - - /* NOTE: no need to check if barriers supported here as we would - * not pass the test in make_request_common in that case - */ - if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { - dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); - /* dp_flags |= DP_HARDBARRIER; */ - } -#ifdef BIO_RW_SYNC - if (bio_rw_flagged(req->master_bio, BIO_RW_SYNC)) - dp_flags |= DP_RW_SYNC; -#else - if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) - dp_flags |= DP_RW_SYNC; - /* for now handle SYNCIO and UNPLUG - * as if they still were one and the same flag */ - if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) - dp_flags |= DP_RW_SYNC; -#endif + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + dgs = mdev->tconn->integrity_tfm ? 
crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; + + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->block_id = (unsigned long)req; + p->seq_num = cpu_to_be32(req->seq_num = atomic_inc_return(&mdev->packet_seq)); + dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T) dp_flags |= DP_MAY_SET_IN_SYNC; - - p.dp_flags = cpu_to_be32(dp_flags); - trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); - ok = (sizeof(p) == - drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); - if (ok && dgs) { - dgb = mdev->int_dig_out; - drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); - ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); - } - if (ok) { - if (mdev->net_conf->wire_protocol == DRBD_PROT_A) - ok = _drbd_send_bio(mdev, req->master_bio); + if (mdev->tconn->agreed_pro_version >= 100) { + if (req->rq_state & RQ_EXP_RECEIVE_ACK) + dp_flags |= DP_SEND_RECEIVE_ACK; + if (req->rq_state & RQ_EXP_WRITE_ACK) + dp_flags |= DP_SEND_WRITE_ACK; + } + p->dp_flags = cpu_to_be32(dp_flags); + if (dgs) + drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1); + err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); + if (!err) { + /* For protocol A, we have to memcpy the payload into + * socket buffers, as we may complete right away + * as soon as we handed it over to tcp, at which point the data + * pages may become invalid. + * + * For data-integrity enabled, we copy it as well, so we can be + * sure that even if the bio pages may still be modified, it + * won't change the data on the wire, thus if the digest checks + * out ok after sending on this side, but does not fit on the + * receiving side, we sure have detected corruption elsewhere. 
+ */ + if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs) + err = _drbd_send_bio(mdev, req->master_bio); else - ok = _drbd_send_zc_bio(mdev, req->master_bio); + err = _drbd_send_zc_bio(mdev, req->master_bio); + + /* double check digest, sometimes buffers have been modified in flight. */ + if (dgs > 0 && dgs <= 64) { + /* 64 byte, 512 bit, is the largest digest size + * currently supported in kernel crypto. */ + unsigned char digest[64]; + drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest); + if (memcmp(p + 1, digest, dgs)) { + dev_warn(DEV, + "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", + (unsigned long long)req->i.sector, req->i.size); + } + } /* else if (dgs > 64) { + ... Be noisy about digest too large ... + } */ } + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ - drbd_put_data_sock(mdev); - return ok; + return err; } /* answer packet, used to send data back for read requests: * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) */ -int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, - struct drbd_epoch_entry *e) +int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) { - int ok; - struct p_data p; - void *dgb; + struct drbd_socket *sock; + struct p_data *p; + int err; int dgs; - dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? - crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); - p.head.magic = BE_DRBD_MAGIC; - p.head.command = cpu_to_be16(cmd); - p.head.length = - cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); - - p.sector = cpu_to_be64(e->sector); - p.block_id = e->block_id; - /* p.seq_num = 0; No sequence numbers here.. */ - - /* Only called by our kernel thread. 
- * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL - * in response to admin command or module unload. - */ - if (!drbd_get_data_sock(mdev)) - return 0; + dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0; - trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); - ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, - sizeof(p), MSG_MORE); - if (ok && dgs) { - dgb = mdev->int_dig_out; - drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); - ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); - } - if (ok) - ok = _drbd_send_zc_bio(mdev, e->private_bio); + if (!p) + return -EIO; + p->sector = cpu_to_be64(peer_req->i.sector); + p->block_id = peer_req->block_id; + p->seq_num = 0; /* unused */ + if (dgs) + drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1); + err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size); + if (!err) + err = _drbd_send_zc_ee(mdev, peer_req); + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + + return err; +} + +int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req) +{ + struct drbd_socket *sock; + struct p_block_desc *p; - drbd_put_data_sock(mdev); - return ok; + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->blksize = cpu_to_be32(req->i.size); + return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); } /* @@ -2587,20 +1962,15 @@ /* * you must have down()ed the appropriate [m]sock_mutex elsewhere! 
*/ -int drbd_send(struct drbd_conf *mdev, struct socket *sock, +int drbd_send(struct drbd_tconn *tconn, struct socket *sock, void *buf, size_t size, unsigned msg_flags) { -#if !HAVE_KERNEL_SENDMSG - mm_segment_t oldfs; - struct iovec iov; -#else struct kvec iov; -#endif struct msghdr msg; int rv, sent = 0; if (!sock) - return -1000; + return -EBADR; /* THINK if (signal_pending) return ... ? */ @@ -2609,22 +1979,15 @@ msg.msg_name = NULL; msg.msg_namelen = 0; -#if !HAVE_KERNEL_SENDMSG - msg.msg_iov = &iov; - msg.msg_iovlen = 1; -#endif msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; -#if !HAVE_KERNEL_SENDMSG - oldfs = get_fs(); - set_fs(KERNEL_DS); -#endif - - if (sock == mdev->data.socket) { - mdev->ko_count = mdev->net_conf->ko_count; - drbd_update_congested(mdev); + if (sock == tconn->data.socket) { + rcu_read_lock(); + tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count; + rcu_read_unlock(); + drbd_update_congested(tconn); } do { /* STRANGE @@ -2636,30 +1999,14 @@ * do we need to block DRBD_SIG if sock == &meta.socket ?? * otherwise wake_asender() might interrupt some send_*Ack ! */ -#if !HAVE_KERNEL_SENDMSG - rv = sock_sendmsg(sock, &msg, iov.iov_len); -#else rv = kernel_sendmsg(sock, &msg, &iov, 1, size); -#endif if (rv == -EAGAIN) { - if (we_should_drop_the_connection(mdev, sock)) + if (we_should_drop_the_connection(tconn, sock)) break; else continue; } - D_ASSERT(rv != 0); if (rv == -EINTR) { -#if 0 - /* FIXME this happens all the time. - * we don't care for now! - * eventually this should be sorted out be the proper - * use of the SIGNAL_ASENDER bit... */ - if (DRBD_ratelimit(5*HZ, 5)) { - dev_dbg(DEV, "Got a signal in drbd_send(,%c,)!\n", - sock == mdev->meta.socket ? 
'm' : 's'); - /* dump_stack(); */ - } -#endif flush_signals(current); rv = 0; } @@ -2670,27 +2017,40 @@ iov.iov_len -= rv; } while (sent < size); - if (sock == mdev->data.socket) - clear_bit(NET_CONGESTED, &mdev->flags); - -#if !HAVE_KERNEL_SENDMSG - set_fs(oldfs); -#endif - + if (sock == tconn->data.socket) + clear_bit(NET_CONGESTED, &tconn->flags); if (rv <= 0) { if (rv != -EAGAIN) { - dev_err(DEV, "%s_sendmsg returned %d\n", - sock == mdev->meta.socket ? "msock" : "sock", - rv); - drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); + conn_err(tconn, "%s_sendmsg returned %d\n", + sock == tconn->meta.socket ? "msock" : "sock", + rv); + conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); } else - drbd_force_state(mdev, NS(conn, C_TIMEOUT)); + conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD); } return sent; } +/** + * drbd_send_all - Send an entire buffer + * + * Returns 0 upon success and a negative error value otherwise. + */ +int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer, + size_t size, unsigned msg_flags) +{ + int err; + + err = drbd_send(tconn, sock, buffer, size, msg_flags); + if (err < 0) + return err; + if (err != size) + return -EIO; + return 0; +} + #ifdef BD_OPS_USE_FMODE static int drbd_open(struct block_device *bdev, fmode_t mode) #else @@ -2706,7 +2066,7 @@ unsigned long flags; int rv = 0; - spin_lock_irqsave(&mdev->req_lock, flags); + spin_lock_irqsave(&mdev->tconn->req_lock, flags); /* to have a stable mdev->state.role * and no race with updating open_cnt */ @@ -2719,7 +2079,7 @@ if (!rv) mdev->open_cnt++; - spin_unlock_irqrestore(&mdev->req_lock, flags); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); return rv; } @@ -2740,49 +2100,16 @@ } #endif -STATIC void drbd_unplug_fn(struct request_queue *q) -{ - struct drbd_conf *mdev = q->queuedata; - - trace_drbd_unplug(mdev, "got unplugged"); - - /* unplug FIRST */ - spin_lock_irq(q->queue_lock); - blk_remove_plug(q); - spin_unlock_irq(q->queue_lock); 
- - /* only if connected */ - spin_lock_irq(&mdev->req_lock); - if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) { - D_ASSERT(mdev->state.role == R_PRIMARY); - if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) { - /* add to the data.work queue, - * unless already queued. - * XXX this might be a good addition to drbd_queue_work - * anyways, to detect "double queuing" ... */ - if (list_empty(&mdev->unplug_work.list)) - drbd_queue_work(&mdev->data.work, - &mdev->unplug_work); - } - } - spin_unlock_irq(&mdev->req_lock); - - if (mdev->state.disk >= D_INCONSISTENT) - drbd_kick_lo(mdev); -} - STATIC void drbd_set_defaults(struct drbd_conf *mdev) { - mdev->sync_conf.after = DRBD_AFTER_DEF; - mdev->sync_conf.rate = DRBD_RATE_DEF; - mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; - mdev->state = (union drbd_state) { + /* Beware! The actual layout differs + * between big endian and little endian */ + mdev->state = (union drbd_dev_state) { { .role = R_SECONDARY, .peer = R_UNKNOWN, .conn = C_STANDALONE, .disk = D_DISKLESS, .pdsk = D_UNKNOWN, - .susp = 0 } }; } @@ -2807,24 +2134,17 @@ atomic_set(&mdev->rs_pending_cnt, 0); atomic_set(&mdev->unacked_cnt, 0); atomic_set(&mdev->local_cnt, 0); - atomic_set(&mdev->net_cnt, 0); - atomic_set(&mdev->packet_seq, 0); - atomic_set(&mdev->pp_in_use, 0); - - mutex_init(&mdev->md_io_mutex); - mutex_init(&mdev->data.mutex); - mutex_init(&mdev->meta.mutex); - sema_init(&mdev->data.work.s, 0); - sema_init(&mdev->meta.work.s, 0); - mutex_init(&mdev->state_mutex); + atomic_set(&mdev->pp_in_use_by_net, 0); + atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); + atomic_set(&mdev->ap_in_flight, 0); + atomic_set(&mdev->md_io_in_use, 0); - spin_lock_init(&mdev->data.work.q_lock); - spin_lock_init(&mdev->meta.work.q_lock); + mutex_init(&mdev->own_state_mutex); + mdev->state_mutex = &mdev->own_state_mutex; spin_lock_init(&mdev->al_lock); - spin_lock_init(&mdev->req_lock); 
spin_lock_init(&mdev->peer_seq_lock); - spin_lock_init(&mdev->epoch_lock); INIT_LIST_HEAD(&mdev->active_ee); INIT_LIST_HEAD(&mdev->sync_ee); @@ -2832,22 +2152,39 @@ INIT_LIST_HEAD(&mdev->read_ee); INIT_LIST_HEAD(&mdev->net_ee); INIT_LIST_HEAD(&mdev->resync_reads); - INIT_LIST_HEAD(&mdev->data.work.q); - INIT_LIST_HEAD(&mdev->meta.work.q); INIT_LIST_HEAD(&mdev->resync_work.list); INIT_LIST_HEAD(&mdev->unplug_work.list); + INIT_LIST_HEAD(&mdev->go_diskless.list); INIT_LIST_HEAD(&mdev->md_sync_work.list); + INIT_LIST_HEAD(&mdev->start_resync_work.list); INIT_LIST_HEAD(&mdev->bm_io_work.w.list); - mdev->resync_work.cb = w_resync_inactive; + + mdev->resync_work.cb = w_resync_timer; mdev->unplug_work.cb = w_send_write_hint; + mdev->go_diskless.cb = w_go_diskless; mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; + mdev->start_resync_work.cb = w_start_resync; + + mdev->resync_work.mdev = mdev; + mdev->unplug_work.mdev = mdev; + mdev->go_diskless.mdev = mdev; + mdev->md_sync_work.mdev = mdev; + mdev->bm_io_work.w.mdev = mdev; + mdev->start_resync_work.mdev = mdev; + init_timer(&mdev->resync_timer); init_timer(&mdev->md_sync_timer); + init_timer(&mdev->start_resync_timer); + init_timer(&mdev->request_timer); mdev->resync_timer.function = resync_timer_fn; mdev->resync_timer.data = (unsigned long) mdev; mdev->md_sync_timer.function = md_sync_timer_fn; mdev->md_sync_timer.data = (unsigned long) mdev; + mdev->start_resync_timer.function = start_resync_timer_fn; + mdev->start_resync_timer.data = (unsigned long) mdev; + mdev->request_timer.function = request_timer_fn; + mdev->request_timer.data = (unsigned long) mdev; init_waitqueue_head(&mdev->misc_wait); init_waitqueue_head(&mdev->state_wait); @@ -2855,24 +2192,18 @@ init_waitqueue_head(&mdev->al_wait); init_waitqueue_head(&mdev->seq_wait); - drbd_thread_init(mdev, &mdev->receiver, drbdd_init); - drbd_thread_init(mdev, &mdev->worker, drbd_worker); - drbd_thread_init(mdev, &mdev->asender, drbd_asender); - - 
mdev->agreed_pro_version = PRO_VERSION_MAX; - mdev->write_ordering = WO_bio_barrier; mdev->resync_wenr = LC_FREE; + mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; + mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; } void drbd_mdev_cleanup(struct drbd_conf *mdev) { - if (mdev->receiver.t_state != None) + int i; + if (mdev->tconn->receiver.t_state != NONE) dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", - mdev->receiver.t_state); + mdev->tconn->receiver.t_state); - /* no need to lock it, I'm the only thread alive */ - if (atomic_read(&mdev->current_epoch->epoch_size) != 0) - dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); mdev->al_writ_cnt = mdev->bm_writ_cnt = mdev->read_cnt = @@ -2882,35 +2213,40 @@ mdev->p_size = mdev->rs_start = mdev->rs_total = - mdev->rs_failed = - mdev->rs_mark_left = - mdev->rs_mark_time = 0; - D_ASSERT(mdev->net_conf == NULL); + mdev->rs_failed = 0; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = 0; + mdev->rs_mark_time[i] = 0; + } + D_ASSERT(mdev->tconn->net_conf == NULL); drbd_set_my_capacity(mdev, 0); if (mdev->bitmap) { /* maybe never allocated. */ - drbd_bm_resize(mdev, 0); + drbd_bm_resize(mdev, 0, 1); drbd_bm_cleanup(mdev); } - drbd_free_resources(mdev); + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL; + + clear_bit(AL_SUSPENDED, &mdev->flags); - /* - * currently we drbd_init_ee only on module load, so - * we may do drbd_release_ee only on module unload! 
- */ D_ASSERT(list_empty(&mdev->active_ee)); D_ASSERT(list_empty(&mdev->sync_ee)); D_ASSERT(list_empty(&mdev->done_ee)); D_ASSERT(list_empty(&mdev->read_ee)); D_ASSERT(list_empty(&mdev->net_ee)); D_ASSERT(list_empty(&mdev->resync_reads)); - D_ASSERT(list_empty(&mdev->data.work.q)); - D_ASSERT(list_empty(&mdev->meta.work.q)); + D_ASSERT(list_empty(&mdev->tconn->data.work.q)); + D_ASSERT(list_empty(&mdev->tconn->meta.work.q)); D_ASSERT(list_empty(&mdev->resync_work.list)); D_ASSERT(list_empty(&mdev->unplug_work.list)); + D_ASSERT(list_empty(&mdev->go_diskless.list)); + drbd_set_defaults(mdev); } @@ -2927,6 +2263,10 @@ /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); if (drbd_request_mempool) @@ -2940,6 +2280,8 @@ if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; drbd_ee_cache = NULL; @@ -2953,7 +2295,7 @@ STATIC int drbd_create_mempools(void) { struct page *page; - const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; + const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count; int i; /* prepare our caches and mempools */ @@ -2963,6 +2305,8 @@ drbd_bm_ext_cache = NULL; drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -2971,7 +2315,7 @@ goto Enomem; drbd_ee_cache = kmem_cache_create( - "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); + "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); if (drbd_ee_cache == NULL) goto Enomem; @@ -2986,6 +2330,14 @@ goto Enomem; /* mempools */ + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; + + drbd_md_io_page_pool = 
mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + drbd_request_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_request_cache); if (drbd_request_mempool == NULL) @@ -2993,7 +2345,7 @@ drbd_ee_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); - if (drbd_request_mempool == NULL) + if (drbd_ee_mempool == NULL) goto Enomem; /* drbd's page pool */ @@ -3029,71 +2381,53 @@ .notifier_call = drbd_notify_sys, }; -static void drbd_release_ee_lists(struct drbd_conf *mdev) +static void drbd_release_all_peer_reqs(struct drbd_conf *mdev) { int rr; - rr = drbd_release_ee(mdev, &mdev->active_ee); + rr = drbd_free_peer_reqs(mdev, &mdev->active_ee); if (rr) dev_err(DEV, "%d EEs in active list found!\n", rr); - rr = drbd_release_ee(mdev, &mdev->sync_ee); + rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee); if (rr) dev_err(DEV, "%d EEs in sync list found!\n", rr); - rr = drbd_release_ee(mdev, &mdev->read_ee); + rr = drbd_free_peer_reqs(mdev, &mdev->read_ee); if (rr) dev_err(DEV, "%d EEs in read list found!\n", rr); - rr = drbd_release_ee(mdev, &mdev->done_ee); + rr = drbd_free_peer_reqs(mdev, &mdev->done_ee); if (rr) dev_err(DEV, "%d EEs in done list found!\n", rr); - rr = drbd_release_ee(mdev, &mdev->net_ee); + rr = drbd_free_peer_reqs(mdev, &mdev->net_ee); if (rr) dev_err(DEV, "%d EEs in net list found!\n", rr); } -/* caution. no locking. - * currently only used from module cleanup code. */ -static void drbd_delete_device(unsigned int minor) +/* caution. no locking. 
*/ +void drbd_minor_destroy(struct kref *kref) { - struct drbd_conf *mdev = minor_to_mdev(minor); + struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref); + struct drbd_tconn *tconn = mdev->tconn; - if (!mdev) - return; + del_timer_sync(&mdev->request_timer); /* paranoia asserts */ - if (mdev->open_cnt != 0) - dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, - __FILE__ , __LINE__); - - ERR_IF (!list_empty(&mdev->data.work.q)) { - struct list_head *lp; - list_for_each(lp, &mdev->data.work.q) { - DUMPP(lp); - } - }; + D_ASSERT(mdev->open_cnt == 0); /* end paranoia asserts */ - del_gendisk(mdev->vdisk); - /* cleanup stuff that may have been allocated during * device (re-)configuration or state changes */ if (mdev->this_bdev) bdput(mdev->this_bdev); - drbd_free_resources(mdev); - - drbd_release_ee_lists(mdev); + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL; - /* should be free'd on disconnect? */ - kfree(mdev->ee_hash); - /* - mdev->ee_hash_s = 0; - mdev->ee_hash = NULL; - */ + drbd_release_all_peer_reqs(mdev); lc_destroy(mdev->act_log); lc_destroy(mdev->resync); @@ -3101,37 +2435,58 @@ kfree(mdev->p_uuid); /* mdev->p_uuid = NULL; */ - kfree(mdev->int_dig_out); - kfree(mdev->int_dig_in); - kfree(mdev->int_dig_vv); - - /* cleanup the rest that has been - * allocated from drbd_new_device - * and actually free the mdev itself */ - drbd_free_mdev(mdev); + if (mdev->bitmap) /* should no longer be there. */ + drbd_bm_cleanup(mdev); + __free_page(mdev->md_io_page); + put_disk(mdev->vdisk); + blk_cleanup_queue(mdev->rq_queue); + kfree(mdev->rs_plan_s); + kfree(mdev); + + kref_put(&tconn->kref, &conn_destroy); } STATIC void drbd_cleanup(void) { unsigned int i; + struct drbd_conf *mdev; + struct drbd_tconn *tconn, *tmp; unregister_reboot_notifier(&drbd_notifier); - drbd_nl_cleanup(); + /* first remove proc, + * drbdsetup uses it's presence to detect + * whether DRBD is loaded. 
+ * If we would get stuck in proc removal, + * but have netlink already deregistered, + * some drbdsetup commands may wait forever + * for an answer. + */ + if (drbd_proc) + remove_proc_entry("drbd", NULL); + + drbd_genl_unregister(); - if (minor_table) { - if (drbd_proc) - remove_proc_entry("drbd", NULL); - i = minor_count; - while (i--) - drbd_delete_device(i); - drbd_destroy_mempools(); + idr_for_each_entry(&minors, mdev, i) { + idr_remove(&minors, mdev_to_minor(mdev)); + idr_remove(&mdev->tconn->volumes, mdev->vnr); + del_gendisk(mdev->vdisk); + /* synchronize_rcu(); No other threads running at this point */ + kref_put(&mdev->kref, &drbd_minor_destroy); } - kfree(minor_table); + /* not _rcu since, no other updater anymore. Genl already unregistered */ + list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { + list_del(&tconn->all_tconn); /* not _rcu no proc, not other threads */ + /* synchronize_rcu(); */ + kref_put(&tconn->kref, &conn_destroy); + } + drbd_destroy_mempools(); drbd_unregister_blkdev(DRBD_MAJOR, "drbd"); + idr_destroy(&minors); + printk(KERN_INFO "drbd: module cleanup done.\n"); } @@ -3149,7 +2504,7 @@ char reason = '-'; int r = 0; - if (!__inc_ap_bio_cond(mdev)) { + if (!may_inc_ap_bio(mdev)) { /* DRBD has frozen IO */ r = bdi_bits; reason = 'd'; @@ -3164,7 +2519,7 @@ reason = 'b'; } - if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { + if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) { r |= (1 << BDI_async_congested); reason = reason == 'b' ? 
'a' : 'n'; } @@ -3174,20 +2529,242 @@ return r; } -struct drbd_conf *drbd_new_device(unsigned int minor) +static void drbd_init_workqueue(struct drbd_work_queue* wq) +{ + sema_init(&wq->s, 0); + spin_lock_init(&wq->q_lock); + INIT_LIST_HEAD(&wq->q); +} + +struct drbd_tconn *conn_get_by_name(const char *name) +{ + struct drbd_tconn *tconn; + + if (!name || !name[0]) + return NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { + if (!strcmp(tconn->name, name)) { + kref_get(&tconn->kref); + goto found; + } + } + tconn = NULL; +found: + rcu_read_unlock(); + return tconn; +} + +struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len) +{ + struct drbd_tconn *tconn; + + rcu_read_lock(); + list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) { + if (tconn->my_addr_len == my_addr_len && + tconn->peer_addr_len == peer_addr_len && + !memcmp(&tconn->my_addr, my_addr, my_addr_len) && + !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) { + kref_get(&tconn->kref); + goto found; + } + } + tconn = NULL; +found: + rcu_read_unlock(); + return tconn; +} + +static int drbd_alloc_socket(struct drbd_socket *socket) +{ + socket->rbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->rbuf) + return -ENOMEM; + socket->sbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->sbuf) + return -ENOMEM; + return 0; +} + +static void drbd_free_socket(struct drbd_socket *socket) +{ + free_page((unsigned long) socket->sbuf); + free_page((unsigned long) socket->rbuf); +} + +void conn_free_crypto(struct drbd_tconn *tconn) +{ + drbd_free_sock(tconn); + + crypto_free_hash(tconn->csums_tfm); + crypto_free_hash(tconn->verify_tfm); + crypto_free_hash(tconn->cram_hmac_tfm); + crypto_free_hash(tconn->integrity_tfm); + crypto_free_hash(tconn->peer_integrity_tfm); + kfree(tconn->int_dig_in); + kfree(tconn->int_dig_vv); + + tconn->csums_tfm = NULL; + tconn->verify_tfm = NULL; + tconn->cram_hmac_tfm = NULL; + 
tconn->integrity_tfm = NULL; + tconn->peer_integrity_tfm = NULL; + tconn->int_dig_in = NULL; + tconn->int_dig_vv = NULL; +} + +int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts) +{ + cpumask_var_t new_cpu_mask; + int err; + + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + /* + retcode = ERR_NOMEM; + drbd_msg_put_info("unable to allocate cpumask"); + */ + + /* silently ignore cpu mask on UP kernel */ + if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { + /* FIXME: Get rid of constant 32 here */ + err = __bitmap_parse(res_opts->cpu_mask, 32, 0, + cpumask_bits(new_cpu_mask), nr_cpu_ids); + if (err) { + conn_warn(tconn, "__bitmap_parse() failed with %d\n", err); + /* retcode = ERR_CPU_MASK_PARSE; */ + goto fail; + } + } + tconn->res_opts = *res_opts; + if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) { + cpumask_copy(tconn->cpu_mask, new_cpu_mask); + drbd_calc_cpu_mask(tconn); + tconn->receiver.reset_cpu_mask = 1; + tconn->asender.reset_cpu_mask = 1; + tconn->worker.reset_cpu_mask = 1; + } + err = 0; + +fail: + free_cpumask_var(new_cpu_mask); + return err; + +} + +/* caller must be under genl_lock() */ +struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts) +{ + struct drbd_tconn *tconn; + + tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL); + if (!tconn) + return NULL; + + tconn->name = kstrdup(name, GFP_KERNEL); + if (!tconn->name) + goto fail; + + if (drbd_alloc_socket(&tconn->data)) + goto fail; + if (drbd_alloc_socket(&tconn->meta)) + goto fail; + + if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL)) + goto fail; + + if (set_resource_options(tconn, res_opts)) + goto fail; + + if (!tl_init(tconn)) + goto fail; + + tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); + if (!tconn->current_epoch) + goto fail; + INIT_LIST_HEAD(&tconn->current_epoch->list); + tconn->epochs = 1; + spin_lock_init(&tconn->epoch_lock); + tconn->write_ordering = WO_bio_barrier; + + 
tconn->cstate = C_STANDALONE; + mutex_init(&tconn->cstate_mutex); + spin_lock_init(&tconn->req_lock); + mutex_init(&tconn->conf_update); + init_waitqueue_head(&tconn->ping_wait); + idr_init(&tconn->volumes); + + drbd_init_workqueue(&tconn->data.work); + mutex_init(&tconn->data.mutex); + + drbd_init_workqueue(&tconn->meta.work); + mutex_init(&tconn->meta.mutex); + + drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver"); + drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker"); + drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender"); + + kref_init(&tconn->kref); + list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns); + + return tconn; + +fail: + kfree(tconn->current_epoch); + tl_cleanup(tconn); + free_cpumask_var(tconn->cpu_mask); + drbd_free_socket(&tconn->meta); + drbd_free_socket(&tconn->data); + kfree(tconn->name); + kfree(tconn); + + return NULL; +} + +void conn_destroy(struct kref *kref) +{ + struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref); + + if (atomic_read(&tconn->current_epoch->epoch_size) != 0) + conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size)); + kfree(tconn->current_epoch); + + idr_destroy(&tconn->volumes); + + free_cpumask_var(tconn->cpu_mask); + drbd_free_socket(&tconn->meta); + drbd_free_socket(&tconn->data); + kfree(tconn->name); + kfree(tconn->int_dig_in); + kfree(tconn->int_dig_vv); + kfree(tconn); +} + +enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) { struct drbd_conf *mdev; struct gendisk *disk; struct request_queue *q; + int vnr_got = vnr; + int minor_got = minor; + enum drbd_ret_code err = ERR_NOMEM; + + mdev = minor_to_mdev(minor); + if (mdev) + return ERR_MINOR_EXISTS; /* GFP_KERNEL, we are outside of all write-out paths */ mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); if (!mdev) - return NULL; - if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) - goto out_no_cpumask; + return ERR_NOMEM; + + 
kref_get(&tconn->kref); + mdev->tconn = tconn; mdev->minor = minor; + mdev->vnr = vnr; drbd_init_set_defaults(mdev); @@ -3196,14 +2773,13 @@ goto out_no_q; mdev->rq_queue = q; q->queuedata = mdev; - blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); disk = alloc_disk(1); if (!disk) goto out_no_disk; mdev->vdisk = disk; - set_disk_ro(disk, TRUE); + set_disk_ro(disk, true); disk->queue = q; disk->major = DRBD_MAJOR; @@ -3219,12 +2795,13 @@ q->backing_dev_info.congested_fn = drbd_congested; q->backing_dev_info.congested_data = mdev; - blk_queue_make_request(q, drbd_make_request_26); + blk_queue_make_request(q, drbd_make_request); + /* Setting the max_hw_sectors to an odd value of 8kibyte here + This triggers a max_bio_size message upon first attach or connect */ + blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_merge_bvec(q, drbd_merge_bvec); - q->queue_lock = &mdev->req_lock; /* needed since we use */ - /* plugging on a queue, that actually has no requests! */ - q->unplug_fn = drbd_unplug_fn; + q->queue_lock = &mdev->tconn->req_lock; mdev->md_io_page = alloc_page(GFP_KERNEL); if (!mdev->md_io_page) @@ -3232,30 +2809,44 @@ if (drbd_bm_init(mdev)) goto out_no_bitmap; - /* no need to lock access, we are still initializing this minor device. 
*/ - if (!tl_init(mdev)) - goto out_no_tl; - - mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); - if (!mdev->app_reads_hash) - goto out_no_app_reads; - - mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); - if (!mdev->current_epoch) - goto out_no_epoch; - - INIT_LIST_HEAD(&mdev->current_epoch->list); - mdev->epochs = 1; - - return mdev; - -/* out_whatever_else: - kfree(mdev->current_epoch); */ -out_no_epoch: - kfree(mdev->app_reads_hash); -out_no_app_reads: - tl_cleanup(mdev); -out_no_tl: + mdev->read_requests = RB_ROOT; + mdev->write_requests = RB_ROOT; + + if (!idr_pre_get(&minors, GFP_KERNEL)) + goto out_no_minor_idr; + if (idr_get_new_above(&minors, mdev, minor, &minor_got)) + goto out_no_minor_idr; + if (minor_got != minor) { + err = ERR_MINOR_EXISTS; + drbd_msg_put_info("requested minor exists already"); + goto out_idr_remove_minor; + } + + if (!idr_pre_get(&tconn->volumes, GFP_KERNEL)) + goto out_idr_remove_minor; + if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got)) + goto out_idr_remove_minor; + if (vnr_got != vnr) { + err = ERR_INVALID_REQUEST; + drbd_msg_put_info("requested volume exists already"); + goto out_idr_remove_vol; + } + add_disk(disk); + kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ + + /* inherit the connection state */ + mdev->state.conn = tconn->cstate; + if (mdev->state.conn == C_WF_REPORT_PARAMS) + drbd_connected(mdev); + + return NO_ERROR; + +out_idr_remove_vol: + idr_remove(&tconn->volumes, vnr_got); +out_idr_remove_minor: + idr_remove(&minors, minor_got); + synchronize_rcu(); +out_no_minor_idr: drbd_bm_cleanup(mdev); out_no_bitmap: __free_page(mdev->md_io_page); @@ -3264,54 +2855,25 @@ out_no_disk: blk_cleanup_queue(q); out_no_q: - free_cpumask_var(mdev->cpu_mask); -out_no_cpumask: - kfree(mdev); - return NULL; -} - -/* counterpart of drbd_new_device. - * last part of drbd_delete_device. 
*/ -void drbd_free_mdev(struct drbd_conf *mdev) -{ - kfree(mdev->current_epoch); - kfree(mdev->app_reads_hash); - tl_cleanup(mdev); - if (mdev->bitmap) /* should no longer be there. */ - drbd_bm_cleanup(mdev); - __free_page(mdev->md_io_page); - put_disk(mdev->vdisk); - blk_cleanup_queue(mdev->rq_queue); - free_cpumask_var(mdev->cpu_mask); kfree(mdev); + kref_put(&tconn->kref, &conn_destroy); + return err; } - int __init drbd_init(void) { int err; - if (sizeof(struct p_handshake) != 80) { - printk(KERN_ERR - "drbd: never change the size or layout " - "of the HandShake packet.\n"); - return -EINVAL; - } - - if (1 > minor_count || minor_count > 255) { + if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { printk(KERN_ERR - "drbd: invalid minor_count (%d)\n", minor_count); + "drbd: invalid minor_count (%d)\n", minor_count); #ifdef MODULE return -EINVAL; #else - minor_count = 8; + minor_count = DRBD_MINOR_COUNT_DEF; #endif } - err = drbd_nl_init(); - if (err) - return err; - err = register_blkdev(DRBD_MAJOR, "drbd"); if (err) { printk(KERN_ERR @@ -3320,6 +2882,13 @@ return err; } + err = drbd_genl_register(); + if (err) { + printk(KERN_ERR "drbd: unable to register generic netlink family\n"); + goto fail; + } + + register_reboot_notifier(&drbd_notifier); /* @@ -3330,22 +2899,20 @@ init_waitqueue_head(&drbd_pp_wait); drbd_proc = NULL; /* play safe for drbd_cleanup */ - minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, - GFP_KERNEL); - if (!minor_table) - goto Enomem; + idr_init(&minors); err = drbd_create_mempools(); if (err) - goto Enomem; + goto fail; - drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); + drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); if (!drbd_proc) { printk(KERN_ERR "drbd: unable to register proc file\n"); - goto Enomem; + goto fail; } rwlock_init(&global_state_lock); + INIT_LIST_HEAD(&drbd_tconns); printk(KERN_INFO "drbd: initialized. 
" "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", @@ -3353,11 +2920,10 @@ printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); printk(KERN_INFO "drbd: registered as block device major %d\n", DRBD_MAJOR); - printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); return 0; /* Success! */ -Enomem: +fail: drbd_cleanup(); if (err == -ENOMEM) /* currently always the case */ @@ -3372,48 +2938,29 @@ if (ldev == NULL) return; - bd_release(ldev->backing_bdev); - bd_release(ldev->md_bdev); - - fput(ldev->lo_file); - fput(ldev->md_file); + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(ldev); } -void drbd_free_sock(struct drbd_conf *mdev) -{ - if (mdev->data.socket) { - kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); - sock_release(mdev->data.socket); - mdev->data.socket = NULL; - } - if (mdev->meta.socket) { - kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); - sock_release(mdev->meta.socket); - mdev->meta.socket = NULL; - } -} - -void drbd_free_resources(struct drbd_conf *mdev) +void drbd_free_sock(struct drbd_tconn *tconn) { - crypto_free_hash(mdev->csums_tfm); - mdev->csums_tfm = NULL; - crypto_free_hash(mdev->verify_tfm); - mdev->verify_tfm = NULL; - crypto_free_hash(mdev->cram_hmac_tfm); - mdev->cram_hmac_tfm = NULL; - crypto_free_hash(mdev->integrity_w_tfm); - mdev->integrity_w_tfm = NULL; - crypto_free_hash(mdev->integrity_r_tfm); - mdev->integrity_r_tfm = NULL; - - drbd_free_sock(mdev); - - __no_warn(local, - drbd_free_bc(mdev->ldev); - mdev->ldev = NULL;); + if (tconn->data.socket) { + mutex_lock(&tconn->data.mutex); + kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR); + sock_release(tconn->data.socket); + tconn->data.socket = NULL; + mutex_unlock(&tconn->data.mutex); + } + if (tconn->meta.socket) { + mutex_lock(&tconn->meta.mutex); + kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR); + sock_release(tconn->meta.socket); + tconn->meta.socket = NULL; + 
mutex_unlock(&tconn->meta.mutex); + } } /* meta data management */ @@ -3428,10 +2975,11 @@ u32 md_size_sect; u32 al_offset; /* offset to this block */ u32 al_nr_extents; /* important for restoring the AL */ - /* `-- act_log->nr_elements <-- sync_conf.al_extents */ + /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ - u32 reserved_u32[4]; + u32 la_peer_max_bio_size; /* last peer max_bio_size */ + u32 reserved_u32[3]; } __packed; @@ -3445,26 +2993,27 @@ sector_t sector; int i; + del_timer(&mdev->md_sync_timer); + /* timer may be rearmed by drbd_md_mark_dirty() now. */ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) return; - del_timer(&mdev->md_sync_timer); /* We use here D_FAILED and not D_ATTACHING because we try to write * metadata even if we detach due to a disk failure! */ if (!get_ldev_if_state(mdev, D_FAILED)) return; - trace_drbd_md_io(mdev, WRITE, mdev->ldev); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); memset(buffer, 0, 512); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); for (i = UI_CURRENT; i < UI_SIZE; i++) buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); buffer->flags = cpu_to_be32(mdev->ldev->md.flags); - buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); @@ -3473,24 +3022,23 @@ buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); + buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { - 
clear_bit(MD_DIRTY, &mdev->flags); - } else { /* this was a try anyways ... */ dev_err(DEV, "meta data update failed!\n"); - - drbd_chk_io_error(mdev, 1, TRUE); + drbd_chk_io_error(mdev, 1, true); } /* Update mdev->ldev->md.la_size_sect, * since we updated it on metadata. */ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); +out: put_ldev(mdev); } @@ -3499,32 +3047,44 @@ * @mdev: DRBD device. * @bdev: Device from which the meta data should be read in. * - * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case - * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. + * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case + * something goes wrong. */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { struct meta_data_on_disk *buffer; + u32 magic, flags; int i, rv = NO_ERROR; if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - trace_drbd_md_io(mdev, READ, bdev); - - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; - if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { - /* NOTE: cant do normal error processing here as this is + if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { + /* NOTE: can't do normal error processing here as this is called BEFORE disk is attached */ dev_err(DEV, "Error while reading metadata.\n"); rv = ERR_IO_MD_DISK; goto err; } - if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { - dev_err(DEV, "Error while reading metadata, magic not found.\n"); + magic = be32_to_cpu(buffer->magic); + flags = be32_to_cpu(buffer->flags); + if (magic == DRBD_MD_MAGIC_84_UNCLEAN || + (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { + /* btw: that's Activity Log clean, not "all" clean. */ + dev_err(DEV, "Found unclean meta data. 
Did you \"drbdadm apply-al\"?\n"); + rv = ERR_MD_UNCLEAN; + goto err; + } + if (magic != DRBD_MD_MAGIC_08) { + if (magic == DRBD_MD_MAGIC_07) + dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); + else + dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); rv = ERR_MD_INVALID; goto err; } @@ -3558,14 +3118,20 @@ for (i = UI_CURRENT; i < UI_SIZE; i++) bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); bdev->md.flags = be32_to_cpu(buffer->flags); - mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); - if (mdev->sync_conf.al_extents < 7) - mdev->sync_conf.al_extents = 127; + spin_lock_irq(&mdev->tconn->req_lock); + if (mdev->state.conn < C_CONNECTED) { + int peer; + peer = be32_to_cpu(buffer->la_peer_max_bio_size); + peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE); + mdev->peer_max_bio_size = peer; + } + spin_unlock_irq(&mdev->tconn->req_lock); err: - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); + out: put_ldev(mdev); return rv; @@ -3579,22 +3145,29 @@ * the meta-data super block. This function sets MD_DIRTY, and starts a * timer that ensures that within five seconds you have to call drbd_md_sync(). 
*/ +#ifdef DRBD_DEBUG_MD_SYNC +void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func) +{ + if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) { + mod_timer(&mdev->md_sync_timer, jiffies + HZ); + mdev->last_md_mark_dirty.line = line; + mdev->last_md_mark_dirty.func = func; + } +} +#else void drbd_md_mark_dirty(struct drbd_conf *mdev) { - set_bit(MD_DIRTY, &mdev->flags); - mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); + if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) + mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); } +#endif - -STATIC void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) +static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) { int i; - for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) { + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; - - trace_drbd_uuid(mdev, i+1); - } } void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) @@ -3609,7 +3182,6 @@ } mdev->ldev->md.uuid[idx] = val; - trace_drbd_uuid(mdev, idx); drbd_md_mark_dirty(mdev); } @@ -3619,7 +3191,6 @@ if (mdev->ldev->md.uuid[idx]) { drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; - trace_drbd_uuid(mdev, UI_HISTORY_START); } _drbd_uuid_set(mdev, idx, val); } @@ -3634,14 +3205,18 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) { u64 val; + unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; + + if (bm_uuid) + dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid); - dev_info(DEV, "Creating new current UUID\n"); - D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; - trace_drbd_uuid(mdev, UI_BITMAP); get_random_bytes(&val, sizeof(u64)); _drbd_uuid_set(mdev, UI_CURRENT, val); + drbd_print_uuids(mdev, "new current UUID"); + /* get it to stable storage _now_ */ + drbd_md_sync(mdev); } void 
drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) @@ -3653,16 +3228,12 @@ drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; mdev->ldev->md.uuid[UI_BITMAP] = 0; - trace_drbd_uuid(mdev, UI_HISTORY_START); - trace_drbd_uuid(mdev, UI_BITMAP); } else { - if (mdev->ldev->md.uuid[UI_BITMAP]) - dev_warn(DEV, "bm UUID already set"); + unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP]; + if (bm_uuid) + dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid); - mdev->ldev->md.uuid[UI_BITMAP] = val; - mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); - - trace_drbd_uuid(mdev, UI_BITMAP); + mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); } drbd_md_mark_dirty(mdev); } @@ -3705,6 +3276,7 @@ { int rv = -EIO; + drbd_resume_al(mdev); if (get_ldev_if_state(mdev, D_ATTACHING)) { drbd_bm_clear_all(mdev); rv = drbd_bm_write(mdev); @@ -3714,18 +3286,22 @@ return rv; } -STATIC int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) +STATIC int w_bitmap_io(struct drbd_work *w, int unused) { struct bm_io_work *work = container_of(w, struct bm_io_work, w); - int rv; + struct drbd_conf *mdev = w->mdev; + int rv = -EIO; D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); - drbd_bm_lock(mdev, work->why); - rv = work->io_fn(mdev); - drbd_bm_unlock(mdev); + if (get_ldev(mdev)) { + drbd_bm_lock(mdev, work->why, work->flags); + rv = work->io_fn(mdev); + drbd_bm_unlock(mdev); + put_ldev(mdev); + } - clear_bit(BITMAP_IO, &mdev->flags); + clear_bit_unlock(BITMAP_IO, &mdev->flags); wake_up(&mdev->misc_wait); if (work->done) @@ -3733,8 +3309,42 @@ clear_bit(BITMAP_IO_QUEUED, &mdev->flags); work->why = NULL; + work->flags = 0; - return 1; + return 0; +} + +void drbd_ldev_destroy(struct drbd_conf *mdev) +{ + lc_destroy(mdev->resync); + mdev->resync = NULL; + lc_destroy(mdev->act_log); + mdev->act_log = NULL; + __no_warn(local, + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL;); + + clear_bit(GO_DISKLESS, 
&mdev->flags); +} + +STATIC int w_go_diskless(struct drbd_work *w, int unused) +{ + struct drbd_conf *mdev = w->mdev; + + D_ASSERT(mdev->state.disk == D_FAILED); + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will + * inc/dec it frequently. Once we are D_DISKLESS, no one will touch + * the protected members anymore, though, so once put_ldev reaches zero + * again, it will be safe to free them. */ + drbd_force_state(mdev, NS(disk, D_DISKLESS)); + return 0; +} + +void drbd_go_diskless(struct drbd_conf *mdev) +{ + D_ASSERT(mdev->state.disk == D_FAILED); + if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) + drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless); } /** @@ -3752,9 +3362,9 @@ void drbd_queue_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), void (*done)(struct drbd_conf *, int), - char *why) + char *why, enum bm_flag flags) { - D_ASSERT(current == mdev->worker.task); + D_ASSERT(current == mdev->tconn->worker.task); D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); @@ -3766,15 +3376,15 @@ mdev->bm_io_work.io_fn = io_fn; mdev->bm_io_work.done = done; mdev->bm_io_work.why = why; + mdev->bm_io_work.flags = flags; + spin_lock_irq(&mdev->tconn->req_lock); set_bit(BITMAP_IO, &mdev->flags); if (atomic_read(&mdev->ap_bio_cnt) == 0) { - if (list_empty(&mdev->bm_io_work.w.list)) { - set_bit(BITMAP_IO_QUEUED, &mdev->flags); - drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); - } else - dev_err(DEV, "FIXME avoided double queuing bm_io_work\n"); + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) + drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w); } + spin_unlock_irq(&mdev->tconn->req_lock); } /** @@ -3786,19 +3396,22 @@ * freezes application IO while that the actual IO operations runs. This * functions MAY NOT be called from worker context. 
*/ -int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) +int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), + char *why, enum bm_flag flags) { int rv; - D_ASSERT(current != mdev->worker.task); + D_ASSERT(current != mdev->tconn->worker.task); - drbd_suspend_io(mdev); + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_suspend_io(mdev); - drbd_bm_lock(mdev, why); + drbd_bm_lock(mdev, why, flags); rv = io_fn(mdev); drbd_bm_unlock(mdev); - drbd_resume_io(mdev); + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_resume_io(mdev); return rv; } @@ -3827,15 +3440,125 @@ { struct drbd_conf *mdev = (struct drbd_conf *) data; - drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); + drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work); } -STATIC int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) +STATIC int w_md_sync(struct drbd_work *w, int unused) { + struct drbd_conf *mdev = w->mdev; + dev_warn(DEV, "md_sync_timer expired! 
Worker calls drbd_md_sync().\n"); +#ifdef DRBD_DEBUG_MD_SYNC + dev_warn(DEV, "last md_mark_dirty: %s:%u\n", + mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line); +#endif drbd_md_sync(mdev); + return 0; +} - return 1; +const char *cmdname(enum drbd_packet cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_DISCARD_WRITE] = "DiscardWrite", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_CSUM_RS_REQUEST] = "CsumRSRequest", + [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_COMPRESSED_BITMAP] = "CBitmap", + [P_DELAY_PROBE] = "DelayProbe", + [P_OUT_OF_SYNC] = "OutOfSync", + [P_RETRY_WRITE] = "RetryWrite", + [P_RS_CANCEL] = "RSCancel", + [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", + [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", + [P_RETRY_WRITE] = "retry_write", + [P_PROTOCOL_UPDATE] = "protocol_update", + + /* enum 
drbd_packet, but not commands - obsoleted flags: + * P_MAY_IGNORE + * P_MAX_OPT_CMD + */ + }; + + /* too big for the array: 0xfffX */ + if (cmd == P_INITIAL_META) + return "InitialMeta"; + if (cmd == P_INITIAL_DATA) + return "InitialData"; + if (cmd == P_CONNECTION_FEATURES) + return "ConnectionFeatures"; + if (cmd >= ARRAY_SIZE(cmdnames)) + return "Unknown"; + return cmdnames[cmd]; +} + +/** + * drbd_wait_misc - wait for a request to make progress + * @mdev: device associated with the request + * @i: the struct drbd_interval embedded in struct drbd_request or + * struct drbd_peer_request + */ +int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i) +{ + struct net_conf *nc; + DEFINE_WAIT(wait); + long timeout; + + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + if (!nc) { + rcu_read_unlock(); + return -ETIMEDOUT; + } + timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; + rcu_read_unlock(); + + /* Indicate to wake up mdev->misc_wait on progress. */ + i->waiting = true; + prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock_irq(&mdev->tconn->req_lock); + timeout = schedule_timeout(timeout); + finish_wait(&mdev->misc_wait, &wait); + spin_lock_irq(&mdev->tconn->req_lock); + if (!timeout || mdev->state.conn < C_CONNECTED) + return -ETIMEDOUT; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; } #ifdef DRBD_ENABLE_FAULTS @@ -3879,7 +3602,8 @@ [DRBD_FAULT_DT_RD] = "Data read", [DRBD_FAULT_DT_RA] = "Data read ahead", [DRBD_FAULT_BM_ALLOC] = "BM allocation", - [DRBD_FAULT_AL_EE] = "EE allocation" + [DRBD_FAULT_AL_EE] = "EE allocation", + [DRBD_FAULT_RECEIVE] = "receive data corruption", }; return (type < DRBD_FAULT_MAX) ? 
_faults[type] : "**Unknown**"; @@ -3898,7 +3622,7 @@ if (ret) { fault_count++; - if (printk_ratelimit()) + if (DRBD_ratelimit(5*HZ, 5)) dev_warn(DEV, "***Simulating %s failure\n", _drbd_fault_str(type)); } diff -Nru drbd8-8.3.7/drbd/drbd_nl.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_nl.c --- drbd8-8.3.7/drbd/drbd_nl.c 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_nl.c 2012-02-02 14:09:14.000000000 +0000 @@ -30,151 +30,337 @@ #include #include #include -#include #include #include #include "drbd_int.h" -#include "drbd_tracing.h" -#include "drbd_wrappers.h" +#include "drbd_req.h" #include -#include #include -#include +#include -static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); -static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); -static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) +/* + * copied from more recent kernel source + */ +int genl_register_family_with_ops(struct genl_family *family, + struct genl_ops *ops, size_t n_ops) +{ + int err, i; + + err = genl_register_family(family); + if (err) + return err; + + for (i = 0; i < n_ops; ++i, ++ops) { + err = genl_register_ops(family, ops); + if (err) + goto err_out; + } + return 0; +err_out: + genl_unregister_family(family); + return err; +} +#endif -/* see get_sb_bdev and bd_claim */ +/* .doit */ +// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); +// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); + +int 
drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); +/* .dumpit */ +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); + +#include +#include "drbd_nla.h" +#include + +/* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! 
this is DRBD's meta data device."; -/* Generate the tag_list to struct functions */ -#define NL_PACKET(name, number, fields) \ -STATIC int name ## _from_tags(struct drbd_conf *mdev, \ - unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ -STATIC int name ## _from_tags(struct drbd_conf *mdev, \ - unsigned short *tags, struct name *arg) \ -{ \ - int tag; \ - int dlen; \ - \ - while ((tag = get_unaligned(tags++)) != TT_END) { \ - dlen = get_unaligned(tags++); \ - switch (tag_number(tag)) { \ - fields \ - default: \ - if (tag & T_MANDATORY) { \ - dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ - return 0; \ - } \ - } \ - tags = (unsigned short *)((char *)tags + dlen); \ - } \ - return 1; \ -} -#define NL_INTEGER(pn, pr, member) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ - arg->member = get_unaligned((int *)(tags)); \ - break; -#define NL_INT64(pn, pr, member) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ - arg->member = get_unaligned((u64 *)(tags)); \ +/* Configuration is strictly serialized, because generic netlink message + * processing is strictly serialized by the genl_lock(). + * Which means we can use one static global drbd_config_context struct. + */ +static struct drbd_config_context { + /* assigned from drbd_genlmsghdr */ + unsigned int minor; + /* assigned from request attributes, if present */ + unsigned int volume; +#define VOLUME_UNSPECIFIED (-1U) + /* pointer into the request skb, + * limited lifetime! 
*/ + char *resource_name; + struct nlattr *my_addr; + struct nlattr *peer_addr; + + /* reply buffer */ + struct sk_buff *reply_skb; + /* pointer into reply buffer */ + struct drbd_genlmsghdr *reply_dh; + /* resolved from attributes, if possible */ + struct drbd_conf *mdev; + struct drbd_tconn *tconn; +} adm_ctx; + +static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) +{ + genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); + if (genlmsg_reply(skb, info)) + printk(KERN_ERR "drbd: error sending genl reply\n"); +} + +/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only + * reason it could fail was no space in skb, and there are 4k available. */ +int drbd_msg_put_info(const char *info) +{ + struct sk_buff *skb = adm_ctx.reply_skb; + struct nlattr *nla; + int err = -EMSGSIZE; + + if (!info || !info[0]) + return 0; + + nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); + if (!nla) + return err; + + err = nla_put_string(skb, T_info_text, info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } else + nla_nest_end(skb, nla); + return 0; +} + +/* This would be a good candidate for a "pre_doit" hook, + * and per-family private info->pointers. + * But we need to stay compatible with older kernels. + * If it returns successfully, adm_ctx members are valid. 
+ */ +#define DRBD_ADM_NEED_MINOR 1 +#define DRBD_ADM_NEED_RESOURCE 2 +#define DRBD_ADM_NEED_CONNECTION 4 +static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, + unsigned flags) +{ + struct drbd_genlmsghdr *d_in = info->userhdr; + const u8 cmd = info->genlhdr->cmd; + int err; + + memset(&adm_ctx, 0, sizeof(adm_ctx)); + + /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ + if (cmd != DRBD_ADM_GET_STATUS + && security_netlink_recv(skb, CAP_SYS_ADMIN)) + return -EPERM; + + adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx.reply_skb) { + err = -ENOMEM; + goto fail; + } + + adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, + info, &drbd_genl_family, 0, cmd); + /* put of a few bytes into a fresh skb of >= 4k will always succeed. + * but anyways */ + if (!adm_ctx.reply_dh) { + err = -ENOMEM; + goto fail; + } + + adm_ctx.reply_dh->minor = d_in->minor; + adm_ctx.reply_dh->ret_code = NO_ERROR; + + adm_ctx.volume = VOLUME_UNSPECIFIED; + if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { + struct nlattr *nla; + /* parse and validate only */ + err = drbd_cfg_context_from_attrs(NULL, info); + if (err) + goto fail; + + /* It was present, and valid, + * copy it over to the reply skb. 
*/ + err = nla_put_nohdr(adm_ctx.reply_skb, + info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, + info->attrs[DRBD_NLA_CFG_CONTEXT]); + if (err) + goto fail; + + /* and assign stuff to the global adm_ctx */ + nla = nested_attr_tb[__nla_type(T_ctx_volume)]; + if (nla) + adm_ctx.volume = nla_get_u32(nla); + nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; + if (nla) + adm_ctx.resource_name = nla_data(nla); + adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx.my_addr && + nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) || + (adm_ctx.peer_addr && + nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) { + err = -EINVAL; + goto fail; + } + } + + adm_ctx.minor = d_in->minor; + adm_ctx.mdev = minor_to_mdev(d_in->minor); + adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name); + + if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info("unknown minor"); + return ERR_MINOR_INVALID; + } + if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info("unknown resource"); + return ERR_INVALID_REQUEST; + } + + if (flags & DRBD_ADM_NEED_CONNECTION) { + if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info("no resource name expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx.mdev) { + drbd_msg_put_info("no minor number expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx.my_addr && adm_ctx.peer_addr) + adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr), + nla_len(adm_ctx.my_addr), + nla_data(adm_ctx.peer_addr), + nla_len(adm_ctx.peer_addr)); + if (!adm_ctx.tconn) { + drbd_msg_put_info("unknown connection"); + return ERR_INVALID_REQUEST; + } + } + + /* some more paranoia, if the request was over-determined */ + if (adm_ctx.mdev && adm_ctx.tconn && + adm_ctx.mdev->tconn != adm_ctx.tconn) { + pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n", + 
adm_ctx.minor, adm_ctx.resource_name, + adm_ctx.mdev->tconn->name); + drbd_msg_put_info("minor exists in different resource"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx.mdev && + adm_ctx.volume != VOLUME_UNSPECIFIED && + adm_ctx.volume != adm_ctx.mdev->vnr) { + pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", + adm_ctx.minor, adm_ctx.volume, + adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name); + drbd_msg_put_info("minor exists as different volume"); + return ERR_INVALID_REQUEST; + } + + return NO_ERROR; + +fail: + nlmsg_free(adm_ctx.reply_skb); + adm_ctx.reply_skb = NULL; + return err; +} + +static int drbd_adm_finish(struct genl_info *info, int retcode) +{ + if (adm_ctx.tconn) { + kref_put(&adm_ctx.tconn->kref, &conn_destroy); + adm_ctx.tconn = NULL; + } + + if (!adm_ctx.reply_skb) + return -ENOMEM; + + adm_ctx.reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx.reply_skb, info); + return 0; +} + +static void setup_khelper_env(struct drbd_tconn *tconn, char **envp) +{ + char *afs; + + /* FIXME: A future version will not allow this case. */ + if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0) + return; + + switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr); break; -#define NL_BIT(pn, pr, member) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ - arg->member = *(char *)(tags) ? 
1 : 0; \ + case AF_INET: + afs = "ipv4"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); break; -#define NL_STRING(pn, pr, member, len) \ - case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ - if (dlen > len) { \ - dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ - #member, dlen, (unsigned int)len); \ - return 0; \ - } \ - arg->member ## _len = dlen; \ - memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ - break; -#include "linux/drbd_nl.h" - -/* Generate the struct to tag_list functions */ -#define NL_PACKET(name, number, fields) \ -STATIC unsigned short* \ -name ## _to_tags(struct drbd_conf *mdev, \ - struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ -STATIC unsigned short* \ -name ## _to_tags(struct drbd_conf *mdev, \ - struct name *arg, unsigned short *tags) \ -{ \ - fields \ - return tags; \ -} - -#define NL_INTEGER(pn, pr, member) \ - put_unaligned(pn | pr | TT_INTEGER, tags++); \ - put_unaligned(sizeof(int), tags++); \ - put_unaligned(arg->member, (int *)tags); \ - tags = (unsigned short *)((char *)tags+sizeof(int)); -#define NL_INT64(pn, pr, member) \ - put_unaligned(pn | pr | TT_INT64, tags++); \ - put_unaligned(sizeof(u64), tags++); \ - put_unaligned(arg->member, (u64 *)tags); \ - tags = (unsigned short *)((char *)tags+sizeof(u64)); -#define NL_BIT(pn, pr, member) \ - put_unaligned(pn | pr | TT_BIT, tags++); \ - put_unaligned(sizeof(char), tags++); \ - *(char *)tags = arg->member; \ - tags = (unsigned short *)((char *)tags+sizeof(char)); -#define NL_STRING(pn, pr, member, len) \ - put_unaligned(pn | pr | TT_STRING, tags++); \ - put_unaligned(arg->member ## _len, tags++); \ - memcpy(tags, arg->member, arg->member ## _len); \ - tags = (unsigned short *)((char *)tags + arg->member ## _len); -#include "linux/drbd_nl.h" - -void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); -void drbd_nl_send_reply(struct cn_msg *, int); + default: + afs 
= "ssocks"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr); + } + snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); +} int drbd_khelper(struct drbd_conf *mdev, char *cmd) { char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL, /* Will be set to address family */ - NULL, /* Will be set to address */ + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ NULL }; - - char mb[12], af[20], ad[60], *afs; + char mb[12]; char *argv[] = {usermode_helper, cmd, mb, NULL }; + struct sib_info sib; int ret; snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); + setup_khelper_env(mdev->tconn, envp); - if (get_net_conf(mdev)) { - switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { - case AF_INET6: - afs = "ipv6"; - snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", - &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); - break; - case AF_INET: - afs = "ipv4"; - snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", - &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); - break; - default: - afs = "ssocks"; - snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", - &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); - } - snprintf(af, 20, "DRBD_PEER_AF=%s", afs); - envp[3]=af; - envp[4]=ad; - put_net_conf(mdev); - } + /* The helper may take some time. 
+ * write out any unsynced meta data changes now */ + drbd_md_sync(mdev); dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); - - drbd_bcast_ev_helper(mdev, cmd); + sib.sib_reason = SIB_HELPER_PRE; + sib.helper_name = cmd; + drbd_bcast_event(mdev, &sib); ret = call_usermodehelper(usermode_helper, argv, envp, 1); if (ret) dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", @@ -184,6 +370,59 @@ dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); + sib.sib_reason = SIB_HELPER_POST; + sib.helper_exit_code = ret; + drbd_bcast_event(mdev, &sib); + + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; +} + +static void conn_md_sync(struct drbd_tconn *tconn) +{ + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + kref_get(&mdev->kref); + rcu_read_unlock(); + drbd_md_sync(mdev); + kref_put(&mdev->kref, &drbd_minor_destroy); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +int conn_khelper(struct drbd_tconn *tconn, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char *argv[] = {usermode_helper, cmd, tconn->name, NULL }; + int ret; + + setup_khelper_env(tconn, envp); + conn_md_sync(tconn); + + conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name); + /* TODO: conn_bcast_event() ?? */ + + ret = call_usermodehelper(usermode_helper, argv, envp, 1); + if (ret) + conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, tconn->name, + (ret >> 8) & 0xff, ret); + else + conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, tconn->name, + (ret >> 8) & 0xff, ret); + /* TODO: conn_bcast_event() ?? */ if (ret < 0) /* Ignore any ERRNOs we got. 
*/ ret = 0; @@ -191,166 +430,217 @@ return ret; } -enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) +static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) +{ + enum drbd_fencing_p fp = FP_NOT_AVAIL; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (get_ldev_if_state(mdev, D_CONSISTENT)) { + fp = max_t(enum drbd_fencing_p, fp, + rcu_dereference(mdev->ldev->disk_conf)->fencing); + put_ldev(mdev); + } + } + rcu_read_unlock(); + + return fp; +} + +bool conn_try_outdate_peer(struct drbd_tconn *tconn) { + union drbd_state mask = { }; + union drbd_state val = { }; + enum drbd_fencing_p fp; char *ex_to_string; int r; - enum drbd_disk_state nps; - enum drbd_fencing_p fp; - - D_ASSERT(mdev->state.pdsk == D_UNKNOWN); - if (get_ldev_if_state(mdev, D_CONSISTENT)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); - } else { - dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); - return mdev->state.pdsk; + if (tconn->cstate >= C_WF_REPORT_PARAMS) { + conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n"); + return false; } - if (fp == FP_STONITH) - _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); + fp = highest_fencing_policy(tconn); + switch (fp) { + case FP_NOT_AVAIL: + conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n"); + goto out; + case FP_DONT_CARE: + return true; + default: ; + } - r = drbd_khelper(mdev, "fence-peer"); + r = conn_khelper(tconn, "fence-peer"); switch ((r>>8) & 0xff) { case 3: /* peer is inconsistent */ ex_to_string = "peer is inconsistent or worse"; - nps = D_INCONSISTENT; + mask.pdsk = D_MASK; + val.pdsk = D_INCONSISTENT; break; case 4: /* peer got outdated, or was already outdated */ ex_to_string = "peer was fenced"; - nps = D_OUTDATED; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; break; case 5: /* peer was down */ - if (mdev->state.disk == D_UP_TO_DATE) { + if (conn_highest_disk(tconn) == 
D_UP_TO_DATE) { /* we will(have) create(d) a new UUID anyways... */ ex_to_string = "peer is unreachable, assumed to be dead"; - nps = D_OUTDATED; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; } else { ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; - nps = mdev->state.pdsk; } break; case 6: /* Peer is primary, voluntarily outdate myself. * This is useful when an unconnected R_SECONDARY is asked to * become R_PRIMARY, but finds the other peer being active. */ ex_to_string = "peer is active"; - dev_warn(DEV, "Peer is primary, outdating myself.\n"); - nps = D_UNKNOWN; - _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); + conn_warn(tconn, "Peer is primary, outdating myself.\n"); + mask.disk = D_MASK; + val.disk = D_OUTDATED; break; case 7: /* THINK: do we need to handle this * like case 4, or more like case 5? */ if (fp != FP_STONITH) - dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); + conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n"); ex_to_string = "peer was stonithed"; - nps = D_OUTDATED; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; break; default: /* The script is broken ... */ - nps = D_UNKNOWN; - dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); - return nps; + conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); + return false; /* Eventually leave IO frozen */ } - dev_info(DEV, "fence-peer helper returned %d (%s)\n", - (r>>8) & 0xff, ex_to_string); - return nps; + conn_info(tconn, "fence-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); + + out: + + /* Not using + conn_request_state(tconn, mask, val, CS_VERBOSE); + here, because we might were able to re-establish the connection in the + meantime. 
*/ + spin_lock_irq(&tconn->req_lock); + if (tconn->cstate < C_WF_REPORT_PARAMS) + _conn_request_state(tconn, mask, val, CS_VERBOSE); + spin_unlock_irq(&tconn->req_lock); + + return conn_highest_pdsk(tconn) <= D_OUTDATED; } +static int _try_outdate_peer_async(void *data) +{ + struct drbd_tconn *tconn = (struct drbd_tconn *)data; + + conn_try_outdate_peer(tconn); + + kref_put(&tconn->kref, &conn_destroy); + return 0; +} + +void conn_try_outdate_peer_async(struct drbd_tconn *tconn) +{ + struct task_struct *opa; + + kref_get(&tconn->kref); + opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h"); + if (IS_ERR(opa)) { + conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n"); + kref_put(&tconn->kref, &conn_destroy); + } +} -int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) +enum drbd_state_rv +drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) { const int max_tries = 4; - int r = 0; + enum drbd_state_rv rv = SS_UNKNOWN_ERROR; + struct net_conf *nc; int try = 0; int forced = 0; union drbd_state mask, val; - enum drbd_disk_state nps; if (new_role == R_PRIMARY) - request_ping(mdev); /* Detect a dead peer ASAP */ + request_ping(mdev->tconn); /* Detect a dead peer ASAP */ - mutex_lock(&mdev->state_mutex); + mutex_lock(mdev->state_mutex); mask.i = 0; mask.role = R_MASK; val.i = 0; val.role = new_role; while (try++ < max_tries) { - DRBD_STATE_DEBUG_INIT_VAL(val); - r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); + rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); /* in case we first succeeded to outdate, * but now suddenly could establish a connection */ - if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { + if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { val.pdsk = 0; mask.pdsk = 0; continue; } - if (r == SS_NO_UP_TO_DATE_DISK && force && - (mdev->state.disk == D_INCONSISTENT || - mdev->state.disk == D_OUTDATED)) { + if (rv == SS_NO_UP_TO_DATE_DISK && force && + 
(mdev->state.disk < D_UP_TO_DATE && + mdev->state.disk >= D_INCONSISTENT)) { mask.disk = D_MASK; val.disk = D_UP_TO_DATE; forced = 1; continue; } - if (r == SS_NO_UP_TO_DATE_DISK && + if (rv == SS_NO_UP_TO_DATE_DISK && mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { D_ASSERT(mdev->state.pdsk == D_UNKNOWN); - nps = drbd_try_outdate_peer(mdev); - if (nps == D_OUTDATED || nps == D_INCONSISTENT) { + if (conn_try_outdate_peer(mdev->tconn)) { val.disk = D_UP_TO_DATE; mask.disk = D_MASK; } - - val.pdsk = nps; - mask.pdsk = D_MASK; - continue; } - if (r == SS_NOTHING_TO_DO) - goto fail; - if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { - nps = drbd_try_outdate_peer(mdev); - - if (force && nps > D_OUTDATED) { + if (rv == SS_NOTHING_TO_DO) + goto out; + if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { + if (!conn_try_outdate_peer(mdev->tconn) && force) { dev_warn(DEV, "Forced into split brain situation!\n"); - nps = D_OUTDATED; - } - - mask.pdsk = D_MASK; - val.pdsk = nps; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + } continue; } - if (r == SS_TWO_PRIMARIES) { + if (rv == SS_TWO_PRIMARIES) { /* Maybe the peer is detected as dead very soon... retry at most once more in this case. */ - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); + int timeo; + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + timeo = nc ? 
(nc->ping_timeo + 1) * HZ / 10 : 1; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); if (try < max_tries) try = max_tries - 1; continue; } - if (r < SS_SUCCESS) { - DRBD_STATE_DEBUG_INIT_VAL(val); - r = _drbd_request_state(mdev, mask, val, + if (rv < SS_SUCCESS) { + rv = _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_WAIT_COMPLETE); - if (r < SS_SUCCESS) - goto fail; + if (rv < SS_SUCCESS) + goto out; } break; } - if (r < SS_SUCCESS) - goto fail; + if (rv < SS_SUCCESS) + goto out; if (forced) dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); @@ -359,17 +649,19 @@ wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); if (new_role == R_SECONDARY) { - set_disk_ro(mdev->vdisk, TRUE); + set_disk_ro(mdev->vdisk, true); if (get_ldev(mdev)) { mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; put_ldev(mdev); } } else { - if (get_net_conf(mdev)) { - mdev->net_conf->want_lose = 0; - put_net_conf(mdev); - } - set_disk_ro(mdev->vdisk, FALSE); + mutex_lock(&mdev->tconn->conf_update); + nc = mdev->tconn->net_conf; + if (nc) + nc->discard_my_data = 0; /* without copy; single bit op is atomic */ + mutex_unlock(&mdev->tconn->conf_update); + + set_disk_ro(mdev->vdisk, false); if (get_ldev(mdev)) { if (((mdev->state.conn < C_CONNECTED || mdev->state.pdsk <= D_FAILED) @@ -381,49 +673,60 @@ } } - if ((new_role == R_SECONDARY) && get_ldev(mdev)) { - drbd_al_to_on_disk_bm(mdev); - put_ldev(mdev); - } + /* writeout of activity log covered areas of the bitmap + * to stable storage done in after state change already */ if (mdev->state.conn >= C_WF_REPORT_PARAMS) { /* if this was forced, we should consider sync */ if (forced) drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } drbd_md_sync(mdev); drbd_kobject_uevent(mdev); - fail: - mutex_unlock(&mdev->state_mutex); - return r; +out: + mutex_unlock(mdev->state_mutex); + return rv; } - -STATIC int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - 
struct drbd_nl_cfg_reply *reply) +static const char *from_attrs_err_to_txt(int err) { - struct primary primary_args; - - memset(&primary_args, 0, sizeof(struct primary)); - if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { - reply->ret_code = ERR_MANDATORY_TAG; - return 0; - } - - reply->ret_code = - drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer); - - return 0; + return err == -ENOMSG ? "required attribute missing" : + err == -EOPNOTSUPP ? "unknown mandatory attribute" : + err == -EEXIST ? "can not change invariant setting" : + "invalid attribute value"; } -STATIC int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) { - reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); + struct set_role_parms parms; + int err; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { + err = set_role_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto out; + } + } + + if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) + retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate); + else + retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0); +out: + drbd_adm_finish(info, retcode); return 0; } @@ -433,7 +736,12 @@ struct drbd_backing_dev *bdev) { sector_t md_size_sect = 0; - switch (bdev->dc.meta_dev_idx) { + int meta_dev_idx; + + rcu_read_lock(); + meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + + switch (meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ bdev->md.md_size_sect = MD_RESERVED_SECT; @@ -452,7 +760,7 @@ case DRBD_MD_INDEX_FLEX_INT: bdev->md.md_offset = drbd_md_ss__(mdev, bdev); /* al size is still 
fixed */ - bdev->md.al_offset = -MD_AL_MAX_SIZE; + bdev->md.al_offset = -MD_AL_SECTORS; /* we need (slightly less than) ~ this much bitmap sectors: */ md_size_sect = drbd_get_capacity(bdev->backing_bdev); md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); @@ -468,19 +776,22 @@ bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; break; } + rcu_read_unlock(); } +/* input size is expected to be in KB */ char *ppsize(char *buf, unsigned long long size) { - /* Needs 9 bytes at max. */ + /* Needs 9 bytes at max including trailing NUL: + * -1ULL ==> "16384 EB" */ static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; int base = 0; - while (size >= 10000) { + while (size >= 10000 && base < sizeof(units)-1) { /* shift + round */ size = (size >> 10) + !!(size & (1<<9)); base++; } - sprintf(buf, "%lu %cB", (long)size, units[base]); + sprintf(buf, "%u %cB", (unsigned)size, units[base]); return buf; } @@ -498,9 +809,17 @@ * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: * peer may not initiate a resize. */ +/* Note these are not to be confused with + * drbd_adm_suspend_io/drbd_adm_resume_io, + * which are (sub) state changes triggered by admin (drbdsetup), + * and can be long lived. + * This changes an mdev->flag, is triggered by drbd internals, + * and should be short-lived. */ void drbd_suspend_io(struct drbd_conf *mdev) { set_bit(SUSPEND_IO, &mdev->flags); + if (drbd_suspended(mdev)) + return; wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); } @@ -517,10 +836,10 @@ * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function. 
*/ -enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local) +enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size; + sector_t la_size, u_size; sector_t size; char ppb[10]; @@ -548,12 +867,15 @@ /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); - size = drbd_new_dev_size(mdev, mdev->ldev, force); + rcu_read_lock(); + u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); if (drbd_get_capacity(mdev->this_bdev) != size || drbd_bm_capacity(mdev) != size) { int err; - err = drbd_bm_resize(mdev, size); + err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */ size = drbd_bm_capacity(mdev)>>1; @@ -582,11 +904,19 @@ || prev_size != mdev->ldev->md.md_size_sect; if (la_size_changed || md_moved) { + int err; + drbd_al_shrink(mdev); /* All extents inactive. */ dev_info(DEV, "Writing the whole bitmap, %s\n", la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); - rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ + /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ + err = drbd_bitmap_io(mdev, &drbd_bm_write, + "size changed", BM_LOCKED_MASK); + if (err) { + rv = dev_size_error; + goto out; + } drbd_md_mark_dirty(mdev); } @@ -603,12 +933,12 @@ } sector_t -drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space) +drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + sector_t u_size, int assume_peer_has_space) { sector_t p_size = mdev->p_size; /* partner's disk size. 
*/ sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ sector_t m_size; /* my size */ - sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ sector_t size = 0; m_size = drbd_get_max_capacity(bdev); @@ -657,24 +987,21 @@ * failed, and 0 on success. You should call drbd_md_sync() after you called * this function. */ -STATIC int drbd_check_al_size(struct drbd_conf *mdev) +STATIC int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc) { struct lru_cache *n, *t; struct lc_element *e; unsigned int in_use; int i; - ERR_IF(mdev->sync_conf.al_extents < 7) - mdev->sync_conf.al_extents = 127; - if (mdev->act_log && - mdev->act_log->nr_elements == mdev->sync_conf.al_extents) + mdev->act_log->nr_elements == dc->al_extents) return 0; in_use = 0; t = mdev->act_log; - n = lc_create("act_log", drbd_al_ext_cache, - mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); + n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, + dc->al_extents, sizeof(struct lc_element), 0); if (n == NULL) { dev_err(DEV, "Cannot allocate act_log lru!\n"); @@ -705,229 +1032,413 @@ return 0; } -void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) +static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) { struct request_queue * const q = mdev->rq_queue; - struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - int max_segments = mdev->ldev->dc.max_bio_bvecs; + int max_hw_sectors = max_bio_size >> 9; + int max_segments = 0; - if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) - max_seg_s = PAGE_SIZE; + if (get_ldev_if_state(mdev, D_ATTACHING)) { + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); + max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + rcu_read_lock(); + max_segments = 
rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs; + rcu_read_unlock(); + put_ldev(mdev); + } - blk_queue_max_sectors(q, max_seg_s >> 9); - blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS); - blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS); - blk_queue_max_segment_size(q, max_seg_s); blk_queue_logical_block_size(q, 512); - blk_queue_segment_boundary(q, PAGE_SIZE-1); - blk_queue_stack_limits(q, b); - - /* KERNEL BUG in old ll_rw_blk.c - * t->max_segment_size = min(t->max_segment_size,b->max_segment_size); - * should be - * t->max_segment_size = min_not_zero(...,...) - * workaround here: */ - if (queue_max_segment_size(q) == 0) - blk_queue_max_segment_size(q, max_seg_s); - - if (b->merge_bvec_fn) - dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", - b->merge_bvec_fn); - dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); - - if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { - dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", - q->backing_dev_info.ra_pages, - b->backing_dev_info.ra_pages); - q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + blk_queue_max_hw_sectors(q, max_hw_sectors); + /* This is the workaround for "bio would need to, but cannot, be split" */ + blk_queue_max_segments(q, max_segments ? 
max_segments : BLK_MAX_SEGMENTS); + blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; + + blk_queue_stack_limits(q, b); + + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } + put_ldev(mdev); } } -/* serialize deconfig (worker exiting, doing cleanup) - * and reconfig (drbdsetup disk, drbdsetup net) - * - * wait for a potentially exiting worker, then restart it, - * or start a new one. - */ -static void drbd_reconfig_start(struct drbd_conf *mdev) +void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) { - wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); - wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); - drbd_thread_start(&mdev->worker); + int now, new, local, peer; + + now = queue_max_hw_sectors(mdev->rq_queue) << 9; + local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */ + peer = mdev->peer_max_bio_size; /* Eventually last known value, from meta data */ + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; + mdev->local_max_bio_size = local; + put_ldev(mdev); + } + + /* We may ignore peer limits if the peer is modern enough. 
+ Because new from 8.3.8 onwards the peer can use multiple + BIOs for a single peer_request */ + if (mdev->state.conn >= C_CONNECTED) { + if (mdev->tconn->agreed_pro_version < 94) + peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + else if (mdev->tconn->agreed_pro_version == 94) + peer = DRBD_MAX_SIZE_H80_PACKET; + else if (mdev->tconn->agreed_pro_version < 100) + peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ + else + peer = DRBD_MAX_BIO_SIZE; + } + + new = min_t(int, local, peer); + + if (mdev->state.role == R_PRIMARY && new < now) + dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now); + + if (new != now) + dev_info(DEV, "max BIO size = %u\n", new); + + drbd_setup_queue_param(mdev, new); } -/* if still unconfigured, stops worker again. - * if configured now, clears CONFIG_PENDING. - * wakes potential waiters */ -static void drbd_reconfig_done(struct drbd_conf *mdev) +/* Starts the worker thread */ +static void conn_reconfig_start(struct drbd_tconn *tconn) { - spin_lock_irq(&mdev->req_lock); - if (mdev->state.disk == D_DISKLESS && - mdev->state.conn == C_STANDALONE && - mdev->state.role == R_SECONDARY) { - set_bit(DEVICE_DYING, &mdev->flags); - drbd_thread_stop_nowait(&mdev->worker); - } else - clear_bit(CONFIG_PENDING, &mdev->flags); - spin_unlock_irq(&mdev->req_lock); - wake_up(&mdev->state_wait); + drbd_thread_start(&tconn->worker); + conn_flush_workqueue(tconn); } -/* does always return 0; - * interesting return code is in reply->ret_code */ -STATIC int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +/* if still unconfigured, stops worker again. 
*/ +static void conn_reconfig_done(struct drbd_tconn *tconn) { - enum drbd_ret_codes retcode; - enum determine_dev_size dd; - sector_t max_possible_sectors; - sector_t min_md_device_sectors; - struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ - struct inode *inode, *inode2; - struct lru_cache *resync_lru = NULL; - union drbd_state ns, os; - int rv; - int cp_discovered = 0; - int logical_block_size; + bool stop_threads; + spin_lock_irq(&tconn->req_lock); + stop_threads = conn_all_vols_unconf(tconn) && + tconn->cstate == C_STANDALONE; + spin_unlock_irq(&tconn->req_lock); + if (stop_threads) { + /* asender is implicitly stopped by receiver + * in conn_disconnect() */ + drbd_thread_stop(&tconn->receiver); + drbd_thread_stop(&tconn->worker); + } +} - drbd_reconfig_start(mdev); +/* Make sure IO is suspended before calling this function(). */ +static void drbd_suspend_al(struct drbd_conf *mdev) +{ + int s = 0; - /* if you want to reconfigure, please tear down first */ - if (mdev->state.disk > D_DISKLESS) { - retcode = ERR_DISK_CONFIGURED; - goto fail; + if (!lc_try_lock(mdev->act_log)) { + dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n"); + return; } - /* allocation not in the IO path, cqueue thread context */ - nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); - if (!nbc) { - retcode = ERR_NOMEM; - goto fail; - } + drbd_al_shrink(mdev); + spin_lock_irq(&mdev->tconn->req_lock); + if (mdev->state.conn < C_CONNECTED) + s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags); + spin_unlock_irq(&mdev->tconn->req_lock); + lc_unlock(mdev->act_log); - nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; - nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; - nbc->dc.fencing = DRBD_FENCING_DEF; - nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; + if (s) + dev_info(DEV, "Suspended AL updates\n"); +} - if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { - retcode = ERR_MANDATORY_TAG; - goto fail; - } - if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { - retcode = 
ERR_MD_IDX_INVALID; - goto fail; - } +static bool should_set_defaults(struct genl_info *info) +{ + unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; + return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); +} - nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); - if (IS_ERR(nbc->lo_file)) { - dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, - PTR_ERR(nbc->lo_file)); - nbc->lo_file = NULL; - retcode = ERR_OPEN_DISK; - goto fail; - } +static void enforce_disk_conf_limits(struct disk_conf *dc) +{ + if (dc->al_extents < DRBD_AL_EXTENTS_MIN) + dc->al_extents = DRBD_AL_EXTENTS_MIN; + if (dc->al_extents > DRBD_AL_EXTENTS_MAX) + dc->al_extents = DRBD_AL_EXTENTS_MAX; - inode = nbc->lo_file->f_dentry->d_inode; + if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; +} - if (!S_ISBLK(inode->i_mode)) { - retcode = ERR_DISK_NOT_BDEV; - goto fail; +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) +{ + enum drbd_ret_code retcode; + struct drbd_conf *mdev; + struct disk_conf *new_disk_conf, *old_disk_conf; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int err, fifo_size; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mdev = adm_ctx.mdev; + + /* we also need a disk + * to change the options on */ + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; + goto out; } - nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); - if (IS_ERR(nbc->md_file)) { - dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, - PTR_ERR(nbc->md_file)); - nbc->md_file = NULL; - retcode = ERR_OPEN_MD_DISK; + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; goto fail; } - inode2 = nbc->md_file->f_dentry->d_inode; + mutex_lock(&mdev->tconn->conf_update); + old_disk_conf = mdev->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + if 
(should_set_defaults(info)) + set_disk_conf_defaults(new_disk_conf); - if (!S_ISBLK(inode2->i_mode)) { - retcode = ERR_MD_NOT_BDEV; - goto fail; + err = disk_conf_from_attrs_for_change(new_disk_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); } - nbc->backing_bdev = inode->i_bdev; - if (bd_claim(nbc->backing_bdev, mdev)) { - printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", - nbc->backing_bdev, mdev, - nbc->backing_bdev->bd_holder, - nbc->backing_bdev->bd_contains->bd_holder, - nbc->backing_bdev->bd_holders); - retcode = ERR_BDCLAIM_DISK; - goto fail; - } + if (!expect(new_disk_conf->resync_rate >= 1)) + new_disk_conf->resync_rate = 1; - resync_lru = lc_create("resync", drbd_bm_ext_cache, - 61, sizeof(struct bm_extent), - offsetof(struct bm_extent, lce)); - if (!resync_lru) { + enforce_disk_conf_limits(new_disk_conf); + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + retcode = ERR_NOMEM; + goto fail_unlock; + } + } + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + drbd_al_shrink(mdev); + err = drbd_check_al_size(mdev, new_disk_conf); + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + if (err) { + retcode = ERR_NOMEM; + goto fail_unlock; + } + + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); + if (retcode == NO_ERROR) { + rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); + drbd_resync_after_changed(mdev); + } + write_unlock_irq(&global_state_lock); + + if (retcode != NO_ERROR) + goto fail_unlock; + + if (new_plan) { + old_plan = mdev->rs_plan_s; + rcu_assign_pointer(mdev->rs_plan_s, new_plan); + } + + mutex_unlock(&mdev->tconn->conf_update); + drbd_md_sync(mdev); + + if (mdev->state.conn >= C_CONNECTED) + 
drbd_send_sync_param(mdev); + + synchronize_rcu(); + kfree(old_disk_conf); + kfree(old_plan); + mod_timer(&mdev->request_timer, jiffies + HZ); + goto success; + +fail_unlock: + mutex_unlock(&mdev->tconn->conf_update); + fail: + kfree(new_disk_conf); + kfree(new_plan); +success: + put_ldev(mdev); + out: + drbd_adm_finish(info, retcode); + return 0; +} + +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_conf *mdev; + int err; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + sector_t max_possible_sectors; + sector_t min_md_device_sectors; + struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ + struct disk_conf *new_disk_conf = NULL; + struct block_device *bdev; + struct lru_cache *resync_lru = NULL; + struct fifo_buffer *new_plan = NULL; + union drbd_state ns, os; + enum drbd_state_rv rv; + struct net_conf *nc; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + mdev = adm_ctx.mdev; + conn_reconfig_start(mdev->tconn); + + /* if you want to reconfigure, please tear down first */ + if (mdev->state.disk > D_DISKLESS) { + retcode = ERR_DISK_CONFIGURED; + goto fail; + } + /* It may just now have detached because of IO error. Make sure + * drbd_ldev_destroy is done already, we may end up here very fast, + * e.g. 
if someone calls attach from the on-io-error handler, + * to realize a "hot spare" feature (not that I'd recommend that) */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + + /* allocation not in the IO path, drbdsetup context */ + nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); + if (!nbc) { + retcode = ERR_NOMEM; + goto fail; + } + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail; + } + nbc->disk_conf = new_disk_conf; + + set_disk_conf_defaults(new_disk_conf); + err = disk_conf_from_attrs(new_disk_conf, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto fail; + } + + enforce_disk_conf_limits(new_disk_conf); + + new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); + if (!new_plan) { retcode = ERR_NOMEM; - goto release_bdev_fail; + goto fail; + } + + if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + if (nc) { + if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { + rcu_read_unlock(); + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } + rcu_read_unlock(); + + bdev = blkdev_get_by_path(new_disk_conf->backing_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); + if (IS_ERR(bdev)) { + dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_DISK; + goto fail; } + nbc->backing_bdev = bdev; - /* meta_dev_idx >= 0: external fixed size, - * possibly multiple drbd sharing one meta device. - * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is - * not yet used by some other drbd minor! 
- * (if you use drbd.conf + drbdadm, - * that should check it for you already; but if you don't, or someone - * fooled it, we need to double check here) */ - nbc->md_bdev = inode2->i_bdev; - if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev - : (void *) drbd_m_holder)) { - retcode = ERR_BDCLAIM_MD_DISK; - goto release_bdev_fail; + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = blkdev_get_by_path(new_disk_conf->meta_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + (new_disk_conf->meta_dev_idx < 0) ? + (void *)mdev : (void *)drbd_m_holder); + if (IS_ERR(bdev)) { + dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_MD_DISK; + goto fail; } + nbc->md_bdev = bdev; if ((nbc->backing_bdev == nbc->md_bdev) != - (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || - nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { retcode = ERR_MD_IDX_INVALID; - goto release_bdev2_fail; + goto fail; + } + + resync_lru = lc_create("resync", drbd_bm_ext_cache, + 1, 61, sizeof(struct bm_extent), + offsetof(struct bm_extent, lce)); + if (!resync_lru) { + retcode = ERR_NOMEM; + goto fail; } /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ drbd_md_set_sector_offsets(mdev, nbc); - if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { + if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), - (unsigned long long) nbc->dc.disk_size); - retcode = 
ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + (unsigned long long) new_disk_conf->disk_size); + retcode = ERR_DISK_TOO_SMALL; + goto fail; } - if (nbc->dc.meta_dev_idx < 0) { + if (new_disk_conf->meta_dev_idx < 0) { max_possible_sectors = DRBD_MAX_SECTORS_FLEX; /* at least one MB, otherwise it does not make sense */ min_md_device_sectors = (2<<10); } else { max_possible_sectors = DRBD_MAX_SECTORS; - min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); + min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { - retcode = ERR_MD_DISK_TO_SMALL; + retcode = ERR_MD_DISK_TOO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); - goto release_bdev2_fail; + goto fail; } /* Make sure the new disk is big enough * (we may currently be R_PRIMARY with no local disk...) */ if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { - retcode = ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + retcode = ERR_DISK_TOO_SMALL; + goto fail; } nbc->known_size = drbd_get_capacity(nbc->backing_bdev); @@ -936,21 +1447,22 @@ dev_warn(DEV, "==> truncating very big lower level device " "to currently maximum possible %llu sectors <==\n", (unsigned long long) max_possible_sectors); - if (nbc->dc.meta_dev_idx >= 0) + if (new_disk_conf->meta_dev_idx >= 0) dev_warn(DEV, "==>> using internal or flexible " "meta data may help <<==\n"); } drbd_suspend_io(mdev); /* also wait for the last barrier ack. 
*/ - wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev)); /* and for any other previously queued work */ drbd_flush_workqueue(mdev); - retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); + rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); + retcode = rv; /* FIXME: Type mismatch. */ drbd_resume_io(mdev); - if (retcode < SS_SUCCESS) - goto release_bdev2_fail; + if (rv < SS_SUCCESS) + goto fail; if (!get_ldev_if_state(mdev, D_ATTACHING)) goto force_diskless; @@ -978,49 +1490,25 @@ } /* Since we are diskless, fix the activity log first... */ - if (drbd_check_al_size(mdev)) { + if (drbd_check_al_size(mdev, new_disk_conf)) { retcode = ERR_NOMEM; goto force_diskless_dec; } /* Prevent shrinking of consistent devices ! */ if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && - drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { + drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TO_SMALL; - goto force_diskless_dec; - } - - if (!drbd_al_read_log(mdev, nbc)) { - retcode = ERR_IO_MD_DISK; + retcode = ERR_DISK_TOO_SMALL; goto force_diskless_dec; } - /* allocate a second IO page if logical_block_size != 512 */ - logical_block_size = bdev_logical_block_size(nbc->md_bdev); - if (logical_block_size == 0) - logical_block_size = MD_SECTOR_SIZE; - - if (logical_block_size != MD_SECTOR_SIZE) { - if (!mdev->md_io_tmpp) { - struct page *page = alloc_page(GFP_NOIO); - if (!page) - goto force_diskless_dec; - - dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", - logical_block_size, MD_SECTOR_SIZE); - dev_warn(DEV, "Workaround engaged (has performance impact).\n"); - - mdev->md_io_tmpp = page; - } - } - /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are 
supported. */ - if (nbc->dc.no_md_flush) - set_bit(MD_NO_BARRIER, &mdev->flags); - else + if (new_disk_conf->md_flushes) clear_bit(MD_NO_BARRIER, &mdev->flags); + else + set_bit(MD_NO_BARRIER, &mdev->flags); /* Point of no return reached. * Devices and memory are no longer released by error cleanup below. @@ -1029,28 +1517,29 @@ D_ASSERT(mdev->ldev == NULL); mdev->ldev = nbc; mdev->resync = resync_lru; + mdev->rs_plan_s = new_plan; nbc = NULL; resync_lru = NULL; + new_disk_conf = NULL; + new_plan = NULL; - mdev->write_ordering = WO_bio_barrier; - drbd_bump_write_ordering(mdev, WO_bio_barrier); + drbd_bump_write_ordering(mdev->tconn, WO_bio_barrier); if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &mdev->flags); else clear_bit(CRASHED_PRIMARY, &mdev->flags); - if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { + if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && + !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod)) set_bit(CRASHED_PRIMARY, &mdev->flags); - cp_discovered = 1; - } mdev->send_cnt = 0; mdev->recv_cnt = 0; mdev->read_cnt = 0; mdev->writ_cnt = 0; - drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); + drbd_reconsider_max_bio_size(mdev); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, @@ -1072,7 +1561,7 @@ !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) set_bit(USE_DEGR_WFC_T, &mdev->flags); - dd = drbd_determin_dev_size(mdev, 0); + dd = drbd_determine_dev_size(mdev, 0); if (dd == dev_size_error) { retcode = ERR_NOMEM_BITMAP; goto force_diskless_dec; @@ -1082,25 +1571,25 @@ if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { dev_info(DEV, "Assuming that all blocks are out of sync " "(aka FullSync)\n"); - if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, + "set_n_write from attaching", BM_LOCKED_MASK)) { retcode = ERR_IO_MD_DISK; goto force_diskless_dec; } } else { - if (drbd_bitmap_io(mdev, 
&drbd_bm_read, "read from attaching") < 0) { + if (drbd_bitmap_io(mdev, &drbd_bm_read, + "read from attaching", BM_LOCKED_MASK)) { retcode = ERR_IO_MD_DISK; goto force_diskless_dec; } } - if (cp_discovered) { - drbd_al_apply_to_bm(mdev); - drbd_al_to_on_disk_bm(mdev); - } + if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev)) + drbd_suspend_al(mdev); /* IO is still suspended here... */ - spin_lock_irq(&mdev->req_lock); - os = mdev->state; - ns.i = os.i; + spin_lock_irq(&mdev->tconn->req_lock); + os = drbd_read_state(mdev); + ns = os; /* If MDF_CONSISTENT is not set go into inconsistent state, otherwise investigate MDF_WasUpToDate... If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, @@ -1118,9 +1607,11 @@ if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) ns.pdsk = D_OUTDATED; - if ( ns.disk == D_CONSISTENT && - (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) + rcu_read_lock(); + if (ns.disk == D_CONSISTENT && + (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE)) ns.disk = D_UP_TO_DATE; + rcu_read_unlock(); /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before @@ -1133,16 +1624,22 @@ mdev->new_state_tmp.i = ns.i; ns.i = os.i; ns.disk = D_NEGOTIATING; + + /* We expect to receive up-to-date UUIDs soon. + To avoid a race in receive_state, free p_uuid while + holding req_lock. I.e. 
atomic with the state change */ + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; } - DRBD_STATE_DEBUG_INIT_VAL(ns); rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); - ns = mdev->state; - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); if (rv < SS_SUCCESS) goto force_diskless_dec; + mod_timer(&mdev->request_timer, jiffies + HZ); + if (mdev->state.role == R_PRIMARY) mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; else @@ -1153,8 +1650,8 @@ drbd_kobject_uevent(mdev); put_ldev(mdev); - reply->ret_code = retcode; - drbd_reconfig_done(mdev); + conn_reconfig_done(mdev->tconn); + drbd_adm_finish(info, retcode); return 0; force_diskless_dec: @@ -1162,579 +1659,743 @@ force_diskless: drbd_force_state(mdev, NS(disk, D_DISKLESS)); drbd_md_sync(mdev); - release_bdev2_fail: - if (nbc) - bd_release(nbc->md_bdev); - release_bdev_fail: - if (nbc) - bd_release(nbc->backing_bdev); fail: + conn_reconfig_done(mdev->tconn); if (nbc) { - if (nbc->lo_file) - fput(nbc->lo_file); - if (nbc->md_file) - fput(nbc->md_file); + if (nbc->backing_bdev) + blkdev_put(nbc->backing_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (nbc->md_bdev) + blkdev_put(nbc->md_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(nbc); } + kfree(new_disk_conf); lc_destroy(resync_lru); + kfree(new_plan); - reply->ret_code = retcode; - drbd_reconfig_done(mdev); + finish: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +static int adm_detach(struct drbd_conf *mdev, int force) { - reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); - return 0; + enum drbd_state_rv retcode; + int ret; + + if (force) { + drbd_force_state(mdev, NS(disk, D_FAILED)); + retcode = SS_SUCCESS; + goto out; + } + + drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */ + retcode = 
drbd_request_state(mdev, NS(disk, D_FAILED)); + drbd_md_put_buffer(mdev); + /* D_FAILED will transition to DISKLESS. */ + ret = wait_event_interruptible(mdev->misc_wait, + mdev->state.disk != D_FAILED); + drbd_resume_io(mdev); + if (retcode == SS_IS_DISKLESS) + retcode = SS_NOTHING_TO_DO; + if (ret) + retcode = ERR_INTR; +out: + return retcode; } -STATIC int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +/* Detaching the disk is a process in multiple stages. First we need to lock + * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. + * Then we transition to D_DISKLESS, and wait for put_ldev() to return all + * internal references as well. + * Only then we have finally detached. */ +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { - int i, ns; - enum drbd_ret_codes retcode; - struct net_conf *new_conf = NULL; - struct crypto_hash *tfm = NULL; - struct crypto_hash *integrity_w_tfm = NULL; - struct crypto_hash *integrity_r_tfm = NULL; - struct hlist_head *new_tl_hash = NULL; - struct hlist_head *new_ee_hash = NULL; - struct drbd_conf *odev; - char hmac_name[CRYPTO_MAX_ALG_NAME]; - void *int_dig_out = NULL; - void *int_dig_in = NULL; - void *int_dig_vv = NULL; - struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; + enum drbd_ret_code retcode; + struct detach_parms parms = { }; + int err; - drbd_reconfig_start(mdev); + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - if (mdev->state.conn > C_STANDALONE) { - retcode = ERR_NET_CONFIGURED; - goto fail; + if (info->attrs[DRBD_NLA_DETACH_PARMS]) { + err = detach_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto out; + } } - /* allocation not in the IO path, cqueue thread context */ - new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); - if 
(!new_conf) { - retcode = ERR_NOMEM; - goto fail; - } + retcode = adm_detach(adm_ctx.mdev, parms.force_detach); +out: + drbd_adm_finish(info, retcode); + return 0; +} - memset(new_conf, 0, sizeof(struct net_conf)); - new_conf->timeout = DRBD_TIMEOUT_DEF; - new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; - new_conf->ping_int = DRBD_PING_INT_DEF; - new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; - new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; - new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; - new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; - new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; - new_conf->ko_count = DRBD_KO_COUNT_DEF; - new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; - new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; - new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; - new_conf->want_lose = 0; - new_conf->two_primaries = 0; - new_conf->wire_protocol = DRBD_PROT_C; - new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; - new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; +static bool conn_resync_running(struct drbd_tconn *tconn) +{ + struct drbd_conf *mdev; + bool rv = false; + int vnr; - if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { - retcode = ERR_MANDATORY_TAG; - goto fail; + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (mdev->state.conn == C_SYNC_SOURCE || + mdev->state.conn == C_SYNC_TARGET || + mdev->state.conn == C_PAUSED_SYNC_S || + mdev->state.conn == C_PAUSED_SYNC_T) { + rv = true; + break; + } } + rcu_read_unlock(); - if (new_conf->two_primaries - && (new_conf->wire_protocol != DRBD_PROT_C)) { - retcode = ERR_NOT_PROTO_C; - goto fail; - }; - - if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { - retcode = ERR_DISCARD; - goto fail; - } + return rv; +} - retcode = NO_ERROR; +static bool conn_ov_running(struct drbd_tconn *tconn) +{ + struct drbd_conf *mdev; + bool rv = false; + int vnr; - new_my_addr = (struct sockaddr *)&new_conf->my_addr; - new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; - for (i = 
0; i < minor_count; i++) { - odev = minor_to_mdev(i); - if (!odev || odev == mdev) - continue; - if (get_net_conf(odev)) { - taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; - if (new_conf->my_addr_len == odev->net_conf->my_addr_len && - !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) - retcode = ERR_LOCAL_ADDR; - - taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; - if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && - !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) - retcode = ERR_PEER_ADDR; - - put_net_conf(odev); - if (retcode != NO_ERROR) - goto fail; + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (mdev->state.conn == C_VERIFY_S || + mdev->state.conn == C_VERIFY_T) { + rv = true; + break; } } + rcu_read_unlock(); - if (new_conf->cram_hmac_alg[0] != 0) { - snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", - new_conf->cram_hmac_alg); - tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) { - tfm = NULL; - retcode = ERR_AUTH_ALG; - goto fail; - } + return rv; +} - if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { - retcode = ERR_AUTH_ALG_ND; - goto fail; - } - } +static enum drbd_ret_code +_check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf) +{ + struct drbd_conf *mdev; + int i; - if (new_conf->integrity_alg[0]) { - integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(integrity_w_tfm)) { - integrity_w_tfm = NULL; - retcode=ERR_INTEGRITY_ALG; - goto fail; - } + if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) { + if (new_conf->wire_protocol != old_conf->wire_protocol) + return ERR_NEED_APV_100; - if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { - retcode=ERR_INTEGRITY_ALG_ND; - goto fail; - } + if (new_conf->two_primaries != old_conf->two_primaries) + return ERR_NEED_APV_100; - integrity_r_tfm = 
crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(integrity_r_tfm)) { - integrity_r_tfm = NULL; - retcode=ERR_INTEGRITY_ALG; - goto fail; - } - } + if (!new_conf->integrity_alg != !old_conf->integrity_alg) + return ERR_NEED_APV_100; - ns = new_conf->max_epoch_size/8; - if (mdev->tl_hash_s != ns) { - new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); - if (!new_tl_hash) { - retcode = ERR_NOMEM; - goto fail; - } + if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg)) + return ERR_NEED_APV_100; } - ns = new_conf->max_buffers/8; - if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { - new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); - if (!new_ee_hash) { - retcode = ERR_NOMEM; - goto fail; + if (!new_conf->two_primaries && + conn_highest_role(tconn) == R_PRIMARY && + conn_highest_peer(tconn) == R_PRIMARY) + return ERR_NEED_ALLOW_TWO_PRI; + + if (new_conf->two_primaries && + (new_conf->wire_protocol != DRBD_PROT_C)) + return ERR_NOT_PROTO_C; + + idr_for_each_entry(&tconn->volumes, mdev, i) { + if (get_ldev(mdev)) { + enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; + put_ldev(mdev); + if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) + return ERR_STONITH_AND_PROT_A; } + if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data) + return ERR_DISCARD; } - ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; + if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) + return ERR_CONG_NOT_PROTO_A; -#if 0 - /* for the connection loss logic in drbd_recv - * I _need_ the resulting timeo in jiffies to be - * non-zero and different - * - * XXX maybe rather store the value scaled to jiffies? - * Note: MAX_SCHEDULE_TIMEOUT/HZ*HZ != MAX_SCHEDULE_TIMEOUT - * and HZ > 10; which is unlikely to change... - * Thus, if interrupted by a signal, - * sock_{send,recv}msg returns -EINTR, - * if the timeout expires, -EAGAIN. 
- */ - /* unlikely: someone disabled the timeouts ... - * just put some huge values in there. */ - if (!new_conf->ping_int) - new_conf->ping_int = MAX_SCHEDULE_TIMEOUT/HZ; - if (!new_conf->timeout) - new_conf->timeout = MAX_SCHEDULE_TIMEOUT/HZ*10; - if (new_conf->ping_int*10 < new_conf->timeout) - new_conf->timeout = new_conf->ping_int*10/6; - if (new_conf->ping_int*10 == new_conf->timeout) - new_conf->ping_int = new_conf->ping_int+1; -#endif + return NO_ERROR; +} - /* allocation not in the IO path, cqueue thread context */ - if (integrity_w_tfm) { - i = crypto_hash_digestsize(integrity_w_tfm); - int_dig_out = kmalloc(i, GFP_KERNEL); - if (!int_dig_out) { - retcode = ERR_NOMEM; - goto fail; - } - int_dig_in = kmalloc(i, GFP_KERNEL); - if (!int_dig_in) { - retcode = ERR_NOMEM; - goto fail; - } - int_dig_vv = kmalloc(i, GFP_KERNEL); - if (!int_dig_vv) { - retcode = ERR_NOMEM; - goto fail; - } - } +static enum drbd_ret_code +check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf) +{ + static enum drbd_ret_code rv; + struct drbd_conf *mdev; + int i; - if (!mdev->bitmap) { - if(drbd_bm_init(mdev)) { - retcode = ERR_NOMEM; - goto fail; + rcu_read_lock(); + rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf); + rcu_read_unlock(); + + /* tconn->volumes protected by genl_lock() here */ + idr_for_each_entry(&tconn->volumes, mdev, i) { + if (!mdev->bitmap) { + if(drbd_bm_init(mdev)) + return ERR_NOMEM; } } - spin_lock_irq(&mdev->req_lock); - if (mdev->net_conf != NULL) { - retcode = ERR_NET_CONFIGURED; - spin_unlock_irq(&mdev->req_lock); - goto fail; - } - mdev->net_conf = new_conf; + return rv; +} - mdev->send_cnt = 0; - mdev->recv_cnt = 0; +struct crypto { + struct crypto_hash *verify_tfm; + struct crypto_hash *csums_tfm; + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; +}; - if (new_tl_hash) { - kfree(mdev->tl_hash); - mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; - mdev->tl_hash = new_tl_hash; - } 
+static int +alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) +{ + if (!tfm_name[0]) + return NO_ERROR; - if (new_ee_hash) { - kfree(mdev->ee_hash); - mdev->ee_hash_s = mdev->net_conf->max_buffers/8; - mdev->ee_hash = new_ee_hash; + *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*tfm)) { + *tfm = NULL; + return err_alg; } - crypto_free_hash(mdev->cram_hmac_tfm); - mdev->cram_hmac_tfm = tfm; - - crypto_free_hash(mdev->integrity_w_tfm); - mdev->integrity_w_tfm = integrity_w_tfm; - - crypto_free_hash(mdev->integrity_r_tfm); - mdev->integrity_r_tfm = integrity_r_tfm; + return NO_ERROR; +} - kfree(mdev->int_dig_out); - kfree(mdev->int_dig_in); - kfree(mdev->int_dig_vv); - mdev->int_dig_out=int_dig_out; - mdev->int_dig_in=int_dig_in; - mdev->int_dig_vv=int_dig_vv; - spin_unlock_irq(&mdev->req_lock); +static enum drbd_ret_code +alloc_crypto(struct crypto *crypto, struct net_conf *new_conf) +{ + char hmac_name[CRYPTO_MAX_ALG_NAME]; + enum drbd_ret_code rv; - retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); + rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg, + ERR_CSUMS_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg, + ERR_VERIFY_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg, + ERR_INTEGRITY_ALG); + if (rv != NO_ERROR) + return rv; + if (new_conf->cram_hmac_alg[0] != 0) { + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", + new_conf->cram_hmac_alg); - drbd_kobject_uevent(mdev); - reply->ret_code = retcode; - drbd_reconfig_done(mdev); - return 0; + rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, + ERR_AUTH_ALG); + } -fail: - kfree(int_dig_out); - kfree(int_dig_in); - kfree(int_dig_vv); - crypto_free_hash(tfm); - crypto_free_hash(integrity_w_tfm); - crypto_free_hash(integrity_r_tfm); - kfree(new_tl_hash); - kfree(new_ee_hash); - kfree(new_conf); + return rv; +} - reply->ret_code = 
retcode; - drbd_reconfig_done(mdev); - return 0; +static void free_crypto(struct crypto *crypto) +{ + crypto_free_hash(crypto->cram_hmac_tfm); + crypto_free_hash(crypto->integrity_tfm); + crypto_free_hash(crypto->csums_tfm); + crypto_free_hash(crypto->verify_tfm); } -STATIC int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) { - int retcode; + enum drbd_ret_code retcode; + struct drbd_tconn *tconn; + struct net_conf *old_conf, *new_conf = NULL; + int err; + int ovr; /* online verify running */ + int rsr; /* re-sync running */ + struct crypto crypto = { }; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); + tconn = adm_ctx.tconn; - if (retcode == SS_NOTHING_TO_DO) - goto done; - else if (retcode == SS_ALREADY_STANDALONE) - goto done; - else if (retcode == SS_PRIMARY_NOP) { - /* Our statche checking code wants to see the peer outdated. */ - retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, - pdsk, D_OUTDATED)); - } else if (retcode == SS_CW_FAILED_BY_PEER) { - /* The peer probably wants to see us outdated. */ - retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, - disk, D_OUTDATED), - CS_ORDERED); - if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - retcode = SS_SUCCESS; - } + new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_conf) { + retcode = ERR_NOMEM; + goto out; } - if (retcode < SS_SUCCESS) - goto fail; + conn_reconfig_start(tconn); - if (wait_event_interruptible(mdev->state_wait, - mdev->state.conn != C_DISCONNECTING)) { - /* Do not test for mdev->state.conn == C_STANDALONE, since - someone else might connect us in the mean time! 
*/ - retcode = ERR_INTR; + mutex_lock(&tconn->data.mutex); + mutex_lock(&tconn->conf_update); + old_conf = tconn->net_conf; + + if (!old_conf) { + drbd_msg_put_info("net conf missing, try connect"); + retcode = ERR_INVALID_REQUEST; goto fail; } - done: - retcode = NO_ERROR; - fail: - drbd_md_sync(mdev); - reply->ret_code = retcode; - return 0; -} + *new_conf = *old_conf; + if (should_set_defaults(info)) + set_net_conf_defaults(new_conf); -void resync_after_online_grow(struct drbd_conf *mdev) -{ - int iass; /* I am sync source */ + err = net_conf_from_attrs_for_change(new_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto fail; + } - dev_info(DEV, "Resync of new storage after online grow\n"); - if (mdev->state.role != mdev->state.peer) - iass = (mdev->state.role == R_PRIMARY); - else - iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); + retcode = check_net_options(tconn, new_conf); + if (retcode != NO_ERROR) + goto fail; - if (iass) - drbd_start_resync(mdev, C_SYNC_SOURCE); - else - _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); -} - -STATIC int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) -{ - struct resize rs; - int retcode = NO_ERROR; - int ldsc = 0; /* local disk size changed */ - enum determine_dev_size dd; - - memset(&rs, 0, sizeof(struct resize)); - if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { - retcode = ERR_MANDATORY_TAG; + /* re-sync running */ + rsr = conn_resync_running(tconn); + if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) { + retcode = ERR_CSUMS_RESYNC_RUNNING; goto fail; } - if (mdev->state.conn > C_CONNECTED) { - retcode = ERR_RESIZE_RESYNC; + /* online verify running */ + ovr = conn_ov_running(tconn); + if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) { + retcode = ERR_VERIFY_RUNNING; goto fail; } - if (mdev->state.role == R_SECONDARY && - 
mdev->state.peer == R_SECONDARY) { - retcode = ERR_NO_PRIMARY; + retcode = alloc_crypto(&crypto, new_conf); + if (retcode != NO_ERROR) goto fail; - } - if (!get_ldev(mdev)) { - retcode = ERR_NO_DISK; - goto fail; - } + rcu_assign_pointer(tconn->net_conf, new_conf); - if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { - mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); - ldsc = 1; + if (!rsr) { + crypto_free_hash(tconn->csums_tfm); + tconn->csums_tfm = crypto.csums_tfm; + crypto.csums_tfm = NULL; } - - mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; - dd = drbd_determin_dev_size(mdev, rs.resize_force); - drbd_md_sync(mdev); - put_ldev(mdev); - if (dd == dev_size_error) { - retcode = ERR_NOMEM_BITMAP; - goto fail; + if (!ovr) { + crypto_free_hash(tconn->verify_tfm); + tconn->verify_tfm = crypto.verify_tfm; + crypto.verify_tfm = NULL; } - if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { - if (dd == grew) - set_bit(RESIZE_PENDING, &mdev->flags); + crypto_free_hash(tconn->integrity_tfm); + tconn->integrity_tfm = crypto.integrity_tfm; + if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100) + /* Do this without trying to take tconn->data.mutex again. 
*/ + __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE); - drbd_send_uuids(mdev); - drbd_send_sizes(mdev, 1); - } + crypto_free_hash(tconn->cram_hmac_tfm); + tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; + + mutex_unlock(&tconn->conf_update); + mutex_unlock(&tconn->data.mutex); + synchronize_rcu(); + kfree(old_conf); + + if (tconn->cstate >= C_WF_REPORT_PARAMS) + drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn))); + + goto done; fail: - reply->ret_code = retcode; + mutex_unlock(&tconn->conf_update); + mutex_unlock(&tconn->data.mutex); + free_crypto(&crypto); + kfree(new_conf); + done: + conn_reconfig_done(tconn); + out: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { - int retcode = NO_ERROR; + struct drbd_conf *mdev; + struct net_conf *old_conf, *new_conf = NULL; + struct crypto crypto = { }; + struct drbd_tconn *tconn; + enum drbd_ret_code retcode; + int i; int err; - int ovr; /* online verify running */ - int rsr; /* re-sync running */ - struct crypto_hash *verify_tfm = NULL; - struct crypto_hash *csums_tfm = NULL; - struct syncer_conf sc; - cpumask_var_t new_cpu_mask; - if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { + drbd_msg_put_info("connection endpoint(s) missing"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + + /* No need for _rcu here. All reconfiguration is + * strictly serialized on genl_lock(). 
We are protected against + * concurrent reconfiguration/addition/deletion */ + list_for_each_entry(tconn, &drbd_tconns, all_tconn) { + if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len && + !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) { + retcode = ERR_LOCAL_ADDR; + goto out; + } + + if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len && + !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) { + retcode = ERR_PEER_ADDR; + goto out; + } + } + + tconn = adm_ctx.tconn; + conn_reconfig_start(tconn); + + if (tconn->cstate > C_STANDALONE) { + retcode = ERR_NET_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, drbdsetup / netlink process context */ + new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL); + if (!new_conf) { retcode = ERR_NOMEM; goto fail; } - if (nlp->flags & DRBD_NL_SET_DEFAULTS) { - memset(&sc, 0, sizeof(struct syncer_conf)); - sc.rate = DRBD_RATE_DEF; - sc.after = DRBD_AFTER_DEF; - sc.al_extents = DRBD_AL_EXTENTS_DEF; - } else - memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); + set_net_conf_defaults(new_conf); - if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { + err = net_conf_from_attrs(new_conf, info); + if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); goto fail; } - /* re-sync running */ - rsr = ( mdev->state.conn == C_SYNC_SOURCE || - mdev->state.conn == C_SYNC_TARGET || - mdev->state.conn == C_PAUSED_SYNC_S || - mdev->state.conn == C_PAUSED_SYNC_T ); + retcode = check_net_options(tconn, new_conf); + if (retcode != NO_ERROR) + goto fail; - if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { - retcode = ERR_CSUMS_RESYNC_RUNNING; + retcode = alloc_crypto(&crypto, new_conf); + if (retcode != NO_ERROR) + goto fail; + + ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; + + conn_flush_workqueue(tconn); + + mutex_lock(&tconn->conf_update); + old_conf = tconn->net_conf; + if (old_conf) { + 
retcode = ERR_NET_CONFIGURED; + mutex_unlock(&tconn->conf_update); goto fail; } + rcu_assign_pointer(tconn->net_conf, new_conf); - if (!rsr && sc.csums_alg[0]) { - csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(csums_tfm)) { - csums_tfm = NULL; - retcode = ERR_CSUMS_ALG; - goto fail; - } + conn_free_crypto(tconn); + tconn->cram_hmac_tfm = crypto.cram_hmac_tfm; + tconn->integrity_tfm = crypto.integrity_tfm; + tconn->csums_tfm = crypto.csums_tfm; + tconn->verify_tfm = crypto.verify_tfm; - if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { - retcode = ERR_CSUMS_ALG_ND; - goto fail; - } + tconn->my_addr_len = nla_len(adm_ctx.my_addr); + memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len); + tconn->peer_addr_len = nla_len(adm_ctx.peer_addr); + memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len); + + mutex_unlock(&tconn->conf_update); + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, i) { + mdev->send_cnt = 0; + mdev->recv_cnt = 0; } + rcu_read_unlock(); - /* online verify running */ - ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); + retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); - if (ovr) { - if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { - retcode = ERR_VERIFY_RUNNING; - goto fail; + conn_reconfig_done(tconn); + drbd_adm_finish(info, retcode); + return 0; + +fail: + free_crypto(&crypto); + kfree(new_conf); + + conn_reconfig_done(tconn); +out: + drbd_adm_finish(info, retcode); + return 0; +} + +static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force) +{ + enum drbd_state_rv rv; + + rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), + force ? CS_HARD : 0); + + switch (rv) { + case SS_NOTHING_TO_DO: + break; + case SS_ALREADY_STANDALONE: + return SS_SUCCESS; + case SS_PRIMARY_NOP: + /* Our state checking code wants to see the peer outdated. 
*/ + rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, + pdsk, D_OUTDATED), CS_VERBOSE); + break; + case SS_CW_FAILED_BY_PEER: + /* The peer probably wants to see us outdated. */ + rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, + disk, D_OUTDATED), 0); + if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { + rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), + CS_HARD); } + break; + default:; + /* no special handling necessary */ } - if (!ovr && sc.verify_alg[0]) { - verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(verify_tfm)) { - verify_tfm = NULL; - retcode = ERR_VERIFY_ALG; - goto fail; - } + if (rv >= SS_SUCCESS) { + enum drbd_state_rv rv2; + /* No one else can reconfigure the network while I am here. + * The state handling only uses drbd_thread_stop_nowait(), + * we want to really wait here until the receiver is no more. + */ + drbd_thread_stop(&adm_ctx.tconn->receiver); + + /* Race breaker. This additional state change request may be + * necessary, if this was a forced disconnect during a receiver + * restart. We may have "killed" the receiver thread just + * after drbdd_init() returned. Typically, we should be + * C_STANDALONE already, now, and this becomes a no-op. 
+ */ + rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE), + CS_VERBOSE | CS_HARD); + if (rv2 < SS_SUCCESS) + conn_err(tconn, + "unexpected rv2=%d in conn_try_disconnect()\n", + rv2); + } + return rv; +} + +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) +{ + struct disconnect_parms parms; + struct drbd_tconn *tconn; + enum drbd_state_rv rv; + enum drbd_ret_code retcode; + int err; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; - if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { - retcode = ERR_VERIFY_ALG_ND; + tconn = adm_ctx.tconn; + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { + err = disconnect_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); goto fail; } } - /* silently ignore cpu mask on UP kernel */ - if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { - err = __bitmap_parse(sc.cpu_mask, 32, 0, - cpumask_bits(new_cpu_mask), nr_cpu_ids); + rv = conn_try_disconnect(tconn, parms.force_disconnect); + if (rv < SS_SUCCESS) + retcode = rv; /* FIXME: Type mismatch. 
*/ + else + retcode = NO_ERROR; + fail: + drbd_adm_finish(info, retcode); + return 0; +} + +void resync_after_online_grow(struct drbd_conf *mdev) +{ + int iass; /* I am sync source */ + + dev_info(DEV, "Resync of new storage after online grow\n"); + if (mdev->state.role != mdev->state.peer) + iass = (mdev->state.role == R_PRIMARY); + else + iass = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags); + + if (iass) + drbd_start_resync(mdev, C_SYNC_SOURCE); + else + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); +} + +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) +{ + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + struct resize_parms rs; + struct drbd_conf *mdev; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + enum dds_flags ddsf; + sector_t u_size; + int err; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + + memset(&rs, 0, sizeof(struct resize_parms)); + if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { + err = resize_parms_from_attrs(&rs, info); if (err) { - dev_warn(DEV, "__bitmap_parse() failed with %d\n", err); - retcode = ERR_CPU_MASK_PARSE; + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); goto fail; } } - ERR_IF (sc.rate < 1) sc.rate = 1; - ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ -#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) - if (sc.al_extents > AL_MAX) { - dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); - sc.al_extents = AL_MAX; - } -#undef AL_MAX - - /* most sanity checks done, try to assign the new sync-after - * dependency. need to hold the global lock in there, - * to avoid a race in the dependency loop check. 
*/ - retcode = drbd_alter_sa(mdev, sc.after); - if (retcode != NO_ERROR) + mdev = adm_ctx.mdev; + if (mdev->state.conn > C_CONNECTED) { + retcode = ERR_RESIZE_RESYNC; goto fail; + } - /* ok, assign the rest of it as well. - * lock against receive_SyncParam() */ - spin_lock(&mdev->peer_seq_lock); - mdev->sync_conf = sc; - - if (!rsr) { - crypto_free_hash(mdev->csums_tfm); - mdev->csums_tfm = csums_tfm; - csums_tfm = NULL; + if (mdev->state.role == R_SECONDARY && + mdev->state.peer == R_SECONDARY) { + retcode = ERR_NO_PRIMARY; + goto fail; } - if (!ovr) { - crypto_free_hash(mdev->verify_tfm); - mdev->verify_tfm = verify_tfm; - verify_tfm = NULL; - } - spin_unlock(&mdev->peer_seq_lock); - - if (get_ldev(mdev)) { - wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); - drbd_al_shrink(mdev); - err = drbd_check_al_size(mdev); - lc_unlock(mdev->act_log); - wake_up(&mdev->al_wait); + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; + goto fail; + } - put_ldev(mdev); - drbd_md_sync(mdev); + if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { + retcode = ERR_NEED_APV_93; + goto fail_ldev; + } - if (err) { + rcu_read_lock(); + u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + if (u_size != (sector_t)rs.resize_size) { + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { retcode = ERR_NOMEM; - goto fail; + goto fail_ldev; } } - if (mdev->state.conn >= C_CONNECTED) - drbd_send_sync_param(mdev, &sc); + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); - if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { - cpumask_copy(mdev->cpu_mask, new_cpu_mask); - drbd_calc_cpu_mask(mdev); - mdev->receiver.reset_cpu_mask = 1; - mdev->asender.reset_cpu_mask = 1; - mdev->worker.reset_cpu_mask = 1; + if (new_disk_conf) { + mutex_lock(&mdev->tconn->conf_update); + old_disk_conf = mdev->ldev->disk_conf; + *new_disk_conf = 
*old_disk_conf; + new_disk_conf->disk_size = (sector_t)rs.resize_size; + rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); + mutex_unlock(&mdev->tconn->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + } + + ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); + dd = drbd_determine_dev_size(mdev, ddsf); + drbd_md_sync(mdev); + put_ldev(mdev); + if (dd == dev_size_error) { + retcode = ERR_NOMEM_BITMAP; + goto fail; + } + + if (mdev->state.conn == C_CONNECTED) { + if (dd == grew) + set_bit(RESIZE_PENDING, &mdev->flags); + + drbd_send_uuids(mdev); + drbd_send_sizes(mdev, 1, ddsf); + } + + fail: + drbd_adm_finish(info, retcode); + return 0; + + fail_ldev: + put_ldev(mdev); + goto fail; +} + +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) +{ + enum drbd_ret_code retcode; + struct drbd_tconn *tconn; + struct res_opts res_opts; + int err; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + tconn = adm_ctx.tconn; + + res_opts = tconn->res_opts; + if (should_set_defaults(info)) + set_res_opts_defaults(&res_opts); + + err = res_opts_from_attrs(&res_opts, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto fail; + } + + err = set_resource_options(tconn, &res_opts); + if (err) { + retcode = ERR_INVALID_REQUEST; + if (err == -ENOMEM) + retcode = ERR_NOMEM; } - drbd_kobject_uevent(mdev); fail: - free_cpumask_var(new_cpu_mask); - crypto_free_hash(csums_tfm); - crypto_free_hash(verify_tfm); - reply->ret_code = retcode; + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) { - int retcode; + struct drbd_conf *mdev; + int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mdev = adm_ctx.mdev; + + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. */ + wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -1742,10 +2403,10 @@ retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); while (retcode == SS_NEED_CONNECTION) { - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); if (mdev->state.conn < C_CONNECTED) retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); if (retcode != SS_NEED_CONNECTION) break; @@ -1753,185 +2414,514 @@ retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } - reply->ret_code = retcode; +out: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, + union drbd_state mask, union drbd_state val) { + enum drbd_ret_code retcode; - reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + retcode = drbd_request_state(adm_ctx.mdev, mask, val); +out: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) { - int retcode = NO_ERROR; + return drbd_adm_simple_request_state(skb, info, 
NS(conn, C_STARTING_SYNC_S)); +} - if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) - retcode = ERR_PAUSE_IS_SET; +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) +{ + enum drbd_ret_code retcode; - reply->ret_code = retcode; + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_SET; +out: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) { - int retcode = NO_ERROR; + union drbd_dev_state s; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) - retcode = ERR_PAUSE_IS_CLEAR; + if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = adm_ctx.mdev->state; + if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { + retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : + s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; + } else { + retcode = ERR_PAUSE_IS_CLEAR; + } + } - reply->ret_code = retcode; +out: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) +{ + return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); +} + +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) { - reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); + struct drbd_conf *mdev; + int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ + + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mdev = adm_ctx.mdev; + if (test_bit(NEW_CUR_UUID, &mdev->flags)) { + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + } + drbd_suspend_io(mdev); + retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); + if (retcode == SS_SUCCESS) { + if (mdev->state.conn < C_CONNECTED) + tl_clear(mdev->tconn); + if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED) + tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO); + } + drbd_resume_io(mdev); +out: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) { - reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); - return 0; + return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); } -STATIC int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr) { - reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); + struct nlattr *nla; + nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); + if (!nla) + goto nla_put_failure; + if (vnr != VOLUME_UNSPECIFIED) + NLA_PUT_U32(skb, T_ctx_volume, vnr); + NLA_PUT_STRING(skb, T_ctx_resource_name, tconn->name); + if (tconn->my_addr_len) + NLA_PUT(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr); + if (tconn->peer_addr_len) + NLA_PUT(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr); + nla_nest_end(skb, nla); return 0; + +nla_put_failure: + if (nla) + nla_nest_cancel(skb, nla); + return -EMSGSIZE; } -STATIC int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int 
nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, + const struct sib_info *sib) { - unsigned short *tl; + struct state_info *si = NULL; /* for sizeof(si->member); */ + struct net_conf *nc; + struct nlattr *nla; + int got_ldev; + int err = 0; + int exclude_sensitive; - tl = reply->tag_list; + /* If sib != NULL, this is drbd_bcast_event, which anyone can listen + * to. So we better exclude_sensitive information. + * + * If sib == NULL, this is drbd_adm_get_status, executed synchronously + * in the context of the requesting user process. Exclude sensitive + * information, unless current has superuser. + * + * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and + * relies on the current implementation of netlink_dump(), which + * executes the dump callback successively from netlink_recvmsg(), + * always in the context of the receiving process */ + exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); + + got_ldev = get_ldev(mdev); + + /* We need to add connection name and volume number information still. + * Minor number is in drbd_genlmsghdr. */ + if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr)) + goto nla_put_failure; + + if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive)) + goto nla_put_failure; + + rcu_read_lock(); + if (got_ldev) + if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) + goto nla_put_failure; + + nc = rcu_dereference(mdev->tconn->net_conf); + if (nc) + err = net_conf_to_skb(skb, nc, exclude_sensitive); + rcu_read_unlock(); + if (err) + goto nla_put_failure; - if (get_ldev(mdev)) { - tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); - put_ldev(mdev); + nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); + if (!nla) + goto nla_put_failure; + NLA_PUT_U32(skb, T_sib_reason, sib ? 
sib->sib_reason : SIB_GET_STATUS_REPLY); + NLA_PUT_U32(skb, T_current_state, mdev->state.i); + NLA_PUT_U64(skb, T_ed_uuid, mdev->ed_uuid); + NLA_PUT_U64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)); + + if (got_ldev) { + NLA_PUT_U32(skb, T_disk_flags, mdev->ldev->md.flags); + NLA_PUT(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid); + NLA_PUT_U64(skb, T_bits_total, drbd_bm_bits(mdev)); + NLA_PUT_U64(skb, T_bits_oos, drbd_bm_total_weight(mdev)); + if (C_SYNC_SOURCE <= mdev->state.conn && + C_PAUSED_SYNC_T >= mdev->state.conn) { + NLA_PUT_U64(skb, T_bits_rs_total, mdev->rs_total); + NLA_PUT_U64(skb, T_bits_rs_failed, mdev->rs_failed); + } } - if (get_net_conf(mdev)) { - tl = net_conf_to_tags(mdev, mdev->net_conf, tl); - put_net_conf(mdev); + if (sib) { + switch(sib->sib_reason) { + case SIB_SYNC_PROGRESS: + case SIB_GET_STATUS_REPLY: + break; + case SIB_STATE_CHANGE: + NLA_PUT_U32(skb, T_prev_state, sib->os.i); + NLA_PUT_U32(skb, T_new_state, sib->ns.i); + break; + case SIB_HELPER_POST: + NLA_PUT_U32(skb, + T_helper_exit_code, sib->helper_exit_code); + /* fall through */ + case SIB_HELPER_PRE: + NLA_PUT_STRING(skb, T_helper, sib->helper_name); + break; + } } - tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); + nla_nest_end(skb, nla); - put_unaligned(TT_END, tl++); /* Close the tag list */ - - return (int)((char *)tl - (char *)reply->tag_list); + if (0) +nla_put_failure: + err = -EMSGSIZE; + if (got_ldev) + put_ldev(mdev); + return err; } -STATIC int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { - unsigned short *tl = reply->tag_list; - union drbd_state s = mdev->state; - unsigned long rs_left; - unsigned int res; + enum drbd_ret_code retcode; + int err; - tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + 
if (retcode != NO_ERROR) + goto out; - /* no local ref, no bitmap, no syncer progress. */ - if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { - if (get_ldev(mdev)) { - drbd_get_syncer_progress(mdev, &rs_left, &res); - tl = tl_add_int(tl, T_sync_progress, &res); - put_ldev(mdev); - } + err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; } - put_unaligned(TT_END, tl++); /* Close the tag list */ - - return (int)((char *)tl - (char *)reply->tag_list); +out: + drbd_adm_finish(info, retcode); + return 0; } -STATIC int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) { - unsigned short *tl; - - tl = reply->tag_list; + struct drbd_conf *mdev; + struct drbd_genlmsghdr *dh; + struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0]; + struct drbd_tconn *tconn = NULL; + struct drbd_tconn *tmp; + unsigned volume = cb->args[1]; + + /* Open coded, deferred, iteration: + * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) { + * idr_for_each_entry(&tconn->volumes, mdev, i) { + * ... + * } + * } + * where tconn is cb->args[0]; + * and i is cb->args[1]; + * + * cb->args[2] indicates if we shall loop over all resources, + * or just dump all volumes of a single resource. + * + * This may miss entries inserted after this dump started, + * or entries deleted before they are reached. + * + * We need to make sure the mdev won't disappear while + * we are looking at it, and revalidate our iterators + * on each iteration. 
+ */ - if (get_ldev(mdev)) { - tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); - tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); - put_ldev(mdev); + /* synchronize with conn_create()/conn_destroy() */ + rcu_read_lock(); + /* revalidate iterator position */ + list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) { + if (pos == NULL) { + /* first iteration */ + pos = tmp; + tconn = pos; + break; + } + if (tmp == pos) { + tconn = pos; + break; + } } - put_unaligned(TT_END, tl++); /* Close the tag list */ + if (tconn) { +next_tconn: + mdev = idr_get_next(&tconn->volumes, &volume); + if (!mdev) { + /* No more volumes to dump on this tconn. + * Advance tconn iterator. */ + pos = list_entry_rcu(tconn->all_tconn.next, + struct drbd_tconn, all_tconn); + /* Did we dump any volume on this tconn yet? */ + if (volume != 0) { + /* If we reached the end of the list, + * or only a single resource dump was requested, + * we are done. */ + if (&pos->all_tconn == &drbd_tconns || cb->args[2]) + goto out; + volume = 0; + tconn = pos; + goto next_tconn; + } + } + + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_STATUS); + if (!dh) + goto out; + + if (!mdev) { + /* This is a tconn without a single volume. + * Suprisingly enough, it may have a network + * configuration. 
*/ + struct net_conf *nc; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED)) + goto cancel; + nc = rcu_dereference(tconn->net_conf); + if (nc && net_conf_to_skb(skb, nc, 1) != 0) + goto cancel; + goto done; + } + + D_ASSERT(mdev->vnr == volume); + D_ASSERT(mdev->tconn == tconn); + + dh->minor = mdev_to_minor(mdev); + dh->ret_code = NO_ERROR; + + if (nla_put_status_info(skb, mdev, NULL)) { +cancel: + genlmsg_cancel(skb, dh); + goto out; + } +done: + genlmsg_end(skb, dh); + } - return (int)((char *)tl - (char *)reply->tag_list); +out: + rcu_read_unlock(); + /* where to start the next iteration */ + cb->args[0] = (long)pos; + cb->args[1] = (pos == tconn) ? volume + 1 : 0; + + /* No more tconns/volumes/minors found results in an empty skb. + * Which will terminate the dump. */ + return skb->len; } -/** - * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use - * @mdev: DRBD device. - * @nlp: Netlink/connector packet from drbdsetup - * @reply: Reply packet for drbdsetup +/* + * Request status of all resources, or of all volumes within a single resource. + * + * This is a dump, as the answer may not fit in a single reply skb otherwise. + * Which means we cannot use the family->attrbuf or other such members, because + * dump is NOT protected by the genl_lock(). During dump, we only have access + * to the incoming skb, and need to opencode "parsing" of the nlattr payload. + * + * Once things are setup properly, we call into get_one_status(). */ -STATIC int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) { - unsigned short *tl; - char rv; + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + struct nlattr *nla; + const char *resource_name; + struct drbd_tconn *tconn; + int maxtype; + + /* Is this a followup call? 
*/ + if (cb->args[0]) { + /* ... of a single resource dump, + * and the resource iterator has been advanced already? */ + if (cb->args[2] && cb->args[2] != cb->args[0]) + return 0; /* DONE. */ + goto dump; + } + + /* First call (from netlink_dump_start). We need to figure out + * which resource(s) the user wants us to dump. */ + nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), + nlmsg_attrlen(cb->nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + + /* No explicit context given. Dump all. */ + if (!nla) + goto dump; + maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); + if (IS_ERR(nla)) + return PTR_ERR(nla); + /* context given, but no name present? */ + if (!nla) + return -EINVAL; + resource_name = nla_data(nla); + tconn = conn_get_by_name(resource_name); + + if (!tconn) + return -ENODEV; + + kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */ + + /* prime iterators, and set "filter" mode mark: + * only dump this tconn. */ + cb->args[0] = (long)tconn; + /* cb->args[1] = 0; passed in this way. */ + cb->args[2] = (long)tconn; - tl = reply->tag_list; +dump: + return get_one_status(skb, cb); +} - rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : - test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) +{ + enum drbd_ret_code retcode; + struct timeout_parms tp; + int err; - tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); - put_unaligned(TT_END, tl++); /* Close the tag list */ + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + tp.timeout_type = + adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? 
UT_DEGRADED : + UT_DEFAULT; - return (int)((char *)tl - (char *)reply->tag_list); + err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } +out: + drbd_adm_finish(info, retcode); + return 0; } -STATIC int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { - /* default to resume from last known position, if possible */ - struct start_ov args = - { .start_sector = mdev->ov_start_sector }; + struct drbd_conf *mdev; + enum drbd_ret_code retcode; - if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { - reply->ret_code = ERR_MANDATORY_TAG; - return 0; - } - /* w_make_ov_request expects position to be aligned */ - mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; - reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mdev = adm_ctx.mdev; + if (info->attrs[DRBD_NLA_START_OV_PARMS]) { + /* resume from last known position, if possible */ + struct start_ov_parms parms = + { .ov_start_sector = mdev->ov_start_sector }; + int err = start_ov_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto out; + } + /* w_make_ov_request expects position to be aligned */ + mdev->ov_start_sector = parms.ov_start_sector & ~BM_SECT_PER_BIT; + } + /* If there is still bitmap IO pending, e.g. previous resync or verify + * just being finished, wait for it before requesting a new resync. 
*/ + wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); +out: + drbd_adm_finish(info, retcode); return 0; } -STATIC int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, - struct drbd_nl_cfg_reply *reply) +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) { - int retcode = NO_ERROR; + struct drbd_conf *mdev; + enum drbd_ret_code retcode; int skip_initial_sync = 0; int err; + struct new_c_uuid_parms args; - struct new_c_uuid args; + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out_nolock; - memset(&args, 0, sizeof(struct new_c_uuid)); - if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { - reply->ret_code = ERR_MANDATORY_TAG; - return 0; + mdev = adm_ctx.mdev; + memset(&args, 0, sizeof(args)); + if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { + err = new_c_uuid_parms_from_attrs(&args, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto out_nolock; + } } - mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ + mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. 
*/ if (!get_ldev(mdev)) { retcode = ERR_NO_DISK; @@ -1939,7 +2929,7 @@ } /* this is "skip initial sync", assume to be clean */ - if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && + if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 && mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { dev_info(DEV, "Preparing to skip initial sync\n"); skip_initial_sync = 1; @@ -1952,7 +2942,8 @@ drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ if (args.clear_bm) { - err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); + err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, + "clear_n_write from new_c_uuid", BM_LOCKED_MASK); if (err) { dev_err(DEV, "Writing bitmap failed with %d\n",err); retcode = ERR_IO_MD_DISK; @@ -1960,10 +2951,11 @@ if (skip_initial_sync) { drbd_send_uuids_skip_initial_sync(mdev); _drbd_uuid_set(mdev, UI_BITMAP, 0); - spin_lock_irq(&mdev->req_lock); + drbd_print_uuids(mdev, "cleared bitmap UUID"); + spin_lock_irq(&mdev->tconn->req_lock); _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); } } @@ -1971,491 +2963,265 @@ out_dec: put_ldev(mdev); out: - mutex_unlock(&mdev->state_mutex); - - reply->ret_code = retcode; + mutex_unlock(mdev->state_mutex); +out_nolock: + drbd_adm_finish(info, retcode); return 0; } -STATIC struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) +static enum drbd_ret_code +drbd_check_resource_name(const char *name) { - struct drbd_conf *mdev; - - if (nlp->drbd_minor >= minor_count) - return NULL; - - mdev = minor_to_mdev(nlp->drbd_minor); - - if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { - struct gendisk *disk = NULL; - mdev = drbd_new_device(nlp->drbd_minor); - - spin_lock_irq(&drbd_pp_lock); - if (minor_table[nlp->drbd_minor] == NULL) { - minor_table[nlp->drbd_minor] = mdev; - disk = 
mdev->vdisk; - mdev = NULL; - } /* else: we lost the race */ - spin_unlock_irq(&drbd_pp_lock); - - if (disk) /* we won the race above */ - /* in case we ever add a drbd_delete_device(), - * don't forget the del_gendisk! */ - add_disk(disk); - else /* we lost the race above */ - drbd_free_mdev(mdev); - - mdev = minor_to_mdev(nlp->drbd_minor); + if (!name || !name[0]) { + drbd_msg_put_info("resource name missing"); + return ERR_MANDATORY_TAG; } - - return mdev; + /* if we want to use these in sysfs/configfs/debugfs some day, + * we must not allow slashes */ + if (strchr(name, '/')) { + drbd_msg_put_info("invalid resource name"); + return ERR_INVALID_REQUEST; + } + return NO_ERROR; } -struct cn_handler_struct { - int (*function)(struct drbd_conf *, - struct drbd_nl_cfg_req *, - struct drbd_nl_cfg_reply *); - int reply_body_size; -}; - -static struct cn_handler_struct cnd_table[] = { - [ P_primary ] = { &drbd_nl_primary, 0 }, - [ P_secondary ] = { &drbd_nl_secondary, 0 }, - [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, - [ P_detach ] = { &drbd_nl_detach, 0 }, - [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, - [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, - [ P_resize ] = { &drbd_nl_resize, 0 }, - [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, - [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, - [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, - [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, - [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, - [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, - [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, - [ P_outdate ] = { &drbd_nl_outdate, 0 }, - [ P_get_config ] = { &drbd_nl_get_config, - sizeof(struct syncer_conf_tag_len_struct) + - sizeof(struct disk_conf_tag_len_struct) + - sizeof(struct net_conf_tag_len_struct) }, - [ P_get_state ] = { &drbd_nl_get_state, - sizeof(struct get_state_tag_len_struct) + - sizeof(struct sync_progress_tag_len_struct) }, - [ P_get_uuids ] = { &drbd_nl_get_uuids, - sizeof(struct 
get_uuids_tag_len_struct) }, - [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, - sizeof(struct get_timeout_flag_tag_len_struct)}, - [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, - [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, -}; - -#ifdef KERNEL_HAS_CN_SKB_PARMS -STATIC void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { -#else -STATIC void drbd_connector_callback(void *data) -{ - struct cn_msg *req = data; -#endif - struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; - struct cn_handler_struct *cm; - struct cn_msg *cn_reply; - struct drbd_nl_cfg_reply *reply; - struct drbd_conf *mdev; - int retcode, rr; - int reply_size = sizeof(struct cn_msg) - + sizeof(struct drbd_nl_cfg_reply) - + sizeof(short int); - - if (!try_module_get(THIS_MODULE)) { - printk(KERN_ERR "drbd: try_module_get() failed!\n"); - return; - } - -#ifdef KERNEL_HAS_CN_SKB_PARMS - if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { - retcode = ERR_PERM; - goto fail; - } -#endif - - mdev = ensure_mdev(nlp); - if (!mdev) { - retcode = ERR_MINOR_INVALID; - goto fail; - } + enum drbd_ret_code retcode; + struct res_opts res_opts; + int err; - trace_drbd_netlink(req, 1); + retcode = drbd_adm_prepare(skb, info, 0); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - if (nlp->packet_type >= P_nl_after_last_packet) { - retcode = ERR_PACKET_NR; - goto fail; + set_res_opts_defaults(&res_opts); + err = res_opts_from_attrs(&res_opts, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(from_attrs_err_to_txt(err)); + goto out; } - cm = cnd_table + nlp->packet_type; - - /* This may happen if packet number is 0: */ - if (cm->function == NULL) { - retcode = ERR_PACKET_NR; - goto fail; - } + retcode = drbd_check_resource_name(adm_ctx.resource_name); + if (retcode != NO_ERROR) + goto out; - reply_size += cm->reply_body_size; + if 
(adm_ctx.tconn) + goto out; - /* allocation not in the IO path, cqueue thread context */ - cn_reply = kmalloc(reply_size, GFP_KERNEL); - if (!cn_reply) { + if (!conn_create(adm_ctx.resource_name, &res_opts)) retcode = ERR_NOMEM; - goto fail; - } - reply = (struct drbd_nl_cfg_reply *) cn_reply->data; - - reply->packet_type = - cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; - reply->minor = nlp->drbd_minor; - reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ - /* reply->tag_list; might be modified by cm->function. */ - - rr = cm->function(mdev, nlp, reply); - - cn_reply->id = req->id; - cn_reply->seq = req->seq; - cn_reply->ack = req->ack + 1; - cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; - cn_reply->flags = 0; - - trace_drbd_netlink(cn_reply, 0); - rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); - if (rr && rr != -ESRCH) - printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); - - kfree(cn_reply); - module_put(THIS_MODULE); - return; - fail: - drbd_nl_send_reply(req, retcode); - module_put(THIS_MODULE); -} - -static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ - -static unsigned short * -__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, - unsigned short len, int nul_terminated) -{ - unsigned short l = tag_descriptions[tag_number(tag)].max_len; - len = (len < l) ? 
len : l; - put_unaligned(tag, tl++); - put_unaligned(len, tl++); - memcpy(tl, data, len); - tl = (unsigned short*)((char*)tl + len); - if (nul_terminated) - *((char*)tl - 1) = 0; - return tl; +out: + drbd_adm_finish(info, retcode); + return 0; } -static unsigned short * -tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) +int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info) { - return __tl_add_blob(tl, tag, data, len, 0); -} + struct drbd_genlmsghdr *dh = info->userhdr; + enum drbd_ret_code retcode; -static unsigned short * -tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) -{ - return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); -} + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; -static unsigned short * -tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) -{ - put_unaligned(tag, tl++); - switch(tag_type(tag)) { - case TT_INTEGER: - put_unaligned(sizeof(int), tl++); - put_unaligned(*(int *)val, (int *)tl); - tl = (unsigned short*)((char*)tl+sizeof(int)); - break; - case TT_INT64: - put_unaligned(sizeof(u64), tl++); - put_unaligned(*(u64 *)val, (u64 *)tl); - tl = (unsigned short*)((char*)tl+sizeof(u64)); - break; - default: - /* someone did something stupid. 
*/ - ; + if (dh->minor > MINORMASK) { + drbd_msg_put_info("requested minor out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + if (adm_ctx.volume > DRBD_VOLUME_MAX) { + drbd_msg_put_info("requested volume id out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; } - return tl; -} - -void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) -{ - char buffer[sizeof(struct cn_msg)+ - sizeof(struct drbd_nl_cfg_reply)+ - sizeof(struct get_state_tag_len_struct)+ - sizeof(short int)]; - struct cn_msg *cn_reply = (struct cn_msg *) buffer; - struct drbd_nl_cfg_reply *reply = - (struct drbd_nl_cfg_reply *)cn_reply->data; - unsigned short *tl = reply->tag_list; - - /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ - - tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); - - put_unaligned(TT_END, tl++); /* Close the tag list */ - - cn_reply->id.idx = CN_IDX_DRBD; - cn_reply->id.val = CN_VAL_DRBD; - - cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); - cn_reply->ack = 0; /* not used here. 
*/ - cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + - (int)((char *)tl - (char *)reply->tag_list); - cn_reply->flags = 0; - reply->packet_type = P_get_state; - reply->minor = mdev_to_minor(mdev); - reply->ret_code = NO_ERROR; + if (adm_ctx.mdev) + goto out; - trace_drbd_netlink(cn_reply, 0); - cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume); +out: + drbd_adm_finish(info, retcode); + return 0; } -void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) +static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) { - char buffer[sizeof(struct cn_msg)+ - sizeof(struct drbd_nl_cfg_reply)+ - sizeof(struct call_helper_tag_len_struct)+ - sizeof(short int)]; - struct cn_msg *cn_reply = (struct cn_msg *) buffer; - struct drbd_nl_cfg_reply *reply = - (struct drbd_nl_cfg_reply *)cn_reply->data; - unsigned short *tl = reply->tag_list; - - /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ - - tl = tl_add_str(tl, T_helper, helper_name); - put_unaligned(TT_END, tl++); /* Close the tag list */ - - cn_reply->id.idx = CN_IDX_DRBD; - cn_reply->id.val = CN_VAL_DRBD; + if (mdev->state.disk == D_DISKLESS && + /* no need to be mdev->state.conn == C_STANDALONE && + * we may want to delete a minor from a live replication group. + */ + mdev->state.role == R_SECONDARY) { + _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS), + CS_VERBOSE + CS_WAIT_COMPLETE); + idr_remove(&mdev->tconn->volumes, mdev->vnr); + idr_remove(&minors, mdev_to_minor(mdev)); + del_gendisk(mdev->vdisk); + synchronize_rcu(); + kref_put(&mdev->kref, &drbd_minor_destroy); + return NO_ERROR; + } else + return ERR_MINOR_CONFIGURED; +} - cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); - cn_reply->ack = 0; /* not used here. 
*/ - cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + - (int)((char *)tl - (char *)reply->tag_list); - cn_reply->flags = 0; +int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info) +{ + enum drbd_ret_code retcode; - reply->packet_type = P_call_helper; - reply->minor = mdev_to_minor(mdev); - reply->ret_code = NO_ERROR; + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - trace_drbd_netlink(cn_reply, 0); - cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + retcode = adm_delete_minor(adm_ctx.mdev); +out: + drbd_adm_finish(info, retcode); + return 0; } -void drbd_bcast_ee(struct drbd_conf *mdev, - const char *reason, const int dgs, - const char* seen_hash, const char* calc_hash, - const struct drbd_epoch_entry* e) +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { - struct cn_msg *cn_reply; - struct drbd_nl_cfg_reply *reply; - struct bio_vec *bvec; - unsigned short *tl; - int i; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + struct drbd_conf *mdev; + unsigned i; - if (!e) - return; - if (!reason || !reason[0]) - return; + retcode = drbd_adm_prepare(skb, info, 0); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - /* apparently we have to memcpy twice, first to prepare the data for the - * struct cn_msg, then within cn_netlink_send from the cn_msg to the - * netlink skb. */ - /* receiver thread context, which is not in the writeout path (of this node), - * but may be in the writeout path of the _other_ node. - * GFP_NOIO to avoid potential "distributed deadlock". 
*/ - cn_reply = kmalloc( - sizeof(struct cn_msg)+ - sizeof(struct drbd_nl_cfg_reply)+ - sizeof(struct dump_ee_tag_len_struct)+ - sizeof(short int), - GFP_NOIO); - - if (!cn_reply) { - dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", - (unsigned long long)e->sector, e->size); - return; + if (!adm_ctx.tconn) { + retcode = ERR_RES_NOT_KNOWN; + goto out; } - reply = (struct drbd_nl_cfg_reply*)cn_reply->data; - tl = reply->tag_list; + /* demote */ + idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { + retcode = drbd_set_role(mdev, R_SECONDARY, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info("failed to demote"); + goto out; + } + } - tl = tl_add_str(tl, T_dump_ee_reason, reason); - tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); - tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); - tl = tl_add_int(tl, T_ee_sector, &e->sector); - tl = tl_add_int(tl, T_ee_block_id, &e->block_id); - - put_unaligned(T_ee_data, tl++); - put_unaligned(e->size, tl++); - - __bio_for_each_segment(bvec, e->private_bio, i, 0) { - void *d = kmap(bvec->bv_page); - memcpy(tl, d + bvec->bv_offset, bvec->bv_len); - kunmap(bvec->bv_page); - tl=(unsigned short*)((char*)tl + bvec->bv_len); - } - put_unaligned(TT_END, tl++); /* Close the tag list */ - - cn_reply->id.idx = CN_IDX_DRBD; - cn_reply->id.val = CN_VAL_DRBD; - - cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); - cn_reply->ack = 0; // not used here. 
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + - (int)((char*)tl - (char*)reply->tag_list); - cn_reply->flags = 0; - - reply->packet_type = P_dump_ee; - reply->minor = mdev_to_minor(mdev); - reply->ret_code = NO_ERROR; - - trace_drbd_netlink(cn_reply, 0); - cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); - kfree(cn_reply); -} - -void drbd_bcast_sync_progress(struct drbd_conf *mdev) -{ - char buffer[sizeof(struct cn_msg)+ - sizeof(struct drbd_nl_cfg_reply)+ - sizeof(struct sync_progress_tag_len_struct)+ - sizeof(short int)]; - struct cn_msg *cn_reply = (struct cn_msg *) buffer; - struct drbd_nl_cfg_reply *reply = - (struct drbd_nl_cfg_reply *)cn_reply->data; - unsigned short *tl = reply->tag_list; - unsigned long rs_left; - unsigned int res; + retcode = conn_try_disconnect(adm_ctx.tconn, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info("failed to disconnect"); + goto out; + } - /* no local ref, no bitmap, no syncer progress, no broadcast. */ - if (!get_ldev(mdev)) - return; - drbd_get_syncer_progress(mdev, &rs_left, &res); - put_ldev(mdev); + /* detach */ + idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { + retcode = adm_detach(mdev, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info("failed to detach"); + goto out; + } + } - tl = tl_add_int(tl, T_sync_progress, &res); - put_unaligned(TT_END, tl++); /* Close the tag list */ + /* If we reach this, all volumes (of this tconn) are Secondary, + * Disconnected, Diskless, aka Unconfigured. Make sure all threads have + * actually stopped, state handling only does drbd_thread_stop_nowait(). */ + drbd_thread_stop(&adm_ctx.tconn->worker); - cn_reply->id.idx = CN_IDX_DRBD; - cn_reply->id.val = CN_VAL_DRBD; + /* Now, nothing can fail anymore */ - cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); - cn_reply->ack = 0; /* not used here. 
*/ - cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + - (int)((char *)tl - (char *)reply->tag_list); - cn_reply->flags = 0; + /* delete volumes */ + idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) { + retcode = adm_delete_minor(mdev); + if (retcode != NO_ERROR) { + /* "can not happen" */ + drbd_msg_put_info("failed to delete volume"); + goto out; + } + } - reply->packet_type = P_sync_progress; - reply->minor = mdev_to_minor(mdev); - reply->ret_code = NO_ERROR; + /* delete connection */ + if (conn_lowest_minor(adm_ctx.tconn) < 0) { + list_del_rcu(&adm_ctx.tconn->all_tconn); + synchronize_rcu(); + kref_put(&adm_ctx.tconn->kref, &conn_destroy); - trace_drbd_netlink(cn_reply, 0); - cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + retcode = NO_ERROR; + } else { + /* "can not happen" */ + retcode = ERR_RES_IN_USE; + drbd_msg_put_info("failed to delete connection"); + } + goto out; +out: + drbd_adm_finish(info, retcode); + return 0; } -#ifdef NETLINK_ROUTE6 -int __init cn_init(void); -void __exit cn_fini(void); -#endif - -typedef int (*cn_add_callback_req_nsp_fn)(struct cb_id *, char *, - void (*cb)(struct cn_msg *req, struct netlink_skb_parms *nsp)); -typedef int (*cn_add_callback_req_fn)(struct cb_id *, char *, - void (*cb)(struct cn_msg *req)); -typedef int (*cn_add_callback_void_fn)(struct cb_id *, char *, - void (*cb)(void *data)); -#ifndef __same_type -# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -#endif - -int __init drbd_nl_init(void) +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { - static struct cb_id cn_id_drbd; - int err, try=10; + enum drbd_ret_code retcode; -#ifdef NETLINK_ROUTE6 - /* pre 2.6.16 */ - err = cn_init(); - if (err) - return err; -#endif - cn_id_drbd.val = CN_VAL_DRBD; - do { - cn_id_drbd.idx = cn_idx; - /* Try to catch incompatible callbacks at compile time, - * otherwise it will just be a compiler _warning_, - * but then BUG at runtime. 
*/ -#ifdef KERNEL_HAS_CN_SKB_PARMS - BUILD_BUG_ON(!__same_type(&cn_add_callback, cn_add_callback_req_nsp_fn)); -#else - BUILD_BUG_ON(!( - __same_type(&cn_add_callback, cn_add_callback_req_fn) || - __same_type(&cn_add_callback, cn_add_callback_void_fn))); -#endif - err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); - if (!err) - break; - cn_idx = (cn_idx + CN_IDX_STEP); - } while (try--); + retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; - if (err) { - printk(KERN_ERR "drbd: cn_drbd failed to register\n"); - return err; + if (conn_lowest_minor(adm_ctx.tconn) < 0) { + list_del_rcu(&adm_ctx.tconn->all_tconn); + synchronize_rcu(); + kref_put(&adm_ctx.tconn->kref, &conn_destroy); + + retcode = NO_ERROR; + } else { + retcode = ERR_RES_IN_USE; } + if (retcode == NO_ERROR) + drbd_thread_stop(&adm_ctx.tconn->worker); +out: + drbd_adm_finish(info, retcode); return 0; } -void drbd_nl_cleanup(void) +void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib) { - static struct cb_id cn_id_drbd; - - cn_id_drbd.idx = cn_idx; - cn_id_drbd.val = CN_VAL_DRBD; + static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ + struct sk_buff *msg; + struct drbd_genlmsghdr *d_out; + unsigned seq; + int err = -ENOMEM; + + seq = atomic_inc_return(&drbd_genl_seq); + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + if (!msg) + goto failed; + + err = -EMSGSIZE; + d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); + if (!d_out) /* cannot happen, but anyways. 
*/ + goto nla_put_failure; + d_out->minor = mdev_to_minor(mdev); + d_out->ret_code = NO_ERROR; + + if (nla_put_status_info(msg, mdev, sib)) + goto nla_put_failure; + genlmsg_end(msg, d_out); + err = drbd_genl_multicast_events(msg, 0); + /* msg has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; - cn_del_callback(&cn_id_drbd); - -#ifdef NETLINK_ROUTE6 - /* pre 2.6.16 */ - cn_fini(); -#endif -} + return; -void drbd_nl_send_reply(struct cn_msg *req, int ret_code) -{ - char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; - struct cn_msg *cn_reply = (struct cn_msg *) buffer; - struct drbd_nl_cfg_reply *reply = - (struct drbd_nl_cfg_reply *)cn_reply->data; - int rr; - - cn_reply->id = req->id; - - cn_reply->seq = req->seq; - cn_reply->ack = req->ack + 1; - cn_reply->len = sizeof(struct drbd_nl_cfg_reply); - cn_reply->flags = 0; - - reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; - reply->ret_code = ret_code; - - trace_drbd_netlink(cn_reply, 0); - rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); - if (rr && rr != -ESRCH) - printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); +nla_put_failure: + nlmsg_free(msg); +failed: + dev_err(DEV, "Error %d while broadcasting event. " + "Event seq:%u sib_reason:%u\n", + err, seq, sib->sib_reason); } - diff -Nru drbd8-8.3.7/drbd/drbd_nla.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_nla.c --- drbd8-8.3.7/drbd/drbd_nla.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_nla.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,55 @@ +#include "drbd_wrappers.h" +#include +#include +#include +#include "drbd_nla.h" + +static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) +{ + struct nlattr *head = nla_data(nla); + int len = nla_len(nla); + int rem; + + /* + * validate_nla (called from nla_parse_nested) ignores attributes + * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. 
+ * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY + * flag set also, check and remove that flag before calling + * nla_parse_nested. + */ + + nla_for_each_attr(nla, head, len, rem) { + if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { + nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; + if (nla_type(nla) > maxtype) + return -EOPNOTSUPP; + } + } + return 0; +} + +int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy) +{ + int err; + + err = drbd_nla_check_mandatory(maxtype, nla); + if (!err) + err = nla_parse_nested(tb, maxtype, nla, policy); + + return err; +} + +struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) +{ + int err; + /* + * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and + * we don't know about that attribute, reject all the nested + * attributes. + */ + err = drbd_nla_check_mandatory(maxtype, nla); + if (err) + return ERR_PTR(err); + return nla_find_nested(nla, attrtype); +} diff -Nru drbd8-8.3.7/drbd/drbd_nla.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_nla.h --- drbd8-8.3.7/drbd/drbd_nla.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_nla.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,8 @@ +#ifndef __DRBD_NLA_H +#define __DRBD_NLA_H + +extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy); +extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); + +#endif /* __DRBD_NLA_H */ diff -Nru drbd8-8.3.7/drbd/drbd_proc.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_proc.c --- drbd8-8.3.7/drbd/drbd_proc.c 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_proc.c 2012-02-02 14:09:14.000000000 +0000 @@ -29,13 +29,13 @@ #include #include #include -#include #include #include #include #include "drbd_int.h" STATIC int drbd_proc_open(struct inode *inode, struct file *file); +STATIC int 
drbd_proc_release(struct inode *inode, struct file *file); struct proc_dir_entry *drbd_proc; @@ -44,9 +44,22 @@ .open = drbd_proc_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = drbd_proc_release, }; +void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) +{ + /* v is in kB/sec. We don't expect TiByte/sec yet. */ + if (unlikely(v >= 1000000)) { + /* cool: > GiByte/s */ + seq_printf(seq, "%ld,", v / 1000000); + v %= 1000000; + seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); + } else if (likely(v >= 1000)) + seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); + else + seq_printf(seq, "%ld", v); +} /*lge * progress bars shamelessly adapted from driver/md/md.c @@ -59,6 +72,7 @@ unsigned long db, dt, dbdt, rt, rs_left; unsigned int res; int i, x, y; + int stalled = 0; drbd_get_syncer_progress(mdev, &rs_left, &res); @@ -72,17 +86,24 @@ seq_printf(seq, "."); seq_printf(seq, "] "); - seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); - /* if more than 1 GB display in MB */ - if (mdev->rs_total > 0x100000L) - seq_printf(seq, "(%lu/%lu)M\n\t", + if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) + seq_printf(seq, "verified:"); + else + seq_printf(seq, "sync'ed:"); + seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); + + /* if more than a few GB, display in MB */ + if (mdev->rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) + seq_printf(seq, "(%lu/%lu)M", (unsigned long) Bit2KB(rs_left >> 10), (unsigned long) Bit2KB(mdev->rs_total >> 10)); else - seq_printf(seq, "(%lu/%lu)K\n\t", + seq_printf(seq, "(%lu/%lu)K", (unsigned long) Bit2KB(rs_left), (unsigned long) Bit2KB(mdev->rs_total)); + seq_printf(seq, "\n\t"); + /* see drivers/md/md.c * We do not want to overflow, so the order of operands and * the * 100 / 100 trick are important. 
We do a +1 to be @@ -92,45 +113,76 @@ * db: blocks written from mark until now * rt: remaining time */ - dt = (jiffies - mdev->rs_mark_time) / HZ; - - if (dt > 20) { - /* if we made no update to rs_mark_time for too long, - * we are stalled. show that. */ - seq_printf(seq, "stalled\n"); - return; - } + /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is + * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at + * least DRBD_SYNC_MARK_STEP time before it will be modified. */ + /* ------------------------ ~18s average ------------------------ */ + i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS; + dt = (jiffies - mdev->rs_mark_time[i]) / HZ; + if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) + stalled = 1; if (!dt) dt++; - db = mdev->rs_mark_left - rs_left; + db = mdev->rs_mark_left[i] - rs_left; rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ seq_printf(seq, "finish: %lu:%02lu:%02lu", rt / 3600, (rt % 3600) / 60, rt % 60); - /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ dbdt = Bit2KB(db/dt); - if (dbdt > 1000) - seq_printf(seq, " speed: %ld,%03ld", - dbdt/1000, dbdt % 1000); - else - seq_printf(seq, " speed: %ld", dbdt); + seq_printf(seq, " speed: "); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " ("); + /* ------------------------- ~3s average ------------------------ */ + if (proc_details >= 1) { + /* this is what drbd_rs_should_slow_down() uses */ + i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + dt = (jiffies - mdev->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = mdev->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " -- "); + } + /* --------------------- long term average ---------------------- */ /* mean speed since syncer started * we do account for PausedSync periods */ dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; - if (dt <= 0) + if (dt == 0) dt = 1; db = mdev->rs_total - 
rs_left; dbdt = Bit2KB(db/dt); - if (dbdt > 1000) - seq_printf(seq, " (%ld,%03ld)", - dbdt/1000, dbdt % 1000); - else - seq_printf(seq, " (%ld)", dbdt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, ")"); + + if (mdev->state.conn == C_SYNC_TARGET || + mdev->state.conn == C_VERIFY_S) { + seq_printf(seq, " want: "); + seq_printf_with_thousands_grouping(seq, mdev->c_sync_rate); + } + seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); - seq_printf(seq, " K/sec\n"); + if (proc_details >= 1) { + /* 64 bit: + * we convert to sectors in the display below. */ + unsigned long bm_bits = drbd_bm_bits(mdev); + unsigned long bit_pos; + if (mdev->state.conn == C_VERIFY_S || + mdev->state.conn == C_VERIFY_T) + bit_pos = bm_bits - mdev->ov_left; + else + bit_pos = mdev->bm_resync_fo; + /* Total sectors may be slightly off for oddly + * sized devices. So what. */ + seq_printf(seq, + "\t%3d%% sector pos: %llu/%llu\n", + (int)(bit_pos / (bm_bits/100+1)), + (unsigned long long)bit_pos * BM_SECT_PER_BIT, + (unsigned long long)bm_bits * BM_SECT_PER_BIT); + } } STATIC void resync_dump_detail(struct seq_file *seq, struct lc_element *e) @@ -145,9 +197,11 @@ STATIC int drbd_seq_show(struct seq_file *seq, void *v) { - int i, hole = 0; + int i, prev_i = -1; const char *sn; struct drbd_conf *mdev; + struct net_conf *nc; + char wp; static char write_ordering_chars[] = { [WO_none] = 'n', @@ -179,16 +233,11 @@ oos .. known out-of-sync kB */ - for (i = 0; i < minor_count; i++) { - mdev = minor_to_mdev(i); - if (!mdev) { - hole = 1; - continue; - } - if (hole) { - hole = 0; + rcu_read_lock(); + idr_for_each_entry(&minors, mdev, i) { + if (prev_i != i - 1) seq_printf(seq, "\n"); - } + prev_i = i; sn = drbd_conn_str(mdev->state.conn); @@ -197,8 +246,10 @@ mdev->state.role == R_SECONDARY) { seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { + nc = rcu_dereference(mdev->tconn->net_conf); + wp = nc ? 
nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; seq_printf(seq, - "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", i, sn, @@ -206,13 +257,13 @@ drbd_role_str(mdev->state.peer), drbd_disk_str(mdev->state.disk), drbd_disk_str(mdev->state.pdsk), - (mdev->net_conf == NULL ? ' ' : - (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), - mdev->state.susp ? 's' : 'r', + wp, + drbd_suspended(mdev) ? 's' : 'r', mdev->state.aftr_isp ? 'a' : '-', mdev->state.peer_isp ? 'p' : '-', mdev->state.user_isp ? 'u' : '-', mdev->congestion_reason ?: '-', + test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-', mdev->send_cnt/2, mdev->recv_cnt/2, mdev->writ_cnt/2, @@ -224,23 +275,19 @@ atomic_read(&mdev->rs_pending_cnt), atomic_read(&mdev->unacked_cnt), atomic_read(&mdev->ap_bio_cnt), - mdev->epochs, - write_ordering_chars[mdev->write_ordering] + mdev->tconn->epochs, + write_ordering_chars[mdev->tconn->write_ordering] ); - seq_printf(seq, " oos:%lu\n", - Bit2KB(drbd_bm_total_weight(mdev))); + seq_printf(seq, " oos:%llu\n", + Bit2KB((unsigned long long) + drbd_bm_total_weight(mdev))); } if (mdev->state.conn == C_SYNC_SOURCE || - mdev->state.conn == C_SYNC_TARGET) + mdev->state.conn == C_SYNC_TARGET || + mdev->state.conn == C_VERIFY_S || + mdev->state.conn == C_VERIFY_T) drbd_syncer_progress(mdev, seq); - if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) - seq_printf(seq, "\t%3d%% %lu/%lu\n", - (int)((mdev->rs_total-mdev->ov_left) / - (mdev->rs_total/100+1)), - mdev->rs_total - mdev->ov_left, - mdev->rs_total); - if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { lc_seq_printf_stats(seq, mdev->resync); lc_seq_printf_stats(seq, mdev->act_log); @@ -254,13 +301,22 @@ } } } + rcu_read_unlock(); return 0; } STATIC int drbd_proc_open(struct inode *inode, struct file *file) { - return single_open(file, drbd_seq_show, PDE(inode)->data); + if 
(try_module_get(THIS_MODULE)) + return single_open(file, drbd_seq_show, PDE(inode)->data); + return -ENODEV; +} + +STATIC int drbd_proc_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return single_release(inode, file); } /* PROC FS stuff end */ diff -Nru drbd8-8.3.7/drbd/drbd_receiver.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_receiver.c --- drbd8-8.3.7/drbd/drbd_receiver.c 2010-01-07 16:15:48.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_receiver.c 2012-02-02 14:09:14.000000000 +0000 @@ -37,51 +37,49 @@ #include #include #include -#include #include #define __KERNEL_SYSCALLS__ #include #include #include -#ifdef HAVE_LINUX_SCATTERLIST_H -/* 2.6.11 (suse 9.3, fc4) does not include requisites - * from linux/scatterlist.h :( */ -#include -#include -#include -#include -#endif #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_req.h" - #include "drbd_vli.h" +#include struct flush_work { struct drbd_work w; struct drbd_epoch *epoch; }; +struct packet_info { + enum drbd_packet cmd; + unsigned int size; + unsigned int vnr; + void *data; +}; + enum finish_epoch { FE_STILL_LIVE, FE_DESTROYED, FE_RECYCLED, }; -STATIC int drbd_do_handshake(struct drbd_conf *mdev); -STATIC int drbd_do_auth(struct drbd_conf *mdev); +STATIC int drbd_do_features(struct drbd_tconn *tconn); +STATIC int drbd_do_auth(struct drbd_tconn *tconn); +STATIC int drbd_disconnected(struct drbd_conf *mdev); -STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); -STATIC int e_end_block(struct drbd_conf *, struct drbd_work *, int); +STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event); +STATIC int e_end_block(struct drbd_work *, int); -static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) +static struct drbd_epoch *previous_epoch(struct drbd_tconn *tconn, struct drbd_epoch *epoch) { struct drbd_epoch *prev; - 
spin_lock(&mdev->epoch_lock); + spin_lock(&tconn->epoch_lock); prev = list_entry(epoch->list.prev, struct drbd_epoch, list); - if (prev == epoch || prev == mdev->current_epoch) + if (prev == epoch || prev == tconn->current_epoch) prev = NULL; - spin_unlock(&mdev->epoch_lock); + spin_unlock(&tconn->epoch_lock); return prev; } @@ -95,44 +93,135 @@ #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) -static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) +/* + * some helper functions to deal with single linked page lists, + * page->private being our "next" pointer. + */ + +/* If at least n pages are linked at head, get n pages off. + * Otherwise, don't modify head, and return NULL. + * Locking is the responsibility of the caller. + */ +static struct page *page_chain_del(struct page **head, int n) +{ + struct page *page; + struct page *tmp; + + BUG_ON(!n); + BUG_ON(!head); + + page = *head; + + if (!page) + return NULL; + + while (page) { + tmp = page_chain_next(page); + if (--n == 0) + break; /* found sufficient pages */ + if (tmp == NULL) + /* insufficient pages, don't use any of them. */ + return NULL; + page = tmp; + } + + /* add end of list marker for the returned list */ + set_page_private(page, 0); + /* actual return value, and adjustment of head */ + page = *head; + *head = tmp; + return page; +} + +/* may be used outside of locks to find the tail of a (usually short) + * "private" page chain, before adding it back to a global chain head + * with page_chain_add() under a spinlock. 
*/ +static struct page *page_chain_tail(struct page *page, int *len) +{ + struct page *tmp; + int i = 1; + while ((tmp = page_chain_next(page))) + ++i, page = tmp; + if (len) + *len = i; + return page; +} + +static int page_chain_free(struct page *page) +{ + struct page *tmp; + int i = 0; + page_chain_for_each_safe(page, tmp) { + put_page(page); + ++i; + } + return i; +} + +static void page_chain_add(struct page **head, + struct page *chain_first, struct page *chain_last) +{ +#if 1 + struct page *tmp; + tmp = page_chain_tail(chain_first, NULL); + BUG_ON(tmp != chain_last); +#endif + + /* add chain to head */ + set_page_private(chain_last, (unsigned long)*head); + *head = chain_first; +} + +static struct page *__drbd_alloc_pages(struct drbd_conf *mdev, + unsigned int number) { struct page *page = NULL; + struct page *tmp = NULL; + unsigned int i = 0; /* Yes, testing drbd_pp_vacant outside the lock is racy. * So what. It saves a spin_lock. */ - if (drbd_pp_vacant > 0) { + if (drbd_pp_vacant >= number) { spin_lock(&drbd_pp_lock); - page = drbd_pp_pool; - if (page) { - drbd_pp_pool = (struct page *)page_private(page); - set_page_private(page, 0); /* just to be polite */ - drbd_pp_vacant--; - } + page = page_chain_del(&drbd_pp_pool, number); + if (page) + drbd_pp_vacant -= number; spin_unlock(&drbd_pp_lock); + if (page) + return page; } + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - if (!page) - page = alloc_page(GFP_TRY); - if (page) - atomic_inc(&mdev->pp_in_use); - return page; -} + for (i = 0; i < number; i++) { + tmp = alloc_page(GFP_TRY); + if (!tmp) + break; + set_page_private(tmp, (unsigned long)page); + page = tmp; + } -/* kick lower level device, if we have more than (arbitrary number) - * reference counts on it, which typically are locally submitted io - * requests. 
don't use unacked_cnt, so we speed up proto A and B, too. */ -static void maybe_kick_lo(struct drbd_conf *mdev) -{ - if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark) - drbd_kick_lo(mdev); + if (i == number) + return page; + + /* Not enough pages immediately available this time. + * No need to jump around here, drbd_alloc_pages will retry this + * function "soon". */ + if (page) { + tmp = page_chain_tail(page, NULL); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + return NULL; } -static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) +static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev, + struct list_head *to_be_freed) { - struct drbd_epoch_entry *e; + struct drbd_peer_request *peer_req; struct list_head *le, *tle; /* The EEs are always appended to the end of the list. Since @@ -141,54 +230,63 @@ stop to examine the list... */ list_for_each_safe(le, tle, &mdev->net_ee) { - e = list_entry(le, struct drbd_epoch_entry, w.list); - if (drbd_bio_has_active_page(e->private_bio)) + peer_req = list_entry(le, struct drbd_peer_request, w.list); + if (drbd_peer_req_has_active_page(peer_req)) break; list_move(le, to_be_freed); } } -static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) +static void drbd_reclaim_net(struct drbd_conf *mdev) { LIST_HEAD(reclaimed); - struct drbd_epoch_entry *e, *t; + struct drbd_peer_request *peer_req, *t; - maybe_kick_lo(mdev); - spin_lock_irq(&mdev->req_lock); - reclaim_net_ee(mdev, &reclaimed); - spin_unlock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); + reclaim_finished_net_peer_reqs(mdev, &reclaimed); + spin_unlock_irq(&mdev->tconn->req_lock); - list_for_each_entry_safe(e, t, &reclaimed, w.list) - drbd_free_ee(mdev, e); + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(mdev, peer_req); } /** - * drbd_pp_alloc() - Returns a page, fails 
only if a signal comes in + * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @mdev: DRBD device. - * @retry: whether or not to retry allocation forever (or until signalled) + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now + * + * Tries to allocate number pages, first from our own page pool, then from + * the kernel, unless this allocation would exceed the max_buffers setting. + * Possibly retry until DRBD frees sufficient pages somewhere else. * - * Tries to allocate a page, first from our own page pool, then from the - * kernel, unless this allocation would exceed the max_buffers setting. - * If @retry is non-zero, retry until DRBD frees a page somewhere else. + * Returns a page chain linked via page->private. */ -STATIC struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) +struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number, + bool retry) { struct page *page = NULL; + struct net_conf *nc; DEFINE_WAIT(wait); + int mxb; - if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { - page = drbd_pp_first_page_or_try_alloc(mdev); - if (page) - return page; - } + /* Yes, we may run up to @number over max_buffers. If we + * follow it strictly, the admin will get it wrong anyways. */ + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + mxb = nc ? 
nc->max_buffers : 1000000; + rcu_read_unlock(); - for (;;) { + if (atomic_read(&mdev->pp_in_use) < mxb) + page = __drbd_alloc_pages(mdev, number); + + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - drbd_kick_lo_and_reclaim_net(mdev); + drbd_reclaim_net(mdev); - if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { - page = drbd_pp_first_page_or_try_alloc(mdev); + if (atomic_read(&mdev->pp_in_use) < mxb) { + page = __drbd_alloc_pages(mdev, number); if (page) break; } @@ -197,7 +295,7 @@ break; if (signal_pending(current)) { - dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); + dev_warn(DEV, "drbd_alloc_pages interrupted!\n"); break; } @@ -205,62 +303,34 @@ } finish_wait(&drbd_pp_wait, &wait); + if (page) + atomic_add(number, &mdev->pp_in_use); return page; } -/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. - * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ -STATIC void drbd_pp_free(struct drbd_conf *mdev, struct page *page) -{ - int free_it; - - spin_lock(&drbd_pp_lock); - if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { - free_it = 1; - } else { - set_page_private(page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = page; - drbd_pp_vacant++; - free_it = 0; - } - spin_unlock(&drbd_pp_lock); - - atomic_dec(&mdev->pp_in_use); - - if (free_it) - __free_page(page); - - wake_up(&drbd_pp_wait); -} - -STATIC void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) +/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. + * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock); + * Either links the page chain back to the global pool, + * or returns all pages to the system. */ +STATIC void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net) { - struct page *p_to_be_freed = NULL; - struct page *page; - struct bio_vec *bvec; + atomic_t *a = is_net ? 
&mdev->pp_in_use_by_net : &mdev->pp_in_use; int i; - spin_lock(&drbd_pp_lock); - __bio_for_each_segment(bvec, bio, i, 0) { - if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { - set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); - p_to_be_freed = bvec->bv_page; - } else { - set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = bvec->bv_page; - drbd_pp_vacant++; - } - } - spin_unlock(&drbd_pp_lock); - atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); - - while (p_to_be_freed) { - page = p_to_be_freed; - p_to_be_freed = (struct page *)page_private(page); - set_page_private(page, 0); /* just to be polite */ - put_page(page); + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) + i = page_chain_free(page); + else { + struct page *tmp; + tmp = page_chain_tail(page, &i); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); } - + i = atomic_sub_return(i, a); + if (i < 0) + dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n", + is_net ? 
"pp_in_use_by_net" : "pp_in_use", i); wake_up(&drbd_pp_wait); } @@ -269,186 +339,128 @@ _drbd_wait_ee_list_empty() You must not have the req_lock: - drbd_free_ee() - drbd_alloc_ee() - drbd_init_ee() - drbd_release_ee() + drbd_free_peer_req() + drbd_alloc_peer_req() + drbd_free_peer_reqs() drbd_ee_fix_bhs() - drbd_process_done_ee() + drbd_finish_peer_reqs() drbd_clear_done_ee() drbd_wait_ee_list_empty() */ -struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, - u64 id, - sector_t sector, - unsigned int data_size, - gfp_t gfp_mask) __must_hold(local) +struct drbd_peer_request * +drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector, + unsigned int data_size, gfp_t gfp_mask) __must_hold(local) { - struct request_queue *q; - struct drbd_epoch_entry *e; + struct drbd_peer_request *peer_req; struct page *page; - struct bio *bio; - unsigned int ds; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; - if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) + if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) return NULL; - e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); - if (!e) { + peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + if (!peer_req) { if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); + dev_err(DEV, "%s: allocation failed\n", __func__); return NULL; } - bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); - if (!bio) { - if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); - goto fail1; - } - - bio->bi_bdev = mdev->ldev->backing_bdev; - bio->bi_sector = sector; - - ds = data_size; - while (ds) { - page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); - if (!page) { - if (!(gfp_mask & __GFP_NOWARN)) - dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); - goto fail2; - } - if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { - drbd_pp_free(mdev, page); - dev_err(DEV, "alloc_ee: 
bio_add_page(s=%llu," - "data_size=%u,ds=%u) failed\n", - (unsigned long long)sector, data_size, ds); - - q = bdev_get_queue(bio->bi_bdev); - if (q->merge_bvec_fn) { -#ifdef HAVE_bvec_merge_data - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, - .bi_rw = bio->bi_rw, - }; - int l = q->merge_bvec_fn(q, &bvm, - &bio->bi_io_vec[bio->bi_vcnt]); -#else - int l = q->merge_bvec_fn(q, bio, - &bio->bi_io_vec[bio->bi_vcnt]); -#endif - dev_err(DEV, "merge_bvec_fn() = %d\n", l); - } - - /* dump more of the bio. */ - DUMPI(bio->bi_max_vecs); - DUMPI(bio->bi_vcnt); - DUMPI(bio->bi_size); - DUMPI(bio->bi_phys_segments); - - goto fail2; - break; - } - ds -= min_t(int, ds, PAGE_SIZE); - } - - D_ASSERT(data_size == bio->bi_size); - - bio->bi_private = e; - e->mdev = mdev; - e->sector = sector; - e->size = bio->bi_size; - - e->private_bio = bio; - e->block_id = id; - INIT_HLIST_NODE(&e->colision); - e->epoch = NULL; - e->flags = 0; - - trace_drbd_ee(mdev, e, "allocated"); + page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; - return e; + drbd_clear_interval(&peer_req->i); + peer_req->i.size = data_size; + peer_req->i.sector = sector; + peer_req->i.local = false; + peer_req->i.waiting = false; + + peer_req->epoch = NULL; + peer_req->w.mdev = mdev; + peer_req->pages = page; + atomic_set(&peer_req->pending_bios, 0); + peer_req->flags = 0; + /* + * The block_id is opaque to the receiver. It is not endianness + * converted, and sent back to the sender unchanged. 
+ */ + peer_req->block_id = id; - fail2: - drbd_pp_free_bio_pages(mdev, bio); - bio_put(bio); - fail1: - mempool_free(e, drbd_ee_mempool); + return peer_req; + fail: + mempool_free(peer_req, drbd_ee_mempool); return NULL; } -void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req, + int is_net) { - struct bio *bio = e->private_bio; - trace_drbd_ee(mdev, e, "freed"); - drbd_pp_free_bio_pages(mdev, bio); - bio_put(bio); - D_ASSERT(hlist_unhashed(&e->colision)); - mempool_free(e, drbd_ee_mempool); + if (peer_req->flags & EE_HAS_DIGEST) + kfree(peer_req->digest); + drbd_free_pages(mdev, peer_req->pages, is_net); + D_ASSERT(atomic_read(&peer_req->pending_bios) == 0); + D_ASSERT(drbd_interval_empty(&peer_req->i)); + mempool_free(peer_req, drbd_ee_mempool); } -int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) +int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list) { LIST_HEAD(work_list); - struct drbd_epoch_entry *e, *t; + struct drbd_peer_request *peer_req, *t; int count = 0; + int is_net = list == &mdev->net_ee; - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); list_splice_init(list, &work_list); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); - list_for_each_entry_safe(e, t, &work_list, w.list) { - drbd_free_ee(mdev, e); + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + __drbd_free_peer_req(mdev, peer_req, is_net); count++; } return count; } - /* - * This function is called from _asender only_ - * but see also comments in _req_mod(,barrier_acked) - * and receive_Barrier. - * - * Move entries from net_ee to done_ee, if ready. - * Grab done_ee, call all callbacks, free the entries. - * The callbacks typically send out ACKs. + * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. 
*/ -STATIC int drbd_process_done_ee(struct drbd_conf *mdev) +static int drbd_finish_peer_reqs(struct drbd_conf *mdev) { LIST_HEAD(work_list); LIST_HEAD(reclaimed); - struct drbd_epoch_entry *e, *t; - int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); + struct drbd_peer_request *peer_req, *t; + int err = 0; - spin_lock_irq(&mdev->req_lock); - reclaim_net_ee(mdev, &reclaimed); + spin_lock_irq(&mdev->tconn->req_lock); + reclaim_finished_net_peer_reqs(mdev, &reclaimed); list_splice_init(&mdev->done_ee, &work_list); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); - list_for_each_entry_safe(e, t, &reclaimed, w.list) - drbd_free_ee(mdev, e); + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(mdev, peer_req); /* possible callbacks here: - * e_end_block, and e_end_resync_block, e_send_discard_ack. + * e_end_block, and e_end_resync_block, e_send_discard_write. * all ignore the last argument. */ - list_for_each_entry_safe(e, t, &work_list, w.list) { - trace_drbd_ee(mdev, e, "process_done_ee"); + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + int err2; + /* list_del not necessary, next/prev members not touched */ - ok = e->w.cb(mdev, &e->w, !ok) && ok; - drbd_free_ee(mdev, e); + err2 = peer_req->w.cb(&peer_req->w, !!err); + if (!err) + err = err2; + drbd_free_peer_req(mdev, peer_req); } wake_up(&mdev->ee_wait); - return ok; + return err; } -void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) +static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head) { DEFINE_WAIT(wait); @@ -456,44 +468,24 @@ * and calling prepare_to_wait in the fast path */ while (!list_empty(head)) { prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mdev->req_lock); - drbd_kick_lo(mdev); + spin_unlock_irq(&mdev->tconn->req_lock); schedule(); finish_wait(&mdev->ee_wait, &wait); - spin_lock_irq(&mdev->req_lock); + 
spin_lock_irq(&mdev->tconn->req_lock); } } -void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) +static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head) { - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); _drbd_wait_ee_list_empty(mdev, head); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); } -#ifdef DEFINE_SOCK_CREATE_KERN -/* if there is no sock_create_kern, - * there is also sock_create_lite missing */ -int sock_create_lite(int family, int type, int protocol, struct socket **res) -{ - int err = 0; - struct socket *sock = NULL; - - sock = sock_alloc(); - if (!sock) - err = -ENOMEM; - else - sock->type = type; - - *res = sock; - return err; -} -#endif - /* see also kernel_accept; which is only present since 2.6.18. * also we want to log which part of it failed, exactly */ -STATIC int drbd_accept(struct drbd_conf *mdev, const char **what, - struct socket *sock, struct socket **newsock) +STATIC int drbd_accept(const char **what, struct socket *sock, struct socket **newsock) { struct sock *sk = sock->sk; int err = 0; @@ -522,8 +514,7 @@ return err; } -STATIC int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, - void *buf, size_t size, int flags) +STATIC int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) { mm_segment_t oldfs; struct kvec iov = { @@ -545,7 +536,7 @@ return rv; } -STATIC int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) +STATIC int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) { mm_segment_t oldfs; struct kvec iov = { @@ -563,7 +554,7 @@ set_fs(KERNEL_DS); for (;;) { - rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); + rv = sock_recvmsg(tconn->data.socket, &msg, size, msg.msg_flags); if (rv == size) break; @@ -574,12 +565,12 @@ if (rv < 0) { if (rv == -ECONNRESET) - dev_info(DEV, "sock was reset by peer\n"); + conn_info(tconn, "sock was reset by peer\n"); else if (rv 
!= -ERESTARTSYS) - dev_err(DEV, "sock_recvmsg returned %d\n", rv); + conn_err(tconn, "sock_recvmsg returned %d\n", rv); break; } else if (rv == 0) { - dev_info(DEV, "sock was shut down by peer\n"); + conn_info(tconn, "sock was shut down by peer\n"); break; } else { /* signal came in, or peer/link went down, @@ -593,32 +584,97 @@ set_fs(oldfs); if (rv != size) - drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); + conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD); return rv; } -STATIC struct socket *drbd_try_connect(struct drbd_conf *mdev) +static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size) +{ + int err; + + err = drbd_recv(tconn, buf, size); + if (err != size) { + if (err >= 0) + err = -EIO; + } else + err = 0; + return err; +} + +static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size) +{ + int err; + + err = drbd_recv_all(tconn, buf, size); + if (err && !signal_pending(current)) + conn_warn(tconn, "short read (expected size %d)\n", (int)size); + return err; +} + +/* quoting tcp(7): + * On individual connections, the socket buffer size must be set prior to the + * listen(2) or connect(2) calls in order to have it take effect. + * This is our wrapper to do so. 
+ */ +static void drbd_setbufsize(struct socket *sock, unsigned int snd, + unsigned int rcv) +{ + /* open coded SO_SNDBUF, SO_RCVBUF */ + if (snd) { + sock->sk->sk_sndbuf = snd; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + if (rcv) { + sock->sk->sk_rcvbuf = rcv; + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } +} + +STATIC struct socket *drbd_try_connect(struct drbd_tconn *tconn) { const char *what; struct socket *sock; struct sockaddr_in6 src_in6; - int err; + struct sockaddr_in6 peer_in6; + struct net_conf *nc; + int err, peer_addr_len, my_addr_len; + int sndbuf_size, rcvbuf_size, connect_int; int disconnect_on_error = 1; - if (!get_net_conf(mdev)) + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + if (!nc) { + rcu_read_unlock(); return NULL; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + connect_int = nc->connect_int; + rcu_read_unlock(); + + my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6)); + memcpy(&src_in6, &tconn->my_addr, my_addr_len); + + if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6) + src_in6.sin6_port = 0; + else + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ + + peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6)); + memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, - SOCK_STREAM, IPPROTO_TCP, &sock); + err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (err < 0) { sock = NULL; goto out; } sock->sk->sk_rcvtimeo = - sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + sock->sk->sk_sndtimeo = connect_int * HZ; + drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); /* explicitly bind to the configured IP as source IP * for the outgoing connections. @@ -627,17 +683,8 @@ * Make sure to use 0 as port number, so linux selects * a free one dynamically. 
*/ - memcpy(&src_in6, mdev->net_conf->my_addr, - min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); - if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) - src_in6.sin6_port = 0; - else - ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ - what = "bind before connect"; - err = sock->ops->bind(sock, - (struct sockaddr *) &src_in6, - mdev->net_conf->my_addr_len); + err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); if (err < 0) goto out; @@ -645,9 +692,7 @@ * stay C_WF_CONNECTION, don't go Disconnecting! */ disconnect_on_error = 0; what = "connect"; - err = sock->ops->connect(sock, - (struct sockaddr *)mdev->net_conf->peer_addr, - mdev->net_conf->peer_addr_len, 0); + err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); out: if (err < 0) { @@ -665,106 +710,149 @@ disconnect_on_error = 0; break; default: - dev_err(DEV, "%s failed, err = %d\n", what, err); + conn_err(tconn, "%s failed, err = %d\n", what, err); } if (disconnect_on_error) - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); } - put_net_conf(mdev); + return sock; } -STATIC struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) +STATIC struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn) { - int timeo, err; + int timeo, err, my_addr_len; + int sndbuf_size, rcvbuf_size, connect_int; struct socket *s_estab = NULL, *s_listen; + struct sockaddr_in6 my_addr; + struct net_conf *nc; const char *what; - if (!get_net_conf(mdev)) + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + if (!nc) { + rcu_read_unlock(); return NULL; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + connect_int = nc->connect_int; + rcu_read_unlock(); + + my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6)); + memcpy(&my_addr, &tconn->my_addr, my_addr_len); what = "sock_create_kern"; - err = 
sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, + err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, SOCK_STREAM, IPPROTO_TCP, &s_listen); if (err) { s_listen = NULL; goto out; } - timeo = mdev->net_conf->try_connect_int * HZ; + timeo = connect_int * HZ; timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ s_listen->sk->sk_rcvtimeo = timeo; s_listen->sk->sk_sndtimeo = timeo; + drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); what = "bind before listen"; - err = s_listen->ops->bind(s_listen, - (struct sockaddr *) mdev->net_conf->my_addr, - mdev->net_conf->my_addr_len); + err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); if (err < 0) goto out; - err = drbd_accept(mdev, &what, s_listen, &s_estab); + err = drbd_accept(&what, s_listen, &s_estab); out: if (s_listen) sock_release(s_listen); if (err < 0) { if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { - dev_err(DEV, "%s failed, err = %d\n", what, err); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + conn_err(tconn, "%s failed, err = %d\n", what, err); + conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); } } - put_net_conf(mdev); return s_estab; } -STATIC int drbd_send_fp(struct drbd_conf *mdev, - struct socket *sock, enum drbd_packets cmd) -{ - struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; +static int decode_header(struct drbd_tconn *, void *, struct packet_info *); - return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); +static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock, + enum drbd_packet cmd) +{ + if (!conn_prepare_command(tconn, sock)) + return -EIO; + return conn_send_command(tconn, sock, cmd, 0, NULL, 0); } -STATIC enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) +static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock) { - struct p_header 
*h = (struct p_header *) &mdev->data.sbuf.header; - int rr; - - rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); - - if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) - return be16_to_cpu(h->command); + unsigned int header_size = drbd_header_size(tconn); + struct packet_info pi; + int err; - return 0xffff; + err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0); + if (err != header_size) { + if (err >= 0) + err = -EIO; + return err; + } + err = decode_header(tconn, tconn->data.rbuf, &pi); + if (err) + return err; + return pi.cmd; } /** * drbd_socket_okay() - Free the socket if its connection is not okay - * @mdev: DRBD device. * @sock: pointer to the pointer to the socket. */ -static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) +static int drbd_socket_okay(struct socket **sock) { int rr; char tb[4]; if (!*sock) - return FALSE; + return false; - rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); + rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); if (rr > 0 || rr == -EAGAIN) { - return TRUE; + return true; } else { sock_release(*sock); *sock = NULL; - return FALSE; + return false; } } +/* Gets called if a connection is established, or if a new minor gets created + in a connection */ +int drbd_connected(struct drbd_conf *mdev) +{ + int err; + + atomic_set(&mdev->packet_seq, 0); + mdev->peer_seq = 0; + + mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ? + &mdev->tconn->cstate_mutex : + &mdev->own_state_mutex; + + err = drbd_send_sync_param(mdev); + if (!err) + err = drbd_send_sizes(mdev, 0, 0); + if (!err) + err = drbd_send_uuids(mdev); + if (!err) + err = drbd_send_current_state(mdev); + clear_bit(USE_DEGR_WFC_T, &mdev->flags); + clear_bit(RESIZE_PENDING, &mdev->flags); + mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ + return err; +} /* * return values: @@ -774,248 +862,314 @@ * no point in trying again, please go standalone. * -2 We do not have a network config... 
*/ -STATIC int drbd_connect(struct drbd_conf *mdev) +STATIC int conn_connect(struct drbd_tconn *tconn) { - struct socket *s, *sock, *msock; - int try, h, ok; - - D_ASSERT(!mdev->data.socket); + struct drbd_socket sock, msock; + struct drbd_conf *mdev; + struct net_conf *nc; + int vnr, timeout, try, h, ok; + bool discard_my_data; - if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) - dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); - - if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) + if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) return -2; - clear_bit(DISCARD_CONCURRENT, &mdev->flags); + mutex_init(&sock.mutex); + sock.sbuf = tconn->data.sbuf; + sock.rbuf = tconn->data.rbuf; + sock.socket = NULL; + mutex_init(&msock.mutex); + msock.sbuf = tconn->meta.sbuf; + msock.rbuf = tconn->meta.rbuf; + msock.socket = NULL; + + clear_bit(DISCARD_CONCURRENT, &tconn->flags); - sock = NULL; - msock = NULL; + /* Assume that the peer only understands protocol 80 until we know better. */ + tconn->agreed_pro_version = 80; do { + struct socket *s; + for (try = 0;;) { /* 3 tries, this should take less than a second! 
*/ - s = drbd_try_connect(mdev); + s = drbd_try_connect(tconn); if (s || ++try >= 3) break; /* give the other side time to call bind() & listen() */ - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 10); + schedule_timeout_interruptible(HZ / 10); } if (s) { - if (!sock) { - drbd_send_fp(mdev, s, P_HAND_SHAKE_S); - sock = s; - s = NULL; - } else if (!msock) { - drbd_send_fp(mdev, s, P_HAND_SHAKE_M); - msock = s; - s = NULL; + if (!sock.socket) { + sock.socket = s; + send_first_packet(tconn, &sock, P_INITIAL_DATA); + } else if (!msock.socket) { + msock.socket = s; + send_first_packet(tconn, &msock, P_INITIAL_META); } else { - dev_err(DEV, "Logic error in drbd_connect()\n"); + conn_err(tconn, "Logic error in conn_connect()\n"); goto out_release_sockets; } } - if (sock && msock) { - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 10); - ok = drbd_socket_okay(mdev, &sock); - ok = drbd_socket_okay(mdev, &msock) && ok; + if (sock.socket && msock.socket) { + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + timeout = nc->ping_timeo * HZ / 10; + rcu_read_unlock(); + schedule_timeout_interruptible(timeout); + ok = drbd_socket_okay(&sock.socket); + ok = drbd_socket_okay(&msock.socket) && ok; if (ok) break; } retry: - s = drbd_wait_for_connect(mdev); + s = drbd_wait_for_connect(tconn); if (s) { - try = drbd_recv_fp(mdev, s); - drbd_socket_okay(mdev, &sock); - drbd_socket_okay(mdev, &msock); + try = receive_first_packet(tconn, s); + drbd_socket_okay(&sock.socket); + drbd_socket_okay(&msock.socket); switch (try) { - case P_HAND_SHAKE_S: - if (sock) { - dev_warn(DEV, "initial packet S crossed\n"); - sock_release(sock); + case P_INITIAL_DATA: + if (sock.socket) { + conn_warn(tconn, "initial packet S crossed\n"); + sock_release(sock.socket); } - sock = s; + sock.socket = s; break; - case P_HAND_SHAKE_M: - if (msock) { - dev_warn(DEV, "initial packet M crossed\n"); - sock_release(msock); + case P_INITIAL_META: + if (msock.socket) { + 
conn_warn(tconn, "initial packet M crossed\n"); + sock_release(msock.socket); } - msock = s; - set_bit(DISCARD_CONCURRENT, &mdev->flags); + msock.socket = s; + set_bit(DISCARD_CONCURRENT, &tconn->flags); break; default: - dev_warn(DEV, "Error receiving initial packet\n"); + conn_warn(tconn, "Error receiving initial packet\n"); sock_release(s); if (random32() & 1) goto retry; } } - if (mdev->state.conn <= C_DISCONNECTING) + if (tconn->cstate <= C_DISCONNECTING) goto out_release_sockets; if (signal_pending(current)) { flush_signals(current); smp_rmb(); - if (get_t_state(&mdev->receiver) == Exiting) + if (get_t_state(&tconn->receiver) == EXITING) goto out_release_sockets; } - if (sock && msock) { - ok = drbd_socket_okay(mdev, &sock); - ok = drbd_socket_okay(mdev, &msock) && ok; + if (sock.socket && &msock.socket) { + ok = drbd_socket_okay(&sock.socket); + ok = drbd_socket_okay(&msock.socket) && ok; if (ok) break; } } while (1); - msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ - sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */ + msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */ - sock->sk->sk_allocation = GFP_NOIO; - msock->sk->sk_allocation = GFP_NOIO; + sock.socket->sk->sk_allocation = GFP_NOIO; + msock.socket->sk->sk_allocation = GFP_NOIO; - sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; - msock->sk->sk_priority = TC_PRIO_INTERACTIVE; - - if (mdev->net_conf->sndbuf_size) { - sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - } - - if (mdev->net_conf->rcvbuf_size) { - sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; - sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - } + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; /* NOT YET ... 
- * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; - * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - * first set it to the P_HAND_SHAKE timeout, + * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10; + * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_CONNECTION_FEATURES timeout, * which we set to 4x the configured ping_timeout. */ - sock->sk->sk_sndtimeo = - sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); - msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; - msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + sock.socket->sk->sk_sndtimeo = + sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; - /* we don't want delays. - * we use TCP_CORK where apropriate, though */ - drbd_tcp_nodelay(sock); - drbd_tcp_nodelay(msock); + msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; + timeout = nc->timeout * HZ / 10; + discard_my_data = nc->discard_my_data; + rcu_read_unlock(); - mdev->data.socket = sock; - mdev->meta.socket = msock; - mdev->last_received = jiffies; + msock.socket->sk->sk_sndtimeo = timeout; - D_ASSERT(mdev->asender.task == NULL); + /* we don't want delays. 
+ * we use TCP_CORK where appropriate, though */ + drbd_tcp_nodelay(sock.socket); + drbd_tcp_nodelay(msock.socket); + + tconn->data.socket = sock.socket; + tconn->meta.socket = msock.socket; + tconn->last_received = jiffies; - h = drbd_do_handshake(mdev); + h = drbd_do_features(tconn); if (h <= 0) return h; - if (mdev->cram_hmac_tfm) { + if (tconn->cram_hmac_tfm) { /* drbd_request_state(mdev, NS(conn, WFAuth)); */ - switch (drbd_do_auth(mdev)) { + switch (drbd_do_auth(tconn)) { case -1: - dev_err(DEV, "Authentication of peer failed\n"); + conn_err(tconn, "Authentication of peer failed\n"); return -1; case 0: - dev_err(DEV, "Authentication of peer failed, trying again.\n"); + conn_err(tconn, "Authentication of peer failed, trying again.\n"); return 0; } } - if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) - return 0; + tconn->data.socket->sk->sk_sndtimeo = timeout; + tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; - sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + if (drbd_send_protocol(tconn) == -EOPNOTSUPP) + return -1; - atomic_set(&mdev->packet_seq, 0); - mdev->peer_seq = 0; + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + kref_get(&mdev->kref); + rcu_read_unlock(); - drbd_thread_start(&mdev->asender); + if (discard_my_data) + set_bit(DISCARD_MY_DATA, &mdev->flags); + else + clear_bit(DISCARD_MY_DATA, &mdev->flags); - drbd_send_protocol(mdev); - drbd_send_sync_param(mdev, &mdev->sync_conf); - drbd_send_sizes(mdev, 0); - drbd_send_uuids(mdev); - drbd_send_state(mdev); - clear_bit(USE_DEGR_WFC_T, &mdev->flags); - clear_bit(RESIZE_PENDING, &mdev->flags); + drbd_connected(mdev); + kref_put(&mdev->kref, &drbd_minor_destroy); + rcu_read_lock(); + } + rcu_read_unlock(); - return 1; + if (conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE) < SS_SUCCESS) + return 0; + + drbd_thread_start(&tconn->asender); + + mutex_lock(&tconn->conf_update); 
+ /* The discard_my_data flag is a single-shot modifier to the next + * connection attempt, the handshake of which is now well underway. + * No need for rcu style copying of the whole struct + * just to clear a single value. */ + tconn->net_conf->discard_my_data = 0; + mutex_unlock(&tconn->conf_update); + + return h; out_release_sockets: - if (sock) - sock_release(sock); - if (msock) - sock_release(msock); + if (sock.socket) + sock_release(sock.socket); + if (msock.socket) + sock_release(msock.socket); return -1; } -STATIC int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) +static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi) { - int r; + unsigned int header_size = drbd_header_size(tconn); - r = drbd_recv(mdev, h, sizeof(*h)); - - if (unlikely(r != sizeof(*h))) { - dev_err(DEV, "short read expecting header on sock: r=%d\n", r); - return FALSE; - }; - h->command = be16_to_cpu(h->command); - h->length = be16_to_cpu(h->length); - if (unlikely(h->magic != BE_DRBD_MAGIC)) { - dev_err(DEV, "magic?? 
on data m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); - return FALSE; + if (header_size == sizeof(struct p_header100) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { + struct p_header100 *h = header; + if (h->pad != 0) { + conn_err(tconn, "Header padding is not zero\n"); + return -EINVAL; + } + pi->vnr = be16_to_cpu(h->volume); + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + } else if (header_size == sizeof(struct p_header95) && + *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { + struct p_header95 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + pi->vnr = 0; + } else if (header_size == sizeof(struct p_header80) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { + struct p_header80 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be16_to_cpu(h->length); + pi->vnr = 0; + } else { + conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n", + be32_to_cpu(*(__be32 *)header), + tconn->agreed_pro_version); + return -EINVAL; } - mdev->last_received = jiffies; + pi->data = header + header_size; + return 0; +} + +static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi) +{ + void *buffer = tconn->data.rbuf; + int err; + + err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn)); + if (err) + return err; - return TRUE; + err = decode_header(tconn, buffer, pi); + tconn->last_received = jiffies; + + return err; } -STATIC enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) +STATIC enum finish_epoch drbd_flush_after_epoch(struct drbd_tconn *tconn, struct drbd_epoch *epoch) { int rv; + struct drbd_conf *mdev; + int vnr; + + if (tconn->write_ordering >= WO_bdev_flush) { + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (!get_ldev(mdev)) + continue; + kref_get(&mdev->kref); + rcu_read_unlock(); + + rv = blkdev_issue_flush(mdev->ldev->backing_bdev, + 
GFP_NOIO, NULL); + if (rv) { + dev_info(DEV, "local disk flush failed with status %d\n", rv); + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + drbd_bump_write_ordering(tconn, WO_drain_io); + } + put_ldev(mdev); + kref_put(&mdev->kref, &drbd_minor_destroy); - if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { - rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); - if (rv) { - dev_err(DEV, "local disk flush failed with status %d\n", rv); - /* would rather check on EOPNOTSUPP, but that is not reliable. - * don't try again for ANY return value != 0 - * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(mdev, WO_drain_io); + rcu_read_lock(); + if (rv) + break; } - put_ldev(mdev); + rcu_read_unlock(); } - return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); + return drbd_may_finish_epoch(tconn, epoch, EV_BARRIER_DONE); } -STATIC int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +STATIC int w_flush(struct drbd_work *w, int cancel) { struct flush_work *fw = (struct flush_work *)w; struct drbd_epoch *epoch = fw->epoch; + struct drbd_conf *mdev = w->mdev; kfree(w); if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) - drbd_flush_after_epoch(mdev, epoch); + drbd_flush_after_epoch(mdev->tconn, epoch); - drbd_may_finish_epoch(mdev, epoch, EV_PUT | + drbd_may_finish_epoch(mdev->tconn, epoch, EV_PUT | (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); - return 1; + return 0; } /** @@ -1024,7 +1178,7 @@ * @epoch: Epoch object. * @ev: Epoch event. 
*/ -STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, +STATIC enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn, struct drbd_epoch *epoch, enum epoch_event ev) { @@ -1033,7 +1187,7 @@ int schedule_flush = 0; enum finish_epoch rv = FE_STILL_LIVE; - spin_lock(&mdev->epoch_lock); + spin_lock(&tconn->epoch_lock); do { next_epoch = NULL; finish = 0; @@ -1050,8 +1204,8 @@ /* Special case: If we just switched from WO_bio_barrier to WO_bdev_flush we should not finish the current epoch */ if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && - mdev->write_ordering != WO_bio_barrier && - epoch == mdev->current_epoch) + tconn->write_ordering != WO_bio_barrier && + epoch == tconn->current_epoch) clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); break; case EV_BARRIER_DONE: @@ -1062,40 +1216,38 @@ break; } - trace_drbd_epoch(mdev, epoch, ev); - if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && - epoch->list.prev == &mdev->current_epoch->list && + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP) && + epoch->list.prev == &tconn->current_epoch->list && !test_bit(DE_IS_FINISHING, &epoch->flags)) { /* Nearly all conditions are met to finish that epoch... 
*/ if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) || - mdev->write_ordering == WO_none || + tconn->write_ordering == WO_none || (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) || ev & EV_CLEANUP) { finish = 1; set_bit(DE_IS_FINISHING, &epoch->flags); } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) && - mdev->write_ordering == WO_bio_barrier) { + tconn->write_ordering == WO_bio_barrier) { atomic_inc(&epoch->active); schedule_flush = 1; } } if (finish) { if (!(ev & EV_CLEANUP)) { - spin_unlock(&mdev->epoch_lock); - drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); - spin_lock(&mdev->epoch_lock); + spin_unlock(&tconn->epoch_lock); + drbd_send_b_ack(epoch->mdev, epoch->barrier_nr, epoch_size); + spin_lock(&tconn->epoch_lock); } - dec_unacked(mdev); + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(epoch->mdev); - if (mdev->current_epoch != epoch) { + if (tconn->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); list_del(&epoch->list); ev = EV_BECAME_LAST | (ev & EV_CLEANUP); - mdev->epochs--; - trace_drbd_epoch(mdev, epoch, EV_TRACE_FREE); + tconn->epochs--; kfree(epoch); if (rv == FE_STILL_LIVE) @@ -1115,22 +1267,22 @@ epoch = next_epoch; } while (1); - spin_unlock(&mdev->epoch_lock); + spin_unlock(&tconn->epoch_lock); if (schedule_flush) { struct flush_work *fw; fw = kmalloc(sizeof(*fw), GFP_ATOMIC); if (fw) { - trace_drbd_epoch(mdev, epoch, EV_TRACE_FLUSH); fw->w.cb = w_flush; fw->epoch = epoch; - drbd_queue_work(&mdev->data.work, &fw->w); + fw->w.mdev = epoch->mdev; + drbd_queue_work(&tconn->data.work, &fw->w); } else { - dev_warn(DEV, "Could not kmalloc a flush_work obj\n"); + conn_warn(tconn, "Could not kmalloc a flush_work obj\n"); set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); /* That is not a recursion, only one level */ - drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); - drbd_may_finish_epoch(mdev, epoch, EV_PUT); + 
drbd_may_finish_epoch(tconn, epoch, EV_BARRIER_DONE); + drbd_may_finish_epoch(tconn, epoch, EV_PUT); } } @@ -1139,12 +1291,15 @@ /** * drbd_bump_write_ordering() - Fall back to an other write ordering method - * @mdev: DRBD device. + * @tconn: DRBD connection. * @wo: Write ordering method to try. */ -void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) +void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo) { + struct disk_conf *dc; + struct drbd_conf *mdev; enum write_ordering_e pwo; + int vnr, i = 0; static char *write_ordering_str[] = { [WO_none] = "none", [WO_drain_io] = "drain", @@ -1152,113 +1307,258 @@ [WO_bio_barrier] = "barrier", }; - pwo = mdev->write_ordering; + pwo = tconn->write_ordering; wo = min(pwo, wo); - if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier) - wo = WO_bdev_flush; - if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) - wo = WO_drain_io; - if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) - wo = WO_none; - mdev->write_ordering = wo; - if (pwo != mdev->write_ordering || wo == WO_bio_barrier) - dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (i++ == 1 && wo == WO_bio_barrier) + wo = WO_bdev_flush; /* WO = barrier does not handle multiple volumes */ + if (!get_ldev(mdev)) + continue; + dc = rcu_dereference(mdev->ldev->disk_conf); + + if (wo == WO_bio_barrier && !dc->disk_barrier) + wo = WO_bdev_flush; + if (wo == WO_bdev_flush && !dc->disk_flushes) + wo = WO_drain_io; + if (wo == WO_drain_io && !dc->disk_drain) + wo = WO_none; + put_ldev(mdev); + } + rcu_read_unlock(); + tconn->write_ordering = wo; + if (pwo != tconn->write_ordering || wo == WO_bio_barrier) + conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]); } /** - * w_e_reissue() - Worker callback; Resubmit a bio, without 
BIO_RW_BARRIER set + * drbd_submit_peer_request() + * @mdev: DRBD device. + * @peer_req: peer request + * @rw: flag field, see bio->bi_rw + * + * May spread the pages to multiple bios, + * depending on bio_add_page restrictions. + * + * Returns 0 if all bios have been submitted, + * -ENOMEM if we could not allocate enough bios, + * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a + * single page to an empty bio (which should never happen and likely indicates + * that the lower level IO stack is in some way broken). This has been observed + * on certain Xen deployments. + */ +/* TODO allocate from our own bio_set. */ +int drbd_submit_peer_request(struct drbd_conf *mdev, + struct drbd_peer_request *peer_req, + const unsigned rw, const int fault_type) +{ + struct bio *bios = NULL; + struct bio *bio; + struct page *page = peer_req->pages; + sector_t sector = peer_req->i.sector; + unsigned ds = peer_req->i.size; + unsigned n_bios = 0; + unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; + int err = -ENOMEM; + + /* In most cases, we will only need one bio. But in case the lower + * level restrictions happen to be different at this offset on this + * side than those of the sending peer, we may need to submit the + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. 
+ */ +next_bio: + bio = bio_alloc(GFP_NOIO, nr_pages); + if (!bio) { + dev_err(DEV, "submit_ee: Allocation of a bio failed\n"); + goto fail; + } + /* > peer_req->i.sector, unless this is the first bio */ + bio->bi_sector = sector; + bio->bi_bdev = mdev->ldev->backing_bdev; + /* we special case some flags in the multi-bio case, see below + * (REQ_FLUSH, or BIO_RW_BARRIER in older kernels) */ + bio->bi_rw = rw; + bio->bi_private = peer_req; + bio->bi_end_io = drbd_peer_request_endio; + + bio->bi_next = bios; + bios = bio; + ++n_bios; + + page_chain_for_each(page) { + unsigned len = min_t(unsigned, ds, PAGE_SIZE); + if (!bio_add_page(bio, page, len, 0)) { + /* A single page must always be possible! + * But in case it fails anyways, + * we deal with it, and complain (below). */ + if (bio->bi_vcnt == 0) { + dev_err(DEV, + "bio_add_page failed for len=%u, " + "bi_vcnt=0 (bi_sector=%llu)\n", + len, (unsigned long long)bio->bi_sector); + err = -ENOSPC; + goto fail; + } + goto next_bio; + } + ds -= len; + sector += len >> 9; + --nr_pages; + } + D_ASSERT(page == NULL); + D_ASSERT(ds == 0); + + atomic_set(&peer_req->pending_bios, n_bios); + do { + bio = bios; + bios = bios->bi_next; + bio->bi_next = NULL; + + drbd_generic_make_request(mdev, fault_type, bio); + + /* strip off REQ_FLUSH, + * unless it is the first or last bio */ + if (bios && bios->bi_next) + bios->bi_rw &= ~DRBD_REQ_FLUSH; + } while (bios); + return 0; + +fail: + while (bios) { + bio = bios; + bios = bios->bi_next; + bio_put(bio); + } + return err; +} + +static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev, + struct drbd_peer_request *peer_req) +{ + struct drbd_interval *i = &peer_req->i; + + drbd_remove_interval(&mdev->write_requests, i); + drbd_clear_interval(i); + + /* Wake up any processes waiting for this peer request to complete. */ + if (i->waiting) + wake_up(&mdev->misc_wait); +} + +/** + * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set * @mdev: DRBD device. 
* @w: work object. * @cancel: The connection will be closed anyways (unused in this callback) */ -int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) +int w_e_reissue(struct drbd_work *w, int cancel) __releases(local) { - struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; - struct bio *bio = e->private_bio; - + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; + int err; /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) so that we can finish that epoch in drbd_may_finish_epoch(). That is necessary if we already have a long chain of Epochs, before - we realize that BIO_RW_BARRIER is actually not supported */ + we realize that BARRIER is actually not supported */ /* As long as the -ENOTSUPP on the barrier is reported immediately that will never trigger. If it is reported late, we will just print that warning and continue correctly for all future requests with WO_bdev_flush */ - if (previous_epoch(mdev, e->epoch)) + if (previous_epoch(mdev->tconn, peer_req->epoch)) dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); - /* prepare bio for re-submit, - * re-init volatile members */ /* we still have a local reference, * get_ldev was done in receive_Data. */ - bio->bi_bdev = mdev->ldev->backing_bdev; - bio->bi_sector = e->sector; - bio->bi_size = e->size; - bio->bi_idx = 0; - - bio->bi_flags &= ~(BIO_POOL_MASK - 1); - bio->bi_flags |= 1 << BIO_UPTODATE; - - /* don't know whether this is necessary: */ - bio->bi_phys_segments = 0; - bio->bi_next = NULL; - - /* these should be unchanged: */ - /* bio->bi_end_io = drbd_endio_write_sec; */ - /* bio->bi_vcnt = whatever; */ - e->w.cb = e_end_block; - - /* This is no longer a barrier request. 
*/ - bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); + peer_req->w.cb = e_end_block; + err = drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_DT_WR); + switch (err) { + case -ENOMEM: + peer_req->w.cb = w_e_reissue; + drbd_queue_work(&mdev->tconn->data.work, &peer_req->w); + /* retry later; fall through */ + case 0: + /* keep worker happy and connection up */ + return 0; - drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); + case -ENOSPC: + /* no other error expected, but anyways: */ + default: + /* forget the object, + * and cause a "Network failure" */ + spin_lock_irq(&mdev->tconn->req_lock); + list_del(&peer_req->w.list); + drbd_remove_epoch_entry_interval(mdev, peer_req); + spin_unlock_irq(&mdev->tconn->req_lock); + if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) + drbd_al_complete_io(mdev, &peer_req->i); + drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + EV_CLEANUP); + drbd_free_peer_req(mdev, peer_req); + dev_err(DEV, "submit failed, triggering re-connect\n"); + return err; + } +} - return 1; +void conn_wait_active_ee_empty(struct drbd_tconn *tconn) +{ + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + kref_get(&mdev->kref); + rcu_read_unlock(); + drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + kref_put(&mdev->kref, &drbd_minor_destroy); + rcu_read_lock(); + } + rcu_read_unlock(); } -STATIC int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi) { + struct drbd_conf *mdev; int rv, issue_flush; - struct p_barrier *p = (struct p_barrier *)h; + struct p_barrier *p = pi->data; struct drbd_epoch *epoch; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - - rv = drbd_recv(mdev, h->payload, h->length); - ERR_IF(rv != h->length) return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; inc_unacked(mdev); - if (mdev->net_conf->wire_protocol != DRBD_PROT_C) - 
drbd_kick_lo(mdev); - - mdev->current_epoch->barrier_nr = p->barrier; - rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); + tconn->current_epoch->barrier_nr = p->barrier; + tconn->current_epoch->mdev = mdev; + rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR); /* P_BARRIER_ACK may imply that the corresponding extent is dropped from * the activity log, which means it would not be resynced in case the * R_PRIMARY crashes now. * Therefore we must send the barrier_ack after the barrier request was * completed. */ - switch (mdev->write_ordering) { + switch (tconn->write_ordering) { case WO_bio_barrier: case WO_none: if (rv == FE_RECYCLED) - return TRUE; + return 0; break; case WO_bdev_flush: case WO_drain_io: if (rv == FE_STILL_LIVE) { - set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); - drbd_wait_ee_list_empty(mdev, &mdev->active_ee); - rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &tconn->current_epoch->flags); + conn_wait_active_ee_empty(tconn); + rv = drbd_flush_after_epoch(tconn, tconn->current_epoch); } if (rv == FE_RECYCLED) - return TRUE; + return 0; /* The asender will send all the ACKs and barrier ACKs out, since all EEs moved from the active_ee to the done_ee. 
We need to @@ -1271,101 +1571,117 @@ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); if (!epoch) { dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); - issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); - drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &tconn->current_epoch->flags); + conn_wait_active_ee_empty(tconn); if (issue_flush) { - rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); + rv = drbd_flush_after_epoch(tconn, tconn->current_epoch); if (rv == FE_RECYCLED) - return TRUE; + return 0; } drbd_wait_ee_list_empty(mdev, &mdev->done_ee); - return TRUE; + return 0; } epoch->flags = 0; atomic_set(&epoch->epoch_size, 0); atomic_set(&epoch->active, 0); - spin_lock(&mdev->epoch_lock); - if (atomic_read(&mdev->current_epoch->epoch_size)) { - list_add(&epoch->list, &mdev->current_epoch->list); - mdev->current_epoch = epoch; - mdev->epochs++; - trace_drbd_epoch(mdev, epoch, EV_TRACE_ALLOC); + spin_lock(&tconn->epoch_lock); + if (atomic_read(&tconn->current_epoch->epoch_size)) { + list_add(&epoch->list, &tconn->current_epoch->list); + tconn->current_epoch = epoch; + tconn->epochs++; } else { /* The current_epoch got recycled while we allocated this one... 
*/ kfree(epoch); } - spin_unlock(&mdev->epoch_lock); + spin_unlock(&tconn->epoch_lock); - return TRUE; + return 0; } /* used from receive_RSDataReply (recv_resync_read) * and from receive_Data */ -STATIC struct drbd_epoch_entry * -read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) +STATIC struct drbd_peer_request * +read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, + int data_size) __must_hold(local) { - struct drbd_epoch_entry *e; - struct bio_vec *bvec; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct drbd_peer_request *peer_req; struct page *page; - struct bio *bio; - int dgs, ds, i, rr; - void *dig_in = mdev->int_dig_in; - void *dig_vv = mdev->int_dig_vv; - - dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? - crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; - - if (dgs) { - rr = drbd_recv(mdev, dig_in, dgs); - if (rr != dgs) { - dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", - rr, dgs); + int dgs, ds, err; + void *dig_in = mdev->tconn->int_dig_in; + void *dig_vv = mdev->tconn->int_dig_vv; + unsigned long *data; + + dgs = 0; + if (mdev->tconn->peer_integrity_tfm) { + dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); + /* + * FIXME: Receive the incoming digest into the receive buffer + * here, together with its struct p_data? + */ + err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); + if (err) return NULL; - } + data_size -= dgs; } - data_size -= dgs; + if (!expect(data_size != 0)) + return NULL; + if (!expect(IS_ALIGNED(data_size, 512))) + return NULL; + if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) + return NULL; - ERR_IF(data_size & 0x1ff) return NULL; - ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; + /* even though we trust out peer, + * we sometimes have to double check. 
*/ + if (sector + (data_size>>9) > capacity) { + dev_err(DEV, "request from peer beyond end of local disk: " + "capacity: %llus < sector: %llus + size: %u\n", + (unsigned long long)capacity, + (unsigned long long)sector, data_size); + return NULL; + } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); - if (!e) + peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO); + if (!peer_req) return NULL; - bio = e->private_bio; + ds = data_size; - bio_for_each_segment(bvec, bio, i) { - page = bvec->bv_page; - rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); + page = peer_req->pages; + page_chain_for_each(page) { + unsigned len = min_t(int, ds, PAGE_SIZE); + data = kmap(page); + err = drbd_recv_all_warn(mdev->tconn, data, len); + if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) { + dev_err(DEV, "Fault injection: Corrupting data on receive\n"); + data[0] = data[0] ^ (unsigned long)-1; + } kunmap(page); - if (rr != min_t(int, ds, PAGE_SIZE)) { - drbd_free_ee(mdev, e); - dev_warn(DEV, "short read receiving data: read %d expected %d\n", - rr, min_t(int, ds, PAGE_SIZE)); + if (err) { + drbd_free_peer_req(mdev, peer_req); return NULL; } - ds -= rr; + ds -= len; } if (dgs) { - drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { - dev_err(DEV, "Digest integrity check FAILED.\n"); - drbd_bcast_ee(mdev, "digest failed", - dgs, dig_in, dig_vv, e); - drbd_free_ee(mdev, e); + dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n", + (unsigned long long)sector, data_size); + drbd_free_peer_req(mdev, peer_req); return NULL; } } mdev->recv_cnt += data_size>>9; - return e; + return peer_req; } /* drbd_drain_block() just takes a data block 
@@ -1374,25 +1690,26 @@ STATIC int drbd_drain_block(struct drbd_conf *mdev, int data_size) { struct page *page; - int rr, rv = 1; + int err = 0; void *data; - page = drbd_pp_alloc(mdev, 1); + if (!data_size) + return 0; + + page = drbd_alloc_pages(mdev, 1, 1); data = kmap(page); while (data_size) { - rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); - if (rr != min_t(int, data_size, PAGE_SIZE)) { - rv = 0; - dev_warn(DEV, "short read receiving data: read %d expected %d\n", - rr, min_t(int, data_size, PAGE_SIZE)); + unsigned int len = min_t(int, data_size, PAGE_SIZE); + + err = drbd_recv_all_warn(mdev->tconn, data, len); + if (err) break; - } - data_size -= rr; + data_size -= len; } kunmap(page); - drbd_pp_free(mdev, page); - return rv; + drbd_free_pages(mdev, page, 0); + return err; } STATIC int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, @@ -1400,24 +1717,19 @@ { struct bio_vec *bvec; struct bio *bio; - int dgs, rr, i, expect; - void *dig_in = mdev->int_dig_in; - void *dig_vv = mdev->int_dig_vv; - - dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? - crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; - - if (dgs) { - rr = drbd_recv(mdev, dig_in, dgs); - if (rr != dgs) { - dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", - rr, dgs); - return 0; - } + int dgs, err, i, expect; + void *dig_in = mdev->tconn->int_dig_in; + void *dig_vv = mdev->tconn->int_dig_vv; + + dgs = 0; + if (mdev->tconn->peer_integrity_tfm) { + dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm); + err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs); + if (err) + return err; + data_size -= dgs; } - data_size -= dgs; - /* optimistically update recv_cnt. if receiving fails below, * we disconnect anyways, and counters will be reset. 
*/ mdev->recv_cnt += data_size>>9; @@ -1426,143 +1738,151 @@ D_ASSERT(sector == bio->bi_sector); bio_for_each_segment(bvec, bio, i) { + void *mapped = kmap(bvec->bv_page) + bvec->bv_offset; expect = min_t(int, data_size, bvec->bv_len); - rr = drbd_recv(mdev, - kmap(bvec->bv_page)+bvec->bv_offset, - expect); + err = drbd_recv_all_warn(mdev->tconn, mapped, expect); kunmap(bvec->bv_page); - if (rr != expect) { - dev_warn(DEV, "short read receiving data reply: " - "read %d expected %d\n", - rr, expect); - return 0; - } - data_size -= rr; + if (err) + return err; + data_size -= expect; } if (dgs) { - drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv); if (memcmp(dig_in, dig_vv, dgs)) { dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); - return 0; + return -EINVAL; } } D_ASSERT(data_size == 0); - return 1; + return 0; } -/* e_end_resync_block() is called via - * drbd_process_done_ee() by asender only */ -STATIC int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) -{ - struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; - sector_t sector = e->sector; - int ok; - - D_ASSERT(hlist_unhashed(&e->colision)); - - if (likely(drbd_bio_uptodate(e->private_bio))) { - drbd_set_in_sync(mdev, sector, e->size); - ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); +/* + * e_end_resync_block() is called in asender context via + * drbd_finish_peer_reqs(). 
+ */ +STATIC int e_end_resync_block(struct drbd_work *w, int unused) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; + sector_t sector = peer_req->i.sector; + int err; + + D_ASSERT(drbd_interval_empty(&peer_req->i)); + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + drbd_set_in_sync(mdev, sector, peer_req->i.size); + err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req); } else { /* Record failure to sync */ - drbd_rs_failed_io(mdev, sector, e->size); + drbd_rs_failed_io(mdev, sector, peer_req->i.size); - ok = drbd_send_ack(mdev, P_NEG_ACK, e); + err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); } dec_unacked(mdev); - return ok; + return err; } STATIC int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) { - struct drbd_epoch_entry *e; + struct drbd_peer_request *peer_req; - e = read_in_block(mdev, ID_SYNCER, sector, data_size); - if (!e) { - put_ldev(mdev); - return FALSE; - } + peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size); + if (!peer_req) + goto fail; dec_rs_pending(mdev); - e->private_bio->bi_end_io = drbd_endio_write_sec; - e->private_bio->bi_rw = WRITE; - e->w.cb = e_end_resync_block; - inc_unacked(mdev); /* corresponding dec_unacked() in e_end_resync_block() * respective _drbd_clear_done_ee */ - spin_lock_irq(&mdev->req_lock); - list_add(&e->w.list, &mdev->sync_ee); - spin_unlock_irq(&mdev->req_lock); + peer_req->w.cb = e_end_resync_block; + + spin_lock_irq(&mdev->tconn->req_lock); + list_add(&peer_req->w.list, &mdev->sync_ee); + spin_unlock_irq(&mdev->tconn->req_lock); + + atomic_add(data_size >> 9, &mdev->rs_sect_ev); + if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) + return 0; - trace_drbd_ee(mdev, e, "submitting for (rs)write"); - trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); - drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); - /* accounting done in endio */ + /* 
don't care for the reason here */ + dev_err(DEV, "submit failed, triggering re-connect\n"); + spin_lock_irq(&mdev->tconn->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&mdev->tconn->req_lock); - maybe_kick_lo(mdev); - return TRUE; + drbd_free_peer_req(mdev, peer_req); +fail: + put_ldev(mdev); + return -EIO; } -STATIC int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) +static struct drbd_request * +find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id, + sector_t sector, bool missing_ok, const char *func) { struct drbd_request *req; - sector_t sector; - unsigned int header_size, data_size; - int ok; - struct p_data *p = (struct p_data *)h; - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; + /* Request object according to our peer */ + req = (struct drbd_request *)(unsigned long)id; + if (drbd_contains_interval(root, sector, &req->i) && req->i.local) + return req; + if (!missing_ok) { + dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func, + (unsigned long)id, (unsigned long long)sector); + } + return NULL; +} - ERR_IF(data_size == 0) return FALSE; +STATIC int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi) +{ + struct drbd_conf *mdev; + struct drbd_request *req; + sector_t sector; + int err; + struct p_data *p = pi->data; - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; sector = be64_to_cpu(p->sector); - spin_lock_irq(&mdev->req_lock); - req = _ar_id_to_req(mdev, p->block_id, sector); - spin_unlock_irq(&mdev->req_lock); - if (unlikely(!req)) { - dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); - return FALSE; - } + spin_lock_irq(&mdev->tconn->req_lock); + req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__); + spin_unlock_irq(&mdev->tconn->req_lock); + if (unlikely(!req)) + return -EIO; - /* hlist_del(&req->colision) is done 
in _req_may_be_done, to avoid + /* drbd_remove_request_interval() is done in _req_may_be_done, to avoid * special casing it there for the various failure cases. * still no race with drbd_fail_pending_reads */ - ok = recv_dless_read(mdev, req, sector, data_size); - - if (ok) - req_mod(req, data_received); + err = recv_dless_read(mdev, req, sector, pi->size); + if (!err) + req_mod(req, DATA_RECEIVED); /* else: nothing. handled from drbd_disconnect... * I don't think we may complete this just yet * in case we are "on-disconnect: freeze" */ - return ok; + return err; } -STATIC int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi) { + struct drbd_conf *mdev; sector_t sector; - unsigned int header_size, data_size; - int ok; - struct p_data *p = (struct p_data *)h; - - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; + int err; + struct p_data *p = pi->data; - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; sector = be64_to_cpu(p->sector); D_ASSERT(p->block_id == ID_SYNCER); @@ -1570,47 +1890,96 @@ if (get_ldev(mdev)) { /* data is submitted to disk within recv_resync_read. * corresponding put_ldev done below on error, - * or in drbd_endio_write_sec. */ - ok = recv_resync_read(mdev, sector, data_size); + * or in drbd_peer_request_endio. 
*/ + err = recv_resync_read(mdev, sector, pi->size); } else { if (DRBD_ratelimit(5*HZ, 5)) dev_err(DEV, "Can not write resync data to local disk.\n"); - ok = drbd_drain_block(mdev, data_size); + err = drbd_drain_block(mdev, pi->size); + + drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); + } + + atomic_add(pi->size >> 9, &mdev->rs_sect_in); + + return err; +} + +static int w_restart_write(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_conf *mdev = w->mdev; + struct bio *bio; + unsigned long start_time; + unsigned long flags; - drbd_send_ack_dp(mdev, P_NEG_ACK, p); + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + if (!expect(req->rq_state & RQ_POSTPONED)) { + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + return -EIO; } + bio = req->master_bio; + start_time = req->start_time; + /* Postponed requests will not have their master_bio completed! */ + __req_mod(req, DISCARD_WRITE, NULL); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + + while (__drbd_make_request(mdev, bio, start_time)) + /* retry */ ; + return 0; +} + +static void restart_conflicting_writes(struct drbd_conf *mdev, + sector_t sector, int size) +{ + struct drbd_interval *i; + struct drbd_request *req; - return ok; + drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) + continue; + if (expect(list_empty(&req->w.list))) { + req->w.mdev = mdev; + req->w.cb = w_restart_write; + drbd_queue_work(&mdev->tconn->data.work, &req->w); + } + } } -/* e_end_block() is called via drbd_process_done_ee(). - * this means this function only runs in the asender thread +/* + * e_end_block() is called in asender context via drbd_finish_peer_reqs(). 
*/ -STATIC int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +STATIC int e_end_block(struct drbd_work *w, int cancel) { - struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; - sector_t sector = e->sector; + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; + sector_t sector = peer_req->i.sector; struct drbd_epoch *epoch; - int ok = 1, pcmd; + int err = 0, pcmd; - if (e->flags & EE_IS_BARRIER) { - epoch = previous_epoch(mdev, e->epoch); + if (peer_req->flags & EE_IS_BARRIER) { + epoch = previous_epoch(mdev->tconn, peer_req->epoch); if (epoch) - drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); + drbd_may_finish_epoch(mdev->tconn, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); } - if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (peer_req->flags & EE_SEND_WRITE_ACK) { + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { pcmd = (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn <= C_PAUSED_SYNC_T && - e->flags & EE_MAY_SET_IN_SYNC) ? + peer_req->flags & EE_MAY_SET_IN_SYNC) ? P_RS_WRITE_ACK : P_WRITE_ACK; - ok &= drbd_send_ack(mdev, pcmd, e); + err = drbd_send_ack(mdev, pcmd, peer_req); if (pcmd == P_RS_WRITE_ACK) - drbd_set_in_sync(mdev, sector, e->size); + drbd_set_in_sync(mdev, sector, peer_req->i.size); } else { - ok = drbd_send_ack(mdev, P_NEG_ACK, e); + err = drbd_send_ack(mdev, P_NEG_ACK, peer_req); /* we expect it to be marked out of sync anyways... * maybe assert this? */ } @@ -1618,36 +1987,93 @@ } /* we delete from the conflict detection hash _after_ we sent out the * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. 
*/ - if (mdev->net_conf->two_primaries) { - spin_lock_irq(&mdev->req_lock); - D_ASSERT(!hlist_unhashed(&e->colision)); - hlist_del_init(&e->colision); - spin_unlock_irq(&mdev->req_lock); - } else { - D_ASSERT(hlist_unhashed(&e->colision)); - } + if (peer_req->flags & EE_IN_INTERVAL_TREE) { + spin_lock_irq(&mdev->tconn->req_lock); + D_ASSERT(!drbd_interval_empty(&peer_req->i)); + drbd_remove_epoch_entry_interval(mdev, peer_req); + if (peer_req->flags & EE_RESTART_REQUESTS) + restart_conflicting_writes(mdev, sector, peer_req->i.size); + spin_unlock_irq(&mdev->tconn->req_lock); + } else + D_ASSERT(drbd_interval_empty(&peer_req->i)); + + drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + + return err; +} + +static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) +{ + struct drbd_conf *mdev = w->mdev; + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + int err; - drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + err = drbd_send_ack(mdev, ack, peer_req); + dec_unacked(mdev); - return ok; + return err; +} + +static int e_send_discard_write(struct drbd_work *w, int unused) +{ + return e_send_ack(w, P_DISCARD_WRITE); } -STATIC int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) +static int e_send_retry_write(struct drbd_work *w, int unused) { - struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; - int ok = 1; + struct drbd_tconn *tconn = w->mdev->tconn; - D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); - ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); + return e_send_ack(w, tconn->agreed_pro_version >= 100 ? + P_RETRY_WRITE : P_DISCARD_WRITE); +} - spin_lock_irq(&mdev->req_lock); - D_ASSERT(!hlist_unhashed(&e->colision)); - hlist_del_init(&e->colision); - spin_unlock_irq(&mdev->req_lock); +static bool seq_greater(u32 a, u32 b) +{ + /* + * We assume 32-bit wrap-around here. 
+ * For 24-bit wrap-around, we would have to shift: + * a <<= 8; b <<= 8; + */ + return (s32)a - (s32)b > 0; +} - dec_unacked(mdev); +static u32 seq_max(u32 a, u32 b) +{ + return seq_greater(a, b) ? a : b; +} + +static bool need_peer_seq(struct drbd_conf *mdev) +{ + struct drbd_tconn *tconn = mdev->tconn; + int tp; + + /* + * We only need to keep track of the last packet_seq number of our peer + * if we are in dual-primary mode and we have the discard flag set; see + * handle_write_conflicts(). + */ + + rcu_read_lock(); + tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; + rcu_read_unlock(); - return ok; + return tp && test_bit(DISCARD_CONCURRENT, &tconn->flags); +} + +static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) +{ + unsigned int newest_peer_seq; + + if (need_peer_seq(mdev)) { + spin_lock(&mdev->peer_seq_lock); + newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); + mdev->peer_seq = newest_peer_seq; + spin_unlock(&mdev->peer_seq_lock); + /* wake up only if we actually changed mdev->peer_seq */ + if (peer_seq == newest_peer_seq) + wake_up(&mdev->seq_wait); + } } /* Called from receive_Data. @@ -1671,468 +2097,619 @@ * * returns 0 if we may process the packet, * -ERESTARTSYS if we were interrupted (by disconnect signal). 
*/ -static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) +static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq) { DEFINE_WAIT(wait); - unsigned int p_seq; long timeout; - int ret = 0; + int ret; + + if (!need_peer_seq(mdev)) + return 0; + spin_lock(&mdev->peer_seq_lock); for (;;) { - prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); - if (seq_le(packet_seq, mdev->peer_seq+1)) + if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { + mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); + ret = 0; break; + } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } - p_seq = mdev->peer_seq; + prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); spin_unlock(&mdev->peer_seq_lock); - timeout = schedule_timeout(30*HZ); + rcu_read_lock(); + timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10; + rcu_read_unlock(); + timeout = schedule_timeout(timeout); spin_lock(&mdev->peer_seq_lock); - if (timeout == 0 && p_seq == mdev->peer_seq) { + if (!timeout) { ret = -ETIMEDOUT; - dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); + dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n"); break; } } - finish_wait(&mdev->seq_wait, &wait); - if (mdev->peer_seq+1 == packet_seq) - mdev->peer_seq++; spin_unlock(&mdev->peer_seq_lock); + finish_wait(&mdev->seq_wait, &wait); return ret; } +/* see also bio_flags_to_wire() + * DRBD_REQ_*, because we need to semantically map the flags to data packet + * flags and back. We may replicate to other kernel versions. */ +static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf) +{ + if (mdev->tconn->agreed_pro_version >= 95) + return (dpf & DP_RW_SYNC ? DRBD_REQ_SYNC : 0) | + (dpf & DP_FUA ? DRBD_REQ_FUA : 0) | + (dpf & DP_FLUSH ? DRBD_REQ_FLUSH : 0) | + (dpf & DP_DISCARD ? DRBD_REQ_DISCARD : 0); + + /* else: we used to communicate one bit only in older DRBD */ + return dpf & DP_RW_SYNC ? 
DRBD_REQ_SYNC : 0; +} + +static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector, + unsigned int size) +{ + struct drbd_interval *i; + + repeat: + drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { + struct drbd_request *req; + struct bio_and_error m; + + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (!(req->rq_state & RQ_POSTPONED)) + continue; + req->rq_state &= ~RQ_POSTPONED; + __req_mod(req, NEG_ACKED, &m); + spin_unlock_irq(&mdev->tconn->req_lock); + if (m.bio) + complete_master_bio(mdev, &m); + spin_lock_irq(&mdev->tconn->req_lock); + goto repeat; + } +} + +static int handle_write_conflicts(struct drbd_conf *mdev, + struct drbd_peer_request *peer_req) +{ + struct drbd_tconn *tconn = mdev->tconn; + bool resolve_conflicts = test_bit(DISCARD_CONCURRENT, &tconn->flags); + sector_t sector = peer_req->i.sector; + const unsigned int size = peer_req->i.size; + struct drbd_interval *i; + bool equal; + int err; + + /* + * Inserting the peer request into the write_requests tree will prevent + * new conflicting local requests from being added. + */ + drbd_insert_interval(&mdev->write_requests, &peer_req->i); + + repeat: + drbd_for_each_overlap(i, &mdev->write_requests, sector, size) { + if (i == &peer_req->i) + continue; + + if (!i->local) { + /* + * Our peer has sent a conflicting remote request; this + * should not happen in a two-node setup. Wait for the + * earlier peer request to complete. + */ + err = drbd_wait_misc(mdev, i); + if (err) + goto out; + goto repeat; + } + + equal = i->sector == sector && i->size == size; + if (resolve_conflicts) { + /* + * If the peer request is fully contained within the + * overlapping request, it can be discarded; otherwise, + * it will be retried once all overlapping requests + * have completed. 
+ */ + bool discard = i->sector <= sector && i->sector + + (i->size >> 9) >= sector + (size >> 9); + + if (!equal) + dev_alert(DEV, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u, " + "assuming %s came first\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size, + discard ? "local" : "remote"); + + inc_unacked(mdev); + peer_req->w.cb = discard ? e_send_discard_write : + e_send_retry_write; + list_add_tail(&peer_req->w.list, &mdev->done_ee); + wake_asender(mdev->tconn); + + err = -ENOENT; + goto out; + } else { + struct drbd_request *req = + container_of(i, struct drbd_request, i); + + if (!equal) + dev_alert(DEV, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size); + + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) { + /* + * Wait for the node with the discard flag to + * decide if this request will be discarded or + * retried. Requests that are discarded will + * disappear from the write_requests tree. + * + * In addition, wait for the conflicting + * request to finish locally before submitting + * the conflicting peer request. + */ + err = drbd_wait_misc(mdev, &req->i); + if (err) { + _conn_request_state(mdev->tconn, + NS(conn, C_TIMEOUT), + CS_HARD); + fail_postponed_requests(mdev, sector, size); + goto out; + } + goto repeat; + } + /* + * Remember to restart the conflicting requests after + * the new peer request has completed. 
+ */ + peer_req->flags |= EE_RESTART_REQUESTS; + } + } + err = 0; + + out: + if (err) + drbd_remove_epoch_entry_interval(mdev, peer_req); + return err; +} + /* mirrored write */ -STATIC int receive_Data(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) { + struct drbd_conf *mdev; sector_t sector; - struct drbd_epoch_entry *e; - struct p_data *p = (struct p_data *)h; - int header_size, data_size; + struct drbd_peer_request *peer_req; + struct p_data *p = pi->data; + u32 peer_seq = be32_to_cpu(p->seq_num); int rw = WRITE; u32 dp_flags; + int err, tp; - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - ERR_IF(data_size == 0) return FALSE; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; if (!get_ldev(mdev)) { - if (DRBD_ratelimit(5*HZ, 5)) - dev_err(DEV, "Can not write mirrored data block " - "to local disk.\n"); - spin_lock(&mdev->peer_seq_lock); - if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) - mdev->peer_seq++; - spin_unlock(&mdev->peer_seq_lock); + int err2; - drbd_send_ack_dp(mdev, P_NEG_ACK, p); - atomic_inc(&mdev->current_epoch->epoch_size); - return drbd_drain_block(mdev, data_size); + err = wait_for_and_update_peer_seq(mdev, peer_seq); + drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size); + atomic_inc(&tconn->current_epoch->epoch_size); + err2 = drbd_drain_block(mdev, pi->size); + if (!err) + err = err2; + return err; } - /* get_ldev(mdev) successful. - * Corresponding put_ldev done either below (on various errors), - * or in drbd_endio_write_sec, if we successfully submit the data at - * the end of this function. */ + /* + * Corresponding put_ldev done either below (on various errors), or in + * drbd_peer_request_endio, if we successfully submit the data at the + * end of this function. 
+ */ sector = be64_to_cpu(p->sector); - e = read_in_block(mdev, p->block_id, sector, data_size); - if (!e) { + peer_req = read_in_block(mdev, p->block_id, sector, pi->size); + if (!peer_req) { put_ldev(mdev); - return FALSE; + return -EIO; } - e->private_bio->bi_end_io = drbd_endio_write_sec; - e->w.cb = e_end_block; + peer_req->w.cb = e_end_block; + + dp_flags = be32_to_cpu(p->dp_flags); + rw |= wire_flags_to_bio(mdev, dp_flags); + + if (dp_flags & DP_MAY_SET_IN_SYNC) + peer_req->flags |= EE_MAY_SET_IN_SYNC; + + /* last "fixes" to rw flags. + * Strip off BIO_RW_BARRIER unconditionally, + * it is not supposed to be here anyways. + * (Was FUA or FLUSH on the peer, + * and got translated to BARRIER on this side). + * Note that the epoch handling code below + * may add it again, though. + */ + rw &= ~DRBD_REQ_HARDBARRIER; - spin_lock(&mdev->epoch_lock); - e->epoch = mdev->current_epoch; - atomic_inc(&e->epoch->epoch_size); - atomic_inc(&e->epoch->active); + spin_lock(&tconn->epoch_lock); + peer_req->epoch = tconn->current_epoch; + atomic_inc(&peer_req->epoch->epoch_size); + atomic_inc(&peer_req->epoch->active); - if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { + if (mdev->tconn->write_ordering == WO_bio_barrier && + atomic_read(&peer_req->epoch->epoch_size) == 1) { struct drbd_epoch *epoch; /* Issue a barrier if we start a new epoch, and the previous epoch was not a epoch containing a single request which already was a Barrier. 
*/ - epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); - if (epoch == e->epoch) { - set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); - rw |= (1<flags |= EE_IS_BARRIER; + epoch = list_entry(peer_req->epoch->list.prev, struct drbd_epoch, list); + if (epoch == peer_req->epoch) { + set_bit(DE_CONTAINS_A_BARRIER, &peer_req->epoch->flags); + rw |= DRBD_REQ_FLUSH | DRBD_REQ_FUA; + peer_req->flags |= EE_IS_BARRIER; } else { if (atomic_read(&epoch->epoch_size) > 1 || !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); - trace_drbd_epoch(mdev, epoch, EV_TRACE_SETTING_BI); - set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); - rw |= (1<flags |= EE_IS_BARRIER; + set_bit(DE_CONTAINS_A_BARRIER, &peer_req->epoch->flags); + rw |= DRBD_REQ_FLUSH | DRBD_REQ_FUA; + peer_req->flags |= EE_IS_BARRIER; } } } - spin_unlock(&mdev->epoch_lock); - - dp_flags = be32_to_cpu(p->dp_flags); - if (dp_flags & DP_HARDBARRIER) { - dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); - /* rw |= (1<flags |= EE_MAY_SET_IN_SYNC; + spin_unlock(&tconn->epoch_lock); - /* I'm the receiver, I do hold a net_cnt reference. */ - if (!mdev->net_conf->two_primaries) { - spin_lock_irq(&mdev->req_lock); - } else { - /* don't get the req_lock yet, - * we may sleep in drbd_wait_peer_seq */ - const int size = e->size; - const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); - DEFINE_WAIT(wait); - struct drbd_request *i; - struct hlist_node *n; - struct hlist_head *slot; - int first; - - D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); - BUG_ON(mdev->ee_hash == NULL); - BUG_ON(mdev->tl_hash == NULL); - - /* conflict detection and handling: - * 1. wait on the sequence number, - * in case this data packet overtook ACK packets. - * 2. check our hash tables for conflicting requests. 
- * we only need to walk the tl_hash, since an ee can not - * have a conflict with an other ee: on the submitting - * node, the corresponding req had already been conflicting, - * and a conflicting req is never sent. - * - * Note: for two_primaries, we are protocol C, - * so there cannot be any request that is DONE - * but still on the transfer log. - * - * unconditionally add to the ee_hash. - * - * if no conflicting request is found: - * submit. - * - * if any conflicting request is found - * that has not yet been acked, - * AND I have the "discard concurrent writes" flag: - * queue (via done_ee) the P_DISCARD_ACK; OUT. - * - * if any conflicting request is found: - * block the receiver, waiting on misc_wait - * until no more conflicting requests are there, - * or we get interrupted (disconnect). - * - * we do not just write after local io completion of those - * requests, but only after req is done completely, i.e. - * we wait for the P_DISCARD_ACK to arrive! - * - * then proceed normally, i.e. submit. - */ - if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) + rcu_read_lock(); + tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; + rcu_read_unlock(); + if (tp) { + peer_req->flags |= EE_IN_INTERVAL_TREE; + err = wait_for_and_update_peer_seq(mdev, peer_seq); + if (err) goto out_interrupted; - - spin_lock_irq(&mdev->req_lock); - - hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); - -#define OVERLAPS overlaps(i->sector, i->size, sector, size) - slot = tl_hash_slot(mdev, sector); - first = 1; - for (;;) { - int have_unacked = 0; - int have_conflict = 0; - prepare_to_wait(&mdev->misc_wait, &wait, - TASK_INTERRUPTIBLE); - hlist_for_each_entry(i, n, slot, colision) { - if (OVERLAPS) { - /* only ALERT on first iteration, - * we may be woken up early... */ - if (first) - dev_alert(DEV, "%s[%u] Concurrent local write detected!" 
- " new: %llus +%u; pending: %llus +%u\n", - current->comm, current->pid, - (unsigned long long)sector, size, - (unsigned long long)i->sector, i->size); - if (i->rq_state & RQ_NET_PENDING) - ++have_unacked; - ++have_conflict; - } - } -#undef OVERLAPS - if (!have_conflict) - break; - - /* Discard Ack only for the _first_ iteration */ - if (first && discard && have_unacked) { - dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", - (unsigned long long)sector); - inc_unacked(mdev); - e->w.cb = e_send_discard_ack; - list_add_tail(&e->w.list, &mdev->done_ee); - - spin_unlock_irq(&mdev->req_lock); - - /* we could probably send that P_DISCARD_ACK ourselves, - * but I don't like the receiver using the msock */ - + spin_lock_irq(&mdev->tconn->req_lock); + err = handle_write_conflicts(mdev, peer_req); + if (err) { + spin_unlock_irq(&mdev->tconn->req_lock); + if (err == -ENOENT) { put_ldev(mdev); - wake_asender(mdev); - finish_wait(&mdev->misc_wait, &wait); - return TRUE; - } - - if (signal_pending(current)) { - hlist_del_init(&e->colision); - - spin_unlock_irq(&mdev->req_lock); - - finish_wait(&mdev->misc_wait, &wait); - goto out_interrupted; + return 0; } - - spin_unlock_irq(&mdev->req_lock); - if (first) { - first = 0; - dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " - "sec=%llus\n", (unsigned long long)sector); - } else if (discard) { - /* we had none on the first iteration. - * there must be none now. 
*/ - D_ASSERT(have_unacked == 0); - } - schedule(); - spin_lock_irq(&mdev->req_lock); + goto out_interrupted; + } + } else + spin_lock_irq(&mdev->tconn->req_lock); + list_add(&peer_req->w.list, &mdev->active_ee); + spin_unlock_irq(&mdev->tconn->req_lock); + + if (mdev->tconn->agreed_pro_version < 100) { + rcu_read_lock(); + switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) { + case DRBD_PROT_C: + dp_flags |= DP_SEND_WRITE_ACK; + break; + case DRBD_PROT_B: + dp_flags |= DP_SEND_RECEIVE_ACK; + break; } - finish_wait(&mdev->misc_wait, &wait); + rcu_read_unlock(); } - list_add(&e->w.list, &mdev->active_ee); - spin_unlock_irq(&mdev->req_lock); - - switch (mdev->net_conf->wire_protocol) { - case DRBD_PROT_C: + if (dp_flags & DP_SEND_WRITE_ACK) { + peer_req->flags |= EE_SEND_WRITE_ACK; inc_unacked(mdev); /* corresponding dec_unacked() in e_end_block() * respective _drbd_clear_done_ee */ - break; - case DRBD_PROT_B: + } + + if (dp_flags & DP_SEND_RECEIVE_ACK) { /* I really don't like it that the receiver thread * sends on the msock, but anyways */ - drbd_send_ack(mdev, P_RECV_ACK, e); - break; - case DRBD_PROT_A: - /* nothing to do */ - break; + drbd_send_ack(mdev, P_RECV_ACK, peer_req); } - if (mdev->state.pdsk == D_DISKLESS) { + if (mdev->state.pdsk < D_INCONSISTENT) { /* In case we have the only disk of the cluster, */ - drbd_set_out_of_sync(mdev, e->sector, e->size); - e->flags |= EE_CALL_AL_COMPLETE_IO; - drbd_al_begin_io(mdev, e->sector); + drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); + peer_req->flags |= EE_CALL_AL_COMPLETE_IO; + peer_req->flags &= ~EE_MAY_SET_IN_SYNC; + drbd_al_begin_io(mdev, &peer_req->i); } - e->private_bio->bi_rw = rw; - trace_drbd_ee(mdev, e, "submitting for (data)write"); - trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); - drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); - /* accounting done in endio */ + err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); + if (!err) 
+ return 0; - maybe_kick_lo(mdev); - return TRUE; + /* don't care for the reason here */ + dev_err(DEV, "submit failed, triggering re-connect\n"); + spin_lock_irq(&mdev->tconn->req_lock); + list_del(&peer_req->w.list); + drbd_remove_epoch_entry_interval(mdev, peer_req); + spin_unlock_irq(&mdev->tconn->req_lock); + if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) + drbd_al_complete_io(mdev, &peer_req->i); out_interrupted: - /* yes, the epoch_size now is imbalanced. - * but we drop the connection anyways, so we don't have a chance to - * receive a barrier... atomic_inc(&mdev->epoch_size); */ + drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP); put_ldev(mdev); - drbd_free_ee(mdev, e); - return FALSE; + drbd_free_peer_req(mdev, peer_req); + return err; +} + +/* We may throttle resync, if the lower device seems to be busy, + * and current sync rate is above c_min_rate. + * + * To decide whether or not the lower device is busy, we use a scheme similar + * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" + * (more than 64 sectors) of activity we cannot account for with our own resync + * activity, it obviously is "busy". + * + * The current sync rate used here uses only the most recent two step marks, + * to have a short time average so we can react faster. + */ +int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector) +{ + unsigned long db, dt, dbdt; + struct lc_element *tmp; + int curr_events; + int throttle = 0; + unsigned int c_min_rate; + + rcu_read_lock(); + c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? 
*/ + if (c_min_rate == 0) + return 0; + + spin_lock_irq(&mdev->al_lock); + tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector)); + if (tmp) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_PRIORITY, &bm_ext->flags)) { + spin_unlock_irq(&mdev->al_lock); + return 0; + } + /* Do not slow down if app IO is already waiting for this extent */ + } + spin_unlock_irq(&mdev->al_lock); + + curr_events = drbd_backing_bdev_events(mdev) + - atomic_read(&mdev->rs_sect_ev); + + if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) { + unsigned long rs_left; + int i; + + mdev->rs_last_events = curr_events; + + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, + * approx. */ + i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + + if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) + rs_left = mdev->ov_left; + else + rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + + dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = mdev->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + + if (dbdt > c_min_rate) + throttle = 1; + } + return throttle; } -STATIC int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) + +STATIC int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi) { + struct drbd_conf *mdev; sector_t sector; - const sector_t capacity = drbd_get_capacity(mdev->this_bdev); - struct drbd_epoch_entry *e; + sector_t capacity; + struct drbd_peer_request *peer_req; struct digest_info *di = NULL; - int size, digest_size; + int size, verb; unsigned int fault_type; - struct p_block_req *p = - (struct p_block_req *)h; - const int brps = sizeof(*p)-sizeof(*h); + struct p_block_req *p = pi->data; - if (drbd_recv(mdev, h->payload, brps) != brps) - return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + capacity = drbd_get_capacity(mdev->this_bdev); sector = be64_to_cpu(p->sector); size = 
be32_to_cpu(p->blksize); - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, (unsigned long long)sector, size); - return FALSE; + return -EINVAL; } if (sector + (size>>9) > capacity) { dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, (unsigned long long)sector, size); - return FALSE; + return -EINVAL; } if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { - if (DRBD_ratelimit(5*HZ, 5)) + verb = 1; + switch (pi->cmd) { + case P_DATA_REQUEST: + drbd_send_ack_rp(mdev, P_NEG_DREPLY, p); + break; + case P_RS_DATA_REQUEST: + case P_CSUM_RS_REQUEST: + case P_OV_REQUEST: + drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p); + break; + case P_OV_REPLY: + verb = 0; + dec_rs_pending(mdev); + drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC); + break; + default: + BUG(); + } + if (verb && DRBD_ratelimit(5*HZ, 5)) dev_err(DEV, "Can not satisfy peer's read request, " "no local data.\n"); - drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : - P_NEG_RS_DREPLY , p); - return TRUE; + + /* drain possibly payload */ + return drbd_drain_block(mdev, pi->size); } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. 
*/ - e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); - if (!e) { + peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO); + if (!peer_req) { put_ldev(mdev); - return FALSE; + return -ENOMEM; } - e->private_bio->bi_rw = READ; - e->private_bio->bi_end_io = drbd_endio_read_sec; - - switch (h->command) { + switch (pi->cmd) { case P_DATA_REQUEST: - e->w.cb = w_e_end_data_req; + peer_req->w.cb = w_e_end_data_req; fault_type = DRBD_FAULT_DT_RD; - break; + /* application IO, don't drbd_rs_begin_io */ + goto submit; + case P_RS_DATA_REQUEST: - e->w.cb = w_e_end_rsdata_req; + peer_req->w.cb = w_e_end_rsdata_req; fault_type = DRBD_FAULT_RS_RD; - /* Eventually this should become asynchronously. Currently it - * blocks the whole receiver just to delay the reading of a - * resync data block. - * the drbd_work_queue mechanism is made for this... - */ - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, - * probably connection lost! */ - D_ASSERT(signal_pending(current)); - goto out_free_e; - } + /* used in the sector offset progress display */ + mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); break; case P_OV_REPLY: case P_CSUM_RS_REQUEST: fault_type = DRBD_FAULT_RS_RD; - digest_size = h->length - brps ; - di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); + di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); if (!di) goto out_free_e; - di->digest_size = digest_size; + di->digest_size = pi->size; di->digest = (((char *)di)+sizeof(struct digest_info)); - if (drbd_recv(mdev, di->digest, digest_size) != digest_size) + peer_req->digest = di; + peer_req->flags |= EE_HAS_DIGEST; + + if (drbd_recv_all(mdev->tconn, di->digest, pi->size)) goto out_free_e; - e->block_id = (u64)(unsigned long)di; - if (h->command == P_CSUM_RS_REQUEST) { - D_ASSERT(mdev->agreed_pro_version >= 89); - e->w.cb = w_e_end_csum_rs_req; - } else if (h->command == P_OV_REPLY) { - e->w.cb = w_e_end_ov_reply; + if (pi->cmd == P_CSUM_RS_REQUEST) { + 
D_ASSERT(mdev->tconn->agreed_pro_version >= 89); + peer_req->w.cb = w_e_end_csum_rs_req; + /* used in the sector offset progress display */ + mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); + } else if (pi->cmd == P_OV_REPLY) { + /* track progress, we may need to throttle */ + atomic_add(size >> 9, &mdev->rs_sect_in); + peer_req->w.cb = w_e_end_ov_reply; dec_rs_pending(mdev); - break; - } - - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, probably connection lost! */ - D_ASSERT(signal_pending(current)); - goto out_free_e; + /* drbd_rs_begin_io done when we sent this request, + * but accounting still needs to be done. */ + goto submit_for_resync; } break; case P_OV_REQUEST: - if (mdev->state.conn >= C_CONNECTED && - mdev->state.conn != C_VERIFY_T) - dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", - drbd_conn_str(mdev->state.conn)); if (mdev->ov_start_sector == ~(sector_t)0 && - mdev->agreed_pro_version >= 90) { + mdev->tconn->agreed_pro_version >= 90) { + unsigned long now = jiffies; + int i; mdev->ov_start_sector = sector; mdev->ov_position = sector; - mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); + mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector); + mdev->rs_total = mdev->ov_left; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = mdev->ov_left; + mdev->rs_mark_time[i] = now; + } dev_info(DEV, "Online Verify start sector: %llu\n", (unsigned long long)sector); } - e->w.cb = w_e_end_ov_req; + peer_req->w.cb = w_e_end_ov_req; fault_type = DRBD_FAULT_RS_RD; - /* Eventually this should become asynchronous. Currently it - * blocks the whole receiver just to delay the reading of a - * resync data block. - * the drbd_work_queue mechanism is made for this... - */ - if (!drbd_rs_begin_io(mdev, sector)) { - /* we have been interrupted, - * probably connection lost! 
*/ - D_ASSERT(signal_pending(current)); - goto out_free_e; - } break; - default: - dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", - cmdname(h->command)); - fault_type = DRBD_FAULT_MAX; + BUG(); } - spin_lock_irq(&mdev->req_lock); - list_add(&e->w.list, &mdev->read_ee); - spin_unlock_irq(&mdev->req_lock); + /* Throttle, drbd_rs_begin_io and submit should become asynchronous + * wrt the receiver, but it is not as straightforward as it may seem. + * Various places in the resync start and stop logic assume resync + * requests are processed in order, requeuing this on the worker thread + * introduces a bunch of new code for synchronization between threads. + * + * Unlimited throttling before drbd_rs_begin_io may stall the resync + * "forever", throttling after drbd_rs_begin_io will lock that extent + * for application writes for the same time. For now, just throttle + * here, where the rest of the code expects the receiver to sleep for + * a while, anyways. + */ + + /* Throttle before drbd_rs_begin_io, as that locks out application IO; + * this defers syncer requests for some time, before letting at least + * on request through. The resync controller on the receiving side + * will adapt to the incoming rate accordingly. + * + * We cannot throttle here if remote is Primary/SyncTarget: + * we would also throttle its application reads. + * In that case, throttling is done on the SyncTarget only. 
+ */ + if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector)) + schedule_timeout_uninterruptible(HZ/10); + if (drbd_rs_begin_io(mdev, sector)) + goto out_free_e; + +submit_for_resync: + atomic_add(size >> 9, &mdev->rs_sect_ev); +submit: inc_unacked(mdev); + spin_lock_irq(&mdev->tconn->req_lock); + list_add_tail(&peer_req->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->tconn->req_lock); - trace_drbd_ee(mdev, e, "submitting for read"); - trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); - drbd_generic_make_request(mdev, fault_type, e->private_bio); - maybe_kick_lo(mdev); + if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0) + return 0; - return TRUE; + /* don't care for the reason here */ + dev_err(DEV, "submit failed, triggering re-connect\n"); + spin_lock_irq(&mdev->tconn->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&mdev->tconn->req_lock); + /* no drbd_rs_complete_io(), we are dropping the connection anyways */ out_free_e: - kfree(di); put_ldev(mdev); - drbd_free_ee(mdev, e); - return FALSE; + drbd_free_peer_req(mdev, peer_req); + return -EIO; } STATIC int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) { int self, peer, rv = -100; unsigned long ch_self, ch_peer; + enum drbd_after_sb_p after_sb_0p; self = mdev->ldev->md.uuid[UI_BITMAP] & 1; peer = mdev->p_uuid[UI_BITMAP] & 1; @@ -2140,10 +2717,14 @@ ch_peer = mdev->p_uuid[UI_SIZE]; ch_self = mdev->comm_bm_set; - switch (mdev->net_conf->after_sb_0p) { + rcu_read_lock(); + after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p; + rcu_read_unlock(); + switch (after_sb_0p) { case ASB_CONSENSUS: case ASB_DISCARD_SECONDARY: case ASB_CALL_HELPER: + case ASB_VIOLENTLY: dev_err(DEV, "Configuration error.\n"); break; case ASB_DISCONNECT: @@ -2172,14 +2753,14 @@ "Using discard-least-changes instead\n"); case ASB_DISCARD_ZERO_CHG: if (ch_peer == 0 && ch_self == 0) { - rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) + rv = 
test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags) ? -1 : 1; break; } else { if (ch_peer == 0) { rv = 1; break; } if (ch_self == 0) { rv = -1; break; } } - if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) + if (after_sb_0p == ASB_DISCARD_ZERO_CHG) break; case ASB_DISCARD_LEAST_CHG: if (ch_self < ch_peer) @@ -2188,7 +2769,7 @@ rv = 1; else /* ( ch_self == ch_peer ) */ /* Well, then use something else. */ - rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) + rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags) ? -1 : 1; break; case ASB_DISCARD_LOCAL: @@ -2203,17 +2784,19 @@ STATIC int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) { - int self, peer, hg, rv = -100; - - self = mdev->ldev->md.uuid[UI_BITMAP] & 1; - peer = mdev->p_uuid[UI_BITMAP] & 1; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_1p; - switch (mdev->net_conf->after_sb_1p) { + rcu_read_lock(); + after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p; + rcu_read_unlock(); + switch (after_sb_1p) { case ASB_DISCARD_YOUNGER_PRI: case ASB_DISCARD_OLDER_PRI: case ASB_DISCARD_LEAST_CHG: case ASB_DISCARD_LOCAL: case ASB_DISCARD_REMOTE: + case ASB_DISCARD_ZERO_CHG: dev_err(DEV, "Configuration error.\n"); break; case ASB_DISCONNECT: @@ -2233,12 +2816,14 @@ case ASB_CALL_HELPER: hg = drbd_asb_recover_0p(mdev); if (hg == -1 && mdev->state.role == R_PRIMARY) { - self = drbd_set_role(mdev, R_SECONDARY, 0); + enum drbd_state_rv rv2; + + drbd_set_role(mdev, R_SECONDARY, 0); /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, * we might be here in C_WF_REPORT_PARAMS which is transient. * we do not need to wait for the after state change work either. 
*/ - self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); - if (self != SS_SUCCESS) { + rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { drbd_khelper(mdev, "pri-lost-after-sb"); } else { dev_warn(DEV, "Successfully gave up primary role.\n"); @@ -2253,12 +2838,13 @@ STATIC int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) { - int self, peer, hg, rv = -100; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_2p; - self = mdev->ldev->md.uuid[UI_BITMAP] & 1; - peer = mdev->p_uuid[UI_BITMAP] & 1; - - switch (mdev->net_conf->after_sb_2p) { + rcu_read_lock(); + after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p; + rcu_read_unlock(); + switch (after_sb_2p) { case ASB_DISCARD_YOUNGER_PRI: case ASB_DISCARD_OLDER_PRI: case ASB_DISCARD_LEAST_CHG: @@ -2266,6 +2852,7 @@ case ASB_DISCARD_REMOTE: case ASB_CONSENSUS: case ASB_DISCARD_SECONDARY: + case ASB_DISCARD_ZERO_CHG: dev_err(DEV, "Configuration error.\n"); break; case ASB_VIOLENTLY: @@ -2276,11 +2863,13 @@ case ASB_CALL_HELPER: hg = drbd_asb_recover_0p(mdev); if (hg == -1) { + enum drbd_state_rv rv2; + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, * we might be here in C_WF_REPORT_PARAMS which is transient. * we do not need to wait for the after state change work either. 
*/ - self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); - if (self != SS_SUCCESS) { + rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { drbd_khelper(mdev, "pri-lost-after-sb"); } else { dev_warn(DEV, "Successfully gave up primary role.\n"); @@ -2319,6 +2908,8 @@ -2 C_SYNC_TARGET set BitMap -100 after split brain, disconnect -1000 unrelated data +-1091 requires proto 91 +-1096 requires proto 96 */ STATIC int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) { @@ -2347,8 +2938,8 @@ if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { - if (mdev->agreed_pro_version < 91) - return -1001; + if (mdev->tconn->agreed_pro_version < 91) + return -1091; if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { @@ -2368,8 +2959,8 @@ if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { - if (mdev->agreed_pro_version < 91) - return -1001; + if (mdev->tconn->agreed_pro_version < 91) + return -1091; if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { @@ -2401,7 +2992,7 @@ case 1: /* self_pri && !peer_pri */ return 1; case 2: /* !self_pri && peer_pri */ return -1; case 3: /* self_pri && peer_pri */ - dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); + dc = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags); return dc ? -1 : 1; } } @@ -2414,17 +3005,22 @@ *rule_nr = 51; peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); if (self == peer) { - self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); - peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); - if (self == peer) { + if (mdev->tconn->agreed_pro_version < 96 ? 
+ (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == + (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : + peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) { /* The last P_SYNC_UUID did not get though. Undo the last start of resync as sync source modifications of the peer's UUIDs. */ - if (mdev->agreed_pro_version < 91) - return -1001; + if (mdev->tconn->agreed_pro_version < 91) + return -1091; mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; + + dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); + drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); + return -1; } } @@ -2446,20 +3042,20 @@ *rule_nr = 71; self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); if (self == peer) { - self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); - peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); - if (self == peer) { + if (mdev->tconn->agreed_pro_version < 96 ? + (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == + (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) : + self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { /* The last P_SYNC_UUID did not get though. Undo the last start of resync as sync source modifications of our UUIDs. */ - if (mdev->agreed_pro_version < 91) - return -1001; + if (mdev->tconn->agreed_pro_version < 91) + return -1091; _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); - dev_info(DEV, "Undid last start of resync:\n"); - + dev_info(DEV, "Last syncUUID did not get through, corrected:\n"); drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->state.disk >= D_NEGOTIATING ? 
drbd_bm_total_weight(mdev) : 0, 0); @@ -2501,9 +3097,10 @@ STATIC enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, enum drbd_disk_state peer_disk) __must_hold(local) { - int hg, rule_nr; enum drbd_conns rv = C_MASK; enum drbd_disk_state mydisk; + struct net_conf *nc; + int hg, rule_nr, rr_conflict, tentative; mydisk = mdev->state.disk; if (mydisk == D_NEGOTIATING) @@ -2522,8 +3119,8 @@ dev_alert(DEV, "Unrelated data, aborting!\n"); return C_MASK; } - if (hg == -1001) { - dev_alert(DEV, "To resolve this both sides have to support at least protocol\n"); + if (hg < -1000) { + dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); return C_MASK; } @@ -2537,7 +3134,13 @@ hg > 0 ? "source" : "target"); } - if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { + if (abs(hg) == 100) + drbd_khelper(mdev, "initial-split-brain"); + + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + + if (hg == 100 || (hg == -100 && nc->always_asbp)) { int pcount = (mdev->state.role == R_PRIMARY) + (peer_role == R_PRIMARY); int forced = (hg == -100); @@ -2566,9 +3169,9 @@ } if (hg == -100) { - if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) + if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1)) hg = -1; - if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) + if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1)) hg = 1; if (abs(hg) < 100) @@ -2576,9 +3179,16 @@ "Sync from %s node\n", (hg < 0) ? "peer" : "this"); } + rr_conflict = nc->rr_conflict; + tentative = nc->tentative; + rcu_read_unlock(); if (hg == -100) { - dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); + /* FIXME this log message is not correct if we end up here + * after an attempted attach on a diskless node. + * We just refuse to attach -- well, we drop the "connection" + * to that disk, in a way... 
*/ + dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n"); drbd_khelper(mdev, "split-brain"); return C_MASK; } @@ -2590,7 +3200,7 @@ if (hg < 0 && /* by intention we do not use mydisk here. */ mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { - switch (mdev->net_conf->rr_conflict) { + switch (rr_conflict) { case ASB_CALL_HELPER: drbd_khelper(mdev, "pri-lost"); /* fall through */ @@ -2603,9 +3213,20 @@ } } + if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) { + if (hg == 0) + dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n"); + else + dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.", + drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), + abs(hg) >= 2 ? "full" : "bit-map based"); + return C_MASK; + } + if (abs(hg) >= 2) { dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); - if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", + BM_LOCKED_SET_ALLOWED)) return C_MASK; } @@ -2624,98 +3245,168 @@ return rv; } -/* returns 1 if invalid */ -STATIC int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) +STATIC enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) { /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ - if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || - (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) - return 0; + if (peer == ASB_DISCARD_REMOTE) + return ASB_DISCARD_LOCAL; /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ - if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || - self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) - return 1; + if (peer == ASB_DISCARD_LOCAL) + return ASB_DISCARD_REMOTE; /* everything else is valid if they are equal on both sides. 
*/ - if (peer == self) - return 0; - - /* everything es is invalid. */ - return 1; + return peer; } -STATIC int receive_protocol(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_protocol *p = (struct p_protocol *)h; - int header_size, data_size; - int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; - int p_want_lose, p_two_primaries; - char p_integrity_alg[SHARED_SECRET_MAX] = ""; - - header_size = sizeof(*p) - sizeof(*h); - data_size = h->length - header_size; - - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + struct p_protocol *p = pi->data; + enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_proto, p_discard_my_data, p_two_primaries, cf; + struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; + char integrity_alg[SHARED_SECRET_MAX] = ""; + struct crypto_hash *peer_integrity_tfm = NULL; + void *int_dig_in = NULL, *int_dig_vv = NULL; p_proto = be32_to_cpu(p->protocol); p_after_sb_0p = be32_to_cpu(p->after_sb_0p); p_after_sb_1p = be32_to_cpu(p->after_sb_1p); p_after_sb_2p = be32_to_cpu(p->after_sb_2p); - p_want_lose = be32_to_cpu(p->want_lose); p_two_primaries = be32_to_cpu(p->two_primaries); + cf = be32_to_cpu(p->conn_flags); + p_discard_my_data = cf & CF_DISCARD_MY_DATA; - if (p_proto != mdev->net_conf->wire_protocol) { - dev_err(DEV, "incompatible communication protocols\n"); - goto disconnect; - } + if (tconn->agreed_pro_version >= 87) { + int err; - if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { - dev_err(DEV, "incompatible after-sb-0pri settings\n"); - goto disconnect; + if (pi->size > sizeof(integrity_alg)) + return -EIO; + err = drbd_recv_all(tconn, integrity_alg, pi->size); + if (err) + return err; + integrity_alg[SHARED_SECRET_MAX - 1] = 0; } - if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { - dev_err(DEV, "incompatible after-sb-1pri settings\n"); - goto disconnect; - } + 
if (pi->cmd != P_PROTOCOL_UPDATE) { + clear_bit(CONN_DRY_RUN, &tconn->flags); - if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { - dev_err(DEV, "incompatible after-sb-2pri settings\n"); - goto disconnect; - } + if (cf & CF_DRY_RUN) + set_bit(CONN_DRY_RUN, &tconn->flags); - if (p_want_lose && mdev->net_conf->want_lose) { - dev_err(DEV, "both sides have the 'want_lose' flag set\n"); - goto disconnect; - } + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); - if (p_two_primaries != mdev->net_conf->two_primaries) { - dev_err(DEV, "incompatible setting of the two-primaries options\n"); - goto disconnect; + if (p_proto != nc->wire_protocol) { + conn_err(tconn, "incompatible %s settings\n", "protocol"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { + conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { + conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { + conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri"); + goto disconnect_rcu_unlock; + } + + if (p_discard_my_data && nc->discard_my_data) { + conn_err(tconn, "incompatible %s settings\n", "discard-my-data"); + goto disconnect_rcu_unlock; + } + + if (p_two_primaries != nc->two_primaries) { + conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries"); + goto disconnect_rcu_unlock; + } + + if (strcmp(integrity_alg, nc->integrity_alg)) { + conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg"); + goto disconnect_rcu_unlock; + } + + rcu_read_unlock(); } - if (mdev->agreed_pro_version >= 87) { - unsigned char *my_alg = mdev->net_conf->integrity_alg; + if (integrity_alg[0]) { + int hash_size; + + /* + * We can only change the peer data integrity algorithm + * here. 
Changing our own data integrity algorithm + * requires that we send a P_PROTOCOL_UPDATE packet at + * the same time; otherwise, the peer has no way to + * tell between which packets the algorithm should + * change. + */ - if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) - return FALSE; + peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (!peer_integrity_tfm) { + conn_err(tconn, "peer data-integrity-alg %s not supported\n", + integrity_alg); + goto disconnect; + } - p_integrity_alg[SHARED_SECRET_MAX-1] = 0; - if (strcmp(p_integrity_alg, my_alg)) { - dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); + hash_size = crypto_hash_digestsize(peer_integrity_tfm); + int_dig_in = kmalloc(hash_size, GFP_KERNEL); + int_dig_vv = kmalloc(hash_size, GFP_KERNEL); + if (!(int_dig_in && int_dig_vv)) { + conn_err(tconn, "Allocation of buffers for data integrity checking failed\n"); goto disconnect; } - dev_info(DEV, "data-integrity-alg: %s\n", - my_alg[0] ? 
my_alg : (unsigned char *)""); } - return TRUE; + new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + conn_err(tconn, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + mutex_lock(&tconn->data.mutex); + mutex_lock(&tconn->conf_update); + old_net_conf = tconn->net_conf; + *new_net_conf = *old_net_conf; + + new_net_conf->wire_protocol = p_proto; + new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); + new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); + new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); + new_net_conf->two_primaries = p_two_primaries; + + rcu_assign_pointer(tconn->net_conf, new_net_conf); + mutex_unlock(&tconn->conf_update); + mutex_unlock(&tconn->data.mutex); + + crypto_free_hash(tconn->peer_integrity_tfm); + kfree(tconn->int_dig_in); + kfree(tconn->int_dig_vv); + tconn->peer_integrity_tfm = peer_integrity_tfm; + tconn->int_dig_in = int_dig_in; + tconn->int_dig_vv = int_dig_vv; + + if (strcmp(old_net_conf->integrity_alg, integrity_alg)) + conn_info(tconn, "peer data-integrity-alg: %s\n", + integrity_alg[0] ? 
integrity_alg : "(none)"); + synchronize_rcu(); + kfree(old_net_conf); + return 0; + +disconnect_rcu_unlock: + rcu_read_unlock(); disconnect: - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; + crypto_free_hash(peer_integrity_tfm); + kfree(int_dig_in); + kfree(int_dig_vv); + conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; } /* helper function @@ -2737,50 +3428,114 @@ alg, name, PTR_ERR(tfm)); return tfm; } - if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { - crypto_free_hash(tfm); - dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); - return ERR_PTR(-EINVAL); - } return tfm; } -STATIC int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) +static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi) +{ + void *buffer = tconn->data.rbuf; + int size = pi->size; + + while (size) { + int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); + s = drbd_recv(tconn, buffer, s); + if (s <= 0) { + if (s < 0) + return s; + break; + } + size -= s; + } + if (size) + return -EIO; + return 0; +} + +/* + * config_unknown_volume - device configuration command for unknown volume + * + * When a device is added to an existing connection, the node on which the + * device is added first will send configuration commands to its peer but the + * peer will not know about the device yet. It will warn and ignore these + * commands. Once the device is added on the second node, the second node will + * send the same device configuration commands, but in the other direction. + * + * (We can also end up here if drbd is misconfigured.) 
+ */ +static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi) +{ + conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n", + cmdname(pi->cmd), pi->vnr); + return ignore_remaining_packet(tconn, pi); +} + +STATIC int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi) { - int ok = TRUE; - struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; + struct drbd_conf *mdev; + struct p_rs_param_95 *p; unsigned int header_size, data_size, exp_max_sz; struct crypto_hash *verify_tfm = NULL; struct crypto_hash *csums_tfm = NULL; - const int apv = mdev->agreed_pro_version; + struct net_conf *old_net_conf, *new_net_conf = NULL; + struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; + const int apv = tconn->agreed_pro_version; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int fifo_size = 0; + int err; + + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return config_unknown_volume(tconn, pi); exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) : apv == 88 ? sizeof(struct p_rs_param) + SHARED_SECRET_MAX - : /* 89 */ sizeof(struct p_rs_param_89); + : apv <= 94 ? 
sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); - if (h->length > exp_max_sz) { + if (pi->size > exp_max_sz) { dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", - h->length, exp_max_sz); - return FALSE; + pi->size, exp_max_sz); + return -EIO; } if (apv <= 88) { - header_size = sizeof(struct p_rs_param) - sizeof(*h); - data_size = h->length - header_size; - } else /* apv >= 89 */ { - header_size = sizeof(struct p_rs_param_89) - sizeof(*h); - data_size = h->length - header_size; + header_size = sizeof(struct p_rs_param); + data_size = pi->size - header_size; + } else if (apv <= 94) { + header_size = sizeof(struct p_rs_param_89); + data_size = pi->size - header_size; + D_ASSERT(data_size == 0); + } else { + header_size = sizeof(struct p_rs_param_95); + data_size = pi->size - header_size; D_ASSERT(data_size == 0); } /* initialize verify_alg and csums_alg */ + p = pi->data; memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); - if (drbd_recv(mdev, h->payload, header_size) != header_size) - return FALSE; + err = drbd_recv_all(mdev->tconn, p, header_size); + if (err) + return err; + + mutex_lock(&mdev->tconn->conf_update); + old_net_conf = mdev->tconn->net_conf; + if (get_ldev(mdev)) { + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + put_ldev(mdev); + mutex_unlock(&mdev->tconn->conf_update); + dev_err(DEV, "Allocation of new disk_conf failed\n"); + return -ENOMEM; + } + + old_disk_conf = mdev->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; - mdev->sync_conf.rate = be32_to_cpu(p->rate); + new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); + } if (apv >= 88) { if (apv == 88) { @@ -2788,12 +3543,13 @@ dev_err(DEV, "verify-alg too long, " "peer wants %u, accepting only %u byte\n", data_size, SHARED_SECRET_MAX); - return FALSE; + err = -EIO; + goto reconnect; } - if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) - return FALSE; - + err = 
drbd_recv_all(mdev->tconn, p->verify_alg, data_size); + if (err) + goto reconnect; /* we expect NUL terminated string */ /* but just in case someone tries to be evil */ D_ASSERT(p->verify_alg[data_size-1] == 0); @@ -2808,10 +3564,10 @@ p->csums_alg[SHARED_SECRET_MAX-1] = 0; } - if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { + if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { if (mdev->state.conn == C_WF_REPORT_PARAMS) { dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", - mdev->sync_conf.verify_alg, p->verify_alg); + old_net_conf->verify_alg, p->verify_alg); goto disconnect; } verify_tfm = drbd_crypto_alloc_digest_safe(mdev, @@ -2822,10 +3578,10 @@ } } - if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { + if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { if (mdev->state.conn == C_WF_REPORT_PARAMS) { dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", - mdev->sync_conf.csums_alg, p->csums_alg); + old_net_conf->csums_alg, p->csums_alg); goto disconnect; } csums_tfm = drbd_crypto_alloc_digest_safe(mdev, @@ -2836,35 +3592,91 @@ } } + if (apv > 94 && new_disk_conf) { + new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); + new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); + new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); + new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != mdev->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + dev_err(DEV, "kmalloc of fifo_buffer failed"); + put_ldev(mdev); + goto disconnect; + } + } + } + + if (verify_tfm || csums_tfm) { + new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + dev_err(DEV, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + *new_net_conf = *old_net_conf; - spin_lock(&mdev->peer_seq_lock); - /* lock against drbd_nl_syncer_conf() */ - if 
(verify_tfm) { - strcpy(mdev->sync_conf.verify_alg, p->verify_alg); - mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; - crypto_free_hash(mdev->verify_tfm); - mdev->verify_tfm = verify_tfm; - dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); - } - if (csums_tfm) { - strcpy(mdev->sync_conf.csums_alg, p->csums_alg); - mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; - crypto_free_hash(mdev->csums_tfm); - mdev->csums_tfm = csums_tfm; - dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); + if (verify_tfm) { + strcpy(new_net_conf->verify_alg, p->verify_alg); + new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; + crypto_free_hash(mdev->tconn->verify_tfm); + mdev->tconn->verify_tfm = verify_tfm; + dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); + } + if (csums_tfm) { + strcpy(new_net_conf->csums_alg, p->csums_alg); + new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; + crypto_free_hash(mdev->tconn->csums_tfm); + mdev->tconn->csums_tfm = csums_tfm; + dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); + } + rcu_assign_pointer(tconn->net_conf, new_net_conf); } - spin_unlock(&mdev->peer_seq_lock); } - return ok; + if (new_disk_conf) { + rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); + put_ldev(mdev); + } + + if (new_plan) { + old_plan = mdev->rs_plan_s; + rcu_assign_pointer(mdev->rs_plan_s, new_plan); + } + + mutex_unlock(&mdev->tconn->conf_update); + synchronize_rcu(); + if (new_net_conf) + kfree(old_net_conf); + kfree(old_disk_conf); + kfree(old_plan); + + return 0; + +reconnect: + if (new_disk_conf) { + put_ldev(mdev); + kfree(new_disk_conf); + } + mutex_unlock(&mdev->tconn->conf_update); + return -EIO; + disconnect: + kfree(new_plan); + if (new_disk_conf) { + put_ldev(mdev); + kfree(new_disk_conf); + } + mutex_unlock(&mdev->tconn->conf_update); /* just for completeness: actually not needed, * as this is not reached if csums_tfm was ok. 
*/ crypto_free_hash(csums_tfm); /* but free the verify_tfm again, if csums_tfm did not work out */ crypto_free_hash(verify_tfm); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; + conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; } STATIC void drbd_setup_order_type(struct drbd_conf *mdev, int peer) @@ -2886,95 +3698,94 @@ (unsigned long long)a, (unsigned long long)b); } -STATIC int receive_sizes(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_sizes *p = (struct p_sizes *)h; + struct drbd_conf *mdev; + struct p_sizes *p = pi->data; enum determine_dev_size dd = unchanged; - unsigned int max_seg_s; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ - enum drbd_conns nconn; + enum dds_flags ddsf; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return config_unknown_volume(tconn, pi); p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); - if (p_size == 0 && mdev->state.disk == D_DISKLESS) { - dev_err(DEV, "some backing storage is needed\n"); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; - } - /* just store the peer's disk size for now. * we still need to figure out whether we accept that. */ mdev->p_size = p_size; -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? 
l : min(l, r)) if (get_ldev(mdev)) { + rcu_read_lock(); + my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + warn_if_differ_considerably(mdev, "lower level device sizes", p_size, drbd_get_max_capacity(mdev->ldev)); warn_if_differ_considerably(mdev, "user requested size", - p_usize, mdev->ldev->dc.disk_size); + p_usize, my_usize); /* if this is the first connect, or an otherwise expected * param exchange, choose the minimum */ if (mdev->state.conn == C_WF_REPORT_PARAMS) - p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, - p_usize); - - my_usize = mdev->ldev->dc.disk_size; - - if (mdev->ldev->dc.disk_size != p_usize) { - mdev->ldev->dc.disk_size = p_usize; - dev_info(DEV, "Peer sets u_size to %lu sectors\n", - (unsigned long)mdev->ldev->dc.disk_size); - } + p_usize = min_not_zero(my_usize, p_usize); /* Never shrink a device with usable data during connect. But allow online shrinking if we are connected. */ - if (drbd_new_dev_size(mdev, mdev->ldev, 0) < - drbd_get_capacity(mdev->this_bdev) && - mdev->state.disk >= D_OUTDATED && - mdev->state.conn < C_CONNECTED) { + if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) < + drbd_get_capacity(mdev->this_bdev) && + mdev->state.disk >= D_OUTDATED && + mdev->state.conn < C_CONNECTED) { dev_err(DEV, "The peer's disk size is too small!\n"); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - mdev->ldev->dc.disk_size = my_usize; + conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); put_ldev(mdev); - return FALSE; + return -EIO; + } + + if (my_usize != p_usize) { + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + dev_err(DEV, "Allocation of new disk_conf failed\n"); + put_ldev(mdev); + return -ENOMEM; + } + + mutex_lock(&mdev->tconn->conf_update); + old_disk_conf = mdev->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + new_disk_conf->disk_size = p_usize; + + 
rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf); + mutex_unlock(&mdev->tconn->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + + dev_info(DEV, "Peer sets u_size to %lu sectors\n", + (unsigned long)my_usize); } + put_ldev(mdev); } -#undef min_not_zero + ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(mdev)) { - dd = drbd_determin_dev_size(mdev, 0); + dd = drbd_determine_dev_size(mdev, ddsf); put_ldev(mdev); if (dd == dev_size_error) - return FALSE; + return -EIO; drbd_md_sync(mdev); } else { /* I am diskless, need to accept the peer's size. */ drbd_set_my_capacity(mdev, p_size); } - if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { - nconn = drbd_sync_handshake(mdev, - mdev->state.peer, mdev->state.pdsk); - put_ldev(mdev); - - if (nconn == C_MASK) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; - } - - if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; - } - } + mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + drbd_reconsider_max_bio_size(mdev); if (get_ldev(mdev)) { if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { @@ -2982,11 +3793,7 @@ ldsc = 1; } - max_seg_s = be32_to_cpu(p->max_segment_size); - if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) - drbd_setup_queue_param(mdev, max_seg_s); - - drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); + drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type)); put_ldev(mdev); } @@ -2995,30 +3802,34 @@ drbd_get_capacity(mdev->this_bdev) || ldsc) { /* we have different sizes, probably peer * needs to know my new size... 
*/ - drbd_send_sizes(mdev, 0); + drbd_send_sizes(mdev, 0, ddsf); } if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || (dd == grew && mdev->state.conn == C_CONNECTED)) { if (mdev->state.pdsk >= D_INCONSISTENT && - mdev->state.disk >= D_INCONSISTENT) - resync_after_online_grow(mdev); - else + mdev->state.disk >= D_INCONSISTENT) { + if (ddsf & DDSF_NO_RESYNC) + dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n"); + else + resync_after_online_grow(mdev); + } else set_bit(RESYNC_AFTER_NEG, &mdev->flags); } } - return TRUE; + return 0; } -STATIC int receive_uuids(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_uuids *p = (struct p_uuids *)h; + struct drbd_conf *mdev; + struct p_uuids *p = pi->data; u64 *p_uuid; - int i; + int i, updated_uuids = 0; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return config_unknown_volume(tconn, pi); p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); @@ -3034,38 +3845,49 @@ (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", (unsigned long long)mdev->ed_uuid); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; + conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; } if (get_ldev(mdev)) { int skip_initial_sync = mdev->state.conn == C_CONNECTED && - mdev->agreed_pro_version >= 90 && + mdev->tconn->agreed_pro_version >= 90 && mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && (p_uuid[UI_FLAGS] & 8); if (skip_initial_sync) { dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, - "clear_n_write from receive_uuids"); + "clear_n_write from receive_uuids", + BM_LOCKED_TEST_ALLOWED); 
_drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); _drbd_uuid_set(mdev, UI_BITMAP, 0); _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), CS_VERBOSE, NULL); drbd_md_sync(mdev); + updated_uuids = 1; } put_ldev(mdev); + } else if (mdev->state.disk < D_INCONSISTENT && + mdev->state.role == R_PRIMARY) { + /* I am a diskless primary, the peer just created a new current UUID + for me. */ + updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); } /* Before we test for the disk state, we should wait until an eventually ongoing cluster wide state change is finished. That is important if we are primary and are detaching from our disk. We need to see the new disk state... */ - wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); + mutex_lock(mdev->state_mutex); + mutex_unlock(mdev->state_mutex); if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) - drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); + updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); + + if (updated_uuids) + drbd_print_uuids(mdev, "receiver updated UUIDs to"); - return TRUE; + return 0; } /** @@ -3077,6 +3899,7 @@ union drbd_state ms; static enum drbd_conns c_tab[] = { + [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, [C_CONNECTED] = C_CONNECTED, [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, @@ -3095,53 +3918,76 @@ ms.disk = ps.pdsk; ms.peer_isp = (ps.aftr_isp | ps.user_isp); - return ms; + return ms; +} + +STATIC int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi) +{ + struct drbd_conf *mdev; + struct p_req_state *p = pi->data; + union drbd_state mask, val; + enum drbd_state_rv rv; + + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags) && + mutex_is_locked(mdev->state_mutex)) { + drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); + return 0; + } + + mask = convert_state(mask); + val = 
convert_state(val); + + rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); + drbd_send_sr_reply(mdev, rv); + + drbd_md_sync(mdev); + + return 0; } -STATIC int receive_req_state(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_req_state *p = (struct p_req_state *)h; + struct p_req_state *p = pi->data; union drbd_state mask, val; - int rv; - - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; + enum drbd_state_rv rv; mask.i = be32_to_cpu(p->mask); val.i = be32_to_cpu(p->val); - if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && - test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { - drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); - return TRUE; + if (test_bit(DISCARD_CONCURRENT, &tconn->flags) && + mutex_is_locked(&tconn->cstate_mutex)) { + conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG); + return 0; } mask = convert_state(mask); val = convert_state(val); - DRBD_STATE_DEBUG_INIT_VAL(val); - rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); - - drbd_send_sr_reply(mdev, rv); - drbd_md_sync(mdev); + rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); + conn_send_sr_reply(tconn, rv); - return TRUE; + return 0; } -STATIC int receive_state(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_state *p = (struct p_state *)h; - enum drbd_conns nconn, oconn; - union drbd_state ns, peer_state; + struct drbd_conf *mdev; + struct p_state *p = pi->data; + union drbd_state os, ns, peer_state; enum drbd_disk_state real_peer_disk; + enum chg_state_flags cs_flags; int rv; - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) - return FALSE; - - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return config_unknown_volume(tconn, 
pi); peer_state.i = be32_to_cpu(p->state); @@ -3151,154 +3997,249 @@ dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); } - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); retry: - oconn = nconn = mdev->state.conn; - spin_unlock_irq(&mdev->req_lock); + os = ns = drbd_read_state(mdev); + spin_unlock_irq(&mdev->tconn->req_lock); + + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return false; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && + os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { + /* If we are (becoming) SyncSource, but peer is still in sync + * preparation, ignore its uptodate-ness to avoid flapping, it + * will change to inconsistent once the peer reaches active + * syncing states. + * It may have changed syncer-paused flags, however, so we + * cannot ignore this completely. */ + if (peer_state.conn > C_CONNECTED && + peer_state.conn < C_SYNC_SOURCE) + real_peer_disk = D_INCONSISTENT; + + /* if peer_state changes to connected at the same time, + * it explicitly notifies us that it finished resync. + * Maybe we should finish it up, too? 
*/ + else if (os.conn >= C_SYNC_SOURCE && + peer_state.conn == C_CONNECTED) { + if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) + drbd_resync_finished(mdev); + return 0; + } + } + + /* peer says his disk is inconsistent, while we think it is uptodate, + * and this happens while the peer still thinks we have a sync going on, + * but we think we are already done with the sync. + * We ignore this to avoid flapping pdsk. + * This should not happen, if the peer is a recent version of drbd. */ + if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && + os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) + real_peer_disk = D_UP_TO_DATE; + + if (ns.conn == C_WF_REPORT_PARAMS) + ns.conn = C_CONNECTED; - if (nconn == C_WF_REPORT_PARAMS) - nconn = C_CONNECTED; + if (peer_state.conn == C_AHEAD) + ns.conn = C_BEHIND; if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && get_ldev_if_state(mdev, D_NEGOTIATING)) { int cr; /* consider resync */ /* if we established a new connection */ - cr = (oconn < C_CONNECTED); + cr = (os.conn < C_CONNECTED); /* if we had an established connection * and one of the nodes newly attaches a disk */ - cr |= (oconn == C_CONNECTED && + cr |= (os.conn == C_CONNECTED && (peer_state.disk == D_NEGOTIATING || - mdev->state.disk == D_NEGOTIATING)); + os.disk == D_NEGOTIATING)); /* if we have both been inconsistent, and the peer has been * forced to be UpToDate with --overwrite-data */ cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); /* if we had been plain connected, and the admin requested to * start a sync by "invalidate" or "invalidate-remote" */ - cr |= (oconn == C_CONNECTED && + cr |= (os.conn == C_CONNECTED && (peer_state.conn >= C_STARTING_SYNC_S && peer_state.conn <= C_WF_BITMAP_T)); if (cr) - nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); + ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); put_ldev(mdev); - if (nconn == C_MASK) { + if (ns.conn == C_MASK) { + ns.conn = C_CONNECTED; if 
(mdev->state.disk == D_NEGOTIATING) { - drbd_force_state(mdev, NS(disk, D_DISKLESS)); - nconn = C_CONNECTED; + drbd_force_state(mdev, NS(disk, D_FAILED)); } else if (peer_state.disk == D_NEGOTIATING) { dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); peer_state.disk = D_DISKLESS; + real_peer_disk = D_DISKLESS; } else { - D_ASSERT(oconn == C_WF_REPORT_PARAMS); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; + if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags)) + return -EIO; + D_ASSERT(os.conn == C_WF_REPORT_PARAMS); + conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; } } } - spin_lock_irq(&mdev->req_lock); - if (mdev->state.conn != oconn) + spin_lock_irq(&mdev->tconn->req_lock); + if (os.i != drbd_read_state(mdev).i) goto retry; clear_bit(CONSIDER_RESYNC, &mdev->flags); - ns.i = mdev->state.i; - ns.conn = nconn; ns.peer = peer_state.role; ns.pdsk = real_peer_disk; ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); - if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) ns.disk = mdev->new_state_tmp.disk; - DRBD_STATE_DEBUG_INIT_VAL(ns); - rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); - ns = mdev->state; - spin_unlock_irq(&mdev->req_lock); + cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); + if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && + test_bit(NEW_CUR_UUID, &mdev->flags)) { + /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this + for temporal network outages! 
*/ + spin_unlock_irq(&mdev->tconn->req_lock); + dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); + tl_clear(mdev->tconn); + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); + return -EIO; + } + rv = _drbd_set_state(mdev, ns, cs_flags, NULL); + ns = drbd_read_state(mdev); + spin_unlock_irq(&mdev->tconn->req_lock); if (rv < SS_SUCCESS) { - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return FALSE; + conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; } - if (oconn > C_WF_REPORT_PARAMS) { - if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + if (os.conn > C_WF_REPORT_PARAMS) { + if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && peer_state.disk != D_NEGOTIATING ) { /* we want resync, peer has not yet decided to sync... */ /* Nowadays only used when forcing a node into primary role and setting its disk to UpToDate with that */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } } - mdev->net_conf->want_lose = 0; + clear_bit(DISCARD_MY_DATA, &mdev->flags); drbd_md_sync(mdev); /* update connected indicator, la_size, ... 
*/ - return TRUE; + return 0; } -STATIC int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_rs_uuid *p = (struct p_rs_uuid *)h; + struct drbd_conf *mdev; + struct p_rs_uuid *p = pi->data; + + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; wait_event(mdev->misc_wait, mdev->state.conn == C_WF_SYNC_UUID || + mdev->state.conn == C_BEHIND || mdev->state.conn < C_CONNECTED || mdev->state.disk < D_NEGOTIATING); /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; - if (drbd_recv(mdev, h->payload, h->length) != h->length) - return FALSE; - /* Here the _drbd_uuid_ functions are right, current should _not_ be rotated into the history */ if (get_ldev_if_state(mdev, D_NEGOTIATING)) { _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); _drbd_uuid_set(mdev, UI_BITMAP, 0UL); + drbd_print_uuids(mdev, "updated sync uuid"); drbd_start_resync(mdev, C_SYNC_TARGET); put_ldev(mdev); } else dev_err(DEV, "Ignoring SyncUUID packet!\n"); - return TRUE; + return 0; } -enum receive_bitmap_ret { OK, DONE, FAILED }; - -static enum receive_bitmap_ret -receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, - unsigned long *buffer, struct bm_xfer_ctx *c) -{ - unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); - unsigned want = num_words * sizeof(long); +/** + * receive_bitmap_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. 
+ */ +static int +receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size, + unsigned long *p, struct bm_xfer_ctx *c) +{ + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - + drbd_header_size(mdev->tconn); + unsigned int num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + unsigned int want = num_words * sizeof(*p); + int err; - if (want != h->length) { - dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); - return FAILED; + if (want != size) { + dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size); + return -EIO; } if (want == 0) - return DONE; - if (drbd_recv(mdev, buffer, want) != want) - return FAILED; + return 0; + err = drbd_recv_all(mdev->tconn, p, want); + if (err) + return err; - drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); + drbd_bm_merge_lel(mdev, c->word_offset, num_words, p); c->word_offset += num_words; c->bit_offset = c->word_offset * BITS_PER_LONG; if (c->bit_offset > c->bm_bits) c->bit_offset = c->bm_bits; - return OK; + return 1; +} + +static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static int dcbp_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static int dcbp_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; } -static enum receive_bitmap_ret +/** + * recv_bm_rle_bits + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. 
+ */ +static int recv_bm_rle_bits(struct drbd_conf *mdev, struct p_compressed_bm *p, - struct bm_xfer_ctx *c) + struct bm_xfer_ctx *c, + unsigned int len) { struct bitstream bs; u64 look_ahead; @@ -3306,27 +4247,26 @@ u64 tmp; unsigned long s = c->bit_offset; unsigned long e; - int len = p->head.length - (sizeof(*p) - sizeof(p->head)); - int toggle = DCBP_get_start(p); + int toggle = dcbp_get_start(p); int have; int bits; - bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); + bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); bits = bitstream_get_bits(&bs, &look_ahead, 64); if (bits < 0) - return FAILED; + return -EIO; for (have = bits; have > 0; s += rl, toggle = !toggle) { bits = vli_decode_bits(&rl, look_ahead); if (bits <= 0) - return FAILED; + return -EIO; if (toggle) { e = s + rl -1; if (e >= c->bm_bits) { dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); - return FAILED; + return -EIO; } _drbd_bm_set_bits(mdev, s, e); } @@ -3336,14 +4276,14 @@ have, bits, look_ahead, (unsigned int)(bs.cur.b - p->code), (unsigned int)bs.buf_len); - return FAILED; + return -EIO; } look_ahead >>= bits; have -= bits; bits = bitstream_get_bits(&bs, &tmp, 64 - have); if (bits < 0) - return FAILED; + return -EIO; look_ahead |= tmp << have; have += bits; } @@ -3351,35 +4291,44 @@ c->bit_offset = s; bm_xfer_ctx_bit_to_word_offset(c); - return (s == c->bm_bits) ? DONE : OK; + return (s != c->bm_bits); } -static enum receive_bitmap_ret +/** + * decode_bitmap_c + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. 
+ */ +static int decode_bitmap_c(struct drbd_conf *mdev, struct p_compressed_bm *p, - struct bm_xfer_ctx *c) + struct bm_xfer_ctx *c, + unsigned int len) { - if (DCBP_get_code(p) == RLE_VLI_Bits) - return recv_bm_rle_bits(mdev, p, c); + if (dcbp_get_code(p) == RLE_VLI_Bits) + return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p)); /* other variants had been implemented for evaluation, * but have been dropped as this one turned out to be "best" * during all our tests. */ dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - return FAILED; + conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); + return -EIO; } void INFO_bm_xfer_stats(struct drbd_conf *mdev, const char *direction, struct bm_xfer_ctx *c) { /* what would it take to transfer it "plaintext" */ - unsigned plain = sizeof(struct p_header) * - ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) - + c->bm_words * sizeof(long); - unsigned total = c->bytes[0] + c->bytes[1]; - unsigned r; + unsigned int header_size = drbd_header_size(mdev->tconn); + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + unsigned int plain = + header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + + c->bm_words * sizeof(unsigned long); + unsigned int total = c->bytes[0] + c->bytes[1]; + unsigned int r; /* total can not be zero. but just in case: */ if (total == 0) @@ -3413,266 +4362,291 @@ in order to be agnostic to the 32 vs 64 bits issue. returns 0 on failure, 1 if we successfully received it. 
*/ -STATIC int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi) { + struct drbd_conf *mdev; struct bm_xfer_ctx c; - void *buffer; - enum receive_bitmap_ret ret; - int ok = FALSE; - - wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); - - drbd_bm_lock(mdev, "receive bitmap"); - - /* maybe we should use some per thread scratch page, - * and allocate that during initial device creation? */ - buffer = (unsigned long *) __get_free_page(GFP_NOIO); - if (!buffer) { - dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); - goto out; - } + int err; + + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + + drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED); + /* you are supposed to send additional out-of-sync information + * if you actually set bits during this phase */ c = (struct bm_xfer_ctx) { .bm_bits = drbd_bm_bits(mdev), .bm_words = drbd_bm_words(mdev), }; - do { - if (h->command == P_BITMAP) { - ret = receive_bitmap_plain(mdev, h, buffer, &c); - } else if (h->command == P_COMPRESSED_BITMAP) { + for(;;) { + if (pi->cmd == P_BITMAP) + err = receive_bitmap_plain(mdev, pi->size, pi->data, &c); + else if (pi->cmd == P_COMPRESSED_BITMAP) { /* MAYBE: sanity check that we speak proto >= 90, * and the feature is enabled! 
*/ - struct p_compressed_bm *p; + struct p_compressed_bm *p = pi->data; - if (h->length > BM_PACKET_PAYLOAD_BYTES) { + if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) { dev_err(DEV, "ReportCBitmap packet too large\n"); + err = -EIO; goto out; } - /* use the page buff */ - p = buffer; - memcpy(p, h, sizeof(*h)); - if (drbd_recv(mdev, p->head.payload, h->length) != h->length) + if (pi->size <= sizeof(*p)) { + dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size); + err = -EIO; goto out; - if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { - dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); - return FAILED; } - ret = decode_bitmap_c(mdev, p, &c); + err = drbd_recv_all(mdev->tconn, p, pi->size); + if (err) + goto out; + err = decode_bitmap_c(mdev, p, &c, pi->size); } else { - dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); + dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); + err = -EIO; goto out; } - c.packets[h->command == P_BITMAP]++; - c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; + c.packets[pi->cmd == P_BITMAP]++; + c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size; - if (ret != OK) + if (err <= 0) { + if (err < 0) + goto out; break; - - if (!drbd_recv_header(mdev, h)) + } + err = drbd_recv_header(mdev->tconn, pi); + if (err) goto out; - } while (ret == OK); - if (ret == FAILED) - goto out; + } INFO_bm_xfer_stats(mdev, "receive", &c); if (mdev->state.conn == C_WF_BITMAP_T) { - ok = !drbd_send_bitmap(mdev); - if (!ok) + enum drbd_state_rv rv; + + err = drbd_send_bitmap(mdev); + if (err) goto out; /* Omit CS_ORDERED with this state transition to avoid deadlocks. 
*/ - ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); - D_ASSERT(ok == SS_SUCCESS); + rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + D_ASSERT(rv == SS_SUCCESS); } else if (mdev->state.conn != C_WF_BITMAP_S) { /* admin may have requested C_DISCONNECTING, * other threads may have noticed network errors */ dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", drbd_conn_str(mdev->state.conn)); } + err = 0; - ok = TRUE; out: drbd_bm_unlock(mdev); - if (ok && mdev->state.conn == C_WF_BITMAP_S) + if (!err && mdev->state.conn == C_WF_BITMAP_S) drbd_start_resync(mdev, C_SYNC_SOURCE); - free_page((unsigned long) buffer); - return ok; + return err; } -STATIC int receive_skip(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi) { - /* TODO zero copy sink :) */ - static char sink[128]; - int size, want, r; - - dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", - h->command, h->length); + conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n", + pi->cmd, pi->size); - size = h->length; - while (size > 0) { - want = min_t(int, size, sizeof(sink)); - r = drbd_recv(mdev, sink, want); - ERR_IF(r <= 0) break; - size -= r; - } - return size == 0; + return ignore_remaining_packet(tconn, pi); } -STATIC int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) +STATIC int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi) { - if (mdev->state.disk >= D_INCONSISTENT) - drbd_kick_lo(mdev); + struct drbd_conf *mdev; + + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(mdev->data.socket); + drbd_tcp_quickack(mdev->tconn->data.socket); - return TRUE; + return 0; } -typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); +STATIC int receive_out_of_sync(struct drbd_tconn 
*tconn, struct packet_info *pi) +{ + struct drbd_conf *mdev; + struct p_block_desc *p = pi->data; + + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + + switch (mdev->state.conn) { + case C_WF_SYNC_UUID: + case C_WF_BITMAP_T: + case C_BEHIND: + break; + default: + dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n", + drbd_conn_str(mdev->state.conn)); + } + + drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); + + return 0; +} -static drbd_cmd_handler_f drbd_default_handler[] = { - [P_DATA] = receive_Data, - [P_DATA_REPLY] = receive_DataReply, - [P_RS_DATA_REPLY] = receive_RSDataReply, - [P_BARRIER] = receive_Barrier, - [P_BITMAP] = receive_bitmap, - [P_COMPRESSED_BITMAP] = receive_bitmap, - [P_UNPLUG_REMOTE] = receive_UnplugRemote, - [P_DATA_REQUEST] = receive_DataRequest, - [P_RS_DATA_REQUEST] = receive_DataRequest, - [P_SYNC_PARAM] = receive_SyncParam, - [P_SYNC_PARAM89] = receive_SyncParam, - [P_PROTOCOL] = receive_protocol, - [P_UUIDS] = receive_uuids, - [P_SIZES] = receive_sizes, - [P_STATE] = receive_state, - [P_STATE_CHG_REQ] = receive_req_state, - [P_SYNC_UUID] = receive_sync_uuid, - [P_OV_REQUEST] = receive_DataRequest, - [P_OV_REPLY] = receive_DataRequest, - [P_CSUM_RS_REQUEST] = receive_DataRequest, - /* anything missing from this table is in - * the asender_tbl, see get_asender_cmd */ - [P_MAX_CMD] = NULL, +struct data_cmd { + int expect_payload; + size_t pkt_size; + int (*fn)(struct drbd_tconn *, struct packet_info *); }; -static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; -static drbd_cmd_handler_f *drbd_opt_cmd_handler; +static struct data_cmd drbd_cmd_handler[] = { + [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, + [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, + [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , + [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , + [P_BITMAP] = { 1, 0, 
receive_bitmap } , + [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , + [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, + [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, + [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, + [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, + [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, + [P_STATE] = { 0, sizeof(struct p_state), receive_state }, + [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, + [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, + [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, + [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, + [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, + [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, +}; -STATIC void drbdd(struct drbd_conf *mdev) +STATIC void drbdd(struct drbd_tconn *tconn) { - drbd_cmd_handler_f handler; - struct p_header *header = &mdev->data.rbuf.header; + struct packet_info pi; + size_t shs; /* sub header size */ + int err; - while (get_t_state(&mdev->receiver) == Running) { - drbd_thread_current_set_cpu(mdev); - if (!drbd_recv_header(mdev, header)) { - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; - } + while (get_t_state(&tconn->receiver) == RUNNING) { + struct data_cmd *cmd; - if (header->command < P_MAX_CMD) - handler = drbd_cmd_handler[header->command]; - else if (P_MAY_IGNORE < header->command - && header->command < P_MAX_OPT_CMD) - 
handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; - else if (header->command > P_MAX_OPT_CMD) - handler = receive_skip; - else - handler = NULL; + drbd_thread_current_set_cpu(&tconn->receiver); + if (drbd_recv_header(tconn, &pi)) + goto err_out; - if (unlikely(!handler)) { - dev_err(DEV, "unknown packet type %d, l: %d!\n", - header->command, header->length); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; - } - if (unlikely(!handler(mdev, header))) { - dev_err(DEV, "error receiving %s, l: %d!\n", - cmdname(header->command), header->length); - drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); - break; + cmd = &drbd_cmd_handler[pi.cmd]; + if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { + conn_err(tconn, "Unexpected data packet %s (0x%04x)", + cmdname(pi.cmd), pi.cmd); + goto err_out; } - trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, - __FILE__, __LINE__); - } -} + shs = cmd->pkt_size; + if (pi.size > shs && !cmd->expect_payload) { + conn_err(tconn, "No payload expected %s l:%d\n", + cmdname(pi.cmd), pi.size); + goto err_out; + } -STATIC void drbd_fail_pending_reads(struct drbd_conf *mdev) -{ - struct hlist_head *slot; - struct hlist_node *pos; - struct hlist_node *tmp; - struct drbd_request *req; - int i; + if (shs) { + err = drbd_recv_all_warn(tconn, pi.data, shs); + if (err) + goto err_out; + pi.size -= shs; + } - /* - * Application READ requests - */ - spin_lock_irq(&mdev->req_lock); - for (i = 0; i < APP_R_HSIZE; i++) { - slot = mdev->app_reads_hash+i; - hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { - /* it may (but should not any longer!) - * be on the work queue; if that assert triggers, - * we need to also grab the - * spin_lock_irq(&mdev->data.work.q_lock); - * and list_del_init here. */ - D_ASSERT(list_empty(&req->w.list)); - /* It would be nice to complete outside of spinlock. - * But this is easier for now. 
*/ - _req_mod(req, connection_lost_while_pending); - } - } - for (i = 0; i < APP_R_HSIZE; i++) - if (!hlist_empty(mdev->app_reads_hash+i)) - dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " - "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); + err = cmd->fn(tconn, &pi); + if (err) { + conn_err(tconn, "error receiving %s, e: %d l: %d!\n", + cmdname(pi.cmd), err, pi.size); + goto err_out; + } + } + return; - memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); - spin_unlock_irq(&mdev->req_lock); + err_out: + conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD); } -void drbd_flush_workqueue(struct drbd_conf *mdev) +void conn_flush_workqueue(struct drbd_tconn *tconn) { struct drbd_wq_barrier barr; barr.w.cb = w_prev_work_done; + barr.w.tconn = tconn; init_completion(&barr.done); - drbd_queue_work(&mdev->data.work, &barr.w); + drbd_queue_work(&tconn->data.work, &barr.w); wait_for_completion(&barr.done); } -STATIC void drbd_disconnect(struct drbd_conf *mdev) +STATIC void conn_disconnect(struct drbd_tconn *tconn) { - enum drbd_fencing_p fp; - union drbd_state os, ns; - int rv = SS_UNKNOWN_ERROR; - unsigned int i; + struct drbd_conf *mdev; + enum drbd_conns oc; + int vnr; - if (mdev->state.conn == C_STANDALONE) + if (tconn->cstate == C_STANDALONE) return; - if (mdev->state.conn >= C_WF_CONNECTION) - dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n", - drbd_conn_str(mdev->state.conn)); + + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); /* asender does not clean up anything. 
it must not interfere, either */ - drbd_thread_stop(&mdev->asender); + drbd_thread_stop(&tconn->asender); + drbd_free_sock(tconn); + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + kref_get(&mdev->kref); + rcu_read_unlock(); + drbd_disconnected(mdev); + kref_put(&mdev->kref, &drbd_minor_destroy); + rcu_read_lock(); + } + rcu_read_unlock(); + + if (!list_empty(&tconn->current_epoch->list)) + conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n"); + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + atomic_set(&tconn->current_epoch->epoch_size, 0); + + conn_info(tconn, "Connection closed\n"); + + if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN) + conn_try_outdate_peer_async(tconn); + + spin_lock_irq(&tconn->req_lock); + oc = tconn->cstate; + if (oc >= C_UNCONNECTED) + _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE); - mutex_lock(&mdev->data.mutex); - drbd_free_sock(mdev); - mutex_unlock(&mdev->data.mutex); + spin_unlock_irq(&tconn->req_lock); - spin_lock_irq(&mdev->req_lock); + if (oc == C_DISCONNECTING) + conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); +} + +STATIC int drbd_disconnected(struct drbd_conf *mdev) +{ + unsigned int i; + + /* wait for current activity to cease. */ + spin_lock_irq(&mdev->tconn->req_lock); _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); /* We do not have data structures that would allow us to * get the rs_pending_cnt down to 0 again. 
@@ -3690,9 +4664,7 @@ atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); - /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); - set_bit(STOP_SYNC_TIMER, &mdev->flags); resync_timer_fn((unsigned long)mdev); /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, @@ -3700,81 +4672,24 @@ * to be "canceled" */ drbd_flush_workqueue(mdev); - /* This also does reclaim_net_ee(). If we do this too early, we might - * miss some resync ee and pages.*/ - drbd_process_done_ee(mdev); + drbd_finish_peer_reqs(mdev); + + /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() + might have issued a work again. The one before drbd_finish_peer_reqs() is + necessary to reclain net_ee in drbd_finish_peer_reqs(). */ + drbd_flush_workqueue(mdev); kfree(mdev->p_uuid); mdev->p_uuid = NULL; - if (!mdev->state.susp) - tl_clear(mdev); - - drbd_fail_pending_reads(mdev); - - dev_info(DEV, "Connection closed\n"); + if (!drbd_suspended(mdev)) + tl_clear(mdev->tconn); drbd_md_sync(mdev); - fp = FP_DONT_CARE; - if (get_ldev(mdev)) { - fp = mdev->ldev->dc.fencing; - put_ldev(mdev); - } - - if (mdev->state.role == R_PRIMARY) { - if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { - enum drbd_disk_state nps = drbd_try_outdate_peer(mdev); - drbd_request_state(mdev, NS(pdsk, nps)); - } - } - - spin_lock_irq(&mdev->req_lock); - os = mdev->state; - if (os.conn >= C_UNCONNECTED) { - /* Do not restart in case we are C_DISCONNECTING */ - ns = os; - ns.conn = C_UNCONNECTED; - DRBD_STATE_DEBUG_INIT_VAL(ns); - rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); - } - spin_unlock_irq(&mdev->req_lock); - - if (os.conn == C_DISCONNECTING) { - struct hlist_head *h; - wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0); - - /* we must not free the tl_hash - * while application io is still on the fly */ - wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); - - spin_lock_irq(&mdev->req_lock); - /* 
paranoia code */ - for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", - (int)(h - mdev->ee_hash), h->first); - kfree(mdev->ee_hash); - mdev->ee_hash = NULL; - mdev->ee_hash_s = 0; - - /* paranoia code */ - for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", - (int)(h - mdev->tl_hash), h->first); - kfree(mdev->tl_hash); - mdev->tl_hash = NULL; - mdev->tl_hash_s = 0; - spin_unlock_irq(&mdev->req_lock); - - crypto_free_hash(mdev->cram_hmac_tfm); - mdev->cram_hmac_tfm = NULL; - - kfree(mdev->net_conf); - mdev->net_conf = NULL; - drbd_request_state(mdev, NS(conn, C_STANDALONE)); - } + /* serialize with bitmap writeout triggered by the state change, + * if any. */ + wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); /* tcp_close and release of sendpage pages can be deferred. I don't * want to use SO_LINGER, because apparently it can be deferred for @@ -3783,21 +4698,22 @@ * Actually we don't care for exactly when the network stack does its * put_page(), but release our reference on these pages right here. 
*/ - i = drbd_release_ee(mdev, &mdev->net_ee); + i = drbd_free_peer_reqs(mdev, &mdev->net_ee); if (i) dev_info(DEV, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&mdev->pp_in_use_by_net); + if (i) + dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i); i = atomic_read(&mdev->pp_in_use); if (i) - dev_info(DEV, "pp_in_use = %u, expected 0\n", i); + dev_info(DEV, "pp_in_use = %d, expected 0\n", i); D_ASSERT(list_empty(&mdev->read_ee)); D_ASSERT(list_empty(&mdev->active_ee)); D_ASSERT(list_empty(&mdev->sync_ee)); D_ASSERT(list_empty(&mdev->done_ee)); - /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ - atomic_set(&mdev->current_epoch->epoch_size, 0); - D_ASSERT(list_empty(&mdev->current_epoch->list)); + return 0; } /* @@ -3809,29 +4725,19 @@ * * for now, they are expected to be zero, but ignored. */ -STATIC int drbd_send_handshake(struct drbd_conf *mdev) +STATIC int drbd_send_features(struct drbd_tconn *tconn) { - /* ASSERT current == mdev->receiver ... */ - struct p_handshake *p = &mdev->data.sbuf.handshake; - int ok; - - if (mutex_lock_interruptible(&mdev->data.mutex)) { - dev_err(DEV, "interrupted during initial handshake\n"); - return 0; /* interrupted. not ok. */ - } - - if (mdev->data.socket == NULL) { - mutex_unlock(&mdev->data.mutex); - return 0; - } + struct drbd_socket *sock; + struct p_connection_features *p; + sock = &tconn->data; + p = conn_prepare_command(tconn, sock); + if (!p) + return -EIO; memset(p, 0, sizeof(*p)); p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); - ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, - (struct p_header *)p, sizeof(*p), 0 ); - mutex_unlock(&mdev->data.mutex); - return ok; + return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); } /* @@ -3841,43 +4747,38 @@ * -1 peer talks different language, * no point in trying again, please go standalone. 
*/ -STATIC int drbd_do_handshake(struct drbd_conf *mdev) +STATIC int drbd_do_features(struct drbd_tconn *tconn) { - /* ASSERT current == mdev->receiver ... */ - struct p_handshake *p = &mdev->data.rbuf.handshake; - const int expect = sizeof(struct p_handshake) - -sizeof(struct p_header); - int rv; + /* ASSERT current == tconn->receiver ... */ + struct p_connection_features *p; + const int expect = sizeof(struct p_connection_features); + struct packet_info pi; + int err; - rv = drbd_send_handshake(mdev); - if (!rv) + err = drbd_send_features(tconn); + if (err) return 0; - rv = drbd_recv_header(mdev, &p->head); - if (!rv) + err = drbd_recv_header(tconn, &pi); + if (err) return 0; - if (p->head.command != P_HAND_SHAKE) { - dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", - cmdname(p->head.command), p->head.command); + if (pi.cmd != P_CONNECTION_FEATURES) { + conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); return -1; } - if (p->head.length != expect) { - dev_err(DEV, "expected HandShake length: %u, received: %u\n", - expect, p->head.length); + if (pi.size != expect) { + conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n", + expect, pi.size); return -1; } - rv = drbd_recv(mdev, &p->head.payload, expect); - - if (rv != expect) { - dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); + p = pi.data; + err = drbd_recv_all_warn(tconn, p, expect); + if (err) return 0; - } - - trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, - __FILE__, __LINE__); p->protocol_min = be32_to_cpu(p->protocol_min); p->protocol_max = be32_to_cpu(p->protocol_max); @@ -3888,15 +4789,15 @@ PRO_VERSION_MIN > p->protocol_max) goto incompat; - mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); - dev_info(DEV, "Handshake successful: " - "Agreed network protocol version %d\n", 
mdev->agreed_pro_version); + conn_info(tconn, "Handshake successful: " + "Agreed network protocol version %d\n", tconn->agreed_pro_version); return 1; incompat: - dev_err(DEV, "incompatible DRBD dialects: " + conn_err(tconn, "incompatible DRBD dialects: " "I support %d-%d, peer supports %d-%d\n", PRO_VERSION_MIN, PRO_VERSION_MAX, p->protocol_min, p->protocol_max); @@ -3904,7 +4805,7 @@ } #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) -STATIC int drbd_do_auth(struct drbd_conf *mdev) +STATIC int drbd_do_auth(struct drbd_tconn *tconn) { dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); @@ -3919,118 +4820,139 @@ -1 - auth failed, don't try again. */ -STATIC int drbd_do_auth(struct drbd_conf *mdev) +STATIC int drbd_do_auth(struct drbd_tconn *tconn) { + struct drbd_socket *sock; char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ struct scatterlist sg; char *response = NULL; char *right_response = NULL; char *peers_ch = NULL; - struct p_header p; - unsigned int key_len = strlen(mdev->net_conf->shared_secret); + unsigned int key_len; + char secret[SHARED_SECRET_MAX]; /* 64 byte */ unsigned int resp_size; struct hash_desc desc; - int rv; + struct packet_info pi; + struct net_conf *nc; + int err, rv; + + /* FIXME: Put the challenge/response into the preallocated socket buffer. 
*/ + + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + key_len = strlen(nc->shared_secret); + memcpy(secret, nc->shared_secret, key_len); + rcu_read_unlock(); - desc.tfm = mdev->cram_hmac_tfm; + desc.tfm = tconn->cram_hmac_tfm; desc.flags = 0; - rv = crypto_hash_setkey(mdev->cram_hmac_tfm, - (u8 *)mdev->net_conf->shared_secret, key_len); + rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len); if (rv) { - dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); + conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv); rv = -1; goto fail; } get_random_bytes(my_challenge, CHALLENGE_LEN); - rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); + sock = &tconn->data; + if (!conn_prepare_command(tconn, sock)) { + rv = 0; + goto fail; + } + rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0, + my_challenge, CHALLENGE_LEN); if (!rv) goto fail; - rv = drbd_recv_header(mdev, &p); - if (!rv) + err = drbd_recv_header(tconn, &pi); + if (err) { + rv = 0; goto fail; + } - if (p.command != P_AUTH_CHALLENGE) { - dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", - cmdname(p.command), p.command); + if (pi.cmd != P_AUTH_CHALLENGE) { + conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); rv = 0; goto fail; } - if (p.length > CHALLENGE_LEN*2) { - dev_err(DEV, "expected AuthChallenge payload too big.\n"); + if (pi.size > CHALLENGE_LEN * 2) { + conn_err(tconn, "expected AuthChallenge payload too big.\n"); rv = -1; goto fail; } - peers_ch = kmalloc(p.length, GFP_NOIO); + peers_ch = kmalloc(pi.size, GFP_NOIO); if (peers_ch == NULL) { - dev_err(DEV, "kmalloc of peers_ch failed\n"); + conn_err(tconn, "kmalloc of peers_ch failed\n"); rv = -1; goto fail; } - rv = drbd_recv(mdev, peers_ch, p.length); - - if (rv != p.length) { - dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); + err = drbd_recv_all_warn(tconn, peers_ch, pi.size); + if (err) { rv = 0; goto fail; 
} - resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); + resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm); response = kmalloc(resp_size, GFP_NOIO); if (response == NULL) { - dev_err(DEV, "kmalloc of response failed\n"); + conn_err(tconn, "kmalloc of response failed\n"); rv = -1; goto fail; } sg_init_table(&sg, 1); - sg_set_buf(&sg, peers_ch, p.length); + sg_set_buf(&sg, peers_ch, pi.size); rv = crypto_hash_digest(&desc, &sg, sg.length, response); if (rv) { - dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); + conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); rv = -1; goto fail; } - rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); - if (!rv) + if (!conn_prepare_command(tconn, sock)) { + rv = 0; goto fail; - - rv = drbd_recv_header(mdev, &p); + } + rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0, + response, resp_size); if (!rv) goto fail; - if (p.command != P_AUTH_RESPONSE) { - dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", - cmdname(p.command), p.command); + err = drbd_recv_header(tconn, &pi); + if (err) { rv = 0; goto fail; } - if (p.length != resp_size) { - dev_err(DEV, "expected AuthResponse payload of wrong size\n"); + if (pi.cmd != P_AUTH_RESPONSE) { + conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); rv = 0; goto fail; } - rv = drbd_recv(mdev, response , resp_size); + if (pi.size != resp_size) { + conn_err(tconn, "expected AuthResponse payload of wrong size\n"); + rv = 0; + goto fail; + } - if (rv != resp_size) { - dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); + err = drbd_recv_all_warn(tconn, response , resp_size); + if (err) { rv = 0; goto fail; } right_response = kmalloc(resp_size, GFP_NOIO); if (right_response == NULL) { - dev_err(DEV, "kmalloc of right_response failed\n"); + conn_err(tconn, "kmalloc of right_response failed\n"); rv = -1; goto fail; } @@ -4039,7 +4961,7 @@ rv = crypto_hash_digest(&desc, &sg, 
sg.length, right_response); if (rv) { - dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); + conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv); rv = -1; goto fail; } @@ -4047,8 +4969,8 @@ rv = !memcmp(response, right_response, resp_size); if (rv) - dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", - resp_size, mdev->net_conf->cram_hmac_alg); + conn_info(tconn, "Peer authenticated using %d bytes HMAC\n", + resp_size); else rv = -1; @@ -4063,226 +4985,248 @@ int drbdd_init(struct drbd_thread *thi) { - struct drbd_conf *mdev = thi->mdev; - unsigned int minor = mdev_to_minor(mdev); + struct drbd_tconn *tconn = thi->tconn; int h; - sprintf(current->comm, "drbd%d_receiver", minor); - - dev_info(DEV, "receiver (re)started\n"); + conn_info(tconn, "receiver (re)started\n"); do { - h = drbd_connect(mdev); + h = conn_connect(tconn); if (h == 0) { - drbd_disconnect(mdev); - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + conn_disconnect(tconn); + schedule_timeout_interruptible(HZ); } if (h == -1) { - dev_warn(DEV, "Discarding network configuration.\n"); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + conn_warn(tconn, "Discarding network configuration.\n"); + conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); } } while (h == 0); - if (h > 0) { - if (get_net_conf(mdev)) { - drbdd(mdev); - put_net_conf(mdev); - } - } + if (h > 0) + drbdd(tconn); - drbd_disconnect(mdev); + conn_disconnect(tconn); - dev_info(DEV, "receiver terminated\n"); + conn_info(tconn, "receiver terminated\n"); return 0; } /* ********* acknowledge sender ******** */ -STATIC int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_req_state_reply *p = (struct p_req_state_reply *)h; + struct p_req_state_reply *p = pi->data; + int retcode = be32_to_cpu(p->retcode); + + if (retcode >= SS_SUCCESS) { + set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags); + 
} else { + set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags); + conn_err(tconn, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&tconn->ping_wait); + + return 0; +} +STATIC int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi) +{ + struct drbd_conf *mdev; + struct p_req_state_reply *p = pi->data; int retcode = be32_to_cpu(p->retcode); + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + if (retcode >= SS_SUCCESS) { set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); } else { set_bit(CL_ST_CHG_FAIL, &mdev->flags); dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", - drbd_set_st_err_str(retcode), retcode); + drbd_set_st_err_str(retcode), retcode); } wake_up(&mdev->state_wait); - return TRUE; + return 0; } -STATIC int got_Ping(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi) { - return drbd_send_ping_ack(mdev); + return drbd_send_ping_ack(tconn); } -STATIC int got_PingAck(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi) { - /* restore idle timeout */ - mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags)) + wake_up(&tconn->ping_wait); - return TRUE; + return 0; } -STATIC int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_block_ack *p = (struct p_block_ack *)h; + struct drbd_conf *mdev; + struct p_block_ack *p = pi->data; sector_t sector = be64_to_cpu(p->sector); int blksize = be32_to_cpu(p->blksize); - D_ASSERT(mdev->agreed_pro_version >= 89); + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + + D_ASSERT(mdev->tconn->agreed_pro_version >= 89); update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - drbd_rs_complete_io(mdev, sector); - drbd_set_in_sync(mdev, sector, blksize); - /* 
rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ - mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, sector); + drbd_set_in_sync(mdev, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + put_ldev(mdev); + } dec_rs_pending(mdev); + atomic_add(blksize >> 9, &mdev->rs_sect_in); - return TRUE; -} - -/* when we receive the ACK for a write request, - * verify that we actually know about it */ -static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, - u64 id, sector_t sector) -{ - struct hlist_head *slot = tl_hash_slot(mdev, sector); - struct hlist_node *n; - struct drbd_request *req; - - hlist_for_each_entry(req, n, slot, colision) { - if ((unsigned long)req == (unsigned long)id) { - if (req->sector != sector) { - dev_err(DEV, "_ack_id_to_req: found req %p but it has " - "wrong sector (%llus versus %llus)\n", req, - (unsigned long long)req->sector, - (unsigned long long)sector); - break; - } - return req; - } - } - dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n", - (void *)(unsigned long)id, (unsigned long long)sector); - return NULL; + return 0; } -typedef struct drbd_request *(req_validator_fn) - (struct drbd_conf *mdev, u64 id, sector_t sector); - -static int validate_req_change_req_state(struct drbd_conf *mdev, - u64 id, sector_t sector, req_validator_fn validator, - const char *func, enum drbd_req_event what) +static int +validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector, + struct rb_root *root, const char *func, + enum drbd_req_event what, bool missing_ok) { struct drbd_request *req; struct bio_and_error m; - spin_lock_irq(&mdev->req_lock); - req = validator(mdev, id, sector); + spin_lock_irq(&mdev->tconn->req_lock); + req = find_request(mdev, root, id, sector, missing_ok, func); if (unlikely(!req)) { - spin_unlock_irq(&mdev->req_lock); - 
dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); - return FALSE; + spin_unlock_irq(&mdev->tconn->req_lock); + return -EIO; } __req_mod(req, what, &m); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); if (m.bio) complete_master_bio(mdev, &m); - return TRUE; + return 0; } -STATIC int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_block_ack *p = (struct p_block_ack *)h; + struct drbd_conf *mdev; + struct p_block_ack *p = pi->data; sector_t sector = be64_to_cpu(p->sector); int blksize = be32_to_cpu(p->blksize); enum drbd_req_event what; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - if (is_syncer_block_id(p->block_id)) { + if (p->block_id == ID_SYNCER) { drbd_set_in_sync(mdev, sector, blksize); dec_rs_pending(mdev); - return TRUE; + return 0; } - switch (be16_to_cpu(h->command)) { + switch (pi->cmd) { case P_RS_WRITE_ACK: - D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); - what = write_acked_by_peer_and_sis; + what = WRITE_ACKED_BY_PEER_AND_SIS; break; case P_WRITE_ACK: - D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); - what = write_acked_by_peer; + what = WRITE_ACKED_BY_PEER; break; case P_RECV_ACK: - D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); - what = recv_acked_by_peer; + what = RECV_ACKED_BY_PEER; break; - case P_DISCARD_ACK: - D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); - what = conflict_discarded_by_peer; + case P_DISCARD_WRITE: + what = DISCARD_WRITE; + break; + case P_RETRY_WRITE: + what = POSTPONE_WRITE; break; default: - D_ASSERT(0); - return FALSE; + BUG(); } return validate_req_change_req_state(mdev, p->block_id, sector, - _ack_id_to_req, __func__ , what); + &mdev->write_requests, __func__, + what, false); } -STATIC int got_NegAck(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_NegAck(struct 
drbd_tconn *tconn, struct packet_info *pi) { - struct p_block_ack *p = (struct p_block_ack *)h; + struct drbd_conf *mdev; + struct p_block_ack *p = pi->data; sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); + int err; - if (DRBD_ratelimit(5*HZ, 5)) - dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n"); + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - if (is_syncer_block_id(p->block_id)) { - int size = be32_to_cpu(p->blksize); + if (p->block_id == ID_SYNCER) { dec_rs_pending(mdev); drbd_rs_failed_io(mdev, sector, size); - return TRUE; + return 0; } - return validate_req_change_req_state(mdev, p->block_id, sector, - _ack_id_to_req, __func__ , neg_acked); + + err = validate_req_change_req_state(mdev, p->block_id, sector, + &mdev->write_requests, __func__, + NEG_ACKED, true); + if (err) { + /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. + The master bio might already be completed, therefore the + request is no longer in the collision hash. */ + /* In Protocol B we might already have got a P_RECV_ACK + but then get a P_NEG_ACK afterwards. 
*/ + drbd_set_out_of_sync(mdev, sector, size); + } + return 0; } -STATIC int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_block_ack *p = (struct p_block_ack *)h; + struct drbd_conf *mdev; + struct p_block_ack *p = pi->data; sector_t sector = be64_to_cpu(p->sector); + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); - dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", + + dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n", (unsigned long long)sector, be32_to_cpu(p->blksize)); return validate_req_change_req_state(mdev, p->block_id, sector, - _ar_id_to_req, __func__ , neg_acked); + &mdev->read_requests, __func__, + NEG_ACKED, false); } -STATIC int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi) { + struct drbd_conf *mdev; sector_t sector; int size; - struct p_block_ack *p = (struct p_block_ack *)h; + struct p_block_ack *p = pi->data; + + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); - D_ASSERT(p->block_id == ID_SYNCER); update_peer_seq(mdev, be32_to_cpu(p->seq_num)); @@ -4290,146 +5234,210 @@ if (get_ldev_if_state(mdev, D_FAILED)) { drbd_rs_complete_io(mdev, sector); - drbd_rs_failed_io(mdev, sector, size); + switch (pi->cmd) { + case P_NEG_RS_DREPLY: + drbd_rs_failed_io(mdev, sector, size); + case P_RS_CANCEL: + break; + default: + BUG(); + } put_ldev(mdev); } - return TRUE; + return 0; } -STATIC int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_barrier_ack *p = (struct p_barrier_ack *)h; + struct drbd_conf *mdev; + struct p_barrier_ack *p = pi->data; - tl_release(mdev, p->barrier, 
be32_to_cpu(p->set_size)); + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + + tl_release(mdev->tconn, p->barrier, be32_to_cpu(p->set_size)); + + if (mdev->state.conn == C_AHEAD && + atomic_read(&mdev->ap_in_flight) == 0 && + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { + mdev->start_resync_timer.expires = jiffies + HZ; + add_timer(&mdev->start_resync_timer); + } - return TRUE; + return 0; } -STATIC int got_OVResult(struct drbd_conf *mdev, struct p_header *h) +STATIC int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi) { - struct p_block_ack *p = (struct p_block_ack *)h; + struct drbd_conf *mdev; + struct p_block_ack *p = pi->data; struct drbd_work *w; sector_t sector; int size; + mdev = vnr_to_mdev(tconn, pi->vnr); + if (!mdev) + return -EIO; + sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); update_peer_seq(mdev, be32_to_cpu(p->seq_num)); if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) - drbd_ov_oos_found(mdev, sector, size); + drbd_ov_out_of_sync_found(mdev, sector, size); else - ov_oos_print(mdev); + ov_out_of_sync_print(mdev); + + if (!get_ldev(mdev)) + return 0; drbd_rs_complete_io(mdev, sector); dec_rs_pending(mdev); - if (--mdev->ov_left == 0) { + --mdev->ov_left; + + /* let's advance progress step marks only for every other megabyte */ + if ((mdev->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(mdev, mdev->ov_left); + + if (mdev->ov_left == 0) { w = kmalloc(sizeof(*w), GFP_NOIO); if (w) { w->cb = w_ov_finished; - drbd_queue_work_front(&mdev->data.work, w); + w->mdev = mdev; + drbd_queue_work_front(&mdev->tconn->data.work, w); } else { dev_err(DEV, "kmalloc(w) failed."); - ov_oos_print(mdev); + ov_out_of_sync_print(mdev); drbd_resync_finished(mdev); } } - return TRUE; + put_ldev(mdev); + return 0; +} + +STATIC int got_skip(struct drbd_tconn *tconn, struct packet_info *pi) +{ + return 0; +} + +static int tconn_finish_peer_reqs(struct drbd_tconn *tconn) +{ + struct drbd_conf *mdev; + int vnr, 
not_empty = 0; + + do { + clear_bit(SIGNAL_ASENDER, &tconn->flags); + flush_signals(current); + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + kref_get(&mdev->kref); + rcu_read_unlock(); + if (drbd_finish_peer_reqs(mdev)) { + kref_put(&mdev->kref, &drbd_minor_destroy); + return 1; + } + kref_put(&mdev->kref, &drbd_minor_destroy); + rcu_read_lock(); + } + set_bit(SIGNAL_ASENDER, &tconn->flags); + + spin_lock_irq(&tconn->req_lock); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + not_empty = !list_empty(&mdev->done_ee); + if (not_empty) + break; + } + spin_unlock_irq(&tconn->req_lock); + rcu_read_unlock(); + } while (not_empty); + + return 0; } struct asender_cmd { size_t pkt_size; - int (*process)(struct drbd_conf *mdev, struct p_header *h); + int (*fn)(struct drbd_tconn *tconn, struct packet_info *); }; -static struct asender_cmd *get_asender_cmd(int cmd) -{ - static struct asender_cmd asender_tbl[] = { - /* anything missing from this table is in - * the drbd_cmd_handler (drbd_default_handler) table, - * see the beginning of drbdd() */ - [P_PING] = { sizeof(struct p_header), got_Ping }, - [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, +static struct asender_cmd asender_tbl[] = { + [P_PING] = { 0, got_Ping }, + [P_PING_ACK] = { 0, got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, - [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_DISCARD_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, - [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, + [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, [P_BARRIER_ACK] = { sizeof(struct 
p_barrier_ack), got_BarrierAck }, [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, - [P_MAX_CMD] = { 0, NULL }, - }; - if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) - return NULL; - return &asender_tbl[cmd]; -} + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, + [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, + [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, + [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, +}; int drbd_asender(struct drbd_thread *thi) { - struct drbd_conf *mdev = thi->mdev; - struct p_header *h = &mdev->meta.rbuf.header; + struct drbd_tconn *tconn = thi->tconn; struct asender_cmd *cmd = NULL; - - int rv, len; - void *buf = h; + struct packet_info pi; + int rv; + void *buf = tconn->meta.rbuf; int received = 0; - int expect = sizeof(struct p_header); - int empty; - - sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); + unsigned int header_size = drbd_header_size(tconn); + int expect = header_size; + bool ping_timeout_active = false; + struct net_conf *nc; + int ping_timeo, tcp_cork, ping_int; current->policy = SCHED_RR; /* Make this a realtime task! 
*/ current->rt_priority = 2; /* more important than all other tasks */ - while (get_t_state(thi) == Running) { - drbd_thread_current_set_cpu(mdev); - if (test_and_clear_bit(SEND_PING, &mdev->flags)) { - ERR_IF(!drbd_send_ping(mdev)) goto reconnect; - mdev->meta.socket->sk->sk_rcvtimeo = - mdev->net_conf->ping_timeo*HZ/10; - } + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); - /* conditionally cork; - * it may hurt latency if we cork without much to send */ - if (!mdev->net_conf->no_cork && - 3 < atomic_read(&mdev->unacked_cnt)) - drbd_tcp_cork(mdev->meta.socket); - while (1) { - clear_bit(SIGNAL_ASENDER, &mdev->flags); - flush_signals(current); - if (!drbd_process_done_ee(mdev)) { - dev_err(DEV, "process_done_ee() = NOT_OK\n"); + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + ping_timeo = nc->ping_timeo; + tcp_cork = nc->tcp_cork; + ping_int = nc->ping_int; + rcu_read_unlock(); + + if (test_and_clear_bit(SEND_PING, &tconn->flags)) { + if (drbd_send_ping(tconn)) { + conn_err(tconn, "drbd_send_ping has failed\n"); goto reconnect; } - /* to avoid race with newly queued ACKs */ - set_bit(SIGNAL_ASENDER, &mdev->flags); - spin_lock_irq(&mdev->req_lock); - empty = list_empty(&mdev->done_ee); - spin_unlock_irq(&mdev->req_lock); - /* new ack may have been queued right here, - * but then there is also a signal pending, - * and we start over... 
*/ - if (empty) - break; + tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + ping_timeout_active = true; + } + + /* TODO: conditionally cork; it may hurt latency if we cork without + much to send */ + if (tcp_cork) + drbd_tcp_cork(tconn->meta.socket); + if (tconn_finish_peer_reqs(tconn)) { + conn_err(tconn, "tconn_finish_peer_reqs() failed\n"); + goto reconnect; } /* but unconditionally uncork unless disabled */ - if (!mdev->net_conf->no_cork) - drbd_tcp_uncork(mdev->meta.socket); + if (tcp_cork) + drbd_tcp_uncork(tconn->meta.socket); /* short circuit, recv_msg would return EINTR anyways. */ if (signal_pending(current)) continue; - rv = drbd_recv_short(mdev, mdev->meta.socket, - buf, expect-received, 0); - clear_bit(SIGNAL_ASENDER, &mdev->flags); + rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0); + clear_bit(SIGNAL_ASENDER, &tconn->flags); flush_signals(current); @@ -4447,70 +5455,78 @@ received += rv; buf += rv; } else if (rv == 0) { - dev_err(DEV, "meta connection shut down by peer.\n"); + conn_err(tconn, "meta connection shut down by peer.\n"); goto reconnect; } else if (rv == -EAGAIN) { - if (mdev->meta.socket->sk->sk_rcvtimeo == - mdev->net_conf->ping_timeo*HZ/10) { - dev_err(DEV, "PingAck did not arrive in time.\n"); + /* If the data socket received something meanwhile, + * that is good enough: peer is still alive. */ + if (time_after(tconn->last_received, + jiffies - tconn->meta.socket->sk->sk_rcvtimeo)) + continue; + if (ping_timeout_active) { + conn_err(tconn, "PingAck did not arrive in time.\n"); goto reconnect; } - set_bit(SEND_PING, &mdev->flags); + set_bit(SEND_PING, &tconn->flags); continue; } else if (rv == -EINTR) { continue; } else { - dev_err(DEV, "sock_recvmsg returned %d\n", rv); + conn_err(tconn, "sock_recvmsg returned %d\n", rv); goto reconnect; } if (received == expect && cmd == NULL) { - if (unlikely(h->magic != BE_DRBD_MAGIC)) { - dev_err(DEV, "magic?? 
on meta m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + if (decode_header(tconn, tconn->meta.rbuf, &pi)) goto reconnect; - } - cmd = get_asender_cmd(be16_to_cpu(h->command)); - len = be16_to_cpu(h->length); - if (unlikely(cmd == NULL)) { - dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", - (long)be32_to_cpu(h->magic), - h->command, h->length); + cmd = &asender_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); goto disconnect; } - expect = cmd->pkt_size; - ERR_IF(len != expect-sizeof(struct p_header)) { - trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); - DUMPI(expect); + expect = header_size + cmd->pkt_size; + if (pi.size != expect - header_size) { + conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n", + pi.cmd, pi.size); goto reconnect; } } if (received == expect) { - D_ASSERT(cmd != NULL); - trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); - if (!cmd->process(mdev, h)) + bool err; + + err = cmd->fn(tconn, &pi); + if (err) { + conn_err(tconn, "%pf failed\n", cmd->fn); goto reconnect; + } + + tconn->last_received = jiffies; + + if (cmd == &asender_tbl[P_PING_ACK]) { + /* restore idle timeout */ + tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + ping_timeout_active = false; + } - buf = h; + buf = tconn->meta.rbuf; received = 0; - expect = sizeof(struct p_header); + expect = header_size; cmd = NULL; } } if (0) { reconnect: - drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); } if (0) { disconnect: - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD); } - clear_bit(SIGNAL_ASENDER, &mdev->flags); + clear_bit(SIGNAL_ASENDER, &tconn->flags); - D_ASSERT(mdev->state.conn < C_CONNECTED); - dev_info(DEV, "asender 
terminated\n"); + conn_info(tconn, "asender terminated\n"); return 0; } diff -Nru drbd8-8.3.7/drbd/drbd_req.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_req.c --- drbd8-8.3.7/drbd/drbd_req.c 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_req.c 2012-02-02 14:09:14.000000000 +0000 @@ -29,7 +29,6 @@ #include #include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_req.h" @@ -43,6 +42,8 @@ #define _drbd_end_io_acct(...) do {} while (0) #else +STATIC bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); + /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) { @@ -60,6 +61,8 @@ cpu = part_stat_lock(); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); + (void) cpu; /* The macro invocations above want the cpu argument, I do not like + the compiler warning about cpu only assigned but never used... */ part_inc_in_flight(&mdev->vdisk->part0, rw); part_stat_unlock(); #endif @@ -89,33 +92,67 @@ #endif +static struct drbd_request *drbd_req_new(struct drbd_conf *mdev, + struct bio *bio_src) +{ + struct drbd_request *req; + + req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (!req) + return NULL; + + drbd_req_make_private_bio(req, bio_src); + req->rq_state = bio_data_dir(bio_src) == WRITE ? 
RQ_WRITE : 0; + req->w.mdev = mdev; + req->master_bio = bio_src; + req->epoch = 0; + + drbd_clear_interval(&req->i); + req->i.sector = bio_src->bi_sector; + req->i.size = bio_src->bi_size; + req->i.local = true; + req->i.waiting = false; + + INIT_LIST_HEAD(&req->tl_requests); + INIT_LIST_HEAD(&req->w.list); + + return req; +} + +static void drbd_req_free(struct drbd_request *req) +{ + mempool_free(req, drbd_request_mempool); +} + /* rw is bio_data_dir(), only READ or WRITE */ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) { const unsigned long s = req->rq_state; + + /* remove it from the transfer log. + * well, only if it had been there in the first + * place... if it had not (local only or conflicting + * and never sent), it should still be "empty" as + * initialized in drbd_req_new(), so we can list_del() it + * here unconditionally */ + list_del(&req->tl_requests); + /* if it was a write, we may have to set the corresponding * bit(s) out-of-sync first. If it had a local part, we need to * release the reference to the activity log. */ if (rw == WRITE) { - /* remove it from the transfer log. - * well, only if it had been there in the first - * place... if it had not (local only or conflicting - * and never sent), it should still be "empty" as - * initialized in drbd_req_new(), so we can list_del() it - * here unconditionally */ - list_del(&req->tl_requests); /* Set out-of-sync unless both OK flags are set * (local only or remote failed). 
* Other places where we set out-of-sync: * READ with local io-error */ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) - drbd_set_out_of_sync(mdev, req->sector, req->size); + drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) - drbd_set_in_sync(mdev, req->sector, req->size); + drbd_set_in_sync(mdev, req->i.sector, req->i.size); /* one might be tempted to move the drbd_al_complete_io - * to the local io completion callback drbd_endio_pri. + * to the local io completion callback drbd_request_endio. * but, if this was a mirror write, we may only * drbd_al_complete_io after this is RQ_NET_DONE, * otherwise the extent could be dropped from the al @@ -126,136 +163,83 @@ */ if (s & RQ_LOCAL_MASK) { if (get_ldev_if_state(mdev, D_FAILED)) { - drbd_al_complete_io(mdev, req->sector); + if (s & RQ_IN_ACT_LOG) + drbd_al_complete_io(mdev, &req->i); put_ldev(mdev); } else if (DRBD_ratelimit(5*HZ, 3)) { - dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " + dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), " "but my Disk seems to have failed :(\n", - (unsigned long long) req->sector); + (unsigned long long) req->i.sector, req->i.size); } } } - /* if it was a local io error, we want to notify our - * peer about that, and see if we need to - * detach the disk and stuff. - * to avoid allocating some special work - * struct, reuse the request. */ - - /* THINK - * why do we do this not when we detect the error, - * but delay it until it is "done", i.e. possibly - * until the next barrier ack? 
*/ - - if (rw == WRITE && - ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { - if (!(req->w.list.next == LIST_POISON1 || - list_empty(&req->w.list))) { - /* DEBUG ASSERT only; if this triggers, we - * probably corrupt the worker list here */ - DUMPP(req->w.list.next); - DUMPP(req->w.list.prev); - } - req->w.cb = w_io_error; - drbd_queue_work(&mdev->data.work, &req->w); - /* drbd_req_free() is done in w_io_error */ - } else { - drbd_req_free(req); - } + drbd_req_free(req); } static void queue_barrier(struct drbd_conf *mdev) { struct drbd_tl_epoch *b; + struct drbd_tconn *tconn = mdev->tconn; /* We are within the req_lock. Once we queued the barrier for sending, * we set the CREATE_BARRIER bit. It is cleared as soon as a new * barrier/epoch object is added. This is the only place this bit is * set. It indicates that the barrier for this epoch is already queued, * and no new epoch has been created yet. */ - if (test_bit(CREATE_BARRIER, &mdev->flags)) + if (test_bit(CREATE_BARRIER, &tconn->flags)) return; - b = mdev->newest_tle; + b = tconn->newest_tle; b->w.cb = w_send_barrier; + b->w.mdev = mdev; /* inc_ap_pending done here, so we won't * get imbalanced on connection loss. * dec_ap_pending will be done in got_BarrierAck * or (on connection loss) in tl_clear. */ inc_ap_pending(mdev); - drbd_queue_work(&mdev->data.work, &b->w); - set_bit(CREATE_BARRIER, &mdev->flags); + drbd_queue_work(&tconn->data.work, &b->w); + set_bit(CREATE_BARRIER, &tconn->flags); } static void _about_to_complete_local_write(struct drbd_conf *mdev, struct drbd_request *req) { const unsigned long s = req->rq_state; - struct drbd_request *i; - struct drbd_epoch_entry *e; - struct hlist_node *n; - struct hlist_head *slot; - /* before we can signal completion to the upper layers, - * we may need to close the current epoch */ + /* Before we can signal completion to the upper layers, + * we may need to close the current epoch. 
+ * We can skip this, if this request has not even been sent, because we + * did not have a fully established connection yet/anymore, during + * bitmap exchange, or while we are C_AHEAD due to congestion policy. + */ if (mdev->state.conn >= C_CONNECTED && - req->epoch == mdev->newest_tle->br_number) + (s & RQ_NET_SENT) != 0 && + req->epoch == mdev->tconn->newest_tle->br_number) queue_barrier(mdev); - - /* we need to do the conflict detection stuff, - * if we have the ee_hash (two_primaries) and - * this has been on the network */ - if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { - const sector_t sector = req->sector; - const int size = req->size; - - /* ASSERT: - * there must be no conflicting requests, since - * they must have been failed on the spot */ -#define OVERLAPS overlaps(sector, size, i->sector, i->size) - slot = tl_hash_slot(mdev, sector); - hlist_for_each_entry(i, n, slot, colision) { - if (OVERLAPS) { - dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " - "other: %p %llus +%u\n", - req, (unsigned long long)sector, size, - i, (unsigned long long)i->sector, i->size); - } - } - - /* maybe "wake" those conflicting epoch entries - * that wait for this request to finish. - * - * currently, there can be only _one_ such ee - * (well, or some more, which would be pending - * P_DISCARD_ACK not yet sent by the asender...), - * since we block the receiver thread upon the - * first conflict detection, which will wait on - * misc_wait. maybe we want to assert that? - * - * anyways, if we found one, - * we just have to do a wake_up. 
*/ -#undef OVERLAPS -#define OVERLAPS overlaps(sector, size, e->sector, e->size) - slot = ee_hash_slot(mdev, req->sector); - hlist_for_each_entry(e, n, slot, colision) { - if (OVERLAPS) { - wake_up(&mdev->misc_wait); - break; - } - } - } -#undef OVERLAPS } void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m) { - trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL); bio_endio(m->bio, m->error); dec_ap_bio(mdev); } + +static void drbd_remove_request_interval(struct rb_root *root, + struct drbd_request *req) +{ + struct drbd_conf *mdev = req->w.mdev; + struct drbd_interval *i = &req->i; + + drbd_remove_interval(root, i); + + /* Wake up any processes waiting for this request to complete. */ + if (i->waiting) + wake_up(&mdev->misc_wait); +} + /* Helper for __req_mod(). * Set m->bio to the master bio, if it is fit to be completed, * or leave it alone (it is initialized to NULL in __req_mod), @@ -265,11 +249,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { const unsigned long s = req->rq_state; - struct drbd_conf *mdev = req->mdev; - /* only WRITES may end up here without a master bio (on barrier ack) */ - int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; - - trace_drbd_req(req, nothing, "_req_may_be_done"); + struct drbd_conf *mdev = req->w.mdev; + int rw = req->rq_state & RQ_WRITE ? WRITE : READ; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -280,18 +261,22 @@ * the receiver, * the bio_endio completion callbacks. */ + if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) + return; + if (req->i.waiting) { + /* Retry all conflicting peer requests. 
*/ + wake_up(&mdev->misc_wait); + } if (s & RQ_NET_QUEUED) return; if (s & RQ_NET_PENDING) return; - if (s & RQ_LOCAL_PENDING) - return; if (req->master_bio) { - /* this is data_received (remote read) + /* this is DATA_RECEIVED (remote read) * or protocol C P_WRITE_ACK * or protocol B P_RECV_ACK - * or protocol A "handed_over_to_network" (SendAck) + * or protocol A "HANDED_OVER_TO_NETWORK" (SendAck) * or canceled or failed, * or killed from the transfer log due to connection loss. */ @@ -307,17 +292,23 @@ * what we need to do here is just: complete the master_bio. * * local completion error, if any, has been stored as ERR_PTR - * in private_bio within drbd_endio_pri. + * in private_bio within drbd_request_endio. */ int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); int error = PTR_ERR(req->private_bio); /* remove the request from the conflict detection * respective block_id verification hash */ - if (!hlist_unhashed(&req->colision)) - hlist_del(&req->colision); - else - D_ASSERT((s & RQ_NET_MASK) == 0); + if (!drbd_interval_empty(&req->i)) { + struct rb_root *root; + + if (rw == WRITE) + root = &mdev->write_requests; + else + root = &mdev->read_requests; + drbd_remove_request_interval(root, req); + } else if (!(s & RQ_POSTPONED)) + D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); /* for writes we need to do some extra housekeeping */ if (rw == WRITE) @@ -326,108 +317,32 @@ /* Update disk stats */ _drbd_end_io_acct(mdev, req); - m->error = ok ? 0 : (error ?: -EIO); - m->bio = req->master_bio; + if (!(s & RQ_POSTPONED)) { + m->error = ok ? 0 : (error ?: -EIO); + m->bio = req->master_bio; + } req->master_bio = NULL; } + if (s & RQ_LOCAL_PENDING) + return; + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { /* this is disconnected (local only) operation, - * or protocol C P_WRITE_ACK, - * or protocol A or B P_BARRIER_ACK, + * or protocol A, B, or C P_BARRIER_ACK, * or killed from the transfer log due to connection loss. 
*/ _req_is_done(mdev, req, rw); } /* else: network part and not DONE yet. that is - * protocol A or B, barrier ack still pending... */ + * protocol A, B, or C, barrier ack still pending... */ } -/* - * checks whether there was an overlapping request - * or ee already registered. - * - * if so, return 1, in which case this request is completed on the spot, - * without ever being submitted or send. - * - * return 0 if it is ok to submit this request. - * - * NOTE: - * paranoia: assume something above us is broken, and issues different write - * requests for the same block simultaneously... - * - * To ensure these won't be reordered differently on both nodes, resulting in - * diverging data sets, we discard the later one(s). Not that this is supposed - * to happen, but this is the rationale why we also have to check for - * conflicting requests with local origin, and why we have to do so regardless - * of whether we allowed multiple primaries. - * - * BTW, in case we only have one primary, the ee_hash is empty anyways, and the - * second hlist_for_each_entry becomes a noop. This is even simpler than to - * grab a reference on the net_conf, and check for the two_primaries flag... - */ -STATIC int _req_conflicts(struct drbd_request *req) +static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m) { - struct drbd_conf *mdev = req->mdev; - const sector_t sector = req->sector; - const int size = req->size; - struct drbd_request *i; - struct drbd_epoch_entry *e; - struct hlist_node *n; - struct hlist_head *slot; - - D_ASSERT(hlist_unhashed(&req->colision)); - - if (!get_net_conf(mdev)) - return 0; - - /* BUG_ON */ - ERR_IF (mdev->tl_hash_s == 0) - goto out_no_conflict; - BUG_ON(mdev->tl_hash == NULL); - -#define OVERLAPS overlaps(i->sector, i->size, sector, size) - slot = tl_hash_slot(mdev, sector); - hlist_for_each_entry(i, n, slot, colision) { - if (OVERLAPS) { - dev_alert(DEV, "%s[%u] Concurrent local write detected! 
" - "[DISCARD L] new: %llus +%u; " - "pending: %llus +%u\n", - current->comm, current->pid, - (unsigned long long)sector, size, - (unsigned long long)i->sector, i->size); - goto out_conflict; - } - } - - if (mdev->ee_hash_s) { - /* now, check for overlapping requests with remote origin */ - BUG_ON(mdev->ee_hash == NULL); -#undef OVERLAPS -#define OVERLAPS overlaps(e->sector, e->size, sector, size) - slot = ee_hash_slot(mdev, sector); - hlist_for_each_entry(e, n, slot, colision) { - if (OVERLAPS) { - dev_alert(DEV, "%s[%u] Concurrent remote write detected!" - " [DISCARD L] new: %llus +%u; " - "pending: %llus +%u\n", - current->comm, current->pid, - (unsigned long long)sector, size, - (unsigned long long)e->sector, e->size); - goto out_conflict; - } - } - } -#undef OVERLAPS - -out_no_conflict: - /* this is like it should be, and what we expected. - * our users do behave after all... */ - put_net_conf(mdev); - return 0; + struct drbd_conf *mdev = req->w.mdev; -out_conflict: - put_net_conf(mdev); - return 1; + if (!drbd_suspended(mdev)) + _req_may_be_done(req, m); } /* obviously this could be coded as many single functions @@ -442,13 +357,15 @@ * and it enforces that we have to think in a very structured manner * about the "events" that may happen to a request during its life time ... */ -void __req_mod(struct drbd_request *req, enum drbd_req_event what, +int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m) { - struct drbd_conf *mdev = req->mdev; - m->bio = NULL; + struct drbd_conf *mdev = req->w.mdev; + struct net_conf *nc; + int p, rv = 0; - trace_drbd_req(req, what, NULL); + if (m) + m->bio = NULL; switch (what) { default: @@ -457,92 +374,106 @@ /* does not happen... 
* initialization done in drbd_req_new - case created: + case CREATED: break; */ - case to_be_send: /* via network */ - /* reached via drbd_make_request_common + case TO_BE_SENT: /* via network */ + /* reached via __drbd_make_request * and from w_read_retry_remote */ D_ASSERT(!(req->rq_state & RQ_NET_MASK)); req->rq_state |= RQ_NET_PENDING; + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + p = nc->wire_protocol; + rcu_read_unlock(); + req->rq_state |= + p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : + p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; inc_ap_pending(mdev); break; - case to_be_submitted: /* locally */ - /* reached via drbd_make_request_common */ + case TO_BE_SUBMITTED: /* locally */ + /* reached via __drbd_make_request */ D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); req->rq_state |= RQ_LOCAL_PENDING; break; - case completed_ok: - if (bio_data_dir(req->master_bio) == WRITE) - mdev->writ_cnt += req->size>>9; + case COMPLETED_OK: + if (req->rq_state & RQ_WRITE) + mdev->writ_cnt += req->i.size >> 9; else - mdev->read_cnt += req->size>>9; + mdev->read_cnt += req->i.size >> 9; req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); req->rq_state &= ~RQ_LOCAL_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; - case write_completed_with_error: + case ABORT_DISK_IO: + req->rq_state |= RQ_LOCAL_ABORTED; + if (req->rq_state & RQ_WRITE) + _req_may_be_done_not_susp(req, m); + else + goto goto_queue_for_net_read; + break; + + case WRITE_COMPLETED_WITH_ERROR: req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", - (unsigned long long)req->sector, req->size); - /* and now: check how to handle local io error. 
*/ - __drbd_chk_io_error(mdev, FALSE); - _req_may_be_done(req, m); + __drbd_chk_io_error(mdev, false); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; - case read_ahead_completed_with_error: + case READ_AHEAD_COMPLETED_WITH_ERROR: /* it is legal to fail READA */ req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); put_ldev(mdev); break; - case read_completed_with_error: - drbd_set_out_of_sync(mdev, req->sector, req->size); + case READ_COMPLETED_WITH_ERROR: + drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", - (unsigned long long)req->sector, req->size); - /* _req_mod(req,to_be_send); oops, recursion... */ D_ASSERT(!(req->rq_state & RQ_NET_MASK)); - req->rq_state |= RQ_NET_PENDING; - inc_ap_pending(mdev); - __drbd_chk_io_error(mdev, FALSE); + __drbd_chk_io_error(mdev, false); put_ldev(mdev); - /* NOTE: if we have no connection, - * or know the peer has no good data either, - * then we don't actually need to "queue_for_net_read", - * but we do so anyways, since the drbd_io_error() - * and the potential state change to "Diskless" - * needs to be done from process context */ - /* fall through: _req_mod(req,queue_for_net_read); */ + goto_queue_for_net_read: + + /* no point in retrying if there is no good remote data, + * or we have no connection. */ + if (mdev->state.pdsk != D_UP_TO_DATE) { + _req_may_be_done_not_susp(req, m); + break; + } + + /* _req_mod(req,TO_BE_SENT); oops, recursion... */ + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + /* fall through: _req_mod(req,QUEUE_FOR_NET_READ); */ - case queue_for_net_read: + case QUEUE_FOR_NET_READ: /* READ or READA, and * no local disk, * or target area marked as invalid, * or just got an io-error. 
*/ - /* from drbd_make_request_common + /* from __drbd_make_request * or from bio_endio during read io-error recovery */ /* so we can verify the handle in the answer packet * corresponding hlist_del is in _req_may_be_done() */ - hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); + D_ASSERT(drbd_interval_empty(&req->i)); + drbd_insert_interval(&mdev->read_requests, &req->i); set_bit(UNPLUG_REMOTE, &mdev->flags); @@ -551,15 +482,16 @@ req->w.cb = (req->rq_state & RQ_LOCAL_MASK) ? w_read_retry_remote : w_send_read_req; - drbd_queue_work(&mdev->data.work, &req->w); + drbd_queue_work(&mdev->tconn->data.work, &req->w); break; - case queue_for_net_write: + case QUEUE_FOR_NET_WRITE: /* assert something? */ - /* from drbd_make_request_common only */ + /* from __drbd_make_request only */ - hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); /* corresponding hlist_del is in _req_may_be_done() */ + D_ASSERT(drbd_interval_empty(&req->i)); + drbd_insert_interval(&mdev->write_requests, &req->i); /* NOTE * In case the req ended up on the transfer log before being @@ -570,7 +502,7 @@ * * _req_add_to_epoch(req); this has to be after the * _maybe_start_new_epoch(req); which happened in - * drbd_make_request_common, because we now may set the bit + * __drbd_make_request, because we now may set the bit * again ourselves to close the current epoch. * * Add req to the (now) current epoch (barrier). */ @@ -580,44 +512,57 @@ * hurting performance. 
*/ set_bit(UNPLUG_REMOTE, &mdev->flags); - /* see drbd_make_request_common, + /* see __drbd_make_request, * just after it grabs the req_lock */ - D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); + D_ASSERT(test_bit(CREATE_BARRIER, &mdev->tconn->flags) == 0); - req->epoch = mdev->newest_tle->br_number; - list_add_tail(&req->tl_requests, - &mdev->newest_tle->requests); + req->epoch = mdev->tconn->newest_tle->br_number; /* increment size of current epoch */ - mdev->newest_tle->n_req++; + mdev->tconn->newest_tle->n_writes++; /* queue work item to send data */ D_ASSERT(req->rq_state & RQ_NET_PENDING); req->rq_state |= RQ_NET_QUEUED; req->w.cb = w_send_dblock; - drbd_queue_work(&mdev->data.work, &req->w); + drbd_queue_work(&mdev->tconn->data.work, &req->w); /* close the epoch, in case it outgrew the limit */ - if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + p = nc->max_epoch_size; + rcu_read_unlock(); + if (mdev->tconn->newest_tle->n_writes >= p) queue_barrier(mdev); break; - case send_canceled: + case QUEUE_FOR_SEND_OOS: + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = w_send_out_of_sync; + drbd_queue_work(&mdev->tconn->data.work, &req->w); + break; + + case OOS_HANDED_TO_NETWORK: + /* actually the same */ + case SEND_CANCELED: /* treat it the same */ - case send_failed: + case SEND_FAILED: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ req->rq_state &= ~RQ_NET_QUEUED; /* if we did it right, tl_clear should be scheduled only after * this, so this should not be necessary! */ - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; - case handed_over_to_network: + case HANDED_OVER_TO_NETWORK: /* assert something? 
*/ + if (bio_data_dir(req->master_bio) == WRITE) + atomic_add(req->i.size >> 9, &mdev->ap_in_flight); + if (bio_data_dir(req->master_bio) == WRITE && - mdev->net_conf->wire_protocol == DRBD_PROT_A) { + !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) { /* this is what is dangerous about protocol A: * pretend it was successfully written on the peer. */ if (req->rq_state & RQ_NET_PENDING) { @@ -632,39 +577,48 @@ req->rq_state &= ~RQ_NET_QUEUED; req->rq_state |= RQ_NET_SENT; /* because _drbd_send_zc_bio could sleep, and may want to - * dereference the bio even after the "write_acked_by_peer" and - * "completed_ok" events came in, once we return from + * dereference the bio even after the "WRITE_ACKED_BY_PEER" and + * "COMPLETED_OK" events came in, once we return from * _drbd_send_zc_bio (drbd_send_dblock), we have to check * whether it is done already, and end it. */ - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; - case connection_lost_while_pending: + case READ_RETRY_REMOTE_CANCELED: + req->rq_state &= ~RQ_NET_QUEUED; + /* fall through, in case we raced with drbd_disconnect */ + case CONNECTION_LOST_WHILE_PENDING: /* transfer log cleanup after connection loss */ /* assert something? */ if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev); + + p = !(req->rq_state & RQ_WRITE) && req->rq_state & RQ_NET_PENDING; + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); req->rq_state |= RQ_NET_DONE; + if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE) + atomic_sub(req->i.size >> 9, &mdev->ap_in_flight); + /* if it is still queued, we may not complete it here. * it will be canceled soon. 
*/ - if (!(req->rq_state & RQ_NET_QUEUED)) - _req_may_be_done(req, m); + if (!(req->rq_state & RQ_NET_QUEUED)) { + if (p) + goto goto_read_retry_local; + _req_may_be_done(req, m); /* Allowed while state.susp */ + } break; - case write_acked_by_peer_and_sis: + case WRITE_ACKED_BY_PEER_AND_SIS: req->rq_state |= RQ_NET_SIS; - case conflict_discarded_by_peer: + case DISCARD_WRITE: /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential * node crashes are covered by the activity log. */ - if (what == conflict_discarded_by_peer) - dev_alert(DEV, "Got DiscardAck packet %llus +%u!" - " DRBD is not a random data generator!\n", - (unsigned long long)req->sector, req->size); req->rq_state |= RQ_NET_DONE; /* fall through */ - case write_acked_by_peer: + case WRITE_ACKED_BY_PEER: + D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); /* protocol C; successfully written on peer. * Nothing to do here. * We want to keep the tl in place for all protocols, to cater @@ -675,51 +629,129 @@ * request could set NET_DONE right here, and not wait for the * P_BARRIER_ACK, but that is an unnecessary optimization. */ + goto ack_common; /* this makes it effectively the same as for: */ - case recv_acked_by_peer: + case RECV_ACKED_BY_PEER: + D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK); /* protocol B; pretends to be successfully written on peer. - * see also notes above in handed_over_to_network about + * see also notes above in HANDED_OVER_TO_NETWORK about * protocol != C */ + ack_common: req->rq_state |= RQ_NET_OK; D_ASSERT(req->rq_state & RQ_NET_PENDING); dec_ap_pending(mdev); + atomic_sub(req->i.size >> 9, &mdev->ap_in_flight); req->rq_state &= ~RQ_NET_PENDING; - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; - case neg_acked: + case POSTPONE_WRITE: + D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK); + /* If this node has already detected the write conflict, the + * worker will be waiting on misc_wait. 
Wake it up once this + * request has completed locally. + */ + D_ASSERT(req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_POSTPONED; + _req_may_be_done_not_susp(req, m); + break; + + case NEG_ACKED: /* assert something? */ - if (req->rq_state & RQ_NET_PENDING) + if (req->rq_state & RQ_NET_PENDING) { dec_ap_pending(mdev); + if (req->rq_state & RQ_WRITE) + atomic_sub(req->i.size >> 9, &mdev->ap_in_flight); + } req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); req->rq_state |= RQ_NET_DONE; - _req_may_be_done(req, m); - /* else: done by handed_over_to_network */ + + if (!(req->rq_state & RQ_WRITE)) + goto goto_read_retry_local; + + _req_may_be_done_not_susp(req, m); + /* else: done by HANDED_OVER_TO_NETWORK */ + break; + + goto_read_retry_local: + if (!drbd_may_do_local_read(mdev, req->i.sector, req->i.size)) { + _req_may_be_done_not_susp(req, m); + break; + } + D_ASSERT(!(req->rq_state & RQ_LOCAL_PENDING)); + req->rq_state |= RQ_LOCAL_PENDING; + + get_ldev(mdev); + req->w.cb = w_restart_disk_io; + drbd_queue_work(&mdev->tconn->data.work, &req->w); + break; + + case FAIL_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + _req_may_be_done(req, m); /* Allowed while state.susp */ + break; + + case RESTART_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + req->rq_state &= ~RQ_LOCAL_COMPLETED; + + rv = MR_READ; + if (bio_data_dir(req->master_bio) == WRITE) + rv = MR_WRITE; + + get_ldev(mdev); + req->w.cb = w_restart_disk_io; + drbd_queue_work(&mdev->tconn->data.work, &req->w); break; - case barrier_acked: + case RESEND: + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK + before the connection loss (B&C only); only P_BARRIER_ACK was missing. + Trowing them out of the TL here by pretending we got a BARRIER_ACK + We ensure that the peer was not rebooted */ + if (!(req->rq_state & RQ_NET_OK)) { + if (req->w.cb) { + drbd_queue_work(&mdev->tconn->data.work, &req->w); + rv = req->rq_state & RQ_WRITE ? 
MR_WRITE : MR_READ; + } + break; + } + /* else, fall through to BARRIER_ACKED */ + + case BARRIER_ACKED: + if (!(req->rq_state & RQ_WRITE)) + break; + if (req->rq_state & RQ_NET_PENDING) { - /* barrier came in before all requests have been acked. + /* barrier came in before all requests were acked. * this is bad, because if the connection is lost now, * we won't be able to clean them up... */ - dev_err(DEV, "FIXME (barrier_acked but pending)\n"); - trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)"); - list_move(&req->tl_requests, &mdev->out_of_sequence_requests); + dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n"); + list_move(&req->tl_requests, &mdev->tconn->out_of_sequence_requests); } - D_ASSERT(req->rq_state & RQ_NET_SENT); - req->rq_state |= RQ_NET_DONE; - _req_may_be_done(req, m); + if ((req->rq_state & RQ_NET_MASK) != 0) { + req->rq_state |= RQ_NET_DONE; + if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) + atomic_sub(req->i.size>>9, &mdev->ap_in_flight); + } + _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case data_received: + case DATA_RECEIVED: D_ASSERT(req->rq_state & RQ_NET_PENDING); dec_ap_pending(mdev); req->rq_state &= ~RQ_NET_PENDING; req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); - _req_may_be_done(req, m); + _req_may_be_done_not_susp(req, m); break; }; + + return rv; } /* we may do a local read if: @@ -729,39 +761,98 @@ * since size may be bigger than BM_BLOCK_SIZE, * we may need to check several bits. 
*/ -STATIC int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) +STATIC bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) { unsigned long sbnr, ebnr; sector_t esector, nr_sectors; if (mdev->state.disk == D_UP_TO_DATE) - return 1; - if (mdev->state.disk >= D_OUTDATED) - return 0; - if (mdev->state.disk < D_INCONSISTENT) - return 0; - /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ - nr_sectors = drbd_get_capacity(mdev->this_bdev); + return true; + if (mdev->state.disk != D_INCONSISTENT) + return false; esector = sector + (size >> 9) - 1; - + nr_sectors = drbd_get_capacity(mdev->this_bdev); D_ASSERT(sector < nr_sectors); D_ASSERT(esector < nr_sectors); sbnr = BM_SECT_TO_BIT(sector); ebnr = BM_SECT_TO_BIT(esector); - return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); + return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0; +} + +static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector) +{ + enum drbd_read_balancing rbm; + struct backing_dev_info *bdi; + int stripe_shift; + + if (mdev->state.pdsk < D_UP_TO_DATE) + return false; + + rcu_read_lock(); + rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing; + rcu_read_unlock(); + + switch (rbm) { + case RB_CONGESTED_REMOTE: + bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info; + return bdi_read_congested(bdi); + case RB_LEAST_PENDING: + return atomic_read(&mdev->local_cnt) > + atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt); + case RB_32K_STRIPING: /* stripe_shift = 15 */ + case RB_64K_STRIPING: + case RB_128K_STRIPING: + case RB_256K_STRIPING: + case RB_512K_STRIPING: + case RB_1M_STRIPING: /* stripe_shift = 20 */ + stripe_shift = (rbm - RB_32K_STRIPING + 15); + return (sector >> (stripe_shift - 9)) & 1; + case RB_ROUND_ROBIN: + return test_and_change_bit(READ_BALANCE_RR, &mdev->flags); + case RB_PREFER_REMOTE: + return true; + case RB_PREFER_LOCAL: + default: + return false; + } } 
-STATIC int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) +/* + * complete_conflicting_writes - wait for any conflicting write requests + * + * The write_requests tree contains all active write requests which we + * currently know about. Wait for any requests to complete which conflict with + * the new one. + */ +static int complete_conflicting_writes(struct drbd_conf *mdev, + sector_t sector, int size) +{ + for(;;) { + struct drbd_interval *i; + int err; + + i = drbd_find_overlap(&mdev->write_requests, sector, size); + if (!i) + return 0; + err = drbd_wait_misc(mdev, i); + if (err) + return err; + } +} + +int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { const int rw = bio_rw(bio); const int size = bio->bi_size; const sector_t sector = bio->bi_sector; struct drbd_tl_epoch *b = NULL; struct drbd_request *req; - int local, remote; - int err = -EIO; + struct net_conf *nc; + int local, remote, send_oos = 0; + int err; + int ret = 0; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -773,8 +864,7 @@ bio_endio(bio, -ENOMEM); return 0; } - - trace_drbd_bio(mdev, "Rq", bio, 0, req); + req->start_time = start_time; local = get_ldev(mdev); if (!local) { @@ -786,7 +876,8 @@ } else { /* READ || READA */ if (local) { - if (!drbd_may_do_local_read(mdev, sector, size)) { + if (!drbd_may_do_local_read(mdev, sector, size) || + remote_due_to_read_balancing(mdev, sector)) { /* we could kick the syncer to * sync this extent asap, wait for * it, then continue locally. @@ -819,15 +910,19 @@ * resync extent to finish, and, if necessary, pulls in the target * extent into the activity log, which involves further disk io because * of transactional on-disk meta data updates. 
*/ - if (rw == WRITE && local) - drbd_al_begin_io(mdev, sector); + if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { + req->rq_state |= RQ_IN_ACT_LOG; + drbd_al_begin_io(mdev, &req->i); + } - remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || - (mdev->state.pdsk == D_INCONSISTENT && - mdev->state.conn >= C_CONNECTED)); + remote = remote && drbd_should_do_remote(mdev->state); + send_oos = rw == WRITE && drbd_should_send_out_of_sync(mdev->state); + D_ASSERT(!(remote && send_oos)); - if (!(local || remote)) { - dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + if (!(local || remote) && !drbd_suspended(mdev)) { + if (DRBD_ratelimit(5*HZ, 3)) + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + err = -EIO; goto fail_free_complete; } @@ -837,9 +932,9 @@ * but there is a race between testing the bit and pointer outside the * spinlock, and grabbing the spinlock. * if we lost that race, we retry. */ - if (rw == WRITE && remote && - mdev->unused_spare_tle == NULL && - test_bit(CREATE_BARRIER, &mdev->flags)) { + if (rw == WRITE && (remote || send_oos) && + mdev->tconn->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->tconn->flags)) { allocate_barrier: b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); if (!b) { @@ -850,31 +945,56 @@ } /* GOOD, everything prepared, grab the spin_lock */ - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); - if (remote) { - remote = (mdev->state.pdsk == D_UP_TO_DATE || - (mdev->state.pdsk == D_INCONSISTENT && - mdev->state.conn >= C_CONNECTED)); - if (!remote) + if (rw == WRITE) { + err = complete_conflicting_writes(mdev, sector, size); + if (err) { + if (err != -ERESTARTSYS) + _conn_request_state(mdev->tconn, + NS(conn, C_TIMEOUT), + CS_HARD); + spin_unlock_irq(&mdev->tconn->req_lock); + err = -EIO; + goto fail_free_complete; + } + } + + if (drbd_suspended(mdev)) { + /* If we got suspended, use the retry mechanism in + drbd_make_request() to restart processing of 
this + bio. In the next call to drbd_make_request + we sleep in inc_ap_bio() */ + ret = 1; + spin_unlock_irq(&mdev->tconn->req_lock); + goto fail_free_complete; + } + + if (remote || send_oos) { + remote = drbd_should_do_remote(mdev->state); + send_oos = rw == WRITE && drbd_should_send_out_of_sync(mdev->state); + D_ASSERT(!(remote && send_oos)); + + if (!(remote || send_oos)) dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); if (!(local || remote)) { dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); + err = -EIO; goto fail_free_complete; } } - if (b && mdev->unused_spare_tle == NULL) { - mdev->unused_spare_tle = b; + if (b && mdev->tconn->unused_spare_tle == NULL) { + mdev->tconn->unused_spare_tle = b; b = NULL; } - if (rw == WRITE && remote && - mdev->unused_spare_tle == NULL && - test_bit(CREATE_BARRIER, &mdev->flags)) { + if (rw == WRITE && (remote || send_oos) && + mdev->tconn->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->tconn->flags)) { /* someone closed the current epoch * while we were grabbing the spinlock */ - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); goto allocate_barrier; } @@ -892,13 +1012,13 @@ * barrier packet. To get the write ordering right, we only have to * make sure that, if this is a write request and it triggered a * barrier packet, this request is queued within the same spinlock. 
*/ - if (remote && mdev->unused_spare_tle && - test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { - _tl_add_barrier(mdev, mdev->unused_spare_tle); - mdev->unused_spare_tle = NULL; + if ((remote || send_oos) && mdev->tconn->unused_spare_tle && + test_and_clear_bit(CREATE_BARRIER, &mdev->tconn->flags)) { + _tl_add_barrier(mdev->tconn, mdev->tconn->unused_spare_tle); + mdev->tconn->unused_spare_tle = NULL; } else { D_ASSERT(!(remote && rw == WRITE && - test_bit(CREATE_BARRIER, &mdev->flags))); + test_bit(CREATE_BARRIER, &mdev->tconn->flags))); } /* NOTE @@ -917,37 +1037,11 @@ /* mark them early for readability. * this just sets some state flags. */ if (remote) - _req_mod(req, to_be_send); + _req_mod(req, TO_BE_SENT); if (local) - _req_mod(req, to_be_submitted); + _req_mod(req, TO_BE_SUBMITTED); - /* check this request on the collision detection hash tables. - * if we have a conflict, just complete it here. - * THINK do we want to check reads, too? (I don't think so...) */ - if (rw == WRITE && _req_conflicts(req)) { - /* this is a conflicting request. - * even though it may have been only _partially_ - * overlapping with one of the currently pending requests, - * without even submitting or sending it, we will - * pretend that it was successfully served right now. - */ - if (local) { - bio_put(req->private_bio); - req->private_bio = NULL; - drbd_al_complete_io(mdev, req->sector); - put_ldev(mdev); - local = 0; - } - if (remote) - dec_ap_pending(mdev); - _drbd_end_io_acct(mdev, req); - /* THINK: do we want to fail it (-EIO), or pretend success? */ - bio_endio(req->master_bio, 0); - req->master_bio = NULL; - dec_ap_bio(mdev); - drbd_req_free(req); - remote = 0; - } + list_add_tail(&req->tl_requests, &mdev->tconn->newest_tle->requests); /* NOTE remote first: to get the concurrent write detection right, * we must register the request before start of local IO. */ @@ -957,189 +1051,124 @@ * or READ, but not in sync. */ _req_mod(req, (rw == WRITE) - ? 
queue_for_net_write - : queue_for_net_read); + ? QUEUE_FOR_NET_WRITE + : QUEUE_FOR_NET_READ); + } + if (send_oos && drbd_set_out_of_sync(mdev, sector, size)) + _req_mod(req, QUEUE_FOR_SEND_OOS); + + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + if (remote && + nc->on_congestion != OC_BLOCK && mdev->tconn->agreed_pro_version >= 96) { + int congested = 0; + + if (nc->cong_fill && + atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) { + dev_info(DEV, "Congestion-fill threshold reached\n"); + congested = 1; + } + + if (mdev->act_log->used >= nc->cong_extents) { + dev_info(DEV, "Congestion-extents threshold reached\n"); + congested = 1; + } + + if (congested) { + queue_barrier(mdev); /* last barrier, after mirrored writes */ + + if (nc->on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); + else /*nc->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); + } } - spin_unlock_irq(&mdev->req_lock); + rcu_read_unlock(); + + spin_unlock_irq(&mdev->tconn->req_lock); kfree(b); /* if someone else has beaten us to it... */ if (local) { req->private_bio->bi_bdev = mdev->ldev->backing_bdev; - trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL); - - if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR - : rw == READ ? DRBD_FAULT_DT_RD - : DRBD_FAULT_DT_RA)) + /* State may have changed since we grabbed our reference on the + * mdev->ldev member. Double check, and short-circuit to endio. + * In case the last activity log transaction failed to get on + * stable storage, and this is a WRITE, we may not even submit + * this bio. */ + if (get_ldev(mdev)) { + if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? 
DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(req->private_bio, -EIO); + else + generic_make_request(req->private_bio); + put_ldev(mdev); + } else bio_endio(req->private_bio, -EIO); - else - generic_make_request(req->private_bio); } - /* we need to plug ALWAYS since we possibly need to kick lo_dev. - * we plug after submit, so we won't miss an unplug event */ - drbd_plug_device(mdev); - return 0; fail_free_complete: - if (rw == WRITE && local) - drbd_al_complete_io(mdev, sector); + if (req->rq_state & RQ_IN_ACT_LOG) + drbd_al_complete_io(mdev, &req->i); fail_and_free_req: if (local) { bio_put(req->private_bio); req->private_bio = NULL; put_ldev(mdev); } - bio_endio(bio, err); + if (!ret) + bio_endio(bio, err); + drbd_req_free(req); dec_ap_bio(mdev); kfree(b); - return 0; -} - -/* helper function for drbd_make_request - * if we can determine just by the mdev (state) that this request will fail, - * return 1 - * otherwise return 0 - */ -static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) -{ - /* Unconfigured */ - if (mdev->state.conn == C_DISCONNECTING && - mdev->state.disk == D_DISKLESS) - return 1; - - if (mdev->state.role != R_PRIMARY && - (!allow_oos || is_write)) { - if (DRBD_ratelimit(5*HZ, 5)) { - dev_err(DEV, "Process %s[%u] tried to %s; " - "since we are not in Primary state, " - "we cannot allow this\n", - current->comm, current->pid, - is_write ? "WRITE" : "READ"); - } - return 1; - } - - /* - * Paranoia: we might have been primary, but sync target, or - * even diskless, then lost the connection. - * This should have been handled (panic? suspend?) somewhere - * else. But maybe it was not, so check again here. - * Caution: as long as we do not have a read/write lock on mdev, - * to serialize state changes, this is racy, since we may lose - * the connection *after* we test for the cstate. 
- */ - if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { - if (DRBD_ratelimit(5*HZ, 5)) - dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); - return 1; - } - - return 0; + return ret; } -int drbd_make_request_26(struct request_queue *q, struct bio *bio) +MAKE_REQUEST_TYPE drbd_make_request(struct request_queue *q, struct bio *bio) { - unsigned int s_enr, e_enr; struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; + unsigned long start_time; - if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { - bio_endio(bio, -EPERM); - return 0; - } - - /* Reject barrier requests if we know the underlying device does - * not support them. - * XXX: Need to get this info from peer as well some how so we - * XXX: reject if EITHER side/data/metadata area does not support them. - * - * because of those XXX, this is not yet enabled, - * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. - */ - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { - /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ + /* We never supported BIO_RW_BARRIER. + * We don't need to, anymore, either: starting with kernel 2.6.36, + * we have REQ_FUA and REQ_FLUSH, which will be handled transparently + * by the block layer. 
*/ + if (unlikely(bio->bi_rw & DRBD_REQ_HARDBARRIER)) { bio_endio(bio, -EOPNOTSUPP); - return 0; + MAKE_REQUEST_RETURN; } + start_time = jiffies; + /* * what we "blindly" assume: */ D_ASSERT(bio->bi_size > 0); - D_ASSERT((bio->bi_size & 0x1ff) == 0); - D_ASSERT(bio->bi_idx == 0); + D_ASSERT(IS_ALIGNED(bio->bi_size, 512)); - /* to make some things easier, force alignment of requests within the - * granularity of our hash tables */ - s_enr = bio->bi_sector >> HT_SHIFT; - e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; - - if (likely(s_enr == e_enr)) { - inc_ap_bio(mdev, 1); - return drbd_make_request_common(mdev, bio); - } - - /* can this bio be split generically? - * Maybe add our own split-arbitrary-bios function. */ - if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { - /* rather error out here than BUG in bio_split */ - dev_err(DEV, "bio would need to, but cannot, be split: " - "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", - bio->bi_vcnt, bio->bi_idx, bio->bi_size, - (unsigned long long)bio->bi_sector); - bio_endio(bio, -EINVAL); - } else { - /* This bio crosses some boundary, so we have to split it. */ - struct bio_pair *bp; - /* works for the "do not cross hash slot boundaries" case - * e.g. sector 262269, size 4096 - * s_enr = 262269 >> 6 = 4097 - * e_enr = (262269+8-1) >> 6 = 4098 - * HT_SHIFT = 6 - * sps = 64, mask = 63 - * first_sectors = 64 - (262269 & 63) = 3 - */ - const sector_t sect = bio->bi_sector; - const int sps = 1 << HT_SHIFT; /* sectors per slot */ - const int mask = sps - 1; - const sector_t first_sectors = sps - (sect & mask); - bp = bio_split(bio, -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) - bio_split_pool, -#endif - first_sectors); + do { + inc_ap_bio(mdev); + } while (__drbd_make_request(mdev, bio, start_time)); - /* we need to get a "reference count" (ap_bio_cnt) - * to avoid races with the disconnect/reconnect/suspend code. 
- * In case we need to split the bio here, we need to get two references - * atomically, otherwise we might deadlock when trying to submit the - * second one! */ - inc_ap_bio(mdev, 2); - - D_ASSERT(e_enr == s_enr + 1); - - drbd_make_request_common(mdev, &bp->bio1); - drbd_make_request_common(mdev, &bp->bio2); - bio_pair_release(bp); - } - return 0; + MAKE_REQUEST_RETURN; } -/* This is called by bio_add_page(). With this function we reduce - * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs - * units (was AL_EXTENTs). +/* This is called by bio_add_page(). + * + * q->max_hw_sectors and other global limits are already enforced there. + * + * We need to call down to our lower level device, + * in case it has special restrictions. * - * we do the calculation within the lower 32bit of the byte offsets, - * since we don't care for actual offset, but only check whether it - * would cross "activity log extent" boundaries. + * We also may need to enforce configured max-bio-bvecs limits. * * As long as the BIO is empty we have to allow at least one bvec, - * regardless of size and offset. so the resulting bio may still - * cross extent boundaries. those are dealt with (bio_split) in - * drbd_make_request_26. + * regardless of size and offset, so no need to ask lower levels. 
*/ int drbd_merge_bvec(struct request_queue *q, #ifdef HAVE_bvec_merge_data @@ -1150,22 +1179,14 @@ struct bio_vec *bvec) { struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; - unsigned int bio_offset = - (unsigned int)bvm->bi_sector << 9; /* 32 bit */ unsigned int bio_size = bvm->bi_size; - int limit, backing_limit; + int limit = DRBD_MAX_BIO_SIZE; + int backing_limit; - limit = DRBD_MAX_SEGMENT_SIZE - - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); - if (limit < 0) - limit = 0; - if (bio_size == 0) { - if (limit <= bvec->bv_len) - limit = bvec->bv_len; - } else if (limit && get_ldev(mdev)) { + if (bio_size && get_ldev(mdev)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { + if (b->merge_bvec_fn) { backing_limit = b->merge_bvec_fn(b, bvm, bvec); limit = min(limit, backing_limit); } @@ -1173,3 +1194,54 @@ } return limit; } + +void request_timer_fn(unsigned long data) +{ + struct drbd_conf *mdev = (struct drbd_conf *) data; + struct drbd_tconn *tconn = mdev->tconn; + struct drbd_request *req; /* oldest request */ + struct list_head *le; + struct net_conf *nc; + unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ + + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + ent = nc ? 
nc->timeout * HZ/10 * nc->ko_count : 0; + + if (get_ldev(mdev)) { + dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10; + put_ldev(mdev); + } + rcu_read_unlock(); + + et = min_not_zero(dt, ent); + + if (!et || (mdev->state.conn < C_WF_REPORT_PARAMS && mdev->state.disk <= D_FAILED)) + return; /* Recurring timer stopped */ + + spin_lock_irq(&tconn->req_lock); + le = &tconn->oldest_tle->requests; + if (list_empty(le)) { + spin_unlock_irq(&tconn->req_lock); + mod_timer(&mdev->request_timer, jiffies + et); + return; + } + + le = le->prev; + req = list_entry(le, struct drbd_request, tl_requests); + if (ent && req->rq_state & RQ_NET_PENDING) { + if (time_is_before_eq_jiffies(req->start_time + ent)) { + dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + } + } + if (dt && req->rq_state & RQ_LOCAL_PENDING) { + if (time_is_before_eq_jiffies(req->start_time + dt)) { + dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(mdev, 1); + } + } + nt = (time_is_before_eq_jiffies(req->start_time + et) ? jiffies : req->start_time) + et; + spin_unlock_irq(&tconn->req_lock); + mod_timer(&mdev->request_timer, nt); +} diff -Nru drbd8-8.3.7/drbd/drbd_req.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_req.h --- drbd8-8.3.7/drbd/drbd_req.h 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_req.h 2012-02-02 14:09:14.000000000 +0000 @@ -57,7 +57,7 @@ * * It may me handed over to the local disk subsystem. * It may be completed by the local disk subsystem, - * either sucessfully or with io-error. + * either successfully or with io-error. * In case it is a READ request, and it failed locally, * it may be retried remotely. * @@ -77,33 +77,41 @@ */ enum drbd_req_event { - created, - to_be_send, - to_be_submitted, + CREATED, + TO_BE_SENT, + TO_BE_SUBMITTED, /* XXX yes, now I am inconsistent... 
- * these two are not "events" but "actions" + * these are not "events" but "actions" * oh, well... */ - queue_for_net_write, - queue_for_net_read, - - send_canceled, - send_failed, - handed_over_to_network, - connection_lost_while_pending, - recv_acked_by_peer, - write_acked_by_peer, - write_acked_by_peer_and_sis, /* and set_in_sync */ - conflict_discarded_by_peer, - neg_acked, - barrier_acked, /* in protocol A and B */ - data_received, /* (remote read) */ - - read_completed_with_error, - read_ahead_completed_with_error, - write_completed_with_error, - completed_ok, - nothing, /* for tracing only */ + QUEUE_FOR_NET_WRITE, + QUEUE_FOR_NET_READ, + QUEUE_FOR_SEND_OOS, + + SEND_CANCELED, + SEND_FAILED, + HANDED_OVER_TO_NETWORK, + OOS_HANDED_TO_NETWORK, + CONNECTION_LOST_WHILE_PENDING, + READ_RETRY_REMOTE_CANCELED, + RECV_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ + DISCARD_WRITE, + POSTPONE_WRITE, + NEG_ACKED, + BARRIER_ACKED, /* in protocol A and B */ + DATA_RECEIVED, /* (remote read) */ + + READ_COMPLETED_WITH_ERROR, + READ_AHEAD_COMPLETED_WITH_ERROR, + WRITE_COMPLETED_WITH_ERROR, + ABORT_DISK_IO, + COMPLETED_OK, + RESEND, + FAIL_FROZEN_DISK_IO, + RESTART_FROZEN_DISK_IO, + NOTHING, }; /* encoding of request states for now. we don't actually need that many bits. @@ -112,18 +120,21 @@ * same time, so we should hold the request lock anyways. 
*/ enum drbd_req_state_bits { - /* 210 - * 000: no local possible - * 001: to be submitted + /* 3210 + * 0000: no local possible + * 0001: to be submitted * UNUSED, we could map: 011: submitted, completion still pending - * 110: completed ok - * 010: completed with error + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free */ __RQ_LOCAL_PENDING, __RQ_LOCAL_COMPLETED, __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, - /* 76543 + /* 87654 * 00000: no network possible * 00001: to be send * 00011: to be send, on worker queue @@ -132,8 +143,8 @@ * recv_ack (B) or implicit "ack" (A), * still waiting for the barrier ack. * master_bio may already be completed and invalidated. - * 11100: write_acked (C), - * data_received (for remote read, any protocol) + * 11100: write acked (C), + * data received (for remote read, any protocol) * or finally the barrier ack has arrived (B,A)... * request can be freed * 01100: neg-acked (write, protocol C) @@ -182,13 +193,29 @@ /* keep this last, its for the RQ_NET_MASK */ __RQ_NET_MAX, + + /* Set when this is a write, clear for a read */ + __RQ_WRITE, + + /* Should call drbd_al_complete_io() for this request... 
*/ + __RQ_IN_ACT_LOG, + + /* The peer has sent a retry ACK */ + __RQ_POSTPONED, + + /* We expect a receive ACK (wire proto B) */ + __RQ_EXP_RECEIVE_ACK, + + /* We expect a write ACK (wite proto C) */ + __RQ_EXP_WRITE_ACK, }; #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) -#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) @@ -200,85 +227,27 @@ /* 0x1f8 */ #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) -/* epoch entries */ -static inline -struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) -{ - BUG_ON(mdev->ee_hash_s == 0); - return mdev->ee_hash + - ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); -} - -/* transfer log (drbd_request objects) */ -static inline -struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) -{ - BUG_ON(mdev->tl_hash_s == 0); - return mdev->tl_hash + - ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); -} - -/* application reads (drbd_request objects) */ -static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) -{ - return mdev->app_reads_hash - + ((unsigned int)(sector) % APP_R_HSIZE); -} - -/* when we receive the answer for a read request, - * verify that we actually know about it */ -static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, - u64 id, sector_t sector) -{ - struct hlist_head *slot = ar_hash_slot(mdev, sector); - struct hlist_node *n; - struct drbd_request *req; - - hlist_for_each_entry(req, n, slot, colision) { - if ((unsigned long)req == (unsigned long)id) { - D_ASSERT(req->sector == sector); - return req; - } - } - return NULL; -} +#define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) 
+#define RQ_POSTPONED (1UL << __RQ_POSTPONED) +#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) +#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) + +/* For waking up the frozen transfer log mod_req() has to return if the request + should be counted in the epoch object*/ +#define MR_WRITE 1 +#define MR_READ 2 -static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, - struct bio *bio_src) +static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) { struct bio *bio; - struct drbd_request *req = - mempool_alloc(drbd_request_mempool, GFP_NOIO); - if (likely(req)) { - bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ - - req->rq_state = 0; - req->mdev = mdev; - req->master_bio = bio_src; - req->private_bio = bio; - req->epoch = 0; - req->sector = bio->bi_sector; - req->size = bio->bi_size; - req->start_time = jiffies; - INIT_HLIST_NODE(&req->colision); - INIT_LIST_HEAD(&req->tl_requests); - INIT_LIST_HEAD(&req->w.list); - - bio->bi_private = req; - bio->bi_end_io = drbd_endio_pri; - bio->bi_next = NULL; - } - return req; -} + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ -static inline void drbd_req_free(struct drbd_request *req) -{ - mempool_free(req, drbd_request_mempool); -} + req->private_bio = bio; -static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) -{ - return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); + bio->bi_private = req; + bio->bi_end_io = drbd_request_endio; + bio->bi_next = NULL; } /* Short lived temporary struct on the stack. 
@@ -291,36 +260,65 @@ extern void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m); -extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, +extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m); extern void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m); +extern void request_timer_fn(unsigned long data); +extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); +extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what); /* use this if you don't want to deal with calling complete_master_bio() * outside the spinlock, e.g. when walking some list on cleanup. */ -static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) +static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) { - struct drbd_conf *mdev = req->mdev; + struct drbd_conf *mdev = req->w.mdev; struct bio_and_error m; + int rv; /* __req_mod possibly frees req, do not touch req after that! */ - __req_mod(req, what, &m); + rv = __req_mod(req, what, &m); if (m.bio) complete_master_bio(mdev, &m); + + return rv; } /* completion of master bio is outside of spinlock. - * If you need it irqsave, do it your self! */ -static inline void req_mod(struct drbd_request *req, + * If you need it irqsave, do it your self! + * Which means: don't use from bio endio callback. 
*/ +static inline int req_mod(struct drbd_request *req, enum drbd_req_event what) { - struct drbd_conf *mdev = req->mdev; + struct drbd_conf *mdev = req->w.mdev; struct bio_and_error m; - spin_lock_irq(&mdev->req_lock); - __req_mod(req, what, &m); - spin_unlock_irq(&mdev->req_lock); + int rv; + + spin_lock_irq(&mdev->tconn->req_lock); + rv = __req_mod(req, what, &m); + spin_unlock_irq(&mdev->tconn->req_lock); if (m.bio) complete_master_bio(mdev, &m); + + return rv; +} + +static inline bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ } +static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + #endif diff -Nru drbd8-8.3.7/drbd/drbd_state.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_state.c --- drbd8-8.3.7/drbd/drbd_state.c 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_state.c 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,1762 @@ +/* + drbd_state.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include "drbd_int.h" +#include "drbd_req.h" + +/* in drbd_main.c */ +extern void tl_abort_disk_io(struct drbd_conf *mdev); + +struct after_state_chg_work { + struct drbd_work w; + union drbd_state os; + union drbd_state ns; + enum chg_state_flags flags; + struct completion *done; +}; + +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; + +STATIC int w_after_state_ch(struct drbd_work *w, int unused); +STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags); +STATIC enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); +STATIC enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state); +STATIC enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); +STATIC union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, + enum sanitize_state_warnings *warn); + +static inline bool is_susp(union drbd_state s) +{ + return s.susp || s.susp_nod || s.susp_fen; +} + +bool conn_all_vols_unconf(struct drbd_tconn *tconn) +{ + struct drbd_conf *mdev; + bool rv = true; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (mdev->state.disk != D_DISKLESS || + mdev->state.conn != C_STANDALONE || + mdev->state.role != R_SECONDARY) { + rv = false; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +/* Unfortunately the states 
where not correctly ordered, when + they where defined. therefore can not use max_t() here. */ +static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_PRIMARY || role2 == R_PRIMARY) + return R_PRIMARY; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_UNKNOWN; +} +static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) + return R_UNKNOWN; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_PRIMARY; +} + +enum drbd_role conn_highest_role(struct drbd_tconn *tconn) +{ + enum drbd_role role = R_UNKNOWN; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) + role = max_role(role, mdev->state.role); + rcu_read_unlock(); + + return role; +} + +enum drbd_role conn_highest_peer(struct drbd_tconn *tconn) +{ + enum drbd_role peer = R_UNKNOWN; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) + peer = max_role(peer, mdev->state.peer); + rcu_read_unlock(); + + return peer; +} + +enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn) +{ + enum drbd_disk_state ds = D_DISKLESS; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) + ds = max_t(enum drbd_disk_state, ds, mdev->state.disk); + rcu_read_unlock(); + + return ds; +} + +enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn) +{ + enum drbd_disk_state ds = D_MASK; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) + ds = min_t(enum drbd_disk_state, ds, mdev->state.disk); + rcu_read_unlock(); + + return ds; +} + +enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn) +{ + enum drbd_disk_state ds = D_DISKLESS; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, 
mdev, vnr) + ds = max_t(enum drbd_disk_state, ds, mdev->state.pdsk); + rcu_read_unlock(); + + return ds; +} + +enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn) +{ + enum drbd_conns conn = C_MASK; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) + conn = min_t(enum drbd_conns, conn, mdev->state.conn); + rcu_read_unlock(); + + return conn; +} + +/** + * cl_wide_st_chg() - true if the state change is a cluster wide one + * @mdev: DRBD device. + * @os: old (current) state. + * @ns: new (wanted) state. + */ +STATIC int cl_wide_st_chg(struct drbd_conf *mdev, + union drbd_state os, union drbd_state ns) +{ + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || + (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); +} + +static union drbd_state +apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) +{ + union drbd_state ns; + ns.i = (os.i & ~mask.i) | val.i; + return ns; +} + +enum drbd_state_rv +drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, + union drbd_state mask, union drbd_state val) +{ + unsigned long flags; + union drbd_state ns; + enum drbd_state_rv rv; + + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + ns = apply_mask_val(drbd_read_state(mdev), mask, val); + rv = _drbd_set_state(mdev, ns, f, NULL); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + + return rv; +} + +/** + * drbd_force_state() - Impose a change which happens outside our control on our state + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. 
+ */ +void drbd_force_state(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + drbd_change_state(mdev, CS_HARD, mask, val); +} + +STATIC enum drbd_state_rv +_req_st_cond(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val) +{ + union drbd_state os, ns; + unsigned long flags; + enum drbd_state_rv rv; + + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) + return SS_CW_FAILED_BY_PEER; + + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + os = drbd_read_state(mdev); + ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv == SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + + if (!cl_wide_st_chg(mdev, os, ns)) + rv = SS_CW_NO_NEED; + if (rv == SS_UNKNOWN_ERROR) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) { + rv = is_valid_soft_transition(os, ns); + if (rv == SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + } + } + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + + return rv; +} + +/** + * drbd_req_state() - Perform an eventually cluster wide state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Should not be called directly, use drbd_request_state() or + * _drbd_request_state(). 
+ */ +STATIC enum drbd_state_rv +drbd_req_state(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + struct completion done; + unsigned long flags; + union drbd_state os, ns; + enum drbd_state_rv rv; + + init_completion(&done); + + if (f & CS_SERIALIZE) + mutex_lock(mdev->state_mutex); + + ns = val; /* assign debug info, if any */ + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + os = drbd_read_state(mdev); + ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) { + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + goto abort; + } + + if (cl_wide_st_chg(mdev, os, ns)) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) + rv = is_valid_soft_transition(os, ns); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + if (drbd_send_state_req(mdev, mask, val)) { + rv = SS_CW_FAILED_BY_PEER; + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + wait_event(mdev->state_wait, + (rv = _req_st_cond(mdev, mask, val))); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + ns = apply_mask_val(drbd_read_state(mdev), mask, val); + rv = _drbd_set_state(mdev, ns, f, &done); + } else { + rv = _drbd_set_state(mdev, ns, f, &done); + } + + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { + D_ASSERT(current != mdev->tconn->worker.task); + wait_for_completion(&done); + } + +abort: + if (f & CS_SERIALIZE) + mutex_unlock(mdev->state_mutex); + + return rv; +} + +/** + * _drbd_request_state() - Request a state change (with flags) + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. 
+ * @f: flags + * + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE + * flag, or when logging of failed state change requests is not desired. + */ +enum drbd_state_rv +_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + wait_event(mdev->state_wait, + (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); + + return rv; +} + +/* pretty print of drbd internal state */ + +#define STATE_FMT " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n" +#define STATE_ARGS(tag, s) \ + tag, \ + drbd_conn_str(s.conn), \ + drbd_role_str(s.role), \ + drbd_role_str(s.peer), \ + drbd_disk_str(s.disk), \ + drbd_disk_str(s.pdsk), \ + is_susp(s) ? 's' : 'r', \ + s.aftr_isp ? 'a' : '-', \ + s.peer_isp ? 'p' : '-', \ + s.user_isp ? 'u' : '-', \ + s.susp_fen ? 'F' : '-', \ + s.susp_nod ? 'N' : '-' + +void print_st(struct drbd_conf *mdev, const char *tag, union drbd_state s) +{ + dev_err(DEV, STATE_FMT, STATE_ARGS(tag, s)); +} + + +void print_st_err(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum drbd_state_rv err) +{ + if (err == SS_IN_TRANSIENT_STATE) + return; + dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); + print_st(mdev, " state", os); + print_st(mdev, "wanted", ns); +} + +static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char *pbp; + pbp = pb; + *pbp = 0; + + if (ns.role != os.role && flags & CS_DC_ROLE) + pbp += sprintf(pbp, "role( %s -> %s ) ", + drbd_role_str(os.role), + drbd_role_str(ns.role)); + if (ns.peer != os.peer && flags & CS_DC_PEER) + pbp += sprintf(pbp, "peer( %s -> %s ) ", + drbd_role_str(os.peer), + drbd_role_str(ns.peer)); + if (ns.conn != os.conn && flags & CS_DC_CONN) + pbp += sprintf(pbp, "conn( %s -> %s ) ", + drbd_conn_str(os.conn), + drbd_conn_str(ns.conn)); + if (ns.disk != os.disk && flags & CS_DC_DISK) + pbp += 
sprintf(pbp, "disk( %s -> %s ) ", + drbd_disk_str(os.disk), + drbd_disk_str(ns.disk)); + if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) + pbp += sprintf(pbp, "pdsk( %s -> %s ) ", + drbd_disk_str(os.pdsk), + drbd_disk_str(ns.pdsk)); + + return pbp - pb; +} + +static void drbd_pr_state_change(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); + + if (ns.aftr_isp != os.aftr_isp) + pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", + os.aftr_isp, + ns.aftr_isp); + if (ns.peer_isp != os.peer_isp) + pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", + os.peer_isp, + ns.peer_isp); + if (ns.user_isp != os.user_isp) + pbp += sprintf(pbp, "user_isp( %d -> %d ) ", + os.user_isp, + ns.user_isp); + + if (pbp != pb) + dev_info(DEV, "%s\n", pb); +} + +static void conn_pr_state_change(struct drbd_tconn *tconn, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags); + + if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) + pbp += sprintf(pbp, "susp( %d -> %d ) ", + is_susp(os), + is_susp(ns)); + + if (pbp != pb) + conn_info(tconn, "%s\n", pb); +} + + +/** + * is_valid_state() - Returns an SS_ error code if ns is not valid + * @mdev: DRBD device. + * @ns: State to consider. 
+ */ +STATIC enum drbd_state_rv +is_valid_state(struct drbd_conf *mdev, union drbd_state ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum drbd_fencing_p fp; + enum drbd_state_rv rv = SS_SUCCESS; + struct net_conf *nc; + + rcu_read_lock(); + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; + put_ldev(mdev); + } + + nc = rcu_dereference(mdev->tconn->net_conf); + if (nc) { + if (!nc->two_primaries && ns.role == R_PRIMARY) { + if (ns.peer == R_PRIMARY) + rv = SS_TWO_PRIMARIES; + else if (conn_highest_peer(mdev->tconn) == R_PRIMARY) + rv = SS_O_VOL_PEER_PRI; + } + } + + if (rv <= 0) + /* already found a reason to abort */; + else if (ns.role == R_SECONDARY && mdev->open_cnt) + rv = SS_DEVICE_IN_USE; + + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (fp >= FP_RESOURCE && + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) + rv = SS_PRIMARY_NOP; + + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) + rv = SS_NO_LOCAL_DISK; + + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) + rv = SS_NO_REMOTE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_CONNECTED || + ns.conn == C_WF_BITMAP_S || + ns.conn == C_SYNC_SOURCE || + ns.conn == C_PAUSED_SYNC_S) && + ns.disk == D_OUTDATED) + rv = SS_CONNECTED_OUTDATES; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + (nc->verify_alg[0] == 0)) + rv = SS_NO_VERIFY_ALG; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + mdev->tconn->agreed_pro_version < 88) + rv = SS_NOT_SUPPORTED; + + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) + rv = SS_CONNECTED_OUTDATES; + + rcu_read_unlock(); + + return 
rv; +} + +/** + * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible + * This function limits state transitions that may be declined by DRBD. I.e. + * user requests (aka soft transitions). + * @mdev: DRBD device. + * @ns: new state. + * @os: old state. + */ +STATIC enum drbd_state_rv +is_valid_soft_transition(union drbd_state os, union drbd_state ns) +{ + enum drbd_state_rv rv = SS_SUCCESS; + + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && + os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) + rv = SS_ALREADY_STANDALONE; + + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) + rv = SS_NO_NET_CONFIG; + + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) + rv = SS_LOWER_THAN_OUTDATED; + + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) + rv = SS_IN_TRANSIENT_STATE; + + /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) + rv = SS_IN_TRANSIENT_STATE; */ + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + ns.conn != os.conn && os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) + && os.conn < C_WF_REPORT_PARAMS) + rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... 
*/ + + return rv; +} + +STATIC enum drbd_state_rv +is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) +{ + /* no change -> nothing to do, at least for the connection part */ + if (oc == nc) + return SS_NOTHING_TO_DO; + + /* disconnect of an unconfigured connection does not make sense */ + if (oc == C_STANDALONE && nc == C_DISCONNECTING) + return SS_ALREADY_STANDALONE; + + /* from C_STANDALONE, we start with C_UNCONNECTED */ + if (oc == C_STANDALONE && nc != C_UNCONNECTED) + return SS_NEED_CONNECTION; + + /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ + if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) + return SS_IN_TRANSIENT_STATE; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (oc == C_DISCONNECTING && nc != C_STANDALONE) + return SS_IN_TRANSIENT_STATE; + + return SS_SUCCESS; +} + + +/** + * is_valid_transition() - Returns an SS_ error code if the state transition is not possible + * This limits hard state transitions. Hard state transitions are facts there are + * imposed on DRBD by the environment. E.g. disk broke or network broke down. + * But those hard state transitions are still not allowed to do everything. + * @ns: new state. + * @os: old state. 
+ */ +STATIC enum drbd_state_rv +is_valid_transition(union drbd_state os, union drbd_state ns) +{ + enum drbd_state_rv rv; + + rv = is_valid_conn_transition(os.conn, ns.conn); + + /* we cannot fail (again) if we already detached */ + if (ns.disk == D_FAILED && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + return rv; +} + +static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + dev_warn(DEV, "%s\n", msg_table[warn]); +} + +/** + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. + * @warn_sync_abort: + * + * When we loose connection, we have to set the state of the peers disk (pdsk) + * to D_UNKNOWN. This rule and many more along those lines are in this function. 
+ */ +STATIC union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns, + enum sanitize_state_warnings *warn) +{ + enum drbd_fencing_p fp; + enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + + if (warn) + *warn = NO_WARNING; + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + rcu_read_lock(); + fp = rcu_dereference(mdev->ldev->disk_conf)->fencing; + rcu_read_unlock(); + put_ldev(mdev); + } + + /* Implications from connection to peer and peer_isp */ + if (ns.conn < C_CONNECTED) { + ns.peer_isp = 0; + ns.peer = R_UNKNOWN; + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) + ns.pdsk = D_UNKNOWN; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) + ns.aftr_isp = 0; + + /* An implication of the disk states onto the connection state */ + /* Abort resync if a disk fails/detaches */ + if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { + if (warn) + *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? 
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; + ns.conn = C_CONNECTED; + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && + get_ldev_if_state(mdev, D_NEGOTIATING)) { + if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { + ns.disk = mdev->new_state_tmp.disk; + ns.pdsk = mdev->new_state_tmp.pdsk; + } else { + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; + ns.disk = D_DISKLESS; + ns.pdsk = D_UNKNOWN; + } + put_ldev(mdev); + } + + /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ + if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { + if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) + ns.disk = D_UP_TO_DATE; + if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) + ns.pdsk = D_UP_TO_DATE; + } + + /* Implications of the connection stat on the disk states */ + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_UNKNOWN; + switch ((enum drbd_conns)ns.conn) { + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + case C_STARTING_SYNC_T: + case C_WF_SYNC_UUID: + case C_BEHIND: + disk_min = D_INCONSISTENT; + disk_max = D_OUTDATED; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_VERIFY_S: + case C_VERIFY_T: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_CONNECTED: + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_DISKLESS; + pdsk_max = D_UP_TO_DATE; + break; + case C_WF_BITMAP_S: + case C_PAUSED_SYNC_S: + case C_STARTING_SYNC_S: + case C_AHEAD: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. 
But explicit outdate necessary*/ + break; + case C_SYNC_TARGET: + disk_min = D_INCONSISTENT; + disk_max = D_INCONSISTENT; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_SYNC_SOURCE: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_INCONSISTENT; + break; + case C_STANDALONE: + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_CONNECTION: + case C_WF_REPORT_PARAMS: + case C_MASK: + break; + } + if (ns.disk > disk_max) + ns.disk = disk_max; + + if (ns.disk < disk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; + ns.disk = disk_min; + } + if (ns.pdsk > pdsk_max) + ns.pdsk = pdsk_max; + + if (ns.pdsk < pdsk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; + ns.pdsk = pdsk_min; + } + + if (fp == FP_STONITH && + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) + ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + + if (mdev->tconn->res_opts.on_no_data == OND_SUSPEND_IO && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ + + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { + if (ns.conn == C_SYNC_SOURCE) + ns.conn = C_PAUSED_SYNC_S; + if (ns.conn == C_SYNC_TARGET) + ns.conn = C_PAUSED_SYNC_T; + } else { + if (ns.conn == C_PAUSED_SYNC_S) + ns.conn = C_SYNC_SOURCE; + if (ns.conn == C_PAUSED_SYNC_T) + ns.conn = C_SYNC_TARGET; + } + + return ns; +} + +void drbd_resume_al(struct drbd_conf *mdev) +{ + if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags)) + dev_info(DEV, "Resumed AL updates\n"); +} + +/* helper for __drbd_set_state */ +static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) +{ + if (mdev->tconn->agreed_pro_version < 90) + mdev->ov_start_sector = 0; + mdev->rs_total = 
drbd_bm_bits(mdev); + mdev->ov_position = 0; + if (cs == C_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. + * on C_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_DataRequest once the + * first P_OV_REQUEST is received */ + mdev->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); + if (bit >= mdev->rs_total) { + mdev->ov_start_sector = + BM_BIT_TO_SECT(mdev->rs_total - 1); + mdev->rs_total = 1; + } else + mdev->rs_total -= bit; + mdev->ov_position = mdev->ov_start_sector; + } + mdev->ov_left = mdev->rs_total; +} + +/** + * __drbd_set_state() - Set a new DRBD state + * @mdev: DRBD device. + * @ns: new state. + * @flags: Flags + * @done: Optional completion, that will get completed after the after_state_ch() finished + * + * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + */ +enum drbd_state_rv +__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + union drbd_state os; + enum drbd_state_rv rv = SS_SUCCESS; + enum sanitize_state_warnings ssw; + struct after_state_chg_work *ascw; + + os = drbd_read_state(mdev); + + ns = sanitize_state(mdev, ns, &ssw); + if (ns.i == os.i) + return SS_NOTHING_TO_DO; + + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) + return rv; + + if (!(flags & CS_HARD)) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(mdev, ns); + if (rv < SS_SUCCESS) { + /* If the old state was illegal as well, then let + this happen...*/ + + if (is_valid_state(mdev, os) == rv) + rv = is_valid_soft_transition(os, ns); + } else + rv = is_valid_soft_transition(os, ns); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + return rv; + } + + print_sanitize_warnings(mdev, ssw); + + drbd_pr_state_change(mdev, 
os, ns, flags); + + /* Display changes to the susp* flags that where caused by the call to + sanitize_state(). Only display it here if we where not called from + _conn_request_state() */ + if (!(flags & CS_DC_SUSP)) + conn_pr_state_change(mdev->tconn, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP); + + /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference + * on the ldev here, to be sure the transition -> D_DISKLESS resp. + * drbd_ldev_destroy() won't happen before our corresponding + * after_state_ch works run, where we put_ldev again. */ + if ((os.disk != D_FAILED && ns.disk == D_FAILED) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) + atomic_inc(&mdev->local_cnt); + + /* assignment inclusive debug info about what code path + * initiated this state change. */ + mdev->state.i = ns.i; + mdev->tconn->susp = ns.susp; + mdev->tconn->susp_nod = ns.susp_nod; + mdev->tconn->susp_fen = ns.susp_fen; + + if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) + drbd_print_uuids(mdev, "attached to UUIDs"); + + wake_up(&mdev->misc_wait); + wake_up(&mdev->state_wait); + wake_up(&mdev->tconn->ping_wait); + + /* aborted verify run. 
log the last position */ + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && + ns.conn < C_CONNECTED) { + mdev->ov_start_sector = + BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left); + dev_info(DEV, "Online Verify reached sector %llu\n", + (unsigned long long)mdev->ov_start_sector); + } + + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { + dev_info(DEV, "Syncer continues.\n"); + mdev->rs_paused += (long)jiffies + -(long)mdev->rs_mark_time[mdev->rs_last_mark]; + if (ns.conn == C_SYNC_TARGET) + mod_timer(&mdev->resync_timer, jiffies); + } + + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { + dev_info(DEV, "Resync suspended\n"); + mdev->rs_mark_time[mdev->rs_last_mark] = jiffies; + } + + if (os.conn == C_CONNECTED && + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + unsigned long now = jiffies; + int i; + + set_ov_position(mdev, ns.conn); + mdev->rs_start = now; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; + mdev->ov_last_oos_size = 0; + mdev->ov_last_oos_start = 0; + + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = mdev->ov_left; + mdev->rs_mark_time[i] = now; + } + + drbd_rs_controller_reset(mdev); + + if (ns.conn == C_VERIFY_S) { + dev_info(DEV, "Starting Online Verify from sector %llu\n", + (unsigned long long)mdev->ov_position); + mod_timer(&mdev->resync_timer, jiffies); + } + } + + if (get_ldev(mdev)) { + u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); + + mdf &= ~MDF_AL_CLEAN; + if (test_bit(CRASHED_PRIMARY, &mdev->flags)) + mdf |= MDF_CRASHED_PRIMARY; + if (mdev->state.role == R_PRIMARY || + (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) + mdf |= MDF_PRIMARY_IND; + if (mdev->state.conn > C_WF_REPORT_PARAMS) + mdf |= MDF_CONNECTED_IND; + if 
(mdev->state.disk > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (mdev->state.disk > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) + mdf |= MDF_PEER_OUT_DATED; + if (mdf != mdev->ldev->md.flags) { + mdev->ldev->md.flags = mdf; + drbd_md_mark_dirty(mdev); + } + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) + drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); + put_ldev(mdev); + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) + set_bit(CONSIDER_RESYNC, &mdev->flags); + + /* Receiver should clean up itself */ + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) + drbd_thread_stop_nowait(&mdev->tconn->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) + drbd_thread_stop_nowait(&mdev->tconn->receiver); + + /* Upon network failure, we need to restart the receiver. 
*/ + if (os.conn > C_WF_CONNECTION && + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) + drbd_thread_restart_nowait(&mdev->tconn->receiver); + + /* Resume AL writing if we get a connection */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) + drbd_resume_al(mdev); + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if (ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + ascw->w.mdev = mdev; + ascw->done = done; + drbd_queue_work(&mdev->tconn->data.work, &ascw->w); + } else { + dev_err(DEV, "Could not kmalloc an ascw\n"); + } + + return rv; +} + +STATIC int w_after_state_ch(struct drbd_work *w, int unused) +{ + struct after_state_chg_work *ascw = + container_of(w, struct after_state_chg_work, w); + struct drbd_conf *mdev = w->mdev; + + after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & CS_WAIT_COMPLETE) { + D_ASSERT(ascw->done != NULL); + complete(ascw->done); + } + kfree(ascw); + + return 0; +} + +static void abw_start_sync(struct drbd_conf *mdev, int rv) +{ + if (rv) { + dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); + return; + } + + switch (mdev->state.conn) { + case C_STARTING_SYNC_T: + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + break; + case C_STARTING_SYNC_S: + drbd_start_resync(mdev, C_SYNC_SOURCE); + break; + } +} + +static int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, + int (*io_fn)(struct drbd_conf *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(current == mdev->tconn->worker.task); + + /* open coded non-blocking drbd_suspend_io(mdev); */ + set_bit(SUSPEND_IO, &mdev->flags); + + drbd_bm_lock(mdev, why, flags); + rv = io_fn(mdev); + drbd_bm_unlock(mdev); + + drbd_resume_io(mdev); + + return rv; +} + +/** + * after_state_ch() - Perform after state change actions that may sleep + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. 
+ * @flags: Flags + */ +STATIC void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags) +{ + struct sib_info sib; + + sib.sib_reason = SIB_STATE_CHANGE; + sib.os = os; + sib.ns = ns; + + if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + if (mdev->p_uuid) + mdev->p_uuid[UI_FLAGS] &= ~((u64)2); + } + + /* Inform userspace about the change... */ + drbd_bcast_event(mdev, &sib); + + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + drbd_khelper(mdev, "pri-on-incon-degr"); + + /* Here we have the actions that are performed after a + state change. This function might sleep */ + + if (ns.susp_nod) { + enum drbd_req_event what = NOTHING; + + if (os.conn < C_CONNECTED && conn_lowest_conn(mdev->tconn) >= C_CONNECTED) + what = RESEND; + + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + conn_lowest_disk(mdev->tconn) > D_NEGOTIATING) + what = RESTART_FROZEN_DISK_IO; + + if (what != NOTHING) { + spin_lock_irq(&mdev->tconn->req_lock); + _tl_restart(mdev->tconn, what); + _drbd_set_state(_NS(mdev, susp_nod, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->tconn->req_lock); + } + } + + /* Became sync source. With protocol >= 96, we still need to send out + * the sync uuid now. Need to do that before any drbd_send_state, or + * the other side may go "paused sync" before receiving the sync uuids, + * which is unexpected. */ + if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && + mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) { + drbd_gen_and_send_sync_uuid(mdev); + put_ldev(mdev); + } + + /* Do not change the order of the if above and the two below... 
*/ + if (os.pdsk == D_DISKLESS && + ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ + drbd_send_uuids(mdev); + drbd_send_state(mdev, ns); + } + /* No point in queuing send_bitmap if we don't have a connection + * anymore, so check also the _current_ state, not only the new state + * at the time this work was queued. */ + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && + mdev->state.conn == C_WF_BITMAP_S) + drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, + "send_bitmap (WFBitMapS)", + BM_LOCKED_TEST_ALLOWED); + + /* Lost contact to peer's copy of the data */ + if ((os.pdsk >= D_INCONSISTENT && + os.pdsk != D_UNKNOWN && + os.pdsk != D_OUTDATED) + && (ns.pdsk < D_INCONSISTENT || + ns.pdsk == D_UNKNOWN || + ns.pdsk == D_OUTDATED)) { + if (get_ldev(mdev)) { + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + if (drbd_suspended(mdev)) { + set_bit(NEW_CUR_UUID, &mdev->flags); + } else { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } + } + put_ldev(mdev); + } + } + + if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } + /* D_DISKLESS Peer becomes secondary */ + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) + /* We may still be Primary ourselves. + * No harm done if the bitmap still changes, + * redirtied pages will follow later. */ + drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, + "demote diskless peer", BM_LOCKED_SET_ALLOWED); + put_ldev(mdev); + } + + /* Write out all changed bits on demote. 
+ * Though, no need to da that just yet + * if there is a resync going on still */ + if (os.role == R_PRIMARY && ns.role == R_SECONDARY && + mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { + /* No changes to the bitmap expected this time, so assert that, + * even though no harm was done if it did change. */ + drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, + "demote", BM_LOCKED_TEST_ALLOWED); + put_ldev(mdev); + } + + /* Last part of the attaching process ... */ + if (ns.conn >= C_CONNECTED && + os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { + drbd_send_sizes(mdev, 0, 0); /* to start sync... */ + drbd_send_uuids(mdev); + drbd_send_state(mdev, ns); + } + + /* We want to pause/continue resync, tell peer. */ + if (ns.conn >= C_CONNECTED && + ((os.aftr_isp != ns.aftr_isp) || + (os.user_isp != ns.user_isp))) + drbd_send_state(mdev, ns); + + /* In case one of the isp bits got set, suspend other devices. */ + if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && + (ns.aftr_isp || ns.peer_isp || ns.user_isp)) + suspend_other_sg(mdev); + + /* Make sure the peer gets informed about eventual state + changes (ISP bits) while we were in WFReportParams. */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + if (os.conn != C_AHEAD && ns.conn == C_AHEAD) + drbd_send_state(mdev, ns); + + /* We are in the progress to start a full sync... */ + if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) + /* no other bitmap changes expected during this phase */ + drbd_queue_bitmap_io(mdev, + &drbd_bmio_set_n_write, &abw_start_sync, + "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); + + /* We are invalidating our self... 
*/ + if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && + os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) + /* other bitmap operation expected during this phase */ + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, + "set_n_write from invalidate", BM_LOCKED_MASK); + + /* first half of local IO error, failure to attach, + * or administrative detach */ + if (os.disk != D_FAILED && ns.disk == D_FAILED) { + enum drbd_io_error_p eh; + int was_io_error; + /* corresponding get_ldev was in __drbd_set_state, to serialize + * our cleanup here with the transition to D_DISKLESS, + * so it is safe to dreference ldev here. */ + rcu_read_lock(); + eh = rcu_dereference(mdev->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + /* Immediately allow completion of all application IO, that waits + for completion from the local disk. */ + tl_abort_disk_io(mdev); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + drbd_rs_cancel_all(mdev); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); + put_ldev(mdev); + + if (was_io_error && eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + } + + /* second half of local IO error, failure to attach, + * or administrative detach, + * after local_cnt references have reached zero again */ + if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! 
*/ + if (mdev->state.disk != D_DISKLESS) + dev_err(DEV, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(mdev->state.disk)); + + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + /* corresponding get_ldev in __drbd_set_state + * this may finaly trigger drbd_ldev_destroy. */ + put_ldev(mdev); + } + + /* Notify peer that I had a local IO error and did not detach. */ + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + /* Disks got bigger while they were detached */ + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { + if (ns.conn == C_CONNECTED) + resync_after_online_grow(mdev); + } + + /* A resync finished or aborted, wake paused devices... */ + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp)) + resume_next_sg(mdev); + + /* sync target done with resync. Explicitly notify peer, even though + * it should (at least for non-empty resyncs) already know itself. */ + if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) + drbd_send_state(mdev, ns); + + /* This triggers bitmap writeout of potentially still unwritten pages + * if the resync finished cleanly, or aborted because of peer disk + * failure, or because of connection loss. + * For resync aborted because of local disk failure, we cannot do + * any bitmap writeout anymore. + * No harm done if some bits change during this phase. 
+ */ + if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { + drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, + "write from resync_finished", BM_LOCKED_SET_ALLOWED); + put_ldev(mdev); + } + + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY) { + if (os.aftr_isp != ns.aftr_isp) + resume_next_sg(mdev); + } + + drbd_md_sync(mdev); +} + +struct after_conn_state_chg_work { + struct drbd_work w; + enum drbd_conns oc; + union drbd_state ns_min; + union drbd_state ns_max; /* new, max state, over all mdevs */ + enum chg_state_flags flags; +}; + +STATIC int w_after_conn_state_ch(struct drbd_work *w, int unused) +{ + struct after_conn_state_chg_work *acscw = + container_of(w, struct after_conn_state_chg_work, w); + struct drbd_tconn *tconn = w->tconn; + enum drbd_conns oc = acscw->oc; + union drbd_state ns_max = acscw->ns_max; + union drbd_state ns_min = acscw->ns_min; + struct drbd_conf *mdev; + int vnr; + + kfree(acscw); + + /* Upon network configuration, we need to start the receiver */ + if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) + drbd_thread_start(&tconn->receiver); + + if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { + struct net_conf *old_conf; + + mutex_lock(&tconn->conf_update); + old_conf = tconn->net_conf; + tconn->my_addr_len = 0; + tconn->peer_addr_len = 0; + rcu_assign_pointer(tconn->net_conf, NULL); + conn_free_crypto(tconn); + mutex_unlock(&tconn->conf_update); + + synchronize_rcu(); + kfree(old_conf); + } + + if (ns_max.susp_fen) { + /* case1: The outdate peer handler is successful: */ + if (ns_max.pdsk <= D_OUTDATED) { + tl_clear(tconn); + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + if (test_bit(NEW_CUR_UUID, &mdev->flags)) { + drbd_uuid_new_current(mdev); + clear_bit(NEW_CUR_UUID, &mdev->flags); + } + } + rcu_read_unlock(); + conn_request_state(tconn, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); 
+ } + /* case2: The connection was established again: */ + if (ns_min.conn >= C_CONNECTED) { + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) + clear_bit(NEW_CUR_UUID, &mdev->flags); + rcu_read_unlock(); + spin_lock_irq(&tconn->req_lock); + _tl_restart(tconn, RESEND); + _conn_request_state(tconn, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); + spin_unlock_irq(&tconn->req_lock); + } + } + kref_put(&tconn->kref, &conn_destroy); + return 0; +} + +void conn_old_common_state(struct drbd_tconn *tconn, union drbd_state *pcs, enum chg_state_flags *pf) +{ + enum chg_state_flags flags = ~0; + struct drbd_conf *mdev; + int vnr, first_vol = 1; + union drbd_dev_state os, cs = { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = tconn->cstate, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + } }; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + os = mdev->state; + + if (first_vol) { + cs = os; + first_vol = 0; + continue; + } + + if (cs.role != os.role) + flags &= ~CS_DC_ROLE; + + if (cs.peer != os.peer) + flags &= ~CS_DC_PEER; + + if (cs.conn != os.conn) + flags &= ~CS_DC_CONN; + + if (cs.disk != os.disk) + flags &= ~CS_DC_DISK; + + if (cs.pdsk != os.pdsk) + flags &= ~CS_DC_PDSK; + } + rcu_read_unlock(); + + *pf |= CS_DC_MASK; + *pf &= flags; + (*pcs).i = cs.i; +} + +static enum drbd_state_rv +conn_is_valid_transition(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + union drbd_state ns, os; + struct drbd_conf *mdev; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + os = drbd_read_state(mdev); + ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + if (ns.i == os.i) + continue; + + rv = is_valid_transition(os, ns); + if (rv < 
SS_SUCCESS) + break; + + if (!(flags & CS_HARD)) { + rv = is_valid_state(mdev, ns); + if (rv < SS_SUCCESS) { + if (is_valid_state(mdev, os) == rv) + rv = is_valid_soft_transition(os, ns); + } else + rv = is_valid_soft_transition(os, ns); + } + if (rv < SS_SUCCESS) + break; + } + rcu_read_unlock(); + + if (rv < SS_SUCCESS && flags & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + + return rv; +} + +void +conn_set_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, + union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) +{ + union drbd_state ns, os, ns_max = { }; + union drbd_state ns_min = { + { .role = R_MASK, + .peer = R_MASK, + .conn = val.conn, + .disk = D_MASK, + .pdsk = D_MASK + } }; + struct drbd_conf *mdev; + enum drbd_state_rv rv; + int vnr, number_of_volumes = 0; + + if (mask.conn == C_MASK) + tconn->cstate = val.conn; + + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + number_of_volumes++; + os = drbd_read_state(mdev); + ns = apply_mask_val(os, mask, val); + ns = sanitize_state(mdev, ns, NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + rv = __drbd_set_state(mdev, ns, flags, NULL); + if (rv < SS_SUCCESS) + BUG(); + + ns.i = mdev->state.i; + ns_max.role = max_role(ns.role, ns_max.role); + ns_max.peer = max_role(ns.peer, ns_max.peer); + ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); + ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); + ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); + + ns_min.role = min_role(ns.role, ns_min.role); + ns_min.peer = min_role(ns.peer, ns_min.peer); + ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); + ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); + ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); + } + rcu_read_unlock(); + + if (number_of_volumes == 0) { + ns_min = ns_max = (union drbd_state) { { + 
.role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = val.conn, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN + } }; + } + + ns_min.susp = ns_max.susp = tconn->susp; + ns_min.susp_nod = ns_max.susp_nod = tconn->susp_nod; + ns_min.susp_fen = ns_max.susp_fen = tconn->susp_fen; + + *pns_min = ns_min; + *pns_max = ns_max; +} + +static enum drbd_state_rv +_conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val) +{ + enum drbd_state_rv rv; + + if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) + return SS_CW_FAILED_BY_PEER; + + spin_lock_irq(&tconn->req_lock); + rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; + + if (rv == SS_UNKNOWN_ERROR) + rv = conn_is_valid_transition(tconn, mask, val, 0); + + if (rv == SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + + spin_unlock_irq(&tconn->req_lock); + + return rv; +} + +static enum drbd_state_rv +conn_cl_wide(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, + enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + spin_unlock_irq(&tconn->req_lock); + mutex_lock(&tconn->cstate_mutex); + + if (conn_send_state_req(tconn, mask, val)) { + rv = SS_CW_FAILED_BY_PEER; + /* if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); */ + goto abort; + } + + wait_event(tconn->ping_wait, (rv = _conn_rq_cond(tconn, mask, val))); + +abort: + mutex_unlock(&tconn->cstate_mutex); + spin_lock_irq(&tconn->req_lock); + + return rv; +} + +enum drbd_state_rv +_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + struct after_conn_state_chg_work *acscw; + enum drbd_conns oc = tconn->cstate; + union drbd_state ns_max, ns_min, os; + + rv = is_valid_conn_transition(oc, val.conn); + if (rv < SS_SUCCESS) + goto abort; + + rv = conn_is_valid_transition(tconn, 
mask, val, flags); + if (rv < SS_SUCCESS) + goto abort; + + if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && + !(flags & (CS_LOCAL_ONLY | CS_HARD))) { + rv = conn_cl_wide(tconn, mask, val, flags); + if (rv < SS_SUCCESS) + goto abort; + } + + conn_old_common_state(tconn, &os, &flags); + flags |= CS_DC_SUSP; + conn_set_state(tconn, mask, val, &ns_min, &ns_max, flags); + conn_pr_state_change(tconn, os, ns_max, flags); + + acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); + if (acscw) { + acscw->oc = os.conn; + acscw->ns_min = ns_min; + acscw->ns_max = ns_max; + acscw->flags = flags; + acscw->w.cb = w_after_conn_state_ch; + kref_get(&tconn->kref); + acscw->w.tconn = tconn; + drbd_queue_work(&tconn->data.work, &acscw->w); + } else { + conn_err(tconn, "Could not kmalloc an acscw\n"); + } + + return rv; + abort: + if (flags & CS_VERBOSE) { + conn_err(tconn, "State change failed: %s\n", drbd_set_st_err_str(rv)); + conn_err(tconn, " state = { cs:%s }\n", drbd_conn_str(oc)); + conn_err(tconn, "wanted = { cs:%s }\n", drbd_conn_str(val.conn)); + } + return rv; +} + +enum drbd_state_rv +conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv; + + spin_lock_irq(&tconn->req_lock); + rv = _conn_request_state(tconn, mask, val, flags); + spin_unlock_irq(&tconn->req_lock); + + return rv; +} diff -Nru drbd8-8.3.7/drbd/drbd_state.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_state.h --- drbd8-8.3.7/drbd/drbd_state.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_state.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,162 @@ +#ifndef DRBD_STATE_H +#define DRBD_STATE_H + +struct drbd_conf; +struct drbd_tconn; + +/** + * DOC: DRBD State macros + * + * These macros are used to express state changes in easily readable form. + * + * The NS macros expand to a mask and a value, that can be bit ored onto the + * current state as soon as the spinlock (req_lock) was taken. 
+ * + * The _NS macros are used for state functions that get called with the + * spinlock. These macros expand directly to the new state value. + * + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined + * to express state changes that affect more than one aspect of the state. + * + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) + * Means that the network connection was established and that the peer + * is in secondary role. + */ +#define role_MASK R_MASK +#define peer_MASK R_MASK +#define disk_MASK D_MASK +#define pdsk_MASK D_MASK +#define conn_MASK C_MASK +#define susp_MASK 1 +#define user_isp_MASK 1 +#define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 + +#define NS(T, S) \ + ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T = (S); val; }) +#define NS2(T1, S1, T2, S2) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val; }) +#define NS3(T1, S1, T2, S2, T3, S3) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val; }) + +#define _NS(D, T, S) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) +#define _NS2(D, T1, S1, T2, S2) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns; }) +#define _NS3(D, T1, S1, T2, S2, T3, S3) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) + + +enum chg_state_flags { + CS_HARD = 1 << 0, + CS_VERBOSE = 1 << 1, + CS_WAIT_COMPLETE = 1 << 2, + CS_SERIALIZE = 1 << 3, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, + CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ + CS_DC_ROLE = 1 
<< 5, /* DC = display as connection state change */ + CS_DC_PEER = 1 << 6, + CS_DC_CONN = 1 << 7, + CS_DC_DISK = 1 << 8, + CS_DC_PDSK = 1 << 9, + CS_DC_SUSP = 1 << 10, + CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, + CS_IGN_OUTD_FAIL = 1 << 11, +}; + +/* drbd_dev_state and drbd_state are different types. This is to stress the + small difference. There is no suspended flag (.susp), and no suspended + while fence handler runs flas (susp_fen). */ +union drbd_dev_state { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned _unused:1 ; + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned peer_isp:1 ; + unsigned user_isp:1 ; + unsigned _pad:11; /* 0 unused */ +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned _pad:11; + unsigned user_isp:1 ; + unsigned peer_isp:1 ; + unsigned aftr_isp:1 ; /* isp .. 
imposed sync pause */ + unsigned _unused:1 ; + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ +#else +# error "this endianess is not supported" +#endif + }; + unsigned int i; +}; + +extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev, + enum chg_state_flags f, + union drbd_state mask, + union drbd_state val); +extern void drbd_force_state(struct drbd_conf *, union drbd_state, + union drbd_state); +extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *, + union drbd_state, + union drbd_state, + enum chg_state_flags); +extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state, + enum chg_state_flags, + struct completion *done); +extern void print_st_err(struct drbd_conf *, union drbd_state, + union drbd_state, int); + +enum drbd_state_rv +_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +enum drbd_state_rv +conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +extern void drbd_resume_al(struct drbd_conf *mdev); +extern bool conn_all_vols_unconf(struct drbd_tconn *tconn); + +/** + * drbd_request_state() - Reqest a state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * + * This is the most graceful way of requesting a state change. It is verbose + * quite verbose in case the state change is not possible, and all those + * state changes are globally serialized. 
+ */ +static inline int drbd_request_state(struct drbd_conf *mdev, + union drbd_state mask, + union drbd_state val) +{ + return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); +} + +enum drbd_role conn_highest_role(struct drbd_tconn *tconn); +enum drbd_role conn_highest_peer(struct drbd_tconn *tconn); +enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn); +enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn); +enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn); +enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn); + +#endif diff -Nru drbd8-8.3.7/drbd/drbd_strings.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_strings.c --- drbd8-8.3.7/drbd/drbd_strings.c 2009-08-26 13:27:50.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_strings.c 2012-02-02 14:09:14.000000000 +0000 @@ -48,6 +48,8 @@ [C_PAUSED_SYNC_T] = "PausedSyncT", [C_VERIFY_S] = "VerifyS", [C_VERIFY_T] = "VerifyT", + [C_AHEAD] = "Ahead", + [C_BEHIND] = "Behind", }; static const char *drbd_role_s_names[] = { @@ -70,7 +72,7 @@ static const char *drbd_state_sw_errors[] = { [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", - [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", + [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", @@ -87,12 +89,13 @@ [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", + [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", }; const char *drbd_conn_str(enum drbd_conns s) { /* enums are unsigned... */ - return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; + return s > C_BEHIND ? 
"TOO_LARGE" : drbd_conn_s_names[s]; } const char *drbd_role_str(enum drbd_role s) @@ -105,7 +108,7 @@ return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; } -const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) +const char *drbd_set_st_err_str(enum drbd_state_rv err) { return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : err > SS_TWO_PRIMARIES ? "TOO_LARGE" diff -Nru drbd8-8.3.7/drbd/drbd_tracing.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_tracing.c --- drbd8-8.3.7/drbd/drbd_tracing.c 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_tracing.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,756 +0,0 @@ -/* - drbd_tracing.c - - This file is part of DRBD by Philipp Reisner and Lars Ellenberg. - - Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. - Copyright (C) 2003-2008, Philipp Reisner . - Copyright (C) 2003-2008, Lars Ellenberg . - - drbd is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - drbd is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with drbd; see the file COPYING. If not, write to - the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
- - */ - -#include -#include -#include -#include "drbd_int.h" -#include "drbd_tracing.h" -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg"); -MODULE_DESCRIPTION("DRBD tracepoint probes"); -MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace see drbd_tracing.c"); -MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)"); -MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)"); - -unsigned int trace_mask = 0; /* Bitmap of events to trace */ -int trace_level; /* Current trace level */ -int trace_devs; /* Bitmap of devices to trace */ - -module_param(trace_mask, uint, 0444); -module_param(trace_level, int, 0644); -module_param(trace_devs, int, 0644); - -enum { - TRACE_PACKET = 0x0001, - TRACE_RQ = 0x0002, - TRACE_UUID = 0x0004, - TRACE_RESYNC = 0x0008, - TRACE_EE = 0x0010, - TRACE_UNPLUG = 0x0020, - TRACE_NL = 0x0040, - TRACE_AL_EXT = 0x0080, - TRACE_INT_RQ = 0x0100, - TRACE_MD_IO = 0x0200, - TRACE_EPOCH = 0x0400, -}; - -/* Buffer printing support - * dbg_print_flags: used for Flags arg to drbd_print_buffer - * - DBGPRINT_BUFFADDR; if set, each line starts with the - * virtual address of the line being output. If clear, - * each line starts with the offset from the beginning - * of the buffer. */ -enum dbg_print_flags { - DBGPRINT_BUFFADDR = 0x0001, -}; - -/* Macro stuff */ -STATIC char *nl_packet_name(int packet_type) -{ -/* Generate packet type strings */ -#define NL_PACKET(name, number, fields) \ - [P_ ## name] = # name, -#define NL_INTEGER Argh! -#define NL_BIT Argh! -#define NL_INT64 Argh! -#define NL_STRING Argh! - - static char *nl_tag_name[P_nl_after_last_packet] = { -#include "linux/drbd_nl.h" - }; - - return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ? 
- nl_tag_name[packet_type] : "*Unknown*"; -} -/* /Macro stuff */ - -static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level) -{ - return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs); -} - -static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg) -{ - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt)); -} - -static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index) -{ - static char *uuid_str[UI_EXTENDED_SIZE] = { - [UI_CURRENT] = "CURRENT", - [UI_BITMAP] = "BITMAP", - [UI_HISTORY_START] = "HISTORY_START", - [UI_HISTORY_END] = "HISTORY_END", - [UI_SIZE] = "SIZE", - [UI_FLAGS] = "FLAGS", - }; - - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - if (index >= UI_EXTENDED_SIZE) { - dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n"); - return; - } - - dev_info(DEV, " uuid[%s] now %016llX\n", - uuid_str[index], - (unsigned long long)mdev->ldev->md.uuid[index]); -} - -static void probe_drbd_md_io(struct drbd_conf *mdev, int rw, - struct drbd_backing_dev *bdev) -{ - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, " %s metadata superblock now\n", - rw == READ ? 
"Reading" : "Writing"); -} - -static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg) -{ - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n", - msg, (unsigned long long)e->sector, e->size, e); -} - -static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch, - enum epoch_event ev) -{ - static char *epoch_event_str[] = { - [EV_PUT] = "put", - [EV_GOT_BARRIER_NR] = "got_barrier_nr", - [EV_BARRIER_DONE] = "barrier_done", - [EV_BECAME_LAST] = "became_last", - [EV_TRACE_FLUSH] = "issuing_flush", - [EV_TRACE_ADD_BARRIER] = "added_barrier", - [EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch", - }; - - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - ev &= ~EV_CLEANUP; - - switch (ev) { - case EV_TRACE_ALLOC: - dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs); - break; - case EV_TRACE_FREE: - dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n", - epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), - mdev->epochs); - break; - default: - dev_info(DEV, "Update epoch %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n", - epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), - atomic_read(&epoch->active), - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-', - test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-', - test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-', - test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 
'd' : '-', - epoch_event_str[ev]); - } -} - -static void probe_drbd_netlink(void *data, int is_req) -{ - struct cn_msg *msg = data; - - if (is_req) { - struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data; - - printk(KERN_INFO "drbd%d: " - "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n", - nlp->drbd_minor, - nl_packet_name(nlp->packet_type), - nlp->packet_type, - msg->seq, msg->ack, msg->len); - } else { - struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data; - - printk(KERN_INFO "drbd%d: " - "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n", - nlp->minor, - nlp->packet_type == P_nl_after_last_packet ? - "Empty-Reply" : nl_packet_name(nlp->packet_type), - nlp->packet_type, - msg->seq, msg->ack, msg->len); - } -} - -static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg) -{ - unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); - - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n", - msg, (unsigned long long) sector, enr, - (int)BM_SECT_TO_EXT(sector)); -} - -/** - * drbd_print_buffer() - Hexdump arbitrary binary data into a buffer - * @prefix: String is output at the beginning of each line output. - * @flags: Currently only defined flag: DBGPRINT_BUFFADDR; if set, each - * line starts with the virtual address of the line being - * output. If clear, each line starts with the offset from the - * beginning of the buffer. - * @size: Indicates the size of each entry in the buffer. 
Supported - * values are sizeof(char), sizeof(short) and sizeof(int) - * @buffer: Start address of buffer - * @buffer_va: Virtual address of start of buffer (normally the same - * as Buffer, but having it separate allows it to hold - * file address for example) - * @length: length of buffer - */ -static void drbd_print_buffer(const char *prefix, unsigned int flags, int size, - const void *buffer, const void *buffer_va, - unsigned int length) - -#define LINE_SIZE 16 -#define LINE_ENTRIES (int)(LINE_SIZE/size) -{ - const unsigned char *pstart; - const unsigned char *pstart_va; - const unsigned char *pend; - char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8]; - char *pbytes = bytes_str, *pascii = ascii_str; - int offset = 0; - long sizemask; - int field_width; - int index; - const unsigned char *pend_str; - const unsigned char *p; - int count; - - /* verify size parameter */ - if (size != sizeof(char) && - size != sizeof(short) && - size != sizeof(int)) { - printk(KERN_DEBUG "drbd_print_buffer: " - "ERROR invalid size %d\n", size); - return; - } - - sizemask = size-1; - field_width = size*2; - - /* Adjust start/end to be on appropriate boundary for size */ - buffer = (const char *)((long)buffer & ~sizemask); - pend = (const unsigned char *) - (((long)buffer + length + sizemask) & ~sizemask); - - if (flags & DBGPRINT_BUFFADDR) { - /* Move start back to nearest multiple of line size, - * if printing address. 
This results in nicely formatted output - * with addresses being on line size (16) byte boundaries */ - pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1)); - } else { - pstart = (const unsigned char *)buffer; - } - - /* Set value of start VA to print if addresses asked for */ - pstart_va = (const unsigned char *)buffer_va - - ((const unsigned char *)buffer-pstart); - - /* Calculate end position to nicely align right hand side */ - pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1)); - - /* Init strings */ - *pbytes = *pascii = '\0'; - - /* Start at beginning of first line */ - p = pstart; - count = 0; - - while (p < pend_str) { - if (p < (const unsigned char *)buffer || p >= pend) { - /* Before start of buffer or after end- print spaces */ - pbytes += sprintf(pbytes, "%*c ", field_width, ' '); - pascii += sprintf(pascii, "%*c", size, ' '); - p += size; - } else { - /* Add hex and ascii to strings */ - int val; - switch (size) { - default: - case 1: - val = *(unsigned char *)p; - break; - case 2: - val = *(unsigned short *)p; - break; - case 4: - val = *(unsigned int *)p; - break; - } - - pbytes += sprintf(pbytes, "%0*x ", field_width, val); - - for (index = size; index; index--) { - *pascii++ = isprint(*p) ? *p : '.'; - p++; - } - } - - count++; - - if (count == LINE_ENTRIES || p >= pend_str) { - /* Null terminate and print record */ - *pascii = '\0'; - printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n", - prefix, - (flags & DBGPRINT_BUFFADDR) - ? 
(long)pstart_va:(long)offset, - LINE_ENTRIES*(field_width+1), bytes_str, - LINE_SIZE, ascii_str); - - /* Move onto next line */ - pstart_va += (p-pstart); - pstart = p; - count = 0; - offset += LINE_SIZE; - - /* Re-init strings */ - pbytes = bytes_str; - pascii = ascii_str; - *pbytes = *pascii = '\0'; - } - } -} - -static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args) -{ - char str[256]; - - if (!is_mdev_trace(mdev, level)) - return; - - if (vsnprintf(str, 256, fmt, args) >= 256) - str[255] = 0; - - printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)), - dev_name(disk_to_dev(mdev->vdisk)), str); -} - -static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, - struct drbd_request *r) -{ -#if defined(CONFIG_LBDAF) || defined(CONFIG_LBD) -#define SECTOR_FORMAT "%Lx" -#else -#define SECTOR_FORMAT "%lx" -#endif -#define SECTOR_SHIFT 9 - - unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT); - char *faddr = (char *)(lowaddr); - char rb[sizeof(void *)*2+6] = { 0, }; - struct bio_vec *bvec; - int segno; - - const int rw = bio->bi_rw; - const int biorw = (rw & (RW_MASK|RWA_MASK)); - const int biobarrier = (rw & (1<>>", - pfx, - biorw == WRITE ? "Write" : "Read", - biobarrier ? " : B" : "", - biosync ? " : S" : "", - bio, - rb, - complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "", - bio->bi_sector << SECTOR_SHIFT, - bio->bi_size); - - if (trace_level >= TRACE_LVL_METRICS && - ((biorw == WRITE) ^ complete)) { - printk(KERN_DEBUG " ind page offset length\n"); - __bio_for_each_segment(bvec, bio, segno, 0) { - printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n", segno, - bvec->bv_page, bvec->bv_offset, bvec->bv_len); - - if (trace_level >= TRACE_LVL_ALL) { - char *bvec_buf; - unsigned long flags; - - bvec_buf = bvec_kmap_irq(bvec, &flags); - - drbd_print_buffer(" ", DBGPRINT_BUFFADDR, 1, - bvec_buf, - faddr, - (bvec->bv_len <= 0x80) - ? 
bvec->bv_len : 0x80); - - bvec_kunmap_irq(bvec_buf, &flags); - - if (bvec->bv_len > 0x40) - printk(KERN_DEBUG " ....\n"); - - faddr += bvec->bv_len; - } - } - } -} - -static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg) -{ - static const char *rq_event_names[] = { - [created] = "created", - [to_be_send] = "to_be_send", - [to_be_submitted] = "to_be_submitted", - [queue_for_net_write] = "queue_for_net_write", - [queue_for_net_read] = "queue_for_net_read", - [send_canceled] = "send_canceled", - [send_failed] = "send_failed", - [handed_over_to_network] = "handed_over_to_network", - [connection_lost_while_pending] = - "connection_lost_while_pending", - [recv_acked_by_peer] = "recv_acked_by_peer", - [write_acked_by_peer] = "write_acked_by_peer", - [neg_acked] = "neg_acked", - [conflict_discarded_by_peer] = "conflict_discarded_by_peer", - [barrier_acked] = "barrier_acked", - [data_received] = "data_received", - [read_completed_with_error] = "read_completed_with_error", - [read_ahead_completed_with_error] = "reada_completed_with_error", - [write_completed_with_error] = "write_completed_with_error", - [completed_ok] = "completed_ok", - }; - - struct drbd_conf *mdev = req->mdev; - - const int rw = (req->master_bio == NULL || - bio_data_dir(req->master_bio) == WRITE) ? - 'W' : 'R'; - const unsigned long s = req->rq_state; - - if (what != nothing) { - dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]); - } else { - dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n", - msg, req, rw, - s & RQ_LOCAL_PENDING ? 'p' : '-', - s & RQ_LOCAL_COMPLETED ? 'c' : '-', - s & RQ_LOCAL_OK ? 'o' : '-', - s & RQ_NET_PENDING ? 'p' : '-', - s & RQ_NET_QUEUED ? 'q' : '-', - s & RQ_NET_SENT ? 's' : '-', - s & RQ_NET_DONE ? 'd' : '-', - s & RQ_NET_OK ? 
'o' : '-', - req->epoch, - (unsigned long long)req->sector, - req->size, - drbd_conn_str(mdev->state.conn)); - } -} - - -#define drbd_peer_str drbd_role_str -#define drbd_pdsk_str drbd_disk_str - -#define PSM(A) \ -do { \ - if (mask.A) { \ - int i = snprintf(p, len, " " #A "( %s )", \ - drbd_##A##_str(val.A)); \ - if (i >= len) \ - return op; \ - p += i; \ - len -= i; \ - } \ -} while (0) - -STATIC char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val) -{ - char *op = p; - *p = '\0'; - PSM(role); - PSM(peer); - PSM(conn); - PSM(disk); - PSM(pdsk); - - return op; -} - -#define INFOP(fmt, args...) \ -do { \ - if (trace_level >= TRACE_LVL_ALL) { \ - dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt , \ - file, line, current->comm, current->pid, \ - sockname, recv ? "<<<" : ">>>" , \ - ## args); \ - } else { \ - dev_info(DEV, "%s %s " fmt, sockname, \ - recv ? "<<<" : ">>>" , \ - ## args); \ - } \ -} while (0) - -STATIC char *_dump_block_id(u64 block_id, char *buff) -{ - if (is_syncer_block_id(block_id)) - strcpy(buff, "SyncerId"); - else - sprintf(buff, "%llx", (unsigned long long)block_id); - - return buff; -} - -static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock, - int recv, union p_polymorph *p, char *file, int line) -{ - char *sockname = sock == mdev->meta.socket ? "meta" : "data"; - int cmd = (recv == 2) ? 
p->header.command : be16_to_cpu(p->header.command); - char tmp[300]; - union drbd_state m, v; - - switch (cmd) { - case P_HAND_SHAKE: - INFOP("%s (protocol %u-%u)\n", cmdname(cmd), - be32_to_cpu(p->handshake.protocol_min), - be32_to_cpu(p->handshake.protocol_max)); - break; - - case P_BITMAP: /* don't report this */ - case P_COMPRESSED_BITMAP: /* don't report this */ - break; - - case P_DATA: - INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd), - (unsigned long long)be64_to_cpu(p->data.sector), - _dump_block_id(p->data.block_id, tmp), - be32_to_cpu(p->data.seq_num), - be32_to_cpu(p->data.dp_flags) - ); - break; - - case P_DATA_REPLY: - case P_RS_DATA_REPLY: - INFOP("%s (sector %llus, id %s)\n", cmdname(cmd), - (unsigned long long)be64_to_cpu(p->data.sector), - _dump_block_id(p->data.block_id, tmp) - ); - break; - - case P_RECV_ACK: - case P_WRITE_ACK: - case P_RS_WRITE_ACK: - case P_DISCARD_ACK: - case P_NEG_ACK: - case P_NEG_RS_DREPLY: - INFOP("%s (sector %llus, size %u, id %s, seq %u)\n", - cmdname(cmd), - (long long)be64_to_cpu(p->block_ack.sector), - be32_to_cpu(p->block_ack.blksize), - _dump_block_id(p->block_ack.block_id, tmp), - be32_to_cpu(p->block_ack.seq_num) - ); - break; - - case P_DATA_REQUEST: - case P_RS_DATA_REQUEST: - INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd), - (long long)be64_to_cpu(p->block_req.sector), - be32_to_cpu(p->block_req.blksize), - _dump_block_id(p->block_req.block_id, tmp) - ); - break; - - case P_BARRIER: - case P_BARRIER_ACK: - INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier); - break; - - case P_SYNC_PARAM: - case P_SYNC_PARAM89: - INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n", - cmdname(cmd), be32_to_cpu(p->rs_param_89.rate), - p->rs_param_89.verify_alg, p->rs_param_89.csums_alg); - break; - - case P_UUIDS: - INFOP("%s Curr:%016llX, Bitmap:%016llX, " - "HisSt:%016llX, HisEnd:%016llX\n", - cmdname(cmd), - (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]), - 
(unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]), - (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]), - (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END])); - break; - - case P_SIZES: - INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, " - "max bio %x, q order %x)\n", - cmdname(cmd), - (long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)), - (long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)), - (long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)), - be32_to_cpu(p->sizes.max_segment_size), - be32_to_cpu(p->sizes.queue_order_type)); - break; - - case P_STATE: - v.i = be32_to_cpu(p->state.state); - m.i = 0xffffffff; - dump_st(tmp, sizeof(tmp), m, v); - INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp); - break; - - case P_STATE_CHG_REQ: - m.i = be32_to_cpu(p->req_state.mask); - v.i = be32_to_cpu(p->req_state.val); - dump_st(tmp, sizeof(tmp), m, v); - INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp); - break; - - case P_STATE_CHG_REPLY: - INFOP("%s (ret %x)\n", cmdname(cmd), - be32_to_cpu(p->req_state_reply.retcode)); - break; - - case P_PING: - case P_PING_ACK: - /* - * Dont trace pings at summary level - */ - if (trace_level < TRACE_LVL_ALL) - break; - /* fall through... 
*/ - default: - INFOP("%s (%u)\n", cmdname(cmd), cmd); - break; - } -} - - -static int __init drbd_trace_init(void) -{ - int ret; - - if (trace_mask & TRACE_UNPLUG) { - ret = register_trace_drbd_unplug(probe_drbd_unplug); - WARN_ON(ret); - } - if (trace_mask & TRACE_UUID) { - ret = register_trace_drbd_uuid(probe_drbd_uuid); - WARN_ON(ret); - } - if (trace_mask & TRACE_EE) { - ret = register_trace_drbd_ee(probe_drbd_ee); - WARN_ON(ret); - } - if (trace_mask & TRACE_PACKET) { - ret = register_trace_drbd_packet(probe_drbd_packet); - WARN_ON(ret); - } - if (trace_mask & TRACE_MD_IO) { - ret = register_trace_drbd_md_io(probe_drbd_md_io); - WARN_ON(ret); - } - if (trace_mask & TRACE_EPOCH) { - ret = register_trace_drbd_epoch(probe_drbd_epoch); - WARN_ON(ret); - } - if (trace_mask & TRACE_NL) { - ret = register_trace_drbd_netlink(probe_drbd_netlink); - WARN_ON(ret); - } - if (trace_mask & TRACE_AL_EXT) { - ret = register_trace_drbd_actlog(probe_drbd_actlog); - WARN_ON(ret); - } - if (trace_mask & TRACE_RQ) { - ret = register_trace_drbd_bio(probe_drbd_bio); - WARN_ON(ret); - } - if (trace_mask & TRACE_INT_RQ) { - ret = register_trace_drbd_req(probe_drbd_req); - WARN_ON(ret); - } - if (trace_mask & TRACE_RESYNC) { - ret = register_trace__drbd_resync(probe_drbd_resync); - WARN_ON(ret); - } - return 0; -} - -module_init(drbd_trace_init); - -static void __exit drbd_trace_exit(void) -{ - if (trace_mask & TRACE_UNPLUG) - unregister_trace_drbd_unplug(probe_drbd_unplug); - if (trace_mask & TRACE_UUID) - unregister_trace_drbd_uuid(probe_drbd_uuid); - if (trace_mask & TRACE_EE) - unregister_trace_drbd_ee(probe_drbd_ee); - if (trace_mask & TRACE_PACKET) - unregister_trace_drbd_packet(probe_drbd_packet); - if (trace_mask & TRACE_MD_IO) - unregister_trace_drbd_md_io(probe_drbd_md_io); - if (trace_mask & TRACE_EPOCH) - unregister_trace_drbd_epoch(probe_drbd_epoch); - if (trace_mask & TRACE_NL) - unregister_trace_drbd_netlink(probe_drbd_netlink); - if (trace_mask & TRACE_AL_EXT) - 
unregister_trace_drbd_actlog(probe_drbd_actlog); - if (trace_mask & TRACE_RQ) - unregister_trace_drbd_bio(probe_drbd_bio); - if (trace_mask & TRACE_INT_RQ) - unregister_trace_drbd_req(probe_drbd_req); - if (trace_mask & TRACE_RESYNC) - unregister_trace__drbd_resync(probe_drbd_resync); - - tracepoint_synchronize_unregister(); -} - -module_exit(drbd_trace_exit); diff -Nru drbd8-8.3.7/drbd/drbd_tracing.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_tracing.h --- drbd8-8.3.7/drbd/drbd_tracing.h 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_tracing.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,87 +0,0 @@ -/* - drbd_tracing.h - - This file is part of DRBD by Philipp Reisner and Lars Ellenberg. - - Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. - Copyright (C) 2003-2008, Philipp Reisner . - Copyright (C) 2003-2008, Lars Ellenberg . - - drbd is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - drbd is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with drbd; see the file COPYING. If not, write to - the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
- - */ - -#ifndef DRBD_TRACING_H -#define DRBD_TRACING_H - -#include -#include "drbd_int.h" -#include "drbd_req.h" - -enum { - TRACE_LVL_ALWAYS = 0, - TRACE_LVL_SUMMARY, - TRACE_LVL_METRICS, - TRACE_LVL_ALL, - TRACE_LVL_MAX -}; - -DECLARE_TRACE(drbd_unplug, - TP_PROTO(struct drbd_conf *mdev, char* msg), - TP_ARGS(mdev, msg)); - -DECLARE_TRACE(drbd_uuid, - TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index), - TP_ARGS(mdev, index)); - -DECLARE_TRACE(drbd_ee, - TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg), - TP_ARGS(mdev, e, msg)); - -DECLARE_TRACE(drbd_md_io, - TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev), - TP_ARGS(mdev, rw, bdev)); - -DECLARE_TRACE(drbd_epoch, - TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev), - TP_ARGS(mdev, epoch, ev)); - -DECLARE_TRACE(drbd_netlink, - TP_PROTO(void *data, int is_req), - TP_ARGS(data, is_req)); - -DECLARE_TRACE(drbd_actlog, - TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg), - TP_ARGS(mdev, sector, msg)); - -DECLARE_TRACE(drbd_bio, - TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, - struct drbd_request *r), - TP_ARGS(mdev, pfx, bio, complete, r)); - -DECLARE_TRACE(drbd_req, - TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg), - TP_ARGS(req, what, msg)); - -DECLARE_TRACE(drbd_packet, - TP_PROTO(struct drbd_conf *mdev, struct socket *sock, - int recv, union p_polymorph *p, char *file, int line), - TP_ARGS(mdev, sock, recv, p, file, line)); - -DECLARE_TRACE(_drbd_resync, - TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args), - TP_ARGS(mdev, level, fmt, args)); - -#endif diff -Nru drbd8-8.3.7/drbd/drbd_vli.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_vli.h --- drbd8-8.3.7/drbd/drbd_vli.h 2009-07-27 08:47:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_vli.h 2012-02-02 14:09:14.000000000 +0000 @@ -32,7 +32,7 @@ * the bitmap transfer time can 
take much too long, * if transmitted in plain text. * - * We try to reduce the transfered bitmap information + * We try to reduce the transferred bitmap information * by encoding runlengths of bit polarity. * * We never actually need to encode a "zero" (runlengths are positive). diff -Nru drbd8-8.3.7/drbd/drbd_worker.c drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_worker.c --- drbd8-8.3.7/drbd/drbd_worker.c 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_worker.c 2012-02-02 14:09:14.000000000 +0000 @@ -27,41 +27,27 @@ #include #include #include -#include #include #include #include #include #include #include -#ifdef HAVE_LINUX_SCATTERLIST_H -/* 2.6.11 (suse 9.3, fc4) does not include requisites - * from linux/scatterlist.h :( */ -#include -#include #include -#endif #include "drbd_int.h" #include "drbd_req.h" -#include "drbd_tracing.h" - -#define SLEEP_TIME (HZ/10) - -STATIC int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); +STATIC int w_make_ov_request(struct drbd_work *w, int cancel); -/* defined here: - drbd_md_io_complete - drbd_endio_write_sec - drbd_endio_read_sec - drbd_endio_pri - - * more endio handlers: - atodb_endio in drbd_actlog.c - drbd_bm_async_io_complete in drbd_bitmap.c +/* endio handlers: + * drbd_md_io_complete (defined here) + * drbd_request_endio (defined here) + * drbd_peer_request_endio (defined here) + * bm_async_io_complete (defined in drbd_bitmap.c) + * * For all these callbacks, note the following: * The callbacks will be called in irq context by the IDE drivers, * and in Softirqs/Tasklets/BH context by the SCSI drivers. @@ -72,7 +58,7 @@ /* About the global_state_lock Each state transition on an device holds a read lock. In case we have - to evaluate the sync after dependencies, we grab a write lock, because + to evaluate the resync after dependencies, we grab a write lock, because we need stable states on all devices for that. 
*/ rwlock_t global_state_lock; @@ -82,171 +68,179 @@ BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error) { struct drbd_md_io *md_io; + struct drbd_conf *mdev; BIO_ENDIO_FN_START; md_io = (struct drbd_md_io *)bio->bi_private; + mdev = container_of(md_io, struct drbd_conf, md_io); + md_io->error = error; - trace_drbd_bio(md_io->mdev, "Md", bio, 1, NULL); + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. + * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(mdev); + md_io->done = 1; + wake_up(&mdev->misc_wait); + bio_put(bio); + put_ldev(mdev); - complete(&md_io->event); BIO_ENDIO_FN_RETURN; } /* reads on behalf of the partner, * "submitted" by the receiver */ -BIO_ENDIO_TYPE drbd_endio_read_sec BIO_ENDIO_ARGS(struct bio *bio, int error) __releases(local) +void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) { unsigned long flags = 0; - struct drbd_epoch_entry *e = NULL; - struct drbd_conf *mdev; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - e = bio->bi_private; - mdev = e->mdev; - - BIO_ENDIO_FN_START; - if (error) - dev_warn(DEV, "read: error=%d s=%llus\n", error, - (unsigned long long)e->sector); - if (!error && !uptodate) { - dev_warn(DEV, "read: setting error to -EIO s=%llus\n", - (unsigned long long)e->sector); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! 
*/ - error = -EIO; - } + struct drbd_conf *mdev = peer_req->w.mdev; - D_ASSERT(e->block_id != ID_VACANT); - - trace_drbd_bio(mdev, "Sec", bio, 1, NULL); - - spin_lock_irqsave(&mdev->req_lock, flags); - mdev->read_cnt += e->size >> 9; - list_del(&e->w.list); + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + mdev->read_cnt += peer_req->i.size >> 9; + list_del(&peer_req->w.list); if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); - spin_unlock_irqrestore(&mdev->req_lock, flags); + if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + __drbd_chk_io_error(mdev, false); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); - drbd_chk_io_error(mdev, error, FALSE); - drbd_queue_work(&mdev->data.work, &e->w); + drbd_queue_work(&mdev->tconn->data.work, &peer_req->w); put_ldev(mdev); +} - trace_drbd_ee(mdev, e, "read completed"); - BIO_ENDIO_FN_RETURN; +static int is_failed_barrier(int ee_flags) +{ + return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED)) + == (EE_IS_BARRIER|EE_WAS_ERROR); } /* writes on behalf of the partner, or resync writes, - * "submitted" by the receiver. - */ -BIO_ENDIO_TYPE drbd_endio_write_sec BIO_ENDIO_ARGS(struct bio *bio, int error) __releases(local) + * "submitted" by the receiver, final stage. 
*/ +static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) { unsigned long flags = 0; - struct drbd_epoch_entry *e = NULL; - struct drbd_conf *mdev; - sector_t e_sector; + struct drbd_conf *mdev = peer_req->w.mdev; + struct drbd_interval i; int do_wake; - int is_syncer_req; + u64 block_id; int do_al_complete_io; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); - - e = bio->bi_private; - mdev = e->mdev; - BIO_ENDIO_FN_START; - if (error) - dev_warn(DEV, "write: error=%d s=%llus\n", error, - (unsigned long long)e->sector); - if (!error && !uptodate) { - dev_warn(DEV, "write: setting error to -EIO s=%llus\n", - (unsigned long long)e->sector); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - error = -EIO; - } - - /* error == -ENOTSUPP would be a better test, - * alas it is not reliable */ - if (error && is_barrier && e->flags & EE_IS_BARRIER) { - drbd_bump_write_ordering(mdev, WO_bdev_flush); - spin_lock_irqsave(&mdev->req_lock, flags); - list_del(&e->w.list); - e->w.cb = w_e_reissue; + /* if this is a failed barrier request, disable use of barriers, + * and schedule for resubmission */ + if (is_failed_barrier(peer_req->flags)) { + drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush); + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + list_del(&peer_req->w.list); + peer_req->flags = (peer_req->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED; + peer_req->w.cb = w_e_reissue; /* put_ldev actually happens below, once we come here again. 
*/ __release(local); - spin_unlock_irqrestore(&mdev->req_lock, flags); - drbd_queue_work(&mdev->data.work, &e->w); - BIO_ENDIO_FN_RETURN; + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); + drbd_queue_work(&mdev->tconn->data.work, &peer_req->w); + return; } - D_ASSERT(e->block_id != ID_VACANT); - - trace_drbd_bio(mdev, "Sec", bio, 1, NULL); - - spin_lock_irqsave(&mdev->req_lock, flags); - mdev->writ_cnt += e->size >> 9; - is_syncer_req = is_syncer_block_id(e->block_id); - - /* after we moved e to done_ee, + /* after we moved peer_req to done_ee, * we may no longer access it, * it may be freed/reused already! * (as soon as we release the req_lock) */ - e_sector = e->sector; - do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; - - list_del(&e->w.list); /* has been on active_ee or sync_ee */ - list_add_tail(&e->w.list, &mdev->done_ee); + i = peer_req->i; + do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; + block_id = peer_req->block_id; + + spin_lock_irqsave(&mdev->tconn->req_lock, flags); + mdev->writ_cnt += peer_req->i.size >> 9; + list_del(&peer_req->w.list); /* has been on active_ee or sync_ee */ + list_add_tail(&peer_req->w.list, &mdev->done_ee); + + /* + * Do not remove from the write_requests tree here: we did not send the + * Ack yet and did not wake possibly waiting conflicting requests. + * Removed from the tree from "drbd_process_done_ee" within the + * appropriate w.cb (e_end_block/e_end_resync_block) or from + * _drbd_clear_done_ee. + */ - trace_drbd_ee(mdev, e, "write completed"); + do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee); - /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, - * neither did we wake possibly waiting conflicting requests. - * done from "drbd_process_done_ee" within the appropriate w.cb - * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ - - do_wake = is_syncer_req - ? 
list_empty(&mdev->sync_ee) - : list_empty(&mdev->active_ee); + if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + __drbd_chk_io_error(mdev, false); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); - if (error) - __drbd_chk_io_error(mdev, FALSE); - spin_unlock_irqrestore(&mdev->req_lock, flags); - - if (is_syncer_req) - drbd_rs_complete_io(mdev, e_sector); + if (block_id == ID_SYNCER) + drbd_rs_complete_io(mdev, i.sector); if (do_wake) wake_up(&mdev->ee_wait); if (do_al_complete_io) - drbd_al_complete_io(mdev, e_sector); + drbd_al_complete_io(mdev, &i); - wake_asender(mdev); + wake_asender(mdev->tconn); put_ldev(mdev); +} +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +BIO_ENDIO_TYPE drbd_peer_request_endio BIO_ENDIO_ARGS(struct bio *bio, int error) +{ + struct drbd_peer_request *peer_req = bio->bi_private; + struct drbd_conf *mdev = peer_req->w.mdev; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_write = bio_data_dir(bio) == WRITE; + + BIO_ENDIO_FN_START; + if (error && DRBD_ratelimit(5*HZ, 5)) + dev_warn(DEV, "%s: error=%d s=%llus\n", + is_write ? "write" : "read", error, + (unsigned long long)peer_req->i.sector); + if (!error && !uptodate) { + if (DRBD_ratelimit(5*HZ, 5)) + dev_warn(DEV, "%s: setting error to -EIO s=%llus\n", + is_write ? "write" : "read", + (unsigned long long)peer_req->i.sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! 
*/ + error = -EIO; + } + + if (error) + set_bit(__EE_WAS_ERROR, &peer_req->flags); + + bio_put(bio); /* no need for the bio anymore */ + if (atomic_dec_and_test(&peer_req->pending_bios)) { + if (is_write) + drbd_endio_write_sec_final(peer_req); + else + drbd_endio_read_sec_final(peer_req); + } BIO_ENDIO_FN_RETURN; } /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ -BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error) +BIO_ENDIO_TYPE drbd_request_endio BIO_ENDIO_ARGS(struct bio *bio, int error) { unsigned long flags; struct drbd_request *req = bio->bi_private; - struct drbd_conf *mdev = req->mdev; + struct drbd_conf *mdev = req->w.mdev; struct bio_and_error m; enum drbd_req_event what; int uptodate = bio_flagged(bio, BIO_UPTODATE); BIO_ENDIO_FN_START; - if (error) - dev_warn(DEV, "p %s: error=%d\n", - bio_data_dir(bio) == WRITE ? "write" : "read", error); if (!error && !uptodate) { dev_warn(DEV, "p %s: setting error to -EIO\n", bio_data_dir(bio) == WRITE ? "write" : "read"); @@ -256,74 +250,78 @@ error = -EIO; } - trace_drbd_bio(mdev, "Pri", bio, 1, NULL); - /* to avoid recursion in __req_mod */ if (unlikely(error)) { what = (bio_data_dir(bio) == WRITE) - ? write_completed_with_error - : (bio_rw(bio) == READA) - ? read_completed_with_error - : read_ahead_completed_with_error; + ? WRITE_COMPLETED_WITH_ERROR + : (bio_rw(bio) == READ) + ? READ_COMPLETED_WITH_ERROR + : READ_AHEAD_COMPLETED_WITH_ERROR; } else - what = completed_ok; + what = COMPLETED_OK; bio_put(req->private_bio); req->private_bio = ERR_PTR(error); - spin_lock_irqsave(&mdev->req_lock, flags); + /* not req_mod(), we need irqsave here! 
*/ + spin_lock_irqsave(&mdev->tconn->req_lock, flags); __req_mod(req, what, &m); - spin_unlock_irqrestore(&mdev->req_lock, flags); + spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); if (m.bio) complete_master_bio(mdev, &m); BIO_ENDIO_FN_RETURN; } -int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) -{ - struct drbd_request *req = container_of(w, struct drbd_request, w); - - /* NOTE: mdev->ldev can be NULL by the time we get here! */ - /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ - - /* the only way this callback is scheduled is from _req_may_be_done, - * when it is done and had a local write error, see comments there */ - drbd_req_free(req); - - return TRUE; -} - -int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_read_retry_remote(struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_conf *mdev = w->mdev; /* We should not detach for read io-error, * but try to WRITE the P_DATA_REPLY to the failed location, * to give the disk the chance to relocate that block */ - spin_lock_irq(&mdev->req_lock); - if (cancel || - mdev->state.conn < C_CONNECTED || - mdev->state.pdsk <= D_INCONSISTENT) { - _req_mod(req, send_canceled); - spin_unlock_irq(&mdev->req_lock); - dev_alert(DEV, "WE ARE LOST. 
Local IO failure, no peer.\n"); - return 1; + spin_lock_irq(&mdev->tconn->req_lock); + if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { + _req_mod(req, READ_RETRY_REMOTE_CANCELED); + spin_unlock_irq(&mdev->tconn->req_lock); + return 0; } - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); - return w_send_read_req(mdev, w, 0); + return w_send_read_req(w, 0); } -int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, + struct drbd_peer_request *peer_req, void *digest) { - ERR_IF(cancel) return 1; - dev_err(DEV, "resync inactive, but callback triggered??\n"); - return 1; /* Simply ignore this! */ + struct hash_desc desc; + struct scatterlist sg; + struct page *page = peer_req->pages; + struct page *tmp; + unsigned len; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + while ((tmp = page_chain_next(page))) { + /* all but the last page will be fully used */ + sg_set_page(&sg, page, PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + page = tmp; + } + /* and now the last, possibly only partially used page */ + len = peer_req->i.size & (PAGE_SIZE - 1); + sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + crypto_hash_final(&desc, digest); } -void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) +void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) { struct hash_desc desc; struct scatterlist sg; @@ -343,169 +341,286 @@ crypto_hash_final(&desc, digest); } -STATIC int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +/* MAYBE merge common code with w_e_end_ov_req */ +STATIC int w_e_send_csum(struct drbd_work *w, int cancel) { - struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct drbd_peer_request *peer_req = container_of(w, 
struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; int digest_size; void *digest; - int ok; + int err = 0; - D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); - - if (unlikely(cancel)) { - drbd_free_ee(mdev, e); - return 1; - } + if (unlikely(cancel)) + goto out; - if (likely(drbd_bio_uptodate(e->private_bio))) { - digest_size = crypto_hash_digestsize(mdev->csums_tfm); - digest = kmalloc(digest_size, GFP_NOIO); - if (digest) { - drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) + goto out; - inc_rs_pending(mdev); - ok = drbd_send_drequest_csum(mdev, - e->sector, - e->size, - digest, - digest_size, - P_CSUM_RS_REQUEST); - kfree(digest); - } else { - dev_err(DEV, "kmalloc() of digest failed.\n"); - ok = 0; - } - } else - ok = 1; + digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. 
*/ + drbd_free_peer_req(mdev, peer_req); + peer_req = NULL; + inc_rs_pending(mdev); + err = drbd_send_drequest_csum(mdev, sector, size, + digest, digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + dev_err(DEV, "kmalloc() of digest failed.\n"); + err = -ENOMEM; + } - drbd_free_ee(mdev, e); +out: + if (peer_req) + drbd_free_peer_req(mdev, peer_req); - if (unlikely(!ok)) + if (unlikely(err)) dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); - return ok; + return err; } #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) STATIC int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) { - struct drbd_epoch_entry *e; + struct drbd_peer_request *peer_req; if (!get_ldev(mdev)) - return 0; + return -EIO; + + if (drbd_rs_should_slow_down(mdev, sector)) + goto defer; /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ - e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); - if (!e) { - put_ldev(mdev); - return 2; - } + peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector, + size, GFP_TRY); + if (!peer_req) + goto defer; + + peer_req->w.cb = w_e_send_csum; + spin_lock_irq(&mdev->tconn->req_lock); + list_add(&peer_req->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->tconn->req_lock); + + atomic_add(size >> 9, &mdev->rs_sect_ev); + if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0) + return 0; - spin_lock_irq(&mdev->req_lock); - list_add(&e->w.list, &mdev->read_ee); - spin_unlock_irq(&mdev->req_lock); + /* If it failed because of ENOMEM, retry should help. If it failed + * because bio_add_page failed (probably broken lower level driver), + * retry may or may not help. + * If it does not, you may need to force disconnect. 
*/ + spin_lock_irq(&mdev->tconn->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&mdev->tconn->req_lock); - e->private_bio->bi_end_io = drbd_endio_read_sec; - e->private_bio->bi_rw = READ; - e->w.cb = w_e_send_csum; + drbd_free_peer_req(mdev, peer_req); +defer: + put_ldev(mdev); + return -EAGAIN; +} - mdev->read_cnt += size >> 9; - drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); +int w_resync_timer(struct drbd_work *w, int cancel) +{ + struct drbd_conf *mdev = w->mdev; + switch (mdev->state.conn) { + case C_VERIFY_S: + w_make_ov_request(w, cancel); + break; + case C_SYNC_TARGET: + w_make_resync_request(w, cancel); + break; + } - return 1; + return 0; } void resync_timer_fn(unsigned long data) { - unsigned long flags; struct drbd_conf *mdev = (struct drbd_conf *) data; - int queue; - spin_lock_irqsave(&mdev->req_lock, flags); + if (list_empty(&mdev->resync_work.list)) + drbd_queue_work(&mdev->tconn->data.work, &mdev->resync_work); +} - if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { - queue = 1; - if (mdev->state.conn == C_VERIFY_S) - mdev->resync_work.cb = w_make_ov_request; - else - mdev->resync_work.cb = w_make_resync_request; - } else { - queue = 0; - mdev->resync_work.cb = w_resync_inactive; +static void fifo_set(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] = value; +} + +static int fifo_push(struct fifo_buffer *fb, int value) +{ + int ov; + + ov = fb->values[fb->head_index]; + fb->values[fb->head_index++] = value; + + if (fb->head_index >= fb->size) + fb->head_index = 0; + + return ov; +} + +static void fifo_add_val(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +struct fifo_buffer *fifo_alloc(int fifo_size) +{ + struct fifo_buffer *fb; + + fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_KERNEL); + if (!fb) + return NULL; + + fb->head_index = 0; + fb->size = fifo_size; + 
fb->total = 0; + + return fb; +} + +STATIC int drbd_rs_controller(struct drbd_conf *mdev) +{ + struct disk_conf *dc; + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + unsigned int want; /* The number of sectors we want in the proxy */ + int req_sect; /* Number of sectors to request in this turn */ + int correction; /* Number of sectors more we need in the proxy*/ + int cps; /* correction per invocation of drbd_rs_controller() */ + int steps; /* Number of time steps to plan ahead */ + int curr_corr; + int max_sect; + struct fifo_buffer *plan; + + sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */ + mdev->rs_in_flight -= sect_in; + + dc = rcu_dereference(mdev->ldev->disk_conf); + plan = rcu_dereference(mdev->rs_plan_s); + + steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + + if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */ + want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; + } else { /* normal path */ + want = dc->c_fill_target ? 
dc->c_fill_target : + sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); } - spin_unlock_irqrestore(&mdev->req_lock, flags); + correction = want - mdev->rs_in_flight - plan->total; + + /* Plan ahead */ + cps = correction / steps; + fifo_add_val(plan, cps); + plan->total += cps * steps; + + /* What we do in this step */ + curr_corr = fifo_push(plan, 0); + plan->total -= curr_corr; + + req_sect = sect_in + curr_corr; + if (req_sect < 0) + req_sect = 0; + + max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; + if (req_sect > max_sect) + req_sect = max_sect; + + /* + dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n", + sect_in, mdev->rs_in_flight, want, correction, + steps, cps, mdev->rs_planed, curr_corr, req_sect); + */ + + return req_sect; +} + +STATIC int drbd_rs_number_requests(struct drbd_conf *mdev) +{ + int number; + + rcu_read_lock(); + if (rcu_dereference(mdev->rs_plan_s)->size) { + number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9); + mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; + } else { + mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate; + number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); + } + rcu_read_unlock(); - /* harmless race: list_empty outside data.work.q_lock */ - if (list_empty(&mdev->resync_work.list) && queue) - drbd_queue_work(&mdev->data.work, &mdev->resync_work); + /* ignore the amount of pending requests, the resync controller should + * throttle down to incoming reply rate soon enough anyways. 
*/ + return number; } -int w_make_resync_request(struct drbd_conf *mdev, - struct drbd_work *w, int cancel) +int w_make_resync_request(struct drbd_work *w, int cancel) { + struct drbd_conf *mdev = w->mdev; unsigned long bit; sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); - int max_segment_size = queue_max_segment_size(mdev->rq_queue); - int number, i, size, pe, mx; + int max_bio_size; + int number, rollback_i, size; int align, queued, sndbuf; + int i = 0; - PARANOIA_BUG_ON(w != &mdev->resync_work); +#ifdef PARANOIA + BUG_ON(w != &mdev->resync_work); +#endif if (unlikely(cancel)) - return 1; + return 0; - if (unlikely(mdev->state.conn < C_CONNECTED)) { - dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected"); + if (mdev->rs_total == 0) { + /* empty resync? */ + drbd_resync_finished(mdev); return 0; } - if (mdev->state.conn != C_SYNC_TARGET) - dev_err(DEV, "%s in w_make_resync_request\n", - drbd_conn_str(mdev->state.conn)); - if (!get_ldev(mdev)) { /* Since we only need to access mdev->rsync a get_ldev_if_state(mdev,D_FAILED) would be sufficient, but to continue resync with a broken disk makes no sense at all */ dev_err(DEV, "Disk broke down during resync!\n"); - mdev->resync_work.cb = w_resync_inactive; - return 1; + return 0; } - number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); - pe = atomic_read(&mdev->rs_pending_cnt); - - mutex_lock(&mdev->data.mutex); - if (mdev->data.socket) - mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req); - else - mx = 1; - mutex_unlock(&mdev->data.mutex); - - /* For resync rates >160MB/sec, allow more pending RS requests */ - if (number > mx) - mx = number; - - /* Limit the number of pending RS requests to no more than the peer's receive buffer */ - if ((pe + number) > mx) { - number = mx - pe; - } + max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; + number = drbd_rs_number_requests(mdev); + if (number == 0) + goto requeue; for (i = 0; i < 
number; i++) { /* Stop generating RS requests, when half of the send buffer is filled */ - mutex_lock(&mdev->data.mutex); - if (mdev->data.socket) { - queued = mdev->data.socket->sk->sk_wmem_queued; - sndbuf = mdev->data.socket->sk->sk_sndbuf; + mutex_lock(&mdev->tconn->data.mutex); + if (mdev->tconn->data.socket) { + queued = mdev->tconn->data.socket->sk->sk_wmem_queued; + sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf; } else { queued = 1; sndbuf = 0; } - mutex_unlock(&mdev->data.mutex); + mutex_unlock(&mdev->tconn->data.mutex); if (queued > sndbuf / 2) goto requeue; @@ -513,16 +628,16 @@ size = BM_BLOCK_SIZE; bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); - if (bit == -1UL) { + if (bit == DRBD_END_OF_BITMAP) { mdev->bm_resync_fo = drbd_bm_bits(mdev); - mdev->resync_work.cb = w_resync_inactive; put_ldev(mdev); - return 1; + return 0; } sector = BM_BIT_TO_SECT(bit); - if (drbd_try_rs_begin_io(mdev, sector)) { + if (drbd_rs_should_slow_down(mdev, sector) || + drbd_try_rs_begin_io(mdev, sector)) { mdev->bm_resync_fo = bit; goto requeue; } @@ -533,22 +648,17 @@ goto next_sector; } -#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE +#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE /* try to find some adjacent bits. * we stop if we have already the maximum req size. * * Additionally always align bigger requests, in order to * be prepared for all stripe sizes of software RAIDs. - * - * we _do_ care about the agreed-upon q->max_segment_size - * here, as splitting up the requests on the other side is more - * difficult. the consequence is, that on lvm and md and other - * "indirect" devices, this is dead code, since - * q->max_segment_size will be PAGE_SIZE. 
*/ align = 1; + rollback_i = i; for (;;) { - if (size + BM_BLOCK_SIZE > max_segment_size) + if (size + BM_BLOCK_SIZE > max_bio_size) break; /* Be always aligned */ @@ -580,25 +690,33 @@ /* adjust very last sectors, in case we are oddly sized */ if (sector + (size>>9) > capacity) size = (capacity-sector)<<9; - if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { + if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) { switch (read_for_csum(mdev, sector, size)) { - case 0: /* Disk failure*/ + case -EIO: /* Disk failure */ put_ldev(mdev); - return 0; - case 2: /* Allocation failed */ + return -EIO; + case -EAGAIN: /* allocation failed, or ldev busy */ drbd_rs_complete_io(mdev, sector); mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); + i = rollback_i; goto requeue; - /* case 1: everything ok */ + case 0: + /* everything ok */ + break; + default: + BUG(); } } else { + int err; + inc_rs_pending(mdev); - if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, - sector, size, ID_SYNCER)) { + err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST, + sector, size, ID_SYNCER); + if (err) { dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); dec_rs_pending(mdev); put_ldev(mdev); - return 0; + return err; } } } @@ -610,19 +728,20 @@ * resync data block, and the last bit is cleared. * until then resync "work" is "inactive" ... 
*/ - mdev->resync_work.cb = w_resync_inactive; put_ldev(mdev); - return 1; + return 0; } requeue: + mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); put_ldev(mdev); - return 1; + return 0; } -STATIC int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +STATIC int w_make_ov_request(struct drbd_work *w, int cancel) { + struct drbd_conf *mdev = w->mdev; int number, i, size; sector_t sector; const sector_t capacity = drbd_get_capacity(mdev->this_bdev); @@ -630,27 +749,18 @@ if (unlikely(cancel)) return 1; - if (unlikely(mdev->state.conn < C_CONNECTED)) { - dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected"); - return 0; - } - - number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); - if (atomic_read(&mdev->rs_pending_cnt) > number) - goto requeue; - - number -= atomic_read(&mdev->rs_pending_cnt); + number = drbd_rs_number_requests(mdev); sector = mdev->ov_position; for (i = 0; i < number; i++) { if (sector >= capacity) { - mdev->resync_work.cb = w_resync_inactive; return 1; } size = BM_BLOCK_SIZE; - if (drbd_try_rs_begin_io(mdev, sector)) { + if (drbd_rs_should_slow_down(mdev, sector) || + drbd_try_rs_begin_io(mdev, sector)) { mdev->ov_position = sector; goto requeue; } @@ -659,7 +769,7 @@ size = (capacity-sector)<<9; inc_rs_pending(mdev); - if (!drbd_send_ov_request(mdev, sector, size)) { + if (drbd_send_ov_request(mdev, sector, size)) { dec_rs_pending(mdev); return 0; } @@ -668,27 +778,39 @@ mdev->ov_position = sector; requeue: + mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); return 1; } - -int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_ov_finished(struct drbd_work *w, int cancel) { + struct drbd_conf *mdev = w->mdev; kfree(w); - ov_oos_print(mdev); + ov_out_of_sync_print(mdev); drbd_resync_finished(mdev); - return 1; + return 0; } -STATIC int 
w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +STATIC int w_resync_finished(struct drbd_work *w, int cancel) { + struct drbd_conf *mdev = w->mdev; kfree(w); drbd_resync_finished(mdev); - return 1; + return 0; +} + +STATIC void ping_peer(struct drbd_conf *mdev) +{ + struct drbd_tconn *tconn = mdev->tconn; + + clear_bit(GOT_PING_ACK, &tconn->flags); + request_ping(tconn); + wait_event(tconn->ping_wait, + test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED); } int drbd_resync_finished(struct drbd_conf *mdev) @@ -698,6 +820,7 @@ union drbd_state os, ns; struct drbd_work *w; char *khelper_cmd = NULL; + int verify_done = 0; /* Remove all elements from the resync LRU. Since future actions * might set bits in the (main) bitmap, then the entries in the @@ -708,13 +831,12 @@ * queue (or even the read operations for those packets * is not finished by now). Retry in 100ms. */ - drbd_kick_lo(mdev); - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 10); + schedule_timeout_interruptible(HZ / 10); w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); if (w) { w->cb = w_resync_finished; - drbd_queue_work(&mdev->data.work, w); + w->mdev = mdev; + drbd_queue_work(&mdev->tconn->data.work, w); return 1; } dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); @@ -730,8 +852,12 @@ if (!get_ldev(mdev)) goto out; - spin_lock_irq(&mdev->req_lock); - os = mdev->state; + ping_peer(mdev); + + spin_lock_irq(&mdev->tconn->req_lock); + os = drbd_read_state(mdev); + + verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); /* This protects us against multiple calls (that can happen in the presence of application IO), and against connectivity loss just before we arrive here. */ @@ -742,8 +868,7 @@ ns.conn = C_CONNECTED; dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", - (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? - "Online verify " : "Resync", + verify_done ? 
"Online verify " : "Resync", dt + mdev->rs_paused, mdev->rs_paused, dbdt); n_oos = drbd_bm_total_weight(mdev); @@ -760,13 +885,13 @@ if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) khelper_cmd = "after-resync-target"; - if (mdev->csums_tfm && mdev->rs_total) { + if (mdev->tconn->csums_tfm && mdev->rs_total) { const unsigned long s = mdev->rs_same_csum; const unsigned long t = mdev->rs_total; const int ratio = (t == 0) ? 0 : (t < 100000) ? ((s*100)/t) : (s/(t/100)); - dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " + dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; " "transferred %luK total %luK\n", ratio, Bit2KB(mdev->rs_same_csum), @@ -801,32 +926,33 @@ } } - drbd_uuid_set_bm(mdev, 0UL); - - if (mdev->p_uuid) { - /* Now the two UUID sets are equal, update what we - * know of the peer. */ - int i; - for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) - mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; + if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) { + /* for verify runs, we don't update uuids here, + * so there would be nothing to report. */ + drbd_uuid_set_bm(mdev, 0UL); + drbd_print_uuids(mdev, "updated UUIDs"); + if (mdev->p_uuid) { + /* Now the two UUID sets are equal, update what we + * know of the peer. 
*/ + int i; + for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) + mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; + } } } - DRBD_STATE_DEBUG_INIT_VAL(ns); _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); out_unlock: - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); put_ldev(mdev); out: mdev->rs_total = 0; mdev->rs_failed = 0; mdev->rs_paused = 0; - mdev->ov_start_sector = 0; + if (verify_done) + mdev->ov_start_sector = 0; - if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { - dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); - drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); - } + drbd_md_sync(mdev); if (khelper_cmd) drbd_khelper(mdev, khelper_cmd); @@ -835,15 +961,19 @@ } /* helper */ -static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req) { - if (drbd_bio_has_active_page(e->private_bio)) { + if (drbd_peer_req_has_active_page(peer_req)) { /* This might happen if sendpage() has not finished */ - spin_lock_irq(&mdev->req_lock); - list_add_tail(&e->w.list, &mdev->net_ee); - spin_unlock_irq(&mdev->req_lock); + int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + atomic_add(i, &mdev->pp_in_use_by_net); + atomic_sub(i, &mdev->pp_in_use); + spin_lock_irq(&mdev->tconn->req_lock); + list_add_tail(&peer_req->w.list, &mdev->net_ee); + spin_unlock_irq(&mdev->tconn->req_lock); + wake_up(&drbd_pp_wait); } else - drbd_free_ee(mdev, e); + drbd_free_peer_req(mdev, peer_req); } /** @@ -852,182 +982,203 @@ * @w: work object. 
* @cancel: The connection will be closed anyways */ -int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_e_end_data_req(struct drbd_work *w, int cancel) { - struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); - int ok; + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; + int err; if (unlikely(cancel)) { - drbd_free_ee(mdev, e); + drbd_free_peer_req(mdev, peer_req); dec_unacked(mdev); - return 1; + return 0; } - if (likely(drbd_bio_uptodate(e->private_bio))) { - ok = drbd_send_block(mdev, P_DATA_REPLY, e); + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + err = drbd_send_block(mdev, P_DATA_REPLY, peer_req); } else { if (DRBD_ratelimit(5*HZ, 5)) dev_err(DEV, "Sending NegDReply. sector=%llus.\n", - (unsigned long long)e->sector); + (unsigned long long)peer_req->i.sector); - ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); + err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req); } dec_unacked(mdev); - move_to_net_ee_or_free(mdev, e); + move_to_net_ee_or_free(mdev, peer_req); - if (unlikely(!ok)) + if (unlikely(err)) dev_err(DEV, "drbd_send_block() failed\n"); - return ok; + return err; } /** - * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST * @mdev: DRBD device. * @w: work object. 
* @cancel: The connection will be closed anyways */ -int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_e_end_rsdata_req(struct drbd_work *w, int cancel) { - struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); - int ok; + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; + int err; if (unlikely(cancel)) { - drbd_free_ee(mdev, e); + drbd_free_peer_req(mdev, peer_req); dec_unacked(mdev); - return 1; + return 0; } if (get_ldev_if_state(mdev, D_FAILED)) { - drbd_rs_complete_io(mdev, e->sector); + drbd_rs_complete_io(mdev, peer_req->i.sector); put_ldev(mdev); } - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (mdev->state.conn == C_AHEAD) { + err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req); + } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { inc_rs_pending(mdev); - ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); + err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); } else { if (DRBD_ratelimit(5*HZ, 5)) dev_err(DEV, "Not sending RSDataReply, " "partner DISKLESS!\n"); - ok = 1; + err = 0; } } else { if (DRBD_ratelimit(5*HZ, 5)) dev_err(DEV, "Sending NegRSDReply. 
sector %llus.\n", - (unsigned long long)e->sector); + (unsigned long long)peer_req->i.sector); - ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); /* update resync data with failure */ - drbd_rs_failed_io(mdev, e->sector, e->size); + drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size); } dec_unacked(mdev); - move_to_net_ee_or_free(mdev, e); + move_to_net_ee_or_free(mdev, peer_req); - if (unlikely(!ok)) + if (unlikely(err)) dev_err(DEV, "drbd_send_block() failed\n"); - return ok; + return err; } -int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) { - struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; struct digest_info *di; int digest_size; void *digest = NULL; - int ok, eq = 0; + int err, eq = 0; if (unlikely(cancel)) { - drbd_free_ee(mdev, e); + drbd_free_peer_req(mdev, peer_req); dec_unacked(mdev); - return 1; + return 0; } - drbd_rs_complete_io(mdev, e->sector); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, peer_req->i.sector); + put_ldev(mdev); + } - di = (struct digest_info *)(unsigned long)e->block_id; + di = peer_req->digest; - if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { /* quick hack to try to avoid a race against reconfiguration. 
* a real fix would be much more involved, * introducing more locking mechanisms */ - if (mdev->csums_tfm) { - digest_size = crypto_hash_digestsize(mdev->csums_tfm); + if (mdev->tconn->csums_tfm) { + digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm); D_ASSERT(digest_size == di->digest_size); digest = kmalloc(digest_size, GFP_NOIO); } if (digest) { - drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest); eq = !memcmp(digest, di->digest, digest_size); kfree(digest); } if (eq) { - drbd_set_in_sync(mdev, e->sector, e->size); - mdev->rs_same_csum++; - ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); + drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size); + /* rs_same_csums unit is BM_BLOCK_SIZE */ + mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; + err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req); } else { inc_rs_pending(mdev); - e->block_id = ID_SYNCER; - ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); + peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ + peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ + kfree(di); + err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req); } } else { - ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req); if (DRBD_ratelimit(5*HZ, 5)) dev_err(DEV, "Sending NegDReply. 
I guess it gets messy.\n"); } dec_unacked(mdev); + move_to_net_ee_or_free(mdev, peer_req); - kfree(di); - - move_to_net_ee_or_free(mdev, e); - - if (unlikely(!ok)) + if (unlikely(err)) dev_err(DEV, "drbd_send_block/ack() failed\n"); - return ok; + return err; } -int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_e_end_ov_req(struct drbd_work *w, int cancel) { - struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; int digest_size; void *digest; - int ok = 1; + int err = 0; if (unlikely(cancel)) goto out; - if (unlikely(!drbd_bio_uptodate(e->private_bio))) - goto out; - - digest_size = crypto_hash_digestsize(mdev->verify_tfm); + digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); /* FIXME if this allocation fails, online verify will not terminate! */ digest = kmalloc(digest_size, GFP_NOIO); - if (digest) { - drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); - inc_rs_pending(mdev); - ok = drbd_send_drequest_csum(mdev, e->sector, e->size, - digest, digest_size, P_OV_REPLY); - if (!ok) - dec_rs_pending(mdev); - kfree(digest); + if (!digest) { + err = -ENOMEM; + goto out; } -out: - drbd_free_ee(mdev, e); + if (!(peer_req->flags & EE_WAS_ERROR)) + drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); + else + memset(digest, 0, digest_size); - dec_unacked(mdev); + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. 
*/ + drbd_free_peer_req(mdev, peer_req); + peer_req = NULL; + + inc_rs_pending(mdev); + err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY); + if (err) + dec_rs_pending(mdev); + kfree(digest); - return ok; +out: + if (peer_req) + drbd_free_peer_req(mdev, peer_req); + dec_unacked(mdev); + return err; } -void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) +void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size) { if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { mdev->ov_last_oos_size += size>>9; @@ -1036,110 +1187,142 @@ mdev->ov_last_oos_size = size>>9; } drbd_set_out_of_sync(mdev, sector, size); - set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); } -int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_e_end_ov_reply(struct drbd_work *w, int cancel) { - struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_conf *mdev = w->mdev; struct digest_info *di; - int digest_size; void *digest; - int ok, eq = 0; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + int digest_size; + int err, eq = 0; if (unlikely(cancel)) { - drbd_free_ee(mdev, e); + drbd_free_peer_req(mdev, peer_req); dec_unacked(mdev); - return 1; + return 0; } /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all * the resync lru has been cleaned up already */ - drbd_rs_complete_io(mdev, e->sector); + if (get_ldev(mdev)) { + drbd_rs_complete_io(mdev, peer_req->i.sector); + put_ldev(mdev); + } - di = (struct digest_info *)(unsigned long)e->block_id; + di = peer_req->digest; - if (likely(drbd_bio_uptodate(e->private_bio))) { - digest_size = crypto_hash_digestsize(mdev->verify_tfm); + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm); digest = kmalloc(digest_size, 
GFP_NOIO); if (digest) { - drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest); D_ASSERT(digest_size == di->digest_size); eq = !memcmp(digest, di->digest, digest_size); kfree(digest); } - } else { - ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); - if (DRBD_ratelimit(5*HZ, 5)) - dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); } - dec_unacked(mdev); - - kfree(di); - + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. */ + drbd_free_peer_req(mdev, peer_req); if (!eq) - drbd_ov_oos_found(mdev, e->sector, e->size); + drbd_ov_out_of_sync_found(mdev, sector, size); else - ov_oos_print(mdev); + ov_out_of_sync_print(mdev); - ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, - eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); + err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, + eq ? 
ID_IN_SYNC : ID_OUT_OF_SYNC); + + dec_unacked(mdev); - drbd_free_ee(mdev, e); + --mdev->ov_left; - if (--mdev->ov_left == 0) { - ov_oos_print(mdev); + /* let's advance progress step marks only for every other megabyte */ + if ((mdev->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(mdev, mdev->ov_left); + + if (mdev->ov_left == 0) { + ov_out_of_sync_print(mdev); drbd_resync_finished(mdev); } - return ok; + return err; } -int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_prev_work_done(struct drbd_work *w, int cancel) { struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); + complete(&b->done); - return 1; + return 0; } -int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_send_barrier(struct drbd_work *w, int cancel) { + struct drbd_socket *sock; struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); - struct p_barrier *p = &mdev->data.sbuf.barrier; - int ok = 1; + struct drbd_conf *mdev = w->mdev; + struct p_barrier *p; /* really avoid racing with tl_clear. w.cb may have been referenced * just before it was reassigned and re-queued, so double check that. * actually, this race was harmless, since we only try to send the * barrier packet here, and otherwise do nothing with the object. * but compare with the head of w_clear_epoch */ - spin_lock_irq(&mdev->req_lock); + spin_lock_irq(&mdev->tconn->req_lock); if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) cancel = 1; - spin_unlock_irq(&mdev->req_lock); + spin_unlock_irq(&mdev->tconn->req_lock); if (cancel) - return 1; - - if (!drbd_get_data_sock(mdev)) return 0; + + sock = &mdev->tconn->data; + p = drbd_prepare_command(mdev, sock); + if (!p) + return -EIO; p->barrier = b->br_number; /* inc_ap_pending was done where this was queued. * dec_ap_pending will be done in got_BarrierAck * or (on connection loss) in w_clear_epoch. 
*/ - ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, - (struct p_header *)p, sizeof(*p), 0); - drbd_put_data_sock(mdev); - - return ok; + return drbd_send_command(mdev, sock, P_BARRIER, sizeof(*p), NULL, 0); } -int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_send_write_hint(struct drbd_work *w, int cancel) { + struct drbd_conf *mdev = w->mdev; + struct drbd_socket *sock; + if (cancel) - return 1; - return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); + return 0; + sock = &mdev->tconn->data; + if (!drbd_prepare_command(mdev, sock)) + return -EIO; + return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0); +} + +int w_send_out_of_sync(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_conf *mdev = w->mdev; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + + err = drbd_send_out_of_sync(mdev, req); + req_mod(req, OOS_HANDED_TO_NETWORK); + + return err; } /** @@ -1148,20 +1331,21 @@ * @w: work object. * @cancel: The connection will be closed anyways */ -int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_send_dblock(struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); - int ok; + struct drbd_conf *mdev = w->mdev; + int err; if (unlikely(cancel)) { - req_mod(req, send_canceled); - return 1; + req_mod(req, SEND_CANCELED); + return 0; } - ok = drbd_send_dblock(mdev, req); - req_mod(req, ok ? handed_over_to_network : send_failed); + err = drbd_send_dblock(mdev, req); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); - return ok; + return err; } /** @@ -1170,39 +1354,56 @@ * @w: work object. 
* @cancel: The connection will be closed anyways */ -int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +int w_send_read_req(struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); - int ok; + struct drbd_conf *mdev = w->mdev; + int err; if (unlikely(cancel)) { - req_mod(req, send_canceled); - return 1; + req_mod(req, SEND_CANCELED); + return 0; } - ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, - (unsigned long)req); + err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size, + (unsigned long)req); - if (!ok) { - /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); - * so this is probably redundant */ - if (mdev->state.conn >= C_CONNECTED) - drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); - } - req_mod(req, ok ? handed_over_to_network : send_failed); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + + return err; +} + +int w_restart_disk_io(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_conf *mdev = w->mdev; - return ok; + if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) + drbd_al_begin_io(mdev, &req->i); + + drbd_req_make_private_bio(req, req->master_bio); + req->private_bio->bi_bdev = mdev->ldev->backing_bdev; + generic_make_request(req->private_bio); + + return 0; } STATIC int _drbd_may_sync_now(struct drbd_conf *mdev) { struct drbd_conf *odev = mdev; + int resync_after; while (1) { - if (odev->sync_conf.after == -1) + if (!odev->ldev) + return 1; + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); + if (resync_after == -1) + return 1; + odev = minor_to_mdev(resync_after); + if (!expect(odev)) return 1; - odev = minor_to_mdev(odev->sync_conf.after); - ERR_IF(!odev) return 1; if ((odev->state.conn >= C_SYNC_SOURCE && odev->state.conn <= C_PAUSED_SYNC_T) || 
odev->state.aftr_isp || odev->state.peer_isp || @@ -1222,16 +1423,15 @@ struct drbd_conf *odev; int i, rv = 0; - for (i = 0; i < minor_count; i++) { - odev = minor_to_mdev(i); - if (!odev) - continue; + rcu_read_lock(); + idr_for_each_entry(&minors, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (!_drbd_may_sync_now(odev)) rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) != SS_NOTHING_TO_DO); } + rcu_read_unlock(); return rv; } @@ -1247,10 +1447,8 @@ struct drbd_conf *odev; int i, rv = 0; - for (i = 0; i < minor_count; i++) { - odev = minor_to_mdev(i); - if (!odev) - continue; + rcu_read_lock(); + idr_for_each_entry(&minors, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (odev->state.aftr_isp) { @@ -1260,6 +1458,7 @@ != SS_NOTHING_TO_DO) ; } } + rcu_read_unlock(); return rv; } @@ -1277,46 +1476,86 @@ write_unlock_irq(&global_state_lock); } -static int sync_after_error(struct drbd_conf *mdev, int o_minor) +/* caller must hold global_state_lock */ +enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) { struct drbd_conf *odev; + int resync_after; if (o_minor == -1) return NO_ERROR; if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) - return ERR_SYNC_AFTER; + return ERR_RESYNC_AFTER; /* check for loops */ odev = minor_to_mdev(o_minor); while (1) { if (odev == mdev) - return ERR_SYNC_AFTER_CYCLE; + return ERR_RESYNC_AFTER_CYCLE; + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); /* dependency chain ends here, no cycles. 
*/ - if (odev->sync_conf.after == -1) + if (resync_after == -1) return NO_ERROR; /* follow the dependency chain */ - odev = minor_to_mdev(odev->sync_conf.after); + odev = minor_to_mdev(resync_after); } } -int drbd_alter_sa(struct drbd_conf *mdev, int na) +/* caller must hold global_state_lock */ +void drbd_resync_after_changed(struct drbd_conf *mdev) { int changes; - int retcode; - write_lock_irq(&global_state_lock); - retcode = sync_after_error(mdev, na); - if (retcode == NO_ERROR) { - mdev->sync_conf.after = na; - do { - changes = _drbd_pause_after(mdev); - changes |= _drbd_resume_next(mdev); - } while (changes); + do { + changes = _drbd_pause_after(mdev); + changes |= _drbd_resume_next(mdev); + } while (changes); +} + +void drbd_rs_controller_reset(struct drbd_conf *mdev) +{ + struct fifo_buffer *plan; + + atomic_set(&mdev->rs_sect_in, 0); + atomic_set(&mdev->rs_sect_ev, 0); + mdev->rs_in_flight = 0; + + /* Updating the RCU protected object in place is necessary since + this function gets called from atomic context. 
+ It is valid since all other updates also lead to an completely + empty fifo */ + rcu_read_lock(); + plan = rcu_dereference(mdev->rs_plan_s); + plan->total = 0; + fifo_set(plan, 0); + rcu_read_unlock(); +} + +void start_resync_timer_fn(unsigned long data) +{ + struct drbd_conf *mdev = (struct drbd_conf *) data; + + drbd_queue_work(&mdev->tconn->data.work, &mdev->start_resync_work); +} + +int w_start_resync(struct drbd_work *w, int cancel) +{ + struct drbd_conf *mdev = w->mdev; + + if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) { + dev_warn(DEV, "w_start_resync later...\n"); + mdev->start_resync_timer.expires = jiffies + HZ/10; + add_timer(&mdev->start_resync_timer); + return 0; } - write_unlock_irq(&global_state_lock); - return retcode; + + drbd_start_resync(mdev, C_SYNC_SOURCE); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); + return 0; } /** @@ -1332,52 +1571,71 @@ union drbd_state ns; int r; - if (mdev->state.conn >= C_SYNC_SOURCE) { + if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) { dev_err(DEV, "Resync already running!\n"); return; } - trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n", - side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource"); - - /* In case a previous resync run was aborted by an IO error/detach on the peer. */ - drbd_rs_cancel_all(mdev); + if (mdev->state.conn < C_AHEAD) { + /* In case a previous resync run was aborted by an IO error/detach on the peer. */ + drbd_rs_cancel_all(mdev); + /* This should be done when we abort the resync. We definitely do not + want to have this for connections going back and forth between + Ahead/Behind and SyncSource/SyncTarget */ + } + + if (!test_bit(B_RS_H_DONE, &mdev->flags)) { + if (side == C_SYNC_TARGET) { + /* Since application IO was locked out during C_WF_BITMAP_T and + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET + we check that we might make the data inconsistent. 
*/ + r = drbd_khelper(mdev, "before-resync-target"); + r = (r >> 8) & 0xff; + if (r > 0) { + dev_info(DEV, "before-resync-target handler returned %d, " + "dropping connection.\n", r); + conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } else /* C_SYNC_SOURCE */ { + r = drbd_khelper(mdev, "before-resync-source"); + r = (r >> 8) & 0xff; + if (r > 0) { + if (r == 3) { + dev_info(DEV, "before-resync-source handler returned %d, " + "ignoring. Old userland tools?", r); + } else { + dev_info(DEV, "before-resync-source handler returned %d, " + "dropping connection.\n", r); + conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } + } + } - if (side == C_SYNC_TARGET) { - /* Since application IO was locked out during C_WF_BITMAP_T and - C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET - we check that we might make the data inconsistent. */ - r = drbd_khelper(mdev, "before-resync-target"); - r = (r >> 8) & 0xff; - if (r > 0) { - dev_info(DEV, "before-resync-target handler returned %d, " - "dropping connection.\n", r); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + if (current == mdev->tconn->worker.task) { + /* The worker should not sleep waiting for state_mutex, + that can take long */ + if (!mutex_trylock(mdev->state_mutex)) { + set_bit(B_RS_H_DONE, &mdev->flags); + mdev->start_resync_timer.expires = jiffies + HZ/5; + add_timer(&mdev->start_resync_timer); return; } + } else { + mutex_lock(mdev->state_mutex); } + clear_bit(B_RS_H_DONE, &mdev->flags); - drbd_state_lock(mdev); - + write_lock_irq(&global_state_lock); if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { - drbd_state_unlock(mdev); + write_unlock_irq(&global_state_lock); + mutex_unlock(mdev->state_mutex); return; } - if (side == C_SYNC_TARGET) { - mdev->bm_resync_fo = 0; - } else /* side == C_SYNC_SOURCE */ { - u64 uuid; - - get_random_bytes(&uuid, sizeof(u64)); - drbd_uuid_set(mdev, UI_BITMAP, uuid); - 
drbd_send_sync_uuid(mdev, uuid); - - D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - } - - write_lock_irq(&global_state_lock); - ns = mdev->state; + ns = drbd_read_state(mdev); ns.aftr_isp = !_drbd_may_sync_now(mdev); @@ -1388,42 +1646,75 @@ else /* side == C_SYNC_SOURCE */ ns.pdsk = D_INCONSISTENT; - DRBD_STATE_DEBUG_INIT_VAL(ns); r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); - ns = mdev->state; + ns = drbd_read_state(mdev); if (ns.conn < C_CONNECTED) r = SS_UNKNOWN_ERROR; if (r == SS_SUCCESS) { - mdev->rs_total = - mdev->rs_mark_left = drbd_bm_total_weight(mdev); + unsigned long tw = drbd_bm_total_weight(mdev); + unsigned long now = jiffies; + int i; + mdev->rs_failed = 0; mdev->rs_paused = 0; - mdev->rs_start = - mdev->rs_mark_time = jiffies; mdev->rs_same_csum = 0; + mdev->rs_last_events = 0; + mdev->rs_last_sect_ev = 0; + mdev->rs_total = tw; + mdev->rs_start = now; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + mdev->rs_mark_left[i] = tw; + mdev->rs_mark_time[i] = now; + } _drbd_pause_after(mdev); } write_unlock_irq(&global_state_lock); - drbd_state_unlock(mdev); - put_ldev(mdev); if (r == SS_SUCCESS) { dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", drbd_conn_str(ns.conn), (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), (unsigned long) mdev->rs_total); + if (side == C_SYNC_TARGET) + mdev->bm_resync_fo = 0; - if (mdev->rs_total == 0) { - /* Peer still reachable? Beware of failing before-resync-target handlers! */ - request_ping(mdev); - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */ + /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid + * with w_send_oos, or the sync target will get confused as to + * how much bits to resync. We cannot do that always, because for an + * empty resync and protocol < 95, we need to do it here, as we call + * drbd_resync_finished from here in that case. 
+ * We drbd_gen_and_send_sync_uuid here for protocol < 96, + * and from after_state_ch otherwise. */ + if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96) + drbd_gen_and_send_sync_uuid(mdev); + + if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) { + /* This still has a race (about when exactly the peers + * detect connection loss) that can lead to a full sync + * on next handshake. In 8.3.9 we fixed this with explicit + * resync-finished notifications, but the fix + * introduces a protocol change. Sleeping for some + * time longer than the ping interval + timeout on the + * SyncSource, to give the SyncTarget the chance to + * detect connection loss, then waiting for a ping + * response (implicit in drbd_resync_finished) reduces + * the race considerably, but does not solve it. */ + if (side == C_SYNC_SOURCE) { + struct net_conf *nc; + int timeo; + + rcu_read_lock(); + nc = rcu_dereference(mdev->tconn->net_conf); + timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + } drbd_resync_finished(mdev); - return; } + drbd_rs_controller_reset(mdev); /* ns.conn may already be != mdev->state.conn, * we may have been paused in between, or become paused until * the timer triggers. 
@@ -1433,51 +1724,61 @@ drbd_md_sync(mdev); } + put_ldev(mdev); + mutex_unlock(mdev->state_mutex); } int drbd_worker(struct drbd_thread *thi) { - struct drbd_conf *mdev = thi->mdev; + struct drbd_tconn *tconn = thi->tconn; struct drbd_work *w = NULL; + struct drbd_conf *mdev; + struct net_conf *nc; LIST_HEAD(work_list); - int intr = 0, i; + int vnr, intr = 0; + int cork; - sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); - while (get_t_state(thi) == Running) { - drbd_thread_current_set_cpu(mdev); + if (down_trylock(&tconn->data.work.s)) { + mutex_lock(&tconn->data.mutex); - if (down_trylock(&mdev->data.work.s)) { - mutex_lock(&mdev->data.mutex); - if (mdev->data.socket && !mdev->net_conf->no_cork) - drbd_tcp_uncork(mdev->data.socket); - mutex_unlock(&mdev->data.mutex); - - intr = down_interruptible(&mdev->data.work.s); - - mutex_lock(&mdev->data.mutex); - if (mdev->data.socket && !mdev->net_conf->no_cork) - drbd_tcp_cork(mdev->data.socket); - mutex_unlock(&mdev->data.mutex); + rcu_read_lock(); + nc = rcu_dereference(tconn->net_conf); + cork = nc ? nc->tcp_cork : 0; + rcu_read_unlock(); + + if (tconn->data.socket && cork) + drbd_tcp_uncork(tconn->data.socket); + mutex_unlock(&tconn->data.mutex); + + intr = down_interruptible(&tconn->data.work.s); + + mutex_lock(&tconn->data.mutex); + if (tconn->data.socket && cork) + drbd_tcp_cork(tconn->data.socket); + mutex_unlock(&tconn->data.mutex); } if (intr) { - D_ASSERT(intr == -EINTR); flush_signals(current); - ERR_IF (get_t_state(thi) == Running) + if (get_t_state(thi) == RUNNING) { + conn_warn(tconn, "Worker got an unexpected signal\n"); continue; + } break; } - if (get_t_state(thi) != Running) + if (get_t_state(thi) != RUNNING) break; /* With this break, we have done a down() but not consumed the entry from the list. The cleanup code takes care of this... 
*/ w = NULL; - spin_lock_irq(&mdev->data.work.q_lock); - ERR_IF(list_empty(&mdev->data.work.q)) { + spin_lock_irq(&tconn->data.work.q_lock); + if (list_empty(&tconn->data.work.q)) { /* something terribly wrong in our logic. * we were able to down() the semaphore, * but the list is empty... doh. @@ -1489,57 +1790,52 @@ * * I'll try to get away just starting over this loop. */ - spin_unlock_irq(&mdev->data.work.q_lock); + conn_warn(tconn, "Work list unexpectedly empty\n"); + spin_unlock_irq(&tconn->data.work.q_lock); continue; } - w = list_entry(mdev->data.work.q.next, struct drbd_work, list); + w = list_entry(tconn->data.work.q.next, struct drbd_work, list); list_del_init(&w->list); - spin_unlock_irq(&mdev->data.work.q_lock); + spin_unlock_irq(&tconn->data.work.q_lock); - if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { + if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS)) { /* dev_warn(DEV, "worker: a callback failed! \n"); */ - if (mdev->state.conn >= C_CONNECTED) - drbd_force_state(mdev, - NS(conn, C_NETWORK_FAILURE)); + if (tconn->cstate >= C_WF_REPORT_PARAMS) + conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD); } } - D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); - D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); - - spin_lock_irq(&mdev->data.work.q_lock); - i = 0; - while (!list_empty(&mdev->data.work.q)) { - list_splice_init(&mdev->data.work.q, &work_list); - spin_unlock_irq(&mdev->data.work.q_lock); + + spin_lock_irq(&tconn->data.work.q_lock); + while (!list_empty(&tconn->data.work.q)) { + list_splice_init(&tconn->data.work.q, &work_list); + spin_unlock_irq(&tconn->data.work.q_lock); while (!list_empty(&work_list)) { w = list_entry(work_list.next, struct drbd_work, list); list_del_init(&w->list); - w->cb(mdev, w, 1); - i++; /* dead debugging code */ + w->cb(w, 1); } - spin_lock_irq(&mdev->data.work.q_lock); + spin_lock_irq(&tconn->data.work.q_lock); } - sema_init(&mdev->data.work.s, 0); + sema_init(&tconn->data.work.s, 0); /* DANGEROUS 
race: if someone did queue his work within the spinlock, * but up() ed outside the spinlock, we could get an up() on the * semaphore without corresponding list entry. * So don't do that. */ - spin_unlock_irq(&mdev->data.work.q_lock); + spin_unlock_irq(&tconn->data.work.q_lock); - D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); - /* _drbd_set_state only uses stop_nowait. - * wait here for the Exiting receiver. */ - drbd_thread_stop(&mdev->receiver); - drbd_mdev_cleanup(mdev); - - dev_info(DEV, "worker terminated\n"); - - clear_bit(DEVICE_DYING, &mdev->flags); - clear_bit(CONFIG_PENDING, &mdev->flags); - wake_up(&mdev->state_wait); + rcu_read_lock(); + idr_for_each_entry(&tconn->volumes, mdev, vnr) { + D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); + kref_get(&mdev->kref); + rcu_read_unlock(); + drbd_mdev_cleanup(mdev); + kref_put(&mdev->kref, &drbd_minor_destroy); + rcu_read_lock(); + } + rcu_read_unlock(); return 0; } diff -Nru drbd8-8.3.7/drbd/drbd_wrappers.h drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_wrappers.h --- drbd8-8.3.7/drbd/drbd_wrappers.h 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/drbd_wrappers.h 2012-02-02 14:09:14.000000000 +0000 @@ -5,40 +5,38 @@ #include #include -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -# error "use a 2.6 kernel, please" +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) +# error "At least kernel version 2.6.18 (with patches) required" #endif -#include -#ifndef bio_rw_flagged -#define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) +/* The history of blkdev_issue_flush() + + It had 2 arguments before fbd9b09a177a481eda256447c881f014f29034fe, + after it had 4 arguments. (With that commit came BLKDEV_IFL_WAIT) + + It had 4 arguments before dd3932eddf428571762596e17b65f5dc92ca361b, + after it got 3 arguments. (With that commit came BLKDEV_DISCARD_SECURE + and BLKDEV_IFL_WAIT disappeared again.) 
*/ +#include +#ifndef BLKDEV_IFL_WAIT +#ifndef BLKDEV_DISCARD_SECURE +/* before fbd9b09a177 */ +#define blkdev_issue_flush(b, gfpf, s) blkdev_issue_flush(b, s) +#endif +/* after dd3932eddf4 no define at all */ +#else +/* between fbd9b09a177 and dd3932eddf4 */ +#define blkdev_issue_flush(b, gfpf, s) blkdev_issue_flush(b, gfpf, s, BLKDEV_IFL_WAIT) #endif +#include +#include #include +#include /* for the proc_create wrapper */ #include -/* struct page has a union in 2.6.15 ... - * an anonymous union and struct since 2.6.16 - * or in fc5 "2.6.15" */ -#include -#ifndef page_private -# define page_private(page) ((page)->private) -# define set_page_private(page, v) ((page)->private = (v)) -#endif - -/* mutex was not available before 2.6.16. - * various vendors provide various degrees of backports. - * we provide the missing parts ourselves, if neccessary. - * this one is for RHEL/Centos 4 */ -#if defined(mutex_lock) && !defined(mutex_is_locked) -#define mutex_is_locked(m) (atomic_read(&(m)->count) != 1) -#endif - -/* see get_sb_bdev and bd_claim */ -extern char *drbd_sec_holder; - #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) static inline unsigned short queue_logical_block_size(struct request_queue *q) { @@ -53,9 +51,9 @@ return queue_logical_block_size(bdev_get_queue(bdev)); } -static inline unsigned int queue_max_segment_size(struct request_queue *q) +static inline unsigned int queue_max_hw_sectors(struct request_queue *q) { - return q->max_segment_size; + return q->max_hw_sectors; } static inline unsigned int queue_max_sectors(struct request_queue *q) @@ -73,9 +71,21 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) { /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ - return bdev ? bdev->bd_inode->i_size >> 9 : 0; + return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0; } +#ifdef COMPAT_HAVE_VOID_MAKE_REQUEST +/* in Commit 5a7bbad27a410350e64a2d7f5ec18fc73836c14f (between Linux-3.1 and 3.2) + make_request() becomes type void. 
Before it had type int. */ +#define MAKE_REQUEST_TYPE void +#define MAKE_REQUEST_RETURN return +#else +#define MAKE_REQUEST_TYPE int +#define MAKE_REQUEST_RETURN return 0 +#endif + +#include "drbd_int.h" + /* sets the number of 512 byte sectors of our virtual device */ static inline void drbd_set_my_capacity(struct drbd_conf *mdev, sector_t size) @@ -85,20 +95,58 @@ mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; } -#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) +#ifndef COMPAT_HAVE_FMODE_T +typedef unsigned __bitwise__ fmode_t; +#endif -static inline int drbd_bio_has_active_page(struct bio *bio) +#ifndef COMPAT_HAVE_BLKDEV_GET_BY_PATH +/* see kernel 2.6.37, + * d4d7762 block: clean up blkdev_get() wrappers and their users + * e525fd8 block: make blkdev_get/put() handle exclusive access + * and kernel 2.6.28 + * 30c40d2 [PATCH] propagate mode through open_bdev_excl/close_bdev_excl + * Also note that there is no FMODE_EXCL before + * 86d434d [PATCH] eliminate use of ->f_flags in block methods + */ +#ifndef COMPAT_HAVE_OPEN_BDEV_EXCLUSIVE +#ifndef FMODE_EXCL +#define FMODE_EXCL 0 +#endif +static inline +struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) +{ + /* drbd does not open readonly, but try to be correct, anyways */ + return open_bdev_excl(path, (mode & FMODE_WRITE) ? 0 : MS_RDONLY, holder); +} +static inline +void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) +{ + /* mode ignored. 
*/ + close_bdev_excl(bdev); +} +#endif +static inline struct block_device *blkdev_get_by_path(const char *path, + fmode_t mode, void *holder) { - struct bio_vec *bvec; - int i; + return open_bdev_exclusive(path, mode, holder); +} - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } +static inline int drbd_blkdev_put(struct block_device *bdev, fmode_t mode) +{ + /* blkdev_put != close_bdev_exclusive, in general, so this is obviously + * not correct, and there should be some if (mode & FMODE_EXCL) ... + * But this is the only way it is used in DRBD, + * and for <= 2.6.27, there is no FMODE_EXCL anyways. */ + close_bdev_exclusive(bdev, mode); + /* blkdev_put seems to not have useful return values, + * close_bdev_exclusive is void. */ return 0; } +#define blkdev_put(b, m) drbd_blkdev_put(b, m) +#endif + +#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) /* Before Linux-2.6.24 bie_endio() had the size of the bio as second argument. 
@@ -117,9 +165,8 @@ /* bi_end_io handlers */ extern BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error); -extern BIO_ENDIO_TYPE drbd_endio_read_sec BIO_ENDIO_ARGS(struct bio *bio, int error); -extern BIO_ENDIO_TYPE drbd_endio_write_sec BIO_ENDIO_ARGS(struct bio *bio, int error); -extern BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error); +extern BIO_ENDIO_TYPE drbd_peer_request_endio BIO_ENDIO_ARGS(struct bio *bio, int error); +extern BIO_ENDIO_TYPE drbd_request_endio BIO_ENDIO_ARGS(struct bio *bio, int error); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) #define part_inc_in_flight(A, B) part_inc_in_flight(A) @@ -148,14 +195,6 @@ #define sg_init_table(S,N) ({}) -#ifdef NEED_SG_SET_BUF -static inline void sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) -{ - sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); -} -#endif - #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28) @@ -178,17 +217,11 @@ #endif static inline void drbd_kobject_uevent(struct drbd_conf *mdev) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,15) - kobject_uevent(disk_to_kobj(mdev->vdisk), KOBJ_CHANGE, NULL); -#else kobject_uevent(disk_to_kobj(mdev->vdisk), KOBJ_CHANGE); /* rhel4 / sles9 and older don't have this at all, * which means user space (udev) won't get events about possible changes of * corresponding resource + disk names after the initial drbd minor creation. 
*/ -#endif -#endif } @@ -208,39 +241,28 @@ return; } - if (FAULT_ACTIVE(mdev, fault_type)) + if (drbd_insert_fault(mdev, fault_type)) bio_endio(bio, -EIO); else generic_make_request(bio); } -static inline void drbd_plug_device(struct drbd_conf *mdev) +static inline int drbd_backing_bdev_events(struct drbd_conf *mdev) { - struct request_queue *q; - q = bdev_get_queue(mdev->this_bdev); - - spin_lock_irq(q->queue_lock); - -/* XXX the check on !blk_queue_plugged is redundant, - * implicitly checked in blk_plug_device */ - - if (!blk_queue_plugged(q)) { - blk_plug_device(q); - del_timer(&q->unplug_timer); - /* unplugging should not happen automatically... */ - } - spin_unlock_irq(q->queue_lock); -} - -#ifdef DEFINE_SOCK_CREATE_KERN -#define sock_create_kern sock_create -#endif - -#ifdef USE_KMEM_CACHE_S -#define kmem_cache kmem_cache_s + struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk; +#if defined(__disk_stat_inc) + /* older kernel */ + return (int)disk_stat_read(disk, sectors[0]) + + (int)disk_stat_read(disk, sectors[1]); +#else + /* recent kernel */ + return (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]); #endif +} -#ifdef DEFINE_KERNEL_SOCK_SHUTDOWN +#ifndef COMPAT_HAVE_SOCK_SHUTDOWN +#define COMPAT_HAVE_SOCK_SHUTDOWN 1 enum sock_shutdown_cmd { SHUT_RD = 0, SHUT_WR = 1, @@ -263,72 +285,6 @@ #define drbd_unregister_blkdev unregister_blkdev #endif -#ifdef NEED_BACKPORT_OF_ATOMIC_ADD - -#if defined(__x86_64__) - -static __inline__ int atomic_add_return(int i, atomic_t *v) -{ - int __i = i; - __asm__ __volatile__( - LOCK_PREFIX "xaddl %0, %1;" - :"=r"(i) - :"m"(v->counter), "0"(i)); - return i + __i; -} - -static __inline__ int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -#define atomic_inc_return(v) (atomic_add_return(1,v)) -#define atomic_dec_return(v) (atomic_sub_return(1,v)) - -#elif defined(__i386__) || defined(__arch_um__) - -static __inline__ int 
atomic_add_return(int i, atomic_t *v) -{ - int __i; -#ifdef CONFIG_M386 - unsigned long flags; - if(unlikely(boot_cpu_data.x86==3)) - goto no_xadd; -#endif - /* Modern 486+ processor */ - __i = i; - __asm__ __volatile__( - LOCK_PREFIX "xaddl %0, %1;" - :"=r"(i) - :"m"(v->counter), "0"(i)); - return i + __i; - -#ifdef CONFIG_M386 -no_xadd: /* Legacy 386 processor */ - local_irq_save(flags); - __i = atomic_read(v); - atomic_set(v, i + __i); - local_irq_restore(flags); - return i + __i; -#endif -} - -static __inline__ int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -#define atomic_inc_return(v) (atomic_add_return(1,v)) -#define atomic_dec_return(v) (atomic_sub_return(1,v)) - -#else -# error "You need to copy/past atomic_inc_return()/atomic_dec_return() here" -# error "for your architecture. (Hint: Kernels after 2.6.10 have those" -# error "by default! Using a later kernel might be less effort!)" -#endif - -#endif - #if !defined(CRYPTO_ALG_ASYNC) /* With Linux-2.6.19 the crypto API changed! 
*/ /* This is not a generic backport of the new api, it just implements @@ -449,33 +405,12 @@ #endif -static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) -{ -#ifdef CRYPTO_ALG_TYPE_HASH_MASK - /* see include/linux/crypto.h */ - return !((crypto_tfm_alg_type(tfm) ^ CRYPTO_ALG_TYPE_HASH) - & CRYPTO_ALG_TYPE_HASH_MASK); -#else - return crypto_tfm_alg_type(tfm) == CRYPTO_ALG_TYPE_HASH; -#endif -} - - -#ifdef NEED_BACKPORT_OF_KZALLOC -static inline void *kzalloc(size_t size, int flags) -{ - void *rv = kmalloc(size, flags); - if (rv) - memset(rv, 0, size); - - return rv; -} -#endif - /* see upstream commit 2d3854a37e8b767a51aba38ed6d22817b0631e33 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) #ifndef cpumask_bits +#ifndef COMPAT_HAVE_NR_CPU_IDS #define nr_cpu_ids NR_CPUS +#endif #define nr_cpumask_bits nr_cpu_ids typedef cpumask_t cpumask_var_t[1]; @@ -623,19 +558,6 @@ # define __cond_lock(x,c) (c) #endif -#ifndef KERNEL_HAS_GFP_T -#define KERNEL_HAS_GFP_T -typedef unsigned gfp_t; -#endif - - -/* struct kvec didn't exist before 2.6.8, this is an ugly - * #define to work around it ... - jt */ - -#ifndef KERNEL_HAS_KVEC -#define kvec iovec -#endif - #ifndef net_random #define random32 net_random #endif @@ -651,43 +573,530 @@ * this "backport" does not close the race that lead to the API change, * but only provides an equivalent function call. 
*/ -#ifndef KERNEL_HAS_PROC_CREATE -static inline struct proc_dir_entry *proc_create(const char *name, +#ifndef COMPAT_HAVE_PROC_CREATE_DATA +static inline struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, struct proc_dir_entry *parent, - struct file_operations *proc_fops) + struct file_operations *proc_fops, void *data) { struct proc_dir_entry *pde = create_proc_entry(name, mode, parent); - if (pde) + if (pde) { pde->proc_fops = proc_fops; + pde->data = data; + } return pde; } #endif +#ifndef COMPAT_HAVE_BLK_QUEUE_MAX_HW_SECTORS +static inline void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max) +{ + blk_queue_max_sectors(q, max); +} +#elif defined(COMPAT_USE_BLK_QUEUE_MAX_SECTORS_ANYWAYS) + /* For kernel versions 2.6.31 to 2.6.33 inclusive, even though + * blk_queue_max_hw_sectors is present, we actually need to use + * blk_queue_max_sectors to set max_hw_sectors. :-( + * RHEL6 2.6.32 chose to be different and already has eliminated + * blk_queue_max_sectors as upstream 2.6.34 did. + */ +#define blk_queue_max_hw_sectors(q, max) blk_queue_max_sectors(q, max) +#endif + +#ifndef COMPAT_HAVE_BLK_QUEUE_MAX_SEGMENTS +static inline void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) +{ + blk_queue_max_phys_segments(q, max_segments); + blk_queue_max_hw_segments(q, max_segments); +#define BLK_MAX_SEGMENTS MAX_HW_SEGMENTS /* or max MAX_PHYS_SEGMENTS. Probably does not matter */ +} +#endif + +#ifndef COMPAT_HAVE_BOOL_TYPE +typedef _Bool bool; +enum { + false = 0, + true = 1 +}; +#endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) -#define TP_PROTO(args...) args -#define TP_ARGS(args...) 
args +/* REQ_* and BIO_RW_* flags have been moved around in the tree, + * and have finally been "merged" with + * 7b6d91daee5cac6402186ff224c3af39d79f4a0e and + * 7cc015811ef8992dfcce314d0ed9642bc18143d1 + * We communicate between different systems, + * so we have to somehow semantically map the bi_rw flags + * bi_rw (some kernel version) -> data packet flags -> bi_rw (other kernel version) + */ + +/* RHEL 6.1 backported FLUSH/FUA as BIO_RW_FLUSH/FUA + * and at that time also introduced the defines BIO_FLUSH/FUA. + * There is also REQ_FLUSH/FUA, but these do NOT share + * the same value space as the bio rw flags, yet. + */ +#ifdef BIO_FLUSH + +#define DRBD_REQ_FLUSH (1UL << BIO_RW_FLUSH) +#define DRBD_REQ_FUA (1UL << BIO_RW_FUA) +#define DRBD_REQ_HARDBARRIER (1UL << BIO_RW_BARRIER) +#define DRBD_REQ_DISCARD (1UL << BIO_RW_DISCARD) +#define DRBD_REQ_SYNC (1UL << BIO_RW_SYNCIO) +#define DRBD_REQ_UNPLUG (1UL << BIO_RW_UNPLUG) + +#elif defined(REQ_FLUSH) /* introduced in 2.6.36, + * now equivalent to bi_rw */ + +#define DRBD_REQ_SYNC REQ_SYNC +#define DRBD_REQ_UNPLUG REQ_UNPLUG +#define DRBD_REQ_FLUSH REQ_FLUSH +#define DRBD_REQ_FUA REQ_FUA +#define DRBD_REQ_DISCARD REQ_DISCARD +/* REQ_HARDBARRIER has been around for a long time, + * without being directly related to bi_rw. + * so the ifdef is only usful inside the ifdef REQ_FLUSH! + * commit 7cc0158 (v2.6.36-rc1) made it a bi_rw flag, ... */ +#ifdef REQ_HARDBARRIER +#define DRBD_REQ_HARDBARRIER REQ_HARDBARRIER +#else +/* ... but REQ_HARDBARRIER was removed again in 02e031c (v2.6.37-rc4). 
*/ +#define DRBD_REQ_HARDBARRIER 0 +#endif + +#else /* "older", and hopefully not + * "partially backported" kernel */ + +#if defined(BIO_RW_SYNC) +/* see upstream commits + * 213d9417fec62ef4c3675621b9364a667954d4dd, + * 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 + * later, the defines even became an enum ;-) */ +#define DRBD_REQ_SYNC (1UL << BIO_RW_SYNC) +#define DRBD_REQ_UNPLUG (1UL << BIO_RW_SYNC) +#else +/* cannot test on defined(BIO_RW_SYNCIO), it may be an enum */ +#define DRBD_REQ_SYNC (1UL << BIO_RW_SYNCIO) +#define DRBD_REQ_UNPLUG (1UL << BIO_RW_UNPLUG) +#endif + +#define DRBD_REQ_FLUSH (1UL << BIO_RW_BARRIER) +/* REQ_FUA has been around for a longer time, + * without a direct equivalent in bi_rw. */ +#define DRBD_REQ_FUA (1UL << BIO_RW_BARRIER) +#define DRBD_REQ_HARDBARRIER (1UL << BIO_RW_BARRIER) + +/* we don't support DISCARDS yet, anyways. + * cannot test on defined(BIO_RW_DISCARD), it may be an enum */ +#define DRBD_REQ_DISCARD 0 +#endif + +/* this results in: + bi_rw -> dp_flags + +< 2.6.28 + SYNC -> SYNC|UNPLUG + BARRIER -> FUA|FLUSH + there is no DISCARD +2.6.28 + SYNC -> SYNC|UNPLUG + BARRIER -> FUA|FLUSH + DISCARD -> DISCARD +2.6.29 + SYNCIO -> SYNC + UNPLUG -> UNPLUG + BARRIER -> FUA|FLUSH + DISCARD -> DISCARD +2.6.36 + SYNC -> SYNC + UNPLUG -> UNPLUG + FUA -> FUA + FLUSH -> FLUSH + DISCARD -> DISCARD +-------------------------------------- + dp_flags -> bi_rw +< 2.6.28 + SYNC -> SYNC (and unplug) + UNPLUG -> SYNC (and unplug) + FUA -> BARRIER + FLUSH -> BARRIER + there is no DISCARD, + it will be silently ignored on the receiving side. 
+2.6.28 + SYNC -> SYNC (and unplug) + UNPLUG -> SYNC (and unplug) + FUA -> BARRIER + FLUSH -> BARRIER + DISCARD -> DISCARD + (if that fails, we handle it like any other IO error) +2.6.29 + SYNC -> SYNCIO + UNPLUG -> UNPLUG + FUA -> BARRIER + FLUSH -> BARRIER + DISCARD -> DISCARD +2.6.36 + SYNC -> SYNC + UNPLUG -> UNPLUG + FUA -> FUA + FLUSH -> FLUSH + DISCARD -> DISCARD + +NOTE: DISCARDs likely need some work still. We should actually never see +DISCARD requests, as our queue does not announce QUEUE_FLAG_DISCARD yet. +*/ + +#ifndef CONFIG_DYNAMIC_DEBUG +/* At least in 2.6.34 the function macro dynamic_dev_dbg() is broken when compiling + without CONFIG_DYNAMIC_DEBUG. It has 'format' in the argument list, it references + to 'fmt' in its body. */ +#ifdef dynamic_dev_dbg +#undef dynamic_dev_dbg +#define dynamic_dev_dbg(dev, fmt, ...) \ + do { if (0) dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); } while (0) +#endif +#endif + +#ifndef min_not_zero +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) +#endif + +/* Introduced with 2.6.26. 
See include/linux/jiffies.h */ +#ifndef time_is_before_eq_jiffies +#define time_is_before_jiffies(a) time_after(jiffies, a) +#define time_is_after_jiffies(a) time_before(jiffies, a) +#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a) +#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a) +#endif + +#ifdef COMPAT_BIO_SPLIT_HAS_BIO_SPLIT_POOL_PARAMETER +#define bio_split(bi, first_sectors) bio_split(bi, bio_split_pool, first_sectors) +#endif + +#ifndef COMPAT_HAVE_BIOSET_CREATE_FRONT_PAD +/* see comments in compat/tests/have_bioset_create_front_pad.c */ +#ifdef COMPAT_BIOSET_CREATE_HAS_THREE_PARAMETERS +#define bioset_create(pool_size, front_pad) bioset_create(pool_size, pool_size, 1) +#else +#define bioset_create(pool_size, front_pad) bioset_create(pool_size, 1) +#endif +#endif + + +#if !(defined(COMPAT_HAVE_RB_AUGMENT_FUNCTIONS) && \ + defined(AUGMENTED_RBTREE_SYMBOLS_EXPORTED)) + +/* + * Make sure the replacements for the augmented rbtree helper functions do not + * clash with functions the kernel implements but does not export. 
+ */ +#define rb_augment_f drbd_rb_augment_f +#define rb_augment_path drbd_rb_augment_path +#define rb_augment_insert drbd_rb_augment_insert +#define rb_augment_erase_begin drbd_rb_augment_erase_begin +#define rb_augment_erase_end drbd_rb_augment_erase_end + +typedef void (*rb_augment_f)(struct rb_node *node, void *data); + +static inline void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) +{ + struct rb_node *parent; + +up: + func(node, data); + parent = rb_parent(node); + if (!parent) + return; + + if (node == parent->rb_left && parent->rb_right) + func(parent->rb_right, data); + else if (parent->rb_left) + func(parent->rb_left, data); -#undef DECLARE_TRACE -#define DECLARE_TRACE(name, proto, args) \ - static inline void _do_trace_##name(struct tracepoint *tp, proto) \ - { } \ - static inline void trace_##name(proto) \ - { } \ - static inline int register_trace_##name(void (*probe)(proto)) \ - { \ - return -ENOSYS; \ - } \ - static inline int unregister_trace_##name(void (*probe)(proto)) \ - { \ - return -ENOSYS; \ + node = parent; + goto up; +} + +/* + * after inserting @node into the tree, update the tree to account for + * both the new entry and any damage done by rebalance + */ +static inline void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) +{ + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + + rb_augment_path(node, func, data); +} + +/* + * before removing the node, find the deepest node on the rebalance path + * that will still be there after @node gets removed + */ +static inline struct rb_node *rb_augment_erase_begin(struct rb_node *node) +{ + struct rb_node *deepest; + + if (!node->rb_right && !node->rb_left) + deepest = rb_parent(node); + else if (!node->rb_right) + deepest = node->rb_left; + else if (!node->rb_left) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right) + deepest = deepest->rb_right; + else if 
(rb_parent(deepest) != node) + deepest = rb_parent(deepest); } -#undef DEFINE_TRACE -#define DEFINE_TRACE(name) + return deepest; +} + +/* + * after removal, update the tree to account for the removed entry + * and any rebalance damage. + */ +static inline void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) +{ + if (node) + rb_augment_path(node, func, data); +} +#endif + +/* + * In commit c4945b9e (v2.6.39-rc1), the little-endian bit operations have been + * renamed to be less weird. + */ +#ifndef COMPAT_HAVE_FIND_NEXT_ZERO_BIT_LE +#define find_next_zero_bit_le(addr, size, offset) \ + generic_find_next_zero_le_bit(addr, size, offset) +#define find_next_bit_le(addr, size, offset) \ + generic_find_next_le_bit(addr, size, offset) +#define test_bit_le(nr, addr) \ + generic_test_le_bit(nr, addr) +#define __test_and_set_bit_le(nr, addr) \ + generic___test_and_set_le_bit(nr, addr) +#define __test_and_clear_bit_le(nr, addr) \ + generic___test_and_clear_le_bit(nr, addr) +#endif + +#ifndef IDR_GET_NEXT_EXPORTED +/* Body in compat/idr.c */ +extern void *idr_get_next(struct idr *idp, int *nextidp); +#endif + +/* #ifndef COMPAT_HAVE_LIST_ENTRY_RCU */ +#ifndef list_entry_rcu +#ifndef rcu_dereference_raw +/* see c26d34a rcu: Add lockdep-enabled variants of rcu_dereference() */ +#define rcu_dereference_raw(p) rcu_dereference(p) +#endif +#define list_entry_rcu(ptr, type, member) \ + ({typeof (*ptr) *__ptr = (typeof (*ptr) __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ + }) +#endif + +/* + * Introduced in 930631ed (v2.6.19-rc1). + */ +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#endif + +/* + * IS_ALIGNED() was added to in mainline commit 0c0e6195 (and + * improved in f10db627); 2.6.24-rc1. + */ +#ifndef IS_ALIGNED +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) +#endif + +/* + * NLA_TYPE_MASK and nla_type() were added to in mainline + * commit 8f4c1f9b; v2.6.24-rc1. 
Before that, none of the nlattr->nla_type + * flags had a special meaning. + */ + +#ifndef NLA_TYPE_MASK +#define NLA_TYPE_MASK ~0 + +static inline int nla_type(const struct nlattr *nla) +{ + return nla->nla_type & NLA_TYPE_MASK; +} + +#endif + +/* + * nlmsg_hdr was added to in mainline commit b529ccf2 + * (v2.6.22-rc1). + */ + +#ifndef COMPAT_HAVE_NLMSG_HDR +static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) +{ + return (struct nlmsghdr *)skb->data; +} +#endif + +/* + * genlmsg_reply() was added to in mainline commit 81878d27 + * (v2.6.20-rc2). + */ + +#ifndef COMPAT_HAVE_GENLMSG_REPLY +#include + +static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) +{ + return genlmsg_unicast(skb, info->snd_pid); +} +#endif + +/* + * genlmsg_msg_size() and genlmsg_total_size() were added to + * in mainline commit 17db952c (v2.6.19-rc1). + */ + +#ifndef COMPAT_HAVE_GENLMSG_MSG_SIZE +#include +#include + +static inline int genlmsg_msg_size(int payload) +{ + return GENL_HDRLEN + payload; +} + +static inline int genlmsg_total_size(int payload) +{ + return NLMSG_ALIGN(genlmsg_msg_size(payload)); +} +#endif + +/* + * genlmsg_new() was added to in mainline commit 3dabc715 + * (v2.6.20-rc2). + */ + +#ifndef COMPAT_HAVE_GENLMSG_NEW +#include + +static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) +{ + return nlmsg_new(genlmsg_total_size(payload), flags); +} +#endif + +/* + * genlmsg_put() was introduced in mainline commit 482a8524 (v2.6.15-rc1) and + * changed in 17c157c8 (v2.6.20-rc2). genlmsg_put_reply() was introduced in + * 17c157c8. We replace the compat_genlmsg_put() from 482a8524. 
+ */ + +#ifndef COMPAT_HAVE_GENLMSG_PUT_REPLY +#include + +static inline void *compat_genlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, + struct genl_family *family, int flags, + u8 cmd) +{ + struct nlmsghdr *nlh; + struct genlmsghdr *hdr; + + nlh = nlmsg_put(skb, pid, seq, family->id, GENL_HDRLEN + + family->hdrsize, flags); + if (nlh == NULL) + return NULL; + + hdr = nlmsg_data(nlh); + hdr->cmd = cmd; + hdr->version = family->version; + hdr->reserved = 0; + + return (char *) hdr + GENL_HDRLEN; +} + +#define genlmsg_put compat_genlmsg_put + +static inline void *genlmsg_put_reply(struct sk_buff *skb, + struct genl_info *info, + struct genl_family *family, + int flags, u8 cmd) +{ + return genlmsg_put(skb, info->snd_pid, info->snd_seq, family, + flags, cmd); +} +#endif + +/* + * compat_genlmsg_multicast() got a gfp_t parameter in mainline commit d387f6ad + * (v2.6.19-rc1). + */ + +#ifdef COMPAT_NEED_GENLMSG_MULTICAST_WRAPPER +#include + +static inline int compat_genlmsg_multicast(struct sk_buff *skb, u32 pid, + unsigned int group, gfp_t flags) +{ + return genlmsg_multicast(skb, pid, group); +} + +#define genlmsg_multicast compat_genlmsg_multicast + +#endif + +/* + * Dynamic generic netlink multicast groups were introduced in mainline commit + * 2dbba6f7 (v2.6.23-rc1). Before that, netlink had a fixed number of 32 + * multicast groups. Use an arbitrary hard-coded group number for that case. 
+ */ + +#ifndef COMPAT_HAVE_CTRL_ATTR_MCAST_GROUPS + +struct genl_multicast_group { + struct genl_family *family; /* private */ + struct list_head list; /* private */ + char name[GENL_NAMSIZ]; + u32 id; +}; + +static inline int genl_register_mc_group(struct genl_family *family, + struct genl_multicast_group *grp) +{ + grp->id = 1; + return 0; +} + +static inline void genl_unregister_mc_group(struct genl_family *family, + struct genl_multicast_group *grp) +{ +} #endif +/* pr_warning was introduced with 2.6.37 (commit 968ab183) + */ +#ifndef pr_fmt +#define pr_fmt(fmt) fmt +#endif + +#ifndef pr_warning +#define pr_warning(fmt, ...) \ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +#endif + +#ifndef COMPAT_HAVE_IS_ERR_OR_NULL +static inline long __must_check IS_ERR_OR_NULL(const void *ptr) +{ + return !ptr || IS_ERR_VALUE((unsigned long)ptr); +} +#endif #endif diff -Nru drbd8-8.3.7/drbd/linux/connector.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/connector.h --- drbd8-8.3.7/drbd/linux/connector.h 2009-07-27 08:47:42.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/connector.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,186 +0,0 @@ -/* - * connector.h - * - * 2004-2005 Copyright (c) Evgeniy Polyakov - * All rights reserved. - * - * Modified by Philipp Reiser to work on older 2.6.x kernels. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef __CONNECTOR_H -#define __CONNECTOR_H -#define DRBD_CONNECTOR_BACKPORT_HEADER - -#include - -#define NETLINK_CONNECTOR 11 - -#define CN_IDX_CONNECTOR 0xffffffff -#define CN_VAL_CONNECTOR 0xffffffff - -/* - * Process Events connector unique ids -- used for message routing - */ -#define CN_IDX_PROC 0x1 -#define CN_VAL_PROC 0x1 -#define CN_IDX_CIFS 0x2 -#define CN_VAL_CIFS 0x1 - -#define CN_NETLINK_USERS 1 - -/* - * Maximum connector's message size. - */ -#define CONNECTOR_MAX_MSG_SIZE 1024 - -/* - * idx and val are unique identifiers which - * are used for message routing and - * must be registered in connector.h for in-kernel usage. - */ - -struct cb_id { - __u32 idx; - __u32 val; -}; - -struct cn_msg { - struct cb_id id; - - __u32 seq; - __u32 ack; - - __u16 len; /* Length of the following data */ - __u16 flags; - __u8 data[0]; -}; - -/* - * Notify structure - requests notification about - * registering/unregistering idx/val in range [first, first+range]. - */ -struct cn_notify_req { - __u32 first; - __u32 range; -}; - -/* - * Main notification control message - * *_notify_num - number of appropriate cn_notify_req structures after - * this struct. - * group - notification receiver's idx. - * len - total length of the attached data. 
- */ -struct cn_ctl_msg { - __u32 idx_notify_num; - __u32 val_notify_num; - __u32 group; - __u32 len; - __u8 data[0]; -}; - -#ifdef __KERNEL__ -#include - -#ifndef KERNEL_HAS_GFP_T -#define KERNEL_HAS_GFP_T -typedef unsigned gfp_t; -#endif - -#include - -#include -#include - -#include - -#define CN_CBQ_NAMELEN 32 - -struct cn_queue_dev { - atomic_t refcnt; - unsigned char name[CN_CBQ_NAMELEN]; - - struct workqueue_struct *cn_queue; - - struct list_head queue_list; - spinlock_t queue_lock; - - int netlink_groups; - struct sock *nls; -}; - -struct cn_callback_id { - unsigned char name[CN_CBQ_NAMELEN]; - struct cb_id id; -}; - -struct cn_callback_data { - void (*destruct_data) (void *); - void *ddata; - - void *callback_priv; - void (*callback) (void *); - - void *free; -}; - -struct cn_callback_entry { - struct list_head callback_entry; - struct cn_callback *cb; - struct work_struct work; - struct cn_queue_dev *pdev; - - struct cn_callback_id id; - struct cn_callback_data data; - - int seq, group; - struct sock *nls; -}; - -struct cn_ctl_entry { - struct list_head notify_entry; - struct cn_ctl_msg *msg; -}; - -struct cn_dev { - struct cb_id id; - - u32 seq, groups; - struct sock *nls; - void (*input) (struct sock * sk, int len); - - struct cn_queue_dev *cbdev; -}; - -int cn_add_callback(struct cb_id *, char *, void (*callback) (void *)); -void cn_del_callback(struct cb_id *); -int cn_netlink_send(struct cn_msg *, u32, gfp_t); - -int cn_queue_add_callback(struct cn_queue_dev *dev, char *name, struct cb_id *id, void (*callback)(void *)); -void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id); - -struct cn_queue_dev *cn_queue_alloc_dev(char *name, struct sock *); -void cn_queue_free_dev(struct cn_queue_dev *dev); - -int cn_cb_equal(struct cb_id *, struct cb_id *); - -void cn_queue_wrapper(void *data); - -extern int cn_already_initialized; - -#endif /* __KERNEL__ */ -#endif /* __CONNECTOR_H */ diff -Nru drbd8-8.3.7/drbd/linux/drbd.h 
drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd.h --- drbd8-8.3.7/drbd/linux/drbd.h 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd.h 2012-02-02 14:09:14.000000000 +0000 @@ -25,7 +25,6 @@ */ #ifndef DRBD_H #define DRBD_H -#include #include #include @@ -38,9 +37,9 @@ #include #include -/* Altough the Linux source code makes a difference between +/* Although the Linux source code makes a difference between generic endianness and the bitfields' endianness, there is no - architecture as of Linux-2.6.24-rc4 where the bitfileds' endianness + architecture as of Linux-2.6.24-rc4 where the bitfields' endianness does not match the generic endianness. */ #if __BYTE_ORDER == __LITTLE_ENDIAN @@ -53,7 +52,6 @@ #endif - enum drbd_io_error_p { EP_PASS_ON, /* FIXME should the better be named "Ignore"? */ EP_CALL_HELPER, @@ -61,7 +59,8 @@ }; enum drbd_fencing_p { - FP_DONT_CARE, + FP_NOT_AVAIL = -1, /* Not a policy */ + FP_DONT_CARE = 0, FP_RESOURCE, FP_STONITH }; @@ -86,8 +85,33 @@ ASB_VIOLENTLY }; +enum drbd_on_no_data { + OND_IO_ERROR, + OND_SUSPEND_IO +}; + +enum drbd_on_congestion { + OC_BLOCK, + OC_PULL_AHEAD, + OC_DISCONNECT, +}; + +enum drbd_read_balancing { + RB_PREFER_LOCAL, + RB_PREFER_REMOTE, + RB_ROUND_ROBIN, + RB_LEAST_PENDING, + RB_CONGESTED_REMOTE, + RB_32K_STRIPING, + RB_64K_STRIPING, + RB_128K_STRIPING, + RB_256K_STRIPING, + RB_512K_STRIPING, + RB_1M_STRIPING, +}; + /* KEEP the order, do not delete or insert. Only append. 
*/ -enum drbd_ret_codes { +enum drbd_ret_code { ERR_CODE_BASE = 100, NO_ERROR = 101, ERR_LOCAL_ADDR = 102, @@ -96,8 +120,8 @@ ERR_OPEN_MD_DISK = 105, ERR_DISK_NOT_BDEV = 107, ERR_MD_NOT_BDEV = 108, - ERR_DISK_TO_SMALL = 111, - ERR_MD_DISK_TO_SMALL = 112, + ERR_DISK_TOO_SMALL = 111, + ERR_MD_DISK_TOO_SMALL = 112, ERR_BDCLAIM_DISK = 114, ERR_BDCLAIM_MD_DISK = 115, ERR_MD_IDX_INVALID = 116, @@ -114,8 +138,8 @@ ERR_INTR = 129, /* EINTR */ ERR_RESIZE_RESYNC = 130, ERR_NO_PRIMARY = 131, - ERR_SYNC_AFTER = 132, - ERR_SYNC_AFTER_CYCLE = 133, + ERR_RESYNC_AFTER = 132, + ERR_RESYNC_AFTER_CYCLE = 133, ERR_PAUSE_IS_SET = 134, ERR_PAUSE_IS_CLEAR = 135, ERR_PACKET_NR = 137, @@ -134,6 +158,19 @@ ERR_DATA_NOT_CURRENT = 150, ERR_CONNECTED = 151, /* DRBD 8.3 only */ ERR_PERM = 152, + ERR_NEED_APV_93 = 153, + ERR_STONITH_AND_PROT_A = 154, + ERR_CONG_NOT_PROTO_A = 155, + ERR_PIC_AFTER_DEP = 156, + ERR_PIC_PEER_DEP = 157, + ERR_RES_NOT_KNOWN = 158, + ERR_RES_IN_USE = 159, + ERR_MINOR_CONFIGURED = 160, + ERR_MINOR_EXISTS = 161, + ERR_INVALID_REQUEST = 162, + ERR_NEED_APV_100 = 163, + ERR_NEED_ALLOW_TWO_PRI = 164, + ERR_MD_UNCLEAN = 165, /* insert new ones above this line */ AFTER_LAST_ERR_CODE @@ -163,7 +200,7 @@ /* These temporal states are all used on the way * from >= C_CONNECTED to Unconnected. * The 'disconnect reason' states - * I do not allow to change beween them. */ + * I do not allow to change between them. */ C_TIMEOUT, C_BROKEN_PIPE, C_NETWORK_FAILURE, @@ -174,7 +211,7 @@ C_WF_REPORT_PARAMS, /* we have a socket */ C_CONNECTED, /* we have introduced each other */ C_STARTING_SYNC_S, /* starting full sync by admin request. */ - C_STARTING_SYNC_T, /* stariing full sync by admin request. */ + C_STARTING_SYNC_T, /* starting full sync by admin request. 
*/ C_WF_BITMAP_S, C_WF_BITMAP_T, C_WF_SYNC_UUID, @@ -187,6 +224,10 @@ C_VERIFY_T, C_PAUSED_SYNC_S, C_PAUSED_SYNC_T, + + C_AHEAD, + C_BEHIND, + C_MASK = 31 }; @@ -211,7 +252,7 @@ * pointed out by Maxim Uvarov q * even though we transmit as "cpu_to_be32(state)", * the offsets of the bitfields still need to be swapped - * on different endianess. + * on different endianness. */ struct { #if defined(__LITTLE_ENDIAN_BITFIELD) @@ -220,13 +261,17 @@ unsigned conn:5 ; /* 17/32 cstates */ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ - unsigned susp:1 ; /* 2/2 IO suspended no/yes */ + unsigned susp:1 ; /* 2/2 IO suspended no/yes (by user) */ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ unsigned peer_isp:1 ; unsigned user_isp:1 ; - unsigned _pad:11; /* 0 unused */ + unsigned susp_nod:1 ; /* IO suspended because no data */ + unsigned susp_fen:1 ; /* IO suspended because fence peer handler runs*/ + unsigned _pad:9; /* 0 unused */ #elif defined(__BIG_ENDIAN_BITFIELD) - unsigned _pad:11; /* 0 unused */ + unsigned _pad:9; + unsigned susp_fen:1 ; + unsigned susp_nod:1 ; unsigned user_isp:1 ; unsigned peer_isp:1 ; unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ @@ -237,20 +282,13 @@ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ unsigned role:2 ; /* 3/4 primary/secondary/unknown */ #else -# error "this endianess is not supported" -#endif -#ifndef DRBD_DEBUG_STATE_CHANGES -#define DRBD_DEBUG_STATE_CHANGES 0 -#endif -#if DRBD_DEBUG_STATE_CHANGES - unsigned int line; - const char *func; +# error "this endianness is not supported" #endif }; unsigned int i; }; -enum drbd_state_ret_codes { +enum drbd_state_rv { SS_CW_NO_NEED = 4, SS_CW_SUCCESS = 3, SS_NOTHING_TO_DO = 2, @@ -274,14 +312,15 @@ SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! 
*/ - SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ + SS_O_VOL_PEER_PRI = -20, + SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ }; /* from drbd_strings.c */ extern const char *drbd_conn_str(enum drbd_conns); extern const char *drbd_role_str(enum drbd_role); extern const char *drbd_disk_str(enum drbd_disk_state); -extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes); +extern const char *drbd_set_st_err_str(enum drbd_state_rv); #define SHARED_SECRET_MAX 64 @@ -291,7 +330,8 @@ #define MDF_FULL_SYNC (1 << 3) #define MDF_WAS_UP_TO_DATE (1 << 4) #define MDF_PEER_OUT_DATED (1 << 5) -#define MDF_CRASHED_PRIMARY (1 << 6) +#define MDF_CRASHED_PRIMARY (1 << 6) +#define MDF_AL_CLEAN (1 << 7) enum drbd_uuid_index { UI_CURRENT, @@ -311,42 +351,23 @@ #define UUID_JUST_CREATED ((__u64)4) +/* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 -#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) +#define DRBD_MAGIC_BIG 0x835a +#define DRBD_MAGIC_100 0x8620ec20 + +#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3) +#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4) +#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5) + + +/* how I came up with this magic? 
+ * base64 decode "actlog==" ;) */ +#define DRBD_AL_MAGIC 0x69cb65a2 /* these are of type "int" */ #define DRBD_MD_INDEX_INTERNAL -1 #define DRBD_MD_INDEX_FLEX_EXT -2 #define DRBD_MD_INDEX_FLEX_INT -3 -/* Start of the new netlink/connector stuff */ - -#define DRBD_NL_CREATE_DEVICE 0x01 -#define DRBD_NL_SET_DEFAULTS 0x02 - -/* The following line should be moved over to linux/connector.h - * when the time comes */ -#ifndef CN_IDX_DRBD -# define CN_IDX_DRBD 0x4 -/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */ -#endif -#define CN_VAL_DRBD 0x1 - -/* For searching a vacant cn_idx value */ -#define CN_IDX_STEP 6977 - -struct drbd_nl_cfg_req { - int packet_type; - unsigned int drbd_minor; - int flags; - unsigned short tag_list[]; -}; - -struct drbd_nl_cfg_reply { - int packet_type; - unsigned int minor; - int ret_code; /* enum ret_code or set_st_err_t */ - unsigned short tag_list[]; /* only used with get_* calls */ -}; - #endif diff -Nru drbd8-8.3.7/drbd/linux/drbd_config.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_config.h --- drbd8-8.3.7/drbd/linux/drbd_config.h 2010-01-13 16:14:27.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_config.h 2012-02-02 14:09:14.000000000 +0000 @@ -22,10 +22,20 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.7" -#define API_VERSION 88 +/* Necessary to build the external module against >= Linux-2.6.33 */ +#ifdef REL_VERSION +#undef REL_VERSION +#undef API_VERSION +#undef PRO_VERSION_MIN +#undef PRO_VERSION_MAX +#endif + +/* End of external module for 2.6.33 stuff */ + +#define REL_VERSION "8.4.1" +#define API_VERSION 1 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 91 +#define PRO_VERSION_MAX 100 #ifndef __CHECKER__ /* for a sparse run, we need all STATICs */ #define DBG_ALL_SYMBOLS /* no static functs, improves quality of OOPS traces */ @@ -45,60 +55,8 @@ /* Enable fault insertion code */ #define DRBD_ENABLE_FAULTS -/* RedHat's 2.6.9 kernels have the gfp_t type. 
Mainline has this feature - * since 2.6.16. If you build for RedHat enable the line below. */ -#define KERNEL_HAS_GFP_T - -/* kernel.org has atomic_add_return since 2.6.10. some vendor kernels - * have it backported, though. Others don't. */ -//#define NEED_BACKPORT_OF_ATOMIC_ADD - -/* 2.6.something has deprecated kmem_cache_t - * some older still use it. - * some have it defined as struct kmem_cache_s, some as struct kmem_cache */ -//#define USE_KMEM_CACHE_S - -/* 2.6.something has sock_create_kern (SE-linux security context stuff) - * some older distribution kernels don't. */ -//#define DEFINE_SOCK_CREATE_KERN - -/* 2.6.24 and later have kernel_sock_shutdown. - * some older distribution kernels may also have a backport. */ -//#define DEFINE_KERNEL_SOCK_SHUTDOWN - -/* in older kernels (vanilla < 2.6.16) struct netlink_skb_parms has a - * member called dst_groups. Later it is called dst_group (without 's'). */ -//#define DRBD_NL_DST_GROUPS - -/* in older kernels (vanilla < 2.6.14) is no kzalloc() */ -//#define NEED_BACKPORT_OF_KZALLOC - -// some vendor kernels have it, some don't -//#define NEED_SG_SET_BUF -#define HAVE_LINUX_SCATTERLIST_H - -/* 2.6.29 and up no longer have swabb.h */ -//#define HAVE_LINUX_BYTEORDER_SWABB_H - -/* some vendor kernel have it backported. */ -#define HAVE_SET_CPUS_ALLOWED_PTR - -/* Some vendor kernels < 2.6.7 might define msleep in one or - * another way .. */ - -#define KERNEL_HAS_MSLEEP - -/* Some other kernels < 2.6.8 do not have struct kvec, - * others do.. */ - -#define KERNEL_HAS_KVEC - -/* Actually availabe since 2.6.25, but venders have backported... 
- */ -#define KERNEL_HAS_PROC_CREATE - -/* In 2.6.32 we finally fixed connector to pass netlink_skb_parms to the callback - */ -#define KERNEL_HAS_CN_SKB_PARMS +#ifdef __KERNEL__ +#include "compat.h" +#endif #endif diff -Nru drbd8-8.3.7/drbd/linux/drbd_genl.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_genl.h --- drbd8-8.3.7/drbd/linux/drbd_genl.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_genl.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,364 @@ +/* + * General overview: + * full generic netlink message: + * |nlmsghdr|genlmsghdr| + * + * payload: + * |optional fixed size family header| + * + * sequence of netlink attributes: + * I chose to have all "top level" attributes NLA_NESTED, + * corresponding to some real struct. + * So we have a sequence of |tla, len| + * + * nested nla sequence: + * may be empty, or contain a sequence of netlink attributes + * representing the struct fields. + * + * The tag number of any field (regardless of containing struct) + * will be available as T_ ## field_name, + * so you cannot have the same field name in two differnt structs. + * + * The tag numbers themselves are per struct, though, + * so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type, + * which we won't use here). + * The tag numbers are used as index in the respective nla_policy array. 
+ * + * GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy + * genl_magic_struct.h + * generates the struct declaration, + * generates an entry in the tla enum, + * genl_magic_func.h + * generates an entry in the static tla policy + * with .type = NLA_NESTED + * generates the static _nl_policy definition, + * and static conversion functions + * + * genl_magic_func.h + * + * GENL_mc_group(group) + * genl_magic_struct.h + * does nothing + * genl_magic_func.h + * defines and registers the mcast group, + * and provides a send helper + * + * GENL_notification(op_name, op_num, mcast_group, tla list) + * These are notifications to userspace. + * + * genl_magic_struct.h + * generates an entry in the genl_ops enum, + * genl_magic_func.h + * does nothing + * + * mcast group: the name of the mcast group this notification should be + * expected on + * tla list: the list of expected top level attributes, + * for documentation and sanity checking. + * + * GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations" + * These are requests from userspace. + * + * _op and _notification share the same "number space", + * op_nr will be assigned to "genlmsghdr->cmd" + * + * genl_magic_struct.h + * generates an entry in the genl_ops enum, + * genl_magic_func.h + * generates an entry in the static genl_ops array, + * and static register/unregister functions to + * genl_register_family_with_ops(). + * + * flags and handler: + * GENL_op_init( .doit = x, .dumpit = y, .flags = something) + * GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM + * tla list: the list of expected top level attributes, + * for documentation and sanity checking. + */ + +/* + * STRUCTS + */ + +/* this is sent kernel -> userland on various error conditions, and contains + * informational textual info, which is supposedly human readable. + * The computer relevant return code is in the drbd_genlmsghdr. 
+ */ +GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, + /* "arbitrary" size strings, nla_policy.len = 0 */ + __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) +) + +/* Configuration requests typically need a context to operate on. + * Possible keys are device minor (fits in the drbd_genlmsghdr), + * the replication link (aka connection) name, + * and/or the replication group (aka resource) name, + * and the volume id within the resource. */ +GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, + __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) + __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) + __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) + __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) +) + +GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, + __str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128) + __str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128) + __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) + + /* use the resize command to try and change the disk_size */ + __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) + /* we could change the max_bio_bvecs, + * but it won't propagate through the stack */ + __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) + + __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) + __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) + + __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) + __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) + __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) + __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) + __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) + __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) + __u32_field_def(14, DRBD_GENLA_F_MANDATORY, 
c_max_rate, DRBD_C_MAX_RATE_DEF) + __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) + + __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) + __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) + __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) + __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) + __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) +) + +GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, + __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32) + __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) +) + +GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, + __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, + shared_secret, SHARED_SECRET_MAX) + __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) + __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) + __str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) + __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) + __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) + __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) + __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) + __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) + __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) + __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) + __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) + __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) + __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) + __u32_field_def(15, 
DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) + __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) + __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) + __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) + __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) + __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) + __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) + __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) + __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) + __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) + __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) + __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) + __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) + __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) +) + +GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) +) + +GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, + __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) + __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) + __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) +) + +GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, + /* the reason of the broadcast, + * if this is an event triggered broadcast. */ + __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) + __u32_field(2, DRBD_F_REQUIRED, current_state) + __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) + __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) + + /* These are for broadcast from after state change work. 
+ * prev_state and new_state are from the moment the state change took + * place, new_state is not neccessarily the same as current_state, + * there may have been more state changes since. Which will be + * broadcasted soon, in their respective after state change work. */ + __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) + __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) + + /* if we have a local disk: */ + __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) + __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) + __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) + __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) + /* and in case resync or online verify is active */ + __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) + __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) + + /* for pre and post notifications of helper execution */ + __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) + __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) +) + +GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, + __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) +) + +GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) +) + +GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, + __u32_field(1, DRBD_F_REQUIRED, timeout_type) +) + +GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) +) + +GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, + __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) +) + +/* + * Notifications and commands (genlmsghdr->cmd) + */ +GENL_mc_group(events) + + /* kernel -> userspace announcement of changes */ +GENL_notification( + DRBD_EVENT, 1, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DISK_CONF, 
DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) +) + + /* query kernel for specific or all info */ +GENL_op( + DRBD_ADM_GET_STATUS, 2, + GENL_op_init( + .doit = drbd_adm_get_status, + .dumpit = drbd_adm_get_status_all, + /* anyone may ask for the status, + * it is broadcasted anyways */ + ), + /* To select the object .doit. + * Or a subset of objects in .dumpit. */ + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) +) + + /* add DRBD minor devices as volumes to resources */ +GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + + /* add or delete resources */ +GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, + GENL_doit(drbd_adm_resource_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_CONNECT, 10, + GENL_doit(drbd_adm_connect), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_CHG_NET_OPTS, 29, + GENL_doit(drbd_adm_net_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED) +) + +GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_ATTACH, 12, + GENL_doit(drbd_adm_attach), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED) +) + 
+GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28, + GENL_doit(drbd_adm_disk_opts), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_RESIZE, 13, + GENL_doit(drbd_adm_resize), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_PRIMARY, 14, + GENL_doit(drbd_adm_set_role), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_SECONDARY, 15, + GENL_doit(drbd_adm_set_role), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED) +) + +GENL_op( + DRBD_ADM_NEW_C_UUID, 16, + GENL_doit(drbd_adm_new_c_uuid), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op( + DRBD_ADM_START_OV, 17, + GENL_doit(drbd_adm_start_ov), + GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) +) + +GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_RESUME_IO, 24, 
GENL_doit(drbd_adm_resume_io), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) diff -Nru drbd8-8.3.7/drbd/linux/drbd_genl_api.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_genl_api.h --- drbd8-8.3.7/drbd/linux/drbd_genl_api.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_genl_api.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,55 @@ +#ifndef DRBD_GENL_STRUCT_H +#define DRBD_GENL_STRUCT_H + +/** + * struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests + * @minor: + * For admin requests (user -> kernel): which minor device to operate on. + * For (unicast) replies or informational (broadcast) messages + * (kernel -> user): which minor device the information is about. + * If we do not operate on minors, but on connections or resources, + * the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT + * is used instead. + * @flags: possible operation modifiers (relevant only for user->kernel): + * DRBD_GENL_F_SET_DEFAULTS + * @volume: + * When creating a new minor (adding it to a resource), the resource needs + * to know which volume number within the resource this is supposed to be. + * The volume number corresponds to the same volume number on the remote side, + * whereas the minor number on the remote side may be different + * (union with flags). 
+ * @ret_code: kernel->userland unicast cfg reply return code (union with flags); + */ +struct drbd_genlmsghdr { + __u32 minor; + union { + __u32 flags; + __s32 ret_code; + }; +}; + +/* To be used in drbd_genlmsghdr.flags */ +enum { + DRBD_GENL_F_SET_DEFAULTS = 1, +}; + +enum drbd_state_info_bcast_reason { + SIB_GET_STATUS_REPLY = 1, + SIB_STATE_CHANGE = 2, + SIB_HELPER_PRE = 3, + SIB_HELPER_POST = 4, + SIB_SYNC_PROGRESS = 5, +}; + +/* hack around predefined gcc/cpp "linux=1", + * we cannot possibly include <1/drbd_genl.h> */ +#undef linux + +#include +#define GENL_MAGIC_VERSION API_VERSION +#define GENL_MAGIC_FAMILY drbd +#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr) +#define GENL_MAGIC_INCLUDE_FILE +#include + +#endif diff -Nru drbd8-8.3.7/drbd/linux/drbd_limits.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_limits.h --- drbd8-8.3.7/drbd/linux/drbd_limits.h 2009-09-29 07:51:14.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_limits.h 2012-02-02 14:09:14.000000000 +0000 @@ -17,121 +17,202 @@ #define DRBD_MINOR_COUNT_MIN 1 #define DRBD_MINOR_COUNT_MAX 255 +#define DRBD_MINOR_COUNT_DEF 32 +#define DRBD_MINOR_COUNT_SCALE '1' + +#define DRBD_VOLUME_MAX 65535 #define DRBD_DIALOG_REFRESH_MIN 0 #define DRBD_DIALOG_REFRESH_MAX 600 +#define DRBD_DIALOG_REFRESH_SCALE '1' /* valid port number */ #define DRBD_PORT_MIN 1 #define DRBD_PORT_MAX 0xffff +#define DRBD_PORT_SCALE '1' /* startup { */ /* if you want more than 3.4 days, disable */ #define DRBD_WFC_TIMEOUT_MIN 0 #define DRBD_WFC_TIMEOUT_MAX 300000 #define DRBD_WFC_TIMEOUT_DEF 0 +#define DRBD_WFC_TIMEOUT_SCALE '1' #define DRBD_DEGR_WFC_TIMEOUT_MIN 0 #define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 #define DRBD_DEGR_WFC_TIMEOUT_DEF 0 +#define DRBD_DEGR_WFC_TIMEOUT_SCALE '1' #define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 #define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 #define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 +#define DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1' /* }*/ /* net { */ /* timeout, unit centi seconds - * more 
than one minute timeout is not usefull */ + * more than one minute timeout is not useful */ #define DRBD_TIMEOUT_MIN 1 #define DRBD_TIMEOUT_MAX 600 #define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ +#define DRBD_TIMEOUT_SCALE '1' + + /* If backing disk takes longer than disk_timeout, mark the disk as failed */ +#define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */ +#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */ +#define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */ +#define DRBD_DISK_TIMEOUT_SCALE '1' /* active connection retries when C_WF_CONNECTION */ #define DRBD_CONNECT_INT_MIN 1 #define DRBD_CONNECT_INT_MAX 120 #define DRBD_CONNECT_INT_DEF 10 /* seconds */ +#define DRBD_CONNECT_INT_SCALE '1' /* keep-alive probes when idle */ #define DRBD_PING_INT_MIN 1 #define DRBD_PING_INT_MAX 120 #define DRBD_PING_INT_DEF 10 +#define DRBD_PING_INT_SCALE '1' /* timeout for the ping packets.*/ #define DRBD_PING_TIMEO_MIN 1 -#define DRBD_PING_TIMEO_MAX 100 +#define DRBD_PING_TIMEO_MAX 300 #define DRBD_PING_TIMEO_DEF 5 +#define DRBD_PING_TIMEO_SCALE '1' /* max number of write requests between write barriers */ #define DRBD_MAX_EPOCH_SIZE_MIN 1 #define DRBD_MAX_EPOCH_SIZE_MAX 20000 #define DRBD_MAX_EPOCH_SIZE_DEF 2048 +#define DRBD_MAX_EPOCH_SIZE_SCALE '1' - /* I don't think that a tcp send buffer of more than 10M is usefull */ + /* I don't think that a tcp send buffer of more than 10M is useful */ #define DRBD_SNDBUF_SIZE_MIN 0 #define DRBD_SNDBUF_SIZE_MAX (10<<20) #define DRBD_SNDBUF_SIZE_DEF 0 +#define DRBD_SNDBUF_SIZE_SCALE '1' #define DRBD_RCVBUF_SIZE_MIN 0 #define DRBD_RCVBUF_SIZE_MAX (10<<20) #define DRBD_RCVBUF_SIZE_DEF 0 +#define DRBD_RCVBUF_SIZE_SCALE '1' /* @4k PageSize -> 128kB - 512MB */ #define DRBD_MAX_BUFFERS_MIN 32 #define DRBD_MAX_BUFFERS_MAX 131072 #define DRBD_MAX_BUFFERS_DEF 2048 +#define DRBD_MAX_BUFFERS_SCALE '1' /* @4k PageSize -> 4kB - 512MB */ #define DRBD_UNPLUG_WATERMARK_MIN 1 #define DRBD_UNPLUG_WATERMARK_MAX 131072 #define DRBD_UNPLUG_WATERMARK_DEF 
(DRBD_MAX_BUFFERS_DEF/16) +#define DRBD_UNPLUG_WATERMARK_SCALE '1' /* 0 is disabled. * 200 should be more than enough even for very short timeouts */ #define DRBD_KO_COUNT_MIN 0 #define DRBD_KO_COUNT_MAX 200 -#define DRBD_KO_COUNT_DEF 0 +#define DRBD_KO_COUNT_DEF 7 +#define DRBD_KO_COUNT_SCALE '1' /* } */ /* syncer { */ /* FIXME allow rate to be zero? */ -#define DRBD_RATE_MIN 1 +#define DRBD_RESYNC_RATE_MIN 1 /* channel bonding 10 GbE, or other hardware */ -#define DRBD_RATE_MAX (4 << 20) -#define DRBD_RATE_DEF 250 /* kb/second */ - - /* less than 7 would hit performance unneccessarily. - * 3833 is the largest prime that still does fit - * into 64 sectors of activity log */ +#define DRBD_RESYNC_RATE_MAX (4 << 20) +#define DRBD_RESYNC_RATE_DEF 250 +#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ + + /* less than 7 would hit performance unnecessarily. + * 919 slots context information per transaction, + * 32k activity log, 4k transaction size, + * one transaction in flight: + * 919 * 7 = 6433 */ #define DRBD_AL_EXTENTS_MIN 7 -#define DRBD_AL_EXTENTS_MAX 3833 -#define DRBD_AL_EXTENTS_DEF 127 - -#define DRBD_AFTER_MIN -1 -#define DRBD_AFTER_MAX 255 -#define DRBD_AFTER_DEF -1 +#define DRBD_AL_EXTENTS_MAX 6433 +#define DRBD_AL_EXTENTS_DEF 1237 +#define DRBD_AL_EXTENTS_SCALE '1' + +#define DRBD_MINOR_NUMBER_MIN -1 +#define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1) +#define DRBD_MINOR_NUMBER_DEF -1 +#define DRBD_MINOR_NUMBER_SCALE '1' /* } */ /* drbdsetup XY resize -d Z * you are free to reduce the device size to nothing, if you want to. * the upper limit with 64bit kernel, enough ram and flexible meta data - * is 16 TB, currently. */ + * is 1 PiB, currently. */ /* DRBD_MAX_SECTORS */ -#define DRBD_DISK_SIZE_SECT_MIN 0 -#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30)) -#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... 
*/ +#define DRBD_DISK_SIZE_MIN 0 +#define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40)) +#define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */ +#define DRBD_DISK_SIZE_SCALE 's' /* sectors */ -#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON +#define DRBD_ON_IO_ERROR_DEF EP_DETACH #define DRBD_FENCING_DEF FP_DONT_CARE #define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT +#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR +#define DRBD_ON_CONGESTION_DEF OC_BLOCK +#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL #define DRBD_MAX_BIO_BVECS_MIN 0 #define DRBD_MAX_BIO_BVECS_MAX 128 #define DRBD_MAX_BIO_BVECS_DEF 0 +#define DRBD_MAX_BIO_BVECS_SCALE '1' + +#define DRBD_C_PLAN_AHEAD_MIN 0 +#define DRBD_C_PLAN_AHEAD_MAX 300 +#define DRBD_C_PLAN_AHEAD_DEF 20 +#define DRBD_C_PLAN_AHEAD_SCALE '1' + +#define DRBD_C_DELAY_TARGET_MIN 1 +#define DRBD_C_DELAY_TARGET_MAX 100 +#define DRBD_C_DELAY_TARGET_DEF 10 +#define DRBD_C_DELAY_TARGET_SCALE '1' + +#define DRBD_C_FILL_TARGET_MIN 0 +#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */ +#define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */ +#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */ + +#define DRBD_C_MAX_RATE_MIN 250 +#define DRBD_C_MAX_RATE_MAX (4 << 20) +#define DRBD_C_MAX_RATE_DEF 102400 +#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */ + +#define DRBD_C_MIN_RATE_MIN 0 +#define DRBD_C_MIN_RATE_MAX (4 << 20) +#define DRBD_C_MIN_RATE_DEF 250 +#define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */ + +#define DRBD_CONG_FILL_MIN 0 +#define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ +#define DRBD_CONG_FILL_DEF 0 +#define DRBD_CONG_FILL_SCALE 's' /* sectors */ + +#define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN +#define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX +#define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF +#define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE + 
+#define DRBD_PROTOCOL_DEF DRBD_PROT_C + +#define DRBD_DISK_BARRIER_DEF 1 +#define DRBD_DISK_FLUSHES_DEF 1 +#define DRBD_DISK_DRAIN_DEF 1 +#define DRBD_MD_FLUSHES_DEF 1 +#define DRBD_TCP_CORK_DEF 1 + +#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 +#define DRBD_ALWAYS_ASBP_DEF 0 +#define DRBD_USE_RLE_DEF 1 -#undef RANGE #endif diff -Nru drbd8-8.3.7/drbd/linux/drbd_nl.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_nl.h --- drbd8-8.3.7/drbd/linux/drbd_nl.h 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_nl.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,138 +0,0 @@ -/* - PAKET( name, - TYPE ( pn, pr, member ) - ... - ) - - You may never reissue one of the pn arguments -*/ - -#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) -#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" -#endif - -NL_PACKET(primary, 1, - NL_BIT( 1, T_MAY_IGNORE, overwrite_peer) -) - -NL_PACKET(secondary, 2, ) - -NL_PACKET(disk_conf, 3, - NL_INT64( 2, T_MAY_IGNORE, disk_size) - NL_STRING( 3, T_MANDATORY, backing_dev, 128) - NL_STRING( 4, T_MANDATORY, meta_dev, 128) - NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) - NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) - NL_INTEGER( 7, T_MAY_IGNORE, fencing) - NL_BIT( 37, T_MAY_IGNORE, use_bmbv) - NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) - NL_BIT( 54, T_MAY_IGNORE, no_md_flush) - /* 55 max_bio_size was available in 8.2.6rc2 */ - NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) - NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) - NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) -) - -NL_PACKET(detach, 4, ) - -NL_PACKET(net_conf, 5, - NL_STRING( 8, T_MANDATORY, my_addr, 128) - NL_STRING( 9, T_MANDATORY, peer_addr, 128) - NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) - NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) - NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) - NL_INTEGER( 14, T_MAY_IGNORE, timeout) - 
NL_INTEGER( 15, T_MANDATORY, wire_protocol) - NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) - NL_INTEGER( 17, T_MAY_IGNORE, ping_int) - NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) - NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) - NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) - NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) - NL_INTEGER( 22, T_MAY_IGNORE, ko_count) - NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) - NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) - NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) - NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) - NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) - NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) - /* 59 addr_family was available in GIT, never released */ - NL_BIT( 60, T_MANDATORY, mind_af) - NL_BIT( 27, T_MAY_IGNORE, want_lose) - NL_BIT( 28, T_MAY_IGNORE, two_primaries) - NL_BIT( 41, T_MAY_IGNORE, always_asbp) - NL_BIT( 61, T_MAY_IGNORE, no_cork) - NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) -) - -NL_PACKET(disconnect, 6, ) - -NL_PACKET(resize, 7, - NL_INT64( 29, T_MAY_IGNORE, resize_size) - NL_BIT( 68, T_MAY_IGNORE, resize_force) -) - -NL_PACKET(syncer_conf, 8, - NL_INTEGER( 30, T_MAY_IGNORE, rate) - NL_INTEGER( 31, T_MAY_IGNORE, after) - NL_INTEGER( 32, T_MAY_IGNORE, al_extents) - NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) - NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) - NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) - NL_BIT( 65, T_MAY_IGNORE, use_rle) -) - -NL_PACKET(invalidate, 9, ) -NL_PACKET(invalidate_peer, 10, ) -NL_PACKET(pause_sync, 11, ) -NL_PACKET(resume_sync, 12, ) -NL_PACKET(suspend_io, 13, ) -NL_PACKET(resume_io, 14, ) -NL_PACKET(outdate, 15, ) -NL_PACKET(get_config, 16, ) -NL_PACKET(get_state, 17, - NL_INTEGER( 33, T_MAY_IGNORE, state_i) -) - -NL_PACKET(get_uuids, 18, - NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) - NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) -) - -NL_PACKET(get_timeout_flag, 19, - NL_BIT( 36, T_MAY_IGNORE, use_degraded) -) - -NL_PACKET(call_helper, 20, - 
NL_STRING( 38, T_MAY_IGNORE, helper, 32) -) - -/* Tag nr 42 already allocated in drbd-8.1 development. */ - -NL_PACKET(sync_progress, 23, - NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) -) - -NL_PACKET(dump_ee, 24, - NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) - NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) - NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) - NL_INT64( 48, T_MAY_IGNORE, ee_sector) - NL_INT64( 49, T_MAY_IGNORE, ee_block_id) - NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) -) - -NL_PACKET(start_ov, 25, - NL_INT64( 66, T_MAY_IGNORE, start_sector) -) - -NL_PACKET(new_c_uuid, 26, - NL_BIT( 63, T_MANDATORY, clear_bm) -) - -#undef NL_PACKET -#undef NL_INTEGER -#undef NL_INT64 -#undef NL_BIT -#undef NL_STRING - diff -Nru drbd8-8.3.7/drbd/linux/drbd_tag_magic.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_tag_magic.h --- drbd8-8.3.7/drbd/linux/drbd_tag_magic.h 2009-07-27 08:47:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/drbd_tag_magic.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,83 +0,0 @@ -#ifndef DRBD_TAG_MAGIC_H -#define DRBD_TAG_MAGIC_H - -#define TT_END 0 -#define TT_REMOVED 0xE000 - -/* declare packet_type enums */ -enum packet_types { -#define NL_PACKET(name, number, fields) P_ ## name = number, -#define NL_INTEGER(pn, pr, member) -#define NL_INT64(pn, pr, member) -#define NL_BIT(pn, pr, member) -#define NL_STRING(pn, pr, member, len) -#include "drbd_nl.h" - P_nl_after_last_packet, -}; - -/* These struct are used to deduce the size of the tag lists: */ -#define NL_PACKET(name, number, fields) \ - struct name ## _tag_len_struct { fields }; -#define NL_INTEGER(pn, pr, member) \ - int member; int tag_and_len ## member; -#define NL_INT64(pn, pr, member) \ - __u64 member; int tag_and_len ## member; -#define NL_BIT(pn, pr, member) \ - unsigned char member:1; int tag_and_len ## member; -#define NL_STRING(pn, pr, member, len) \ - unsigned char member[len]; int member ## _len; \ - int tag_and_len ## member; 
-#include "linux/drbd_nl.h" - -/* declate tag-list-sizes */ -static const int tag_list_sizes[] = { -#define NL_PACKET(name, number, fields) 2 fields , -#define NL_INTEGER(pn, pr, member) + 4 + 4 -#define NL_INT64(pn, pr, member) + 4 + 8 -#define NL_BIT(pn, pr, member) + 4 + 1 -#define NL_STRING(pn, pr, member, len) + 4 + (len) -#include "drbd_nl.h" -}; - -/* The two highest bits are used for the tag type */ -#define TT_MASK 0xC000 -#define TT_INTEGER 0x0000 -#define TT_INT64 0x4000 -#define TT_BIT 0x8000 -#define TT_STRING 0xC000 -/* The next bit indicates if processing of the tag is mandatory */ -#define T_MANDATORY 0x2000 -#define T_MAY_IGNORE 0x0000 -#define TN_MASK 0x1fff -/* The remaining 13 bits are used to enumerate the tags */ - -#define tag_type(T) ((T) & TT_MASK) -#define tag_number(T) ((T) & TN_MASK) - -/* declare tag enums */ -#define NL_PACKET(name, number, fields) fields -enum drbd_tags { -#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , -#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , -#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , -#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , -#include "drbd_nl.h" -}; - -struct tag { - const char *name; - int type_n_flags; - int max_len; -}; - -/* declare tag names */ -#define NL_PACKET(name, number, fields) fields -static const struct tag tag_descriptions[] = { -#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, -#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, -#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, -#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, -#include "drbd_nl.h" -}; - -#endif diff -Nru drbd8-8.3.7/drbd/linux/genl_magic_func.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/genl_magic_func.h --- drbd8-8.3.7/drbd/linux/genl_magic_func.h 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/genl_magic_func.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,422 @@ +#ifndef GENL_MAGIC_FUNC_H +#define GENL_MAGIC_FUNC_H + +#include + +/* + * Magic: declare tla policy {{{1 + * Magic: declare nested policies + * {{{2 + */ +#undef GENL_mc_group +#define GENL_mc_group(group) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + [tag_name] = { .type = NLA_NESTED }, + +static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] \ + __attribute__((unused)) = { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static struct nla_policy s_name ## _nl_policy[] __read_mostly = \ +{ s_fields }; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \ + __put, __is_signed) \ + [attr_nr] = { .type = nla_type }, + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \ + __get, __put, __is_signed) \ + [attr_nr] = { .type = nla_type, \ + .len = maxlen - (nla_type == NLA_NUL_STRING) }, + +#include GENL_MAGIC_INCLUDE_FILE + +#ifndef __KERNEL__ +#ifndef pr_info +#define pr_info(args...) fprintf(stderr, args); +#endif +#endif + +#ifdef GENL_MAGIC_DEBUG +static void dprint_field(const char *dir, int nla_type, + const char *name, void *valp) +{ + __u64 val = valp ? 
*(__u32 *)valp : 1; + switch (nla_type) { + case NLA_U8: val = (__u8)val; + case NLA_U16: val = (__u16)val; + case NLA_U32: val = (__u32)val; + pr_info("%s attr %s: %d 0x%08x\n", dir, + name, (int)val, (unsigned)val); + break; + case NLA_U64: + val = *(__u64*)valp; + pr_info("%s attr %s: %lld 0x%08llx\n", dir, + name, (long long)val, (unsigned long long)val); + break; + case NLA_FLAG: + if (val) + pr_info("%s attr %s: set\n", dir, name); + break; + } +} + +static void dprint_array(const char *dir, int nla_type, + const char *name, const char *val, unsigned len) +{ + switch (nla_type) { + case NLA_NUL_STRING: + if (len && val[len-1] == '\0') + len--; + pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val); + break; + default: + /* we can always show 4 byte, + * thats what nlattr are aligned to. */ + pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n", + dir, name, len, val[0], val[1], val[2], val[3]); + } +} + +#define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b); + +/* Name is a member field name of the struct s. + * If s is NULL (only parsing, no copy requested in *_from_attrs()), + * nla is supposed to point to the attribute containing the information + * corresponding to that struct member. */ +#define DPRINT_FIELD(dir, nla_type, name, s, nla) \ + do { \ + if (s) \ + dprint_field(dir, nla_type, #name, &s->name); \ + else if (nla) \ + dprint_field(dir, nla_type, #name, \ + (nla_type == NLA_FLAG) ? 
NULL \ + : nla_data(nla)); \ + } while (0) + +#define DPRINT_ARRAY(dir, nla_type, name, s, nla) \ + do { \ + if (s) \ + dprint_array(dir, nla_type, #name, \ + s->name, s->name ## _len); \ + else if (nla) \ + dprint_array(dir, nla_type, #name, \ + nla_data(nla), nla_len(nla)); \ + } while (0) +#else +#define DPRINT_TLA(a, op, b) do {} while (0) +#define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0) +#define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0) +#endif + +/* + * Magic: provide conversion functions {{{1 + * populate struct from attribute table: + * {{{2 + */ + +/* processing of generic netlink messages is serialized. + * use one static buffer for parsing of nested attributes */ +static struct nlattr *nested_attr_tb[128]; + +#ifndef BUILD_BUG_ON +/* Force a compilation error if condition is true */ +#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) +/* Force a compilation error if condition is true, but also produce a + result (of value 0 and type size_t), so the expression can be used + e.g. in a structure initializer (or where-ever else comma expressions + aren't permitted). 
*/ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +#endif + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static int __ ## s_name ## _from_attrs(struct s_name *s, \ + struct genl_info *info, bool exclude_invariants) \ +{ \ + const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \ + struct nlattr *tla = info->attrs[tag_number]; \ + struct nlattr **ntb = nested_attr_tb; \ + struct nlattr *nla; \ + int err; \ + BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \ + if (!tla) \ + return -ENOMSG; \ + DPRINT_TLA(#s_name, "<=-", #tag_name); \ + err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ + if (err) \ + return err; \ + \ + s_fields \ + return 0; \ +} __attribute__((unused)) \ +static int s_name ## _from_attrs(struct s_name *s, \ + struct genl_info *info) \ +{ \ + return __ ## s_name ## _from_attrs(s, info, false); \ +} __attribute__((unused)) \ +static int s_name ## _from_attrs_for_change(struct s_name *s, \ + struct genl_info *info) \ +{ \ + return __ ## s_name ## _from_attrs(s, info, true); \ +} __attribute__((unused)) \ + +#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) 
\ + nla = ntb[attr_nr]; \ + if (nla) { \ + if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ + pr_info("<< must not change invariant attr: %s\n", #name); \ + return -EEXIST; \ + } \ + assignment; \ + } else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \ + /* attribute missing from payload, */ \ + /* which was expected */ \ + } else if ((attr_flag) & DRBD_F_REQUIRED) { \ + pr_info("<< missing attr: %s\n", #name); \ + return -ENOMSG; \ + } + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + __assign(attr_nr, attr_flag, name, nla_type, type, \ + if (s) \ + s->name = __get(nla); \ + DPRINT_FIELD("<<", nla_type, name, s, nla)) + +/* validate_nla() already checked nla_len <= maxlen appropriately. */ +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + __assign(attr_nr, attr_flag, name, nla_type, type, \ + if (s) \ + s->name ## _len = \ + __get(s->name, nla, maxlen); \ + DPRINT_ARRAY("<<", nla_type, name, s, nla)) + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +/* + * Magic: define op number to op name mapping {{{1 + * {{{2 + */ +static const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) +__attribute__ ((unused)); +static const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) +{ + switch (cmd) { +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ + case op_num: return #op_name; +#include GENL_MAGIC_INCLUDE_FILE + default: + return "unknown"; + } +} + +#ifdef __KERNEL__ +#include +/* + * Magic: define genl_ops {{{1 + * {{{2 + */ + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ +{ \ + handler \ + .cmd = op_name, \ + .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \ +}, + +#define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops) +static struct genl_ops ZZZ_genl_ops[] 
__read_mostly = { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +/* + * Define the genl_family, multicast groups, {{{1 + * and provide register/unregister functions. + * {{{2 + */ +#define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) +static struct genl_family ZZZ_genl_family __read_mostly = { + .id = GENL_ID_GENERATE, + .name = __stringify(GENL_MAGIC_FAMILY), + .version = GENL_MAGIC_VERSION, +#ifdef GENL_MAGIC_FAMILY_HDRSZ + .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), +#endif + .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, +}; + +/* + * Magic: define multicast groups + * Magic: define multicast group registration helper + */ +#undef GENL_mc_group +#define GENL_mc_group(group) \ +static struct genl_multicast_group \ +CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \ + .name = #group, \ +}; \ +static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ + struct sk_buff *skb, gfp_t flags) \ +{ \ + unsigned int group_id = \ + CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \ + if (!group_id) \ + return -EINVAL; \ + return genlmsg_multicast(skb, 0, group_id, flags); \ +} + +#include GENL_MAGIC_INCLUDE_FILE + +int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) +{ + int err = genl_register_family_with_ops(&ZZZ_genl_family, + ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops)); + if (err) + return err; +#undef GENL_mc_group +#define GENL_mc_group(group) \ + err = genl_register_mc_group(&ZZZ_genl_family, \ + &CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \ + if (err) \ + goto fail; \ + else \ + pr_info("%s: mcg %s: %u\n", #group, \ + __stringify(GENL_MAGIC_FAMILY), \ + CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id); + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_mc_group +#define GENL_mc_group(group) + return 0; +fail: + genl_unregister_family(&ZZZ_genl_family); + return err; +} + +void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) +{ + genl_unregister_family(&ZZZ_genl_family); 
+} + +/* + * Magic: provide conversion functions {{{1 + * populate skb from struct. + * {{{2 + */ + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \ + const bool exclude_sensitive) \ +{ \ + struct nlattr *tla = nla_nest_start(skb, tag_number); \ + if (!tla) \ + goto nla_put_failure; \ + DPRINT_TLA(#s_name, "-=>", #tag_name); \ + s_fields \ + nla_nest_end(skb, tla); \ + return 0; \ + \ +nla_put_failure: \ + if (tla) \ + nla_nest_cancel(skb, tla); \ + return -EMSGSIZE; \ +} \ +static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \ + struct s_name *s) \ +{ \ + return s_name ## _to_skb(skb, s, 0); \ +} \ +static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \ + struct s_name *s) \ +{ \ + return s_name ## _to_skb(skb, s, 1); \ +} + + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ + DPRINT_FIELD(">>", nla_type, name, s, NULL); \ + __put(skb, attr_nr, s->name); \ + } + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \ + DPRINT_ARRAY(">>",nla_type, name, s, NULL); \ + __put(skb, attr_nr, min_t(int, maxlen, \ + s->name ## _len + (nla_type == NLA_NUL_STRING)),\ + s->name); \ + } + +#include GENL_MAGIC_INCLUDE_FILE + + +/* Functions for initializing structs to default values. 
*/ + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) +#undef __u32_field_def +#define __u32_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __s32_field_def +#define __s32_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __flg_field_def +#define __flg_field_def(attr_nr, attr_flag, name, default) \ + x->name = default; +#undef __str_field_def +#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ + memset(x->name, 0, sizeof(x->name)); \ + x->name ## _len = 0; +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \ +static void set_ ## s_name ## _defaults(struct s_name *x) { \ +s_fields \ +} + +#include GENL_MAGIC_INCLUDE_FILE + +#endif /* __KERNEL__ */ + +/* }}}1 */ +#endif /* GENL_MAGIC_FUNC_H */ +/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ diff -Nru drbd8-8.3.7/drbd/linux/genl_magic_struct.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/genl_magic_struct.h --- drbd8-8.3.7/drbd/linux/genl_magic_struct.h 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/genl_magic_struct.h 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,278 @@ +#ifndef GENL_MAGIC_STRUCT_H +#define GENL_MAGIC_STRUCT_H + +#ifndef GENL_MAGIC_FAMILY +# error "you need to define GENL_MAGIC_FAMILY before inclusion" +#endif + +#ifndef GENL_MAGIC_VERSION +# error "you need to define GENL_MAGIC_VERSION before inclusion" +#endif + +#ifndef GENL_MAGIC_INCLUDE_FILE +# error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion" +#endif + +#include +#include +#include + +#define CONCAT__(a,b) a ## b +#define CONCAT_(a,b) CONCAT__(a,b) + +extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void); +extern void 
CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void); + +/* + * Extension of genl attribute validation policies {{{2 + */ + +/* + * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not + * know about. This flag can be set in nlattr->nla_type to indicate that this + * attribute must not be ignored. + * + * We check and remove this flag in drbd_nla_check_mandatory() before + * validating the attribute types and lengths via nla_parse_nested(). + */ +#define DRBD_GENLA_F_MANDATORY (1 << 14) + +/* + * Flags specific to drbd and not visible at the netlink layer, used in + * _from_attrs and _to_skb: + * + * @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is + * invalid. + * + * @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be + * included in unpriviledged get requests or broadcasts. + * + * @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but + * cannot subsequently be changed. + */ +#define DRBD_F_REQUIRED (1 << 0) +#define DRBD_F_SENSITIVE (1 << 1) +#define DRBD_F_INVARIANT (1 << 2) + +#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) + +/* }}}1 + * MAGIC + * multi-include macro expansion magic starts here + */ + +/* MAGIC helpers {{{2 */ + +/* possible field types */ +#define __flg_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U8, char, \ + nla_get_u8, NLA_PUT_U8, false) +#define __u8_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \ + nla_get_u8, NLA_PUT_U8, false) +#define __u16_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U16, __u16, \ + nla_get_u16, NLA_PUT_U16, false) +#define __u32_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U32, __u32, \ + nla_get_u32, NLA_PUT_U32, false) +#define __s32_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U32, __s32, \ + nla_get_u32, NLA_PUT_U32, true) 
+#define __u64_field(attr_nr, attr_flag, name) \ + __field(attr_nr, attr_flag, name, NLA_U64, __u64, \ + nla_get_u64, NLA_PUT_U64, false) +#define __str_field(attr_nr, attr_flag, name, maxlen) \ + __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ + nla_strlcpy, NLA_PUT, false) +#define __bin_field(attr_nr, attr_flag, name, maxlen) \ + __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ + nla_memcpy, NLA_PUT, false) + +/* fields with default values */ +#define __flg_field_def(attr_nr, attr_flag, name, default) \ + __flg_field(attr_nr, attr_flag, name) +#define __u32_field_def(attr_nr, attr_flag, name, default) \ + __u32_field(attr_nr, attr_flag, name) +#define __s32_field_def(attr_nr, attr_flag, name, default) \ + __s32_field(attr_nr, attr_flag, name) +#define __str_field_def(attr_nr, attr_flag, name, maxlen) \ + __str_field(attr_nr, attr_flag, name, maxlen) + +#define GENL_op_init(args...) args +#define GENL_doit(handler) \ + .doit = handler, \ + .flags = GENL_ADMIN_PERM, +#define GENL_dumpit(handler) \ + .dumpit = handler, \ + .flags = GENL_ADMIN_PERM, + +/* }}}1 + * Magic: define the enum symbols for genl_ops + * Magic: define the enum symbols for top level attributes + * Magic: define the enum symbols for nested attributes + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +#undef GENL_mc_group +#define GENL_mc_group(group) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ + op_name = op_num, + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, tla_list) \ + op_name = op_num, + +enum { +#include GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + tag_name = tag_number, + +enum { +#include 
GENL_MAGIC_INCLUDE_FILE +}; + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +enum { \ + s_fields \ +}; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, \ + __get, __put, __is_signed) \ + T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, \ + maxlen, __get, __put, __is_signed) \ + T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 + * Magic: compile time assert unique numbers for operations + * Magic: -"- unique numbers for top level attributes + * Magic: -"- unique numbers for nested attributes + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) \ + case op_name: + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) \ + case op_name: + +static inline void ct_assert_unique_operations(void) +{ + switch (0) { +#include GENL_MAGIC_INCLUDE_FILE + ; + } +} + +#undef GENL_op +#define GENL_op(op_name, op_num, handler, attr_list) + +#undef GENL_notification +#define GENL_notification(op_name, op_num, mcast_group, tla_list) + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ + case tag_number: + +static inline void ct_assert_unique_top_level_attributes(void) +{ + switch (0) { +#include GENL_MAGIC_INCLUDE_FILE + ; + } +} + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +static inline void ct_assert_unique_ ## s_name ## _attributes(void) \ +{ \ + switch (0) { \ + s_fields \ + ; \ + } \ +} + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + case attr_nr: + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ 
+ case attr_nr: + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 + * Magic: declare structs + * struct { + * fields + * }; + * {{{2 + */ + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +struct s_name { s_fields }; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + __is_signed) \ + type name; + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, __is_signed) \ + type name[maxlen]; \ + __u32 name ## _len; + +#include GENL_MAGIC_INCLUDE_FILE + +#undef GENL_struct +#define GENL_struct(tag_name, tag_number, s_name, s_fields) \ +enum { \ + s_fields \ +}; + +#undef __field +#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \ + is_signed) \ + F_ ## name ## _IS_SIGNED = is_signed, + +#undef __array +#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \ + __get, __put, is_signed) \ + F_ ## name ## _IS_SIGNED = is_signed, + +#include GENL_MAGIC_INCLUDE_FILE + +/* }}}1 */ +#endif /* GENL_MAGIC_STRUCT_H */ +/* vim: set foldmethod=marker nofoldenable : */ diff -Nru drbd8-8.3.7/drbd/linux/hardirq.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/hardirq.h --- drbd8-8.3.7/drbd/linux/hardirq.h 2009-07-27 08:47:42.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/hardirq.h 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -/* Just an empty file. 
*/ diff -Nru drbd8-8.3.7/drbd/linux/lru_cache.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/lru_cache.h --- drbd8-8.3.7/drbd/linux/lru_cache.h 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/lru_cache.h 2012-02-02 14:09:14.000000000 +0000 @@ -32,29 +32,28 @@ #include /* for memset */ #include -/* { compatibility crap */ - -/* needs to be included here, - * because of various old kernel compatibility wrappers */ -#include -#ifdef USE_KMEM_CACHE_S -#define kmem_cache kmem_cache_s -#endif - -#ifdef NEED_BACKPORT_OF_KZALLOC -static inline void *kzalloc(size_t size, int flags) +/* Compatibility code */ +#include "compat.h" +#ifndef COMPAT_HAVE_CLEAR_BIT_UNLOCK +static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) { - void *rv = kmalloc(size, flags); - if (rv) - memset(rv, 0, size); - - return rv; +#if defined(__x86_64__) || defined(__i386__) || defined(__arch_um__) + barrier(); +#else + smp_mb(); /* Be on the save side for alpha, and others */ +#endif + clear_bit(nr, addr); } -#undef NEED_BACKPORT_OF_KZALLOC #endif - -/* } compatibility crap */ - +#ifndef COMPAT_HAVE_BOOL_TYPE +typedef _Bool bool; +enum { + false = 0, + true = 1 +}; +#define COMPAT_HAVE_BOOL_TYPE +#endif +/* End of Compatibility code */ /* This header file (and its .c file; kernel-doc of functions see there) @@ -88,7 +87,7 @@ usually the condition is softened to regions that _may_ have been target of in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent bitmap, trading frequency of meta data transactions against amount of - (possibly unneccessary) resync traffic. + (possibly unnecessary) resync traffic. If we set a hard limit on the area that may be "hot" at any given time, we limit the amount of resync traffic needed for crash recovery. 
@@ -163,16 +162,16 @@ * .list is on one of three lists: * in_use: currently in use (refcnt > 0, lc_number != LC_FREE) * lru: unused but ready to be reused or recycled - * (ts_refcnt == 0, lc_number != LC_FREE), + * (lc_refcnt == 0, lc_number != LC_FREE), * free: unused but ready to be recycled - * (ts_refcnt == 0, lc_number == LC_FREE), + * (lc_refcnt == 0, lc_number == LC_FREE), * * an element is said to be "in the active set", * if either on "in_use" or "lru", i.e. lc_number != LC_FREE. * * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache * (total memory usage 2 pages), and up to 3833 elements on the act_log - * lru_cache, totalling ~215 kB for 64bit architechture, ~53 pages. + * lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages. * * We usually do not actually free these objects again, but only "recycle" * them, as the change "index: -old_label, +LC_FREE" would need a transaction @@ -184,15 +183,17 @@ struct hlist_node colision; struct list_head list; /* LRU list or free list */ unsigned refcnt; - /* back "pointer" into ts_cache->element[index], - * for paranoia, and for "ts_element_to_index" */ + /* back "pointer" into lc_cache->element[index], + * for paranoia, and for "lc_element_to_index" */ unsigned lc_index; /* if we want to track a larger set of objects, * it needs to become arch independend u64 */ unsigned lc_number; - /* special label when on free list */ #define LC_FREE (~0U) + + /* for pending changes */ + unsigned lc_new_number; }; struct lru_cache { @@ -200,6 +201,7 @@ struct list_head lru; struct list_head free; struct list_head in_use; + struct list_head to_be_changed; /* the pre-created kmem cache to allocate the objects from */ struct kmem_cache *lc_cache; @@ -210,26 +212,27 @@ size_t element_off; /* number of elements (indices) */ - unsigned int nr_elements; + unsigned int nr_elements; /* Arbitrary limit on maximum tracked objects. Practical limit is much * lower due to allocation failures, probably. 
For typical use cases, * nr_elements should be a few thousand at most. - * This also limits the maximum value of ts_element.ts_index, allowing the - * 8 high bits of .ts_index to be overloaded with flags in the future. */ + * This also limits the maximum value of lc_element.lc_index, allowing the + * 8 high bits of .lc_index to be overloaded with flags in the future. */ #define LC_MAX_ACTIVE (1<<24) + /* allow to accumulate a few (index:label) changes, + * but no more than max_pending_changes */ + unsigned int max_pending_changes; + /* number of elements currently on to_be_changed list */ + unsigned int pending_changes; + /* statistics */ - unsigned used; /* number of lelements currently on in_use list */ - unsigned long hits, misses, starving, dirty, changed; + unsigned used; /* number of elements currently on in_use list */ + unsigned long hits, misses, starving, locked, changed; /* see below: flag-bits for lru_cache */ unsigned long flags; - /* when changing the label of an index element */ - unsigned int new_number; - - /* for paranoia when changing the label of an index element */ - struct lc_element *changing_element; void *lc_private; const char *name; @@ -245,10 +248,15 @@ /* debugging aid, to catch concurrent access early. * user needs to guarantee exclusive access by proper locking! */ __LC_PARANOIA, - /* if we need to change the set, but currently there is a changing - * transaction pending, we are "dirty", and must deferr further - * changing requests */ + + /* annotate that the set is "dirty", possibly accumulating further + * changes, until a transaction is finally triggered */ __LC_DIRTY, + + /* Locked, no further changes allowed. + * Also used to serialize changing transactions. 
*/ + __LC_LOCKED, + /* if we need to change the set, but currently there is no free nor * unused element available, we are "starving", and must not give out * further references, to guarantee that eventually some refcnt will @@ -260,9 +268,11 @@ }; #define LC_PARANOIA (1<<__LC_PARANOIA) #define LC_DIRTY (1<<__LC_DIRTY) +#define LC_LOCKED (1<<__LC_LOCKED) #define LC_STARVING (1<<__LC_STARVING) extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off); extern void lc_reset(struct lru_cache *lc); extern void lc_destroy(struct lru_cache *lc); @@ -273,7 +283,7 @@ extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); -extern void lc_changed(struct lru_cache *lc, struct lc_element *e); +extern void lc_committed(struct lru_cache *lc); struct seq_file; extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); @@ -282,32 +292,40 @@ void (*detail) (struct seq_file *, struct lc_element *)); /** - * lc_try_lock - can be used to stop lc_get() from changing the tracked set + * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set * @lc: the lru cache to operate on * - * Note that the reference counts and order on the active and lru lists may - * still change. Returns true if we aquired the lock. + * Allows (expects) the set to be "dirty". Note that the reference counts and + * order on the active and lru lists may still change. Used to serialize + * changing transactions. Returns true if we aquired the lock. 
*/ -static inline int lc_try_lock(struct lru_cache *lc) +static inline int lc_try_lock_for_transaction(struct lru_cache *lc) { - return !test_and_set_bit(__LC_DIRTY, &lc->flags); + return !test_and_set_bit(__LC_LOCKED, &lc->flags); } /** + * lc_try_lock - variant to stop lc_get() from changing the tracked set + * @lc: the lru cache to operate on + * + * Note that the reference counts and order on the active and lru lists may + * still change. Only works on a "clean" set. Returns true if we aquired the + * lock, which means there are no pending changes, and any further attempt to + * change the set will not succeed until the next lc_unlock(). + */ +extern int lc_try_lock(struct lru_cache *lc); + +/** * lc_unlock - unlock @lc, allow lc_get() to change the set again * @lc: the lru cache to operate on */ static inline void lc_unlock(struct lru_cache *lc) { clear_bit(__LC_DIRTY, &lc->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(__LC_LOCKED, &lc->flags); } -static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) -{ - struct lc_element *e = lc_find(lc, enr); - return e && e->refcnt; -} +extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); #define lc_entry(ptr, type, member) \ container_of(ptr, type, member) diff -Nru drbd8-8.3.7/drbd/linux/memcontrol.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/memcontrol.h --- drbd8-8.3.7/drbd/linux/memcontrol.h 2009-07-27 08:47:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/memcontrol.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -/* just an empty file - * memcontrol.h did not exist prior to 2.6.25. - * but it needs more recent kernels for mm_inline.h to work. */ diff -Nru drbd8-8.3.7/drbd/linux/mutex.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/mutex.h --- drbd8-8.3.7/drbd/linux/mutex.h 2009-07-27 08:47:42.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/mutex.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,37 +0,0 @@ -/* "Backport" of the mutex to older Linux-2.6.x kernels. 
- */ -#ifndef __LINUX_MUTEX_H -#define __LINUX_MUTEX_H - -#include - -struct mutex { - struct semaphore sem; -}; - -static inline void mutex_init(struct mutex *m) -{ - sema_init(&m->sem, 1); -} - -static inline void mutex_lock(struct mutex *m) -{ - down(&m->sem); -} - -static inline int mutex_lock_interruptible(struct mutex *m) -{ - return down_interruptible(&m->sem); -} - -static inline void mutex_unlock(struct mutex *m) -{ - up(&m->sem); -} - -static inline int mutex_is_locked(struct mutex *lock) -{ - return atomic_read(&lock->sem.count) != 1; -} - -#endif diff -Nru drbd8-8.3.7/drbd/linux/tracepoint.h drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/tracepoint.h --- drbd8-8.3.7/drbd/linux/tracepoint.h 2009-07-27 08:47:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/linux/tracepoint.h 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -struct tracepoint; diff -Nru drbd8-8.3.7/drbd/lru_cache.c drbd8-8.4.1+git55a81dc~cmd1/drbd/lru_cache.c --- drbd8-8.3.7/drbd/lru_cache.c 2009-11-25 09:06:43.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd/lru_cache.c 2012-02-02 14:09:14.000000000 +0000 @@ -39,8 +39,8 @@ } while (0) #define RETURN(x...) 
do { \ - clear_bit(__LC_PARANOIA, &lc->flags); \ - smp_mb__after_clear_bit(); return x ; } while (0) + clear_bit_unlock(__LC_PARANOIA, &lc->flags); \ + return x ; } while (0) /* BUG() if e is not one of the elements tracked by lc */ #define PARANOIA_LC_ELEMENT(lc, e) do { \ @@ -50,9 +50,40 @@ BUG_ON(i >= lc_->nr_elements); \ BUG_ON(lc_->lc_element[i] != e_); } while (0) + +/* We need to atomically + * - try to grab the lock (set LC_LOCKED) + * - only if there is no pending transaction + * (neither LC_DIRTY nor LC_STARVING is set) + * Because of PARANOIA_ENTRY() above abusing lc->flags as well, + * it is not sufficient to just say + * return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED); + */ +int lc_try_lock(struct lru_cache *lc) +{ + unsigned long val; + do { + val = cmpxchg(&lc->flags, 0, LC_LOCKED); + } while (unlikely (val == LC_PARANOIA)); + /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */ + return 0 == val; +#if 0 + /* Alternative approach, spin in case someone enters or leaves a + * PARANOIA_ENTRY()/RETURN() section. */ + unsigned long old, new, val; + do { + old = lc->flags & LC_PARANOIA; + new = old | LC_LOCKED; + val = cmpxchg(&lc->flags, old, new); + } while (unlikely (val == (old ^ LC_PARANOIA))); + return old == val; +#endif +} + /** * lc_create - prepares to track objects in an active set * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details + * @max_pending_changes: maximum changes to accumulate until a transaction is required * @e_count: number of elements allowed to be active simultaneously * @e_size: size of the tracked objects * @e_off: offset to the &struct lc_element member in a tracked object @@ -61,6 +92,7 @@ * or NULL on (allocation) failure. 
*/ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned max_pending_changes, unsigned e_count, size_t e_size, size_t e_off) { struct hlist_head *slot = NULL; @@ -79,7 +111,7 @@ if (e_count > LC_MAX_ACTIVE) return NULL; - slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL); + slot = kcalloc(e_count, sizeof(struct hlist_head), GFP_KERNEL); if (!slot) goto out_fail; element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL); @@ -93,12 +125,13 @@ INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); + INIT_LIST_HEAD(&lc->to_be_changed); lc->name = name; lc->element_size = e_size; lc->element_off = e_off; lc->nr_elements = e_count; - lc->new_number = LC_FREE; + lc->max_pending_changes = max_pending_changes; lc->lc_cache = cache; lc->lc_element = element; lc->lc_slot = slot; @@ -112,6 +145,7 @@ e = p + e_off; e->lc_index = i; e->lc_number = LC_FREE; + e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); element[i] = e; } @@ -170,15 +204,15 @@ INIT_LIST_HEAD(&lc->in_use); INIT_LIST_HEAD(&lc->lru); INIT_LIST_HEAD(&lc->free); + INIT_LIST_HEAD(&lc->to_be_changed); lc->used = 0; lc->hits = 0; lc->misses = 0; lc->starving = 0; - lc->dirty = 0; + lc->locked = 0; lc->changed = 0; + lc->pending_changes = 0; lc->flags = 0; - lc->changing_element = NULL; - lc->new_number = LC_FREE; memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); for (i = 0; i < lc->nr_elements; i++) { @@ -189,6 +223,7 @@ /* re-init it */ e->lc_index = i; e->lc_number = LC_FREE; + e->lc_new_number = LC_FREE; list_add(&e->list, &lc->free); } } @@ -203,14 +238,14 @@ /* NOTE: * total calls to lc_get are * (starving + hits + misses) - * misses include "dirty" count (update from an other thread in + * misses include "locked" count (update from an other thread in * progress) and "changed", when this in fact lead to an successful * update of the cache. 
*/ return seq_printf(seq, "\t%s: used:%u/%u " - "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", + "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", lc->name, lc->used, lc->nr_elements, - lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); + lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) @@ -219,16 +254,8 @@ } -/** - * lc_find - find element by label, if present in the hash table - * @lc: The lru_cache object - * @enr: element number - * - * Returns the pointer to an element, if the element with the requested - * "label" or element number is present in the hash table, - * or NULL if not found. Does not change the refcnt. - */ -struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) +static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, + bool include_changing) { struct hlist_node *n; struct lc_element *e; @@ -236,29 +263,48 @@ BUG_ON(!lc); BUG_ON(!lc->nr_elements); hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { - if (e->lc_number == enr) + /* "about to be changed" elements, pending transaction commit, + * are hashed by their "new number". "Normal" elements have + * lc_number == lc_new_number. */ + if (e->lc_new_number != enr) + continue; + if (e->lc_new_number == e->lc_number || include_changing) return e; + break; } return NULL; } -/* returned element will be "recycled" immediately */ -static struct lc_element *lc_evict(struct lru_cache *lc) +/** + * lc_find - find element by label, if present in the hash table + * @lc: The lru_cache object + * @enr: element number + * + * Returns the pointer to an element, if the element with the requested + * "label" or element number is present in the hash table, + * or NULL if not found. Does not change the refcnt. + * Ignores elements that are "about to be used", i.e. not yet in the active + * set, but still pending transaction commit. 
+ */ +struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) { - struct list_head *n; - struct lc_element *e; - - if (list_empty(&lc->lru)) - return NULL; - - n = lc->lru.prev; - e = list_entry(n, struct lc_element, list); - - PARANOIA_LC_ELEMENT(lc, e); + return __lc_find(lc, enr, 0); +} - list_del(&e->list); - hlist_del(&e->colision); - return e; +/** + * lc_is_used - find element by label + * @lc: The lru_cache object + * @enr: element number + * + * Returns true, if the element with the requested "label" or element number is + * present in the hash table, and is used (refcnt > 0). + * Also finds elements that are not _currently_ used but only "about to be + * used", i.e. on the "to_be_changed" list, pending transaction commit. + */ +bool lc_is_used(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e = __lc_find(lc, enr, 1); + return e && e->refcnt; } /** @@ -275,22 +321,34 @@ PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt); - e->lc_number = LC_FREE; + e->lc_number = e->lc_new_number = LC_FREE; hlist_del_init(&e->colision); list_move(&e->list, &lc->free); RETURN(); } -static struct lc_element *lc_get_unused_element(struct lru_cache *lc) +static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number) { struct list_head *n; + struct lc_element *e; - if (list_empty(&lc->free)) - return lc_evict(lc); + if (!list_empty(&lc->free)) + n = lc->free.next; + else if (!list_empty(&lc->lru)) + n = lc->lru.prev; + else + return NULL; + + e = list_entry(n, struct lc_element, list); + PARANOIA_LC_ELEMENT(lc, e); + + e->lc_new_number = new_number; + if (!hlist_unhashed(&e->colision)) + __hlist_del(&e->colision); + hlist_add_head(&e->colision, lc_hash_slot(lc, new_number)); + list_move(&e->list, &lc->to_be_changed); - n = lc->free.next; - list_del(n); - return list_entry(n, struct lc_element, list); + return e; } static int lc_unused_element_available(struct lru_cache *lc) @@ -303,45 +361,7 @@ return 0; } - -/** - * lc_get - 
get element by label, maybe change the active set - * @lc: the lru cache to operate on - * @enr: the label to look up - * - * Finds an element in the cache, increases its usage count, - * "touches" and returns it. - * - * In case the requested number is not present, it needs to be added to the - * cache. Therefore it is possible that an other element becomes evicted from - * the cache. In either case, the user is notified so he is able to e.g. keep - * a persistent log of the cache changes, and therefore the objects in use. - * - * Return values: - * NULL - * The cache was marked %LC_STARVING, - * or the requested label was not in the active set - * and a changing transaction is still pending (@lc was marked %LC_DIRTY). - * Or no unused or free element could be recycled (@lc will be marked as - * %LC_STARVING, blocking further lc_get() operations). - * - * pointer to the element with the REQUESTED element number. - * In this case, it can be used right away - * - * pointer to an UNUSED element with some different element number, - * where that different number may also be %LC_FREE. - * - * In this case, the cache is marked %LC_DIRTY (blocking further changes), - * and the returned element pointer is removed from the lru list and - * hash collision chains. The user now should do whatever housekeeping - * is necessary. - * Then he must call lc_changed(lc,element_pointer), to finish - * the change. - * - * NOTE: The user needs to check the lc_number on EACH use, so he recognizes - * any cache set change. - */ -struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) { struct lc_element *e; @@ -351,8 +371,12 @@ RETURN(NULL); } - e = lc_find(lc, enr); - if (e) { + e = __lc_find(lc, enr, 1); + /* if lc_new_number != lc_number, + * this enr is currently being pulled in already, + * and will be available once the pending transaction + * has been committed. 
*/ + if (e && e->lc_new_number == e->lc_number) { ++lc->hits; if (e->refcnt++ == 0) lc->used++; @@ -361,6 +385,26 @@ } ++lc->misses; + if (!may_change) + RETURN(NULL); + + /* It has been found above, but on the "to_be_changed" list, not yet + * committed. Don't pull it in twice, wait for the transaction, then + * try again */ + if (e) + RETURN(NULL); + + /* To avoid races with lc_try_lock(), first, mark us dirty + * (using test_and_set_bit, as it implies memory barriers), ... */ + test_and_set_bit(__LC_DIRTY, &lc->flags); + + /* ... only then check if it is locked anyways. If lc_unlock clears + * the dirty bit again, that's not a problem, we will come here again. + */ + if (test_bit(__LC_LOCKED, &lc->flags)) { + ++lc->locked; + RETURN(NULL); + } /* In case there is nothing available and we can not kick out * the LRU element, we have to wait ... @@ -370,71 +414,109 @@ RETURN(NULL); } - /* it was not present in the active set. - * we are going to recycle an unused (or even "free") element. - * user may need to commit a transaction to record that change. - * we serialize on flags & TF_DIRTY */ - if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { - ++lc->dirty; + /* It was not present in the active set. We are going to recycle an + * unused (or even "free") element, but we won't accumulate more than + * max_pending_changes changes. */ + if (lc->pending_changes >= lc->max_pending_changes) RETURN(NULL); - } - e = lc_get_unused_element(lc); + e = lc_prepare_for_change(lc, enr); BUG_ON(!e); clear_bit(__LC_STARVING, &lc->flags); BUG_ON(++e->refcnt != 1); lc->used++; - - lc->changing_element = e; - lc->new_number = enr; + lc->pending_changes++; RETURN(e); } -/* similar to lc_get, - * but only gets a new reference on an existing element. - * you either get the requested element, or NULL. - * will be consolidated into one function. 
+/** + * lc_get - get element by label, maybe change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * In case the requested number is not present, it needs to be added to the + * cache. Therefore it is possible that an other element becomes evicted from + * the cache. In either case, the user is notified so he is able to e.g. keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * and a changing transaction is still pending (@lc was marked %LC_DIRTY). + * Or no unused or free element could be recycled (@lc will be marked as + * %LC_STARVING, blocking further lc_get() operations). + * + * pointer to the element with the REQUESTED element number. + * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number, + * where that different number may also be %LC_FREE. + * + * In this case, the cache is marked %LC_DIRTY, + * so lc_try_lock() will no longer succeed. + * The returned element pointer is moved to the "to_be_changed" list, + * and registered with the new element number on the hash collision chains, + * so it is possible to pick it up from lc_is_used(). + * Up to "max_pending_changes" (see lc_create()) can be accumulated. + * The user now should do whatever housekeeping is necessary, + * typically serialize on lc_try_lock_for_transaction(), then call + * lc_committed(lc) and lc_unlock(), to finish the change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. 
*/ -struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) +struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - struct lc_element *e; - - PARANOIA_ENTRY(); - if (lc->flags & LC_STARVING) { - ++lc->starving; - RETURN(NULL); - } + return __lc_get(lc, enr, 1); +} - e = lc_find(lc, enr); - if (e) { - ++lc->hits; - if (e->refcnt++ == 0) - lc->used++; - list_move(&e->list, &lc->in_use); /* Not evictable... */ - } - RETURN(e); +/** + * lc_try_get - get element by label, if present; do not change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * + * pointer to the element with the REQUESTED element number. + * In this case, it can be used right away + */ +struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, 0); } /** - * lc_changed - tell @lc that the change has been recorded + * lc_committed - tell @lc that pending changes have been recorded * @lc: the lru cache to operate on - * @e: the element pending label change + * + * User is expected to serialize on explicit lc_try_lock_for_transaction() + * before the transaction is started, and later needs to lc_unlock() explicitly + * as well. 
*/ -void lc_changed(struct lru_cache *lc, struct lc_element *e) +void lc_committed(struct lru_cache *lc) { + struct lc_element *e, *tmp; + PARANOIA_ENTRY(); - BUG_ON(e != lc->changing_element); - PARANOIA_LC_ELEMENT(lc, e); - ++lc->changed; - e->lc_number = lc->new_number; - list_add(&e->list, &lc->in_use); - hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); - lc->changing_element = NULL; - lc->new_number = LC_FREE; - clear_bit(__LC_DIRTY, &lc->flags); - smp_mb__after_clear_bit(); + list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) { + /* count number of changes, not number of transactions */ + ++lc->changed; + e->lc_number = e->lc_new_number; + list_move(&e->list, &lc->in_use); + } + lc->pending_changes = 0; RETURN(); } @@ -453,13 +535,12 @@ PARANOIA_ENTRY(); PARANOIA_LC_ELEMENT(lc, e); BUG_ON(e->refcnt == 0); - BUG_ON(e == lc->changing_element); + BUG_ON(e->lc_number != e->lc_new_number); if (--e->refcnt == 0) { /* move it to the front of LRU. */ list_move(&e->list, &lc->lru); lc->used--; - clear_bit(__LC_STARVING, &lc->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(__LC_STARVING, &lc->flags); } RETURN(e->refcnt); } @@ -499,16 +580,24 @@ void lc_set(struct lru_cache *lc, unsigned int enr, int index) { struct lc_element *e; + struct list_head *lh; if (index < 0 || index >= lc->nr_elements) return; e = lc_element_by_index(lc, index); - e->lc_number = enr; + BUG_ON(e->lc_number != e->lc_new_number); + BUG_ON(e->refcnt != 0); + e->lc_number = e->lc_new_number = enr; hlist_del_init(&e->colision); - hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); - list_move(&e->list, e->refcnt ? 
&lc->in_use : &lc->lru); + if (enr == LC_FREE) + lh = &lc->free; + else { + hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); + lh = &lc->lru; + } + list_move(&e->list, lh); } /** @@ -538,18 +627,3 @@ } } } - -EXPORT_SYMBOL(lc_create); -EXPORT_SYMBOL(lc_reset); -EXPORT_SYMBOL(lc_destroy); -EXPORT_SYMBOL(lc_set); -EXPORT_SYMBOL(lc_del); -EXPORT_SYMBOL(lc_try_get); -EXPORT_SYMBOL(lc_find); -EXPORT_SYMBOL(lc_get); -EXPORT_SYMBOL(lc_put); -EXPORT_SYMBOL(lc_changed); -EXPORT_SYMBOL(lc_element_by_index); -EXPORT_SYMBOL(lc_index_of); -EXPORT_SYMBOL(lc_seq_printf_stats); -EXPORT_SYMBOL(lc_seq_dump_details); diff -Nru drbd8-8.3.7/drbd-kernel.spec.in drbd8-8.4.1+git55a81dc~cmd1/drbd-kernel.spec.in --- drbd8-8.3.7/drbd-kernel.spec.in 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd-kernel.spec.in 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,110 @@ +Name: drbd-kernel +Summary: Kernel driver for DRBD +Version: @PACKAGE_VERSION@ +Release: 1%{?dist} +Source: http://oss.linbit.com/%{name}/8.3/drbd-%{version}.tar.gz +License: GPLv2+ +Group: System Environment/Kernel +URL: http://www.drbd.org/ +BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +%if ! %{defined suse_version} +BuildRequires: redhat-rpm-config +%endif +%if %{defined kernel_module_package_buildreqs} +BuildRequires: %kernel_module_package_buildreqs +%endif + +%description +This module is the kernel-dependent driver for DRBD. This is split out so +that multiple kernel driver versions can be installed, one for each +installed kernel. + +%prep +%setup -q -n drbd-%{version} + +%if %{defined suse_kernel_module_package} +# Support also sles10, where kernel_module_package was not yet defined. +# In sles11, suse_k_m_p became a wrapper around k_m_p. + +%if 0%{?suse_version} < 1110 +# We need to exclude some flavours on sles10 etc, +# or we hit an rpm internal buffer limit. 
+%suse_kernel_module_package -n drbd -p preamble -f filelist-suse kdump kdumppae vmi vmipae um +%else +%suse_kernel_module_package -n drbd -p preamble -f filelist-suse +%endif +%else +# Concept stolen from sles kernel-module-subpackage: +# include the kernel version in the package version, +# so we can have more than one kmod-drbd. +# Needed, because even though kABI is still "compatible" in RHEL 6.0 to 6.1, +# the actual functionality differs very much: 6.1 does no longer do BARRIERS, +# but wants FLUSH/FUA instead. +# For convenience, we want both 6.0 and 6.1 in the same repository, +# and have yum/rpm figure out via dependencies, which kmod version should be installed. +# This is a dirty hack, non generic, and should probably be enclosed in some "if-on-rhel6". +%define _this_kmp_version %{version}_%(echo %kernel_version | sed -r 'y/-/_/; s/\.el.\.(x86_64|i.86)$//;') +%kernel_module_package -v %_this_kmp_version -n drbd -p preamble -f filelist-redhat +%endif + +%build +rm -rf obj +mkdir obj +ln -s ../scripts obj/ + +for flavor in %flavors_to_build; do + cp -r drbd obj/$flavor + #make -C %{kernel_source $flavor} M=$PWD/obj/$flavor + make -C obj/$flavor %{_smp_mflags} all KDIR=%{kernel_source $flavor} +done + +%install +export INSTALL_MOD_PATH=$RPM_BUILD_ROOT + +%if %{defined kernel_module_package_moddir} +export INSTALL_MOD_DIR=%{kernel_module_package_moddir drbd} +%else +%if %{defined suse_kernel_module_package} +export INSTALL_MOD_DIR=updates +%else +export INSTALL_MOD_DIR=extra/drbd +%endif +%endif + +# Very likely kernel_module_package_moddir did ignore the parameter, +# so we just append it here. The weak-modules magic expects that location. 
+[ $INSTALL_MOD_DIR = extra ] && INSTALL_MOD_DIR=extra/drbd + +for flavor in %flavors_to_build ; do + make -C %{kernel_source $flavor} modules_install \ + M=$PWD/obj/$flavor + kernelrelease=$(make -s -C %{kernel_source $flavor} kernelrelease) + mv obj/$flavor/.kernel.config.gz obj/k-config-$kernelrelease.gz +done + +%if %{defined suse_kernel_module_package} +# On SUSE, putting the modules into the default path determined by +# %kernel_module_package_moddir is enough to give them priority over +# shipped modules. +rm -f drbd.conf +%else +mkdir -p $RPM_BUILD_ROOT/etc/depmod.d +echo "override drbd * weak-updates" \ + > $RPM_BUILD_ROOT/etc/depmod.d/drbd.conf +%endif + +%clean +rm -rf %{buildroot} + +%changelog +* Tue Dec 20 2011 Philipp Reisner - 8.4.1-1 +- New upstream release. + +* Mon Jul 18 2011 Philipp Reisner - 8.4.0-1 +- New upstream release. + +* Fri Jan 28 2011 Philipp Reisner - 8.3.10-1 +- New upstream release. + +* Wed Nov 25 2010 Andreas Gruenbacher - 8.3.9-1 +- Convert to a Kernel Module Package. diff -Nru drbd8-8.3.7/drbd-km.spec.in drbd8-8.4.1+git55a81dc~cmd1/drbd-km.spec.in --- drbd8-8.3.7/drbd-km.spec.in 2010-01-07 09:09:33.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd-km.spec.in 2012-02-02 14:09:14.000000000 +0000 @@ -10,7 +10,7 @@ Name: @PACKAGE_TARNAME@-km Summary: DRBD driver for Linux Version: @PACKAGE_VERSION@ -Release: 12@RPM_DIST_TAG@ +Release: 1@RPM_DIST_TAG@ Source: http://oss.linbit.com/%{name}/8.3/drbd-%{version}.tar.gz License: GPLv2+ ExclusiveOS: linux @@ -100,6 +100,30 @@ %changelog +* Tue Dec 20 2011 Philipp Reisner - 8.4.1-1 +- New upstream release. + +* Mon Jul 18 2011 Philipp Reisner - 8.4.0-1 +- New upstream release. + +* Fri Jan 28 2011 Philipp Reisner - 8.3.10-1 +- New upstream release. + +* Fri Oct 22 2010 Philipp Reisner - 8.3.9-1 +- New upstream release. + +* Wed Jun 2 2010 Philipp Reisner - 8.3.8-1 +- New upstream release. + +* Thu Jan 13 2010 Philipp Reisner - 8.3.7-1 +- New upstream release. 
+ +* Thu Nov 8 2009 Philipp Reisner - 8.3.6-1 +- New upstream release. + +* Thu Oct 27 2009 Philipp Reisner - 8.3.5-1 +- New upstream release. + * Wed Oct 21 2009 Florian Haas - 8.3.4-12 - Packaging makeover. diff -Nru drbd8-8.3.7/drbd.spec.in drbd8-8.4.1+git55a81dc~cmd1/drbd.spec.in --- drbd8-8.3.7/drbd.spec.in 2010-01-13 16:16:02.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd.spec.in 2012-02-02 14:09:14.000000000 +0000 @@ -21,6 +21,7 @@ %bcond_without bashcompletion # --with xen is ignored on any non-x86 architecture %bcond_without xen +%bcond_without legacy_utils %ifnarch %{ix86} x86_64 %global _without_xen --without-xen %endif @@ -49,9 +50,12 @@ %if %{with pacemaker} Requires: %{name}-pacemaker = %{version} %endif -%if %{with rgmanager} -Requires: %{name}-rgmanager = %{version} -%endif +## %if %{with rgmanager} +## ## No. +## ## We don't want to annoy the majority of our userbase on pacemaker +## ## by pulling in the full rgmanager stack via drbd-rgmanager as well. +## Requires: %{name}-rgmanager = %{version} +## %endif %if %{with heartbeat} Requires: %{name}-heartbeat = %{version} %endif @@ -77,6 +81,12 @@ %package utils Summary: Management utilities for DRBD Group: System Environment/Kernel +# We used to have one monolithic userland package. +# Since all other packages require drbd-utils, +# it should be sufficient to add the conflict here. +Conflicts: drbd < 8.3.6 +# These exist in centos extras: +Conflicts: drbd82 drbd83 @RPM_REQ_CHKCONFIG_POST@ @RPM_REQ_CHKCONFIG_PREUN@ @@ -92,6 +102,11 @@ /sbin/drbdsetup /sbin/drbdadm /sbin/drbdmeta +%if %{with legacy_utils} +%dir /lib/drbd/ +/lib/drbd/drbdsetup-83 +/lib/drbd/drbdadm-83 +%endif %{_initddir}/%{name} %{_sbindir}/drbd-overview %dir %{_prefix}/lib/%{name} @@ -179,12 +194,32 @@ %{_prefix}/lib/ocf/resource.d/linbit/drbd %endif # with pacemaker +# Dependencies for drbd-rgmanager are particularly awful. 
On RHEL 5 +# and prior (and corresponding Fedora releases), %{_datadir}/cluster +# was owned by rgmanager version 2, so we have to depend on that. +# +# With Red Hat Cluster 3.0.1 (around Fedora 12), the DRBD resource +# agent was merged in, and it became part of the resource-agents 3 +# package (which of course is different from resource-agents on all +# other platforms -- go figure). So for resource-agents >= 3, we must +# generally conflict. +# +# Then for RHEL 6, Red Hat in all their glory decided to keep the +# packaging scheme, but kicked DRBD out of the resource-agents +# package. Thus, for RHEL 6 specifically, we must not conflict with +# resource-agents >=3, but instead require it. +# +# The saga continues: +# In RHEL 6.1 they have listed the drbd resource agent as valid agent, +# but do not include it in their resource-agents package. -> So we +# drop any dependency regarding rgmanager's version. +# +# All of this for exactly two (2) files. %if %{with rgmanager} %package rgmanager Summary: Red Hat Cluster Suite agent for DRBD Group: System Environment/Base -Requires: %{name}-utils = %{version}-%{release}, rgmanager < 3 -Conflicts: resource-agents >= 3 +Requires: %{name}-utils = %{version}-%{release} @RPM_SUBPACKAGE_NOARCH@ %description rgmanager @@ -255,6 +290,7 @@ %{?_without_heartbeat} \ %{?_with_rgmanager} \ %{?_without_bashcompletion} \ + %{?_without_legacy_utils} \ --with-initdir=%{_initddir} make %{?_smp_mflags} @@ -281,6 +317,21 @@ %changelog +* Tue Dec 20 2011 Philipp Reisner - 8.4.1-1 +- New upstream release. + +* Wed Jul 15 2011 Philipp Reisner - 8.4.0-1 +- New upstream release. + +* Fri Jan 28 2011 Philipp Reisner - 8.3.10-1 +- New upstream release. + +* Fri Oct 22 2010 Philipp Reisner - 8.3.9-1 +- New upstream release. + +* Wed Jun 2 2010 Philipp Reisner - 8.3.8-1 +- New upstream release. + * Thu Jan 13 2010 Philipp Reisner - 8.3.7-1 - New upstream release. 
diff -Nru drbd8-8.3.7/drbd_config.h drbd8-8.4.1+git55a81dc~cmd1/drbd_config.h --- drbd8-8.3.7/drbd_config.h 2010-01-13 16:14:27.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/drbd_config.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,104 +0,0 @@ -/* - drbd_config.h - DRBD's compile time configuration. - - drbd is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - drbd is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with drbd; see the file COPYING. If not, write to - the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef DRBD_CONFIG_H -#define DRBD_CONFIG_H - -extern const char *drbd_buildtag(void); - -#define REL_VERSION "8.3.7" -#define API_VERSION 88 -#define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 91 - -#ifndef __CHECKER__ /* for a sparse run, we need all STATICs */ -#define DBG_ALL_SYMBOLS /* no static functs, improves quality of OOPS traces */ -#endif - -/* drbd_assert_breakpoint() function -#define DBG_ASSERTS - */ - -/* Dump all cstate changes */ -#define DUMP_MD 2 - -/* some extra checks -#define PARANOIA - */ - -/* Enable fault insertion code */ -#define DRBD_ENABLE_FAULTS - -/* RedHat's 2.6.9 kernels have the gfp_t type. Mainline has this feature - * since 2.6.16. If you build for RedHat enable the line below. */ -#define KERNEL_HAS_GFP_T - -/* kernel.org has atomic_add_return since 2.6.10. some vendor kernels - * have it backported, though. Others don't. */ -//#define NEED_BACKPORT_OF_ATOMIC_ADD - -/* 2.6.something has deprecated kmem_cache_t - * some older still use it. 
- * some have it defined as struct kmem_cache_s, some as struct kmem_cache */ -//#define USE_KMEM_CACHE_S - -/* 2.6.something has sock_create_kern (SE-linux security context stuff) - * some older distribution kernels don't. */ -//#define DEFINE_SOCK_CREATE_KERN - -/* 2.6.24 and later have kernel_sock_shutdown. - * some older distribution kernels may also have a backport. */ -//#define DEFINE_KERNEL_SOCK_SHUTDOWN - -/* in older kernels (vanilla < 2.6.16) struct netlink_skb_parms has a - * member called dst_groups. Later it is called dst_group (without 's'). */ -//#define DRBD_NL_DST_GROUPS - -/* in older kernels (vanilla < 2.6.14) is no kzalloc() */ -//#define NEED_BACKPORT_OF_KZALLOC - -// some vendor kernels have it, some don't -//#define NEED_SG_SET_BUF -#define HAVE_LINUX_SCATTERLIST_H - -/* 2.6.29 and up no longer have swabb.h */ -//#define HAVE_LINUX_BYTEORDER_SWABB_H - -/* some vendor kernel have it backported. */ -#define HAVE_SET_CPUS_ALLOWED_PTR - -/* Some vendor kernels < 2.6.7 might define msleep in one or - * another way .. */ - -#define KERNEL_HAS_MSLEEP - -/* Some other kernels < 2.6.8 do not have struct kvec, - * others do.. */ - -#define KERNEL_HAS_KVEC - -/* Actually availabe since 2.6.25, but venders have backported... 
- */ -#define KERNEL_HAS_PROC_CREATE - -/* In 2.6.32 we finally fixed connector to pass netlink_skb_parms to the callback - */ -#define KERNEL_HAS_CN_SKB_PARMS - -#endif diff -Nru drbd8-8.3.7/filelist-redhat drbd8-8.4.1+git55a81dc~cmd1/filelist-redhat --- drbd8-8.3.7/filelist-redhat 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/filelist-redhat 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,11 @@ +%defattr(644,root,root,755) +%doc COPYING +%doc ChangeLog +%if 0%(grep -q "release 5" /etc/redhat-release && echo 1) +/lib/modules/%verrel%variant +%doc obj/k-config-%verrel%variant.gz +%else +/lib/modules/%verrel%dotvariant +%doc obj/k-config-%verrel%dotvariant.gz +%endif +%config /etc/depmod.d/drbd.conf diff -Nru drbd8-8.3.7/filelist-suse drbd8-8.4.1+git55a81dc~cmd1/filelist-suse --- drbd8-8.3.7/filelist-suse 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/filelist-suse 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,11 @@ +%defattr(-,root,root) +%doc COPYING +%doc ChangeLog +%if %{defined 3} +# on sles10, _suse_kernel_module_subpackage takes 3 arguments still +/lib/modules/%3-%1 +%doc obj/k-config-%3-%1.gz +%else +/lib/modules/%2-%1 +%doc obj/k-config-%2-%1.gz +%endif diff -Nru drbd8-8.3.7/preamble drbd8-8.4.1+git55a81dc~cmd1/preamble --- drbd8-8.3.7/preamble 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/preamble 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,22 @@ +# always require a suitable userland +Requires: drbd-utils = %{version} + +%if %{defined suse_kernel_module_package} +%if 0%{?sles_version} == 10 +%{expand:%(cat %_sourcedir/drbd/preamble-sles10)} + +%else +%if 0%{?sles_version} == 11 +%{expand:%(cat %_sourcedir/drbd/preamble-sles11)} + +%endif +%endif +%else +%if 0%((test -e /etc/redhat-release && grep -q "release 5" /etc/redhat-release) && echo 1) +%{expand:%(cat %_sourcedir/drbd/preamble-rhel5)} + +# CentOS: +Conflicts: kmod-drbd82 kmod-drbd83 + +%endif +%endif diff -Nru 
drbd8-8.3.7/preamble-rhel5 drbd8-8.4.1+git55a81dc~cmd1/preamble-rhel5 --- drbd8-8.3.7/preamble-rhel5 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/preamble-rhel5 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,100 @@ +Provides: drbd-km-2.6.18_238.1.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_238.1.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_238.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_238.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.32.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.32.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.26.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.26.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.17.4.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.17.4.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.17.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.17.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.11.4.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.11.4.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.11.3.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.11.3.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.11.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.11.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.8.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.8.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.3.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.3.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_194.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_194.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_164.15.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_164.15.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_164.11.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_164.11.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_164.10.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_164.10.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_164.9.1.el5%variant = 8.3.10 +Obsoletes: 
drbd-km-2.6.18_164.9.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_164.6.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_164.6.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_164.2.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_164.2.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_164.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_164.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.7.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.7.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.4.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.4.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.2.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.2.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.1.16.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.1.16.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.1.14.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.1.14.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.1.10.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.1.10.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.1.6.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.1.6.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.1.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.1.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_128.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_128.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_92.1.22.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_92.1.22.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_92.1.18.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_92.1.18.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_92.1.13.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_92.1.13.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_92.1.10.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_92.1.10.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_92.1.6.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_92.1.6.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_92.1.1.el5%variant = 8.3.10 +Obsoletes: 
drbd-km-2.6.18_92.1.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_92.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_92.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_53.1.21.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_53.1.21.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_53.1.19.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_53.1.19.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_53.1.14.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_53.1.14.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_53.1.13.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_53.1.13.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_53.1.6.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_53.1.6.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_53.1.4.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_53.1.4.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_53.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_53.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.1.15.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.1.15.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.1.14.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.1.14.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.1.8.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.1.8.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.1.6.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.1.6.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.1.4.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.1.4.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.1.3.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.1.3.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.1.1.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.1.1.el5%variant < 8.3.10 +Provides: drbd-km-2.6.18_8.el5%variant = 8.3.10 +Obsoletes: drbd-km-2.6.18_8.el5%variant < 8.3.10 diff -Nru drbd8-8.3.7/preamble-sles10 drbd8-8.4.1+git55a81dc~cmd1/preamble-sles10 --- drbd8-8.3.7/preamble-sles10 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/preamble-sles10 2012-02-02 14:09:14.000000000 +0000 @@ 
-0,0 +1,52 @@ +Provides: drbd-km-2.6.16.60_0.60.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.60.1_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.59.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.59.1_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.58.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.58.1_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.54.5_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.54.5_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.42.7_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.42.7_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.42.5_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.42.5_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.42.4_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.42.4_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.39.3_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.39.3_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.37_f594963d_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.37_f594963d_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.34_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.34_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.33_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.33_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.31_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.31_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.30_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.30_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.29_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.29_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.27_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.27_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.25_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.25_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.23_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.23_%1 < 8.3.10 +Provides: drbd-km-2.6.16.60_0.21_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.60_0.21_%1 < 8.3.10 +Provides: drbd-km-2.6.16.54_0.2.5_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.54_0.2.5_%1 < 8.3.10 +Provides: drbd-km-2.6.16.54_0.2.3_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.54_0.2.3_%1 < 8.3.10 +Provides: drbd-km-2.6.16.53_0.16_%1 = 8.3.10 +Obsoletes: 
drbd-km-2.6.16.53_0.16_%1 < 8.3.10 +Provides: drbd-km-2.6.16_53_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16_53_%1 < 8.3.10 +Provides: drbd-km-2.6.16.46_0.14_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.46_0.14_%1 < 8.3.10 +Provides: drbd-km-2.6.16.46_0.12_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.46_0.12_%1 < 8.3.10 +Provides: drbd-km-2.6.16.21_0.15_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.21_0.15_%1 < 8.3.10 +Provides: drbd-km-2.6.16.21_0.8_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.16.21_0.8_%1 < 8.3.10 diff -Nru drbd8-8.3.7/preamble-sles11 drbd8-8.4.1+git55a81dc~cmd1/preamble-sles11 --- drbd8-8.3.7/preamble-sles11 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/preamble-sles11 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,37 @@ +# SLES 11 SP1 +Provides: drbd-km-2.6.32.27_0.2_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.27_0.2_%1 < 8.3.10 +Provides: drbd-km-2.6.32.24_0.2_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.24_0.2_%1 < 8.3.10 +Provides: drbd-km-2.6.32.23_0.3_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.23_0.3_%1 < 8.3.10 +Provides: drbd-km-2.6.32.19_0.3_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.19_0.3_%1 < 8.3.10 +Provides: drbd-km-2.6.32.19_0.2_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.19_0.2_%1 < 8.3.10 +Provides: drbd-km-2.6.32.13_0.5_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.13_0.5_%1 < 8.3.10 +Provides: drbd-km-2.6.32.13_0.4_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.13_0.4_%1 < 8.3.10 +Provides: drbd-km-2.6.32.12_0.7_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.32.12_0.7_%1 < 8.3.10 + +# SLES 11 +Provides: drbd-km-2.6.27.45_0.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.45_0.1_%1 < 8.3.10 +Provides: drbd-km-2.6.27.42_0.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.42_0.1_%1 < 8.3.10 +Provides: drbd-km-2.6.27.39_0.3_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.39_0.3_%1 < 8.3.10 +Provides: drbd-km-2.6.27.37_0.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.37_0.1_%1 < 8.3.10 +Provides: drbd-km-2.6.27.29_0.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.29_0.1_%1 < 8.3.10 +Provides: drbd-km-2.6.27.25_0.1_%1 = 8.3.10 
+Obsoletes: drbd-km-2.6.27.25_0.1_%1 < 8.3.10 +Provides: drbd-km-2.6.27.23_0.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.23_0.1_%1 < 8.3.10 +Provides: drbd-km-2.6.27.21_0.1_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.21_0.1_%1 < 8.3.10 +Provides: drbd-km-2.6.27.19_5_%1 = 8.3.10 +Obsoletes: drbd-km-2.6.27.19_5_%1 < 8.3.10 diff -Nru drbd8-8.3.7/rpm-macro-fixes/README drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/README --- drbd8-8.3.7/rpm-macro-fixes/README 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/README 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,20 @@ +macros.kernel-source.sles11-sp1.diff: + + Patch needed on SUSE products in order to allow building kernel module + packages for a specific kernel version. + See the patch for more detailed documentation. + +macros.kernel-source.sles11.diff: + + Same thing for sles11 (no sp1) + +suse_macros.sles10.diff: + + Similar thing for sles10 + +kmodtool.rhel5.diff + + Add filelist tag substitution capabilities to rhel5 kmodtool, + and drop the dependency on a ...-kmod-common package, + similar to what rhel6 does. 
+ diff -Nru drbd8-8.3.7/rpm-macro-fixes/kmodtool.rhel5.diff drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/kmodtool.rhel5.diff --- drbd8-8.3.7/rpm-macro-fixes/kmodtool.rhel5.diff 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/kmodtool.rhel5.diff 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,30 @@ +--- /usr/lib/rpm/redhat/kmodtool ++++ /usr/lib/rpm/redhat/kmodtool +@@ -65,12 +65,19 @@ + { + local variant="${1}" + local dashvariant="${variant:+-${variant}}" ++ local dotvariant="${variant:+.${variant}}" ++ + case "$verrel" in + *.el*) kdep="kernel${dashvariant}-%{_target_cpu} = ${verrel}" ;; + *.EL*) kdep="kernel${dashvariant}-%{_target_cpu} = ${verrel}" ;; + *) kdep="kernel-%{_target_cpu} = ${verrel}${variant}" ;; + esac + ++ echo "%global verrel $verrel" ++ echo "%global variant ${variant:-%nil}" ++ echo "%global dashvariant ${dashvariant:-%nil}" ++ echo "%global dotvariant ${dotvariant:-%nil}" ++ + echo "%package -n kmod-${kmod_name}${dashvariant}" + + if [ -z "$kmp_provides_summary" ]; then +@@ -100,7 +107,6 @@ + fi + + cat <= %{?epoch:%{epoch}:}%{version} + Requires(post): /sbin/depmod + Requires(postun): /sbin/depmod + EOF diff -Nru drbd8-8.3.7/rpm-macro-fixes/macros.kernel-source.sles11-sp1.diff drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/macros.kernel-source.sles11-sp1.diff --- drbd8-8.3.7/rpm-macro-fixes/macros.kernel-source.sles11-sp1.diff 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/macros.kernel-source.sles11-sp1.diff 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,48 @@ +By default, the %kernel_module_package will build packages for all kernel +flavors it finds in /usr/src/linux-obj: this directory contains symlinks to the +latest kernel-$flavor-devel packages installed. + +This default can be overridden by defining the %kernel_version macro on the +rpmbuild command line. 
For example, you can build against version +2.6.32.19-0.2 with: + + rpmbuild --define 'kernel_version 2.6.32.19-0.2' + +When doing that, rpmbuild will iterate over the kernels defined in +/usr/src/linux-%kernel_version-obj, instead. + +It is not possible to iterate over all installed kernel-$flavor-devel packages +in one rpmbuild command: rpm only allows to build a single sub-package with a +given name (for example, drbd-kmp-default), and cannot build two separate +drbd-kmp-default sub-packages with different versions. + + Andreas Gruenbacher + +--- /etc/rpm/macros.kernel-source.orig ++++ /etc/rpm/macros.kernel-source +@@ -9,14 +9,14 @@ + echo "%%define _suse_kernel_module_subpackage(n:v:r:f:p:) %%{expand:%%(cd %_sourcedir; cat $subpkg; echo %%%%nil)}" \ + flavors_to_build= \ + flavors="%*" \ +- for flavor in $(ls /usr/src/linux-obj/%_target_cpu 2>/dev/null); do \ ++ for flavor in $(ls /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu 2>/dev/null); do \ + case " $flavors " in \ + (*" $flavor "*) \ + [ -n "%{-X}" ] && continue ;; \ + (*) \ + [ -z "%{-X}" -a -n "$flavors" ] && continue ;; \ + esac \ +- krel=$(make -s -C /usr/src/linux-obj/%_target_cpu/$flavor kernelrelease) \ ++ krel=$(make -s -C /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu/$flavor kernelrelease) \ + kver=${krel%%-*} \ + [ -e /boot/symsets-$kver-$flavor.tar.gz ] || continue \ + flavors_to_build="$flavors_to_build $flavor" \ +@@ -24,7 +24,7 @@ + done \ + echo "%%global flavors_to_build${flavors_to_build:-%%nil}" \ + echo "%%{expand:%%(test -z '%flavors_to_build' && echo %%%%internal_kmp_error)}" \ +- echo "%%global kernel_source() /usr/src/linux-obj/%_target_cpu/%%%%{1}" \ ++ echo "%%global kernel_source() /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu/%%%%{1}" \ + echo "%%global kernel_module_package_moddir() updates" \ + \ + echo "%package -n %{-n*}%{!-n:%name}-kmp-_dummy_" \ diff -Nru 
drbd8-8.3.7/rpm-macro-fixes/macros.kernel-source.sles11.diff drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/macros.kernel-source.sles11.diff --- drbd8-8.3.7/rpm-macro-fixes/macros.kernel-source.sles11.diff 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/macros.kernel-source.sles11.diff 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,30 @@ +See comment in macros.kernel-source.sles11-sp1.diff + +--- /etc/rpm/macros.kernel-source.orig ++++ /etc/rpm/macros.kernel-source +@@ -9,14 +9,14 @@ + echo "%%define _suse_kernel_module_subpackage(n:v:r:f:p:) %%{expand:%%(cd %_sourcedir; cat $subpkg; echo %%%%nil)}" \ + flavors_to_build= \ + flavors="%*" \ +- for flavor in $(ls /usr/src/linux-obj/%_target_cpu 2>/dev/null); do \ ++ for flavor in $(ls /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu 2>/dev/null); do \ + case " $flavors " in \ + (*" $flavor "*) \ + [ -n "%{-X}" ] && continue ;; \ + (*) \ + [ -z "%{-X}" -a -n "$flavors" ] && continue ;; \ + esac \ +- krel=$(make -s -C /usr/src/linux-obj/%_target_cpu/$flavor kernelrelease) \ ++ krel=$(make -s -C /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu/$flavor kernelrelease) \ + kver=${krel%%-*} \ + [ -e /boot/symsets-$kver-$flavor.tar.gz ] || continue \ + flavors_to_build="$flavors_to_build $flavor" \ +@@ -24,7 +24,7 @@ + done \ + echo "%%global flavors_to_build${flavors_to_build:-%%nil}" \ + echo "%%{expand:%%(test -z '%flavors_to_build' && echo %%%%internal_kmp_error)}" \ +- echo "%%global kernel_source() /usr/src/linux-obj/%_target_cpu/%%%%{1}" \ ++ echo "%%global kernel_source() /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu/%%%%{1}" \ + \ + echo "%package -n %{-n*}%{!-n:%name}-kmp-_dummy_" \ + echo "Version: %version" \ diff -Nru drbd8-8.3.7/rpm-macro-fixes/macros.rhel5.diff drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/macros.rhel5.diff --- drbd8-8.3.7/rpm-macro-fixes/macros.rhel5.diff 1970-01-01 00:00:00.000000000 +0000 +++ 
drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/macros.rhel5.diff 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,13 @@ +--- /usr/lib/rpm/redhat/macros.orig ++++ /usr/lib/rpm/redhat/macros +@@ -170,8 +170,8 @@ + + %kernel_module_package(n:v:r:s:f:xp:) %{expand:%( \ + %define kmodtool %{-s*}%{!-s:/usr/lib/rpm/redhat/kmodtool} \ +- %define kmp_version %{-v*}%{!-v:%{version}} \ +- %define kmp_release %{-r*}%{!-r:%{release}} \ ++ %global kmp_version %{-v*}%{!-v:%{version}} \ ++ %global kmp_release %{-r*}%{!-r:%{release}} \ + %define latest_kernel %(rpm -q --qf '%{VERSION}-%{RELEASE}\\\\n' `rpm -q kernel-devel | /usr/lib/rpm/redhat/rpmsort -r | head -n 1` | head -n 1) \ + %{!?kernel_version:%{expand:%%global kernel_version %{latest_kernel}}} \ + %global kverrel %(%{kmodtool} verrel %{?kernel_version} 2>/dev/null) \ diff -Nru drbd8-8.3.7/rpm-macro-fixes/suse_macros.sles10.diff drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/suse_macros.sles10.diff --- drbd8-8.3.7/rpm-macro-fixes/suse_macros.sles10.diff 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/suse_macros.sles10.diff 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 +1,46 @@ +See comment in macros.kernel-source.sles11-sp1.diff + +--- /usr/lib/rpm/suse_macros.orig ++++ /usr/lib/rpm/suse_macros +@@ -473,12 +473,12 @@ + + # Defines %flavors_to_build as a side effect. 
+ %suse_kernel_module_package(n:v:r:s:f:xp:) \ +-%{expand:%( \ ++%{expand:%{expand:%( \ ++ ( \ + subpkg=%{-s*}%{!-s:/usr/lib/rpm/rpm-suse-kernel-module-subpackage} \ + echo "%%define _suse_kernel_module_subpackage(n:v:r:f:p:) %%{expand:%%(cd %_sourcedir; cat $subpkg; echo %%%%nil)}" \ +- flavors="%{-x:%*}%{!-x:$(ls /usr/src/linux-obj/%_target_cpu 2>/dev/null)}" \ ++ flavors="%{-x:%*}%{!-x:$(ls /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu 2>/dev/null)}" \ + flavors_to_build= \ +- kver=$(rpm -q --qf '%{VERSION}-%{RELEASE}' kernel-source) \ + for flavor in $flavors; do \ + if [ -z "%{-x}" ]; then \ + case " %* " in \ +@@ -486,19 +486,23 @@ + continue ;; \ + esac \ + fi \ +- krel=$(make -s -C /usr/src/linux-obj/%_target_cpu/$flavor kernelrelease) \ ++ krel=$(make -s -C /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu/$flavor kernelrelease) \ ++ kver=${krel%%-*} \ + [ -e /boot/symsets-$krel.tar.gz ] || continue \ + flavors_to_build="$flavors_to_build $flavor" \ + echo "%%_suse_kernel_module_subpackage -n %{-n*}%{!-n:%name}-kmp -v %{-v*}%{!-v:%version} -r %{-r*}%{!-r:%release} %{-p} $flavor $krel $kver" \ + done \ + echo "%%global flavors_to_build${flavors_to_build:-%%nil}" \ ++ echo "%%global kernel_source() /usr/src/linux-%{?kernel_version:%kernel_version-}obj/%_target_cpu/%%%%{1}" \ ++ echo "%%global kernel_module_package_moddir() updates" \ + \ + echo "%package -n %{-n*}%{!-n:%name}-kmp-_dummy_" \ + echo "Version: %version" \ + echo "Summary: %summary" \ + echo "Group: %group" \ + echo "%description -n %{-n*}%{!-n:%name}-kmp-_dummy_" \ +- )} ++ ) | sed -e 's/%%/%%%%/g' \ ++ )}} + + %suse_version 1010 + %sles_version 10 diff -Nru drbd8-8.3.7/rpm-macro-fixes/symset-table.diff drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/symset-table.diff --- drbd8-8.3.7/rpm-macro-fixes/symset-table.diff 1970-01-01 00:00:00.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/rpm-macro-fixes/symset-table.diff 2012-02-02 14:09:14.000000000 +0000 @@ -0,0 
+1,54 @@ +symsets-xyz-tar.gz contain only the current symsets, +and potentially compatible symsets. + +To be compatible by definition means to be a subset of the current symset. + +If we scan through the symsets in ascending order of their size in bytes, +the first symset to match a particular symbol will be the "oldest", +"most compatible". + +This way, even if the most recent kernel version provides some new +symset containing new symbols, a kernel module package built +against it will still only require the weakest symset(s) necessary, +so will stay compatible on the rpm dependency level with all older +kernels that provide the actually used symbols. + +Without the sorting and filtering, the resulting kmp would require +all symsets the respective symbols are defined in, including the +latest symset, even if only a subset of the contained symbols is +actually used. Thus the kmp may become "incompatible" on the rpm +level with older kernel versions, even though it works just fine +with "weak-modules" on the actual symbol version level. + +--- /usr/lib/rpm/symset-table ++++ /usr/lib/rpm/symset-table +@@ -21,15 +21,26 @@ + + for symsets in *; do + krel=${symsets#symsets-} +- for symset in $symsets/*; do ++ for symset in $(ls -Sr $symsets/* ); do + class=${symset##*/} ; class=${class%.*} + hash=${symset##*.} + awk ' + BEGIN { FS = "\t" ; OFS = "\t" } + { sub(/0x0*/, "", $1) +- print krel "/" $1 "/" $2, class, hash } ++ print krel "/" $1, $2, class, hash } + ' krel="$krel" class="$class" hash="$hash" $symset +- done ++ done \ ++ | awk ' ++ # Filter out duplicate symbols. Since we went through the symset ++ # files in increasing size order, each symbol will remain in the ++ # table with the oldest symset it is defined in. 
++ BEGIN { FS = "\t" ; OFS = "\t" } ++ { if ($2 in seen) ++ next ++ seen[$2]=1 ++ print $1 "/" $2, $3, $4 } ++ ' \ ++ | sort -t $'\t' -k 1,1 + done + + # vim:shiftwidth=4 softtabstop=4 diff -Nru drbd8-8.3.7/scripts/Makefile drbd8-8.4.1+git55a81dc~cmd1/scripts/Makefile --- drbd8-8.3.7/scripts/Makefile 2012-09-03 23:12:22.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/scripts/Makefile 2012-09-03 22:37:14.000000000 +0000 @@ -45,7 +45,7 @@ WITH_XEN = yes WITH_PACEMAKER = yes WITH_HEARTBEAT = yes -WITH_RGMANAGER = yes +WITH_RGMANAGER = no WITH_BASHCOMPLETION = yes # variables meant to be overridden from the make command line diff -Nru drbd8-8.3.7/scripts/adjust_drbd_config_h.sh drbd8-8.4.1+git55a81dc~cmd1/scripts/adjust_drbd_config_h.sh --- drbd8-8.3.7/scripts/adjust_drbd_config_h.sh 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/scripts/adjust_drbd_config_h.sh 1970-01-01 00:00:00.000000000 +0000 @@ -1,212 +0,0 @@ -#!/bin/bash -# drbd_config.h auto edit magic for 2.4 kernels ... - -# expects KDIR in the environment to be set correctly! - -set -e -sorry() { - cat <<___ - Sorry, automagic adjustment of drbd_config.h failed. - For well known 2.6. kernels, no adjustment to the shipped drbd_config is necessary. - You need to verify it yourself. -___ -} -trap "sorry" 0 -grep_q() { grep "$@" /dev/null &>/dev/null ; } - -# PARANOIA: -test -e ./linux/drbd_config.h || { - echo >&2 "oops, invoked in unexpected directory..." - exit 1 -} - -if [[ -z $KDIR ]] ; then - echo >&2 "You did not tell me which kernel I should check" - echo >&2 "So I'm taking a guess..." - O= - KDIR_BEST_GUESS=/lib/modules/`uname -r`/source - O_BEST_GUESS=/lib/modules/`uname -r`/build - test -d $KDIR_BEST_GUESS && KDIR=$KDIR_BEST_GUESS - test -d $O_BEST_GUESS && O=$O_BEST_GUESS -fi -test -n "$KDIR" - -# ok, now we have a KDIR; cd into it, in case we detect relative pathes -pushd $KDIR - -KDIR=${KDIR%/} -if test -z "$O"; then - ## just in case... 
- ## detect if $KDIR points to something which is actually $O ... - X=$( make no-such-makefile-target 2>/dev/null | - sed -ne '/ -C .* O=.* no-such-makefile-target$/p' | tr -s ' ' ) - if [[ -n $X ]]; then - KDIR=${X##* -C }; KDIR=${KDIR%% *}; KDIR=$(cd $KDIR && pwd) - O=${X##* O=}; O=${O%% *}; O=$(cd $KDIR && cd $O && pwd) - else - O=$KDIR; - fi -else - O=${O%/} -fi - -# some paranoia: check that all files are where we expect them -ls > /dev/null \ -$KDIR/{Makefile,include/linux/{gfp,types,slab,net}.h} -ls > /dev/null \ -$O/{.config,Makefile,include/linux/version.h} -test -e $O/include/asm/atomic.h || -test -e $O/include/asm/arch/atomic.h || -test -e $O/include2/asm/atomic.h || -test -e $KDIR/include/asm-generic/atomic.h || -exit 1 - -if grep_q "^PATCHLEVEL *= *6" $KDIR/Makefile ; then - # do we have gfp_t? - if grep_q "typedef.*gfp_t" $KDIR/include/linux/gfp.h $KDIR/include/linux/types.h; then - have_gfp_t=1 - else - have_gfp_t=0 - fi - # stupid vendor kernels grrr... - have_atomic_add=0 - # btw, don't ask why I don't use grep -qs $a $b $c - # it simply does not work always... 
- for f in $O/include/asm/atomic.h \ - $O/include/asm/arch/atomic.h \ - $O/include2/asm/atomic.h \ - $O/include/asm/atomic_32.h \ - $O/include2/asm/atomic_32.h \ - $O/include/asm/arch/atomic_32.h \ - $KDIR/include/asm-generic/atomic.h - do - if grep_q "atomic_add_return" $f; then - have_atomic_add=1 - break - fi - done - if grep_q "typedef.*kmem_cache_s" $KDIR/include/linux/slab.h ; then - have_kmem_cache_s=1 - else - have_kmem_cache_s=0 - fi - if grep_q "sock_create_kern" $KDIR/include/linux/net.h ; then - have_sock_create_kern=1 - else - have_sock_create_kern=0 - fi - if grep_q "kernel_sock_shutdown" $KDIR/include/linux/net.h ; then - have_kernel_sock_shutdown=1 - else - have_kernel_sock_shutdown=0 - fi - if grep_q "dst_groups" $KDIR/include/linux/netlink.h ; then - have_nl_dst_groups=1 - else - have_nl_dst_groups=0 - fi - if grep_q "kzalloc" $KDIR/include/linux/slab.h ; then - need_backport_of_kzalloc=0 - else - need_backport_of_kzalloc=1 - fi - if test -e $KDIR/include/linux/scatterlist.h ; then - have_linux_scatterlist_h=1 - if grep_q "sg_set_buf" $KDIR/include/linux/scatterlist.h ; then - need_sg_set_buf=0 - else - need_sg_set_buf=1 - fi - else - have_linux_scatterlist_h=0 - need_sg_set_buf=1 - fi - if grep_q "msleep" $KDIR/include/linux/delay.h ; then - have_msleep=1 - else - have_msleep=0 - fi - if grep_q "kvec" $KDIR/include/linux/uio.h ; then - have_kvec=1 - else - have_kvec=0 - fi - if test -e $KDIR/include/linux/byteorder/swabb.h ; then - have_linux_byteorder_swabb_h=1 - else - have_linux_byteorder_swabb_h=0 - fi - if grep_q "proc_create(" $KDIR/include/linux/proc_fs.h ; then - have_proc_create=1 - else - have_proc_create=0 - fi - if grep_q "set_cpus_allowed_ptr(" $KDIR/include/linux/sched.h ; then - have_set_cpus_allowed_ptr=1 - else - have_set_cpus_allowed_ptr=0 - fi - if grep_q "netlink_skb_parms" $KDIR/include/linux/connector.h ; then - have_netlink_skb_parms=1 - else - have_netlink_skb_parms=0 - fi -else - # not a 2.6. kernel. 
just leave it alone... - exit 0 -fi - -# and back do drbd source -popd - -test -e ./linux/drbd_config.h.orig || cp ./linux/drbd_config.h{,.orig} - -perl -pe " - s{.*(#define KERNEL_HAS_GFP_T.*)} - { ( $have_gfp_t ? '' : '//' ) . \$1}e; - s{.*(#define NEED_BACKPORT_OF_ATOMIC_ADD.*)} - { ( $have_atomic_add ? '//' : '' ) . \$1}e; - s{.*(#define USE_KMEM_CACHE_S.*)} - { ( $have_kmem_cache_s ? '' : '//' ) . \$1}e; - s{.*(#define DEFINE_SOCK_CREATE_KERN.*)} - { ( $have_sock_create_kern ? '//' : '' ) . \$1}e; - s{.*(#define DEFINE_KERNEL_SOCK_SHUTDOWN.*)} - { ( $have_kernel_sock_shutdown ? '//' : '' ) . \$1}e; - s{.*(#define DRBD_NL_DST_GROUPS.*)} - { ( $have_nl_dst_groups ? '' : '//' ) . \$1}e; - s{.*(#define NEED_BACKPORT_OF_KZALLOC.*)} - { ( $need_backport_of_kzalloc ? '' : '//' ) . \$1}e; - s{.*(#define NEED_SG_SET_BUF.*)} - { ( $need_sg_set_buf ? '' : '//' ) . \$1}e; - s{.*(#define HAVE_LINUX_SCATTERLIST_H.*)} - { ( $have_linux_scatterlist_h ? '' : '//' ) . \$1}e; - s{.*(#define KERNEL_HAS_MSLEEP.*)} - { ( $have_msleep ? '' : '//' ) . \$1}e; - s{.*(#define KERNEL_HAS_KVEC.*)} - { ( $have_kvec ? '' : '//' ) . \$1}e; - s{.*(#define HAVE_LINUX_BYTEORDER_SWABB_H.*)} - { ( $have_linux_byteorder_swabb_h ? '' : '//' ) . \$1}e; - s{.*(#define KERNEL_HAS_PROC_CREATE.*)} - { ( $have_proc_create ? '' : '//' ) . \$1}e; - s{.*(#define HAVE_SET_CPUS_ALLOWED_PTR.*)} - { ( $have_set_cpus_allowed_ptr ? '' : '//' ) . \$1}e; - s{.*(#define KERNEL_HAS_CN_SKB_PARMS.*)} - { ( $have_netlink_skb_parms ? '' : '//' ) . \$1}e; - " \ - < ./linux/drbd_config.h \ - > ./linux/drbd_config.h.new - -if ! 
DIFF=$(diff -s -U0 ./linux/drbd_config.h{,.new}) ; then - mv ./linux/drbd_config.h{.new,} - sed -e 's/^/ /' <<___ - -Adjusted drbd_config.h: -$DIFF - -___ -else - rm ./linux/drbd_config.h.new - echo -e "\n Using unmodified drbd_config.h\n" -fi -trap - 0 -exit 0 diff -Nru drbd8-8.3.7/scripts/crm-fence-peer.sh drbd8-8.4.1+git55a81dc~cmd1/scripts/crm-fence-peer.sh --- drbd8-8.3.7/scripts/crm-fence-peer.sh 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/scripts/crm-fence-peer.sh 2012-02-02 14:09:14.000000000 +0000 @@ -12,7 +12,7 @@ s/ *\bid="[^"]*"// # remove id tag # print each attribute on its own line, by : attr - h # rememver the current rest line + h # remember the current (tail of the) line # remove all but the first attribute, and print, s/^\([^[:space:]]*[[:space:]][^= ]*="[^"]*"\).*$/\1/p g # then restore the remembered line, @@ -155,26 +155,59 @@ local peer_state check_peer_node_reachable set_states_from_proc_drbd - case $peer_state/$DRBD_disk in - reachable/*) - cibadmin -C -o constraints -X "$new_constraint" && - drbd_fence_peer_exit_code=4 rc=0 + : == DEBUG == DRBD_peer=${DRBD_peer[*]} === + case "${DRBD_peer[*]}" in + *Secondary*|*Primary*) + # WTF? We are supposed to fence the peer, + # but the replication link is just fine? + echo WARNING "peer is not Unknown, did not place the constraint!" + rc=0 + return ;; - */UpToDate) + esac + : == DEBUG == CTS_mode=$CTS_mode == + : == DEBUG == DRBD_disk_all_consistent=$DRBD_disk_all_consistent == + : == DEBUG == DRBD_disk_all_uptodate=$DRBD_disk_all_uptodate == + : == DEBUG == $peer_state/${DRBD_disk[*]}/$unreachable_peer_is == + if [[ ${#DRBD_disk[*]} = 0 ]]; then + # Someone called this script, without the corresponding drbd + # resource being configured. That's not very useful. + echo WARNING "could not determine my disk state: did not place the constraint!" 
+ rc=0 + # keep drbd_fence_peer_exit_code at "generic error", + # which will cause a "script is broken" message in case it was + # indeed called as handler from within drbd + elif [[ $peer_state = reachable ]] && $DRBD_disk_all_consistent; then + cibadmin -C -o constraints -X "$new_constraint" && + drbd_fence_peer_exit_code=4 rc=0 && + echo INFO "peer is $peer_state, my disk is ${DRBD_disk[*]}: placed constraint '$id_prefix-$master_id'" + elif $DRBD_disk_all_uptodate ; then # We could differentiate between unreachable, # and DC-unreachable. In the latter case, placing the # constraint will fail anyways, and drbd_fence_peer_exit_code # will stay at "generic error". cibadmin -C -o constraints -X "$new_constraint" && - drbd_fence_peer_exit_code=5 rc=0 - ;; - *) - echo WARNING "did not place the constraint!" + drbd_fence_peer_exit_code=5 rc=0 && + echo INFO "peer is not reachable, my disk is UpToDate: placed constraint '$id_prefix-$master_id'" + elif [[ $peer_state = unreachable ]] && [[ $unreachable_peer_is = outdated ]] && $DRBD_disk_all_consistent; then + # If the peer is not reachable, but we are only Consistent, we + # may need some way to still allow promotion. + # Easy way out: --force primary with drbdsetup. + # But that would not place the constraint, nor outdate the + # peer. With this --unreachable-peer-is-outdated, we still try + # to set the constraint. Next promotion attempt will find the + # "correct" constraint, consider the peer as successfully + # fenced, and continue. + cibadmin -C -o constraints -X "$new_constraint" && + drbd_fence_peer_exit_code=5 rc=0 && + echo WARNING "peer is unreachable, my disk is only Consistent: --unreachable-peer-is-outdated FORCED constraint '$id_prefix-$master_id'" && + echo WARNING "This MAY RISK DATA INTEGRITY" + else + echo WARNING "peer is $peer_state, my disk is ${DRBD_disk[*]}: did not place the constraint!" 
drbd_fence_peer_exit_code=5 rc=0 # I'd like to return 6 here, otherwise pacemaker will retry # forever to promote, even though 6 is not strictly correct. - ;; - esac + fi } # drbd_peer_fencing fence|unfence @@ -220,7 +253,7 @@ # and try to go online with stale data. # Exactly what this "fence" hanler should prevent. # But setting contraints in a cluster partition with - # "no-quorum-policy=ignore" will usually succeed. + # "no-quorum-policy=ignore" will usually succeed. # # So we need to differentiate between node reachable or # not, and DRBD "Consistent" or "UpToDate". @@ -228,7 +261,7 @@ try_place_constraint elif [[ "$have_constraint" = "$(set +x; echo "$new_constraint" | sed_rsc_location_suitable_for_string_compare "$id_prefix-$master_id")" ]]; then - : "identical constraint already placed" + echo INFO "suitable constraint already placed: '$id_prefix-$master_id'" drbd_fence_peer_exit_code=4 rc=0 else @@ -245,7 +278,7 @@ # better data than us, and wants us outdated. fi - if [ $rc != 0 ]; then + if [[ $rc != 0 ]]; then # at least we tried. # maybe it was already in place? echo WARNING "DATA INTEGRITY at RISK: could not place the fencing constraint!" @@ -311,6 +344,20 @@ let "cibtimeout = cibtimeout * 5 / 4" done state_lines=$(echo "$cib_xml" | grep '&2 "invalid logfacility: $lf" + return + ;; + esac + exec > >(2>&- ; logger -t "$PROG[$$]" -p $lf.info) 2>&1 +} if [[ $- != *x* ]]; then - exec > >(2>&- ; logger -t "$PROG[$$]" -p local5.info) 2>&1 + # you may override with --logfacility below + redirect_to_logger local5 fi # clean environment just in case. 
-unset fencing_attribute id_prefix timeout dc_timeout +unset fencing_attribute id_prefix timeout dc_timeout unreachable_peer_is +CTS_mode=false suicide_on_failure_if_primary=false # poor mans command line argument parsing, # allow for command line overrides while [[ $# != 0 ]]; do case $1 in + --logfacility=*) + redirect_to_logger ${1#*=} + ;; + --logfacility) + redirect_to_logger $2 + shift + ;; --resource=*) DRBD_RESOURCE=${1#*=} ;; @@ -435,6 +543,16 @@ dc_timeout=$2 shift ;; + --CTS-mode) + CTS_mode=true + ;; + --unreachable-peer-is-outdated) + # This is NOT to be scripted. + # Or people will put this into the handler definition in + # drbd.conf, and all this nice work was useless. + test -t 0 && + unreachable_peer_is=outdated + ;; # --suicide-on-failure-if-primary) # suicide_on_failure_if_primary=true # ;; @@ -449,14 +567,16 @@ done # DRBD_RESOURCE: from environment # master_id: parsed from cib -# apply defaults: -: ${fencing_attribute:="#uname"} -: ${id_prefix:="drbd-fence-by-handler"} -: ${role:="Master"} + +: "== unreachable_peer_is == ${unreachable_peer_is:=unknown}" +# apply defaults: +: "== fencing_attribute == ${fencing_attribute:="#uname"}" +: "== id_prefix == ${id_prefix:="drbd-fence-by-handler"}" +: "== role == ${role:="Master"}" # defaults suitable for single-primary no-stonith. -: ${timeout:=1} -: ${dc_timeout:=$[20+timeout]} +: "== timeout == ${timeout:=1}" +: "== dc_timeout == ${dc_timeout:=$[20+timeout]}" # check envars normally passed in by drbdadm # TODO DRBD_CONF is also passed in. we may need to use it in the @@ -471,6 +591,17 @@ fi done +# Fixup id-prefix to include the resource name +# There may be multiple drbd instances part of the same M/S Group, pointing to +# the same master-id. Still they need to all have their own constraint, to be +# able to unfence independently when they finish their resync independently. +# Be nice to people who already explicitly configure an id prefix containing +# the resource name. 
+if [[ $id_prefix != *"-$DRBD_RESOURCE" ]] ; then + id_prefix="$id_prefix-$DRBD_RESOURCE" + : "== id_prefix == ${id_prefix}" +fi + # make sure it contains what we expect HOSTNAME=$(uname -n) diff -Nru drbd8-8.3.7/scripts/drbd drbd8-8.4.1+git55a81dc~cmd1/scripts/drbd --- drbd8-8.3.7/scripts/drbd 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/scripts/drbd 2012-02-02 14:09:14.000000000 +0000 @@ -3,7 +3,8 @@ # chkconfig: - 70 08 # description: Loads and unloads the drbd module # -# Copright 2001-2008 LINBIT Information Technologies +# Copyright 2001-2010 LINBIT +# # Philipp Reisner, Lars Ellenberg # ### BEGIN INIT INFO @@ -12,8 +13,10 @@ # Required-Stop: $local_fs $network $syslog # Should-Start: sshd multipathd # Should-Stop: sshd multipathd -# Default-Start: -# Default-Stop: +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# X-Start-Before: heartbeat corosync +# X-Stop-After: heartbeat corosync # Short-Description: Control drbd resources. ### END INIT INFO @@ -44,41 +47,13 @@ { [ -e "$PROC_DRBD" ] && return - $MODPROBE -s drbd `$DRBDADM sh-mod-parms` $ADD_MOD_PARAM || { + $MODPROBE -s drbd $ADD_MOD_PARAM || { echo "Can not load the drbd module."$'\n'; exit 20 } # tell klogd to reload module symbol information ... [ -e /var/run/klogd.pid ] && [ -x /sbin/klogd ] && /sbin/klogd -i } -function adjust_with_progress -{ - IFS_O=$IFS - NEWLINE=' -' - IFS=$NEWLINE - local res - - COMMANDS=`$DRBDADM -d -n res adjust all` || exit 20 - echo -n "[ " - - for CMD in $COMMANDS; do - case "$CMD" in - res=*) eval "$CMD";; - *\ disk\ *) echo -n "d($res) " ;; - *\ syncer\ *) echo -n "s($res) " ;; - *\ net\ *) echo -n "n($res) " ;; - *) echo ".. " ;; - esac - if ! 
eval "$CMD"; then - echo -e "\n[$res] cmd $CMD failed - continuing!\n " - fi - done - echo -n "]" - - IFS=$IFS_O -} - drbd_pretty_status() { local proc_drbd=$1 @@ -137,6 +112,21 @@ ) | column -t } +# Try to settle regardless of udev version or presence, +# so "/etc/init.d/drbd stop" is able to rmmod, without interfering +# temporary module references caused by udev scanning the devices. +# But don't wait too long. +_udev_settle() +{ + if udevadm version ; then + # ok, we have udevadm, use it. + udevadm settle --timeout=5 + else + # if udevsettle is not there, + # no matter. + udevsettle --timeout=5 + fi +} case "$1" in start) @@ -144,19 +134,27 @@ # file, or we need to ask the user about registering this installation # at http://usage.drbd.org, we call drbdadm here without any IO # redirection. - $DRBDADM sh-nop + # If "no op" has a non-zero exit code, the config is unusable, + # and every other command will fail. log_daemon_msg "Starting DRBD resources" + if ! out=$($DRBDADM sh-nop 2>&1) ; then + printf "\n%s\n" "$out" >&2 + log_end_msg 1 + exit 1 + fi assure_module_is_loaded - adjust_with_progress + + $DRBDADM adjust-with-progress all + [[ $? -gt 1 ]] && exit 20 # make sure udev has time to create the device files - for RESOURCE in `$DRBDADM sh-resources`; do - for DEVICE in `$DRBDADM sh-dev $RESOURCE`; do - UDEV_TIMEOUT_LOCAL=$UDEV_TIMEOUT - while [ ! -e $DEVICE ] && [ $UDEV_TIMEOUT_LOCAL -gt 0 ] ; do - sleep 1 + # FIXME this probably should, on platforms that have it, + # use udevadm settle --timeout=X --exit-if-exists=$DEVICE + for DEVICE in `$DRBDADM sh-dev all`; do + UDEV_TIMEOUT_LOCAL=$UDEV_TIMEOUT + while [ ! -e $DEVICE ] && [ $UDEV_TIMEOUT_LOCAL -gt 0 ] ; do + sleep 1 UDEV_TIMEOUT_LOCAL=$(( $UDEV_TIMEOUT_LOCAL-1 )) - done done done @@ -169,7 +167,9 @@ stop) $DRBDADM sh-nop log_daemon_msg "Stopping all DRBD resources" - if [ -e $PROC_DRBD ] ; then + for try in 1 2; do + if [ -e $PROC_DRBD ] ; then + [[ $try = 2 ]] && echo "Retrying once..." 
# bypass drbdadm and drbd config file and everything, # to avoid leaving devices around that are not referenced by # the current config file, or in case the current config file @@ -182,10 +182,14 @@ *" not mounted") :;; *) echo "$M" >&2 ;; esac - $DRBDSETUP "$d" down done - $RMMOD drbd - fi + for res in $(drbdsetup all show | sed -ne 's/^resource \(.*\) {$/\1/p'); do + drbdsetup "$res" down + done + _udev_settle &> /dev/null + $RMMOD drbd && break + fi + done [ -f /var/lock/subsys/drbd ] && rm /var/lock/subsys/drbd log_end_msg 0 ;; @@ -208,13 +212,8 @@ log_end_msg 0 ;; restart|force-reload) - $DRBDADM sh-nop - log_daemon_msg "Restarting all DRBD resources" - $DRBDADM down all - $RMMOD drbd - assure_module_is_loaded - $DRBDADM up all - log_end_msg 0 + ( . $0 stop ) + ( . $0 start ) ;; *) echo "Usage: /etc/init.d/drbd {start|stop|status|reload|restart|force-reload}" diff -Nru drbd8-8.3.7/scripts/drbd-overview.pl drbd8-8.4.1+git55a81dc~cmd1/scripts/drbd-overview.pl --- drbd8-8.3.7/scripts/drbd-overview.pl 2009-06-09 11:33:03.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/scripts/drbd-overview.pl 2012-02-02 14:09:14.000000000 +0000 @@ -24,21 +24,40 @@ # sets $drbd{minor}->{name} (and possibly ->{ll_dev}) sub map_minor_to_resource_names() { - my $drbdadm_sh_status = `drbdadm sh-status`; + my @drbdadm_sh_status = `drbdadm sh-status`; + my ($ll_res, $ll_dev, $ll_minor, $conf_res, $conf_vnr, $minor, $name, $vnr); - while ($drbdadm_sh_status =~ m{ - \n - _stacked_on=(.*?)\n - (?:_stacked_on_device=(.*)\n - _stacked_on_minor=(\d*)\n)? - _minor=(.*?)\n - _res_name=(.*?)\n - }xg) - { - $drbd{$4}{name} = $5; - $minor_of_name{$5} = $4; - $drbd{$4}{ll_dev} = defined($2) ? 
$3 : $1 - if $1; + for (@drbdadm_sh_status) { + # volumes only present in >= 8.4 + # some things generated by drbdadm + + /^_conf_res_name=(.*)\n/ and $conf_res = $1, $name = $conf_res; + /^_conf_volume=(\d+)\n/ and $conf_vnr = $1; + + /^_stacked_on=(.*?)\n/ and $ll_res = $1; + # not always present: + /^_stacked_on_device=(.*)\n/ and $ll_dev = $1; + /^_stacked_on_minor=(\d+)\n/ and $ll_minor = $1; + + # rest generated by drbdsetup + /^_minor=(.*?)\n/ and $minor = $1; + /^_res_name=(.+?)\n/ and $name = $1; + /^_volume=(\d+)\n/ and $vnr = $1; + + /^_sh_status_process/ or next; + + $drbd{$minor}{name} = $name; + if (defined $conf_vnr) { + # >= 8.4, append /volume to resource name. + # If both are present, they should be the same. But + # just in case, prefer the kernel volume number, if it + # is present and positive. Else, use the volume number + # from the config. + $drbd{$minor}{name} .= defined $vnr ? "/$vnr" : "/$conf_vnr"; + } + $minor_of_name{$name} = $minor; + $drbd{$minor}{ll_dev} = defined($ll_dev) ? $ll_minor : $ll_res + if $ll_res; } # fix up hack for git versions 8.3.1 > x > 8.3.0: @@ -103,6 +122,7 @@ }; } close PD; + for (values %drbd) { $_->{state} ||= "Unconfigured . . . ."; } } # sets $drbd{minor}->{pv_info} @@ -196,16 +216,20 @@ # parent $_ = ; close(V) or warn "virsh dumpxml exit code: $?\n"; - while (m{]*>\s* - \s* - ]*>.*}gs) { + m{} or next; + my $dev = $1; + if ($dev !~ /^\d+$/) { + my @stat = stat("/dev/drbd$dev") or next; + $dev = $stat[6] & 0xff; + } + m{ $info{$dom}->{state} eq 'running' ? "\*$dom" : "_$dom", - vdev => $2, - bus => $3, + vdev => $1, + bus => $2, }; } } @@ -256,7 +280,7 @@ $out[$line] = [ sprintf("%3u:%s", $m, $t->{name} || "??not-found??"), - $t->{ll_dev} ? "^^$t->{ll_dev}" : "", + defined($t->{ll_dev}) ? 
"^^$t->{ll_dev}" : "", split(/\s+/, $t->{state}), @used_by ]; diff -Nru drbd8-8.3.7/scripts/drbd.conf.example drbd8-8.4.1+git55a81dc~cmd1/scripts/drbd.conf.example --- drbd8-8.3.7/scripts/drbd.conf.example 2010-01-07 09:09:34.000000000 +0000 +++ drbd8-8.4.1+git55a81dc~cmd1/scripts/drbd.conf.example 2012-02-02 14:09:14.000000000 +0000 @@ -1,624 +1,170 @@ -# -# drbd.conf example -# -# parameters you _need_ to change are the hostname, device, disk, -# meta-disk, address and port in the "on {}" sections. -# -# you ought to know about the protocol, and the various timeouts. -# -# you probably want to set the rate in the syncer sections - -# -# NOTE common pitfall: -# rate is given in units of _byte_ not bit -# - -# -# increase timeout and maybe ping-int in net{}, if you see -# problems with "connection lost/connection established" -# (or change your setup to reduce network latency; make sure full -# duplex behaves as such; check average roundtrip times while -# network is saturated; and so on ...) -# - -skip { - As you can see, you can also comment chunks of text - with a 'skip[optional nonsense]{ skipped text }' section. - This comes in handy, if you just want to comment out - some 'resource {...}' section: - just precede it with 'skip'. - - The basic format of option assignment is -