rsciw |
03-16-2012 06:21 PM |
Trouble with pacemaker, drbd and nfs
Ahoi,
trying for a while to get this stuff up and running but somehow I appear to be missing stuff...
pacemaker will not, for the life of it, start drbd, and therefore NFS will fail too...
perhaps I just need another set of eyes having a look at this, as mine are falling asleep (11:15 pm and still in office ;) )
this is what I get from
# crm verify -L -V
Code:
crm_verify[11776]: 2012/03/16_23:17:12 WARN: unpack_rsc_op: Processing failed op fs_drbd_start_0 on wkse13p1xynfs01: unknown error (1)
crm_verify[11776]: 2012/03/16_23:17:12 WARN: unpack_rsc_op: Processing failed op nfs_share_start_0 on wkse13p1xynfs01: unknown error (1)
crm_verify[11776]: 2012/03/16_23:17:12 WARN: common_apply_stickiness: Forcing fs_drbd away from wkse13p1xynfs02 after 1000000 failures (max=1000000)
crm_verify[11776]: 2012/03/16_23:17:12 WARN: common_apply_stickiness: Forcing fs_drbd away from wkse13p1xynfs01 after 1000000 failures (max=1000000)
crm_verify[11776]: 2012/03/16_23:17:12 WARN: common_apply_stickiness: Forcing nfs_share away from wkse13p1xynfs01 after 1000000 failures (max=1000000)
crm_verify[11776]: 2012/03/16_23:17:12 ERROR: clone_rsc_order_lh_non_clone: Unknown task: fs_drbd_promote_0
crm_verify[11776]: 2012/03/16_23:17:12 ERROR: clone_rsc_order_rh_non_clone: Unknown action: fs_drbd_demote_0
the pacemaker conf:
Code:
node $id="b10a7cb5-2d73-4bf6-a8ce-301bf0a61d62" wkse13p1xynfs01
node $id="c594c1ea-d70d-464a-921e-b0aba4f455a6" wkse13p1xynfs02
primitive clusterIP ocf:heartbeat:IPaddr2 \
params ip="10.26.29.237" nic="eth0:2" \
op monitor interval="5s"
primitive clusterIParp ocf:heartbeat:SendArp \
params ip="10.26.29.237" nic="eth0:2"
primitive drbd_disk ocf:linbit:drbd \
params drbd_resource="nfs" \
op monitor interval="15s" \
op start interval="0" timeout="240" \
op stop interval="0" timeout="240"
primitive fs_drbd ocf:heartbeat:Filesystem \
params device="/dev/drbd0" directory="/drbd" fstype="ext3" \
op start interval="0" timeout="240" \
op stop interval="0" timeout="240" \
meta target-role="Started"
primitive nfs_share ocf:heartbeat:nfsserver \
params nfs_ip="10.26.29.237" nfs_init_script="/etc/init.d/nfs" nfs_shared_infodir="/var/lib/nfs" nfs_notify_cmd="/sbin/rpc.statd" \
op start interval="0" timeout="240" \
op stop interval="0" timeout="240"
group IP clusterIP clusterIParp \
meta target-role="Started"
group Misc nfs_share \
meta target-role="Started"
ms ms_drbd drbd_disk \
meta master-max="1" master-node-max="1" clone-max="1" clone-node-max="1" notify="true"
colocation drbd-with-IP inf: ms_drbd:Master IP
colocation mnt_on_master inf: fs_drbd ms_drbd:Master
colocation nfs_on_master inf: Misc ms_drbd:Master
order ip-before-arp inf: clusterIP:start clusterIParp:start
order ip-before-drbd inf: clusterIP:start ms_drbd:promote
order mount-after-drbd inf: ms_drbd:start fs_drbd:promote
property $id="cib-bootstrap-options" \
dc-version="1.0.12-unknown" \
cluster-infrastructure="Heartbeat" \
expected-quorum-votes="1" \
stonith-enabled="false" \
no-quorum-policy="ignore"
drbd conf
Code:
/etc/drbd.conf
#
# please have a a look at the example configuration file in
# /usr/share/doc/drbd83/drbd.conf
#
#global {
# usage-count no;
#}
#common {
# protocol C;
#}
include "/etc/drbd.d/global_common.conf";
include "/etc/drbd.d/*.res";
/etc/drbd.d/global_common.conf
global {
usage-count no;
# minor-count dialog-refresh disable-ip-verification
}
common {
protocol C;
handlers {
pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
# fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
# split-brain "/usr/lib/drbd/notify-split-brain.sh root";
# out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
# before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
# after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
}
startup { degr-wfc-timeout 120; }
disk {
# on-io-error fencing use-bmbv no-disk-barrier no-disk-flushes
# no-disk-drain no-md-flushes max-bio-bvecs
}
net {
# sndbuf-size rcvbuf-size timeout connect-int ping-int ping-timeout max-buffers
# max-epoch-size ko-count allow-two-primaries cram-hmac-alg shared-secret
# after-sb-0pri after-sb-1pri after-sb-2pri data-integrity-alg no-tcp-cork
}
syncer {
# rate after al-extents use-rle cpu-mask verify-alg csums-alg
}
}
/etc/drbd.d/nfs.res
resource nfs {
handlers {
split-brain "/usr/lib/drbd/notify-split-brain.sh root";
}
device /dev/drbd0;
disk /dev/sdb;
meta-disk internal;
syncer {
rate 100M;
}
on wkse13p1xynfs01 {
address 10.26.29.238:7790;
}
on wkse13p1xynfs02 {
address 10.26.29.239:7790;
}
net {
cram-hmac-alg sha1;
shared-secret "thisis4lulz53cr37b3ingv!s!bl34nd411";
after-sb-0pri discard-least-changes;
after-sb-1pri consensus;
after-sb-2pri disconnect;
}
disk {
on-io-error detach;
}
}
package versions:
Code:
resource-agents-1.0.4-1.1.el5
heartbeat-libs-3.0.3-2.el5
heartbeat-3.0.3-2.el5
drbd83-8.3.12-2.el5.centos
cluster-glue-libs-1.0.6-1.6.el5
cluster-glue-1.0.6-1.6.el5
pacemaker-libs-1.0.12-1.el5.centos
pacemaker-1.0.12-1.el5.centos
kmod-drbd83-8.3.12-1.el5.centos
so yeah, if anyone sees anything which I'm too blind to see right now, I would appreciate it if you'd post it here :)
Cheers
|