Hi!
By chance I noticed that every node of my 5-node test cluster had at least one corosync core dump. Unfortunately, they even seem to have different signatures. I can provide rough backtraces to get you warmed up for the fix ;-)
-rw------- 1 root root 124416000 Aug 10 13:42 /var/lib/corosync/core
-rw------- 1 root root 178900992 Aug 16 10:16 /var/lib/corosync/core
-rw------- 1 root root 124203008 Aug 13 09:28 /var/lib/corosync/core
-rw------- 1 root root 124063744 Aug 13 09:28 /var/lib/corosync/core
-rw------- 1 root root 166428672 Aug 22 10:31 /var/lib/corosync/core
As you can see, the core dumps did not all happen at the same time; it seems they had different causes.
(gdb) bt
#0 0x00007f57174acb55 in raise () from /lib64/libc.so.6
#1 0x00007f57174ae131 in abort () from /lib64/libc.so.6
#2 0x00007f57174a5a10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f571823afef in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f5718240fee in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007f571824167c in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007f571823801c in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007f57182389a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#8 0x00007f5718232120 in ?? () from /usr/lib64/libtotem_pg.so.4
#9 0x00007f571822e898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#10 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007f44218e5b55 in raise () from /lib64/libc.so.6
#1 0x00007f44218e7131 in abort () from /lib64/libc.so.6
#2 0x00007f44218dea10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f442267581b in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f442266794a in poll_run () from /usr/lib64/libtotem_pg.so.4
#5 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007f70e4e7db55 in raise () from /lib64/libc.so.6
#1 0x00007f70e4e7f131 in abort () from /lib64/libc.so.6
#2 0x00007f70e4e76a10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f70e5c0bfef in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f70e5c11fee in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007f70e5c1267c in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007f70e5c0901c in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007f70e5c099a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#8 0x00007f70e5c03120 in ?? () from /usr/lib64/libtotem_pg.so.4
#9 0x00007f70e5bff898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#10 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007f1100e83b55 in raise () from /lib64/libc.so.6
#1 0x00007f1100e85131 in abort () from /lib64/libc.so.6
#2 0x00007f1100e7ca10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f1101c11fef in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f1101c17fee in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007f1101c1867c in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007f1101c0f01c in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007f1101c0f9a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#8 0x00007f1101c09120 in ?? () from /usr/lib64/libtotem_pg.so.4
#9 0x00007f1101c05898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#10 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007fc7f18a85e9 in memcpy () from /lib64/libc.so.6
#1 0x00007fc7f21c5620 in coroipcs_response_send () from /usr/lib64/libcoroipcs.so.4
#2 0x00007fc7ea1a58e5 in ?? () from /usr/lib64/lcrso/service_ckpt.lcrso
#3 0x00000000004068a0 in ?? ()
#4 0x00007fc7f25ee6f7 in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007fc7f25e6b2d in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007fc7f25eb7f0 in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007fc7f25e301c in ?? () from /usr/lib64/libtotem_pg.so.4
#8 0x00007fc7f25e39a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#9 0x00007fc7f25dd120 in ?? () from /usr/lib64/libtotem_pg.so.4
#10 0x00007fc7f25d9898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#11 0x0000000000407d2f in main ()
Regards,
Ulrich
_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/
By chance I noticed that every node of my 5-node test cluster had at least one corosync core dump. Unfortunately, they even seem to have different signatures. I can provide rough backtraces to get you warmed up for the fix ;-)
-rw------- 1 root root 124416000 Aug 10 13:42 /var/lib/corosync/core
-rw------- 1 root root 178900992 Aug 16 10:16 /var/lib/corosync/core
-rw------- 1 root root 124203008 Aug 13 09:28 /var/lib/corosync/core
-rw------- 1 root root 124063744 Aug 13 09:28 /var/lib/corosync/core
-rw------- 1 root root 166428672 Aug 22 10:31 /var/lib/corosync/core
As you can see, the core dumps did not all happen at the same time; it seems they had different causes.
(gdb) bt
#0 0x00007f57174acb55 in raise () from /lib64/libc.so.6
#1 0x00007f57174ae131 in abort () from /lib64/libc.so.6
#2 0x00007f57174a5a10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f571823afef in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f5718240fee in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007f571824167c in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007f571823801c in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007f57182389a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#8 0x00007f5718232120 in ?? () from /usr/lib64/libtotem_pg.so.4
#9 0x00007f571822e898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#10 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007f44218e5b55 in raise () from /lib64/libc.so.6
#1 0x00007f44218e7131 in abort () from /lib64/libc.so.6
#2 0x00007f44218dea10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f442267581b in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f442266794a in poll_run () from /usr/lib64/libtotem_pg.so.4
#5 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007f70e4e7db55 in raise () from /lib64/libc.so.6
#1 0x00007f70e4e7f131 in abort () from /lib64/libc.so.6
#2 0x00007f70e4e76a10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f70e5c0bfef in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f70e5c11fee in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007f70e5c1267c in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007f70e5c0901c in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007f70e5c099a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#8 0x00007f70e5c03120 in ?? () from /usr/lib64/libtotem_pg.so.4
#9 0x00007f70e5bff898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#10 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007f1100e83b55 in raise () from /lib64/libc.so.6
#1 0x00007f1100e85131 in abort () from /lib64/libc.so.6
#2 0x00007f1100e7ca10 in __assert_fail () from /lib64/libc.so.6
#3 0x00007f1101c11fef in ?? () from /usr/lib64/libtotem_pg.so.4
#4 0x00007f1101c17fee in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007f1101c1867c in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007f1101c0f01c in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007f1101c0f9a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#8 0x00007f1101c09120 in ?? () from /usr/lib64/libtotem_pg.so.4
#9 0x00007f1101c05898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#10 0x0000000000407d2f in main ()
(gdb) bt
#0 0x00007fc7f18a85e9 in memcpy () from /lib64/libc.so.6
#1 0x00007fc7f21c5620 in coroipcs_response_send () from /usr/lib64/libcoroipcs.so.4
#2 0x00007fc7ea1a58e5 in ?? () from /usr/lib64/lcrso/service_ckpt.lcrso
#3 0x00000000004068a0 in ?? ()
#4 0x00007fc7f25ee6f7 in ?? () from /usr/lib64/libtotem_pg.so.4
#5 0x00007fc7f25e6b2d in ?? () from /usr/lib64/libtotem_pg.so.4
#6 0x00007fc7f25eb7f0 in ?? () from /usr/lib64/libtotem_pg.so.4
#7 0x00007fc7f25e301c in ?? () from /usr/lib64/libtotem_pg.so.4
#8 0x00007fc7f25e39a3 in rrp_deliver_fn () from /usr/lib64/libtotem_pg.so.4
#9 0x00007fc7f25dd120 in ?? () from /usr/lib64/libtotem_pg.so.4
#10 0x00007fc7f25d9898 in poll_run () from /usr/lib64/libtotem_pg.so.4
#11 0x0000000000407d2f in main ()
Regards,
Ulrich
_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/