Mailing List Archive

Core Dump When Sending to Other Node That's Resetting
Hi,
I wrote a wrapper using hbclient api for an application that manages the redundancy of our system. The application uses the wrapper to send/receive messages (string) between the primary and secondary.
In our reset and switch-over testing, the send path occasionally dumps core with a double free reported in libc, and I do not know whether it is caused by my wrapper around the hbclient API.


/lib/libc.so.6[0xf7d71629]
/lib/libc.so.6(cfree+0x59)[0xf7d719e9]
/usr/lib/libplumb.so.2[0xf7e88dcf]
/usr/lib/libplumb.so.2[0xf7e9a03e]
/usr/lib/libplumb.so.2[0xf7e9a1a4]
/usr/lib/libplumb.so.2[0xf7e9922f]
/usr/lib/libplumb.so.2(msg2ipcchan+0xb8)[0xf7e891ea]
/usr/lib/libhbclient.so.1[0xf7e6a736]
/usr/lib/libha_lib.so(hb_send+0x204)[0xf7e61e15] ---> my wrapper

I use send_ordered_nodemsg() to send and readmsg() to read (based on api_test.c). However in sample codes of ipfail or drbd, I saw the setting up of IPChannel and usage of msg2ipcchan(). Which is more appropriate?


I'd also like to know whether I should add more code to handle node status changes, because the crashes always occur while the other node is resetting.


Snippet of my codes:

1. Initialization:
if (mhm_hb->llc_ops->signon(mhm_hb, "ping")!= HA_OK) {    // I pasted the common "ping",
                                                                                                                    //  plan to change to different name
    cl_log(LOG_ERR, "Cannot sign on with heartbeat");
...

2. Send:
/*
 * hb_send - send the first `sz` bytes of `buf` to the peer node as a
 * T_MHM_MSG heartbeat message.
 *
 * hb   : signed-on heartbeat client handle; NULL yields HA_FAIL.
 * dest : currently unused - the message is always sent to `peer_name`.
 * buf  : data to send; NULL yields HA_FAIL.
 * sz   : number of bytes of `buf` to send.
 *
 * The data is carried in a string field, so it is copied into a
 * NUL-terminated scratch buffer first; anything after an embedded NUL byte
 * in `buf` is presumably not transmitted (TODO confirm against ha_msg_add).
 *
 * Returns `sz` (converted to int) on success, HA_FAIL on any failure.
 * The message and the scratch buffer are released on every path.
 */
int hb_send(ll_cluster_t *hb, char *dest, void *buf, size_t sz)
{
  HA_Message *msg = NULL;
  char *payload = NULL;
  int rc = HA_FAIL;

  (void)dest;    /* kept for interface compatibility; see header comment */

  if (hb == NULL || buf == NULL) {
    return HA_FAIL;
  }

  msg = ha_msg_new(0);
  if (msg == NULL) {
    /* Previously a failed allocation was handed straight to ha_msg_add(). */
    cl_log(LOG_ERR, "hb_send: cannot allocate message\n");
    return HA_FAIL;
  }

  if (ha_msg_add(msg, F_TYPE, T_MHM_MSG) != HA_OK) {
    cl_log(LOG_ERR, "hb_send: cannot add field TYPE\n");
    goto out;
  }
  if (ha_msg_add(msg, F_ORIG, node_name) != HA_OK) {
    cl_log(LOG_ERR, "hb_send: cannot add field ORIG\n");
    goto out;
  }

  /* One extra byte so the copy is always NUL-terminated. */
  payload = malloc(sz + 1);
  if (payload == NULL) {
    goto out;
  }
  memcpy(payload, buf, sz);
  payload[sz] = '\0';

  if (ha_msg_add(msg, F_MHM_PAYLOAD, payload) != HA_OK) {
    cl_log(LOG_ERR, "hb_send: cannot add field PAYLOAD\n");
    goto out;
  }

  if (hb->llc_ops->send_ordered_nodemsg(hb, msg, peer_name) == HA_OK) {
    rc = (int)sz;
  }

out:
  /*
   * ha_msg_add() keeps its own copy of the value, so the scratch buffer
   * is still ours to release; the original code leaked it on every call.
   */
  free(payload);
  ZAPMSG(msg);
  return rc;
}

3. Receive:
/*
 * hb_recv - wait for the next T_MHM_MSG heartbeat message and copy its
 * payload string into `buf`.
 *
 * hb  : signed-on heartbeat client handle; NULL yields HA_FAIL.
 * buf : destination buffer of `sz` bytes; NULL yields HA_FAIL.
 *       It is zero-filled first, so the result is NUL-terminated whenever
 *       the payload is shorter than `sz`. A payload of exactly `sz` bytes
 *       is still accepted (as before) but leaves no room for a terminator.
 * sz  : size of `buf` in bytes.
 *
 * Messages of any other type are logged, released and skipped.
 *
 * Returns the payload length on success, HA_FAIL when the payload is
 * missing or does not fit in `buf`, and 0 when readmsg() returns NULL.
 */
int hb_recv(ll_cluster_t *hb, void *buf, size_t sz)
{
    int msgcount = 0;
    HA_Message *reply;

    if (hb == NULL || buf == NULL) {
        return HA_FAIL;
    }
    memset(buf, 0, sz);

    /* readmsg(hb, 1): blocking receive. */
    while ((reply = hb->llc_ops->readmsg(hb, 1)) != NULL) {
        const char *type;
        const char *orig;
        const char *payload;
        size_t p_sz;

        ++msgcount;
        if ((type = ha_msg_value(reply, F_TYPE)) == NULL) {
            type = "?";
        }
        if ((orig = ha_msg_value(reply, F_ORIG)) == NULL) {
            orig = "?";
        }
        cl_log(LOG_DEBUG, "Got message %d of type [%s] from [%s]"
        ,    msgcount, type, orig);

        if (strcmp(type, T_MHM_MSG) != 0) {
            /*
             * Not one of ours. The message is released here, as in the
             * original code. NOTE(review): whether such messages should
             * be left for someone else is the author's open question.
             */
            ZAPMSG(reply);
            continue;
        }

        payload = ha_msg_value(reply, F_MHM_PAYLOAD);
        if (payload == NULL) {
            /* Guard: strlen(NULL) below would be undefined behaviour. */
            cl_log(LOG_ERR, "Message of type [%s] has no payload field", type);
            ZAPMSG(reply);
            return HA_FAIL;
        }

        /* Lengths stay size_t so the comparison with sz is unsigned. */
        p_sz = strlen(payload);
        cl_log(LOG_DEBUG, "payload %s sz %lu p_sz %lu\n", payload,
               (unsigned long)sz, (unsigned long)p_sz);

        if (p_sz > sz) {
            cl_log(LOG_ERR, "Receive buffer %lu too small for payload %lu",
                   (unsigned long)sz, (unsigned long)p_sz);
            ZAPMSG(reply);
            return HA_FAIL;
        }

        memcpy(buf, payload, p_sz);
        /*
         * Print with an explicit length: when p_sz == sz the buffer has
         * no terminator and a plain "%s" / strlen(buf) would overrun it.
         */
        cl_log(LOG_DEBUG, "return buf %.*s sz %lu ret_val %lu", (int)p_sz,
               (const char *)buf, (unsigned long)p_sz, (unsigned long)p_sz);
        ZAPMSG(reply);
        return (int)p_sz;
    }

    /* The loop only exits when readmsg() returned NULL. */
    cl_log(LOG_ERR, "read_hb_msg returned NULL");
    cl_log(LOG_ERR, "REASON: %s", hb->llc_ops->errmsg(hb));
    return 0;
}


Thanks,
Phong
Re: Core Dump When Sending to Other Node That's Resetting [ In reply to ]
Have you tried running this under valgrind?

On 04/13/2012 05:22 PM, Nguyen Dinh Phong wrote:
> Hi,
> I wrote a wrapper using hbclient api for an application that manages
> the redundancy of our system. The application uses the wrapper to
> send/receive messages (string) between the primary and secondary.
> In our testing of reset and switch over, once in a while, there is
> core dump in the send with double free in libc, that I do not know if
> caused by my wrapper of hbclient api.
>
> /lib/libc.so.6[0xf7d71629]
> /lib/libc.so.6(cfree+0x59)[0xf7d719e9]
> /usr/lib/libplumb.so.2[0xf7e88dcf]
> /usr/lib/libplumb.so.2[0xf7e9a03e]
> /usr/lib/libplumb.so.2[0xf7e9a1a4]
> /usr/lib/libplumb.so.2[0xf7e9922f]
> /usr/lib/libplumb.so.2(msg2ipcchan+0xb8)[0xf7e891ea]
> /usr/lib/libhbclient.so.1[0xf7e6a736]
> /usr/lib/libha_lib.so(hb_send+0x204)[0xf7e61e15] ---> my wrapper
>
> I use send_ordered_nodemsg() to send and readmsg() to read (based on
> api_test.c). However in sample codes of ipfail or drbd, I saw the
> setting up of IPChannel and usage of msg2ipcchan(). Which is more
> appropriate?
>
> I'd also like to know if I should add more codes to handle node status
> change because the crashes always occur when the other node go reset.
>
> Snippet of my codes:
>
> 1. Initialization:
> if (mhm_hb->llc_ops->signon(mhm_hb, "ping")!= HA_OK) { // I pasted the
> common "ping",
> // plan to change to different name
> cl_log(LOG_ERR, "Cannot sign on with heartbeat");
> ...
>
> 2. Send:
> int hb_send(ll_cluster_t *hb, char *dest, void *buf, size_t sz)
> {
> HA_Message *msg;
> if (hb==NULL) return HA_FAIL;
> msg = ha_msg_new(0);
> if (ha_msg_add(msg, F_TYPE, T_MHM_MSG) != HA_OK) {
> cl_log(LOG_ERR, "hb_send: cannot add field TYPE\n");
> ZAPMSG(msg);
> return HA_FAIL;
> }
> if (ha_msg_add(msg, F_ORIG, node_name) != HA_OK) {
> cl_log(LOG_ERR, "hb_send: cannot add field ORIG\n");
> ZAPMSG(msg);
> return HA_FAIL;
> }
> char *payload = malloc(sz+1);
> if (payload==NULL) {
> ZAPMSG(msg);
> return HA_FAIL;
> }
> memset(payload, 0, sz+1);// Add a Null byte at the end
> memcpy(payload, buf, sz);
> if (ha_msg_add(msg, F_MHM_PAYLOAD, payload) != HA_OK) {
> cl_log(LOG_ERR, "hb_send: cannot add field PAYLOAD\n");
> ZAPMSG(msg);
> return HA_FAIL;
> }
> if (hb->llc_ops->send_ordered_nodemsg(hb, msg, peer_name) != HA_OK) {
> ZAPMSG(msg);
> return HA_FAIL;
> }
> else {
> ZAPMSG(msg);
> return sz;
> }
> }
>
> 3. Receive:
> int hb_recv(ll_cluster_t *hb, void *buf, size_t sz)
> {
> int msgcount=0;
> HA_Message *reply;
>
> if (hb==NULL) return HA_FAIL;
> memset(buf, 0, sz);
> for(; (reply=hb->llc_ops->readmsg(hb, 1)) != NULL;) { ---->
> Blocking receiving
> const char * type;
> const char * orig;
> const char *payload;
> ++msgcount;
> if ((type = ha_msg_value(reply, F_TYPE)) == NULL) {
> type = "?";
> }
> if ((orig = ha_msg_value(reply, F_ORIG)) == NULL) {
> orig = "?";
> }
> cl_log(LOG_DEBUG, "Got message %d of type [%s] from [%s]"
> , msgcount, type, orig);
> if (strcmp(type, T_MHM_MSG) == 0) {
> payload = ha_msg_value(reply, F_MHM_PAYLOAD);
>
> int p_sz = strlen(payload);
> cl_log(LOG_DEBUG, "payload %s sz %d p_sz %d\n", payload, sz,
> p_sz);
>
> if (p_sz <= sz) {
> char *tmp = (char*) buf;
> strncpy(tmp, payload, p_sz);
> cl_log(LOG_DEBUG, "return buf %s sz %d ret_val %d", buf,
> strlen(buf), p_sz);
> ZAPMSG(reply);
> return(p_sz);
> } else {
> cl_log(LOG_ERR, "Receive buffer %d too small for payload
> %d", sz, p_sz);
> ZAPMSG(reply);
> return HA_FAIL;
> }
> }
> ZAPMSG(reply); //// ---> Could we delete message that's not
> meant to our module, or should we let it go?
> }
> if (reply==NULL) {
> cl_log(LOG_ERR, "read_hb_msg returned NULL");
> cl_log(LOG_ERR, "REASON: %s", hb->llc_ops->errmsg(hb));
> }
> return 0;
> }
>
> Thanks,
> Phong
>
>
>
> _______________________________________________________
> Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org
> http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
> Home Page: http://linux-ha.org/


--
Alan Robertson<alanr@unix.sh> - @OSSAlanR

"Openness is the foundation and preservative of friendship... Let me claim from you at all times your undisguised opinions." - William Wilberforce
Re: Core Dump When Sending to Other Node That's Resetting [ In reply to ]
Hi Alan,
I haven't run this with valgrind yet.
On the other hand, I saw some code in send_ha_message() in crm that checks whether the channel is connected before sending. I'm testing to see whether this helps.
 
Thanks,
Phong


________________________________
From: Alan Robertson <alanr@unix.sh>
To: Nguyen Dinh Phong <ndphong@yahoo.com>; High-Availability Linux Development List <linux-ha-dev@lists.linux-ha.org>
Sent: Wednesday, April 18, 2012 10:13 PM
Subject: Re: [Linux-ha-dev] Core Dump When Sending to Other Node That's Resetting


Have you tried running this under valgrind?

On 04/13/2012 05:22 PM, Nguyen Dinh Phong wrote:
Hi,
>I wrote a wrapper using hbclient api for an application that manages the redundancy of our system. The application uses the wrapper to send/receive messages (string) between the primary and secondary.
>In our testing of reset and switch over, once in a while, there is core dump in the send with double free in libc, that I do not know if caused by my wrapper of hbclient api.
>
>
>
>/lib/libc.so.6[0xf7d71629]
>/lib/libc.so.6(cfree+0x59)[0xf7d719e9]
>/usr/lib/libplumb.so.2[0xf7e88dcf]
>/usr/lib/libplumb.so.2[0xf7e9a03e]
>/usr/lib/libplumb.so.2[0xf7e9a1a4]
>/usr/lib/libplumb.so.2[0xf7e9922f]
>/usr/lib/libplumb.so.2(msg2ipcchan+0xb8)[0xf7e891ea]
>/usr/lib/libhbclient.so.1[0xf7e6a736]
>/usr/lib/libha_lib.so(hb_send+0x204)[0xf7e61e15] ---> my wrapper
>
>
>I use send_ordered_nodemsg() to send and readmsg() to read (based on api_test.c). However in sample codes of ipfail or drbd, I saw the setting up of IPChannel and usage of msg2ipcchan(). Which is more appropriate?
>
>
>
>I'd also like to know if I should add more codes to handle node status change because the crashes always occur when the other node go reset.
>
>
>Snippet of my codes:
>
>1. Initialization:
>if (mhm_hb->llc_ops->signon(mhm_hb, "ping")!= HA_OK) {    // I pasted the common "ping",
>                                                                                                                    //  plan to change to different name
>    cl_log(LOG_ERR, "Cannot sign on with heartbeat");
>...
>
>2. Send:
>int hb_send(ll_cluster_t *hb, char *dest, void *buf, size_t
sz)
>{
>  HA_Message *msg;
>  if (hb==NULL) return HA_FAIL;
>  msg = ha_msg_new(0);
>  if (ha_msg_add(msg, F_TYPE, T_MHM_MSG) != HA_OK) {
>    cl_log(LOG_ERR, "hb_send: cannot add field TYPE\n");
>    ZAPMSG(msg);
>    return HA_FAIL;
>  }
>  if (ha_msg_add(msg, F_ORIG, node_name) != HA_OK) {
>    cl_log(LOG_ERR, "hb_send: cannot add field ORIG\n");
>    ZAPMSG(msg);
>    return HA_FAIL;
>  }
>  char *payload = malloc(sz+1);
>  if (payload==NULL) {
>    ZAPMSG(msg);
>    return HA_FAIL;
>  }
>  memset(payload, 0, sz+1);    // Add a Null byte at the end
>  memcpy(payload, buf, sz);
>  if (ha_msg_add(msg, F_MHM_PAYLOAD, payload) != HA_OK) {
>    cl_log(LOG_ERR, "hb_send: cannot add field PAYLOAD\n");
>    ZAPMSG(msg);
>    return HA_FAIL;
>  }
>  if (hb->llc_ops->send_ordered_nodemsg(hb, msg,
peer_name) != HA_OK) {
>    ZAPMSG(msg);
>    return HA_FAIL;
>  }
>  else {
>    ZAPMSG(msg);
>    return sz;
>  }
>}
>
>3. Receive:
>int hb_recv(ll_cluster_t *hb, void *buf, size_t sz)
>{
>    int msgcount=0;
>    HA_Message *reply;
>
>    if (hb==NULL) return HA_FAIL;
>    memset(buf, 0, sz);
>    for(; (reply=hb->llc_ops->readmsg(hb, 1)) !=
NULL;) { ----> Blocking receiving
>        const char *    type;
>        const char *    orig;
>        const char *payload;
>        ++msgcount;
>        if ((type = ha_msg_value(reply, F_TYPE)) == NULL) {
>            type = "?";
>        }
>        if ((orig = ha_msg_value(reply, F_ORIG)) == NULL) {
>            orig = "?";
>        }
>        cl_log(LOG_DEBUG, "Got message %d of type [%s] from
[%s]"
>        ,    msgcount, type, orig);
>        if (strcmp(type, T_MHM_MSG) == 0) {
>          payload = ha_msg_value(reply, F_MHM_PAYLOAD);
>         
>          int p_sz = strlen(payload);
>          cl_log(LOG_DEBUG, "payload %s sz %d p_sz %d\n",
payload, sz, p_sz);
>                   
>          if (p_sz <= sz) {
>            char *tmp = (char*) buf;
>            strncpy(tmp, payload, p_sz);
>            cl_log(LOG_DEBUG, "return buf %s sz %d ret_val
%d", buf, strlen(buf), p_sz);
>            ZAPMSG(reply);
>            return(p_sz);
>          } else {
>            cl_log(LOG_ERR, "Receive buffer %d too small for
payload %d", sz, p_sz);
>            ZAPMSG(reply);
>            return HA_FAIL;
>          }
>        }
>        ZAPMSG(reply);    //// ---> Could we delete message that's not meant to our module, or should we let it go?
>    }
>    if (reply==NULL) {
>      cl_log(LOG_ERR, "read_hb_msg returned NULL");
>      cl_log(LOG_ERR, "REASON: %s",
hb->llc_ops->errmsg(hb));
>    }
>    return 0;
>}
>
>
>Thanks,
>Phong
>
>
>
>
>_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/


-- Alan Robertson <alanr@unix.sh> - @OSSAlanR "Openness is the foundation and preservative of friendship... Let me claim from you at all times your undisguised opinions." - William Wilberforce