diff -r 76f8bfbd73ed src/io/bmi/bmi_ib/ib.c
--- a/src/io/bmi/bmi_ib/ib.c	Tue Mar 11 21:38:09 2008 -0500
+++ b/src/io/bmi/bmi_ib/ib.c	Wed Mar 12 20:59:13 2008 -0500
@@ -85,8 +85,6 @@ static int ib_tcp_client_connect(ib_meth
 static int ib_tcp_client_connect(ib_method_addr_t *ibmap,
                                  struct bmi_method_addr *remote_map);
 #endif
-static int ib_tcp_server_check_new_connections(void);
-static int ib_block_for_activity(int timeout_ms);
 
 /*
  * Return string form of work completion opcode field.
@@ -109,7 +107,7 @@ static const char *wc_opcode_string(int 
  * walk the incomingq looking for things to do to them.  Returns
  * number of new things that arrived.
  */
-static int ib_check_cq(void)
+int ib_check_cq(void)
 {
     int ret = 0;
 
@@ -1444,7 +1442,7 @@ BMI_ib_close_context(bmi_context_id cont
  * Can't just call test since we don't want to reap the operation,
  * just make sure it's done or not.
  */
-static int
+int
 BMI_ib_cancel(bmi_op_id_t id, bmi_context_id context_id __unused)
 {
     struct method_op *mop;
@@ -1852,7 +1850,7 @@ out_unlock:
  * testunexpected will pick up new connections.  Returns ==1 if IB device is
  * ready, other >0 for some activity, else 0.
  */
-static int ib_block_for_activity(int timeout_ms)
+int ib_block_for_activity(int timeout_ms)
 {
     struct pollfd pfd[3];  /* cq fd, async fd, accept socket */
     int numfd;
diff -r 76f8bfbd73ed src/io/bmi/bmi_ib/ib.h
--- a/src/io/bmi/bmi_ib/ib.h	Tue Mar 11 21:38:09 2008 -0500
+++ b/src/io/bmi/bmi_ib/ib.h	Wed Mar 12 20:59:13 2008 -0500
@@ -25,6 +25,9 @@
 /* 20 8kB buffers allocated to each connection for unexpected messages */
 #define DEFAULT_EAGER_BUF_NUM  (20)
 #define DEFAULT_EAGER_BUF_SIZE (8 << 10)
+
+int ib_check_cq(void);
+int ib_block_for_activity(int timeout_ms);
 
 struct buf_head;
 
@@ -55,6 +58,7 @@ typedef struct {
 
     int send_credit;    /* free slots on receiver */
     int return_credit;  /* receive buffers he filled but that we've emptied */
+    int wr_credit;	/* make sure we don't overflow available wr in hca */
 
     void *priv;
 
@@ -454,7 +458,7 @@ void memcache_cache_flush(void *md);
  * Debugging macros.
  */
 #if 1
-#define DEBUG_LEVEL 2
+#define DEBUG_LEVEL 1
 #define debug(lvl,fmt,args...) \
     do { \
 	if (lvl <= DEBUG_LEVEL) \
diff -r 76f8bfbd73ed src/io/bmi/bmi_ib/openib.c
--- a/src/io/bmi/bmi_ib/openib.c	Tue Mar 11 21:38:09 2008 -0500
+++ b/src/io/bmi/bmi_ib/openib.c	Wed Mar 12 20:59:13 2008 -0500
@@ -38,6 +38,7 @@ struct openib_device_priv {
     int nic_max_sge;
     int nic_max_wr;
 
+    int nic_wr_credit;	/* credit used to prevent wq overflows */
     /*
      * Temp array for filling scatter/gather lists to pass to IB functions,
      * allocated once at start to max size defined as reported by the qp.
@@ -157,6 +158,10 @@ static int openib_new_connection(ib_conn
 	error_errno("%s: create QP", __func__);
     VALGRIND_MAKE_MEM_DEFINED(&att, sizeof(att));
     VALGRIND_MAKE_MEM_DEFINED(&oc->qp->qp_num, sizeof(oc->qp->qp_num));
+
+    /* Initialize nic_wr_credit to num_wr; this counter exists to prevent
+     * work-request (wr) overflow under heavy load on slower hardware. */
+    od->nic_wr_credit = num_wr;
 
     /* compare the caps that came back against what we already have */
     if (od->sg_max_len == 0) {
@@ -407,7 +412,7 @@ static void openib_post_sr(const struct 
     };
     struct ibv_send_wr *bad_wr;
 
-    debug(4, "%s: %s bh %d len %u wr %d/%d", __func__, c->peername, bh->num,
+    debug(1, "%s: %s bh %d len %u wr %d/%d", __func__, c->peername, bh->num,
           len, od->num_unsignaled_sends, od->max_unsignaled_sends);
 
     if (od->num_unsignaled_sends + 10 == od->max_unsignaled_sends)
@@ -505,6 +510,8 @@ static void openib_post_sr_rdmaw(struct 
     while (!done) {
         int ret;
         struct ibv_send_wr *bad_wr;
+	int retry_count = 0;
+	int max_retries = 100;
 
         if (recv_bytes_needed == 0) {
             /* new one, fresh numbers */
@@ -574,9 +581,34 @@ static void openib_post_sr_rdmaw(struct 
             sr.wr_id = 0;
             sr.send_flags = 0;
         }
+        
+	debug(1, "%s: ibv_post_send wr_id %llx to %s remote addr %llx rkey %x nic_wr_credit %d",
+              __func__, llu(sr.wr_id), c->peername, llu(sr.wr.rdma.remote_addr),
+              sr.wr.rdma.rkey, od->nic_wr_credit);
 
+retry_post:
         ret = ibv_post_send(oc->qp, &sr, &bad_wr);
+	++od->nic_wr_credit;
         if (ret < 0){
+		int activity;
+		int blocktime = 10;
+		warning("%s: post_send failed, ret: %d, wr_credit: %d, block for %d ms",
+			__func__, ret, od->nic_wr_credit, blocktime);
+retry_block:
+		activity = ib_block_for_activity(blocktime);
+		++retry_count;
+		if (activity){
+			ib_check_cq(); /* hopefully pop some work */
+			--od->nic_wr_credit; /* undo bogus increment */
+			debug(1, "%s: RETRY ibv_post_send wr_id %llx to %s remote addr %llx rkey %x nic_wr_credit %d",
+              			__func__, llu(sr.wr_id), c->peername, 
+				llu(sr.wr.rdma.remote_addr),
+              			sr.wr.rdma.rkey, od->nic_wr_credit);
+			goto retry_post;
+		} else if (retry_count < max_retries){
+			goto retry_block;
+		}	
+
 		gossip_err("%s: ibv_post_send failed ret: %d errno: %d\n",
 			__func__, ret, errno);
 		gossip_err(" wr_id: 0x%lx next: %p sg_list %p num_sge %d\n",
@@ -585,6 +617,8 @@ static void openib_post_sr_rdmaw(struct 
 			sr.opcode, sr.send_flags, sr.imm_data);
 		gossip_err(" sr.wr.rdma.remote_addr: 0x%llx rkey 0x%x\n",
 			llu(sr.wr.rdma.remote_addr), sr.wr.rdma.rkey);
+		gossip_err(" od->nic_wr_credit %d od->nic_max_wr %d\n",
+			od->nic_wr_credit, od->nic_max_wr);
 
 		struct ibv_qp_attr attr;
 		struct ibv_qp_init_attr init_attr;
@@ -624,9 +658,12 @@ static int openib_check_cq(struct bmi_ib
 	wc->opcode = BMI_IB_OP_SEND;
     else if (desc.opcode == (IBV_WC_SEND | IBV_WC_RECV))
 	wc->opcode = BMI_IB_OP_RECV;
-    else if (desc.opcode == IBV_WC_RDMA_WRITE)
+    else if (desc.opcode == IBV_WC_RDMA_WRITE){
 	wc->opcode = BMI_IB_OP_RDMA_WRITE;
-    else {
+	--od->nic_wr_credit;
+	debug(1, "%s: completed rdma wr_id %llx, nic_wr_credit %d",
+		__func__, llu(desc.wr_id), od->nic_wr_credit);
+    } else {
 	debug(0, "%s: unknown opcode, id %llx status %d opcode %d",
 	      __func__, llu(desc.wr_id), desc.status, desc.opcode);
 	debug(0, "%s: vendor_err %d byte_len %d imm_data %d qp_num %d",

