Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

From: Gleb Natapov (glebn_at_[hidden])
Date: 2007-09-05 11:41:31


Hi,

  opal_atomic_lifo implementation suffers from ABA problem.
Here is the code for opal_atomic_lifo_pop:

1 do {
2 item = lifo->opal_lifo_head;
3 if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
4 item,
5 (void*)item->opal_list_next ) )
6 break;
7 /* Do some kind of pause to release the bus */
8 } while( 1 );
9 if( item == &(lifo->opal_lifo_ghost) ) return NULL;
10 item->opal_list_next = NULL;
11 return item;

If the following happens:

   Thread1: Thread2:
1 executes 2
2 executes 1-11 and acquire item
3 enters 3 but preempted before cmpxchg
  NOTE: the third parameter passed to cmpset is
  NULL because item is in use by thread2
4 executes lifo_push(item)
5 successfully executes cmpxchg since the old
  value is equal to current value (ABA problem)
  but places NULL into lifo->opal_lifo_head!

Included patch seems to be fixing this problem, but I am not really sure
if this is the right whay to solve this kind of problems.

diff --git a/opal/class/opal_atomic_lifo.h b/opal/class/opal_atomic_lifo.h
index caf35b1..4e8148c 100644
--- a/opal/class/opal_atomic_lifo.h
+++ b/opal/class/opal_atomic_lifo.h
@@ -71,8 +71,10 @@ static inline opal_list_item_t* opal_atomic_lifo_push( opal_atomic_lifo_t* lifo,
         item->opal_list_next = lifo->opal_lifo_head;
         if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
                                     (void*)item->opal_list_next,
- item ) )
+ item ) ) {
+ opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0);
             return (opal_list_item_t*)item->opal_list_next;
+ }
         /* DO some kind of pause to release the bus */
     } while( 1 );
 #else
@@ -89,14 +91,17 @@ static inline opal_list_item_t* opal_atomic_lifo_pop( opal_atomic_lifo_t* lifo )
 {
     opal_list_item_t* item;
 #if OMPI_HAVE_THREAD_SUPPORT
- do {
- item = lifo->opal_lifo_head;
+ while((item = lifo->opal_lifo_head) != &(lifo->opal_lifo_ghost))
+ {
+ if(!opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 0, 1))
+ continue;
         if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
                                     item,
                                     (void*)item->opal_list_next ) )
             break;
+ opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0);
         /* Do some kind of pause to release the bus */
- } while( 1 );
+ }
 #else
     item = lifo->opal_lifo_head;
     lifo->opal_lifo_head = (opal_list_item_t*)item->opal_list_next;
diff --git a/opal/class/opal_list.c b/opal/class/opal_list.c
index c8a5568..715715e 100644
--- a/opal/class/opal_list.c
+++ b/opal/class/opal_list.c
@@ -55,6 +55,7 @@ OBJ_CLASS_INSTANCE(
 static void opal_list_item_construct(opal_list_item_t *item)
 {
     item->opal_list_next = item->opal_list_prev = NULL;
+ item->item_free = 1;
 #if OMPI_ENABLE_DEBUG
     item->opal_list_item_refcount = 0;
     item->opal_list_item_belong_to = NULL;
diff --git a/opal/class/opal_list.h b/opal/class/opal_list.h
index 83fa57b..3a45f4e 100644
--- a/opal/class/opal_list.h
+++ b/opal/class/opal_list.h
@@ -102,6 +102,7 @@ struct opal_list_item_t
     /**< Pointer to next list item */
     volatile struct opal_list_item_t *opal_list_prev;
     /**< Pointer to previous list item */
+ int32_t item_free;
 
 #if OMPI_ENABLE_DEBUG
     /** Atomic reference count for debugging */

--
			Gleb.