Open MPI logo

Hardware Locality Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Hardware Locality Development mailing list

Subject: [hwloc-devel] Problem with hwloc_linux_foreach_proc_tid()
From: Bert Wesarg (bert.wesarg_at_[hidden])
Date: 2010-03-17 16:41:39


Hi all,

I have a problem with the retry algorithm of
hwloc_linux_foreach_proc_tid(), for example when it is used by
hwloc_linux_get_pid_cpubind(). hwloc_linux_get_pid_cpubind() should
collect the affinity masks of all threads. But if the retry is
triggered and the new tid list no longer contains a tid whose affinity
mask was not a subset of the other threads' masks, the bits collected
from that tid in the first run are kept, so the end result is
inaccurate.

A small example:

tid A has affinity 0x1
tid B has affinity 0x2

After the first round the affinity mask is 0x3. Now tid A exits and
the retry check will be triggered. The end affinity mask will still be
0x3 instead of 0x2, because the mask is not reset to 0 before the
retry.

Here is a proposal to fix this. It passes a new argument to the
callback function, which is true only for the first tid of each pass
(including each retry), so the callback can use it to zero the cpuset.

Regards,
Bert

---
diff --git i/src/topology-linux.c w/src/topology-linux.c
index 06d1739..9366fa5 100644
--- i/src/topology-linux.c
+++ w/src/topology-linux.c
@@ -340,23 +340,26 @@ hwloc_linux_get_proc_tids(DIR *taskdir, unsigned
*nr_tidsp, pid_t ** tidsp)
 }
 /* Callbacks for binding each process sub-tid */
-typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t
topology, pid_t tid, void *data, int policy);
+typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t
topology, pid_t tid, void *data, int policy, int first);
 static int
-hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t
topology, pid_t tid, void *data, int policy __hwloc_attribute_unused)
+hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t
topology, pid_t tid, void *data, int policy __hwloc_attribute_unused,
int first __hwloc_attribute_unused)
 {
   hwloc_cpuset_t cpuset = (hwloc_cpuset_t) data;
   return hwloc_linux_set_tid_cpubind(topology, tid, cpuset);
 }
 static int
-hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t
topology, pid_t tid, void *data, int policy)
+hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t
topology, pid_t tid, void *data, int policy, int first)
 {
   hwloc_cpuset_t cpuset = (hwloc_cpuset_t) data;
   hwloc_cpuset_t tidset = hwloc_linux_get_tid_cpubind(topology, tid);
   if (!tidset)
     return -1;
+  if (first)
+    hwloc_cpuset_zero(cpuset);
+
   if (policy & HWLOC_CPUBIND_STRICT) {
     /* if STRICT, we want all threads to have the same binding */
     if (hwloc_cpuset_iszero(cpuset)) {
@@ -386,6 +389,7 @@ hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
   DIR *taskdir;
   pid_t *tids, *newtids;
   unsigned i, nr, newnr;
+  int first;
   int err;
   if (pid)
@@ -406,9 +410,11 @@ hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
     goto out_with_dir;
  retry:
+  first = 1;
   /* apply the callback to all threads */
   for(i=0; i<nr; i++) {
-    err = cb(topology, tids[i], data, policy);
+    err = cb(topology, tids[i], data, policy, first);
+    first = 0;
     if (err < 0)
       goto out_with_tids;
   }