Open MPI logo

Hardware Locality Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Hardware Locality Development mailing list

Subject: Re: [hwloc-devel] hwloc powerpc rhel5 and power7 patch
From: Samuel Thibault (samuel.thibault_at_[hidden])
Date: 2010-09-20 12:01:20


Closer look on the patch:

Alexey Kardashevskiy, le Fri 17 Sep 2010 20:01:46 +1000, a écrit :
> Index: src/topology-linux.c
> ===================================================================
> --- src/topology-linux.c (revision 2443)
> +++ src/topology-linux.c (working copy)
> @@ -1391,6 +1391,238 @@
> *found = nbnodes;
> }
>
> +static int
> +hwloc_read_str(const char *p, const char *p1, char *buf, size_t nbuf, int root_fd)

Same as read_uint32: callers will have the tendency to do things like

char device_type[64];

which will break sooner or later. Rather allocate data according to
the file size (extending the allocation if the stat-provided size is
actually bogus) and free it in callers. It's less simple to write
hwloc_read_str, but it makes life way easier for later.

> +{
> + char fname[256];
> + int ret = -1;
> + sprintf(fname, "%s/%s", p, p1);

Again, never use arbitrary numbers :) And never use sprintf but
snprintf.

  char fname[strlen(p) + 1 + strlen(p1) + 1];
  snprintf(fname, sizeof(fname), "%s/%s", p, p1);

really puts us on the safe side.

> + FILE *file = hwloc_fopen(fname, "r", root_fd);
> + if (NULL == file)
> + return ret;
> + if (NULL != fgets(buf, nbuf, file))
> + ret = 0;
> + fclose(file);
> + buf[nbuf-1] = 0;

Then you can pass nbuf-1 to fgets. Right, that's really nitpicking :)

> +static size_t
> +hwloc_read_raw(const char *p, const char *p1, void *buf, size_t nbuf,
> + int root_fd)

Same for this one, actually.

All these could indeed go to src/misc.c.

> +typedef struct {
> + unsigned int n, allocated;
> + struct {
> + hwloc_cpuset_t cpuset;
> + uint32_t ibm_phandle;
> + uint32_t l2_cache;
> + char name[64];

Again, this could just be a pointer to a strdup of the dirent->d_name,
to avoid any surprise in the future.

> + } *p;
> +} device_tree_cpus_t;
> +
> +static void
> +add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_cpuset_t cpuset,
> + uint32_t l2_cache, uint32_t ibm_phandle, const char *name)
> +{
> + if (cpus->n == cpus->allocated) {
> + cpus->allocated += 64;
> + cpus->p = realloc(cpus->p, cpus->allocated * sizeof(cpus->p[0]));
> + }
> + cpus->p[cpus->n].ibm_phandle = ibm_phandle;
> + cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_cpuset_dup(cpuset);
> + cpus->p[cpus->n].l2_cache = l2_cache;
> + strncpy(cpus->p[cpus->n].name, name, sizeof(cpus->p[0].name)-1);

strdup being here of course.

> +static int
> +look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
> + uint32_t ibm_phandle, unsigned int *level, hwloc_cpuset_t cpuset)
> +{
> + int ret = -1;
> + if ((NULL == level) || (NULL == cpuset))
> + return ret;
> + for (unsigned int i = 0; i < cpus->n; ++i) {
> + if (ibm_phandle != cpus->p[i].l2_cache)

Oh, it's odd that it's still called "l2_cache" for L3 caches above L2,
too :)

> + continue;
> + if (NULL != cpus->p[i].cpuset) {
> + hwloc_cpuset_or(cpuset, cpuset, cpus->p[i].cpuset);
> + ret = 0;
> + } else {
> + ++(*level);
> + if (0 == look_powerpc_device_tree_discover_cache(cpus,
> + cpus->p[i].ibm_phandle, level, cpuset))
> + ret = 0;
> + }
> + }
> + return ret;
> +}
> +
> +static void
> +look_powerpc_device_tree(struct hwloc_topology *topology)
> +{
> + device_tree_cpus_t cpus = { 0 };

We assume that the compiler can understand

  device_tree_cpus_t cpus = { .n = 0 };

which is much clearer, please use that :)

> + const char ofroot[] = "/proc/device-tree/cpus";
> +
> + int root_fd = topology->backend_params.sysfs.root_fd;
> + DIR *dt = hwloc_opendir(ofroot, root_fd);
> + if (NULL == dt)
> + return;
> +
> + struct dirent *dirent;
> + while (NULL != (dirent = readdir(dt))) {
> +
> + char cpu[256];
> + sprintf(cpu, "%s/%s", ofroot, dirent->d_name);
> +
> + char device_type[64];
> + if (0 > hwloc_read_str(cpu, "device_type", device_type, sizeof(device_type), root_fd))
> + continue;
> +
> + uint32_t reg = -1, l2_cache = -1, ibm_phandle = -1;
> + hwloc_read_uint32(cpu, "reg", &reg, root_fd);
> + hwloc_read_uint32(cpu, "l2-cache", &l2_cache, root_fd);
> + hwloc_read_uint32(cpu, "ibm,phandle", &ibm_phandle, root_fd);
> +
> + if (0 == strcmp(device_type, "cache")) {
> + add_device_tree_cpus_node(&cpus, NULL, l2_cache, ibm_phandle, dirent->d_name);
> + }
> + else if (0 == strcmp(device_type, "cpu")) {
> + /* Found CPU */
> + hwloc_cpuset_t cpuset = NULL;
> + uint32_t threads[128], nthreads = 0;
> + nthreads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s",
> + threads, sizeof(threads), root_fd) / sizeof(threads[0]);
> + if (0 != nthreads) {
> + cpuset = hwloc_cpuset_alloc();
> + for (unsigned int i = 0; i < nthreads; ++i) {
> + hwloc_cpuset_set(cpuset, threads[i]);
> + }
> + } else if ((unsigned int)-1 != reg) {
> + cpuset = hwloc_cpuset_alloc();
> + hwloc_cpuset_set(cpuset, reg);
> + }
> +
> + if (NULL == cpuset) {
> + hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
> + continue;
> + }
> + add_device_tree_cpus_node(&cpus, cpuset, l2_cache, ibm_phandle, dirent->d_name);
> +
> + /* Add core */
> + struct hwloc_obj *core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
> + core->cpuset = hwloc_cpuset_dup(cpuset);
> + hwloc_insert_object_by_cpuset(topology, core);
> +
> + /* Add socket */
> + /* -1 - to discuss */
> + struct hwloc_obj *socket = hwloc_alloc_setup_object(HWLOC_OBJ_SOCKET, -1);
> + socket->cpuset = hwloc_cpuset_dup(cpuset);
> + hwloc_insert_object_by_cpuset(topology, socket);

Mmm, are we really sure that this describes sockets? How would a multicore
socket look like here? We should not insert sockets if we are not sure
which cpuset they really have (the principle of hwloc is "never lie, at
worse don't say anything").

> + /* Add L1 cache */
> + /* Ignore Instruction caches */
> +
> + /* d-cache-block-size - ignore */
> + /* d-cache-line-size - to read, in bytes */
> + /* d-cache-sets - ignore */
> + /* d-cache-size - to read, in bytes */
> + /* d-tlb-sets - ignore */
> + /* d-tlb-size - ignore, always 0 on power6 */
> + /* i-cache-* and i-tlb-* represent instruction cache, ignore */
> + uint32_t d_cache_line_size = 0, d_cache_size = 0;
> + if ( (0 != hwloc_read_uint32(cpu, "d-cache-line-size", &d_cache_line_size, root_fd)) &&
> + (0 != hwloc_read_uint32(cpu, "d-cache-size", &d_cache_size, root_fd)) ) {
> + struct hwloc_obj *cache =
> + hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
> + cache->attr->cache.size = d_cache_size;
> + cache->attr->cache.depth = 1;
> + cache->attr->cache.linesize = d_cache_line_size;

I would rather create an L1 cache object as soon as any of the cache
properties is there, and then fill the properties with what is actually
available.

> + cache->cpuset = hwloc_cpuset_dup(cpuset);
> + hwloc_debug_cpuset("cache depth 1 has cpuset %s\n", cache->cpuset);
> + hwloc_insert_object_by_cpuset(topology, cache);
> + }
> + hwloc_cpuset_free(cpuset);
> + }
> + }
> + closedir(dt);
> +
> + /* No cores and L2 cache were found, exiting */
> + if (0 == cpus.n) {
> + hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
> + return;
> + }
> +
> +#ifdef HWLOC_DEBUG
> + for (unsigned int i = 0; i < cpus.n; ++i) {
> + hwloc_debug("%i: %s ibm,phandle=%08X l2_cache=%08X ",
> + i, cpus.p[i].name, cpus.p[i].ibm_phandle, cpus.p[i].l2_cache);
> + if (NULL == cpus.p[i].cpuset) {
> + hwloc_debug("%s\n", "no cpuset");
> + } else {
> + hwloc_debug_cpuset("cpuset %s\n", cpus.p[i].cpuset);
> + }
> + }
> +#endif
> +
> + /* Scan L2/L3/... caches */
> + for (unsigned int i = 0; i < cpus.n; ++i) {
> + /* Skip real CPUs */
> + if (NULL != cpus.p[i].cpuset)
> + continue;
> +
> + /* Calculate cache level and CPU mask */
> + unsigned int level = 2;
> + hwloc_cpuset_t cpuset = hwloc_cpuset_alloc();
> + if (0 == look_powerpc_device_tree_discover_cache(&cpus,
> + cpus.p[i].ibm_phandle, &level, cpuset)) {
> + /* Check for "cache-unified" - if it is present, CPU has unified L1 cache */
> + /* d-cache-line-size - to read, in bytes */
> + /* d-cache-sets - ignore */
> + /* d-cache-size - to read, in bytes */
> + /* i-cache-* represent instruction cache, ignore */
> + uint32_t d_cache_line_size = 0, d_cache_size = 0;
> + char cpu[256];
> + sprintf(cpu, "%s/%s", ofroot, cpus.p[i].name);
> +
> + if ( (0 != hwloc_read_uint32(cpu, "d-cache-line-size", &d_cache_line_size, root_fd)) &&
> + (0 != hwloc_read_uint32(cpu, "d-cache-size", &d_cache_size, root_fd)) ) {

Same here.

Else the principle seems fine to me and the code as it is now should not
be too hard to make more generalized for other backends.

Samuel