Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: [OMPI users] TCP instead of openIB doesn't work
From: Vittorio Giovara (vitto.giova_at_[hidden])
Date: 2009-02-27 10:00:32


Hello, I'm posting another problem with my installation here.
I wanted to benchmark the differences between tcp and openib transport

If I run a simple non-MPI application I get:
randori ~ # mpirun --mca btl tcp,self -np 2 -host randori -host tatami
hostname
randori
tatami

but as soon as i switch to my benchmark program i have
mpirun --mca btl tcp,self -np 2 -host randori -host tatami graph
Master thread reporting
matrix size 33554432 kB, time is in [us]

and instead of starting the send/receive functions it just hangs there; I
also checked the transmitted packets with Wireshark, but after the handshake
no more packets are exchanged.

I read in the archives that there were some problems in this area, so I
tried what was suggested in previous emails:

mpirun --mca btl ^openib -np 2 -host randori -host tatami graph
mpirun --mca pml ob1 --mca btl tcp,self -np 2 -host randori -host tatami
graph

Both give exactly the same output as before (no MPI send/receive),
while the next command gives something more interesting:

mpirun --mca pml cm --mca btl tcp,self -np 2 -host randori -host tatami
graph
--------------------------------------------------------------------------
No available pml components were found!

This means that there are no components of this type installed on your
system or all the components reported that they could not be used.

This is a fatal error; your MPI process is likely to abort. Check the
output of the "ompi_info" command and ensure that components of this
type are available on your system. You may also wish to check the
value of the "component_path" MCA parameter and ensure that it has at
least one directory that contains valid MCA components.

--------------------------------------------------------------------------
[tatami:06619] PML cm cannot be selected
mpirun noticed that job rank 0 with PID 6710 on node randori exited on
signal 15 (Terminated).

which should not be possible, because if I run `ompi_info --param all` the CM
pml component is listed:

                 MCA pml: cm (MCA v1.0, API v1.0, Component v1.2.8)
                 MCA pml: ob1 (MCA v1.0, API v1.0, Component v1.2.8)

my test program is quite simple, just a couple of MPI_Send and MPI_Recv
(just after the signature)
do you have any ideas that might help me?
thanks a lot
Vittorio

========================
#include "mpi.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

/* Matrix dimensions: M_ROW x M_COL unsigned longs
 * (4096 * 524288 = 2^31 elements; ~16 GiB assuming 8-byte unsigned long). */
#define M_COL 4096
#define M_ROW 524288
/* Number of benchmarked message sizes: 64 * 2^e elements for e = 0..NUM_MSG-1. */
#define NUM_MSG 25

/* Buffer used by both ranks for MPI_Send/MPI_Recv; only a prefix of it
 * (at most 64 * 2^(NUM_MSG-1) elements) is ever transferred. */
unsigned long int gigamatrix[M_ROW][M_COL];

int main (int argc, char *argv[]) {
    int numtasks, rank, dest, source, rc, tmp, count, tag=1;
    unsigned long int exp, exchanged;
    unsigned long int i, j, e;
    unsigned long matsize;
    MPI_Status Stat;
    struct timeval timing_start, timing_end;
    double inittime = 0;
    long int totaltime = 0;

    MPI_Init (&argc, &argv);
    MPI_Comm_size (MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        fprintf (stderr, "Master thread reporting\n", numtasks - 1);
        matsize = (long) M_COL * M_ROW / 64;
        fprintf (stderr, "matrix size %d kB, time is in [us]\n", matsize);

        source = 1;
        dest = 1;

        /*warm up phase*/
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
        rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD,
&Stat);
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
        rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD,
&Stat);
        rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);

        for (e = 0; e < NUM_MSG; e++) {
            exp = pow (2, e);
            exchanged = 64 * exp;

            /*timing of ops*/
            gettimeofday (&timing_start, NULL);
            rc = MPI_Send (&gigamatrix[0], exchanged, MPI_UNSIGNED_LONG,
dest, tag, MPI_COMM_WORLD);
            rc = MPI_Recv (&gigamatrix[0], exchanged, MPI_UNSIGNED_LONG,
source, tag, MPI_COMM_WORLD, &Stat);
            gettimeofday (&timing_end, NULL);

            totaltime = (timing_end.tv_sec - timing_start.tv_sec) * 1000000
+ (timing_end.tv_usec - timing_start.tv_usec);
            memset (&timing_start, 0, sizeof(struct timeval));
            memset (&timing_end, 0, sizeof(struct timeval));
            fprintf (stdout, "%d kB\t%d\n", exp, totaltime);
        }

        fprintf(stderr, "task complete\n");

    } else {
        if (rank >= 1) {
            dest = 0;
            source = 0;

            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD,
&Stat);
            rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD,
&Stat);
            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD,
&Stat);
            rc = MPI_Send (&tmp, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
            rc = MPI_Recv (&tmp, 1, MPI_INT, source, tag, MPI_COMM_WORLD,
&Stat);

            for (e = 0; e < NUM_MSG; e++) {
                exp = pow (2, e);
                exchanged = 64 * exp;

                rc = MPI_Recv (&gigamatrix[0], (unsigned) exchanged,
MPI_UNSIGNED_LONG, source, tag, MPI_COMM_WORLD, &Stat);
                rc = MPI_Send (&gigamatrix[0], (unsigned) exchanged,
MPI_UNSIGNED_LONG, dest, tag, MPI_COMM_WORLD);

            }
        }
    }

    MPI_Finalize ();

    return 0;
}