Open MPI logo

MTT Devel Mailing List Archives

  |   Home   |   Support   |   FAQ   |   All MTT Devel mailing lists

Subject: Re: [MTT devel] MTT server error (user: cisco)
From: Ethan Mallove (ethan.mallove_at_[hidden])
Date: 2008-01-02 15:55:39


I fixed the issue below with Analyze/Performance/IMB.pm in
r1117. What it will now do is read a contiguous data
table, stopping at either an EOF or a line that does not
look like a row of the data table (e.g., an error or warning
message). I'm surprised that the "floating point
exception" below could result in a pass. At least now the entire
test run is not scrapped because of one bad apple.

-Ethan

> Sat, Dec/29/2007 05:38:32PM, jjhursey_at_[hidden] wrote:
>
> SQL QUERY: INSERT INTO latency_bandwidth
> (latency_bandwidth_id, message_size, latency_min, latency_avg, latency_max, bandwidth_min, bandwidth_avg, bandwidth_max) VALUES
> ('314123', '{0,1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,524288,1048576,2097152,4194304}', '{0.15,191.51,166.73,169.26,166.44,167.43,168.30,168.44,165.46,166.42,167.48,162.31,136.42,222.19,446.03,716.19,1254.33,2458.68,5584.21,12544.87,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,27091.03,43622.23,72144.60,130192.28}', '{0.22,191.53,166.78,169.27,166.45,167.45,168.30,168.60,165.47,166.46,167.64,162.32,136.45,222.21,446.09,716.21,1254.39,2458.76,5584.51,12545.59,,,,,,,,,,,,,,,,,27094.99,43640.46,72183.70,130302.65}', '{0.28,191.55,166.89,169.28,166.47,167.49,168.32,168.69,165.48,166.49,167.70,162.34,136.48,222.24,446.13,716.24,1254.44,2458.84,5584.96,12546.58,,,,,,,,,,,,,,,,,27099.48,43659.70,72207.25,130419.42}', DEFAULT, DEFAULT, DEFAULT)
> SQL ERROR: ERROR: malformed array literal: "{0.22,191.53,166.78,169.27,166.45,167.45,168.30,168.60,165.47,166.46,167.64,162.32,136.45,222.21,446.09,716.21,1254.39,2458.76,5584.51,12545.59,,,,,,,,,,,,,,,,,27094.99,43640.46,72183.70,130302.65}"
> SQL ERROR:
>
> [SNIP]
>
> 'exit_value_81' => 0,
> 'mpi_install_section_name_81' => 'ompi/gnu-standard',
> 'latency_max_81' => '{0.28,191.55,166.89,169.28,166.47,167.49,168.32,168.69,165.48,166.49,167.70,162.34,136.48,222.24,446.13,716.24,1254.44,2458.84,5584.96,12546.58,,,,,,,,,,,,,,,,,27099.48,43659.70,72207.25,130419.42}',
> 'latency_avg_81' => '{0.22,191.53,166.78,169.27,166.45,167.45,168.30,168.60,165.47,166.46,167.64,162.32,136.45,222.21,446.09,716.21,1254.39,2458.76,5584.51,12545.59,,,,,,,,,,,,,,,,,27094.99,43640.46,72183.70,130302.65}',
> 'np_81' => '8',
> 'network_81' => 'loopback,verbs',
> 'test_result_81' => 1,
> 'latency_min_81' => '{0.15,191.51,166.73,169.26,166.44,167.43,168.30,168.44,165.46,166.42,167.48,162.31,136.42,222.19,446.03,716.19,1254.33,2458.68,5584.21,12544.87,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,27091.03,43622.23,72144.60,130192.28}',
> 'test_build_section_name_81' => 'imb',
> 'description_81' => 'Cisco MPI development cluster',
> 'result_stderr_81' => '',
> 'environment_81' => '',
> 'exit_signal_81' => -1,
> 'test_name_81' => 'Allgatherv',
> 'parameters_81' => '--mca btl_openib_use_eager_rdma 0 --mca btl_tcp_if_include ib0 --mca oob_tcp_if_include ib0',
> 'start_timestamp_81' => 'Sat Dec 29 22:32:56 2007',
> 'command_81' => 'mpirun -np 8 --mca btl_openib_use_eager_rdma 0 --mca btl openib,self --mca btl_tcp_if_include ib0 --mca oob_tcp_if_include ib0 src/IMB-MPI1 -npmin 8 Allgatherv',
> 'duration_81' => '20 seconds',
> 'message_size_81' => '{0,1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,524288,1048576,2097152,4194304}',
> 'resource_manager_81' => 'slurm',
> 'result_stdout_81' => '#---------------------------------------------------
> # Intel (R) MPI Benchmark Suite V2.3, MPI-1 part
> #---------------------------------------------------
> # Date : Sat Dec 29 14:32:57 2007
> # Machine : x86_64# System : Linux
> # Release : 2.6.9-42.ELsmp
> # Version : #1 SMP Wed Jul 12 23:32:02 EDT 2006
>
> #
> # Minimum message length in bytes: 0
> # Maximum message length in bytes: 4194304
> #
> # MPI_Datatype : MPI_BYTE
> # MPI_Datatype for reductions : MPI_FLOAT
> # MPI_Op : MPI_SUM
> #
> #
>
> # List of Benchmarks to run:
>
> # Allgatherv
>
> #----------------------------------------------------------------
> # Benchmarking Allgatherv
> # #processes = 8
> #----------------------------------------------------------------
> #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec]
> 0 1000 0.15 0.28 0.22
> 1 1000 191.51 191.55 191.53
> 2 1000 166.73 166.89 166.78
> 4 1000 169.26 169.28 169.27
> 8 1000 166.44 166.47 166.45
> 16 1000 167.43 167.49 167.45
> 32 1000 168.30 168.32 168.30
> 64 1000 168.44 168.69 168.60
> 128 1000 165.46 165.48 165.47
> 256 1000 166.42 166.49 166.46
> 512 1000 167.48 167.70 167.64
> 1024 1000 162.31 162.34 162.32
> 2048 1000 136.42 136.48 136.45
> 4096 1000 222.19 222.24 222.21
> 8192 1000 446.03 446.13 446.09
> 16384 1000 716.19 716.24 716.21
> 32768 1000 1254.33 1254.44 1254.39
> 65536 640 2458.68 2458.84 2458.76
> 131072 320 5584.21 5584.96 5584.51
> 262144 160 12544.87 12546.58 12545.59
> [svbu-mpi031:12247] *** Process received signal ***
> [svbu-mpi031:12247] Signal: Floating point exception (8)
> [svbu-mpi031:12247] Signal code: (0)
> [svbu-mpi031:12247] Failing at address: 0x25900002fed
> [svbu-mpi031:12247] [ 0] /lib64/tls/libpthread.so.0 [0x2a95e57430]
> [svbu-mpi031:12247] [ 1] /lib64/tls/libc.so.6(__poll+0x2f) [0x2a9601e96f]
> [svbu-mpi031:12247] [ 2] /home/mpiteam/scratches/2007-12-28/Ppuo/installs/5BvS/install/lib/libopen-pal.so.0(opal_poll_dispatch+0x13c) [0x2a9568e1ba]
> [svbu-mpi031:12247] [ 3] /home/mpiteam/scratches/2007-12-28/Ppuo/installs/5BvS/install/lib/libopen-pal.so.0(opal_event_base_loop+0x419) [0x2a9568a238]
> [svbu-mpi031:12247] [ 4] /home/mpiteam/scratches/2007-12-28/Ppuo/installs/5BvS/install/lib/libopen-pal.so.0(opal_event_loop+0x1d) [0x2a95689e1d]
> [svbu-mpi031:12247] [ 5] /home/mpiteam/scratches/2007-12-28/Ppuo/installs/5BvS/install/lib/libopen-pal.so.0(opal_progress+0x6a) [0x2a95680fbe]
> [svbu-mpi031:12247] [ 6] mpirun [0x403fe4]
> [svbu-mpi031:12247] [ 7] mpirun(orterun+0x9bb) [0x403823]
> [svbu-mpi031:12247] [ 8] mpirun(main+0x1b) [0x402e63]
> [svbu-mpi031:12247] [ 9] /lib64/tls/libc.so.6(__libc_start_main+0xdb) [0x2a95f7d3fb]
> [svbu-mpi031:12247] [10] mpirun(orte_daemon_recv+0x1e2) [0x402dba]
> [svbu-mpi031:12247] *** End of error message ***
> 524288 80 27091.03 27099.48 27094.99
> 1048576 40 43622.23 43659.70 43640.46
> 2097152 20 72144.60 72207.25 72183.70
> 4194304 10 130192.28 130419.42 130302.65
> ',
> 'variant_81' => 81,
> 'result_message_81' => 'Passed',
> 'test_type_81' => 'latency_bandwidth',
> 'launcher_81' => 'mpirun',
>