Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: Re: [OMPI users] Try to submit OMPI job to SGE gives ERRORS (orte_plm_base_select failed & orte_ess_set_name failed) (Reuti)
From: Derrick LIN (klin938_at_[hidden])
Date: 2011-04-16 17:09:22


>
> So you route the SGE startup mechanism to use `ssh`; nevertheless, it
> should work of course. A small difference from a conventional `ssh` is that SGE
> will start a private daemon for each job on the nodes listening on a random
> port.
>
> When you use only one host, then forks will be created but no `ssh` call.
> Your test uses more than one node?
>

I have tested with more than one node but the error still happened.

> You copied your SGE-aware version to all nodes at the same location? Are you
> getting the correct `mpiexec` and shared libraries in your jobscript? Shows
> the output of:
>

I installed it with Ubuntu's apt-get on each node, so Open MPI is in the
standard location. In fact, Ubuntu handles all the dependencies very well, so
there is no need to worry about PATH or LD_LIBRARY_PATH.

> #!/bin/sh
> which mpiexec
> echo $LD_LIBRARY_PATH
> ldd ompi_job
>
> the expected ones (ompi_job is the binary and ompi_job.sh the script) when
> submitted with a PE request?
>

/usr/bin/mpiexec
/usr/lib/openmpi/lib:/usr/lib/openmpi/lib/openmpi
        linux-vdso.so.1 => (0x00007fff9b1ff000)
        libmpi.so.0 => /usr/lib/libmpi.so.0 (0x00002af0868aa000)
        libopen-rte.so.0 => /usr/lib/libopen-rte.so.0 (0x00002af086b58000)
        libopen-pal.so.0 => /usr/lib/libopen-pal.so.0 (0x00002af086da4000)
        libdl.so.2 => /lib/libdl.so.2 (0x00002af087017000)
        libnsl.so.1 => /lib/libnsl.so.1 (0x00002af08721b000)
        libutil.so.1 => /lib/libutil.so.1 (0x00002af087436000)
        libm.so.6 => /lib/libm.so.6 (0x00002af087639000)
        libpthread.so.0 => /lib/libpthread.so.0 (0x00002af0878bc000)
        libc.so.6 => /lib/libc.so.6 (0x00002af087ada000)
        /lib64/ld-linux-x86-64.so.2 (0x00002af086687000)

Below is some runtime data from the job's spool directory on the execution
host:

pwbcad_at_sgeqexec01:128.1$ ls
addgrpid config environment error exit_status job_pid pe_hostfile pid trace usage
pwbcad_at_sgeqexec01:128.1$ cat config
add_grp_id=65416
fs_stdin_host=""
fs_stdin_path=
fs_stdin_tmp_path=/tmp/128.1.dev.q/
fs_stdin_file_staging=0
fs_stdout_host=""
fs_stdout_path=
fs_stdout_tmp_path=/tmp/128.1.dev.q/
fs_stdout_file_staging=0
fs_stderr_host=""
fs_stderr_path=
fs_stderr_tmp_path=/tmp/128.1.dev.q/
fs_stderr_file_staging=0
stdout_path=/mnt/FacilityBioinformatics/pwbcad
stderr_path=/mnt/FacilityBioinformatics/pwbcad
stdin_path=/dev/null
merge_stderr=1
tmpdir=/tmp/128.1.dev.q
handle_as_binary=0
no_shell=0
ckpt_job=0
h_vmem=INFINITY
h_vmem_is_consumable_job=0
s_vmem=INFINITY
s_vmem_is_consumable_job=0
h_cpu=INFINITY
h_cpu_is_consumable_job=0
s_cpu=INFINITY
s_cpu_is_consumable_job=0
h_stack=INFINITY
h_stack_is_consumable_job=0
s_stack=INFINITY
s_stack_is_consumable_job=0
h_data=INFINITY
h_data_is_consumable_job=0
s_data=INFINITY
s_data_is_consumable_job=0
h_core=INFINITY
s_core=INFINITY
h_rss=INFINITY
s_rss=INFINITY
h_fsize=INFINITY
s_fsize=INFINITY
s_descriptors=UNDEFINED
h_descriptors=UNDEFINED
s_maxproc=UNDEFINED
h_maxproc=UNDEFINED
s_memorylocked=UNDEFINED
h_memorylocked=UNDEFINED
s_locks=UNDEFINED
h_locks=UNDEFINED
priority=0
shell_path=/bin/bash
script_file=/var/spool/gridengine/execd/sgeqexec01/job_scripts/128
job_owner=pwbcad
min_gid=0
min_uid=0
cwd=/mnt/FacilityBioinformatics/pwbcad
prolog=none
epilog=none
starter_method=NONE
suspend_method=NONE
resume_method=NONE
terminate_method=NONE
script_timeout=120
pe=orte
pe_slots=16
host_slots=8
pe_hostfile=/var/spool/gridengine/execd/sgeqexec01/active_jobs/128.1/pe_hostfile
pe_start=/bin/true
pe_stop=/bin/true
pe_stdout_path=/mnt/FacilityBioinformatics/pwbcad
pe_stderr_path=/mnt/FacilityBioinformatics/pwbcad
shell_start_mode=posix_compliant
use_login_shell=1
mail_list=pwbcad_at_[hidden]
mail_options=0
forbid_reschedule=0
forbid_apperror=0
queue=dev.q
host=sgeqexec01.garvan.unsw.edu.au
processors=UNDEFINED
binding=NULL
job_name=run_cal_pi_auto
job_id=128
ja_task_id=0
account=sge
submission_time=1302987873
notify=0
acct_project=none
njob_args=0
queue_tmpdir=/tmp
use_afs=0
admin_user=sgeadmin
notify_kill_type=1
notify_kill=default
notify_susp_type=1
notify_susp=default
qsub_gid=no
pty=0
write_osjob_id=1
inherit_env=1
enable_windomacc=0
enable_addgrp_kill=0
csp=0
ignore_fqdn=0
default_domain=none
pwbcad_at_sgeqexec01:128.1$ cat environment
USER=pwbcad
SSH_CLIENT=149.171.200.64 63056 22
MAIL=/var/mail/pwbcad
SHLVL=1
OLDPWD=/home/pwbcad
HOME=/home/pwbcad
SSH_TTY=/dev/pts/4
PAGER=less
PS1=\[\e[32;1m\]\u\[\e[0m\]@\[\e[35;1m\]\h\[\e[0m\]:\[\e[34;1m\]\W\[\e[0m\]\$
LOGNAME=pwbcad
_=/usr/bin/qsub
TERM=xterm
SGE_ROOT=/var/lib/gridengine
PATH=/tmp/128.1.dev.q:.:/home/pwbcad/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/meme/bin:/usr/local/eigenstrat:/usr/local/tophat/bin:/usr/local/cufflinks/bin:/usr/local/defuse/bin:/usr/local/bowtie/bin:/usr/local/cnvseq/bin:/usr/local/fastx_toolkit/bin:/usr/local/breakway/bin
SGE_CELL=default
LANG=en_AU.UTF-8
SHELL=/bin/bash
PWD=/mnt/FacilityBioinformatics/pwbcad
SSH_CONNECTION=149.171.200.64 63056 129.94.136.216 22
EDITOR=nano
REQNAME=run_cal_pi_auto
JOB_NAME=run_cal_pi_auto
JOB_SCRIPT=/var/spool/gridengine/execd/sgeqexec01/job_scripts/128
SGE_BINARY_PATH=/usr/sbin/lx26-amd64
REQUEST=run_cal_pi_auto
HOSTNAME=sgeqexec01.garvan.unsw.edu.au
QUEUE=dev.q
JOB_ID=128
ENVIRONMENT=BATCH
ARC=lx26-amd64
NQUEUES=2
NSLOTS=16
NHOSTS=2
RESTARTED=0
TMPDIR=/tmp/128.1.dev.q
TMP=/tmp/128.1.dev.q
PE=orte
PE_HOSTFILE=/var/spool/gridengine/execd/sgeqexec01/active_jobs/128.1/pe_hostfile
SGE_RSH_COMMAND=/usr/bin/ssh
SGE_O_HOME=/home/pwbcad
SGE_O_LOGNAME=pwbcad
SGE_O_PATH=.:/home/pwbcad/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/meme/bin:/usr/local/eigenstrat:/usr/local/tophat/bin:/usr/local/cufflinks/bin:/usr/local/defuse/bin:/usr/local/bowtie/bin:/usr/local/cnvseq/bin:/usr/local/fastx_toolkit/bin:/usr/local/breakway/bin
SGE_O_SHELL=/bin/bash
SGE_O_MAIL=/var/mail/pwbcad
SGE_O_HOST=enzo
SGE_O_WORKDIR=/mnt/FacilityBioinformatics/pwbcad
SGE_TASK_ID=undefined
SGE_TASK_FIRST=undefined
SGE_TASK_LAST=undefined
SGE_TASK_STEPSIZE=undefined
SGE_ARCH=lx26-amd64
SGE_ACCOUNT=sge
SGE_JOB_SPOOL_DIR=/var/spool/gridengine/execd/sgeqexec01/active_jobs/128.1
pwbcad_at_sgeqexec01:128.1$ cat pe_hostfile
sgeqexec01.garvan.unsw.edu.au 8 dev.q_at_sgeqexec01.garvan.unsw.edu.au UNDEFINED
sgeqexec02.garvan.unsw.edu.au 8 dev.q_at_sgeqexec02.garvan.unsw.edu.au UNDEFINED

I hope this gives you more information.

Regards