- what is your SGE configuration `qconf -sconf`?
 
#global:
execd_spool_dir              /var/spool/gridengine/execd
mailer                       /usr/bin/mail
xterm                        /usr/bin/xterm
load_sensor                  none
prolog                       none
epilog                       none
shell_start_mode             posix_compliant
login_shells                 bash,sh,ksh,csh,tcsh
min_uid                      0
min_gid                      0
user_lists                   none
xuser_lists                  none
projects                     none
xprojects                    none
enforce_project              false
enforce_user                 auto
load_report_time             00:00:40
max_unheard                  00:05:00
reschedule_unknown           00:00:00
loglevel                     log_warning
administrator_mail           root
set_token_cmd                none
pag_cmd                      none
token_extend_time            none
shepherd_cmd                 none
qmaster_params               none
execd_params                 none
reporting_params             accounting=true reporting=false \
                             flush_time=00:00:15 joblog=false sharelog=00:00:00
finished_jobs                100
gid_range                    65400-65500
max_aj_instances             2000
max_aj_tasks                 75000
max_u_jobs                   0
max_jobs                     0
auto_user_oticket            0
auto_user_fshare             0
auto_user_default_project    none
auto_user_delete_time        86400
delegated_file_staging       false
reprioritize                 false
rlogin_daemon                /usr/sbin/sshd -i
rlogin_command               /usr/bin/ssh
qlogin_daemon                /usr/sbin/sshd -i
qlogin_command               /usr/share/gridengine/qlogin-wrapper
rsh_daemon                   /usr/sbin/sshd -i
rsh_command                  /usr/bin/ssh
jsv_url                      none
jsv_allowed_mod              ac,h,i,e,o,j,M,N,p,w

# my queue setting is:

qname                 dev.q
hostlist              sgeqexec01.domain.com.au
seq_no                0
load_thresholds       np_load_avg=1.75
suspend_thresholds    NONE
nsuspend              1
suspend_interval      00:05:00
priority              0
min_cpu_interval      00:05:00
processors            UNDEFINED
qtype                 BATCH INTERACTIVE
ckpt_list             NONE
pe_list               make orte
rerun                 FALSE
slots                 8
tmpdir                /tmp
shell                 /bin/bash
prolog                NONE
epilog                NONE
shell_start_mode      posix_compliant
starter_method        NONE
suspend_method        NONE
resume_method         NONE
terminate_method      NONE
notify                00:00:60
owner_list            NONE
user_lists            NONE
xuser_lists           NONE
subordinate_list      NONE
complex_values        NONE
projects              NONE
xprojects             NONE
calendar              NONE
initial_state         default
s_rt                  INFINITY
h_rt                  INFINITY
s_cpu                 INFINITY
h_cpu                 INFINITY
s_fsize               INFINITY
h_fsize               INFINITY
s_data                INFINITY
h_data                INFINITY
s_stack               INFINITY
h_stack               INFINITY
s_core                INFINITY
h_core                INFINITY
s_rss                 INFINITY
h_rss                 INFINITY
s_vmem                INFINITY
h_vmem                INFINITY

# my PE setting is:

pe_name            orte
slots              4
user_lists         NONE
xuser_lists        NONE
start_proc_args    /bin/true
stop_proc_args     /bin/true
allocation_rule    $round_robin
control_slaves     TRUE
job_is_first_task  FALSE
urgency_slots      min
accounting_summary FALSE
 
a) you are testing from master to a node, but jobs are running between nodes.

b) unless you need X11 forwarding, using SGE’s `builtin` communication works fine; this way you can have a cluster without `rsh` or `ssh` (or with them limited to admin staff) and still run parallel jobs.

Sorry for the misleading snippet. All the hosts in the cluster (both the master and the execution hosts) can reach each other via passwordless SSH without any issue. As my point 2) states, I could successfully run a generic OpenMPI job without SGE. So I do not think it is a communication issue?
 
Then you are bypassing SGE’s slot allocation and will have wrong accounting and no job control of the slave tasks.
 
I know it's not a proper submission as a PE job. I simply ran out of ideas about what to do next. Even though it's not the proper way, the OpenMPI error didn't happen and the job completed. I am wondering why.


The correct version of my OpenMPI is 1.4.1, not 1.3 in my first post.

I later installed OpenMPI on the submission host and the master as well, but it didn't help. So I guess OpenMPI is needed on the execution hosts only.