Index: ompi/mca/btl/mosix/configure.m4
===================================================================
--- ompi/mca/btl/mosix/configure.m4 (revision 0)
+++ ompi/mca/btl/mosix/configure.m4 (revision 0)
@@ -0,0 +1,32 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_btl_mosix_CONFIG([action-if-found], [action-if-not-found])
+# -----------------------------------------------------------
+# Probe for MOSIX by looking for its /proc/mosix/mosip entry.
+AC_DEFUN([MCA_btl_mosix_CONFIG],[
+    # AC_CHECK_FILE aborts configure when cross-compiling, so a cross
+    # build is reported as not-found instead of probing the build host.
+    AS_IF([test "$cross_compiling" = "yes"],
+          [$2],
+          [AC_CHECK_FILE([/proc/mosix/mosip],
+                         [$1],
+                         [$2])])
+])dnl
Index: ompi/mca/btl/mosix/btl_mosix_frag.c
===================================================================
--- ompi/mca/btl/mosix/btl_mosix_frag.c (revision 0)
+++ ompi/mca/btl/mosix/btl_mosix_frag.c (revision 0)
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2004-2008 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "btl_mosix.h"
+
+/*
+ * Shared tail of every fragment constructor; frag->size must already be set.
+ * Puts the descriptor in its "send" shape: one source segment that starts
+ * right behind the fragment header, no destination segments.
+ */
+static void mca_btl_mosix_frag_common_constructor(mca_btl_mosix_frag_t* frag)
+{
+    frag->base.des_src = frag->segments;
+    frag->base.des_src_cnt = 1;
+    frag->base.des_dst = NULL;
+    frag->base.des_dst_cnt = 0;
+
+    /* frag->size is the payload capacity that follows the header, so it is
+     * used as-is. Subtracting sizeof(mca_btl_mosix_frag_t) understated the
+     * capacity and wrapped around for user fragments, whose size is 0. */
+    frag->segments[0].seg_addr.pval = (void*)(frag + 1);
+    frag->segments[0].seg_len = frag->size;
+    frag->segments[1].seg_addr.pval = NULL;
+    frag->segments[1].seg_len = 0;
+}
+
+/* Fragment able to carry up to btl_eager_limit bytes. */
+static void mca_btl_mosix_frag_eager_constructor(mca_btl_mosix_frag_t* frag)
+{
+    frag->size = mca_btl_mosix_module.super.btl_eager_limit;
+    frag->my_list = &mca_btl_mosix_component.mosix_frag_eager;
+    mca_btl_mosix_frag_common_constructor(frag);
+}
+
+/* Fragment able to carry up to btl_max_send_size bytes. */
+static void mca_btl_mosix_frag_max_constructor(mca_btl_mosix_frag_t* frag)
+{
+    frag->size = mca_btl_mosix_module.super.btl_max_send_size;
+    frag->my_list = &mca_btl_mosix_component.mosix_frag_max;
+    mca_btl_mosix_frag_common_constructor(frag);
+}
+
+/* Fragment without payload of its own (size 0). */
+static void mca_btl_mosix_frag_user_constructor(mca_btl_mosix_frag_t* frag)
+{
+    frag->size = 0;
+    frag->my_list = &mca_btl_mosix_component.mosix_frag_user;
+    mca_btl_mosix_frag_common_constructor(frag);
+}
+
+
+OBJ_CLASS_INSTANCE( mca_btl_mosix_frag_t,
+                    mca_btl_base_descriptor_t,
+                    NULL,
+                    NULL );
+
+OBJ_CLASS_INSTANCE( mca_btl_mosix_frag_eager_t,
+                    mca_btl_base_descriptor_t,
+                    mca_btl_mosix_frag_eager_constructor,
+                    NULL );
+
+OBJ_CLASS_INSTANCE( mca_btl_mosix_frag_max_t,
+                    mca_btl_base_descriptor_t,
+                    mca_btl_mosix_frag_max_constructor,
+                    NULL );
+
+OBJ_CLASS_INSTANCE( mca_btl_mosix_frag_user_t,
+                    mca_btl_base_descriptor_t,
+                    mca_btl_mosix_frag_user_constructor,
+                    NULL );
Index: ompi/mca/btl/mosix/btl_mosix_endpoint.c
===================================================================
--- ompi/mca/btl/mosix/btl_mosix_endpoint.c (revision 0)
+++ ompi/mca/btl/mosix/btl_mosix_endpoint.c (revision 0)
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2004-2005 The
Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2008 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2008 Sun Microsystems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ */
+
+#include "ompi_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+#include "opal/opal_socket_errno.h"
+
+#include "ompi/types.h"
+#include "ompi/mca/btl/base/btl_base_error.h"
+
+#include "btl_mosix.h"
+#include "btl_mosix_endpoint.h"
+
+/*
+ * Close a descriptor (if it is open) and mark it as closed.
+ * Wrapped in do/while(0) so it behaves as one statement after if/else;
+ * negative descriptors are skipped instead of being handed to close().
+ */
+#define CLOSE_FD(x)         \
+    do {                    \
+        if( 0 <= (x) ) {    \
+            close(x);       \
+        }                   \
+        (x) = -1;           \
+    } while (0)
+
+/*
+ * Initialize state of the endpoint instance.
+ *
+ * Both channels start out closed (-1) so that the first send opens the
+ * one it needs; the peer address is filled in later by add_procs.
+ */
+
+static void mca_btl_mosix_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
+{
+    OBJ_CONSTRUCT( &endpoint->endpoint_establishment_lock, opal_mutex_t );
+    endpoint->endpoint_tcp_fd = -1;
+    endpoint->endpoint_udp_fd = -1;
+    endpoint->endpoint_address = NULL;
+}
+
+/*
+ * Destroy an endpoint: close whichever channels were opened and release
+ * the peer address received through the modex.
+ */
+
+static void mca_btl_mosix_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
+{
+    OBJ_DESTRUCT( &endpoint->endpoint_establishment_lock );
+    if( 0 <= endpoint->endpoint_tcp_fd ) {
+        CLOSE_FD(endpoint->endpoint_tcp_fd);
+    }
+    if( 0 <= endpoint->endpoint_udp_fd ) {
+        CLOSE_FD(endpoint->endpoint_udp_fd);
+    }
+    /* NOTE(review): assumes the modex buffer is owned by the receiver */
+    free(endpoint->endpoint_address);
+    endpoint->endpoint_address = NULL;
+}
+
+
+OBJ_CLASS_INSTANCE(mca_btl_mosix_endpoint_t,
+                   opal_list_item_t,
+                   mca_btl_mosix_endpoint_construct,
+                   mca_btl_mosix_endpoint_destruct);
+
+/*
+ * Open the TCP-style channel to the peer's mailbox unless it is already open.
+ * The lock is released on every path, including a failed open().
+ *
+ * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the path could not be
+ *         built, or OMPI_ERR_UNREACH if the mailbox could not be opened.
+ */
+static int mca_btl_mosix_endpoint_open_tcp(mca_btl_mosix_endpoint_t* mosix_endpoint)
+{
+    int rc = OMPI_SUCCESS;
+    char* remote_mbox_path = NULL;
+
+    OPAL_THREAD_LOCK(&mosix_endpoint->endpoint_establishment_lock);
+    if( 0 > mosix_endpoint->endpoint_tcp_fd ) {
+        /* Construct the path for the remote mailbox */
+        if( 0 > asprintf(&remote_mbox_path, MCA_BTL_MOSIX_MBOX_PATH_STRING_FORMAT,
+                         mosix_endpoint->endpoint_address->addr_ipv4_str,
+                         mosix_endpoint->endpoint_address->addr_pid) ) {
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
+        } else {
+            opal_output_verbose(20, mca_btl_base_output,
+                                "btl: mosix: Establishind TCP link to address %s and PID #%i",
+                                mosix_endpoint->endpoint_address->addr_ipv4_str,
+                                mosix_endpoint->endpoint_address->addr_pid);
+
+            /* Try to open path to remote mailbox */
+            mosix_endpoint->endpoint_tcp_fd = open(remote_mbox_path, 1);
+            free(remote_mbox_path);
+            if( 0 > mosix_endpoint->endpoint_tcp_fd ) {
+                rc = OMPI_ERR_UNREACH;
+            }
+        }
+    }
+    OPAL_THREAD_UNLOCK(&mosix_endpoint->endpoint_establishment_lock);
+    return rc;
+}
+
+/*
+ * Open the UDP-style channel to the peer's mailbox unless it is already open.
+ * Same contract as mca_btl_mosix_endpoint_open_tcp(); note that it is the
+ * UDP descriptor that decides whether there is anything to do.
+ */
+static int mca_btl_mosix_endpoint_open_udp(mca_btl_mosix_endpoint_t* mosix_endpoint)
+{
+    int rc = OMPI_SUCCESS;
+    char* remote_mbox_path = NULL;
+
+    OPAL_THREAD_LOCK(&mosix_endpoint->endpoint_establishment_lock);
+    if( 0 > mosix_endpoint->endpoint_udp_fd ) {
+        /* Construct the path for the remote mailbox */
+        if( 0 > asprintf(&remote_mbox_path, MCA_BTL_MOSIX_UBOX_PATH_STRING_FORMAT,
+                         mosix_endpoint->endpoint_address->addr_ipv4_str,
+                         mosix_endpoint->endpoint_address->addr_pid) ) {
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
+        } else {
+            opal_output_verbose(20, mca_btl_base_output,
+                                "btl: mosix: Establishind UDP link to address %s and PID #%i",
+                                mosix_endpoint->endpoint_address->addr_ipv4_str,
+                                mosix_endpoint->endpoint_address->addr_pid);
+
+            /* Try to open path to remote mailbox */
+            mosix_endpoint->endpoint_udp_fd = open(remote_mbox_path, 1);
+            free(remote_mbox_path);
+            if( 0 > mosix_endpoint->endpoint_udp_fd ) {
+                rc = OMPI_ERR_UNREACH;
+            }
+        }
+    }
+    OPAL_THREAD_UNLOCK(&mosix_endpoint->endpoint_establishment_lock);
+    return rc;
+}
+
+/*
+ * Attempt to send a descriptor using a given endpoint.
+ * If the channel has not been in use so far - open it.
+ *
+ * The message is the tag header followed by the descriptor's source
+ * segments. Messages larger than mosix_udp_max_size travel over the TCP
+ * channel, everything else over the UDP channel. On success the completion
+ * callback is invoked and BTL-owned fragments are returned to their list.
+ *
+ * @return OMPI_SUCCESS, the error of the failed channel open,
+ *         OMPI_ERR_UNREACH if nothing could be written, or
+ *         OMPI_ERR_IN_ERRNO if writev() failed.
+ */
+int mca_btl_mosix_endpoint_send(mca_btl_mosix_endpoint_t* mosix_endpoint,
+                                mca_btl_mosix_frag_t* frag)
+{
+    ssize_t sent;
+    size_t index, total = 0;
+    size_t iov_count = frag->base.des_src_cnt + 1;
+    int rc, chosen_fd, btl_ownership;
+    bool use_tcp;
+    mca_btl_mosix_header_t addr = {frag->tag};
+    struct iovec writer[] = {
+        {&addr, sizeof(addr)},
+        {frag->segments[0].seg_addr.pval, frag->segments[0].seg_len},
+        {frag->segments[1].seg_addr.pval, frag->segments[1].seg_len},
+    };
+
+    if( iov_count > sizeof(writer) / sizeof(writer[0]) ) {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* Only the segments that are actually written count towards the size;
+     * an unused second segment may still hold a stale length. */
+    for( index = 0 ; index < iov_count ; index++ ) {
+        total += writer[index].iov_len;
+    }
+
+    /* Select the channel to send over */
+    use_tcp = (total > (size_t)mca_btl_mosix_component.mosix_udp_max_size);
+    if( use_tcp ) {
+        if( 0 > mosix_endpoint->endpoint_tcp_fd ) {
+            rc = mca_btl_mosix_endpoint_open_tcp(mosix_endpoint);
+            if( OMPI_SUCCESS != rc ) {
+                return rc;
+            }
+        }
+        chosen_fd = mosix_endpoint->endpoint_tcp_fd;
+    } else {
+        if( 0 > mosix_endpoint->endpoint_udp_fd ) {
+            rc = mca_btl_mosix_endpoint_open_udp(mosix_endpoint);
+            if( OMPI_SUCCESS != rc ) {
+                return rc;
+            }
+        }
+        chosen_fd = mosix_endpoint->endpoint_udp_fd;
+    }
+    if( 0 > chosen_fd ) {
+        return OMPI_ERROR;
+    }
+
+    /* Write the message, retrying if the call was interrupted.
+     * NOTE(review): a partial write is treated as success - confirm that
+     * the mailbox never accepts less than the whole message. */
+    do {
+        sent = writev(chosen_fd, writer, (int)iov_count);
+    } while( 0 > sent && EINTR == opal_socket_errno );
+
+    if( 0 >= sent ) {
+        if( 0 > sent ) {
+            BTL_ERROR(("mca_btl_mosix_endpoint_send: writev failed: %s (%d)",
+                       strerror(opal_socket_errno), opal_socket_errno));
+        }
+        /* Drop the channel that failed so the next send reopens it */
+        if( use_tcp ) {
+            CLOSE_FD(mosix_endpoint->endpoint_tcp_fd);
+        } else {
+            CLOSE_FD(mosix_endpoint->endpoint_udp_fd);
+        }
+        return (0 == sent) ? OMPI_ERR_UNREACH : OMPI_ERR_IN_ERRNO;
+    }
+
+    /* Deal with callbacks and deallocation of the fragment. The ownership
+     * flag is read first because the callback may recycle the descriptor. */
+    btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+    if( NULL != frag->base.des_cbfunc) {
+        frag->base.des_cbfunc(&(mca_btl_mosix_module.super), mosix_endpoint, &(frag->base), OMPI_SUCCESS);
+    }
+    if( btl_ownership ) {
+        MCA_BTL_MOSIX_FRAG_RETURN(frag);
+    }
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * An incoming message was detected for the given endpoint -
+ * Read it into a fresh descriptor and notify the upper layers of the arrival.
+ * The descriptor is handed back to its free list on every path.
+ */
+void mca_btl_mosix_endpoint_recv_handler(mca_btl_mosix_endpoint_t* mosix_endpoint)
+{
+    ssize_t rc, cnt = -1;
+    mca_btl_mosix_frag_t* frag;
+    mca_btl_mosix_header_t addr;
+
+    /* Prepare descriptor for reading an incoming message */
+    struct iovec reader[] = {{&addr, sizeof(mca_btl_mosix_header_t)}, {NULL, 0}};
+
+    /* Prepare a fragment for incoming data */
+    MCA_BTL_MOSIX_FRAG_ALLOC_MAX(frag, rc);
+    if( OPAL_UNLIKELY(NULL == frag) ) {
+        return;
+    }
+    frag->base.des_dst = frag->segments;
+    frag->base.des_dst_cnt = 1;
+    frag->base.des_src = NULL;
+    frag->base.des_src_cnt = 0;
+    /* The payload area behind the header holds frag->size bytes */
+    reader[1].iov_base = frag+1;
+    reader[1].iov_len = frag->size;
+
+    /* Read the incoming message */
+    cnt = readv(mca_btl_mosix_module.module_mailbox_fd, reader, 2);
+    if( cnt < 0 ) {
+        BTL_ERROR(("mca_btl_mosix_endpoint_recv_handler: readv failed: %s (%d)",
+                   strerror(opal_socket_errno), opal_socket_errno));
+    } else if( 0 < cnt && cnt < (ssize_t)sizeof(mca_btl_mosix_header_t) ) {
+        BTL_ERROR(("mca_btl_mosix_endpoint_recv_handler: truncated message (%ld bytes)",
+                   (long)cnt));
+    } else if( 0 < cnt ) {
+        /* Invoke the call-back function to notify of the incoming package */
+        mca_btl_base_tag_t tag = addr.tag;
+        mca_btl_active_message_callback_t* reg = mca_btl_base_active_message_trigger;
+
+        frag->segments[0].seg_len = cnt - (ssize_t)sizeof(mca_btl_mosix_header_t);
+        /* NOTE(review): the sender never converts the tag with htonl(), and
+         * the swap happens after narrowing to mca_btl_base_tag_t - confirm
+         * the header layout before relying on heterogeneous peers. */
+        if( mosix_endpoint->endpoint_nbo ) {
+            tag = ntohl(tag);
+        }
+        reg += tag;
+        reg->cbfunc(&(mca_btl_mosix_module.super), tag, &frag->base, reg->cbdata);
+    }
+
+    /* Return the frag to its original state */
+    frag->base.des_src = frag->segments;
+    frag->base.des_src_cnt = 1;
+    frag->base.des_dst = NULL;
+    frag->base.des_dst_cnt = 0;
+    MCA_BTL_MOSIX_FRAG_RETURN(frag);
+}
Index: ompi/mca/btl/mosix/btl_mosix.c
===================================================================
--- ompi/mca/btl/mosix/btl_mosix.c (revision 0)
+++ ompi/mca/btl/mosix/btl_mosix.c (revision 0)
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2008 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006      Los Alamos National Security, LLC.  All rights
+ *                         reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include <stdlib.h>
+
+#include "opal/util/arch.h"
+#include "opal/class/opal_bitmap.h"
+#include "opal/datatype/opal_convertor.h"
+
+#include "ompi/mca/btl/btl.h"
+#include "ompi/proc/proc.h"
+#include "ompi/runtime/ompi_module_exchange.h"
+
+#include "btl_mosix.h"
+#include "btl_mosix_frag.h"
+#include "btl_mosix_endpoint.h"
+
+/*
+ * The one and only MOSIX BTL module. The initializer is positional: all
+ * numeric fields start at zero and only the entry points implemented in
+ * this file are wired in; every other slot is left NULL.
+ */
+mca_btl_mosix_module_t mca_btl_mosix_module = {
+    {
+        &mca_btl_mosix_component.super,
+        0, /* max size of first fragment */
+        0, /* min send fragment size */
+        0, /* max send fragment size */
+        0, /* btl_rdma_pipeline_send_length */
+        0, /* btl_rdma_pipeline_frag_size */
+        0, /* btl_min_rdma_pipeline_size */
+        0, /* exclusivity */
+        0, /* latency */
+        0, /* bandwidth */
+        0, /* flags */
+        mca_btl_mosix_add_procs,
+        mca_btl_mosix_del_procs,
+        NULL, /* register */
+        mca_btl_mosix_finalize,
+        mca_btl_mosix_alloc,
+        mca_btl_mosix_free,
+        mca_btl_mosix_prepare_src,
+        NULL, /* mca_btl_mosix_prepare_dst */
+        mca_btl_mosix_send,
+        NULL, /* send immediate */
+        NULL, /* mca_btl_mosix_put */
+        NULL, /* mca_btl_mosix_get */
+        mca_btl_base_dump,
+        NULL, /* mem-pool */
+        NULL, /* register error */
+        NULL /* Fault-tolerance handler */
+    }
+};
+
+/*
+ * Create the endpoints to connect the current module with each of the
+ * given processes. The reachable bitmap will be filled with 1's since
+ * we assume all the processes are in the MOSIX cluster and reachable via
+ * the MOSIX direct communication mechanism. The endpoint is created with
+ * the target process's "coordinates", namely its PID and home-node
+ * address (in a dotted-quad string format, see mca_btl_mosix_addr_t), so
+ * when the first packet is sent the connection FD will be established.
+ *
+ * On failure everything created by this call is released again and no
+ * bit is left set in the reachable bitmap.
+ * NOTE(review): a second call replaces the polling tables built by the
+ * first one - confirm whether incremental add_procs must be supported.
+ */
+
+/* Release one endpoint together with the address obtained from the modex. */
+static void mca_btl_mosix_drop_endpoint(mca_btl_mosix_endpoint_t* peer)
+{
+    /* NOTE(review): assumes the modex buffer is owned by the receiver */
+    free(peer->endpoint_address);
+    peer->endpoint_address = NULL;
+    OBJ_RELEASE(peer);
+}
+
+int mca_btl_mosix_add_procs(struct mca_btl_base_module_t* btl_base,
+                            size_t nprocs,
+                            struct ompi_proc_t **procs,
+                            struct mca_btl_base_endpoint_t** peers,
+                            opal_bitmap_t* reachable)
+{
+    int rc = OMPI_SUCCESS;
+    size_t index, addr_len;
+    mca_btl_mosix_endpoint_t* peer;
+    mca_btl_mosix_module_t* btl_mosix = (mca_btl_mosix_module_t*)btl_base;
+    assert(&mca_btl_mosix_module == (mca_btl_mosix_module_t*) btl_base);
+
+    /* Nothing to connect to (also avoids a zero-sized calloc) */
+    if( 0 == nprocs ) {
+        return OMPI_SUCCESS;
+    }
+
+    /* Enforce the per-process connection-maximum limit */
+    if( MCA_BTL_MOSIX_MAX_CONNECTIONS <= nprocs ) {
+        opal_bitmap_clear_all_bits(reachable);
+        return OMPI_SUCCESS;
+    }
+
+    /* Allocate filter space */
+    btl_mosix->mosix_incoming.filters = calloc(nprocs, sizeof(mca_btl_mosix_filter_t));
+    if (NULL == btl_mosix->mosix_incoming.filters) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    btl_mosix->mosix_incoming.endpoints = calloc(nprocs, sizeof(mca_btl_mosix_endpoint_t*));
+    if (NULL == btl_mosix->mosix_incoming.endpoints) {
+        free(btl_mosix->mosix_incoming.filters);
+        btl_mosix->mosix_incoming.filters = NULL;
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    btl_mosix->mosix_incoming.item_count = nprocs;
+
+    /* loop through all procs, setting our reachable flag */
+    for (index = 0; index < nprocs ; ++index) {
+        /* Allocate the new peer */
+        peer = OBJ_NEW( mca_btl_mosix_endpoint_t );
+        if( NULL == peer ) {
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
+            break;
+        }
+
+        /* No channel is open yet; the send path opens them on demand */
+        peer->endpoint_tcp_fd = -1;
+        peer->endpoint_udp_fd = -1;
+        peer->endpoint_address = NULL;
+        peer->endpoint_nbo = false;
+#ifndef WORDS_BIGENDIAN
+        /* if we are little endian and our peer is not so lucky, then we
+           need to put all information sent to him in big endian (aka
+           Network Byte Order) and expect all information received to
+           be in NBO. Since big endian machines always send and receive
+           in NBO, we don't care so much about that case. */
+        if( procs[index]->proc_arch & OPAL_ARCH_ISBIGENDIAN ) {
+            peer->endpoint_nbo = true;
+        }
+#endif
+
+        /* Read the address of each process so we can later contact it on-demand */
+        rc = ompi_modex_recv(&mca_btl_mosix_component.super.btl_version,
+                             procs[index], (void**)&(peer->endpoint_address), &addr_len);
+        if( OMPI_SUCCESS == rc &&
+            (NULL == peer->endpoint_address || sizeof(mca_btl_mosix_addr_t) != addr_len) ) {
+            /* The peer published something that is not a MOSIX address */
+            rc = OMPI_ERR_UNREACH;
+        }
+        if( OMPI_SUCCESS == rc ) {
+            rc = opal_bitmap_set_bit(reachable, index);
+        }
+        if( OMPI_SUCCESS != rc ) {
+            mca_btl_mosix_drop_endpoint(peer);
+            break;
+        }
+
+        /* Fill in the fields for each new endpoint */
+        peers[index] = peer;
+
+        /* Fill the filter with content about this process */
+        btl_mosix->mosix_incoming.endpoints[index] = peer;
+        btl_mosix->mosix_incoming.filters[index].filter_pid = peer->endpoint_address->addr_pid;
+        btl_mosix->mosix_incoming.filters[index].filter_conditions =
+            MCA_BTL_MOSIX_INTERESTED_IN_HOME |
+            MCA_BTL_MOSIX_INTERESTED_IN_PID;
+    }
+    if( OMPI_SUCCESS == rc ) {
+        return OMPI_SUCCESS;
+    }
+
+    /* Roll back: entries [0, index) were completely set up */
+    while( index-- > 0 ) {
+        opal_bitmap_clear_bit(reachable, index);
+        mca_btl_mosix_drop_endpoint(btl_mosix->mosix_incoming.endpoints[index]);
+        peers[index] = NULL;
+    }
+    free(btl_mosix->mosix_incoming.filters);
+    free(btl_mosix->mosix_incoming.endpoints);
+    btl_mosix->mosix_incoming.filters = NULL;
+    btl_mosix->mosix_incoming.endpoints = NULL;
+    btl_mosix->mosix_incoming.item_count = 0;
+    return rc;
+}
+
+/*
+ * Delete all the connections from this module to other instances.
+ * Each endpoint is first taken out of the polling tables (the last entry
+ * is moved into its slot) so that progress never sees a released endpoint.
+ */
+int mca_btl_mosix_del_procs(struct mca_btl_base_module_t *btl_base,
+                            size_t nprocs,
+                            struct ompi_proc_t **procs,
+                            struct mca_btl_base_endpoint_t **peers)
+{
+    mca_btl_mosix_module_t* btl_mosix = (mca_btl_mosix_module_t*)btl_base;
+    size_t index, slot, count;
+
+    for( index = 0 ; index < nprocs ; index++ ) {
+        mca_btl_mosix_endpoint_t* peer = (mca_btl_mosix_endpoint_t*)peers[index];
+        if( NULL == peer ) {
+            continue;
+        }
+
+        count = (size_t)btl_mosix->mosix_incoming.item_count;
+        for( slot = 0 ; slot < count ; slot++ ) {
+            if( btl_mosix->mosix_incoming.endpoints[slot] == peer ) {
+                btl_mosix->mosix_incoming.endpoints[slot] =
+                    btl_mosix->mosix_incoming.endpoints[count - 1];
+                btl_mosix->mosix_incoming.filters[slot] =
+                    btl_mosix->mosix_incoming.filters[count - 1];
+                btl_mosix->mosix_incoming.endpoints[count - 1] = NULL;
+                btl_mosix->mosix_incoming.item_count = count - 1;
+                break;
+            }
+        }
+
+        mca_btl_mosix_drop_endpoint(peer);
+        peers[index] = NULL;
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Last function, called at module tear-down time.
+ * Releases the incoming-message mask and the polling tables.
+ * NOTE(review): the mailbox descriptor opened by component_init is not
+ * closed here - confirm where it is meant to be released.
+ */
+int mca_btl_mosix_finalize(struct mca_btl_base_module_t* btl)
+{
+    mca_btl_mosix_module_t* btl_mosix = (mca_btl_mosix_module_t*) btl;
+
+    OBJ_RELEASE(btl_mosix->mosix_incoming_mask);
+    free(btl_mosix->mosix_incoming.filters);
+    free(btl_mosix->mosix_incoming.endpoints);
+    btl_mosix->mosix_incoming.filters = NULL;
+    btl_mosix->mosix_incoming.endpoints = NULL;
+    btl_mosix->mosix_incoming.item_count = 0;
+    return OMPI_SUCCESS;
+}
+
+
+/**
+ * Allocate a descriptor with a segment of the requested size.
+ * Note that the BTL layer may choose to return a smaller size
+ * if it cannot support the request.
+ *
+ * @param btl (IN)      BTL module
+ * @param size (IN)     Request segment size.
+ * @param flags (IN)    Descriptor flags, stored as given.
+ * @return A descriptor from the eager list (size up to btl_eager_limit)
+ *         or from the max list (size up to btl_max_send_size); NULL when
+ *         the size is larger than that or the free list is exhausted.
+ *         The peer and order arguments are not used.
+ */
+mca_btl_base_descriptor_t*
+mca_btl_mosix_alloc(struct mca_btl_base_module_t* btl,
+                    struct mca_btl_base_endpoint_t* peer,
+                    uint8_t order,
+                    size_t size,
+                    uint32_t flags )
+{
+    mca_btl_mosix_frag_t* frag = NULL;
+    int rc;
+
+    if( size <= btl->btl_eager_limit ) {
+        MCA_BTL_MOSIX_FRAG_ALLOC_EAGER(frag, rc);
+    } else if( size <= btl->btl_max_send_size ) {
+        MCA_BTL_MOSIX_FRAG_ALLOC_MAX(frag, rc);
+    }
+    if( OPAL_UNLIKELY(NULL == frag) ) {
+        return NULL;
+    }
+    (void)rc;
+
+    /* Fragments are recycled: put the descriptor back into its one-segment
+     * send shape so nothing left over from a previous in-place send (second
+     * segment, segment count) leaks into this message. */
+    frag->segments[0].seg_len = size;
+    frag->segments[1].seg_addr.pval = NULL;
+    frag->segments[1].seg_len = 0;
+    frag->base.des_src = frag->segments;
+    frag->base.des_src_cnt = 1;
+    frag->base.des_dst = NULL;
+    frag->base.des_dst_cnt = 0;
+    frag->base.des_flags = flags;
+    frag->base.order = MCA_BTL_NO_ORDER;
+    return (mca_btl_base_descriptor_t*)frag;
+}
+
+
+/**
+ * Return a segment allocated by this BTL.
+ *
+ * @param btl (IN)         BTL module
+ * @param descriptor (IN)  Allocated descriptor.
+ * @return OMPI_SUCCESS, always.
+ */
+int mca_btl_mosix_free( struct mca_btl_base_module_t* btl,
+                        mca_btl_base_descriptor_t* des )
+{
+    mca_btl_mosix_frag_t* frag = (mca_btl_mosix_frag_t*)des;
+    MCA_BTL_MOSIX_FRAG_RETURN(frag);
+    return OMPI_SUCCESS;
+}
+
+/**
+ * Prepare a descriptor for send/rdma using the supplied
+ * convertor. If the convertor references data that is contiguous,
+ * the descriptor may simply point to the user buffer. Otherwise,
+ * this routine is responsible for allocating buffer space and
+ * packing if required.
+ *
+ * @param btl (IN)          BTL module
+ * @param endpoint (IN)     BTL peer addressing
+ * @param registration (IN) Not used by this BTL
+ * @param convertor (IN)    Data type convertor
+ * @param order (IN)        Not used by this BTL
+ * @param reserve (IN)      Additional bytes requested by upper layer to precede user data
+ * @param size (IN/OUT)     Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
+ * @param flags (IN)        Descriptor flags, stored as given
+ * @return The prepared descriptor, or NULL if the reserve does not fit in
+ *         a fragment, no fragment is available, or packing failed.
+ */
+mca_btl_base_descriptor_t*
+mca_btl_mosix_prepare_src(struct mca_btl_base_module_t* btl,
+                          struct mca_btl_base_endpoint_t* endpoint,
+                          struct mca_mpool_base_registration_t* registration,
+                          struct opal_convertor_t* convertor,
+                          uint8_t order,
+                          size_t reserve,
+                          size_t* size,
+                          uint32_t flags)
+{
+    mca_btl_mosix_frag_t* frag;
+    struct iovec iov;
+    uint32_t iov_count = 1;
+    size_t max_data = *size;
+    int rc;
+
+    /* The reserved area alone must fit, otherwise the limit below wraps */
+    if( OPAL_UNLIKELY(reserve > btl->btl_max_send_size) ) {
+        return NULL;
+    }
+
+    /* Enforce upper message length limit */
+    if( OPAL_UNLIKELY(max_data > btl->btl_max_send_size - reserve) ) {
+        max_data = btl->btl_max_send_size - reserve;
+    }
+
+    /* Fetch a fragment to work on */
+    if( max_data + reserve <= btl->btl_eager_limit ) {
+        MCA_BTL_MOSIX_FRAG_ALLOC_EAGER(frag, rc);
+    } else {
+        MCA_BTL_MOSIX_FRAG_ALLOC_MAX(frag, rc);
+    }
+    if( OPAL_UNLIKELY(NULL == frag) ) {
+        return NULL;
+    }
+    frag->segments[0].seg_len = reserve;
+
+    /* Never pack more than the caller asked for */
+    iov.iov_len = max_data;
+    if( (0 != reserve) || (opal_convertor_need_buffers(convertor)) ) {
+        /* Use existing buffer at the end of the fragment */
+        iov.iov_base = (unsigned char*)frag->segments[0].seg_addr.pval + reserve;
+        rc = opal_convertor_pack( convertor, &iov, &iov_count, &max_data );
+        if( 0 > rc ) {
+            MCA_BTL_MOSIX_FRAG_RETURN(frag);
+            return NULL;
+        }
+        frag->segments[0].seg_len += max_data;
+        frag->segments[1].seg_addr.pval = NULL;
+        frag->segments[1].seg_len = 0;
+        frag->base.des_src_cnt = 1;
+    } else {
+        /* With a NULL base the convertor fills in the iovec for the
+         * buffer to be transferred instead of copying it */
+        iov.iov_base = NULL;
+        rc = opal_convertor_pack( convertor, &iov, &iov_count, &max_data );
+        if( rc < 0 ) {
+            MCA_BTL_MOSIX_FRAG_RETURN(frag);
+            return NULL;
+        }
+        frag->segments[1].seg_addr.pval = iov.iov_base;
+        frag->segments[1].seg_len = max_data;
+        frag->base.des_src_cnt = 2;
+    }
+
+    /* Tell the caller how much was really prepared */
+    *size = max_data;
+
+    frag->base.des_src = frag->segments;
+    frag->base.order = MCA_BTL_NO_ORDER;
+    frag->base.des_dst = NULL;
+    frag->base.des_dst_cnt = 0;
+    frag->base.des_flags = flags;
+    return &frag->base;
+}
+
+/**
+ * Prepare a destination descriptor that points straight at the
+ * convertor's current position in the user buffer; no data is copied.
+ * The convertor is advanced by the number of bytes covered.
+ * This function is not registered in the module table of this file.
+ *
+ * @param btl (IN)          BTL module
+ * @param endpoint (IN)     BTL peer addressing
+ * @param registration (IN) Not used by this BTL
+ * @param convertor (IN)    Data type convertor
+ * @param order (IN)        Not used by this BTL
+ * @param reserve (IN)      Not used by this BTL
+ * @param size (IN/OUT)     Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
+ * @param flags (IN)        Descriptor flags, stored as given
+ */
+
+mca_btl_base_descriptor_t*
+mca_btl_mosix_prepare_dst(struct mca_btl_base_module_t* btl,
+                          struct mca_btl_base_endpoint_t* endpoint,
+                          struct mca_mpool_base_registration_t* registration,
+                          struct opal_convertor_t* convertor,
+                          uint8_t order,
+                          size_t reserve,
+                          size_t* size,
+                          uint32_t flags )
+{
+    mca_btl_mosix_frag_t* frag;
+    size_t origin, position;
+    int rc;
+
+    /* Clamp first, so that the limit also applies to the new position */
+    if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) {
+        *size = (size_t)UINT32_MAX;
+    }
+    position = *size;
+
+    MCA_BTL_MOSIX_FRAG_ALLOC_USER(frag, rc);
+    if( OPAL_UNLIKELY(NULL == frag) ) {
+        return NULL;
+    }
+    (void)rc;
+
+    opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments[0].seg_addr.pval) );
+    origin = convertor->bConverted;
+    position += origin;
+    opal_convertor_set_position( convertor, &position );
+    *size = position - origin;
+
+    frag->segments[0].seg_len = *size;
+    frag->base.des_src = NULL;
+    frag->base.des_src_cnt = 0;
+    frag->base.des_flags = flags;
+    frag->base.des_dst = frag->segments;
+    frag->base.des_dst_cnt = 1;
+    frag->base.order = MCA_BTL_NO_ORDER;
+    return &frag->base;
+}
+
+/**
+ * Initiate an asynchronous send.
+ *
+ * The tag is stored in the fragment and the fragment is handed to
+ * mca_btl_mosix_endpoint_send(), whose result is returned unchanged.
+ *
+ * @param btl (IN)         BTL module
+ * @param endpoint (IN)    BTL addressing information
+ * @param descriptor (IN)  Description of the data to be transferred
+ * @param tag (IN)         The tag value used to notify the peer.
+ */
+int mca_btl_mosix_send( struct mca_btl_base_module_t* btl,
+                        struct mca_btl_base_endpoint_t* endpoint,
+                        struct mca_btl_base_descriptor_t* descriptor,
+                        mca_btl_base_tag_t tag )
+
+{
+    mca_btl_mosix_frag_t* frag = (mca_btl_mosix_frag_t*)descriptor;
+
+    frag->tag = tag;
+    return mca_btl_mosix_endpoint_send((mca_btl_mosix_endpoint_t*)endpoint, frag);
+}
+
+/*
+ * Put and get are not implemented by this BTL; the module table leaves
+ * both entry points NULL.
+ */
Index: ompi/mca/btl/mosix/btl_mosix_component.c
===================================================================
--- ompi/mca/btl/mosix/btl_mosix_component.c (revision 0)
+++ ompi/mca/btl/mosix/btl_mosix_component.c (revision 0)
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2012 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2009      Oak Ridge National Laboratory
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ */
+
+#include "ompi_config.h"
+
+#include "opal/opal_socket_errno.h"
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <fcntl.h>
+#include <limits.h>
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include "orte/types.h"
+#include "orte/util/show_help.h"
+
+#include "ompi/constants.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/btl/base/base.h"
+#include "ompi/runtime/ompi_module_exchange.h"
+#include "ompi/mca/mpool/base/base.h"
+#include "ompi/mca/btl/base/btl_base_error.h"
+
+#include "btl_mosix.h"
+#include "btl_mosix_frag.h"
+#include "btl_mosix_endpoint.h"
+
+/*
+ * Register one integer MCA parameter of this component and return the
+ * value it ends up with.
+ */
+static inline int mca_btl_mosix_param_register_int(
+        const char* param_name,
+        const char* help_string,
+        int default_value)
+{
+    int value;
+    mca_base_param_reg_int(&mca_btl_mosix_component.super.btl_version,
+                           param_name, help_string, false, false,
+                           default_value, &value);
+    return value;
+}
+
+/*
+ * Local functions
+ */
+static int mca_btl_mosix_component_register(void);
+static int mca_btl_mosix_component_open(void);
+static int mca_btl_mosix_component_close(void);
+static int mca_btl_mosix_component_progress(void);
+
+/*
+ * The component descriptor. The initializer is positional; the last
+ * member points at the single module this component provides.
+ */
+mca_btl_mosix_component_t mca_btl_mosix_component = {
+    {
+        /* First, the mca_base_component_t struct containing meta information
+           about the component itself */
+
+        {
+            MCA_BTL_BASE_VERSION_2_0_0,
+
+            "mosix", /* MCA component name */
+            OMPI_MAJOR_VERSION,  /* MCA component major version */
+            OMPI_MINOR_VERSION,  /* MCA component minor version */
+            OMPI_RELEASE_VERSION,  /* MCA component release version */
+            mca_btl_mosix_component_open,  /* component open */
+            mca_btl_mosix_component_close,  /* component close */
+            NULL, /* component query */
+            mca_btl_mosix_component_register, /* component register */
+        },
+        {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        },
+
+        mca_btl_mosix_component_init,
+        mca_btl_mosix_component_progress
+    },
+    &mca_btl_mosix_module
+};
+
+/*
+ * Set the module defaults, let the BTL base register its generic
+ * parameters for them, then register the MOSIX-specific parameters:
+ * free-list sizing and the largest message sent over the UDP channel.
+ */
+static int mca_btl_mosix_component_register(void)
+{
+    mca_btl_mosix_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW + 100;
+    mca_btl_mosix_module.super.btl_eager_limit = 64*1024;
+    mca_btl_mosix_module.super.btl_rndv_eager_limit = 64*1024;
+    mca_btl_mosix_module.super.btl_max_send_size = 128*1024;
+    mca_btl_mosix_module.super.btl_rdma_pipeline_send_length = 128*1024;
+    mca_btl_mosix_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
+    mca_btl_mosix_module.super.btl_min_rdma_pipeline_size = 0;
+    /* NOTE(review): an RDMA-related flag is advertised although the module
+     * table provides no put/get - confirm this is intended */
+    mca_btl_mosix_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
+                                           MCA_BTL_FLAGS_SEND_INPLACE |
+                                           MCA_BTL_FLAGS_NEED_CSUM |
+                                           MCA_BTL_FLAGS_NEED_ACK |
+                                           MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
+    mca_btl_mosix_module.super.btl_bandwidth = 100;
+    mca_btl_mosix_module.super.btl_latency = 100;
+
+    mca_btl_base_param_register(&mca_btl_mosix_component.super.btl_version,
+                                &mca_btl_mosix_module.super);
+    mca_btl_mosix_component.mosix_free_list_num =
+        mca_btl_mosix_param_register_int( "free_list_num", NULL, 8 );
+    mca_btl_mosix_component.mosix_free_list_max =
+        mca_btl_mosix_param_register_int( "free_list_max", NULL, -1 );
+    mca_btl_mosix_component.mosix_free_list_inc =
+        mca_btl_mosix_param_register_int( "free_list_inc", NULL, 32 );
+    mca_btl_mosix_component.mosix_udp_max_size =
+        mca_btl_mosix_param_register_int( "udp_max_size", "Upper limit on message length for UDP", 512 );
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Gather the information composing our local address.
+ *
+ * The address is the text found in /proc/mosix/mosip (with a trailing
+ * newline, if any, removed) plus the PID of the calling process.
+ *
+ * @return OMPI_SUCCESS, OMPI_ERR_FILE_OPEN_FAILURE if the file cannot be
+ *         opened, or OMPI_ERR_FILE_READ_FAILURE if nothing could be read.
+ */
+static int mca_btl_mosix_retrieve_local_address(mca_btl_mosix_addr_t* local_address)
+{
+    ssize_t rc;
+
+    /* Determine local address from local file */
+    int local_addr_fd = open("/proc/mosix/mosip", O_RDONLY);
+    if (-1 == local_addr_fd) {
+        return OMPI_ERR_FILE_OPEN_FAILURE;
+    }
+
+    /* Read file contents (contains only the address, in the dotted-quad
+     * format); one byte is kept back for the terminator. The descriptor
+     * is not needed after this single read. */
+    rc = read(local_addr_fd, &local_address->addr_ipv4_str, MCA_BTL_MOSIX_ADDR_IPV4_LENGTH - 1);
+    close(local_addr_fd);
+    if ( 0 >= rc ) {
+        return OMPI_ERR_FILE_READ_FAILURE;
+    }
+
+    /* Terminate the string, dropping the newline only if there is one */
+    if ( '\n' == local_address->addr_ipv4_str[rc - 1] ) {
+        --rc;
+    }
+    local_address->addr_ipv4_str[rc] = 0;
+
+    /* Store process ID */
+    local_address->addr_pid = getpid();
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Component open: succeeds only if the local MOSIX address can be read.
+ */
+static int mca_btl_mosix_component_open(void)
+{
+    /* Determine the local address thus making sure MOSIX is up and running */
+    return mca_btl_mosix_retrieve_local_address(&mca_btl_mosix_component.mosix_module->module_mailbox_address);
+}
+
+/*
+ * Component close: nothing to release.
+ */
+static int mca_btl_mosix_component_close(void)
+{
+    return OMPI_SUCCESS;
+}
+
+/*
+ * MOSIX module initialization: since there's always a single module,
+ * just distribute its address (including our PID and IP(v4) address) and
+ * open the local mailbox for incoming messages (thus making sure MOSIX is up).
+ *
+ * @param num_btl_modules (OUT)  1 on success, 0 on failure
+ * @param enable_progress_threads (IN)  Not used
+ * @param enable_mpi_threads (IN)       Not used
+ * @return A malloc'ed one-element array holding the module, or NULL on
+ *         failure; everything acquired up to the failure is released.
+ */
+mca_btl_base_module_t** mca_btl_mosix_component_init(int *num_btl_modules,
+                                                     bool enable_progress_threads,
+                                                     bool enable_mpi_threads)
+{
+    int ret = OMPI_SUCCESS;
+    mca_btl_base_module_t** btls = NULL;
+    mca_btl_mosix_module_t* mosix_module = mca_btl_mosix_component.mosix_module;
+
+    *num_btl_modules = 0;
+
+    /* Notify other procs of our address (used in mca_btl_mosix_add_procs()) */
+    ret = ompi_modex_send(&mca_btl_mosix_component.super.btl_version,
+                          &mosix_module->module_mailbox_address,
+                          sizeof(mca_btl_mosix_addr_t));
+    if (OMPI_SUCCESS != ret) {
+        return NULL;
+    }
+
+    /* Open the handle to the incoming mailbox */
+    mosix_module->module_mailbox_fd =
+        open(MCA_BTL_MOSIX_LOCAL_PATH_STRING_FORMAT, O_CREAT,
+             MCA_BTL_MOSIX_LOCAL_MBOX_FLAGS);
+    if (-1 == mosix_module->module_mailbox_fd) {
+        return NULL;
+    }
+
+    /* One bit per possible connection, filled in by the progress function */
+    mosix_module->mosix_incoming_mask = OBJ_NEW(opal_bitmap_t);
+    if (NULL == mosix_module->mosix_incoming_mask) {
+        goto fail_mailbox;
+    }
+    if (OMPI_SUCCESS != opal_bitmap_init(mosix_module->mosix_incoming_mask,
+                                         MCA_BTL_MOSIX_MAX_CONNECTIONS)) {
+        goto fail_mask;
+    }
+
+    OBJ_CONSTRUCT (&mca_btl_mosix_component.mosix_lock, opal_mutex_t);
+    OBJ_CONSTRUCT (&mca_btl_mosix_component.mosix_frag_eager, ompi_free_list_t);
+    OBJ_CONSTRUCT (&mca_btl_mosix_component.mosix_frag_max, ompi_free_list_t);
+    OBJ_CONSTRUCT (&mca_btl_mosix_component.mosix_frag_user, ompi_free_list_t);
+
+    /* Every list element is a fragment header followed by its payload */
+    ret = ompi_free_list_init_new( &mca_btl_mosix_component.mosix_frag_eager,
+                                   sizeof(mca_btl_mosix_frag_t) + mca_btl_mosix_module.super.btl_eager_limit,
+                                   opal_cache_line_size,
+                                   OBJ_CLASS(mca_btl_mosix_frag_eager_t),
+                                   0, opal_cache_line_size,
+                                   mca_btl_mosix_component.mosix_free_list_num,
+                                   mca_btl_mosix_component.mosix_free_list_max,
+                                   mca_btl_mosix_component.mosix_free_list_inc,
+                                   NULL ); /* use default allocator */
+    if (OMPI_SUCCESS != ret) {
+        goto fail_lists;
+    }
+
+    ret = ompi_free_list_init_new( &mca_btl_mosix_component.mosix_frag_user,
+                                   sizeof(mca_btl_mosix_frag_t),
+                                   opal_cache_line_size,
+                                   OBJ_CLASS(mca_btl_mosix_frag_user_t),
+                                   0, opal_cache_line_size,
+                                   mca_btl_mosix_component.mosix_free_list_num,
+                                   mca_btl_mosix_component.mosix_free_list_max,
+                                   mca_btl_mosix_component.mosix_free_list_inc,
+                                   NULL ); /* use default allocator */
+    if (OMPI_SUCCESS != ret) {
+        goto fail_lists;
+    }
+
+    ret = ompi_free_list_init_new( &mca_btl_mosix_component.mosix_frag_max,
+                                   sizeof(mca_btl_mosix_frag_t)+mca_btl_mosix_module.super.btl_max_send_size,
+                                   opal_cache_line_size,
+                                   OBJ_CLASS(mca_btl_mosix_frag_max_t),
+                                   0, opal_cache_line_size,
+                                   mca_btl_mosix_component.mosix_free_list_num,
+                                   mca_btl_mosix_component.mosix_free_list_max,
+                                   mca_btl_mosix_component.mosix_free_list_inc,
+                                   NULL ); /* use default allocator */
+    if (OMPI_SUCCESS != ret) {
+        goto fail_lists;
+    }
+
+    /* Set the only BTL as output and return */
+    btls = malloc(sizeof(mca_btl_base_module_t*));
+    if (NULL == btls) {
+        goto fail_lists;
+    }
+    btls[0] = (mca_btl_base_module_t*)mosix_module;
+    *num_btl_modules = 1;
+    return btls;
+
+fail_lists:
+    OBJ_DESTRUCT (&mca_btl_mosix_component.mosix_frag_user);
+    OBJ_DESTRUCT (&mca_btl_mosix_component.mosix_frag_max);
+    OBJ_DESTRUCT (&mca_btl_mosix_component.mosix_frag_eager);
+    OBJ_DESTRUCT (&mca_btl_mosix_component.mosix_lock);
+fail_mask:
+    OBJ_RELEASE(mosix_module->mosix_incoming_mask);
+fail_mailbox:
+    close(mosix_module->module_mailbox_fd);
+    mosix_module->module_mailbox_fd = -1;
+    return NULL;
+}
+
+/*
+ * Poll for incoming messages on any of the connections. In fact - this is not
+ * affected by open/closed endpoints, but rather by anything found in the local mailbox.
+ * Messages in the mailbox are filtered by all the available filters (each endpoint has
+ * a corresponding filter on the mailbox, including sender PID and location).
+ * The ioctl() calls set the filters and create a bitmap where each bit represents
+ * the presence of an incoming message(s) fitting the corresponding filter.
+ * For each such "flagged" filter - the endpoint receiving callback is invoked.
+ */
+static int mca_btl_mosix_component_progress(void)
+{
+    mca_btl_mosix_module_t* module = mca_btl_mosix_component.mosix_module;
+    opal_bitmap_t* ready_mask = module->mosix_incoming_mask;
+    int slot = module->mosix_incoming.item_count;
+    int handled = 0;
+
+    /* Hand the current filter array to the mailbox */
+    ioctl(module->module_mailbox_fd, SIOCKSTOREINTERESTS, &(module->mosix_incoming));
+
+    /* Poll the mailbox; the answer lands in the bitmap storage */
+    ioctl(module->module_mailbox_fd, SIOCWHICH, ready_mask->bitmap);
+
+    /* Walk the filter slots from the last one down to slot 0 and invoke the
+     * receive handler of every endpoint whose bit is set */
+    for (slot = slot - 1; slot >= 0; slot--) {
+        if (!opal_bitmap_is_set_bit(ready_mask, slot)) {
+            continue;
+        }
+        mca_btl_mosix_endpoint_recv_handler(module->mosix_incoming.endpoints[slot]);
+        handled++;
+    }
+
+    /* Number of endpoints dispatched during this call */
+    return handled;
+}
Index: ompi/mca/btl/mosix/configure.params
===================================================================
--- ompi/mca/btl/mosix/configure.params (revision 0)
+++ ompi/mca/btl/mosix/configure.params (revision 0)
@@ -0,0 +1,24 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation. All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation. All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
+#                         reserved.
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" Index: ompi/mca/btl/mosix/btl_mosix_frag.h =================================================================== --- ompi/mca/btl/mosix/btl_mosix_frag.h (revision 0) +++ ompi/mca/btl/mosix/btl_mosix_frag.h (revision 0) @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_MOSIX_FRAG_H +#define MCA_BTL_MOSIX_FRAG_H + +#include "ompi_config.h" +#include "btl_mosix.h" + +BEGIN_C_DECLS + +#define MCA_BTL_MOSIX_HEADER_TYPE_SEND 1 +#define MCA_BTL_MOSIX_HEADER_TYPE_PUT 2 +#define MCA_BTL_MOSIX_HEADER_TYPE_GET 3 +#define MCA_BTL_MOSIX_HEADER_TYPE_RECV 4 + +/** + * MOSIX fragment derived type. + */ +struct mca_btl_mosix_frag_t { + mca_btl_base_descriptor_t base; + mca_btl_base_segment_t segments[2]; + ompi_free_list_t* my_list; + mca_btl_base_tag_t tag; + size_t size; + int type; +}; +typedef struct mca_btl_mosix_frag_t mca_btl_mosix_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_mosix_frag_t); + +typedef struct mca_btl_mosix_frag_t mca_btl_mosix_frag_eager_t; + +OBJ_CLASS_DECLARATION(mca_btl_mosix_frag_eager_t); + +typedef struct mca_btl_mosix_frag_t mca_btl_mosix_frag_max_t; + +OBJ_CLASS_DECLARATION(mca_btl_mosix_frag_max_t); + +typedef struct mca_btl_mosix_frag_t mca_btl_mosix_frag_user_t; + +OBJ_CLASS_DECLARATION(mca_btl_mosix_frag_user_t); + +/* + * Macros to allocate/return descriptors from module specific + * free list(s).
+ */ +#define MCA_BTL_MOSIX_FRAG_ALLOC_LIST( list, frag, rc ) \ +{ \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_GET(&(list), item, rc); \ + frag = (mca_btl_mosix_frag_t*) item; \ +} + +#define MCA_BTL_MOSIX_FRAG_ALLOC_EAGER(frag, rc) \ + MCA_BTL_MOSIX_FRAG_ALLOC_LIST(mca_btl_mosix_component.mosix_frag_eager, frag, rc) + +#define MCA_BTL_MOSIX_FRAG_ALLOC_MAX(frag, rc) \ + MCA_BTL_MOSIX_FRAG_ALLOC_LIST(mca_btl_mosix_component.mosix_frag_max, frag, rc) + +#define MCA_BTL_MOSIX_FRAG_ALLOC_USER(frag, rc) \ + MCA_BTL_MOSIX_FRAG_ALLOC_LIST(mca_btl_mosix_component.mosix_frag_user, frag, rc) + +#define MCA_BTL_MOSIX_FRAG_RETURN(frag) \ + { \ + OMPI_FREE_LIST_RETURN(frag->my_list, \ + (ompi_free_list_item_t*)(frag)); \ + } + + +END_C_DECLS + +#endif /* MCA_BTL_MOSIX_FRAG_H */ Index: ompi/mca/btl/mosix/help-mpi-btl-mosix.txt =================================================================== --- ompi/mca/btl/mosix/help-mpi-btl-mosix.txt (revision 0) +++ ompi/mca/btl/mosix/help-mpi-btl-mosix.txt (revision 0) @@ -0,0 +1,12 @@ +# -*- text -*- +# +# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open MPI's MOSIX support +# (the mosix BTL). +# Index: ompi/mca/btl/mosix/btl_mosix_endpoint.h =================================================================== --- ompi/mca/btl/mosix/btl_mosix_endpoint.h (revision 0) +++ ompi/mca/btl/mosix/btl_mosix_endpoint.h (revision 0) @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_MOSIX_ENDPOINT_H +#define MCA_BTL_MOSIX_ENDPOINT_H + +#include "btl_mosix.h" + +#define MCA_BTL_MOSIX_MBOX_PATH_STRING_FORMAT "/proc/mosix/mbox/%s/%i" +#define MCA_BTL_MOSIX_UBOX_PATH_STRING_FORMAT "/proc/mosix/ubox/%s/%i" +#define MCA_BTL_MOSIX_LOCAL_PATH_STRING_FORMAT "/proc/mosix/mybox" +#define MCA_BTL_MOSIX_MBOX_PATH_BUFFER_LENGTH (35) +#define MCA_BTL_MOSIX_MAX_CONNECTIONS (128) /* 128 DiCOMs possible */ +#define MCA_BTL_MOSIX_LOCAL_MBOX_FLAGS (71) /* See flags in `man direct_communication` */ +#define MCA_BTL_MOSIX_ADDR_IPV4_LENGTH (16) /* Space for xxx.xxx.xxx.xxx + null truncation */ + +/* Define conditions for filter bit-field */ +#define MCA_BTL_MOSIX_INTERESTED_IN_PID (1) +#define MCA_BTL_MOSIX_INTERESTED_IN_HOME (2) +#define MCA_BTL_MOSIX_INTERESTED_IN_MINLEN (4) +#define MCA_BTL_MOSIX_INTERESTED_IN_MAXLEN (8) +#define MCA_BTL_MOSIX_INTERESTED_IN_PATTERN (16) +#define MCA_BTL_MOSIX_INTERESTED_IN_MESSAGENO (32) +#define MCA_BTL_MOSIX_INTERESTED_IN_OFFSET (64) +#define MCA_BTL_MOSIX_PREVENT_REMOVAL (128) + +/* Define special ioctl codes */ +#define SIOCINTERESTED (0x8985) +#define SIOCKSTOREINTERESTS (0x8986) +#define SIOCWHICH (0x8987) + +BEGIN_C_DECLS + +/** + * Structure used to publish MOSIX connection information to peers. + */ + +struct mca_btl_mosix_addr_t { + char addr_ipv4_str[MCA_BTL_MOSIX_ADDR_IPV4_LENGTH]; /**< ipv4 address */ + pid_t addr_pid; /**< process ID */ +}; +typedef struct mca_btl_mosix_addr_t mca_btl_mosix_addr_t; + +/** + * Endpoint is a representation of a connection to a remote process. + * While the address is filled when the endpoint structure is created, both + * FDs remain closed until they are needed. Since there are currently two methods + * to send data over, via TCP or UDP, both channels can be open at the same time. 
+ * UDP should be favored for async. sending, but is often out-performed by TCP, and + * requires tending to packet order and acknowledgements (lower layer only provides delivery). + */ + +struct mca_btl_base_endpoint_t { + int endpoint_tcp_fd; /**< file descriptor for sync. sending */ + int endpoint_udp_fd; /**< file descriptor for async. sending */ + mca_btl_mosix_addr_t* endpoint_address; /**< address of the remote mailbox */ + opal_mutex_t endpoint_establishment_lock; /**< lock to protect endpoint establishment */ + bool endpoint_nbo; /**< convert headers to network byte order */ +}; + +typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; +typedef mca_btl_base_endpoint_t mca_btl_mosix_endpoint_t; +OBJ_CLASS_DECLARATION(mca_btl_mosix_endpoint_t); + +int mca_btl_mosix_endpoint_send(mca_btl_mosix_endpoint_t* mosix_endpoint, mca_btl_mosix_frag_t* frag); +void mca_btl_mosix_endpoint_recv_handler(mca_btl_base_endpoint_t* btl_endpoint); + +/* + * The filter is used to specify the criteria for incoming mailbox messages + * for an endpoint. For example, if this process is connected to the remote + * process #1234, the filter will include the flag MCA_BTL_MOSIX_INTERESTED_IN_PID + * in the conditions bitmap, and the pid will contain 1234.
+ */ +struct mca_btl_mosix_filter_t { + unsigned char filter_conditions; /**< bitmap of fields effecting the filter (below) */ + unsigned char filter_testlen; /**< length of test-pattern (1-8 bytes) */ + int filter_pid; /**< Process-ID of sender */ + unsigned int filter_home; /**< home-node of sender (0 = same home) */ + int filter_minlen; /**< minimum message length */ + int filter_maxlen; /**< maximum message length */ + int filter_testoffset; /**< offset of test-pattern within message */ + unsigned char filter_testdata[8]; /**< expected test-pattern */ + int filter_msgno; /**< pick a specific message (starting from 1) */ + int filter_msgoffset; /**< start reading from given offset */ +}; +typedef struct mca_btl_mosix_filter_t mca_btl_mosix_filter_t; + +/* + * This is an array of filters and their corresponding endpoints (arrays + * should both be of the same length, stored in item_count). + * The progress function polls for incoming messages to all existing + * endpoints by feeding the criteria for each one (using an ioctl() call). + * When a message fitting a criteria is found - the corresponding endpoint + * is dispatched to the incoming message handler to retrieve it. + */ +struct mca_btl_mosix_incoming_array_t { + long item_count; /**< number of filters */ + mca_btl_mosix_filter_t* filters; /**< filters to store criteria */ + mca_btl_mosix_endpoint_t** endpoints; /**< endpoints for incoming data */ +}; +typedef struct mca_btl_mosix_incoming_array_t mca_btl_mosix_incoming_array_t; + +END_C_DECLS +#endif Index: ompi/mca/btl/mosix/btl_mosix.h =================================================================== --- ompi/mca/btl/mosix/btl_mosix.h (revision 0) +++ ompi/mca/btl/mosix/btl_mosix.h (revision 0) @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2012 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_BTL_MOSIX_H +#define MCA_BTL_MOSIX_H + +/* Open MPI includes */ +#include "ompi_config.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/btl/btl.h" + +#include "btl_mosix_frag.h" +#include "btl_mosix_endpoint.h" +BEGIN_C_DECLS + +/** + * MOSIX BTL Module Interface (one per process). + * Note: MOSIX issues a "dummy" FD for the mailbox, not recognized by the system. + * This should be an oddly high number, such as 207618048, so it won't collide with + * native linux FDs. + */ +struct mca_btl_mosix_module_t { + mca_btl_base_module_t super; /**< base BTL interface */ + + int module_mailbox_fd; + /**< Local mailbox file descriptor for polling/reading incoming messages */ + + mca_btl_mosix_addr_t module_mailbox_address; + /**< Local mailbox address - published to other processes */ + + mca_btl_mosix_incoming_array_t mosix_incoming; + /**< Array of filters for MOSIX direct communication polling */ + + opal_bitmap_t* mosix_incoming_mask; + /**< Bit-field corresponding to incoming data in array */ +}; +typedef struct mca_btl_mosix_module_t mca_btl_mosix_module_t; +extern mca_btl_mosix_module_t mca_btl_mosix_module; + +/* Placeholder for additional header information in the future */ +typedef struct mca_btl_base_header_t mca_btl_mosix_header_t; + +/** + * MOSIX BTL component. 
+ */ +struct mca_btl_mosix_component_t { + mca_btl_base_component_2_0_0_t super; + /**< base BTL component */ + + mca_btl_mosix_module_t* mosix_module; + /**< local module - since there is only one it can be static */ + + size_t mosix_udp_max_size; + /**< upper limit for using udp - tcp used above this length */ + + int mosix_free_list_num; + /**< initial size of free lists */ + + int mosix_free_list_max; + /**< maximum size of free lists */ + + int mosix_free_list_inc; + /**< number of elements to alloc when growing free lists */ + + int mosix_max_posted_recv; + /**< number of pre-posted receives */ + + /* free list of fragment descriptors */ + ompi_free_list_t mosix_frag_eager; + ompi_free_list_t mosix_frag_max; + ompi_free_list_t mosix_frag_user; + + opal_list_t mosix_procs; + /**< list of mosix proc structures */ + + opal_mutex_t mosix_lock; + /**< lock for accessing module state */ +}; +typedef struct mca_btl_mosix_component_t mca_btl_mosix_component_t; +OMPI_MODULE_DECLSPEC extern mca_btl_mosix_component_t mca_btl_mosix_component; + +/** + * MOSIX component initialization. + * + * @param num_btl_modules (OUT) Number of BTLs returned in BTL array. + * @param allow_multi_user_threads (OUT) Flag indicating wether BTL supports user threads (TRUE) + * @param have_hidden_threads (OUT) Flag indicating wether BTL uses threads (TRUE) + */ +extern mca_btl_base_module_t** mca_btl_mosix_component_init( + int *num_btl_modules, + bool allow_multi_user_threads, + bool have_hidden_threads +); + + +/** + * Cleanup any resources held by the BTL. + * + * @param btl BTL instance. + * @return OMPI_SUCCESS or error status on failure. + */ + +extern int mca_btl_mosix_finalize( + struct mca_btl_base_module_t* btl +); + + +/** + * PML->BTL notification of change in the process list. + * + * @param btl (IN) + * @param nprocs (IN) Number of processes + * @param procs (IN) Set of processes + * @param peers (OUT) Set of (optional) peer addressing info. 
+ * @param reachable (IN/OUT) Set of processes that are reachable via this BTL. + * @return OMPI_SUCCESS or error status on failure. + * + */ + +extern int mca_btl_mosix_add_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t** peers, + opal_bitmap_t* reachable +); + +/** + * PML->BTL notification of change in the process list. + * + * @param btl (IN) BTL instance + * @param nprocs (IN) Number of processes. + * @param procs (IN) Set of processes. + * @param peers (IN) Set of peer data structures. + * @return Status indicating if cleanup was successful + * + */ + +extern int mca_btl_mosix_del_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t** peers +); + + +/** + * Initiate an asynchronous send. + * + * @param btl (IN) BTL module + * @param btl_peer (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + * @param tag (IN) The tag value used to notify the peer. + */ + +extern int mca_btl_mosix_send( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* descriptor, + mca_btl_base_tag_t tag +); + + +/** + * Initiate an asynchronous put. + * + * @param btl (IN) BTL module + * @param btl_peer (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ + +extern int mca_btl_mosix_put( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* decriptor +); + + +/** + * Initiate an asynchronous get.
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + */ + +extern int mca_btl_mosix_get( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* decriptor +); + +/** + * Allocate a descriptor with a segment of the requested size. + * Note that the BTL layer may choose to return a smaller size + * if it cannot support the request. + * + * @param btl (IN) BTL module + * @param size (IN) Request segment size. + */ + +extern mca_btl_base_descriptor_t* mca_btl_mosix_alloc( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + uint8_t order, + size_t size, + uint32_t flags); + + +/** + * Return a segment allocated by this BTL. + * + * @param btl (IN) BTL module + * @param descriptor (IN) Allocated descriptor. + */ + +extern int mca_btl_mosix_free( + struct mca_btl_base_module_t* btl, + mca_btl_base_descriptor_t* des); + + +/** + * Prepare a descriptor for send/rdma using the supplied + * convertor. If the convertor references data that is contigous, + * the descriptor may simply point to the user buffer. Otherwise, + * this routine is responsible for allocating buffer space and + * packing if required. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL peer addressing + * @param convertor (IN) Data type convertor + * @param reserve (IN) Additional bytes requested by upper layer to precede user data + * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) +*/ + +mca_btl_base_descriptor_t* mca_btl_mosix_prepare_src( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + struct mca_mpool_base_registration_t*, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags); + +mca_btl_base_descriptor_t* mca_btl_mosix_prepare_dst( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + struct mca_mpool_base_registration_t*, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags); + +END_C_DECLS +#endif Index: ompi/mca/btl/mosix/Makefile.am =================================================================== --- ompi/mca/btl/mosix/Makefile.am (revision 0) +++ ompi/mca/btl/mosix/Makefile.am (revision 0) @@ -0,0 +1,54 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-mpi-btl-mosix.txt + +sources = \ + btl_mosix.c \ + btl_mosix.h \ + btl_mosix_frag.c \ + btl_mosix_frag.h \ + btl_mosix_component.c \ + btl_mosix_endpoint.c \ + btl_mosix_endpoint.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if OMPI_BUILD_btl_mosix_DSO +lib = +lib_sources = +component = mca_btl_mosix.la +component_sources = $(sources) +else +lib = libmca_btl_mosix.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component) +mca_btl_mosix_la_SOURCES = $(component_sources) +mca_btl_mosix_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(lib) +libmca_btl_mosix_la_SOURCES = $(lib_sources) +libmca_btl_mosix_la_LDFLAGS = -module -avoid-version Index: orte/mca/ras/mosix/configure.m4 =================================================================== --- orte/mca/ras/mosix/configure.m4 (revision 0) +++ orte/mca/ras/mosix/configure.m4 (revision 0) @@ -0,0 +1,28 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ras_mosix_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_ras_mosix_CONFIG],[ + # check for mosix presence + AC_CHECK_FILE([/proc/mosix/mosip], + [$1], + [$2]) +])dnl Index: orte/mca/ras/mosix/configure.params =================================================================== --- orte/mca/ras/mosix/configure.params (revision 0) +++ orte/mca/ras/mosix/configure.params (revision 0) @@ -0,0 +1,22 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +PARAM_CONFIG_FILES="Makefile" Index: orte/mca/ras/mosix/ras_mosix_component.c =================================================================== --- orte/mca/ras/mosix/ras_mosix_component.c (revision 0) +++ orte/mca/ras/mosix/ras_mosix_component.c (revision 0) @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/util/name_fns.h"
+
+#include "orte/mca/ras/base/ras_private.h"
+#include "ras_mosix.h"
+
+/* Flag defined in ras_mosix_module.c; set from the MCA parameter below */
+extern int g_orte_mca_ras_mosix_prevent_oversubscription;
+
+/*
+ * Local variables
+ */
+/* Handle of the "priority" MCA parameter, as returned at registration */
+static int param_priority;
+
+
+/*
+ * Local functions
+ */
+static int orte_ras_mosix_open(void);
+static int orte_ras_mosix_component_query(mca_base_module_t **module, int *priority);
+
+
+orte_ras_base_component_t mca_ras_mosix_component = {
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+    {
+        ORTE_RAS_BASE_VERSION_2_0_0,
+
+        /* Component name and version */
+        "mosix",
+        ORTE_MAJOR_VERSION,
+        ORTE_MINOR_VERSION,
+        ORTE_RELEASE_VERSION,
+
+        /* Component open and close functions */
+        orte_ras_mosix_open,
+        NULL,
+        orte_ras_mosix_component_query
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    }
+};
+
+
+/*
+ * Component open: register the two MCA parameters of this component
+ * ("priority" and "prevent_oversubscription") and load the current value of
+ * the latter into g_orte_mca_ras_mosix_prevent_oversubscription.
+ * Always returns ORTE_SUCCESS.
+ */
+static int orte_ras_mosix_open(void)
+{
+    int param_oversubscription;
+
+    /* Register the selection priority of this component (default: 90); the
+     * value itself is fetched in orte_ras_mosix_component_query() */
+    param_priority =
+        mca_base_param_reg_int(&mca_ras_mosix_component.base_version,
+                               "priority",
+                               "Priority of the mosix ras component",
+                               false, false, 90, NULL);
+
+    /* Decide if allocation must wait for the resources to be available (set to non-zero)
+     * or go for the best effort immediately.
+     * TODO: Make it the time to wait before resorting to best effort.
+     */
+    param_oversubscription =
+        mca_base_param_reg_int(&mca_ras_mosix_component.base_version,
+                               "prevent_oversubscription",
+                               "Wait for enough free resources to prevent oversubscription",
+                               false, false, 0, NULL);
+
+    /* The registration call hands back a parameter handle (the same kind of
+     * value that is passed to mca_base_param_lookup_int() for the priority),
+     * not the parameter's value - so look the value up explicitly. When the
+     * registration failed the flag keeps its initial value. */
+    if (0 <= param_oversubscription) {
+        mca_base_param_lookup_int(param_oversubscription,
+                                  &g_orte_mca_ras_mosix_prevent_oversubscription);
+    }
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Component query: report the configured priority and offer the MOSIX RAS
+ * module for selection. Always returns ORTE_SUCCESS.
+ */
+static int orte_ras_mosix_component_query(mca_base_module_t **module, int *priority)
+{
+    mca_base_param_lookup_int(param_priority, priority);
+    OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
+                         "%s ras:mosix: available for selection",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+    *module = (mca_base_module_t *) &orte_ras_mosix_module;
+    return ORTE_SUCCESS;
+}
+
Index: orte/mca/ras/mosix/Makefile.am
===================================================================
--- orte/mca/ras/mosix/Makefile.am (revision 0)
+++ orte/mca/ras/mosix/Makefile.am (revision 0)
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation. All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation. All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+AM_CPPFLAGS = $(ras_mosix_CPPFLAGS)
+
+sources = \
+        ras_mosix.h \
+        ras_mosix_component.c \
+        ras_mosix_module.c
+
+
+# Make the output library in this directory, and name it either
+# mca__.la (for DSO builds) or libmca__.la
+# (for static builds).
+ +if OMPI_BUILD_ras_mosix_DSO +lib = +lib_sources = +component = mca_ras_mosix.la +component_sources = $(sources) +else +lib = libmca_ras_mosix.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component) +mca_ras_mosix_la_SOURCES = $(component_sources) +mca_ras_mosix_la_LDFLAGS = -module -avoid-version $(ras_mosix_LDFLAGS) +mca_ras_mosix_la_LIBADD = $(ras_mosix_LIBS) + +noinst_LTLIBRARIES = $(lib) +libmca_ras_mosix_la_SOURCES = $(lib_sources) +libmca_ras_mosix_la_LDFLAGS = -module -avoid-version $(ras_mosix_LDFLAGS) +libmca_ras_mosix_la_LIBADD = $(ras_mosix_LIBS) Index: orte/mca/ras/mosix/ras_mosix.h =================================================================== --- orte/mca/ras/mosix/ras_mosix.h (revision 0) +++ orte/mca/ras/mosix/ras_mosix.h (revision 0) @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * + * Resource Allocation (MOSIX) + */ +#ifndef ORTE_RAS_MOSIX_H +#define ORTE_RAS_MOSIX_H + +#include "orte_config.h" +#include "orte/mca/ras/ras.h" +#include "orte/mca/ras/base/base.h" + +BEGIN_C_DECLS + + ORTE_DECLSPEC extern orte_ras_base_component_t mca_ras_mosix_component; + ORTE_DECLSPEC extern orte_ras_base_module_t orte_ras_mosix_module; + +END_C_DECLS + +#endif Index: orte/mca/ras/mosix/ras_mosix_module.c =================================================================== --- orte/mca/ras/mosix/ras_mosix_module.c (revision 0) +++ orte/mca/ras/mosix/ras_mosix_module.c (revision 0) @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 IBM Corporation. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+
+/* NOTE(review): the three directives below carry no header name (it looks
+ * like the <...> part was lost when this patch was extracted) - the system
+ * header names have to be restored before this can compile. */
+#include
+#include
+#include
+
+#include "opal/util/argv.h"
+#include "opal/util/output.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/constants.h"
+
+#include "orte/mca/ras/base/ras_private.h"
+#include "ras_mosix.h"
+
+
+/*
+ * Local functions
+ */
+static int orte_ras_mosix_allocate(opal_list_t *nodes);
+static int orte_ras_mosix_finalize(void);
+
+static int orte_ras_mosix_discover(opal_list_t *nodelist, int count);
+static int get_next_bestnode(char **node_name);
+static int queue_for_allocation(int num_procs);
+
+#define MAX_IPV4_ADDRESS_LENGHT 512
+
+#define BEST_NODE_COMMAND "mosbestnode"
+#define PARALLEL_QUEUE_COMMAND "mosrun -q -P%i echo"
+
+/*
+ * Global variable
+ */
+orte_ras_base_module_t orte_ras_mosix_module = {
+    orte_ras_mosix_allocate,
+    orte_ras_mosix_finalize
+};
+
+/* Non-zero: queue for free resources before discovering nodes */
+int g_orte_mca_ras_mosix_prevent_oversubscription = 0;
+
+/*
+ * Module allocate entry point: optionally queue until enough resources
+ * are free, then fill the node list with one slot per requested process.
+ * Returns the status of the first step that fails, ORTE_SUCCESS otherwise.
+ */
+static int orte_ras_mosix_allocate(opal_list_t *nodes)
+{
+    int num_procs = orte_ras_base.total_slots_alloc;
+    int status;
+
+    OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
+                         "ras:mosix:allocate: Allocating %i nodes.", num_procs));
+
+    /* When over-subscription is disallowed, wait in the queue first */
+    if (0 != g_orte_mca_ras_mosix_prevent_oversubscription) {
+        status = queue_for_allocation(num_procs);
+        if (ORTE_SUCCESS != status) {
+            return status;
+        }
+    }
+
+    /* Collect the requested number of slots; a failure is logged and
+     * passed on to the caller */
+    status = orte_ras_mosix_discover(nodes, num_procs);
+    if (ORTE_SUCCESS != status) {
+        ORTE_ERROR_LOG(status);
+    }
+    return status;
+}
+
+/*
+ * Module finalize entry point: no state to release.
+ */
+static int orte_ras_mosix_finalize(void)
+{
+    return ORTE_SUCCESS;
+}
+
+/**
+ * Discover the available resources using MOSIX.
+ * Ignore hostfile or any other user-specified parameters.
+ */ +static int orte_ras_mosix_discover(opal_list_t* nodelist, int count) +{ + int index = 0; + orte_node_t *node; + opal_list_item_t* item; + char *hostname; + + /* Iterate through all the nodes and make an entry for each */ + while ((index++ < count) && + (ORTE_SUCCESS == get_next_bestnode(&hostname))) { + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:discover: got hostname %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname)); + + /* Remember that LoadLeveler may list the same node more than once. + So we have to check for duplicates. */ + for (item = opal_list_get_first(nodelist); + opal_list_get_end(nodelist) != item; + item = opal_list_get_next(item)) { + node = (orte_node_t*) item; + if (0 == strcmp(node->name, hostname)) { + ++node->slots; + + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:discover: found -- bumped slots to %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots)); + break; + } + } + + /* Did we find it? */ + if (opal_list_get_end(nodelist) == item) { + /* Nope -- didn't find it, so add a new item to the list */ + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:discover: not found -- added to list", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + node = OBJ_NEW(orte_node_t); + node->name = hostname; + node->state = ORTE_NODE_STATE_UP; + node->slots_inuse = 0; + node->slots_max = 0; + node->slots = 1; + opal_list_append(nodelist, &node->super); + } else { + /* Yes, so we need to free the hostname that came back */ + free(hostname); + } + } + return ORTE_SUCCESS; +} + +static int queue_for_allocation(int num_procs) +{ + char* command; + /* Make sure a shell is available (sanity check) */ + int rc = system(NULL); + if (0 > rc) { + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:queue_for_allocation: Shell not available", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return ORTE_ERROR; + } + + /* Construct command for queuing N parallel "echo" 
(dummy) processes */ + rc = asprintf(&command, PARALLEL_QUEUE_COMMAND, num_procs); + if (0 > rc) { + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:queue_for_allocation: Command construction failed", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return ORTE_ERROR; + } + + /* Run the command and wait for it to finish (ignore output) */ + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:queue_for_allocation: running \"%s\"", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), command)); + rc = system(command); + free(command); + if (0 != rc) { + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:queue_for_allocation: Failed to queue for allocation", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return ORTE_ERROR; + } + return ORTE_SUCCESS; +} + +static int get_next_bestnode(char** node_name) +{ + int rc; + char name_buffer[MAX_IPV4_ADDRESS_LENGHT] = {0}; + + /* Run "mosbestnode" external application */ + FILE* reader = popen(BEST_NODE_COMMAND, "r"); + if (NULL == reader) { + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:get_next_bestnode: failed to run %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), BEST_NODE_COMMAND)); + return ORTE_ERROR; + } + + /* Read the IP(v4) address it returns */ + rc = fread(name_buffer, sizeof(char), MAX_IPV4_ADDRESS_LENGHT-1, reader); + pclose(reader); + if (rc < 0) { + OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output, + "%s ras:mosix:allocate:get_next_bestnode: reading fstream error #%i", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ferror(reader))); + return ORTE_ERROR; + } + /* Ignore last new-line and duplicate the allocated host IP address */ + name_buffer[rc-1] = 0; + *node_name = strdup(name_buffer); + return ORTE_SUCCESS; +} Index: orte/mca/ras/base/base.h =================================================================== --- orte/mca/ras/base/base.h (revision 26266) +++ orte/mca/ras/base/base.h (working copy) @@ -45,6 +45,7 @@ bool allocation_read; bool 
display_alloc; orte_ras_base_module_t *active_module; + int total_slots_alloc; } orte_ras_base_t; ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base; Index: orte/mca/ras/base/ras_base_allocate.c =================================================================== --- orte/mca/ras/base/ras_base_allocate.c (revision 26266) +++ orte/mca/ras/base/ras_base_allocate.c (working copy) @@ -128,6 +128,15 @@ */ orte_ras_base.allocation_read = true; + /* Count the amount of slots to allocate */ + orte_ras_base.total_slots_alloc = 0; + for (i=0; i < jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + continue; + } + orte_ras_base.total_slots_alloc += app->num_procs; + } + /* construct a list to hold the results */ OBJ_CONSTRUCT(&nodes, opal_list_t); Index: orte/mca/odls/mosix/configure.m4 =================================================================== --- orte/mca/odls/mosix/configure.m4 (revision 0) +++ orte/mca/odls/mosix/configure.m4 (revision 0) @@ -0,0 +1,28 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_odls_mosix_CONFIG([action-if-found], [action-if-not-found])
+# -----------------------------------------------------------
+AC_DEFUN([MCA_odls_mosix_CONFIG],[
+    # check for mosix presence: [$1] (action-if-found) runs when /proc/mosix/mosip exists on the machine running configure, [$2] otherwise
+    AC_CHECK_FILE([/proc/mosix/mosip],
+                  [$1],
+                  [$2])
+])dnl
Index: orte/mca/odls/mosix/help-odls-mosix.txt
===================================================================
--- orte/mca/odls/mosix/help-odls-mosix.txt (revision 0)
+++ orte/mca/odls/mosix/help-odls-mosix.txt (revision 0)
@@ -0,0 +1,121 @@
+# -*- text -*-
+#
+# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
+# Copyright (c) 2010-2011 Cisco Systems, Inc.  All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This is a US/English help file.
+#
+[execve error]
+Open MPI tried to fork a new process via the "execve" system call but
+failed.  This is an unusual error because Open MPI checks many things
+before attempting to launch a child process.  This error may be
+indicative of another problem on the target host.  Your job will now
+abort.
+
+  Local host:        %s
+  Application name:  %s
+#
+[binding not supported]
+Open MPI tried to bind a new process, but process binding is not
+supported on the host where it was launched.  The process was killed
+without launching the target application.  Your job will now abort.
+ + Local host: %s + Application name: %s +# +[binding generic error] +Open MPI tried to bind a new process, but something went wrong. The +process was killed without launching the target application. Your job +will now abort. + + Local host: %s + Application name: %s + Error message: %s + Location: %s:%d +# +[bound to everything] +Open MPI tried to bind a new process to a specific set of processors, +but ended up binding it to *all* processors. This means that the new +process is effectively unbound. + +This is only a warning -- your job will continue. You can suppress +this warning in the future by setting the odls_warn_if_not_bound MCA +parameter to 0. + + Local host: %s + Application name: %s + Location: %s:%d +# +[slot list and paffinity_alone] +Open MPI detected that both a slot list was specified and the MCA +parameter "paffinity_alone" was set to true. Only one of these can be +used at a time. Your job will now abort. + + Local host: %s + Application name: %s +# +[iof setup failed] +Open MPI tried to launch a child process but the "IOF child setup" +failed. This should not happen. Your job will now abort. + + Local host: %s + Application name: %s +# +[not bound] +WARNING: Open MPI tried to bind a process but failed. This is a +warning only; your job will continue. + + Local host: %s + Application name: %s + Error message: %s + Location: %s:%d +# +[syscall fail] +A system call failed that should not have. In this particular case, +a warning or error message was not displayed that should have been. +Your job may behave unpredictably after this, or abort. + + Local host: %s + Application name: %s + Function: %s + Location: %s:%d +# +[memory not bound] +WARNING: Open MPI tried to bind a process but failed. This is a +warning only; your job will continue, though performance may +be degraded. 
+
+  Local host:        %s
+  Application name:  %s
+  Error message:     %s
+  Location:          %s:%d
+
+#
+[memory binding error]
+Open MPI tried to bind memory for a new process but something went
+wrong.  The process was killed without launching the target
+application.  Your job will now abort.
+
+  Local host:        %s
+  Application name:  %s
+  Error message:     %s
+  Location:          %s:%d
+#
+[hostname and address collision]
+Both a host name and an IP address were specified as the remote node.
+Your job will start with the host name ignored.
\ No newline at end of file
Index: orte/mca/odls/mosix/odls_mosix_component.c
===================================================================
--- orte/mca/odls/mosix/odls_mosix_component.c (revision 0)
+++ orte/mca/odls/mosix/odls_mosix_component.c (revision 0)
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * These symbols are in a file by themselves to provide nice linker
+ * semantics.  Since linkers generally pull in symbols by object
+ * files, keeping these symbols as the only symbols in this file
+ * prevents utility programs such as "ompi_info" from having to import
+ * entire components just to query their version and parameters.
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <ctype.h>
+
+#include "orte/util/show_help.h"
+#include "orte/mca/odls/odls.h"
+#include "orte/mca/odls/base/odls_private.h"
+#include "orte/mca/odls/mosix/odls_mosix.h"
+
+/* instantiate a module-global variable */
+bool orte_odls_mosix_report_bindings;
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointers to our public functions in it
+ */
+
+orte_odls_mosix_component_t mca_odls_mosix_component = {
+    {
+        /* First, the mca_component_t struct containing meta information
+           about the component itself */
+        {
+            ORTE_ODLS_BASE_VERSION_2_0_0,
+            /* Component name and version */
+            "mosix",
+            ORTE_MAJOR_VERSION,
+            ORTE_MINOR_VERSION,
+            ORTE_RELEASE_VERSION,
+
+            /* Component open and close functions */
+            orte_odls_mosix_component_open,
+            orte_odls_mosix_component_close,
+            orte_odls_mosix_component_query,
+            orte_odls_mosix_component_register
+        },
+        {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        }
+    }
+};
+
+/*
+ * utility routines for parameter registration
+ */
+
+/* Register a string parameter of this component and return the value
+ * mca_base_param_reg_string() stored for it (may be NULL). */
+static inline char* mca_odls_mosix_param_register_string(
+                        const char* param_name,
+                        const char* help_string,
+                        const char* default_value)
+{
+    char *value;
+    mca_base_param_reg_string(&mca_odls_mosix_component.super.version,
+                              param_name, help_string, false, false,
+                              default_value, &value);
+    return value;
+}
+
+/* Register an integer parameter of this component and return its
+ * current value. */
+static inline int mca_odls_mosix_param_register_int(
+                        const char* param_name,
+                        const char* help_string,
+                        int default_value)
+{
+    int value;
+    mca_base_param_reg_int(&mca_odls_mosix_component.super.version,
+                           param_name, help_string, false, false,
+                           default_value, &value);
+    return value;
+}
+
+/* Integer registration for the parameters kept in uint8_t fields:
+ * values outside 0..255 are clamped instead of being silently wrapped
+ * by the narrowing assignment. */
+static inline uint8_t mca_odls_mosix_param_register_uint8(
+                        const char* param_name,
+                        const char* help_string,
+                        int default_value)
+{
+    int value = mca_odls_mosix_param_register_int(param_name, help_string,
+                                                  default_value);
+    if (value < 0) {
+        value = 0;
+    } else if (value > 255) {
+        value = 255;
+    }
+    return (uint8_t) value;
+}
+
+int orte_odls_mosix_component_open(void)
+{
+    return ORTE_SUCCESS;
+}
+
+int orte_odls_mosix_component_query(mca_base_module_t **module, int *priority)
+{
+    /* the base open/select logic protects us against operation when
+     * we are NOT in a daemon, so we don't have to check that here
+     */
+
+    /* the configure.m4 file of this component only builds it when
+     * /proc/mosix/mosip is found.  Hence, we only get here if we CAN
+     * build - in which case, we definitely should be considered for
+     * selection
+     */
+    *priority = 2; /* default priority + 1 */
+    *module = (mca_base_module_t *) &orte_odls_mosix_module;
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Register the MCA parameters of the component and cache their values
+ * in mca_odls_mosix_component.
+ */
+int orte_odls_mosix_component_register(void)
+{
+    mca_odls_mosix_component.mosix_remote_host_name =
+        mca_odls_mosix_param_register_string("remote_host_name", "Host name to serve as a remote node for the process.", NULL);
+    mca_odls_mosix_component.mosix_remote_host_address =
+        mca_odls_mosix_param_register_string("remote_host_address", "Host address to serve as a remote node for the process.", NULL);
+    if ((NULL != mca_odls_mosix_component.mosix_remote_host_name) &&
+        (NULL != mca_odls_mosix_component.mosix_remote_host_address))
+    {
+        /* The address wins; release the ignored name rather than leak it */
+        orte_show_help("help-odls-mosix.txt", "hostname and address collision", true);
+        free(mca_odls_mosix_component.mosix_remote_host_name);
+        mca_odls_mosix_component.mosix_remote_host_name = NULL;
+    }
+
+    mca_odls_mosix_component.mosix_grid_class =
+        mca_odls_mosix_param_register_uint8("grid_class", "Class permitted for migration in a MOSIX multi-cluster.", 0);
+    mca_odls_mosix_component.mosix_queue_priority =
+        mca_odls_mosix_param_register_uint8("queue_priority", "Priority in the MOSIX queue (0 for not queuing, default).", 0);
+    mca_odls_mosix_component.mosix_migration_lock =
+        mca_odls_mosix_param_register_uint8("migration_lock", "Whether to lock the process in place for migration (If started remotely - stays there).", 0);
+    mca_odls_mosix_component.mosix_unsupported_syscalls =
+        mca_odls_mosix_param_register_uint8("unsupported_syscalls", "How to handle unsupported system calls (0: Exit, 1: Ignore (default), 2: Report).", 1);
+    mca_odls_mosix_component.mosix_job_id =
+        mca_odls_mosix_param_register_int("job_id", "MOSIX Job ID for all the processes launched in this MPI job.", 0);
+    return ORTE_SUCCESS;
+}
+
+int orte_odls_mosix_component_close(void)
+{
+    return ORTE_SUCCESS;
+}
Index: orte/mca/odls/mosix/configure.params
===================================================================
--- orte/mca/odls/mosix/configure.params (revision 0)
+++ orte/mca/odls/mosix/configure.params (revision 0)
@@ -0,0 +1,22 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+PARAM_CONFIG_FILES="Makefile"
Index: orte/mca/odls/mosix/odls_mosix.h
===================================================================
--- orte/mca/odls/mosix/odls_mosix.h (revision 0)
+++ orte/mca/odls/mosix/odls_mosix.h (revision 0)
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file: Declarations shared by the MOSIX ODLS component and module
+ */
+
+#ifndef ORTE_ODLS_MOSIX_H
+#define ORTE_ODLS_MOSIX_H
+
+#include "orte_config.h"
+
+#include "opal/mca/mca.h"
+
+#include "orte/mca/odls/odls.h"
+
+BEGIN_C_DECLS
+
+/*
+ * Module open / close
+ */
+int orte_odls_mosix_component_open(void);
+int orte_odls_mosix_component_close(void);
+int orte_odls_mosix_component_query(mca_base_module_t **module, int *priority);
+int orte_odls_mosix_component_register(void);
+
+/*
+ * ODLS MOSIX component (the fields below cache its MCA parameters)
+ */
+struct orte_odls_mosix_component_t {
+    orte_odls_base_component_2_0_0_t super;
+    char* mosix_remote_host_name;       /* "remote_host_name" parameter, or NULL */
+    char* mosix_remote_host_address;    /* "remote_host_address" parameter, or NULL */
+    uint8_t mosix_grid_class;           /* "grid_class" parameter */
+    uint8_t mosix_queue_priority;       /* "queue_priority" parameter (0: no queuing) */
+    uint8_t mosix_migration_lock;       /* "migration_lock" parameter (non-zero: locked) */
+    uint8_t mosix_unsupported_syscalls; /* "unsupported_syscalls": 0 exit, 1 ignore, 2 report */
+    uint32_t mosix_job_id;              /* "job_id" parameter (0: none) */
+};
+
+typedef struct orte_odls_mosix_component_t orte_odls_mosix_component_t;
+extern orte_odls_base_module_t orte_odls_mosix_module;
+ORTE_MODULE_DECLSPEC extern orte_odls_mosix_component_t mca_odls_mosix_component;
+
+/* dedicated debug output flag */
+ORTE_MODULE_DECLSPEC extern bool orte_odls_mosix_report_bindings;
+
+END_C_DECLS
+
+#endif /* ORTE_ODLS_MOSIX_H */
Index: orte/mca/odls/mosix/odls_mosix_module.c
===================================================================
--- orte/mca/odls/mosix/odls_mosix_module.c (revision 0)
+++ orte/mca/odls/mosix/odls_mosix_module.c (revision 0)
@@ -0,0 +1,1209 @@
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2008 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
+ * Copyright (c) 2008-2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2010      IBM Corporation.  All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ */
+
+#include "orte_config.h"
+#include "orte/constants.h"
+#include "orte/types.h"
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <errno.h>
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+#include <signal.h>
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#ifdef HAVE_NETDB_H
+#include <netdb.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif  /* HAVE_SYS_STAT_H */
+
+#if defined(HAVE_SCHED_YIELD)
+/* Only if we have sched_yield() */
+#ifdef HAVE_SCHED_H
+#include <sched.h>
+#endif
+#else
+/* Only do these if we don't have <sched.h> */
+#ifdef HAVE_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#endif /* HAVE_SCHED_YIELD */
+
+#include "opal/mca/maffinity/base/base.h"
+#include "opal/mca/paffinity/base/base.h"
+#include "opal/mca/hwloc/base/base.h"
+#include "opal/class/opal_pointer_array.h"
+#include "opal/util/opal_environ.h"
+
+#include "orte/util/show_help.h"
+#include "orte/runtime/orte_wait.h"
+#include "orte/runtime/orte_globals.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/ess/ess.h"
+#include "orte/mca/iof/base/iof_base_setup.h"
+#include "orte/mca/plm/plm.h"
+#include "orte/util/name_fns.h"
+
+#include "orte/mca/odls/base/odls_private.h"
+#include "orte/mca/odls/mosix/odls_mosix.h"
+
+/*
+ * MOSIX requires all processes to be launched with "/bin/mosrun -w procname arg1 arg2..."
+ * NOTE(review): the program actually executed is ORTE_ODLS_MOSIX_MOSRUN_LOCATION
+ * ("/bin/mosenv") -- confirm that this is intended.
+ * ORTE_ODLS_MOSIX_ARGV_COUNT: the launcher plus the (at most) seven options.
+ */
+#define ORTE_ODLS_MOSIX_ARGV_COUNT (8)
+#define ORTE_ODLS_MOSIX_MOSRUN_LOCATION "/bin/mosenv"
+
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_MIGRATION_LOCKED "-L"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_MIGRATION_UNLOCKED "-l"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_FAIL_ON_UNSUPPORTED "-u"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_IGNORE_UNSUPPORTED "-e"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_REPORT_UNSUPPORTED "-w"
+
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_JOB_ID "-J%i"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_GRID_CLASS "-G%i"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_QUEUE_PRIORITY "-Q%i"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_REMOTE_HOST_NAME "-r%s"
+#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_REMOTE_HOST_ADDRESS "-%s"
+
+/*
+ * External Interface
+ */
+static int orte_odls_mosix_launch_local_procs(opal_buffer_t *data);
+static int orte_odls_mosix_kill_local_procs(opal_pointer_array_t *procs, bool set_state);
+static int orte_odls_mosix_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
+
+static void set_handler_default(int sig);
+
+orte_odls_base_module_t orte_odls_mosix_module = {
+    orte_odls_base_default_get_add_procs_data,
+    orte_odls_mosix_launch_local_procs,
+    orte_odls_mosix_kill_local_procs,
+    orte_odls_mosix_signal_local_procs,
+    orte_odls_base_default_deliver_message,
+    orte_odls_base_default_require_sync
+};
+
+/* convenience macro for erroring out
+ * NOTE(review): the expansion already ends in ';' (after "while(0)"), so
+ * "MACRO(x);" leaves an empty statement behind, which breaks an un-braced
+ * "if (...) MACRO(x); else ...".  Left unchanged because not every use
+ * site is visible from here. */
+#define ORTE_ODLS_ERROR_OUT(errval)     \
+    do {                                \
+        rc = (errval);                  \
+        write(p[1], &rc, sizeof(int));  \
+        exit(1);                        \
+    } while(0);
+
+/* convenience macro for checking binding requirements
+ * NOTE(review): help-odls-mosix.txt defines no "odls-mosix:binding-not-avail"
+ * topic -- confirm which topic was meant.  The expansion ends in ';' as
+ * well (see ORTE_ODLS_ERROR_OUT). */
+#define ORTE_ODLS_IF_BIND_NOT_REQD(n)                                   \
+    do {                                                                \
+        if (ORTE_BINDING_NOT_REQUIRED(jobdat->policy)) {                \
+            if (orte_report_bindings) {                                 \
+                orte_show_help("help-odls-mosix.txt",                   \
+                               "odls-mosix:binding-not-avail",          \
+                               true, orte_process_info.nodename,        \
+                               (n), context->app);                      \
+            }                                                           \
+            goto LAUNCH_PROCS;                                          \
+        }                                                               \
+    } while(0);
+
+static bool odls_mosix_child_died(pid_t pid, unsigned int timeout, int *exit_status)
+{
+    time_t end;
+    pid_t ret;
+
+    /* Because of rounding in time (which returns whole seconds) we
+     * have to add 1 to our wait number: this means that we wait
+     * somewhere between (target) and (target)+1 seconds.  Otherwise,
+     * the default 1s actually means 'somewhere between 0 and 1s'. */
+    end = time(NULL) + timeout + 1;
+    do {
+        ret = waitpid(pid, exit_status, WNOHANG);
+        if (pid == ret) {
+            OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
+                                 "%s odls:default:WAITPID INDICATES PROC %d IS DEAD",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
+            /* It died -- return success */
+            return true;
+        } else if (0 == ret) {
+            /* with NOHANG specified, if a process has already exited
+             * while waitpid was registered, then waitpid returns 0
+             * as there is no error - this is a race condition problem
+             * that occasionally causes us to incorrectly report a proc
+             * as refusing to die. Unfortunately, errno may not be reset
+             * by waitpid in this case, so we cannot check it.
+             *
+             * (note the previous fix to this, to return 'process dead'
+             * here, fixes the race condition at the cost of reporting
+             * all live processes have immediately died! Better to
+             * occasionally report a dead process as still living -
+             * which will occasionally trip the timeout for cases that
+             * are right on the edge.)
+             */
+
+            /* Do nothing, process still alive */
+        } else if (-1 == ret && ECHILD == errno) {
+            /* The pid no longer exists, so we'll call this "good
+               enough for government work" */
+            OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
+                                 "%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)pid));
+            return true;
+        }
+
+        /* Bogus delay for 1 msec - let's actually give the CPU some time
+         * to quit the other process (sched_yield() -- even if we have it
+         * -- changed behavior in 2.6.3x Linux flavors to be undesirable)
+         * Don't use select on a bogus file descriptor here as it has proven
+         * unreliable and sometimes immediately returns - we really, really
+         * -do- want to wait a bit!
+         */
+        usleep(1000);
+    } while (time(NULL) < end);
+
+    /* The child didn't die, so return false */
+    return false;
+}
+
+/*
+ * Deliver 'signum' to the child (to its whole process group when job
+ * control is forwarded).  Returns 0 on success or when the target no
+ * longer exists, otherwise the errno reported by kill().
+ */
+static int odls_mosix_kill_local(pid_t pid, int signum)
+{
+    int err;
+
+    if (orte_forward_job_control) {
+        pid = -pid;
+    }
+    if (0 != kill(pid, signum)) {
+        /* keep kill()'s errno: the logging below may overwrite it */
+        err = errno;
+        if (ESRCH != err) {
+            OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
+                                 "%s odls:default:SENT KILL %d TO PID %d GOT ERRNO %d",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, err));
+            return err;
+        }
+    }
+    OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
+                         "%s odls:default:SENT KILL %d TO PID %d SUCCESS",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
+    return 0;
+}
+
+static int orte_odls_mosix_kill_local_procs(opal_pointer_array_t *procs, bool set_state)
+{
+    int rc;
+
+    if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, set_state,
+                                    odls_mosix_kill_local, odls_mosix_child_died))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Manipulate the argv array to start this process under MOSIX.
+ *
+ * On return context->app names the MOSIX launcher and context->argv is
+ * a new NULL-terminated vector: the launcher, its options, the original
+ * context->app and then the original arguments that followed argv[0].
+ * Every word added here is heap-allocated.  If memory runs out, the
+ * error is logged and 'context' is left untouched.
+ */
+static void prepend_mosrun_options(orte_app_context_t* context)
+{
+    size_t argc = 0;   /* entries of the given argv, NULL excluded */
+    size_t index = 0;
+    size_t i;
+    char *new_app;
+    char *word;
+    char **new_argv;
+    const char *flag;
+
+    /* Count the given parameters */
+    if (NULL != context->argv) {
+        while (NULL != context->argv[argc]) {
+            ++argc;
+        }
+    }
+
+    /* Room for the launcher and its options (ORTE_ODLS_MOSIX_ARGV_COUNT
+     * words at most), the application, its arguments and the final NULL */
+    new_argv = calloc(ORTE_ODLS_MOSIX_ARGV_COUNT + argc + 2, sizeof(char*));
+    new_app = strdup(ORTE_ODLS_MOSIX_MOSRUN_LOCATION);
+    if (NULL == new_argv || NULL == new_app) {
+        goto out_of_memory;
+    }
+
+    /* Fill in the mosrun parameters according to what was passed to mpirun */
+    if (NULL == (word = strdup(ORTE_ODLS_MOSIX_MOSRUN_LOCATION))) {
+        goto out_of_memory;
+    }
+    new_argv[index++] = word;
+
+    if (0 > asprintf(&word, ORTE_ODLS_MOSIX_MOSRUN_PARAM_GRID_CLASS,
+                     (int) mca_odls_mosix_component.mosix_grid_class)) {
+        goto out_of_memory;
+    }
+    new_argv[index++] = word;
+
+    if (0 != mca_odls_mosix_component.mosix_migration_lock) {
+        flag = ORTE_ODLS_MOSIX_MOSRUN_PARAM_MIGRATION_LOCKED;
+    } else {
+        flag = ORTE_ODLS_MOSIX_MOSRUN_PARAM_MIGRATION_UNLOCKED;
+    }
+    if (NULL == (word = strdup(flag))) {
+        goto out_of_memory;
+    }
+    new_argv[index++] = word;
+
+    switch (mca_odls_mosix_component.mosix_unsupported_syscalls) {
+    case 0:
+        flag = ORTE_ODLS_MOSIX_MOSRUN_PARAM_FAIL_ON_UNSUPPORTED;
+        break;
+    case 2:
+        flag = ORTE_ODLS_MOSIX_MOSRUN_PARAM_REPORT_UNSUPPORTED;
+        break;
+    case 1:
+    default:
+        flag = ORTE_ODLS_MOSIX_MOSRUN_PARAM_IGNORE_UNSUPPORTED;
+        break;
+    }
+    if (NULL == (word = strdup(flag))) {
+        goto out_of_memory;
+    }
+    new_argv[index++] = word;
+
+    if (0 != mca_odls_mosix_component.mosix_job_id) {
+        if (0 > asprintf(&word, ORTE_ODLS_MOSIX_MOSRUN_PARAM_JOB_ID,
+                         (int) mca_odls_mosix_component.mosix_job_id)) {
+            goto out_of_memory;
+        }
+        new_argv[index++] = word;
+    }
+    if (0 != mca_odls_mosix_component.mosix_queue_priority) {
+        if (0 > asprintf(&word, ORTE_ODLS_MOSIX_MOSRUN_PARAM_QUEUE_PRIORITY,
+                         (int) mca_odls_mosix_component.mosix_queue_priority)) {
+            goto out_of_memory;
+        }
+        new_argv[index++] = word;
+    }
+    if (NULL != mca_odls_mosix_component.mosix_remote_host_name) {
+        if (0 > asprintf(&word, ORTE_ODLS_MOSIX_MOSRUN_PARAM_REMOTE_HOST_NAME,
+                         mca_odls_mosix_component.mosix_remote_host_name)) {
+            goto out_of_memory;
+        }
+        new_argv[index++] = word;
+    }
+    if (NULL != mca_odls_mosix_component.mosix_remote_host_address) {
+        if (0 > asprintf(&word, ORTE_ODLS_MOSIX_MOSRUN_PARAM_REMOTE_HOST_ADDRESS,
+                         mca_odls_mosix_component.mosix_remote_host_address)) {
+            goto out_of_memory;
+        }
+        new_argv[index++] = word;
+    }
+
+    /* Nothing below can fail.  Append the original application and the
+     * arguments that followed argv[0]; calloc() already supplied the
+     * terminating NULL */
+    new_argv[index++] = context->app;
+    for (i = 1; i < argc; i++) {
+        new_argv[index++] = context->argv[i];
+    }
+    /* NOTE(review): the previous vector and its argv[0] are not released
+     * here because their ownership is not visible from this function */
+    context->app = new_app;
+    context->argv = new_argv;
+    return;
+
+out_of_memory:
+    /* new_argv[0..index) only holds words built above at this point */
+    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+    if (NULL != new_argv) {
+        for (i = 0; i < index; i++) {
+            free(new_argv[i]);
+        }
+        free(new_argv);
+    }
+    free(new_app);
+}
+
+
+/**
+ *  Fork/exec the specified processes
+ */
+
+static int odls_mosix_fork_local_proc(orte_app_context_t* context,
+                                      orte_odls_child_t *child,
+                                      char **environ_copy,
+                                      orte_odls_job_t *jobdat)
+{
+    orte_iof_base_io_conf_t opts;
+    int rc;
+    sigset_t sigs;
+    int i, p[2];
+    pid_t pid;
+    bool paffinity_enabled = false;
+    opal_paffinity_base_cpu_set_t mask;
+    orte_node_rank_t nrank;
+    int16_t n;
+    orte_local_rank_t lrank;
+    int target_socket, npersocket, logical_skt;
+    int logical_cpu, phys_core, phys_cpu, ncpu;
+    bool bound = false;
+    char *param, *tmp;
+
+    if (NULL != child) {
+        /* should pull this information from MPIRUN instead of going with
+         default */
+        opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
+
+        /* do we want to setup stdin? */
+        if (NULL != child &&
+            (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) {
+            opts.connect_stdin = true;
+        } else {
+            opts.connect_stdin = false;
+        }
+
+        if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) {
+            ORTE_ERROR_LOG(rc);
+            if (NULL != child) {
+                child->state = ORTE_PROC_STATE_FAILED_TO_START;
+                child->exit_code = rc;
+            }
+            return rc;
+        }
+    }
+
+    /* A pipe is used to communicate between the parent and child to
+       indicate whether the exec ultimately succeeded or failed.  The
+       child sets the pipe to be close-on-exec; the child only ever
+       writes anything to the pipe if there is an error (e.g.,
+       executable not found, exec() fails, etc.).  The parent does a
+       blocking read on the pipe; if the pipe closed with no data,
+       then the exec() succeeded.  If the parent reads something from
+       the pipe, then the child was letting us know that it failed.
*/ + if (pipe(p) < 0) { + ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES; + } + return ORTE_ERR_SYS_LIMITS_PIPES; + } + + /* Fork off the child */ + pid = fork(); + if (NULL != child) { + child->pid = pid; + } + + if(pid < 0) { + ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN; + } + return ORTE_ERR_SYS_LIMITS_CHILDREN; + } + + if (pid == 0) { + long fd, fdmax = sysconf(_SC_OPEN_MAX); + + if (orte_forward_job_control) { + /* Set a new process group for this child, so that a + SIGSTOP can be sent to it without being sent to the + orted. */ + setpgid(0, 0); + } + + /* Setup the pipe to be close-on-exec */ + close(p[0]); + fcntl(p[1], F_SETFD, FD_CLOEXEC); + + if (NULL != child) { + /* setup stdout/stderr so that any error messages that we may + print out will get displayed back at orterun. + + NOTE: Definitely do this AFTER we check contexts so that any + error message from those two functions doesn't come out to the + user. IF we didn't do it in this order, THEN a user who gives + us a bad executable name or working directory would get N + error messages, where N=num_procs. This would be very annoying + for large jobs, so instead we set things up so that orterun + always outputs a nice, single message indicating what happened + */ + if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts, &environ_copy))) { + ORTE_ODLS_ERROR_OUT(i); + } + + /* Setup process affinity. First check to see if a slot list was + * specified. If so, use it. If no slot list was specified, + * that's not an error -- just fall through and try the next + * paffinity scheme. 
+ */ + if (NULL != child->slot_list) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork got slot_list %s for child %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + child->slot_list, ORTE_NAME_PRINT(child->name))); + if (opal_paffinity_alone) { + /* It's an error if multiple paffinity schemes were specified */ + orte_show_help("help-odls-mosix.txt", + "odls-mosix:multiple-paffinity-schemes", true, child->slot_list); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + if (orte_report_bindings) { + opal_output(0, "%s odls:default:fork binding child %s to slot_list %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), child->slot_list); + } + if (ORTE_SUCCESS != (rc = opal_paffinity_base_slot_list_set((long)child->name->vpid, child->slot_list))) { + if (ORTE_ERR_NOT_SUPPORTED == rc) { + /* OS doesn't support providing topology information */ + orte_show_help("help-odls-mosix.txt", + "odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "rankfile containing a slot_list of ", + child->slot_list, context->app); + ORTE_ODLS_ERROR_OUT(rc); + } + + orte_show_help("help-odls-mosix.txt", + "odls-mosix:slot-list-failed", true, child->slot_list, ORTE_ERROR_NAME(rc)); + ORTE_ODLS_ERROR_OUT(rc); + } + } else if (ORTE_BIND_TO_CORE & jobdat->policy) { + /* we want to bind this proc to a specific core, or multiple cores + * if the cpus_per_rank is > 0 + */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:default:fork binding child %s to core(s) cpus/rank %d stride %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), + (int)jobdat->cpus_per_rank, (int)jobdat->stride)); + /* get the node rank */ + if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(child->name))) { + orte_show_help("help-odls-mosix.txt", + "odls-mosix:invalid-node-rank", true); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + /* get the local rank */ + if (ORTE_LOCAL_RANK_INVALID == (lrank = 
orte_ess.get_local_rank(child->name))) { + orte_show_help("help-odls-mosix.txt", + "odls-mosix:invalid-local-rank", true); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + /* init the mask */ + OPAL_PAFFINITY_CPU_ZERO(mask); + if (ORTE_MAPPING_NPERXXX & jobdat->policy) { + /* we need to balance the children from this job across the available sockets */ + npersocket = jobdat->npersocket; + /* determine the socket to use based on those available */ + if (npersocket < 2) { + /* if we only have 1/sock, or we have less procs than sockets, + * then just put it on the lrank socket + */ + logical_skt = lrank; + } else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) { + logical_skt = lrank % npersocket; + } else { + logical_skt = lrank / npersocket; + } + if (orte_odls_globals.bound) { + /* if we are bound, use this as an index into our available sockets */ + for (n=target_socket=0; target_socket < opal_bitmap_size(&orte_odls_globals.sockets) && n < logical_skt; target_socket++) { + if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets, target_socket)) { + n++; + } + } + /* if we don't have enough sockets, that is an error */ + if (n < logical_skt) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:not-enough-resources", true, + "sockets", orte_process_info.nodename, + "bind-to-core", context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } else { + target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt); + if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + /* OS doesn't support providing topology information */ + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "bind-to-core", "", + context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork child %s local rank %d npersocket %d logical socket %d target socket %d", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), lrank, + npersocket, logical_skt, target_socket)); + /* set the starting point */ + logical_cpu = (lrank % npersocket) * jobdat->cpus_per_rank; + /* bind to this socket */ + goto bind_socket; + } else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) { + /* this corresponds to a mapping policy where + * local rank 0 goes on socket 0, and local + * rank 1 goes on socket 1, etc. - round robin + * until all ranks are mapped + * + * NOTE: we already know our number of sockets + * from when we initialized + */ + target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets); + if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + /* OS does not support providing topology information */ + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "bind-to-core", "", + context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank, + (int)orte_odls_globals.num_sockets, + (int)(lrank % orte_odls_globals.num_sockets), + target_socket)); + /* my starting core within this socket has to be offset by cpus_per_rank */ + logical_cpu = (lrank / orte_odls_globals.num_sockets) * jobdat->cpus_per_rank; + + bind_socket: + /* cycle across the cpus_per_rank */ + for (n=0; n < jobdat->cpus_per_rank; n++) { + /* get the physical core within this target socket */ + phys_core = opal_paffinity_base_get_physical_core_id(target_socket, logical_cpu); + if (0 > phys_core) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:invalid-phys-cpu", true); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + /* map this to a physical cpu on this node */ + if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, 
&phys_cpu)) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:not-enough-resources", true, + "processors", orte_process_info.nodename, + "bind-to-core", context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + /* are we bound? */ + if (orte_odls_globals.bound) { + /* see if this physical cpu is available to us */ + if (!OPAL_PAFFINITY_CPU_ISSET(phys_cpu, orte_odls_globals.my_cores)) { + /* no it isn't - skip it */ + continue; + } + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + target_socket, phys_core, phys_cpu)); + OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); + /* increment logical cpu */ + logical_cpu += jobdat->stride; + } + if (orte_report_bindings) { + opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %04lx", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), target_socket, mask.bitmask[0]); + } + } else { + /* my starting core has to be offset by cpus_per_rank */ + logical_cpu = nrank * jobdat->cpus_per_rank; + for (n=0; n < jobdat->cpus_per_rank; n++) { + /* are we bound? 
*/ + if (orte_odls_globals.bound) { + /* if we are bound, then use the logical_cpu as an index + * against our available cores + */ + ncpu = 0; + for (i=0; i < OPAL_PAFFINITY_BITMASK_CPU_MAX && ncpu <= logical_cpu; i++) { + if (OPAL_PAFFINITY_CPU_ISSET(i, orte_odls_globals.my_cores)) { + ncpu++; + phys_cpu = i; + } + } + /* if we don't have enough processors, that is an error */ + if (ncpu <= logical_cpu) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:not-enough-resources", true, + "processors", orte_process_info.nodename, + "bind-to-core", context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } else { + /* if we are not bound, then all processors are available + * to us, so index into the node's array to get the + * physical cpu + */ + phys_cpu = opal_paffinity_base_get_physical_processor_id(logical_cpu); + if (OPAL_ERROR == phys_cpu){ + /* No processor to bind to so error out */ + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:not-enough-resources", true, + "processors", orte_process_info.nodename, + "bind-to-core", context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } else if (0 > phys_cpu) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:invalid-phys-cpu", true); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } + OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); + /* increment logical cpu */ + logical_cpu += jobdat->stride; + } + if (orte_report_bindings) { + opal_output(0, "%s odls:default:fork binding child %s to cpus %04lx", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), mask.bitmask[0]); + } + } + if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-core"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:failed-set-paff", true); + ORTE_ODLS_ERROR_OUT(rc); + } + paffinity_enabled = true; + } else if (ORTE_BIND_TO_SOCKET & 
jobdat->policy) { + /* bind this proc to a socket */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:default:fork binding child %s to socket", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name))); + /* layout this process across the sockets based on + * the provided mapping policy + */ + if (ORTE_LOCAL_RANK_INVALID == (lrank = orte_ess.get_local_rank(child->name))) { + orte_show_help("help-odls-mosix.txt", + "odls-mosix:invalid-local-rank", true); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + if (ORTE_MAPPING_NPERXXX & jobdat->policy) { + /* we need to balance the children from this job across the available sockets */ + npersocket = jobdat->npersocket; + /* determine the socket to use based on those available */ + if (npersocket < 2) { + /* if we only have 1/sock, or we have less procs than sockets, + * then just put it on the lrank socket + */ + logical_skt = lrank; + } else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) { + logical_skt = lrank % npersocket; + } else { + logical_skt = lrank / npersocket; + } + if (orte_odls_globals.bound) { + /* if we are bound, use this as an index into our available sockets */ + for (target_socket=0, n = 0; target_socket < opal_bitmap_size(&orte_odls_globals.sockets) && n < logical_skt; target_socket++) { + if (opal_bitmap_is_set_bit(&orte_odls_globals.sockets, target_socket)) { + n++; + } + } + /* if we don't have enough sockets, that is an error */ + if (n < logical_skt) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:not-enough-resources", true, + "sockets", orte_process_info.nodename, + "bind-to-socket", context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } else { + target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt); + if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + /* OS doesn't support providing topology information */ + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + 
"odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "bind-to-socket", "", + context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork child %s local rank %d npersocket %d logical socket %d target socket %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), lrank, + npersocket, logical_skt, target_socket)); + } else if (ORTE_MAPPING_BYSOCKET & jobdat->policy) { + /* this corresponds to a mapping policy where + * local rank 0 goes on socket 0, and local + * rank 1 goes on socket 1, etc. - round robin + * until all ranks are mapped + * + * NOTE: we already know our number of sockets + * from when we initialized + */ + target_socket = opal_paffinity_base_get_physical_socket_id(lrank % orte_odls_globals.num_sockets); + if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + /* OS does not support providing topology information */ + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "bind-to-socket", "", + context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "bysocket lrank %d numsocks %d logical socket %d target socket %d", (int)lrank, + (int)orte_odls_globals.num_sockets, + (int)(lrank % orte_odls_globals.num_sockets), + target_socket)); + } else { + /* use a byslot-like policy where local rank 0 goes on + * socket 0, and local rank 1 goes on socket 0, etc. 
+ * following round-robin until all ranks mapped + */ + if (orte_odls_globals.bound) { + /* if we are bound, then we compute the logical socket id + * based on the number of available cores in each socket so + * that each rank gets its own core, adjusting for the cpus_per_task + */ + /* Find the lrank available core, accounting for cpus_per_task */ + logical_cpu = lrank * jobdat->cpus_per_rank; + /* use the logical_cpu as an index against our available cores */ + ncpu = 0; + for (i=0; i < orte_odls_globals.num_processors && ncpu <= logical_cpu; i++) { + if (OPAL_PAFFINITY_CPU_ISSET(i, orte_odls_globals.my_cores)) { + ncpu++; + phys_cpu = i; + } + } + /* if we don't have enough processors, that is an error */ + if (ncpu < logical_cpu) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:not-enough-resources", true, + "processors", orte_process_info.nodename, + "bind-to-socket", context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + /* get the physical socket of that cpu */ + if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_socket_core(phys_cpu, &target_socket, &phys_core)) { + if (ORTE_BINDING_NOT_REQUIRED(jobdat->policy)) { + goto LAUNCH_PROCS; + } + orte_show_help("help-odls-mosix.txt", + "odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "bind-to-socket", "", + context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } else { + /* if we are not bound, then just use all sockets */ + if (1 == orte_odls_globals.num_sockets) { + /* if we only have one socket, then just put it there */ + target_socket = opal_paffinity_base_get_physical_socket_id(0); + if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + /* OS doesn't support providing topology information */ + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "bind-to-socket", "", + context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } 
+ } else { + /* compute the logical socket, compensating for the number of cpus_per_rank */ + logical_skt = lrank / (orte_odls_globals.num_cores_per_socket / jobdat->cpus_per_rank); + /* wrap that around the number of sockets so we round-robin */ + logical_skt = logical_skt % orte_odls_globals.num_sockets; + /* now get the target physical socket */ + target_socket = opal_paffinity_base_get_physical_socket_id(logical_skt); + if (ORTE_ERR_NOT_SUPPORTED == target_socket) { + /* OS doesn't support providing topology information */ + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:topo-not-supported", + true, orte_process_info.nodename, "bind-to-socket", "", + context->app); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "byslot lrank %d socket %d", (int)lrank, target_socket)); + } + } + + OPAL_PAFFINITY_CPU_ZERO(mask); + + for (n=0; n < orte_odls_globals.num_cores_per_socket; n++) { + /* get the physical core within this target socket */ + phys_core = opal_paffinity_base_get_physical_core_id(target_socket, n); + if (0 > phys_core) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:invalid-phys-cpu", true); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + /* map this to a physical cpu on this node */ + if (ORTE_SUCCESS != opal_paffinity_base_get_map_to_processor_id(target_socket, phys_core, &phys_cpu)) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:invalid-phys-cpu", true); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + /* are we bound? 
*/ + if (orte_odls_globals.bound) { + /* see if this physical cpu is available to us */ + if (!OPAL_PAFFINITY_CPU_ISSET(phys_cpu, orte_odls_globals.my_cores)) { + /* no it isn't - skip it */ + continue; + } + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork mapping phys socket %d core %d to phys_cpu %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + target_socket, phys_core, phys_cpu)); + OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); + } + /* if we did not bind it anywhere, then that is an error */ + OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &bound); + if (!bound) { + orte_show_help("help-odls-mosix.txt", + "odls-mosix:could-not-bind-to-socket", true, + target_socket, orte_process_info.nodename); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } + if (orte_report_bindings) { + opal_output(0, "%s odls:default:fork binding child %s to socket %d cpus %04lx", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), target_socket, mask.bitmask[0]); + } + if (ORTE_SUCCESS != (rc = opal_paffinity_base_set(mask))) { + ORTE_ODLS_IF_BIND_NOT_REQD("bind-to-socket"); + orte_show_help("help-odls-mosix.txt", + "odls-mosix:failed-set-paff", true); + ORTE_ODLS_ERROR_OUT(rc); + } + paffinity_enabled = true; + } + + /* If we were able to set processor affinity, try setting up + * memory affinity + */ + if (paffinity_enabled) { + if (OPAL_SUCCESS == opal_maffinity_base_open() && + OPAL_SUCCESS == opal_maffinity_base_select()) { + opal_maffinity_setup = true; + } + } + + } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { + /* tie stdin/out/err/internal to /dev/null */ + int fdnull; + for (i=0; i < 3; i++) { + fdnull = open("/dev/null", O_RDONLY, 0); + if(fdnull > i) { + dup2(fdnull, i); + } + close(fdnull); + } + fdnull = open("/dev/null", O_RDONLY, 0); + if(fdnull > opts.p_internal[1]) { + dup2(fdnull, opts.p_internal[1]); + } + close(fdnull); + } + +LAUNCH_PROCS: + /* if we are bound, report it */ + if (opal_paffinity_base_bound) { + param = 
mca_base_param_environ_variable("paffinity","base","bound"); + opal_setenv(param, "1", true, &environ_copy); + free (param); + /* and provide a char representation of what we did */ + tmp = opal_paffinity_base_print_binding(mask); + if (NULL != tmp) { + param = mca_base_param_environ_variable("paffinity","base","applied_binding"); + opal_setenv(param, tmp, true, &environ_copy); + free(tmp); + } + + /* Also set the memory affininty policy */ + opal_hwloc_base_set_process_membind_policy(); + } + + /* close all file descriptors w/ exception of + * stdin/stdout/stderr and the pipe used for the IOF INTERNAL + * messages + */ + for(fd=3; fdapp, context->argv, environ_copy); + ORTE_ODLS_ERROR_OUT(ORTE_ERR_FATAL); + } else { + + if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { + /* connect endpoints IOF */ + rc = orte_iof_base_setup_parent(child->name, &opts); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + /* Wait to read something from the pipe or close */ + close(p[1]); + while (1) { + rc = read(p[0], &i, sizeof(int)); + if (rc < 0) { + /* Signal interrupts are ok */ + if (errno == EINTR) { + continue; + } + + /* Other errno's are bad */ + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = ORTE_ERR_PIPE_READ_FAILURE; + } + + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork got code %d back from child", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i)); + close(p[0]); + return ORTE_ERR_PIPE_READ_FAILURE; + } else if (0 == rc) { + /* Child was successful in exec'ing! */ + break; + } else { + /* Doh -- child failed. + Let the calling function + know about the failure. The actual exit status of child proc + cannot be found here - all we can do is report the ORTE error + code that was reported back to us. The calling func needs to report the + failure to launch this process through the SMR or else + everyone else will hang. 
+ */ + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = i; + } + + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:fork got code %d back from child", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i)); + close(p[0]); + return ORTE_ERR_FAILED_TO_START; + } + } + + if (NULL != child) { + /* set the proc state to LAUNCHED */ + child->state = ORTE_PROC_STATE_LAUNCHED; + child->alive = true; + } + close(p[0]); + } + + return ORTE_SUCCESS; +} + + +/** + * Launch all processes allocated to the current node: build the child list from data, hand odls_mosix_fork_local_proc to orte_odls_base_default_launch_local, then forward SIGTSTP if the job is already flagged suspended. Returns ORTE_SUCCESS, or the error code of whichever of the two base calls failed; the SIGTSTP step does not change the return value. + */ + +int orte_odls_mosix_launch_local_procs(opal_buffer_t *data) +{ + int rc; + orte_jobid_t job; + orte_job_t *jdata; + + /* construct the list of children we are to launch; NOTE(review): the verbose messages in this function still carry the odls:default prefix */ + if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:launch:local failed to construct child list on error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); + goto CLEANUP; + } + + /* launch the local procs */ + if (ORTE_SUCCESS != (rc = orte_odls_base_default_launch_local(job, odls_mosix_fork_local_proc))) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:launch:local failed to launch on error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); + goto CLEANUP; + } + + /* look up job data object; a job whose state has ORTE_JOB_STATE_SUSPENDED set gets SIGTSTP sent below (a missing job object is silently skipped) */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + if (jdata->state & ORTE_JOB_STATE_SUSPENDED) { + if (ORTE_PROC_IS_HNP) { + /* Have the plm send the signal to all the nodes. + If the signal arrived before the orteds started, + then they won't know to suspend their procs. + The plm also arranges for any local procs to + be signaled.
+ */ + orte_plm.signal_job(jdata->jobid, SIGTSTP); + } else { + orte_odls_mosix_signal_local_procs(NULL, SIGTSTP); + } + } + } + +CLEANUP: + + return rc; +} + +/* Reset the disposition of sig to SIG_DFL, with no flags and an empty sa_mask; the result of sigaction() is not checked. */ +static void set_handler_default(int sig) +{ + struct sigaction act; + + act.sa_handler = SIG_DFL; + act.sa_flags = 0; + sigemptyset(&act.sa_mask); + + sigaction(sig, &act, (struct sigaction *)0); +} + +/** + * Send a signal to a pid (to -pid, i.e. the child's process group, when orte_forward_job_control is set). Note that if we get an error, we set the + * return value and let the upper layer print out the message: EINVAL gives ORTE_ERR_BAD_PARAM, EPERM gives ORTE_ERR_PERM, any other errno gives ORTE_ERROR, except ESRCH which is ignored and reported as ORTE_SUCCESS. + */ +static int send_signal(pid_t pid, int signal) +{ + int rc = ORTE_SUCCESS; + + OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output, + "%s sending signal %d to pid %ld", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + signal, (long)pid)); + + if (orte_forward_job_control) { + /* Send the signal to the process group rather than the + process. The child is the leader of its process group. */ + pid = -pid; + } + if (kill(pid, signal) != 0) { + switch(errno) { + case EINVAL: + rc = ORTE_ERR_BAD_PARAM; + break; + case ESRCH: + /* This case can occur when we deliver a signal to a + process that is no longer there. This can happen if + we deliver a signal while the job is shutting down. + This does not indicate a real problem, so just + ignore the error. */ + break; + case EPERM: + rc = ORTE_ERR_PERM; + break; + default: + rc = ORTE_ERROR; + } + } + + return rc; +} +/* Signal local procs by delegating to orte_odls_base_default_signal_local_procs with send_signal as its function argument; a failure is logged with ORTE_ERROR_LOG and returned, otherwise ORTE_SUCCESS. NOTE(review): callers in this file pass proc == NULL, presumably meaning every local proc - confirm against the base implementation. */ +static int orte_odls_mosix_signal_local_procs(const orte_process_name_t *proc, int32_t signal) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_odls_base_default_signal_local_procs(proc, signal, send_signal))) { + ORTE_ERROR_LOG(rc); + return rc; + } + return ORTE_SUCCESS; +} Index: orte/mca/odls/mosix/Makefile.am =================================================================== --- orte/mca/odls/mosix/Makefile.am (revision 0) +++ orte/mca/odls/mosix/Makefile.am (revision 0) @@ -0,0 +1,46 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation.
All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-odls-mosix.txt + +sources = \ + odls_mosix.h \ + odls_mosix_component.c \ + odls_mosix_module.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds); here <type> is odls and <name> is mosix. + +if OMPI_BUILD_odls_mosix_DSO +component_noinst = +component_install = mca_odls_mosix.la +else +component_noinst = libmca_odls_mosix.la +component_install = +endif +# DSO case: $(component_install) is non-empty and is installed into $(pkglibdir). +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_odls_mosix_la_SOURCES = $(sources) +mca_odls_mosix_la_LDFLAGS = -module -avoid-version +# Static case: $(component_noinst) is non-empty; it is built from the same sources but not installed. +noinst_LTLIBRARIES = $(component_noinst) +libmca_odls_mosix_la_SOURCES =$(sources) +libmca_odls_mosix_la_LDFLAGS = -module -avoid-version