/**
 * @file  odvba.cxx
 * @brief Command line tool which performs ODVBA on a set of brain images.
 *
 * Copyright (c) 2010-2012 University of Pennsylvania. All rights reserved.
 * See http://www.rad.upenn.edu/sbia/software/license.html or COPYING file.
 *
 * Contact: SBIA Group <sbia-software at uphs.upenn.edu>
 */

#include <string>
#include <iostream>
#include <time.h>
#include <nifti1_io.h> // nifti_validfilename()

#include <odvba/basis.h>
#include <odvba/odvba.h>
#include <odvba/utilities.h>

#if !defined(NDEBUG) && UNIX
#  include <unistd.h> // getpid
#endif


/**
 * @def   ODVBA_USE_MPI
 * @brief Whether parallelized MPI implementation is used.
 */
#if !defined (ODVBA_USE_MPI)
#  define ODVBA_USE_MPI 0
#endif

#if ODVBA_USE_MPI
#  include <mpi.h>
#endif

/**
 * @def   ODVBA_SYNC_STARTUP
 * @brief Whether to sync the startup, i.e., all processes wait for all
 *        other processes to be initialized before they get their hands dirty.
 */
#if !defined (ODVBA_SYNC_STARTUP)
#  define ODVBA_SYNC_STARTUP 1
#endif


// file-scope using directives are acceptable in a .cxx file
using namespace std;
using namespace basis;
using namespace odvba;


// ===========================================================================
// command-line argument constraints
// ===========================================================================

/**
 * @brief Command-line argument constraint for NIfTI-1 file names.
 *
 * A value satisfies this constraint if nifti_validfilename() accepts it.
 */
class NiftiFileNameConstraint : public TCLAP::Constraint<std::string>
{
public:
    /// Construct the (stateless) constraint.
    NiftiFileNameConstraint() {}

    /// Destroy the constraint; nothing needs to be released.
    virtual ~NiftiFileNameConstraint() {}

    /**
     * @brief Check a single argument value against the constraint.
     *
     * @param [in] value Argument value to check.
     *
     * @returns Whether nifti_validfilename() accepted @p value.
     */
    virtual bool check(const std::string& value) const
    {
        const char* const file_name = value.c_str();
        return nifti_validfilename(file_name) != 0;
    }

    /// Short identifier of the constraint.
    virtual std::string shortID() const
    {
        return std::string("<nifti>");
    }

    /// Human-readable description of what this constraint requires.
    virtual std::string description() const
    {
        return std::string("Value must be a valid NIfTI-1 file name.");
    }
};

// ===========================================================================
// main
// ===========================================================================

// ---------------------------------------------------------------------------
int main(int argc, char* argv[])
{
    bool ok = true;

    // -----------------------------------------------------------------------
    // initialize process
    int nProc = 1;
    int rank  = 0;

    #if ODVBA_USE_MPI
        char procName[MPI_MAX_PROCESSOR_NAME];
        int  procNameLen = 0;

        MPI_Init(&argc, &argv);
        MPI_Comm_size(MPI_COMM_WORLD, &nProc);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Get_processor_name(procName, &procNameLen);
    #endif // ODVBA_USE_MPI

    // process ID used in messages
    char procId[20];

    if (nProc > 1) sprintf(procId, "(Process %d) ", rank + 1);
    else           procId[0] = '\0';

    // -----------------------------------------------------------------------
    // command-line argument constraints
    PositiveValueConstraint<double>    positive_double_constraint("<double>");
    PositiveValueConstraint<int>       positive_int_constraint("<int>");
    ZeroOrPositiveValueConstraint<int> zero_or_positive_int_constraint("<int>");
    ExistingFileConstraint             subjects_list_constraint("<subjects_list>");
    ExistingFileConstraint             existing_file_constraint;
    NiftiFileNameConstraint            nifti_file_name_constraint;

    // -----------------------------------------------------------------------
    // command-line arguments
    Options dflt; // default options

    PositionalArg sublist_file("subjects_list",
            "Text file which lists the NIfTI-1 image data files of the input subjects.",
            true, "", &subjects_list_constraint);

    StringArg index_file("", "index",
            "Name of the pre-computed index file.",
            false, "", &existing_file_constraint);

    StringArg index_out_file("", "index-out",
            "Name of the file to write the computed index information to.",
            false, "", "<file>");

    StringArg ni_file("", "ni",
            "Name of the neighborhood indices file. If none of the other neighborhood"
            " related options (--ni-*) are given, the generated neighborhood indices"
            " are written to the specified file. Otherwise, these are read from the"
            " given file which must have been generated before using odvba-ni.",
            false, "", "<file>");

    DoubleArg ni_size("s", "ni-size",
            "Size of neighborhood in mm. Required and used only if neighborhood"
            " indices file not provided.",
            false, 0, &positive_double_constraint);

    IntArg ni_num("n", "ni-num",
            "Number of neighborhoods. At most, the neighborhood around each"
            " non-zero voxel can be considered, i.e., n <= m, where m is the"
            " number of non-zero voxels.",
            false, 0, &positive_int_constraint);

    IntArg vox_num("e", "ni-voxels",
            "Number of voxel in each neighborhood.",
            false, 0, &positive_int_constraint);

    IntArg perms_num("p", "perms-num",
        "The number of permutations to test.",
        false, dflt.nPerm, &zero_or_positive_int_constraint);

    DoubleArg phi("", "phi",
        "Exponent phi of discrimination degree (cf. Eq.(11) of MICCAI paper).",
        false, dflt.phi, &positive_double_constraint);

    StringArg maps_file("", "maps",
        "Name of file to which the group analysis results are written."
        " If this option is not specified, only the final image of"
        " p-values is output.",
        false, "", "<file>");

    StringArg perms_file("", "perms",
        "If the number of permutation tests specified by the option --perms-num"
        " is zero, the pre-computed permutations stored in the given"
        " file are used. Otherwise, the generated random permutations used for"
        " the permutation tests are written to the specified file."
        " If this option is present but not --perms-num, --perms-num is assumed to"
        " be zero, hence, the permutations will be read from the specified"
        " file. This option is used for regression testing, in particular.",
        false, "", "<file>");

    StringArg p_image_file("o", "p-image",
            "Filename prefix of the output volume of p-values which"
            " is output as NIfTI-1 image with voxel type DT_FLOAT.",
            false, "p.nii.gz", &nifti_file_name_constraint);

    SwitchArg srand_time("", "srand-time",
            "Use the execution time to initialize the pseudo-random number"
            " generator. If this option is not given, a fixed seed is used to"
            " initialize it which ensures identical results whenever this"
            " program is run on the same machine with the same input images"
            " and parameter settings. Otherwise, the results may differ between"
            " executions.",
            false);

    #ifndef NDEBUG
        SwitchArg debug("", "debug",
                "Whether to wait in an endless loop until a debugger is attached and"
                " the loop variable is incremented manually.",
                false);
    #endif

    MultiSwitchArg verbose("v", "verbose",
        "Increase verbosity of output messages.");

    // -----------------------------------------------------------------------
    // parse command-line
    try {
        vector<string> examples;

        examples.push_back("EXENAME --perms-num 100 --phi 1 --ni-num 10000"
                " --ni-voxels 400 --ni-size 15 subjects.txt --maps p-values.hdr"
                "\n"
                "Runs 100 permutation tests after generating 10,000 random neighborhoods with"
                " 400 randomly selected voxels within windows of size 15mm. The resulting p-values"
                " are written to the image file p-values.hdr in the current working directory.");

        examples.push_back("EXENAME --perms-num 100 --ni-num 10000 --ni-voxels 400 --ni-size 15 --ni ni.txt subjects.txt"
                "\n"
                "Generates 10,000 random neighborhoods with 400 randomly selected voxels in windows of size 15mm."
                " Then 100 permutation tests are performed using these neighborhoods. The used neighborhoods are"
                " saved in the file ni.txt together with the p-value map p.nii.gz in the current working directory.");

        examples.push_back("EXENAME --perms-num 100 --index index.txt --ni ni.txt subjects.txt"
                "\n"
                "Runs 100 permutation tests using the pre-computed index saved in the file index.txt"
                " and neighborhoods which are read from the file ni.txt. These files have to be generated"
                " using the programs odvba-index and odvba-ni, respectively.");

        CmdLine cmd(// program identification
                    "odvba", PROJECT,
                    // program description
                    "This program implements a group analysis method named"
                    " Optimally-Discriminative Voxel-Based Analysis (ODVBA)."
                    " For more information on this method, please refer to the"
                    " corresponding publication:"
                    " T. Zhang and C. Davatzikos, ODVBA: Optimally-Discriminative"
                    " Voxel-Based Analysis",
                    // example usage
                    examples,
                    // version and copyright information
                    RELEASE, COPYRIGHT,
                    // license information
                    "See http://www.rad.upenn.edu/sbia/software/license.html or COPYING file.",
                    // contact
                    "SBIA Group <sbia-software at uphs.upenn.edu>",
                    // standard arguments only handled by main process
                    rank == 0);

        // The constructor of the CmdLine class has already added the standard
        // arguments --help, --helpshort, --helpxml, and --version.

        cmd.add(index_file);
        cmd.add(index_out_file);
        cmd.add(ni_file);
        cmd.add(ni_size);
        cmd.add(ni_num);
        cmd.add(vox_num);
        cmd.add(perms_num);
        cmd.add(phi);
        cmd.add(maps_file);
        cmd.add(perms_file);
        cmd.add(p_image_file);
        cmd.add(srand_time);
        #ifndef NDEBUG
            cmd.add(debug);
        #endif
        cmd.add(verbose);
        cmd.add(sublist_file);

        cmd.setExceptionHandling(false);

        cmd.parse(argc, argv);
    } catch (CmdLineException& e) {
        // invalid command-line specification
        if (rank == 0) cerr << e.error() << endl;
        #if ODVBA_USE_MPI
            MPI_Finalize();
        #endif
        exit(EXIT_FAILURE);
    } catch (ArgException& e) {
        if (rank == 0) {
            if (!e.argId().empty() && e.argId() != " ") cerr << e.argId() << ", ";
            cerr << e.error() << endl;
            cerr << "See --help for a list of available and required arguments." << endl;
        }
        #if ODVBA_USE_MPI
            MPI_Finalize();
        #endif
        exit(EXIT_FAILURE);
	} catch (ExitException& e) {
        #if ODVBA_USE_MPI
            MPI_Finalize();
        #endif
        exit(e.getExitStatus());
	}

    // options of ODVBA algorithm
    Options opt;
    opt.nPerm = perms_num.getValue();
    opt.phi   = phi.getValue();

    if (srand_time.getValue()) {
        opt.seed = static_cast<unsigned int>(time(NULL));
    }

    // read permutations from file if number of permutations was not
    // specified but a permutations file
    if (!perms_num.isSet() && perms_file.isSet()) opt.nPerm = 0;

    // -----------------------------------------------------------------------
    // check command-line arguments
    bool generate_ni = (ni_num.getValue() > 0 && vox_num.getValue() > 0 && ni_size.getValue() > 0);
    if (generate_ni && (ni_num.getValue() == 0 || vox_num.getValue() == 0 || ni_size.getValue() <= 0)) {
        if (rank == 0) {
            cerr << "Not all arguments required for the generation of the neighborhoods specified!" << endl;
            cerr << "Please provide all of the following options: --ni-num, --ni-voxels, --ni-size." << endl;
            cerr << "See the --help for a description of these arguments." << endl;
        }
        #if ODVBA_USE_MPI
            MPI_Finalize ();
        #endif
        exit(EXIT_FAILURE);
    } else if (!generate_ni && !ni_file.isSet()) {
        if (rank == 0) {
            cerr << "Missing neighborhood index option(s)! Either specify an input file with the pre-computed" << endl;
            cerr << "neighborhood indices generated using odvba-ni or the arguments required to generate such" << endl;
            cerr << "indices before the permutation tests using the options: --ni-num, --ni-voxels, --ni-size." << endl;
            cerr << "See the --help for a description of these arguments." << endl;
        }
    }

    if (opt.nPerm < 0 && !perms_file.isSet()) {
        if (rank == 0) {
            cerr << "Invalid number of permutations or no file with pre-computed permutations specified!" << endl;
            cerr << "See --help for a list of required and available arguments." << endl;
        }
        #if ODVBA_USE_MPI
            MPI_Finalize ();
        #endif
        exit (EXIT_FAILURE);
    }

    // -----------------------------------------------------------------------
    // print process information / synchronize processes
    #if ODVBA_USE_MPI
    #  if ODVBA_SYNC_STARTUP
        MPI_Status status;
        int        token = 0;

        time_t startTimeSync = clock(); // time when synchronization was initiated

        if (rank == 0) {
            // print information about process
            cout << procId << "Running on " << procName << endl;
            cout.flush();

            for (int i = 1; i < nProc; i++) {
                // send off slave
                MPI_Send(&token, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
                // wait until slave process print process information
                MPI_Recv(&token, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status);
            }
        } else {
            // wait for master process to send go signal
            MPI_Recv(&token, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
            // print information about process
            cout << procId << "Running on " << procName << endl;
            cout.flush();
            // let master send off the next slave
            MPI_Send(&token, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        }

        MPI_Barrier(MPI_COMM_WORLD);

        if (rank == 0 && nProc > 1) {
            double duration = static_cast<double>(clock() - startTimeSync) / CLOCKS_PER_SEC;
            cout << procId << "Processes synchronized in "
                    << fixed << setprecision(2) << duration << " sec" << endl;
            cout.flush();
        }
    #  else // ODVBA_SYNC_STARTUP
        cout << procId << "Running on " << procName << endl;
        cout.flush();
    #  endif // ODVBA_SYNC_STARTUP
    #endif // ODVBA_USE_MPI

    // -----------------------------------------------------------------------
    // run...
    #ifndef NDEBUG
    if (debug.getValue()) {
        cout << procId << "Process";
        #if UNIX
        cout << " PID " << getpid();
        #endif
        #if ODVBA_USE_MPI
            cout << " on " << procName;
        #endif
        cout << " ready for attach of debugger" << endl;
        int i = 0;
        while (0 == i) {
            sleep(5);
        }
    }
    #endif

    // start time
    time_t startTime = clock();

    // permutation test mode (regression testing)
    const int PERMMODE_NONE  = 0; // do not read/write permutations from/to file
    const int PERMMODE_WRITE = 1; // generate random permutations and write them to file
    const int PERMMODE_READ  = 2; // read permutations from file

    const int permMode = perms_file.isSet()
                         ? ((opt.nPerm > 0) ? PERMMODE_WRITE : PERMMODE_READ)
                         : PERMMODE_NONE;

    // -----------------------------------------------------------------------
    // input
    Database       db;  // all data used by ODVBA, read by master from disk
                        // and send to slave processes
    nifti_1_header hdr; // image header of first input image, only valid for master

    // only master reads in the data and prepares it before sending it to the slaves
    if (rank == 0) {
        CvMat* data = NULL;
        // read image data
        if (ok) {
            cout << procId << "Parsing subject list and reading image data" << endl;
            cout.flush();
            ok = ((data = read_data(sublist_file.getValue().c_str(), &db.n1, &hdr)) != NULL);
            if (!ok) cerr << procId << "Failed to read image data" << endl;
        }
        // read/create index
        if (ok) {
            if (index_file.isSet()) {
                cout << procId << "Reading index" << endl;
                cout.flush();
                ok = ((db.index = read_matrix(index_file.getValue().c_str())) != NULL);
                if (!ok) cerr << procId << "Failed to read index" << endl;
            } else if (rank == 0) {
                cout << procId << "Generating index" << endl;
                cout.flush();
                ok = ((db.index = create_index(data)) != NULL);
                if (!ok) cerr << procId << "Failed to generate index" << endl;
            }
        }
        // read/create neighborhood
        if (ok) {
            if (!generate_ni) {
                cout << procId << "Reading neighborhoods" << endl;
                cout.flush();
                ok = ((db.NI = read_matrix(ni_file.getValue().c_str())) != NULL);
                if (!ok) cerr << procId << "Failed to read neighborhoods" << endl;
            } else {
                cout << procId << "Generating neighborhoods" << endl;
                cout.flush();
                // Note: One half voxel is subtracted because create_ni() accounts for it.
                int ni_size_x = static_cast<int>(round(ni_size.getValue() / hdr.pixdim[1] - 0.5));
                int ni_size_y = static_cast<int>(round(ni_size.getValue() / hdr.pixdim[2] - 0.5));
                int ni_size_z = static_cast<int>(round(ni_size.getValue() / hdr.pixdim[3] - 0.5));
                // unlikely different from mm, but this check will not hurt
                int xyz_units = XYZT_TO_SPACE(hdr.xyzt_units);
                if (xyz_units == NIFTI_UNITS_MICRON) {
                    ni_size_x /= 1000;
                    ni_size_y /= 1000;
                    ni_size_z /= 1000;
                } else if (xyz_units == NIFTI_UNITS_METER) {
                    ni_size_x *= 1000;
                    ni_size_y *= 1000;
                    ni_size_z *= 1000;
                }
                // enforce minimum of one voxel neighborhood in each direction
                if (ni_size_x <= 0) ni_size_x = 1;
                if (ni_size_y <= 0) ni_size_y = 1;
                if (ni_size_z <= 0) ni_size_z = 1;
                // create neighborhood
                ok = ((db.NI = create_ni(db.index,
                                         hdr.dim[1], hdr.dim[2], hdr.dim[3],
                                         ni_size_x, ni_size_y, ni_size_z,
                                         ni_num.getValue(), vox_num.getValue())) != NULL);
                if (!ok) cerr << procId << "Failed to generate neighborhoods" << endl;
            }
        }
        // extract image data of non-zero voxels
        if (ok) {
            cout << procId << "Extracting image data" << endl;
            cout.flush();
            ok = ((db.X = create_x(data, db.index)) != NULL);
            if (!ok) cerr << procId << "Failed to extract image data" << endl;
        }
        // intermediate clean up
        if (data) {
            cvReleaseMat(&data);
            data = NULL;
        }
        // read permutations
        if (ok && permMode == PERMMODE_READ) {
            cout << procId << "Reading permutations" << endl;
            cout.flush();
            ok = ((opt.perms = read_matrix(perms_file.getValue().c_str())) != NULL);
            if (!ok) cerr << procId << "Failed to read permutations from file " << perms_file.getValue() << endl;
        }
    }

    // send data to slave processes
    #if ODVBA_USE_MPI
        if (rank == 0) {
            cout << procId << "Sending data to other process(es)" << endl;
            cout.flush();
            for (int i = 1; ok && i < nProc; i++) {
                // n1
                ok = ok && MPI_Send(&db.n1, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                // index
                ok = ok && MPI_Send(&db.index->rows, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                ok = ok && MPI_Send(&db.index->cols, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                ok = ok && MPI_Send(db.index->data.fl, db.index->rows * db.index->cols, MPI_FLOAT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                // neighborhoods
                ok = ok && MPI_Send(&db.NI->rows, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                ok = ok && MPI_Send(&db.NI->cols, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                ok = ok && MPI_Send(db.NI->data.fl, db.NI->rows * db.NI->cols, MPI_FLOAT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                // X
                ok = ok && MPI_Send(&db.X->rows, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                ok = ok && MPI_Send(&db.X->cols, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                ok = ok && MPI_Send(db.X->data.fl, db.X->rows * db.X->cols, MPI_FLOAT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                // permutations
                if (permMode == PERMMODE_READ) {
                    ok = ok && MPI_Send(&opt.perms->rows, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                    ok = ok && MPI_Send(&opt.perms->cols, 1, MPI_INT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                    ok = ok && MPI_Send(opt.perms->data.fl, opt.perms->rows * opt.perms->cols, MPI_FLOAT, i, 0, MPI_COMM_WORLD) == MPI_SUCCESS;
                }
                // success?
                if (ok) {
                    cout << procId << "Sent data to process " << (i + 1) << endl;
                    cout.flush();
                } else {
                    cerr << procId << "Failed to send data to process " << (i + 1) << endl;
                }
            }
        } else {
            cout << procId << "Waiting for data from process 1" << endl;
            cout.flush();
            MPI_Status status;
            int rows = 0;
            int cols = 0;
            // n1
            ok = ok && MPI_Recv(&db.n1, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
            // index
            ok = ok && MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
            ok = ok && MPI_Recv(&cols, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
            if (ok) {
                db.index = cvCreateMat(rows, cols, CV_32FC1);
                if (db.index) {
                    ok = ok && MPI_Recv(db.index->data.fl, rows * cols, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
                } else {
                    cerr << procId << "Failed to allocate memory!" << endl;
                }
            }
            // neighborhoods
            ok = ok && MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
            ok = ok && MPI_Recv(&cols, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
            if (ok) {
                db.NI = cvCreateMat(rows, cols, CV_32FC1);
                if (db.NI) {
                    ok = ok && MPI_Recv(db.NI->data.fl, rows * cols, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
                } else {
                    cerr << procId << "Failed to allocate memory!" << endl;
                }
            }
            // X
            ok = ok && MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
            ok = ok && MPI_Recv(&cols, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
            if (ok) {
                db.X = cvCreateMat(rows, cols, CV_32FC1);
                if (db.X) {
                    ok = ok && MPI_Recv(db.X->data.fl, rows * cols, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
                } else {
                    cerr << procId << "Failed to allocate memory!" << endl;
                }
            }
            // permutations
            if (permMode == PERMMODE_READ) {
                ok = ok && MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
                ok = ok && MPI_Recv(&cols, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
                if (ok) {
                    opt.perms = cvCreateMat(rows, cols, CV_32FC1);
                    if (opt.perms) {
                        ok = ok && MPI_Recv(opt.perms->data.fl, rows * cols, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS;
                    } else {
                        cerr << procId << "Failed to allocate memory!" << endl;
                    }
                }
            }
            // success ?
            if (ok) {
                cout << procId << "Received data from process 1" << endl;
                cout.flush();
            } else {
                cerr << procId << "Failed to retrieve data from process 1" << endl;
            }
        }
    #endif // ODVBA_USE_MPI

    // everything ok?
    if (!ok) {
        db.release();
        #if ODVBA_USE_MPI
            MPI_Finalize();
        #endif
        exit(EXIT_FAILURE);
    }

    // data related constants
    const int n = db.X->cols;            // number of subjects
    const int p = ((opt.nPerm > 0)       // number of permutations (total)
                    ? opt.nPerm
                    : (opt.perms
                       ? opt.perms->rows
                       : 0));

    // -----------------------------------------------------------------------
    // prepare processing
    int* nPerms = new int[nProc]; // number of permutations per process

    // distribute work among processes
    for (int i = 0; i < nProc; i++) nPerms[i] = 0;

    if (nProc == 1) {
        nPerms[0] = p;
    } else {
        int i = 1;
        for (int np = 0; np < p; np++, i++) {
            if (i == nProc) i = 1;
            nPerms[i]++;
        }
    }

    // adjust permutations for this process
    if (permMode == PERMMODE_READ) {
        opt.nPerm = 0;

        if (nProc > 1) {
            CvMat* perms = NULL;

            if (nPerms[rank] > 0) {
                ok = (perms = cvCreateMat(nPerms[rank], n, CV_32FC1)) != NULL;
            }

            if (perms) {
                int nSkip = 0;
                for (int i = 0; i < rank; i++) nSkip += nPerms[i];
                memcpy(perms->data.fl, opt.perms->data.fl + nSkip * n, nPerms[rank] * n * sizeof(float));
            }

            cvReleaseMat(&opt.perms);
            opt.perms = perms;
        }
    } else if (permMode == PERMMODE_WRITE) {
        opt.nPerm = nPerms[rank];
        opt.perms = cvCreateMat(nPerms[rank], n, CV_32FC1);

        if (!opt.perms) {
            cerr << procId << "Failed to allocate memory" << endl;
            ok = false;
        }
    } else {
        opt.nPerm = nPerms[rank];
    }

    // everything ok?
    if (!ok) {
        delete [] nPerms;
        nPerms = NULL;
        db.release();
        #if ODVBA_USE_MPI
            MPI_Finalize();
        #endif
        exit(EXIT_FAILURE);
    }

    // -----------------------------------------------------------------------
    // perform group analysis
    time_t startTimeAnalysis = clock(); // time when processing started
    CvMat* maps              = NULL;    // group analysis results

    if (rank == 0) {
        if (nPerms[rank] > 0) {
            cout << procId << "Performing initial analysis and "
                 << nPerms[rank] << " permutation test(s)" << endl;
        } else {
            cout << procId << "Performing initial analysis" << endl;
        }
    } else {
        if (nPerms[rank] > 0) {
            cout << procId << "Performing " << nPerms[rank] << " permutation test(s)" << endl;
        } else {
            cout << procId << "Nothing to do" << endl;
        }
    }
    cout.flush();

    if (rank == 0 || opt.nPerm > 0 || (opt.perms && opt.perms->rows > 0)) {
        ok = ((maps = perform_analysis(db, opt, ((rank == 0) ? true : false), verbose.getValue(), procId)) != NULL);
        if (ok) {
            double t = static_cast<float>(clock() - startTimeAnalysis) / CLOCKS_PER_SEC;
            cout << procId << "Performed group analysis in " << fixed << setprecision(2) << t << " sec" << endl;
            cout.flush();
        } else {
            cerr << procId << "Failed to perform group analysis" << endl;
        }
    }

    // everything ok?
    if (!ok) {
        if (maps) {
            cvReleaseMat(&maps);
            maps = NULL;
        }
        delete [] nPerms;
        nPerms = NULL;
        db.release();
        #if ODVBA_USE_MPI
            MPI_Finalize();
        #endif
        exit(EXIT_FAILURE);
    }

    // -----------------------------------------------------------------------
    // assemble results
    #if ODVBA_USE_MPI
        // if subprocess...
        if (rank > 0) {
            // send results to master process
            if (nPerms[rank] > 0) {
                cout << procId << "Waiting for process 1 to collect results" << endl;
                cout.flush();
                if (MPI_Send(maps->data.fl, maps->rows * maps->cols, MPI_FLOAT, 0, 0, MPI_COMM_WORLD) == MPI_SUCCESS) {
                    cout << procId << "Sent results to process 1" << endl;
                    cout.flush();
                } else {
                    cerr << "Failed to send results to process 1" << endl;
                    ok = false;
                }
            }
        // if master (and not only) process...
        } else if (nProc > 1) {
            // allocate memory for assembled results, copy results of master
            // process and replace maps by matrix big enough for all results
            cout << procId << "Allocating memory for collective results" << endl;
            cout.flush();
            CvMat* maps2 = NULL;
            ok = ((maps2 = cvCreateMat(1 + p, maps->cols, CV_32FC1)) != NULL);
            if (ok) {
                cout << procId << "Filling in own results" << endl;
                cout.flush();
                memcpy(maps2->data.fl, maps->data.fl, (1 + nPerms[0]) * maps->cols * sizeof(float));
                cvReleaseMat(&maps);
                maps = maps2;
            } else {
                cerr << procId << "Failed to allocate memory" << endl;
            }
            // collect results of other processes
            if (ok) {
                cout << procId << "Collecting results from the other process(es)" << endl;
                cout.flush();
                float* ptr = maps->data.fl + (1 + nPerms[0]) * maps->cols;
                for (int i = 1; i < nProc; i++) {
                    if (nPerms[i] > 0) {
                        MPI_Status status;
                        if (MPI_Recv(ptr, nPerms[i] * maps->cols, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS) {
                            cout << procId << "Received results from process " << (i + 1) << endl;
                            cout.flush();
                        } else {
                            cerr << procId << "Failed to collect results from process " << (i + 1) << endl;
                            ok = false;
                        }
                        ptr += nPerms[i] * maps->cols;
                    } else {
                        cout << procId << "Process " << (i + 1) << " had nothing to do" << endl;
                        cout.flush();
                    }
                }
            }
        }
    #else
        // otherwise, there is nothing to assemble...
    #endif // ODVBA_USE_MPI

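    // intermediate clean up
    // NOTE(review): nPerms is released here, yet the MPI code in the
    // "assemble permutations" section below still reads nPerms[...] when
    // permMode == PERMMODE_WRITE; this delete should move after that section.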
    delete [] nPerms;

    // everything ok?
    if (!ok) {
        if (maps) {
            cvReleaseMat(&maps);
            maps = NULL;
        }
        db.release();
        #if ODVBA_USE_MPI
            MPI_Finalize();
        #endif
        exit(EXIT_FAILURE);
    }

    // -----------------------------------------------------------------------
    // assemble permutations
    if (permMode == PERMMODE_WRITE) {
        #if ODVBA_USE_MPI
            // if subprocess...
            if (rank > 0) {
                // send permutations to master process
                cout << procId << "Waiting for process 1 to collect permutations" << endl;
                cout.flush();
                if (MPI_Send(opt.perms->data.fl, nPerms[rank] * opt.perms->cols, MPI_FLOAT, 0, 0, MPI_COMM_WORLD) == MPI_SUCCESS) {
                    cout << procId << "Sent permutations to process 1" << endl;
                    cout.flush();
                } else {
                    cerr << procId << "Failed to send permutations to process 1" << endl;
                    ok = false;
                }
            // if master (and not the only) process...
            } else if (nProc > 1) {
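                // allocate memory for assembled permutations, copy permutations of
                // master process and replace perms by matrix big enough for all
                // permutations
                // NOTE(review): when the allocation succeeds but nPerms[0] == 0, the
                // else branch below reports "Failed to allocate memory" and
                // opt.perms is not replaced by perms2 although ok stays true.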
                cout << procId << "Allocating memory for collective permutations" << endl;
                cout.flush();
                CvMat* perms2 = NULL;
                ok = ((perms2 = cvCreateMat(1 + p, n, CV_32FC1)) != NULL);
                if (ok && nPerms[0] > 0) {
                    cout << procId << "Filling in own permutations" << endl;
                    cout.flush();
                    memcpy(perms2->data.fl, opt.perms->data.fl, nPerms[0] * n * sizeof(float));
                    cvReleaseMat(&opt.perms);
                    opt.perms = perms2;
                } else {
                    cerr << procId << "Failed to allocate memory" << endl;
                }
                // collect permutations of other processes
                if (ok) {
                    cout << procId << "Collecting permutations from the other process(es)" << endl;
                    cout.flush();
                    float* ptr = opt.perms->data.fl + nPerms[0] * n;
                    for (int i = 1; i < nProc; i++) {
                        if (nPerms[i] > 0) {
                            MPI_Status status;
                            if (MPI_Recv(ptr, nPerms[i] * n, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &status) == MPI_SUCCESS) {
                                cout << procId << "Received permutations from process " << (i + 1) << endl;
                                cout.flush();
                            } else {
                                cerr << procId << "Failed to collect permutations from process " << (i + 1) << endl;
                                ok = false;
                            }
                            ptr += nPerms[i] * n;
                        } else {
                            cout << procId << "Process " << (i + 1) << " had nothing to do" << endl;
                            cout.flush();
                        }
                    }
                }
            }
        #else
            // otherwise, there is nothing to assemble...
        #endif // ODVBA_USE_MPI
    }

    // -----------------------------------------------------------------------
    // output index (master process only, optional)
    if (rank == 0 && index_out_file.isSet()) {
        cout << procId << "Writing used index to file " << index_out_file.getValue() << endl;
        cout.flush();

        if (!write_matrix(index_out_file.getValue().c_str(), db.index, "%f")) {
            cerr << procId << "Failed to write index" << endl;
            ok = false;
        }
    }

    // -----------------------------------------------------------------------
    // output neighborhoods (master process only, optional)
    if (rank == 0 && generate_ni && ni_file.isSet()) {
        cout << procId << "Writing used neighborhoods to file " << ni_file.getValue() << endl;
        cout.flush();
        if (!write_matrix(ni_file.getValue().c_str(), db.NI, "%f")) {
            cerr << procId << "Failed to write neighborhoods" << endl;
            ok = false;
        }
    }

    // -----------------------------------------------------------------------
    // output results (master process only, optional)
    if (rank == 0 && maps_file.isSet()) {
        cout << procId << "Writing group analysis results to file " << maps_file.getValue() << endl;
        cout.flush();
        if (!write_matrix(maps_file.getValue().c_str(), maps, "%f")) {
            cerr << procId << "Failed to write group analysis results" << endl;
            ok = false;
        }
    }

    // -----------------------------------------------------------------------
    // output permutations (master process only, optional)
    if (rank == 0 && permMode == PERMMODE_WRITE) {
        cout << procId << "Writing permutations to file " << perms_file.getValue() << endl;
        cout.flush();
        if (!write_matrix(perms_file.getValue().c_str(), opt.perms, "%.0f")) {
            cerr << procId << "Failed to write permutations" << endl;
            ok = false;
        }
    }

    // -----------------------------------------------------------------------
    // output p-image (master process only)
    if (rank == 0) {
        cout << procId << "Writing p-image to file " << p_image_file.getValue() << endl;
        cout.flush();
        CvMat* p_image = get_p_image(maps, db.index, hdr.dim[1] * hdr.dim[2] * hdr.dim[3]);
        if (p_image) {
            hdr.intent_code = NIFTI_INTENT_PVAL;
            strncpy(hdr.descrip, "Output p-values of performed ODVBA group analysis", 80);
            strncpy(hdr.intent_name, nifti_intent_string(hdr.intent_code), 16);
            if (!write_nifti_image(p_image_file.getValue().c_str(), hdr, p_image)) {
                cerr << procId << "Failed to write p-image" << endl;
                ok = false;
            }
            cvReleaseMat(&p_image);
        } else {
            cerr << procId << "Failed to generate p-image" << endl;
            ok = false;
        }
    }

    // -----------------------------------------------------------------------
    // finalize
    if (maps) {
        cvReleaseMat(&maps);
        maps = NULL;
    }
    db.release();

    double t = static_cast<double>(clock() - startTime) / CLOCKS_PER_SEC;
    cout << procId << "Finished in " << fixed << setprecision(2) << t << " sec" << endl;

    #if ODVBA_USE_MPI
        MPI_Finalize();
    #endif
    exit(ok ? EXIT_SUCCESS : EXIT_FAILURE);
}
