Pardiso

From Medusa: Coordinate Free Mehless Method implementation
Jump to: navigation, search

Pardiso library

Pardiso library can be downloaded from [1]. There are also examples of use, the manual, etc. Note that the source of the library is not available. There are only pre-compiled objects available on download after one registers on the site.

  • The package PARDISO is a thread-safe, high-performance, robust, memory efficient and easy to use software for solving large sparse symmetric and unsymmetric linear systems of equations on shared-memory and distributed-memory multiprocessors. The solver uses a combination of left- and right-looking Level-3 BLAS supernode techniques".
  • Pardiso is proprietary software but is available for academic use; by registering a yearly license can be obtained.
  • Parallelization is implemented using both OpenMP and MPI.
  • MPI-based numerical factorization and parallel forward/backward substitution on distributed-memory architectures is implemented for symmetric indefinite matrices.

Example of use

This minimal example makes use of the OpenMP parallelization to run Pardiso solver on a sample matrix.

test.cpp:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <vector>
#include <cassert>
#include <math.h>

// PARDISO prototype.
// There is no pardiso.h, therefore function prototypes have to be defined within the user's code
extern "C" void pardisoinit (void   *, int    *,   int *, int *, double *, int *);
extern "C" void pardiso     (void   *, int    *,   int *, int *,    int *, int *, 
                  double *, int    *,    int *, int *,   int *, int *,
                     int *, double *, double *, int *, double *);
extern "C" void pardiso_chkmatrix  (int *, int *, double *, int *, int *, int *);
extern "C" void pardiso_chkvec     (int *, int *, double *, int *);
extern "C" void pardiso_printstats (int *, int *, double *, int *, int *, int *, double *, int *);

struct TestCase {
    // the dimension of the problem, Matrix A will have n*n elements, while vectors b and x will have n elements
    int n;
    // indices of individual rows in compact matrix definition: row[0] contains non zero elements at column indices ia[0] .. (ia[1]-1)
    std::vector<int> ia;
    // column indices of non-zero elements
    std::vector<int> ja;
    // values of non-zero elements
    std::vector<double> A;
    // right hand side
    std::vector<double> b;
    // solution vector
    std::vector<double> x;

    // Pardiso workspace variable (using void* as the type ensures enough space is allocated on 32 and 64-bit systems)
    void*    pt[64];
    // Pardiso solver parameter variables
    int      iparm[64];
    double   dparm[64];
    
    void createTestMatrix(int n) {
        this->n = n;
        
        // -2 on the diagonal (except topmost and bottommost, which are 1), 1 on the +1 and -1 diagonals (except topmost and bottommost, which are 0)
        ia.reserve(n*2);
        ja.reserve(n*3);
        A.reserve(n*3);  // 3 per row
        
        // 1st row (only 1 element)
        ia.push_back(0);
        A.push_back(1);
        ja.push_back(0);

        for (int i = 1; i < n-1; ++i) {
            // new row is stating
            ia.push_back(A.size());
            
            // the 3 diagonal elements
            A.push_back(1);
            ja.push_back(i-1);
            
            A.push_back(-2);
            ja.push_back(i);
            
            A.push_back(1);
            ja.push_back(i+1);
        }
        
        // last row (only 1 element)
        ia.push_back(A.size());
        A.push_back(1);
        ja.push_back(n-1);
        
        // the last element points at the end of the matrix (n+1 row)
        ia.push_back(A.size());
        
        // Convert matrix from 0-based to 1-based notation (Fortran...)
        for (auto i = 0; i < ia.size(); i++) 
            ia[i] += 1;
        for (auto i = 0; i < ja.size(); i++) 
            ja[i] += 1;
            
        // set b to 1 (topmost and bottommost elements) and 1/n² (the other elements)
        b.resize(n, 1);
        for (auto i = 0; i < b.size(); i++) 
            b[i] /= (n*n);
    }
    
    void solve(int numThreads) {
        // the solution placeholder
        x.resize(n, 0);
        
        int error = 0;
        int solver = 0;     // use sparse direct solver
        int maxfct, mnum, phase, msglvl;
        int mtype = 11;     // Real unsymmetric matrix
        double ddum;        // double dummy variable
        int idum;           // integer dummy variable
        int nrhs = 1;       // number of right hand sides
        maxfct = 1;         // Maximum number of numerical factorizations.
        mnum   = 1;         // Which factorization to use.
        msglvl = 1;         // Print statistical information
        error  = 0;         // Initialize error flag to no-error
        iparm[2] = numThreads;  // initialize the number of threads 
        
        pardisoinit (pt,  &mtype, &solver, iparm, dparm, &error);
        std::cerr << "error = " << error << "\n";
        pardiso_chkmatrix  (&mtype, &n, A.data(), ia.data(), ja.data(), &error);
        std::cerr << "error = " << error << "\n";
        pardiso_chkvec (&n, &nrhs, b.data(), &error);
        std::cerr << "error = " << error << "\n";
        pardiso_printstats (&mtype, &n, A.data(), ia.data(), ja.data(), &nrhs, b.data(), &error);
        std::cerr << "error = " << error << "\n";
   
        phase = 11;     // reordering & symbolic factorization
        pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &error, dparm); 
        std::cerr << "error = " << error << "\n";
        phase = 22;     // numerical factorization
        pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &error, dparm);
        std::cerr << "error = " << error << "\n";
        phase = 33;     // back substitution and refinement
        iparm[7] = 10;  // number of refinement steps
        pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, b.data(), x.data(), &error, dparm);
        std::cerr << "error = " << error << "\n";
        phase = -1;     // finalization and memory release
        pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, b.data(), x.data(), &error, dparm);
        std::cerr << "error = " << error << "\n";
    }
};


int main(int argc, char** argv) {
    assert(argc == 2 && "Second argument is size of the system.");
    int n, nThreads;
    std::stringstream ss(argv[1]);
    ss >> n;
    auto omp_num_threads = getenv("OMP_NUM_THREADS");
    if (omp_num_threads != nullptr) {
        std::cerr << "OMP_NUM_THREADS = " << omp_num_threads << "\n";
        nThreads = atoi(omp_num_threads);      // number of threads
    }
    std::cout << "n = " << n << ", num threads = " << nThreads << std::endl;
    
    TestCase testcase;
    testcase.createTestMatrix(n);
    testcase.solve(nThreads);
    return 0;
}

The code is compiled with GCC using the following command line: g++ test.cpp -o test -O3 -L./ -lpardiso500-GNU481-X86-64 -lgfortran -fopenmp -lpthread -lm -llapack -lblas

Then it is executed: OMP_NUM_THREADS=8 ./test 1000000

This takes quite some time on the experimental machine (Intel i7 870 @ 2.93 GHz), which is dilligently reported by Pardiso itself:

Summary PARDISO 5.0.0: ( reorder to reorder )
    Time fulladj: 0.054887 s
    Time reorder: 2.192858 s
    Time symbfct: 0.189367 s
    Time parlist: 0.059304 s
    Time malloc : 0.122711 s
    Time total  : 2.871472 s total - sum: 0.252344 s
Summary PARDISO 5.0.0: ( factorize to factorize )
    Time A to LU: 0.000000 s
    Time numfct : 0.196008 s
    Time malloc : -0.000004 s
    Time total  : 0.196070 s total - sum: 0.000066 s
Summary PARDISO 5.0.0: ( solve to solve )
    Time solve  : 0.033254 s
    Time total  : 0.200364 s total - sum: 0.167110 s