Pardiso
From Medusa: Coordinate Free Mehless Method implementation
Pardiso library
Pardiso library can be downloaded from [1]. There are also examples of use, the manual, etc. Note that the source of the library is not available. There are only pre-compiled objects available on download after one registers on the site.
- The package PARDISO is a thread-safe, high-performance, robust, memory efficient and easy to use software for solving large sparse symmetric and unsymmetric linear systems of equations on shared-memory and distributed-memory multiprocessors. The solver uses a combination of left- and right-looking Level-3 BLAS supernode techniques".
- Pardiso is proprietary software but is available for academic use; by registering a yearly license can be obtained.
- Parallelization is implemented using both OpenMP and MPI.
- MPI-based numerical factorization and parallel forward/backward substitution on distributed-memory architectures is implemented for symmetric indefinite matrices.
Example of use
This minimal example makes use of the OpenMP parallelization to run Pardiso solver on a sample matrix.
test.cpp:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <vector>
#include <cassert>
#include <math.h>
// PARDISO prototype.
// There is no pardiso.h, therefore function prototypes have to be defined within the user's code
extern "C" void pardisoinit (void *, int *, int *, int *, double *, int *);
extern "C" void pardiso (void *, int *, int *, int *, int *, int *,
double *, int *, int *, int *, int *, int *,
int *, double *, double *, int *, double *);
extern "C" void pardiso_chkmatrix (int *, int *, double *, int *, int *, int *);
extern "C" void pardiso_chkvec (int *, int *, double *, int *);
extern "C" void pardiso_printstats (int *, int *, double *, int *, int *, int *, double *, int *);
struct TestCase {
// the dimension of the problem, Matrix A will have n*n elements, while vectors b and x will have n elements
int n;
// indices of individual rows in compact matrix definition: row[0] contains non zero elements at column indices ia[0] .. (ia[1]-1)
std::vector<int> ia;
// column indices of non-zero elements
std::vector<int> ja;
// values of non-zero elements
std::vector<double> A;
// right hand side
std::vector<double> b;
// solution vector
std::vector<double> x;
// Pardiso workspace variable (using void* as the type ensures enough space is allocated on 32 and 64-bit systems)
void* pt[64];
// Pardiso solver parameter variables
int iparm[64];
double dparm[64];
void createTestMatrix(int n) {
this->n = n;
// -2 on the diagonal (except topmost and bottommost, which are 1), 1 on the +1 and -1 diagonals (except topmost and bottommost, which are 0)
ia.reserve(n*2);
ja.reserve(n*3);
A.reserve(n*3); // 3 per row
// 1st row (only 1 element)
ia.push_back(0);
A.push_back(1);
ja.push_back(0);
for (int i = 1; i < n-1; ++i) {
// new row is stating
ia.push_back(A.size());
// the 3 diagonal elements
A.push_back(1);
ja.push_back(i-1);
A.push_back(-2);
ja.push_back(i);
A.push_back(1);
ja.push_back(i+1);
}
// last row (only 1 element)
ia.push_back(A.size());
A.push_back(1);
ja.push_back(n-1);
// the last element points at the end of the matrix (n+1 row)
ia.push_back(A.size());
// Convert matrix from 0-based to 1-based notation (Fortran...)
for (auto i = 0; i < ia.size(); i++)
ia[i] += 1;
for (auto i = 0; i < ja.size(); i++)
ja[i] += 1;
// set b to 1 (topmost and bottommost elements) and 1/n² (the other elements)
b.resize(n, 1);
for (auto i = 0; i < b.size(); i++)
b[i] /= (n*n);
}
void solve(int numThreads) {
// the solution placeholder
x.resize(n, 0);
int error = 0;
int solver = 0; // use sparse direct solver
int maxfct, mnum, phase, msglvl;
int mtype = 11; // Real unsymmetric matrix
double ddum; // double dummy variable
int idum; // integer dummy variable
int nrhs = 1; // number of right hand sides
maxfct = 1; // Maximum number of numerical factorizations.
mnum = 1; // Which factorization to use.
msglvl = 1; // Print statistical information
error = 0; // Initialize error flag to no-error
iparm[2] = numThreads; // initialize the number of threads
pardisoinit (pt, &mtype, &solver, iparm, dparm, &error);
std::cerr << "error = " << error << "\n";
pardiso_chkmatrix (&mtype, &n, A.data(), ia.data(), ja.data(), &error);
std::cerr << "error = " << error << "\n";
pardiso_chkvec (&n, &nrhs, b.data(), &error);
std::cerr << "error = " << error << "\n";
pardiso_printstats (&mtype, &n, A.data(), ia.data(), ja.data(), &nrhs, b.data(), &error);
std::cerr << "error = " << error << "\n";
phase = 11; // reordering & symbolic factorization
pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &error, dparm);
std::cerr << "error = " << error << "\n";
phase = 22; // numerical factorization
pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, &ddum, &ddum, &error, dparm);
std::cerr << "error = " << error << "\n";
phase = 33; // back substitution and refinement
iparm[7] = 10; // number of refinement steps
pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, b.data(), x.data(), &error, dparm);
std::cerr << "error = " << error << "\n";
phase = -1; // finalization and memory release
pardiso (pt, &maxfct, &mnum, &mtype, &phase, &n, A.data(), ia.data(), ja.data(), &idum, &nrhs, iparm, &msglvl, b.data(), x.data(), &error, dparm);
std::cerr << "error = " << error << "\n";
}
};
int main(int argc, char** argv) {
assert(argc == 2 && "Second argument is size of the system.");
int n, nThreads;
std::stringstream ss(argv[1]);
ss >> n;
auto omp_num_threads = getenv("OMP_NUM_THREADS");
if (omp_num_threads != nullptr) {
std::cerr << "OMP_NUM_THREADS = " << omp_num_threads << "\n";
nThreads = atoi(omp_num_threads); // number of threads
}
std::cout << "n = " << n << ", num threads = " << nThreads << std::endl;
TestCase testcase;
testcase.createTestMatrix(n);
testcase.solve(nThreads);
return 0;
}The code is compiled with GCC using the following command line:
g++ test.cpp -o test -O3 -L./ -lpardiso500-GNU481-X86-64 -lgfortran -fopenmp -lpthread -lm -llapack -lblas
Then it is executed:
OMP_NUM_THREADS=8 ./test 1000000
This takes quite some time on the experimental machine (Intel i7 870 @ 2.93 GHz), which is dilligently reported by Pardiso itself:
Summary PARDISO 5.0.0: ( reorder to reorder )
Time fulladj: 0.054887 s
Time reorder: 2.192858 s
Time symbfct: 0.189367 s
Time parlist: 0.059304 s
Time malloc : 0.122711 s
Time total : 2.871472 s total - sum: 0.252344 s
Summary PARDISO 5.0.0: ( factorize to factorize )
Time A to LU: 0.000000 s
Time numfct : 0.196008 s
Time malloc : -0.000004 s
Time total : 0.196070 s total - sum: 0.000066 s
Summary PARDISO 5.0.0: ( solve to solve )
Time solve : 0.033254 s
Time total : 0.200364 s total - sum: 0.167110 s