Renan Cabrera L. PhD

Sunday, May 2, 2010

Eigenvalues: C/Lapack

Here is another example on the use of Lapack. This time the objective is to calculate the eigenvectors and eigenvalues of a complex matrix

#include<stdio.h>
#include<math.h>

#include<complex.h>
#include <stdlib.h>

//.......................................................................................................
void zgeTranspose( double complex *Transposed, double complex *M ,int n)
{

int i,j;
for(i=0;i<n;i++)

  for(j=0;j<n;j++) Transposed[i+n*j] = M[i*n+j];
}

//......................................................................................................
//  MatrixComplexEigensystem: computes the eigenvectors and eigenValues of input matrix A
//  The eigenvectors are stored in columns
//.....................................................................................................
void MatrixComplexEigensystem( double complex *eigenvectorsVR, double complex *eigenvaluesW, double complex *A, int N)
{

 int i;
  
  double complex *AT = (double complex*) malloc( N*N*sizeof(double complex) );

  zgeTranspose( AT, A , N);
  
  char JOBVL ='N';   // Compute Right eigenvectors

  char JOBVR ='V';   // Do not compute Left eigenvectors

  double complex VL[1];

  int LDVL = 1; 
  int LDVR = N;

 int LWORK = 4*N; 

 double complex *WORK =  (double complex*)malloc( LWORK*sizeof(double complex));

 double complex *RWORK = (double complex*)malloc( 2*N*sizeof(double complex));

 
 int INFO;

 zgeev_( &JOBVL, &JOBVR, &N, AT ,  &N , eigenvaluesW ,

   VL, &LDVL,
   eigenvectorsVR, &LDVR, 
   WORK, 
   &LWORK, RWORK, &INFO );

 zgeTranspose( AT, eigenvectorsVR , N);

 for(i=0;i<N*N;i++) eigenvectorsVR[i]=AT[i];

 
  free(WORK);
  free(RWORK);
  free(AT);
}


int main()
{
  int i,j;
  const int N = 3;

  
  double complex A[] = { 1.+I , 2. ,  3 , 4. , 5.+I , 6. , 7., 8., 9. + I};

  double complex eigenVectors[N*N];
  double complex eigenValues[N];

  
  MatrixComplexEigensystem( eigenVectors, eigenValues, A, N);
  
  printf("\nEigenvectors\n");

  for(i=0;i<N;i++){
    for(j=0;j<N;j++) printf(" (%f,%f) \t", eigenVectors[i*N + j]);

    printf("\n");
  }
  
  printf("\nEigenvalues \n");
  for(i=0;i<N;i++) printf("\n (%f, %f) \t",  eigenValues[i] );

  
printf("\n------------------------------------------------------------\n"); 
return 0;

}

Complex matrix inverse: C++/Lapack

So, here is an example on how to call lapack from c++

#include<iostream>
#include<math.h>

#include<complex>
#include <stdlib.h>

using namespace std;

extern "C" void zgetrf_( int*, int* , complex<double>* , int*, int* , int* );

extern "C" void zgetri_( int*, complex<double>* , int*, int* , complex<double>*, int* , int* );

//........................................................................................
void zgeTranspose( complex<double> *Transposed, complex<double> *M ,int n)
{

int i,j;
for(i=0;i<n;i++)

 for(j=0;j<n;j++) Transposed[i+n*j] = M[i*n+j];
}

//.........................................................................................
void MatrixComplexInverse(complex<double> *invA, complex<double> *A, int n)
{

 int LWORK=10*n;

 int *permutations;

 complex<double> *WORK, *tempA;

 tempA = new complex<double>[n*n];

 permutations =  new int[2*n];
 WORK = new complex<double>[n*n];


 int INFO;

 zgeTranspose(tempA,A,n);


 zgetrf_( &n, &n, tempA , &n, permutations , &INFO );

 if (INFO != 0) {
   cout<<"ComplexMatrixInverse: Error at zgetrf  \n"; exit(0);
   }



 zgetri_( &n, tempA , &n, permutations , WORK, &LWORK, &INFO );

 if (INFO != 0) {
   cout<<"ComplexMatrixInverse: Error at zgetri  \n"; exit(0);
   }

 zgeTranspose(invA,tempA,n);

 delete [] WORK;

 delete [] tempA;
 delete [] permutations;

}

/////////////////////////////////////////////////////////////////////////////////////////

int main()
{
 int i,j;
 const int N = 3;

 complex<double> I(0.,1.);

 complex<double> A[] = { 1. + I , 2. ,  3 , 4. , 5.+I , 6. , 7., 8., 9. + I};

 complex<double> invA[N*N];

 MatrixComplexInverse(invA,A,N);


 for(i=0;i<N;i++){
   for(j=0;j<N;j++) cout << invA[i*N + j]<<"\t";

   cout<<"\n";
 }

cout<<"---------------------------\n";

return 0;

}

////////////////////=====================////////////////////

and the make file is

#
CC = c++

#edit LAPACK_PATH if necessary
LAPACK_PATH = /usr/lib64/atlas

a,out: main.cpp
 $(CC)  main.cpp  -L$(LAPACK_PATH) -llapack -lblas  -lgfortran -lm

Wednesday, April 28, 2010

Here I am listing some general rules of programming that I learned to follow.
Some of them are enforced by certain languages or programming environments but some are very permissive and require to be more careful. These rules are already well known but here they are again

Define very well the input from the output arguments of a function. The input arguments should remain unmodified in the routine.
Never write a routine that cannot fit in one page of computer screen in the worst case.
Develop and validate a program in reasonable small steps. It may take longer to write the code but it will take less time to debug.
Use a maximum of two nested loops in a certain routine.
Use meaningful names for the variables even if they become long.
Do not repeat the same task in different places.

Monday, April 26, 2010

Complex matrix inverse: C/Lapack

Here a simple example on how to use Lapack within C to invert matrices using the LU decomposition.

The C++ code is different and I will post that another day

[The html code for this post was generated by http://www.bedaux.net/cpp2html/]

#include<stdio.h>
#include<math.h>

#include<complex.h>
#include <stdlib.h>

//........................................................................................
void zgeTranspose( double complex *Transposed, double complex *M ,int n)
{

int i,j;
for(i=0;i<n;i++)

for(j=0;j<n;j++) Transposed[i+n*j] = M[i*n+j];
}

//.........................................................................................
void MatrixComplexInverse(double complex *invA, double complex *A, int n)
{

int LWORK=10*n;

int *permutations;

double complex *WORK, *tempA;

tempA = (double complex*) malloc( n*n*sizeof(double complex) );

permutations = (int*) malloc( 2*n*sizeof(int) );

WORK = (double complex *)malloc(LWORK*sizeof(double complex));


int INFO;

zgeTranspose(tempA,A,n);


zgetrf_( &n, &n, tempA , &n, permutations , &INFO );

if (INFO != 0) {
 printf("ComplexMatrixInverse: Error at zgetrf  \n"); exit(0);
 }



zgetri_( &n, tempA , &n, permutations , WORK, &LWORK, &INFO );

if (INFO != 0) {
 printf("ComplexMatrixInverse: Error at zgetri  \n"); exit(0);
 }

zgeTranspose(invA,tempA,n);

free(WORK);

free(tempA);
free(permutations);

}


/////////////////////////////////////////////////////////////////////////////////////////


int main()
{
int i,j;
const int N = 3;


double complex A[] = { 1.+I , 2. ,  3 , 4. , 5.+I , 6. , 7., 8., 9. + I};

double complex invA[N*N];

MatrixComplexInverse(invA,A,N);


for(i=0;i<N;i++){
 for(j=0;j<N;j++) printf(" (%f,%f) \t", invA[i*N + j]);

 printf("\n");
}

printf("------------------------------------------------------------\n");
return 0;

}


The make file is

#
CC = gcc

#edit LAPACK_PATH if necessary
LAPACK_PATH = /usr/lib64/atlas

a,out: main.c
$(CC)  main.c  -L$(LAPACK_PATH) -llapack -lblas  -lgfortran -lm

Sunday, April 25, 2010

Complex matrix inverse III: CUDA

Here is cuinverse_kernel.cu

__global__ void cgeMatrixInverse_kernel(cuFloatComplex *invA , cuFloatComplex *A , int N , cuFloatComplex *Work)
{

int i;
int idx = blockDim.x * blockIdx.x + threadIdx.x;

cuFloatComplex * ThreadWorkSpace;

ThreadWorkSpace = Work + idx*cgeMatrixInverse_WorkSpace()*N*N;

for(i=0;i<N*N;i++) A[ i + idx*N*N ] =  A[i];

A[ idx*N*N ] = make_cuFloatComplex( (float) idx , 1./sqrtf(2));

cgeMatrixInverse(invA + idx*N*N , A + idx*N*N , N , ThreadWorkSpace);


}

_______________________

and my makefile is

#  edit the path of your working directory
BASE_PATH = /home/rcabrera/Documents/source/c/inverseCUDA

#

CUDA_PATH = /usr/local/cuda

CC = gcc
NVCC    := $(CUDA_PATH)/bin/nvcc

CUDA_INCLUDE_PATH = /usr/local/cuda/include
CUDA_LIB_PATH := $(CUDA_PATH)/lib64
CUDA_LIBS = -lcuda -lcudart

a.out: main.cu cuinverse_kernel.cu cumatrixtools.h
$(NVCC)  -I$(CUDA_INCLUDE_PATH) -L$(CUDA_LIB_PATH) $(CUDA_LIBS) -lm  main.cu -o a.out

# $(NVCC) -deviceemu  -I$(CUDA_INCLUDE_PATH) -L$(CUDA_LIB_PATH) $(CUDA_LIBS) -lm  main.cu -o a.out

clean :
rm -f a.out

Complex matrix inverse II: CUDA

So, here is cumatrixtools.h

#include<cuComplex.h>
#ifndef _CUMATRIXTOOLS_H_
#define _CUMATRIXTOOLS_H_


////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void
cgeSquareMatrixProduct(cuFloatComplex *out, cuFloatComplex *A, cuFloatComplex *B, int N  )
{

int i,j,k;
cuFloatComplex sum;

for(i=0;i<N;i++)

for(j=0;j<N;j++){
  //sum = 0. + 0.*I;
    sum  = make_cuFloatComplex(0.,0.);

  for(k=0;k<N;k++){
    sum =  cuCaddf(sum  , cuCmulf(A[i*N+k],B[k*N+j]) );
  }

  out[i*N+j] = sum;
}


}
//////////////////////////////////////////////////////////////////////

__device__ __host__ static __inline__ void cgeTranspose(cuFloatComplex  *At, cuFloatComplex  *A, int N)
{

int i,j;
for( i = 0; i<N; i++)

for( j = 0; j<N; j++)
  At[i+j*N] = A[i*N+j] ;
}

//////////////////////////////////////////////////////////////////////////////////////
//                LU decomposition of complex matrices
/////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeDoolittle_LU_Decomposition(cuFloatComplex *LU, cuFloatComplex *A, int n)
{

 int i, j, k, p;
 cuFloatComplex *p_k, *p_row, *p_col;

for(k=0; k<n*n; k++) LU[k]=A[k];


for (k = 0, p_k = LU; k < n; p_k += n, k++) {

    for (j = k; j < n; j++) {

       for (p = 0, p_col = LU; p < k; p_col += n,  p++)

//           *(p_k + j) -= *(p_k + p) * *(p_col + j);
       *(p_k + j) = cuCsubf( *(p_k + j) , cuCmulf( *(p_k + p) , *(p_col + j) ));
    }

    if ( cuCabsf(*(p_k + k)) != 0.0 )  //return -1;


    for (i = k+1, p_row = p_k + n; i < n; p_row += n, i++) {

       for (p = 0, p_col = LU; p < k; p_col += n, p++)

         // *(p_row + k) -= *(p_row + p) * *(p_col + k);
    *(p_row + k) = cuCsubf( *(p_row + k) , cuCmulf(  *(p_row + p) , *(p_col + k) ));
         //*(p_row + k) /= *(p_k + k);


  *(p_row + k) = cuCdivf( *(p_row + k) , *(p_k + k)  );
    }
 }

}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Back substitution for lower triangular matrices assuming that the diagonal is filled with 1's
// Given  T x = b ,
// where T is a lower triangula matrix NxN with 1's in the diagonal
// b is a known vector
// x is an unknown vector
// the routine otputs x = xout
//////////////////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeBackSubstitutionLowerDiagOne(cuFloatComplex *xout, cuFloatComplex *T , cuFloatComplex *b ,int N  )
{

cuFloatComplex bTemp;
int i,j;

for(i=0;i<N;i++){

  bTemp = b[i];
  for(j=0;j<i;j++) bTemp = cuCsubf( bTemp , cuCmulf(T[ i*N + j], xout[j] ) );

  xout[i] = bTemp ;    
}
}

/////////////////////////////////////////////////////////////////////////////////////
// Back substitution for upper triangular matrice
////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeBackSubstitutionUpper(cuFloatComplex *xout, cuFloatComplex *T , cuFloatComplex *b ,int N  )
{

cuFloatComplex bTemp;
int i,j;

for(i=N-1;i>=0;i--){

  bTemp = b[i];
  for(j=i+1;j<N;j++) bTemp = cuCsubf( bTemp , cuCmulf(T[ i*N + j], xout[j] ) );

  xout[i] = cuCdivf( bTemp , T[ i*N + i] );    
}
}

///////////////////////
__device__ __host__ static __inline__ void cgeIdentity(cuFloatComplex *Id, int N)
{

int i,j;
for(i=0;i<N;i++)

for(j=0;j<N;j++){
Id[i*N+j] = make_cuFloatComplex(0.f,0.f);
}


for(i=0;i<N;i++){
  Id[i*N+i] = make_cuFloatComplex(1.f,0.f);
}
}

//////////////////////////////////////////////////////////////////////////////////////
//  Inverse of a matrix using the triangularization of matrices
//  Warning:
//          It does not destroys the original matrix A
//          A work space for 3 complex matrices must be supplied in W
//////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ int cgeMatrixInverse_WorkSpace(){ return 3;}

__device__ __host__ static __inline__ void cgeMatrixInverse(cuFloatComplex *inv , cuFloatComplex *A , int N , cuFloatComplex *W)
{

int i;
cuFloatComplex *Id;
cuFloatComplex *invL, *invU;


Id = W;      //Double purpose work space
  invL = W + N*N;

invU = W + 2*N*N;


cgeIdentity( Id, N);

 
 cgeDoolittle_LU_Decomposition(inv,A,N);


for(i=0;i<N;i++)   cgeBackSubstitutionLowerDiagOne( invL + i*N , inv , Id + i*N ,  N  );


for(i=0;i<N;i++) cgeBackSubstitutionUpper( invU + i*N , inv , Id + i*N ,  N  );


cgeSquareMatrixProduct(Id , invL , invU , N  );


cgeTranspose(inv , Id , N);

}


#endif

Saturday, April 17, 2010

Complex matrix Inverse I: CUDA

Here is a short program in CUDA to invert complex matrices through the LU decomposition (without permutations) with back substitution in __device__ mode. This means that the whole inversion is designed to run in each thread.

This code was adapted from a C code in real double precision found here

The code is standalone in the sense that it only requires the Standard tools provided by CUDA (without CuBlas) and tested in the NVIDIA GeForce 240M
on a Linux Fedora 11.

The program can be organized in three files

cumatrixtools.h: which contains the method itself.
cuinverse_kernel.cu: which contains the kernel function, which maps the method on the threreads.
main.cu: which administers the memory and calls kernel function.

The purpose of this program is to invert several variations of a matrix A. The matrix A is passed to the kernel and the inverse is evaluated after modifying the first element of A. Each branch executes a different matrix. The kernel is called with <<<2,32>>>, which means that the threads are organized in two blocks, each bloch with 32 threads. The more physical processors one has, the more threads per block one could assign in order to gain performance

The whole program is distributed in three posts (I,II and III)

main.cu looks like

#include <stdio.h>
#include <complex.h>

#include<cuComplex.h>

#include"cumatrixtools.h"
#include"cuinverse_kernel.cu"

void cuPrintMatrix( cuFloatComplex  *C ,int N, int M )
{

int i,j;
for(i=0;i<N;i++){

for(j=0;j<M;j++) printf(" (%f,%f)\t ", cuCrealf(C[i*N + j]) , cuCimagf(C[i*N + j]) );

printf(" \n ");
}
}

///////////////////////////////////////////////////////////////////////////////
const int N=3;     //Matrix dimension

const int NBranches = 64;
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){

int i;
float  complex A[9] =  //  Base matrix
    {

0.f+I/sqrtf(2.f), 0.0f+ I/sqrt(2.0f),  0.+ 0.*I,

0. -I/2.     , 0.+I/2       ,   0.+ I/sqrt(2.),
-0.5+0.*I ,  0.5+0.*I  ,  -1./sqrt(2.)+.0*I

};

cuFloatComplex  *h_A, *h_invA;
cuFloatComplex  *d_A, *d_invA,   *d_WorkSpace;


printf("...allocating CPU memory.\n");
h_A =          (cuFloatComplex *)   malloc(                              N*N*sizeof(cuFloatComplex ));
h_invA =       (cuFloatComplex *)   malloc(                    NBranches*N*N*sizeof(cuFloatComplex ));

printf("...allocating GPU memory.\n");

cudaMalloc((void **)&d_A,                                      NBranches*N*N*sizeof(cuFloatComplex ));

cudaMalloc((void **)&d_invA,                                   NBranches*N*N*sizeof(cuFloatComplex ));

cudaMalloc((void **)&d_WorkSpace, NBranches*cgeMatrixInverse_WorkSpace()*N*N*sizeof(cuFloatComplex ));


printf("...Copying memory.\n ");
for(i=0;i<N*N;i++ ) h_A[i] = make_cuFloatComplex( crealf(A[i]) , cimagf(A[i])  );

cudaMemcpy(d_A, h_A, N*N*sizeof(cuFloatComplex) , cudaMemcpyHostToDevice);


printf("...The base matrix is:\n");
cuPrintMatrix( h_A , N, N );


printf("\n...Calling the kernel.\n");

cudaThreadSynchronize();
cgeMatrixInverse_kernel<<<2,32>>>(d_invA, d_A , N ,d_WorkSpace);   // Divinding the 64 branches in 2 blocks of 32 threads

    cudaThreadSynchronize();

cudaMemcpy(h_invA, d_invA, NBranches*N*N*sizeof(float), cudaMemcpyDeviceToHost);


printf("\n The inverse of the first branch is \n");
cuPrintMatrix( h_invA , N, N );


printf("\n The inverse of the second branch is \n");
cuPrintMatrix( h_invA + N*N , N, N );


printf("\n and so on ..\n");

free(h_A);
free(h_invA);


cudaFree(d_A);
cudaFree(d_invA);

cudaThreadExit();
printf("\n-------------------------------------------------------\n");
}