Renan Cabrera L. PhD: April 2010

Wednesday, April 28, 2010

My basic rules of programming

Here I am listing some general rules of programming that I learned to follow.
Some of them are enforced by certain languages or programming environments but some are very permissive and require to be more careful. These rules are already well known but here they are again

Define very well the input from the output arguments of a function. The input arguments should remain unmodified in the routine.
Never write a routine that cannot fit in one page of computer screen in the worst case.
Develop and validate a program in reasonable small steps. It may take longer to write the code but it will take less time to debug.
Use a maximum of two nested loops in a certain routine.
Use meaningful names for the variables even if they become long.
Do not repeat the same task in different places.

Monday, April 26, 2010

Complex matrix inverse: C/Lapack

Here a simple example on how to use Lapack within C to invert matrices using the LU decomposition.

The C++ code is different and I will post that another day

[The html code for this post was generated by http://www.bedaux.net/cpp2html/]

#include<stdio.h>
#include<math.h>

#include<complex.h>
#include <stdlib.h>

//........................................................................................
void zgeTranspose( double complex *Transposed, double complex *M ,int n)
{

int i,j;
for(i=0;i<n;i++)

for(j=0;j<n;j++) Transposed[i+n*j] = M[i*n+j];
}

//.........................................................................................
void MatrixComplexInverse(double complex *invA, double complex *A, int n)
{

int LWORK=10*n;

int *permutations;

double complex *WORK, *tempA;

tempA = (double complex*) malloc( n*n*sizeof(double complex) );

permutations = (int*) malloc( 2*n*sizeof(int) );

WORK = (double complex *)malloc(LWORK*sizeof(double complex));


int INFO;

zgeTranspose(tempA,A,n);


zgetrf_( &n, &n, tempA , &n, permutations , &INFO );

if (INFO != 0) {
 printf("ComplexMatrixInverse: Error at zgetrf  \n"); exit(0);
 }



zgetri_( &n, tempA , &n, permutations , WORK, &LWORK, &INFO );

if (INFO != 0) {
 printf("ComplexMatrixInverse: Error at zgetri  \n"); exit(0);
 }

zgeTranspose(invA,tempA,n);

free(WORK);

free(tempA);
free(permutations);

}


/////////////////////////////////////////////////////////////////////////////////////////


int main()
{
int i,j;
const int N = 3;


double complex A[] = { 1.+I , 2. ,  3 , 4. , 5.+I , 6. , 7., 8., 9. + I};

double complex invA[N*N];

MatrixComplexInverse(invA,A,N);


for(i=0;i<N;i++){
 for(j=0;j<N;j++) printf(" (%f,%f) \t", invA[i*N + j]);

 printf("\n");
}

printf("------------------------------------------------------------\n");
return 0;

}


The make file is

#
CC = gcc

#edit LAPACK_PATH if necessary
LAPACK_PATH = /usr/lib64/atlas

a,out: main.c
$(CC)  main.c  -L$(LAPACK_PATH) -llapack -lblas  -lgfortran -lm

Sunday, April 25, 2010

Complex matrix inverse III: CUDA

Here is cuinverse_kernel.cu

__global__ void cgeMatrixInverse_kernel(cuFloatComplex *invA , cuFloatComplex *A , int N , cuFloatComplex *Work)
{

int i;
int idx = blockDim.x * blockIdx.x + threadIdx.x;

cuFloatComplex * ThreadWorkSpace;

ThreadWorkSpace = Work + idx*cgeMatrixInverse_WorkSpace()*N*N;

for(i=0;i<N*N;i++) A[ i + idx*N*N ] =  A[i];

A[ idx*N*N ] = make_cuFloatComplex( (float) idx , 1./sqrtf(2));

cgeMatrixInverse(invA + idx*N*N , A + idx*N*N , N , ThreadWorkSpace);


}

_______________________

and my makefile is

#  edit the path of your working directory
BASE_PATH = /home/rcabrera/Documents/source/c/inverseCUDA

#

CUDA_PATH = /usr/local/cuda

CC = gcc
NVCC    := $(CUDA_PATH)/bin/nvcc

CUDA_INCLUDE_PATH = /usr/local/cuda/include
CUDA_LIB_PATH := $(CUDA_PATH)/lib64
CUDA_LIBS = -lcuda -lcudart

a.out: main.cu cuinverse_kernel.cu cumatrixtools.h
$(NVCC)  -I$(CUDA_INCLUDE_PATH) -L$(CUDA_LIB_PATH) $(CUDA_LIBS) -lm  main.cu -o a.out

# $(NVCC) -deviceemu  -I$(CUDA_INCLUDE_PATH) -L$(CUDA_LIB_PATH) $(CUDA_LIBS) -lm  main.cu -o a.out

clean :
rm -f a.out

Complex matrix inverse II: CUDA

So, here is cumatrixtools.h

#include<cuComplex.h>
#ifndef _CUMATRIXTOOLS_H_
#define _CUMATRIXTOOLS_H_


////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void
cgeSquareMatrixProduct(cuFloatComplex *out, cuFloatComplex *A, cuFloatComplex *B, int N  )
{

int i,j,k;
cuFloatComplex sum;

for(i=0;i<N;i++)

for(j=0;j<N;j++){
  //sum = 0. + 0.*I;
    sum  = make_cuFloatComplex(0.,0.);

  for(k=0;k<N;k++){
    sum =  cuCaddf(sum  , cuCmulf(A[i*N+k],B[k*N+j]) );
  }

  out[i*N+j] = sum;
}


}
//////////////////////////////////////////////////////////////////////

__device__ __host__ static __inline__ void cgeTranspose(cuFloatComplex  *At, cuFloatComplex  *A, int N)
{

int i,j;
for( i = 0; i<N; i++)

for( j = 0; j<N; j++)
  At[i+j*N] = A[i*N+j] ;
}

//////////////////////////////////////////////////////////////////////////////////////
//                LU decomposition of complex matrices
/////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeDoolittle_LU_Decomposition(cuFloatComplex *LU, cuFloatComplex *A, int n)
{

 int i, j, k, p;
 cuFloatComplex *p_k, *p_row, *p_col;

for(k=0; k<n*n; k++) LU[k]=A[k];


for (k = 0, p_k = LU; k < n; p_k += n, k++) {

    for (j = k; j < n; j++) {

       for (p = 0, p_col = LU; p < k; p_col += n,  p++)

//           *(p_k + j) -= *(p_k + p) * *(p_col + j);
       *(p_k + j) = cuCsubf( *(p_k + j) , cuCmulf( *(p_k + p) , *(p_col + j) ));
    }

    if ( cuCabsf(*(p_k + k)) != 0.0 )  //return -1;


    for (i = k+1, p_row = p_k + n; i < n; p_row += n, i++) {

       for (p = 0, p_col = LU; p < k; p_col += n, p++)

         // *(p_row + k) -= *(p_row + p) * *(p_col + k);
    *(p_row + k) = cuCsubf( *(p_row + k) , cuCmulf(  *(p_row + p) , *(p_col + k) ));
         //*(p_row + k) /= *(p_k + k);


  *(p_row + k) = cuCdivf( *(p_row + k) , *(p_k + k)  );
    }
 }

}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Back substitution for lower triangular matrices assuming that the diagonal is filled with 1's
// Given  T x = b ,
// where T is a lower triangula matrix NxN with 1's in the diagonal
// b is a known vector
// x is an unknown vector
// the routine otputs x = xout
//////////////////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeBackSubstitutionLowerDiagOne(cuFloatComplex *xout, cuFloatComplex *T , cuFloatComplex *b ,int N  )
{

cuFloatComplex bTemp;
int i,j;

for(i=0;i<N;i++){

  bTemp = b[i];
  for(j=0;j<i;j++) bTemp = cuCsubf( bTemp , cuCmulf(T[ i*N + j], xout[j] ) );

  xout[i] = bTemp ;    
}
}

/////////////////////////////////////////////////////////////////////////////////////
// Back substitution for upper triangular matrice
////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeBackSubstitutionUpper(cuFloatComplex *xout, cuFloatComplex *T , cuFloatComplex *b ,int N  )
{

cuFloatComplex bTemp;
int i,j;

for(i=N-1;i>=0;i--){

  bTemp = b[i];
  for(j=i+1;j<N;j++) bTemp = cuCsubf( bTemp , cuCmulf(T[ i*N + j], xout[j] ) );

  xout[i] = cuCdivf( bTemp , T[ i*N + i] );    
}
}

///////////////////////
__device__ __host__ static __inline__ void cgeIdentity(cuFloatComplex *Id, int N)
{

int i,j;
for(i=0;i<N;i++)

for(j=0;j<N;j++){
Id[i*N+j] = make_cuFloatComplex(0.f,0.f);
}


for(i=0;i<N;i++){
  Id[i*N+i] = make_cuFloatComplex(1.f,0.f);
}
}

//////////////////////////////////////////////////////////////////////////////////////
//  Inverse of a matrix using the triangularization of matrices
//  Warning:
//          It does not destroys the original matrix A
//          A work space for 3 complex matrices must be supplied in W
//////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ int cgeMatrixInverse_WorkSpace(){ return 3;}

__device__ __host__ static __inline__ void cgeMatrixInverse(cuFloatComplex *inv , cuFloatComplex *A , int N , cuFloatComplex *W)
{

int i;
cuFloatComplex *Id;
cuFloatComplex *invL, *invU;


Id = W;      //Double purpose work space
  invL = W + N*N;

invU = W + 2*N*N;


cgeIdentity( Id, N);

 
 cgeDoolittle_LU_Decomposition(inv,A,N);


for(i=0;i<N;i++)   cgeBackSubstitutionLowerDiagOne( invL + i*N , inv , Id + i*N ,  N  );


for(i=0;i<N;i++) cgeBackSubstitutionUpper( invU + i*N , inv , Id + i*N ,  N  );


cgeSquareMatrixProduct(Id , invL , invU , N  );


cgeTranspose(inv , Id , N);

}


#endif

Saturday, April 17, 2010

Complex matrix Inverse I: CUDA

Here is a short program in CUDA to invert complex matrices through the LU decomposition (without permutations) with back substitution in __device__ mode. This means that the whole inversion is designed to run in each thread.

This code was adapted from a C code in real double precision found here

The code is standalone in the sense that it only requires the Standard tools provided by CUDA (without CuBlas) and tested in the NVIDIA GeForce 240M
on a Linux Fedora 11.

The program can be organized in three files

cumatrixtools.h: which contains the method itself.
cuinverse_kernel.cu: which contains the kernel function, which maps the method on the threreads.
main.cu: which administers the memory and calls kernel function.

The purpose of this program is to invert several variations of a matrix A. The matrix A is passed to the kernel and the inverse is evaluated after modifying the first element of A. Each branch executes a different matrix. The kernel is called with <<<2,32>>>, which means that the threads are organized in two blocks, each bloch with 32 threads. The more physical processors one has, the more threads per block one could assign in order to gain performance

The whole program is distributed in three posts (I,II and III)

main.cu looks like

#include <stdio.h>
#include <complex.h>

#include<cuComplex.h>

#include"cumatrixtools.h"
#include"cuinverse_kernel.cu"

void cuPrintMatrix( cuFloatComplex  *C ,int N, int M )
{

int i,j;
for(i=0;i<N;i++){

for(j=0;j<M;j++) printf(" (%f,%f)\t ", cuCrealf(C[i*N + j]) , cuCimagf(C[i*N + j]) );

printf(" \n ");
}
}

///////////////////////////////////////////////////////////////////////////////
const int N=3;     //Matrix dimension

const int NBranches = 64;
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){

int i;
float  complex A[9] =  //  Base matrix
    {

0.f+I/sqrtf(2.f), 0.0f+ I/sqrt(2.0f),  0.+ 0.*I,

0. -I/2.     , 0.+I/2       ,   0.+ I/sqrt(2.),
-0.5+0.*I ,  0.5+0.*I  ,  -1./sqrt(2.)+.0*I

};

cuFloatComplex  *h_A, *h_invA;
cuFloatComplex  *d_A, *d_invA,   *d_WorkSpace;


printf("...allocating CPU memory.\n");
h_A =          (cuFloatComplex *)   malloc(                              N*N*sizeof(cuFloatComplex ));
h_invA =       (cuFloatComplex *)   malloc(                    NBranches*N*N*sizeof(cuFloatComplex ));

printf("...allocating GPU memory.\n");

cudaMalloc((void **)&d_A,                                      NBranches*N*N*sizeof(cuFloatComplex ));

cudaMalloc((void **)&d_invA,                                   NBranches*N*N*sizeof(cuFloatComplex ));

cudaMalloc((void **)&d_WorkSpace, NBranches*cgeMatrixInverse_WorkSpace()*N*N*sizeof(cuFloatComplex ));


printf("...Copying memory.\n ");
for(i=0;i<N*N;i++ ) h_A[i] = make_cuFloatComplex( crealf(A[i]) , cimagf(A[i])  );

cudaMemcpy(d_A, h_A, N*N*sizeof(cuFloatComplex) , cudaMemcpyHostToDevice);


printf("...The base matrix is:\n");
cuPrintMatrix( h_A , N, N );


printf("\n...Calling the kernel.\n");

cudaThreadSynchronize();
cgeMatrixInverse_kernel<<<2,32>>>(d_invA, d_A , N ,d_WorkSpace);   // Divinding the 64 branches in 2 blocks of 32 threads

    cudaThreadSynchronize();

cudaMemcpy(h_invA, d_invA, NBranches*N*N*sizeof(float), cudaMemcpyDeviceToHost);


printf("\n The inverse of the first branch is \n");
cuPrintMatrix( h_invA , N, N );


printf("\n The inverse of the second branch is \n");
cuPrintMatrix( h_invA + N*N , N, N );


printf("\n and so on ..\n");

free(h_A);
free(h_invA);


cudaFree(d_A);
cudaFree(d_invA);

cudaThreadExit();
printf("\n-------------------------------------------------------\n");
}