#include<stdio.h>
#include<math.h>
#include<complex.h>
#include <stdlib.h>
//.......................................................................................................
void zgeTranspose( double complex *Transposed, double complex *M ,int n)
{
int i,j;
for(i=0;i<n;i++)
for(j=0;j<n;j++) Transposed[i+n*j] = M[i*n+j];
}
//......................................................................................................
// MatrixComplexEigensystem: computes the eigenvectors and eigenValues of input matrix A
// The eigenvectors are stored in columns
//.....................................................................................................
void MatrixComplexEigensystem( double complex *eigenvectorsVR, double complex *eigenvaluesW, double complex *A, int N)
{
int i;
double complex *AT = (double complex*) malloc( N*N*sizeof(double complex) );
zgeTranspose( AT, A , N);
char JOBVL ='N'; // Compute Right eigenvectors
char JOBVR ='V'; // Do not compute Left eigenvectors
double complex VL[1];
int LDVL = 1;
int LDVR = N;
int LWORK = 4*N;
double complex *WORK = (double complex*)malloc( LWORK*sizeof(double complex));
double complex *RWORK = (double complex*)malloc( 2*N*sizeof(double complex));
int INFO;
zgeev_( &JOBVL, &JOBVR, &N, AT , &N , eigenvaluesW ,
VL, &LDVL,
eigenvectorsVR, &LDVR,
WORK,
&LWORK, RWORK, &INFO );
zgeTranspose( AT, eigenvectorsVR , N);
for(i=0;i<N*N;i++) eigenvectorsVR[i]=AT[i];
free(WORK);
free(RWORK);
free(AT);
}
int main()
{
int i,j;
const int N = 3;
double complex A[] = { 1.+I , 2. , 3 , 4. , 5.+I , 6. , 7., 8., 9. + I};
double complex eigenVectors[N*N];
double complex eigenValues[N];
MatrixComplexEigensystem( eigenVectors, eigenValues, A, N);
printf("\nEigenvectors\n");
for(i=0;i<N;i++){
for(j=0;j<N;j++) printf(" (%f,%f) \t", eigenVectors[i*N + j]);
printf("\n");
}
printf("\nEigenvalues \n");
for(i=0;i<N;i++) printf("\n (%f, %f) \t", eigenValues[i] );
printf("\n------------------------------------------------------------\n");
return 0;
}
Sunday, May 2, 2010
Eigenvalues: C/Lapack
Here is another example on the use of Lapack. This time the objective is to calculate the eigenvectors and eigenvalues of a complex matrix
Complex matrix inverse: C++/Lapack
So, here is an example on how to call lapack from c++
////////////////////=====================////////////////////
and the make file is
#include<iostream>
#include<math.h>
#include<complex>
#include <stdlib.h>
using namespace std;
extern "C" void zgetrf_( int*, int* , complex<double>* , int*, int* , int* );
extern "C" void zgetri_( int*, complex<double>* , int*, int* , complex<double>*, int* , int* );
//........................................................................................
void zgeTranspose( complex<double> *Transposed, complex<double> *M ,int n)
{
int i,j;
for(i=0;i<n;i++)
for(j=0;j<n;j++) Transposed[i+n*j] = M[i*n+j];
}
//.........................................................................................
void MatrixComplexInverse(complex<double> *invA, complex<double> *A, int n)
{
int LWORK=10*n;
int *permutations;
complex<double> *WORK, *tempA;
tempA = new complex<double>[n*n];
permutations = new int[2*n];
WORK = new complex<double>[n*n];
int INFO;
zgeTranspose(tempA,A,n);
zgetrf_( &n, &n, tempA , &n, permutations , &INFO );
if (INFO != 0) {
cout<<"ComplexMatrixInverse: Error at zgetrf \n"; exit(0);
}
zgetri_( &n, tempA , &n, permutations , WORK, &LWORK, &INFO );
if (INFO != 0) {
cout<<"ComplexMatrixInverse: Error at zgetri \n"; exit(0);
}
zgeTranspose(invA,tempA,n);
delete [] WORK;
delete [] tempA;
delete [] permutations;
}
/////////////////////////////////////////////////////////////////////////////////////////
int main()
{
int i,j;
const int N = 3;
complex<double> I(0.,1.);
complex<double> A[] = { 1. + I , 2. , 3 , 4. , 5.+I , 6. , 7., 8., 9. + I};
complex<double> invA[N*N];
MatrixComplexInverse(invA,A,N);
for(i=0;i<N;i++){
for(j=0;j<N;j++) cout << invA[i*N + j]<<"\t";
cout<<"\n";
}
cout<<"---------------------------\n";
return 0;
}
////////////////////=====================////////////////////
and the make file is
#
CC = c++
#edit LAPACK_PATH if necessary
LAPACK_PATH = /usr/lib64/atlas
a,out: main.cpp
$(CC) main.cpp -L$(LAPACK_PATH) -llapack -lblas -lgfortran -lm
Wednesday, April 28, 2010
My basic rules of programming
Here I am listing some general rules of programming that I learned to follow.
Some of them are enforced by certain languages or programming environments but some are very permissive and require to be more careful. These rules are already well known but here they are again
Some of them are enforced by certain languages or programming environments but some are very permissive and require to be more careful. These rules are already well known but here they are again
- Define very well the input from the output arguments of a function. The input arguments should remain unmodified in the routine.
- Never write a routine that cannot fit in one page of computer screen in the worst case.
- Develop and validate a program in reasonable small steps. It may take longer to write the code but it will take less time to debug.
- Use a maximum of two nested loops in a certain routine.
- Use meaningful names for the variables even if they become long.
- Do not repeat the same task in different places.
Monday, April 26, 2010
Complex matrix inverse: C/Lapack
Here a simple example on how to use Lapack within C to invert matrices using the LU decomposition.
The C++ code is different and I will post that another day
[The html code for this post was generated by http://www.bedaux.net/cpp2html/]
The C++ code is different and I will post that another day
[The html code for this post was generated by http://www.bedaux.net/cpp2html/]
#include<stdio.h>
#include<math.h>
#include<complex.h>
#include <stdlib.h>
//........................................................................................
void zgeTranspose( double complex *Transposed, double complex *M ,int n)
{
int i,j;
for(i=0;i<n;i++)
for(j=0;j<n;j++) Transposed[i+n*j] = M[i*n+j];
}
//.........................................................................................
void MatrixComplexInverse(double complex *invA, double complex *A, int n)
{
int LWORK=10*n;
int *permutations;
double complex *WORK, *tempA;
tempA = (double complex*) malloc( n*n*sizeof(double complex) );
permutations = (int*) malloc( 2*n*sizeof(int) );
WORK = (double complex *)malloc(LWORK*sizeof(double complex));
int INFO;
zgeTranspose(tempA,A,n);
zgetrf_( &n, &n, tempA , &n, permutations , &INFO );
if (INFO != 0) {
printf("ComplexMatrixInverse: Error at zgetrf \n"); exit(0);
}
zgetri_( &n, tempA , &n, permutations , WORK, &LWORK, &INFO );
if (INFO != 0) {
printf("ComplexMatrixInverse: Error at zgetri \n"); exit(0);
}
zgeTranspose(invA,tempA,n);
free(WORK);
free(tempA);
free(permutations);
}
/////////////////////////////////////////////////////////////////////////////////////////
int main()
{
int i,j;
const int N = 3;
double complex A[] = { 1.+I , 2. , 3 , 4. , 5.+I , 6. , 7., 8., 9. + I};
double complex invA[N*N];
MatrixComplexInverse(invA,A,N);
for(i=0;i<N;i++){
for(j=0;j<N;j++) printf(" (%f,%f) \t", invA[i*N + j]);
printf("\n");
}
printf("------------------------------------------------------------\n");
return 0;
}
The make file is#
CC = gcc
#edit LAPACK_PATH if necessary
LAPACK_PATH = /usr/lib64/atlas
a,out: main.c
$(CC) main.c -L$(LAPACK_PATH) -llapack -lblas -lgfortran -lm
Sunday, April 25, 2010
Complex matrix inverse III: CUDA
Here is cuinverse_kernel.cu
_______________________
and my makefile is
__global__ void cgeMatrixInverse_kernel(cuFloatComplex *invA , cuFloatComplex *A , int N , cuFloatComplex *Work)
{
int i;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
cuFloatComplex * ThreadWorkSpace;
ThreadWorkSpace = Work + idx*cgeMatrixInverse_WorkSpace()*N*N;
for(i=0;i<N*N;i++) A[ i + idx*N*N ] = A[i];
A[ idx*N*N ] = make_cuFloatComplex( (float) idx , 1./sqrtf(2));
cgeMatrixInverse(invA + idx*N*N , A + idx*N*N , N , ThreadWorkSpace);
}
_______________________
and my makefile is
# edit the path of your working directory
BASE_PATH = /home/rcabrera/Documents/source/c/inverseCUDA
#
CUDA_PATH = /usr/local/cuda
CC = gcc
NVCC := $(CUDA_PATH)/bin/nvcc
CUDA_INCLUDE_PATH = /usr/local/cuda/include
CUDA_LIB_PATH := $(CUDA_PATH)/lib64
CUDA_LIBS = -lcuda -lcudart
a.out: main.cu cuinverse_kernel.cu cumatrixtools.h
$(NVCC) -I$(CUDA_INCLUDE_PATH) -L$(CUDA_LIB_PATH) $(CUDA_LIBS) -lm main.cu -o a.out
# $(NVCC) -deviceemu -I$(CUDA_INCLUDE_PATH) -L$(CUDA_LIB_PATH) $(CUDA_LIBS) -lm main.cu -o a.out
clean :
rm -f a.out
Complex matrix inverse II: CUDA
So, here is cumatrixtools.h
#include<cuComplex.h>
#ifndef _CUMATRIXTOOLS_H_
#define _CUMATRIXTOOLS_H_
////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void
cgeSquareMatrixProduct(cuFloatComplex *out, cuFloatComplex *A, cuFloatComplex *B, int N )
{
int i,j,k;
cuFloatComplex sum;
for(i=0;i<N;i++)
for(j=0;j<N;j++){
//sum = 0. + 0.*I;
sum = make_cuFloatComplex(0.,0.);
for(k=0;k<N;k++){
sum = cuCaddf(sum , cuCmulf(A[i*N+k],B[k*N+j]) );
}
out[i*N+j] = sum;
}
}
//////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeTranspose(cuFloatComplex *At, cuFloatComplex *A, int N)
{
int i,j;
for( i = 0; i<N; i++)
for( j = 0; j<N; j++)
At[i+j*N] = A[i*N+j] ;
}
//////////////////////////////////////////////////////////////////////////////////////
// LU decomposition of complex matrices
/////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeDoolittle_LU_Decomposition(cuFloatComplex *LU, cuFloatComplex *A, int n)
{
int i, j, k, p;
cuFloatComplex *p_k, *p_row, *p_col;
for(k=0; k<n*n; k++) LU[k]=A[k];
for (k = 0, p_k = LU; k < n; p_k += n, k++) {
for (j = k; j < n; j++) {
for (p = 0, p_col = LU; p < k; p_col += n, p++)
// *(p_k + j) -= *(p_k + p) * *(p_col + j);
*(p_k + j) = cuCsubf( *(p_k + j) , cuCmulf( *(p_k + p) , *(p_col + j) ));
}
if ( cuCabsf(*(p_k + k)) != 0.0 ) //return -1;
for (i = k+1, p_row = p_k + n; i < n; p_row += n, i++) {
for (p = 0, p_col = LU; p < k; p_col += n, p++)
// *(p_row + k) -= *(p_row + p) * *(p_col + k);
*(p_row + k) = cuCsubf( *(p_row + k) , cuCmulf( *(p_row + p) , *(p_col + k) ));
//*(p_row + k) /= *(p_k + k);
*(p_row + k) = cuCdivf( *(p_row + k) , *(p_k + k) );
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Back substitution for lower triangular matrices assuming that the diagonal is filled with 1's
// Given T x = b ,
// where T is a lower triangula matrix NxN with 1's in the diagonal
// b is a known vector
// x is an unknown vector
// the routine otputs x = xout
//////////////////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeBackSubstitutionLowerDiagOne(cuFloatComplex *xout, cuFloatComplex *T , cuFloatComplex *b ,int N )
{
cuFloatComplex bTemp;
int i,j;
for(i=0;i<N;i++){
bTemp = b[i];
for(j=0;j<i;j++) bTemp = cuCsubf( bTemp , cuCmulf(T[ i*N + j], xout[j] ) );
xout[i] = bTemp ;
}
}
/////////////////////////////////////////////////////////////////////////////////////
// Back substitution for upper triangular matrice
////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ void cgeBackSubstitutionUpper(cuFloatComplex *xout, cuFloatComplex *T , cuFloatComplex *b ,int N )
{
cuFloatComplex bTemp;
int i,j;
for(i=N-1;i>=0;i--){
bTemp = b[i];
for(j=i+1;j<N;j++) bTemp = cuCsubf( bTemp , cuCmulf(T[ i*N + j], xout[j] ) );
xout[i] = cuCdivf( bTemp , T[ i*N + i] );
}
}
///////////////////////
__device__ __host__ static __inline__ void cgeIdentity(cuFloatComplex *Id, int N)
{
int i,j;
for(i=0;i<N;i++)
for(j=0;j<N;j++){
Id[i*N+j] = make_cuFloatComplex(0.f,0.f);
}
for(i=0;i<N;i++){
Id[i*N+i] = make_cuFloatComplex(1.f,0.f);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Inverse of a matrix using the triangularization of matrices
// Warning:
// It does not destroys the original matrix A
// A work space for 3 complex matrices must be supplied in W
//////////////////////////////////////////////////////////////////////////////////////
__device__ __host__ static __inline__ int cgeMatrixInverse_WorkSpace(){ return 3;}
__device__ __host__ static __inline__ void cgeMatrixInverse(cuFloatComplex *inv , cuFloatComplex *A , int N , cuFloatComplex *W)
{
int i;
cuFloatComplex *Id;
cuFloatComplex *invL, *invU;
Id = W; //Double purpose work space
invL = W + N*N;
invU = W + 2*N*N;
cgeIdentity( Id, N);
cgeDoolittle_LU_Decomposition(inv,A,N);
for(i=0;i<N;i++) cgeBackSubstitutionLowerDiagOne( invL + i*N , inv , Id + i*N , N );
for(i=0;i<N;i++) cgeBackSubstitutionUpper( invU + i*N , inv , Id + i*N , N );
cgeSquareMatrixProduct(Id , invL , invU , N );
cgeTranspose(inv , Id , N);
}
#endif
Saturday, April 17, 2010
Complex matrix Inverse I: CUDA
Here is a short program in CUDA to invert complex matrices through the LU decomposition (without permutations) with back substitution in __device__ mode. This means that the whole inversion is designed to run in each thread.
This code was adapted from a C code in real double precision found here
The code is standalone in the sense that it only requires the Standard tools provided by CUDA (without CuBlas) and tested in the NVIDIA GeForce 240M
on a Linux Fedora 11.
The program can be organized in three files
The whole program is distributed in three posts (I,II and III)
main.cu looks like
This code was adapted from a C code in real double precision found here
The code is standalone in the sense that it only requires the Standard tools provided by CUDA (without CuBlas) and tested in the NVIDIA GeForce 240M
on a Linux Fedora 11.
The program can be organized in three files
- cumatrixtools.h: which contains the method itself.
- cuinverse_kernel.cu: which contains the kernel function, which maps the method on the threreads.
- main.cu: which administers the memory and calls kernel function.
The whole program is distributed in three posts (I,II and III)
main.cu looks like
#include <stdio.h> #include <complex.h> #include<cuComplex.h> #include"cumatrixtools.h" #include"cuinverse_kernel.cu" void cuPrintMatrix( cuFloatComplex *C ,int N, int M ) { int i,j; for(i=0;i<N;i++){ for(j=0;j<M;j++) printf(" (%f,%f)\t ", cuCrealf(C[i*N + j]) , cuCimagf(C[i*N + j]) ); printf(" \n "); } } /////////////////////////////////////////////////////////////////////////////// const int N=3; //Matrix dimension const int NBranches = 64; /////////////////////////////////////////////////////////////////////////////// // Main program /////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ int i; float complex A[9] = // Base matrix { 0.f+I/sqrtf(2.f), 0.0f+ I/sqrt(2.0f), 0.+ 0.*I, 0. -I/2. , 0.+I/2 , 0.+ I/sqrt(2.), -0.5+0.*I , 0.5+0.*I , -1./sqrt(2.)+.0*I }; cuFloatComplex *h_A, *h_invA; cuFloatComplex *d_A, *d_invA, *d_WorkSpace; printf("...allocating CPU memory.\n"); h_A = (cuFloatComplex *) malloc( N*N*sizeof(cuFloatComplex )); h_invA = (cuFloatComplex *) malloc( NBranches*N*N*sizeof(cuFloatComplex )); printf("...allocating GPU memory.\n"); cudaMalloc((void **)&d_A, NBranches*N*N*sizeof(cuFloatComplex )); cudaMalloc((void **)&d_invA, NBranches*N*N*sizeof(cuFloatComplex )); cudaMalloc((void **)&d_WorkSpace, NBranches*cgeMatrixInverse_WorkSpace()*N*N*sizeof(cuFloatComplex )); printf("...Copying memory.\n "); for(i=0;i<N*N;i++ ) h_A[i] = make_cuFloatComplex( crealf(A[i]) , cimagf(A[i]) ); cudaMemcpy(d_A, h_A, N*N*sizeof(cuFloatComplex) , cudaMemcpyHostToDevice); printf("...The base matrix is:\n"); cuPrintMatrix( h_A , N, N ); printf("\n...Calling the kernel.\n"); cudaThreadSynchronize(); cgeMatrixInverse_kernel<<<2,32>>>(d_invA, d_A , N ,d_WorkSpace); // Divinding the 64 branches in 2 blocks of 32 threads cudaThreadSynchronize(); cudaMemcpy(h_invA, d_invA, NBranches*N*N*sizeof(float), cudaMemcpyDeviceToHost); printf("\n The inverse of the first branch is \n"); cuPrintMatrix( h_invA , N, N ); printf("\n The inverse of the second branch is \n"); cuPrintMatrix( h_invA + N*N , N, N ); printf("\n and so on ..\n"); free(h_A); free(h_invA); cudaFree(d_A); cudaFree(d_invA); cudaThreadExit(); printf("\n-------------------------------------------------------\n"); }
Subscribe to:
Posts (Atom)