! Matrix-Matrix Multiply, AB = C using PESSL (ScaLAPACK)
! filename: ex5.f
! to use IBM PESSL library
! compile: mpxlf90 -o ex5 -lblacs -lpessl ex5.f
! run: poe+ ./ex5 -nodes 1 -procs 4
! to use ScaLAPACK library
! compile: module load scalapack
! mpxlf -o ex5 -qfree $PBLAS $BLACS $SCALAPACK -lessl ex5.f
! to use ScaLAPACK and VAMPIR (which does not work with PESSL)
! compile: module load scalapack
! module load vampir vampirtrace
! mpxlf -o ex5 -qfree $PBLAS $BLACS $SCALAPACK -lessl $VTINC $VTLIB ex5.f
! prow number of rows in proc grid set to 2
! pcol number of columns in proc grid set to 2
! n number of rows/columns in matrix A set to 4000
! nb matrix distribution block size set to 16
! oputput: stdout
implicit none
integer :: n=4000, nb=16 ! problem size and block size
! integer :: myunit ! local output unit number for debug output
integer :: myArows, myAcols ! size of local subset of global array
integer :: i,j, igrid,jgrid, iproc,jproc, myi,myj, p
real*8, dimension(:,:), allocatable :: myA,myB,myC
integer :: numroc ! blacs routine
integer :: me, procs, icontxt, prow=2, pcol=2, myrow, mycol ! blacs data
integer :: info ! pessl return value
integer, dimension(9) :: ides_a, ides_b, ides_c ! pessl array desc
! Initialize blacs processor grid
call blacs_pinfo (me,procs)
call blacs_get (0, 0, icontxt)
call blacs_gridinit(icontxt, 'R', prow, pcol)
call blacs_gridinfo(icontxt, prow, pcol, myrow, mycol)
! myunit = 10+me
! write(myunit,*)"--------"
! write(myunit,*)"Output for processor ",me," to unit ",myunit
! write(myunit,*)"Proc ",me,": myrow, mycol in p-array is ", &
! myrow, mycol
! Construct local arrays
! Global structure: matrix A of n rows and n columns
! matrix B of n rows and n column
! matrix C of n rows and n column
myArows = numroc(n, nb, myrow, 0, prow)
myAcols = numroc(n, nb, mycol, 0, pcol)
! write(myunit,*)"Size of global array is ",n," x ",n
! write(myunit,*)"Size of block is ",nb," x ",nb
! write(myunit,*)"Size of local array is ",myArows," x ",myAcols
! Initialize local arrays
allocate(myA(myArows,myAcols))
allocate(myB(myArows,myAcols))
allocate(myC(myArows,myAcols))
do i=1,n
call g2l(i,n,prow,nb,iproc,myi)
if (myrow==iproc) then
do j=1,n
call g2l(j,n,pcol,nb,jproc,myj)
if (mycol==jproc) then
myA(myi,myj) = real(i+j)
myB(myi,myj) = real(i-j)
myC(myi,myj) = 0.d0
! write(*,*)"A(",i,",",j,")", &
! " --> myA(",myi,",",myj,")=",myA(myi,myj), &
! "on proc(",iproc,",",jproc,")"
endif
enddo
endif
enddo
! Prepare array descriptors for PESSL (ScaLAPACK style)
ides_a(1) = 1 ! descriptor type
ides_a(2) = icontxt ! blacs context
ides_a(3) = n ! global number of rows
ides_a(4) = n ! global number of columns
ides_a(5) = nb ! row block size
ides_a(6) = nb ! column block size
ides_a(7) = 0 ! initial process row
ides_a(8) = 0 ! initial process column
ides_a(9) = myArows ! leading dimension of local array
do i=1,9
ides_b(i) = ides_a(i)
ides_c(i) = ides_a(i)
enddo
! Call PESSL library routine
call pdgemm('T','T',n,n,n,1.0d0, myA,1,1,ides_a, &
myB,1,1,ides_b,0.d0, &
myC,1,1,ides_c )
! Print results
call g2l(n,n,prow,nb,iproc,myi)
call g2l(n,n,pcol,nb,jproc,myj)
if ((myrow==iproc) .and. (mycol==jproc)) &
write(*,*) 'c(',n,n,')=',myC(myi,myj)
! Deallocate the local arrays
deallocate(myA, myB, myC)
! End blacs for processors that are used
call blacs_gridexit(icontxt)
call blacs_exit(0)
stop
end
! convert global index to local index in block-cyclic distribution
subroutine g2l(i,n,np,nb,p,il)
implicit none
integer :: i ! global array index, input
integer :: n ! global array dimension, input
integer :: np ! processor array dimension, input
integer :: nb ! block size, input
integer :: p ! processor array index, output
integer :: il ! local array index, output
integer :: im1
im1 = i-1
p = mod((im1/nb),np)
il = (im1/(np*nb))*nb + mod(im1,nb) + 1
return
end
! convert local index to global index in block-cyclic distribution
subroutine l2g(il,p,n,np,nb,i)
implicit none
integer :: il ! local array index, input
integer :: p ! processor array index, input
integer :: n ! global array dimension, input
integer :: np ! processor array dimension, input
integer :: nb ! block size, input
integer :: i ! global array index, output
integer :: ilm1
ilm1 = il-1
i = (((ilm1/nb) * np) + p)*nb + mod(ilm1,nb) + 1
return
end
|