首页 文章

直接在Scipy稀疏矩阵上使用Intel mkl库来计算A点A.T,内存较少

提问于
浏览
0

我想从python中调用mkl.mkl_scsrmultcsr . 目标是以compressed sparse row格式计算稀疏矩阵C.稀疏矩阵C是A与A的转置之间的矩阵乘积,其中A也是csr格式的稀疏矩阵 . 当用scipy计算C = A点(AT)时,scipy似乎(?)分配新的内存用于保持A(AT)的转置,并且肯定为新的C矩阵分配内存(这意味着我不能使用现有的C矩阵) . 所以,我想尝试直接使用mkl c函数来减少内存使用量 .

Here是一个适用于另一个mkl函数的答案 . 在那个答案中,mkl函数速度提高了4倍 .

以下版本在处理4天后终于可以使用了 . 我想知道我是否浪费了任何记忆 . ctype会制作numpy数组的任何副本吗?是否必须在测试功能中进行csr-> csc转换? intel c函数可以计算(A.T)点A或A点A,但不能计算A点(A.T) .

再次感谢 .

from ctypes import *
import scipy.sparse as spsp
import numpy as np
import multiprocessing as mp
# June 2nd 2016 version.

# Load the share library
mkl = cdll.LoadLibrary("libmkl_rt.so")

def get_csr_handle(A,clear=False):
    if clear == True:
        A.indptr[:] = 0
        A.indices[:] = 0
        A.data[:] = 0

    a_pointer   = A.data.ctypes.data_as(POINTER(c_float))
     # Array containing non-zero elements of the matrix A.
     # This corresponds to data array of csr_matrix
     # Its length is equal to #non zero elements in A
     # (Can this be longer than actual #non-zero elements?)
    assert A.data.ctypes.data % 16 == 0 # Check alignment

    ja_pointer  = A.indices.ctypes.data_as(POINTER(c_int))
     # Array of column indices of all non-zero elements of A.
     # This corresponds to the indices array of csr_matrix
    assert A.indices.ctypes.data % 16 == 0 # Check alignment

    ia_pointer  = A.indptr.ctypes.data_as(POINTER(c_int))
     # Array of length m+1.
     # a[ia[i]:ia[i+1]] is the value of nonzero entries of
     # the ith row of A.
     # ja[ia[i]:ia[i+1]] is the column indices of nonzero
     # entries of the ith row of A
     # This corresponds to the indptr array of csr_matrix
    assert A.indptr.ctypes.data % 16 == 0 # Check alignment

    A_data_size    = A.data.size
    A_indices_size = A.indices.size
    A_indptr_size  = A.indptr.size

    return (a_pointer, ja_pointer, ia_pointer, A)

def csr_dot_csr_t(A_handle, C_handle, nz=None):
    # Calculate (A.T).dot(A) and put result into C
    #
    # This uses one-based indexing
    #
    # Both C.data and A.data must be in np.float32 type.
    #
    # Number of nonzero elements in C must be greater than
    #     or equal to the size of C.data
    #
    # size of C.indptr must be greater than or equal to
    #     1 + (num rows of A).
    #
    # C_data    = np.zeros((nz), dtype=np.single)
    # C_indices = np.zeros((nz), dtype=np.int32)
    # C_indptr  = np.zeros((m+1),dtype=np.int32)

    #assert len(c_pointer._obj) >= 1 + A_shape[0]


    (a_pointer, ja_pointer, ia_pointer, A) = A_handle
    (c_pointer, jc_pointer, ic_pointer, C) = C_handle
    #print "CCC",C
    #assert type(C.data[0]) == np.float32
    #assert type(A.data[0]) == np.float32
    #assert C.indptr.size >= A.shape[0] + 1

    #CC = A.dot(A.T)
    #assert C.data.size >= nz
    #assert C.indices.size >= nz

    trans_pointer   = byref(c_char('T'))
    sort_pointer    = byref(c_int(0))

    (m, n)          = A.shape
    sort_pointer        = byref(c_int(0))
    m_pointer           = byref(c_int(m))     # Number of rows of matrix A
    n_pointer           = byref(c_int(n))     # Number of columns of matrix A
    k_pointer           = byref(c_int(n))     # Number of columns of matrix B
                                              # should be n when trans='T'
                          # Otherwise, I guess should be m
    ###
    b_pointer   = a_pointer
    jb_pointer  = ja_pointer
    ib_pointer  = ia_pointer
    ###
    if nz == None:
        nz = n*n #*n # m*m # Number of nonzero elements expected
             # probably can use lower value for sparse
             # matrices.
    nzmax_pointer   = byref(c_int(nz))
     # length of arrays c and jc. (which are data and
     # indices of csr_matrix). So this is the number of
     # nonzero elements of matrix C
     #
     # This parameter is used only if request=0.
     # The routine stops calculation if the number of
     # elements in the result matrix C exceeds the
     # specified value of nzmax.

    info = c_int(-3)
    info_pointer = byref(info)
    request_pointer_list = [byref(c_int(0)), byref(c_int(1)), byref(c_int(2))]
    return_list = []
    for ii in [0]:
        request_pointer = request_pointer_list[ii]
        ret = mkl.mkl_scsrmultcsr(trans_pointer, request_pointer, sort_pointer,
                    m_pointer, n_pointer, k_pointer,
                    a_pointer, ja_pointer, ia_pointer,
                    b_pointer, jb_pointer, ib_pointer,
                    c_pointer, jc_pointer, ic_pointer,
                    nzmax_pointer, info_pointer)
        info_val = info.value
        return_list += [ (ret,info_val) ]
    return return_list


def show_csr_internal(A, indent=4):
    # Print data, indptr, and indices
    # of a scipy csr_matrix A
    name = ['data', 'indptr', 'indices']
    mat  = [A.data, A.indptr, A.indices]
    for i in range(3):
        str_print = ' '*indent+name[i]+':\n%s'%mat[i]
        str_print = str_print.replace('\n', '\n'+' '*indent*2)
        print(str_print)

def fix_for_scipy(C,A):
    n = A.shape[1]
    print "fix n", n
    nz =  C.indptr[n] - 1 # -1 as this is still one based indexing.
    print "fix nz", nz
    data = C.data[:nz]

    C.indptr[:n+1] -= 1
    indptr = C.indptr[:n+1]
    C.indices[:nz] -= 1
    indices = C.indices[:nz]
    return spsp.csr_matrix( (data, indices, indptr), shape=(n,n))

def test():
    AA= [[1,0,0,1],
         [1,0,1,0],
         [0,0,1,0]]
    AA = np.random.choice([0,1], size=(3,750000), replace=True, p=[0.99,0.01])
    A_original = spsp.csr_matrix(AA)
    #A = spsp.csr_matrix(A_original, dtype=np.float32)
    A = A_original.astype(np.float32).tocsc()
    #A_original = A.todense()
    A = spsp.csr_matrix( (A.data, A.indices, A.indptr) )
    print  "A:"
    show_csr_internal(A)
    print A.todense()
    A.indptr  += 1 # convert to 1-based indexing
    A.indices += 1 # convert to 1-based indexing
    A_ptrs = get_csr_handle(A)

    C = spsp.csr_matrix( np.ones((3,3)), dtype=np.float32)
    #C.data = C.data[:16].view()
    #C.indptr = C.indptr
    C_ptrs = get_csr_handle(C, clear=True)

    print "C:"
    show_csr_internal(C)    
    print "=call mkl function=" 
    return_list= csr_dot_csr_t(A_ptrs, C_ptrs)
    print "(ret, info):", return_list
    print "C after calling mkl:"
    show_csr_internal(C)    

    C_fix = fix_for_scipy(C,A)
    print "C_fix for scipy:"
    show_csr_internal(C_fix)
    print C_fix.todense()

    print "Original C after fixing:"
    show_csr_internal(C)

    print "scipy's (A).dot(A.T)"
    scipy_ans = (A_original).dot(A_original.T)
    #scipy_ans = spsp.csr_matrix(scipy_ans)
    show_csr_internal(scipy_ans)
    print scipy_ans.todense()

if __name__ == "__main__":
    test()

结果:

A:
    data:
        [ 1.  1.  1. ...,  1.  1.  1.]
    indptr:
        [    0     0     0 ..., 22673 22673 22673]
    indices:
        [1 0 2 ..., 2 1 2]
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 ..., 
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
C:
    data:
        [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
    indptr:
        [0 0 0 0]
    indices:
        [0 0 0 0 0 0 0 0 0]
=call mkl function=
(ret, info): [(2, 0)]
C after calling mkl:
    data:
        [ 7576.    77.    83.    77.  7607.   104.    83.   104.  7490.]
    indptr:
        [ 1  4  7 10]
    indices:
        [1 2 3 1 2 3 1 2 3]
fix n 3
fix nz 9
C_fix for scipy:
    data:
        [ 7576.    77.    83.    77.  7607.   104.    83.   104.  7490.]
    indptr:
        [0 3 6 9]
    indices:
        [0 1 2 0 1 2 0 1 2]
[[ 7576.    77.    83.]
 [   77.  7607.   104.]
 [   83.   104.  7490.]]
Original C after fixing:
    data:
        [ 7576.    77.    83.    77.  7607.   104.    83.   104.  7490.]
    indptr:
        [0 3 6 9]
    indices:
        [0 1 2 0 1 2 0 1 2]
scipy's (A.T).dot(A)
    data:
        [  83   77 7576  104   77 7607   83  104 7490]
    indptr:
        [0 3 6 9]
    indices:
        [2 1 0 2 0 1 0 1 2]
[[7576   77   83]
 [  77 7607  104]
 [  83  104 7490]]

学到的东西:

对于矩阵A,B和C,

  • index从1开始 .

  • 我在原始代码中混淆了c_indptr和c_indices . 应该是scipy csr_matrix的ia = indptr . ja = scipy csr_matrix的索引 .

  • 来自代码here . 一切都作为指针传递给mkl_?csrmultcsr . mkl_scsrmultcsr(&ta, &r[1], &sort, &m, &m, &m, a, ja, ia, a, ja, ia, c, jc, ic, &nzmax, &info);

  • 我想要一个可以使用从零开始的索引的mkl函数 . mkl.mkl_scsrmultcsr函数只能用于基于一的索引 . (或者我可以对所有内容进行基于索引的索引 . 这意味着对于大多数线性代数步骤,使用intel c函数而不是scipy / numpy . )

1 回答

  • 1

    查看scipy稀疏产品的Python代码 . 请注意,它在2次传递中调用已编译的代码 .

    看起来mkl代码做同样的事情

    https://software.intel.com/en-us/node/468640

    如果request = 1,则例程仅计算长度为m 1的数组ic的值,必须事先分配该数组的内存 . 退出时,值ic(m 1) - 1是数组c和jc中元素的实际数量 . 如果request = 2,则先前使用参数request = 1调用例程,输出数组jc和c在调用程序中分配,并且它们的长度至少为ic(m 1) - 1 .

    您首先根据 C 的行数分配了 ic (您知道来自输入),并使用 request=1 调用mkl代码 .

    对于 request=2 ,您必须根据 ic(m+1) - 1 中的大小分配 cjc 数组 . 这与输入数组中的 nnz 的数量不同 .

    您正在使用 request1 = c_int(0) ,这要求 c 数组的大小正确 - 如果不实际执行 dot (或 request 1 ),您将无法知道 .

    ==================

    File:        /usr/lib/python3/dist-packages/scipy/sparse/compressed.py
    Definition:  A._mul_sparse_matrix(self, other)
    

    pass 1分配 indptr (注意大小),并传递指针(数据在此传递时无关紧要)

    indptr = np.empty(major_axis + 1, dtype=idx_dtype)
    
        fn = getattr(_sparsetools, self.format + '_matmat_pass1')
        fn(M, N,
           np.asarray(self.indptr, dtype=idx_dtype),
           np.asarray(self.indices, dtype=idx_dtype),
           np.asarray(other.indptr, dtype=idx_dtype),
           np.asarray(other.indices, dtype=idx_dtype),
           indptr)
    
        nnz = indptr[-1]
    

    pass 2分配 indptr (不同大小),并基于 nnz indicesdata .

    indptr = np.asarray(indptr, dtype=idx_dtype)
        indices = np.empty(nnz, dtype=idx_dtype)
        data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
    
        fn = getattr(_sparsetools, self.format + '_matmat_pass2')
        fn(M, N, np.asarray(self.indptr, dtype=idx_dtype),
           np.asarray(self.indices, dtype=idx_dtype),
           self.data,
           np.asarray(other.indptr, dtype=idx_dtype),
           np.asarray(other.indices, dtype=idx_dtype),
           other.data,
           indptr, indices, data)
    

    最后使用这些数组创建一个新数组 .

    return self.__class__((data,indices,indptr),shape=(M,N))
    

    应以相同的方式使用 mkl 库 .

    ===================

    https://github.com/scipy/scipy/blob/master/scipy/sparse/sparsetools/csr.h

    csr_matmat_pass1csr_matmat_pass2 的c代码

    ====================

    如果它有帮助,这里是这些传递的纯Python实现 . 没有任何尝试利用任何数组操作的字面翻译 .

    def pass1(A, B):
        nrow,ncol=A.shape
        Aptr=A.indptr
        Bptr=B.indptr
        Cp=np.zeros(nrow+1,int)
        mask=np.zeros(ncol,int)-1
        nnz=0
        for i in range(nrow):
            row_nnz=0
            for jj in range(Aptr[i],Aptr[i+1]):
                j=A.indices[jj]
                for kk in range(Bptr[j],Bptr[j+1]):
                    k=B.indices[kk]
                    if mask[k]!=i:
                        mask[k]=i
                        row_nnz += 1
            nnz += row_nnz
            Cp[i+1]=nnz
        return Cp
    
    def pass2(A,B,Cnnz):
        nrow,ncol=A.shape
        Ap,Aj,Ax=A.indptr,A.indices,A.data
        Bp,Bj,Bx=B.indptr,B.indices,B.data
    
        next = np.zeros(ncol,int)-1
        sums = np.zeros(ncol,A.dtype)
    
        Cp=np.zeros(nrow+1,int)
        Cj=np.zeros(Cnnz,int)
        Cx=np.zeros(Cnnz,A.dtype)
        nnz = 0
        for i in range(nrow):
            head = -2
            length = 0
            for jj in range(Ap[i], Ap[i+1]):
                j, v = Aj[jj], Ax[jj]
                for kk in range(Bp[j], Bp[j+1]):
                    k = Bj[kk]
                    sums[k] += v*Bx[kk]
                    if (next[k]==-1):
                        next[k], head=head, k
                        length += 1
            print(i,sums, next)
            for _ in range(length):
                if sums[head] !=0:
                    Cj[nnz], Cx[nnz] = head, sums[head]
                    nnz += 1
                temp = head
                head = next[head]
                next[temp], sums[temp] = -1, 0
            Cp[i+1] = nnz            
        return Cp, Cj, Cx                  
    
    def pass0(A,B):
        Cp = pass1(A,B)
        nnz = Cp[-1]
        Cp,Cj,Cx=pass2(A,B,nnz)
        N,M=A.shape[0], B.shape[1]
        C=sparse.csr_matrix((Cx, Cj, Cp), shape=(N,M))
        return C
    

    AB 都必须是 csr . 它使用转置它需要转换,例如 B = A.T.tocsr() .

相关问题