diffMap/diffuse.py at master · wjawaid/diffMap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import numpy as np
from scipy.sparse.linalg import eigsh


def applyGaussianKernel(d2, rsigmas, csigmas=None):
    """Applies Gaussian Kernel given sigmas

    Given sigmas this function will return a Gaussian kernel.

    Keyword arguments:
    rsigmas -- refers to row sigmas and is symmetric when calculating
    the diffusion map.
    csigmas -- is optional and only required when projecting on a
    diffusion map.
    """
    if len(rsigmas) != d2.shape[0]:
      raise ValueError('Length of rsigmas does not match d2 dimensions')
    if csigmas is None:
      sigmaMat = np.matrix(rsigmas)
      sigmaMat = sigmaMat.transpose() * sigmaMat
      sigmaSqd = np.power(rsigmas, 2)
      sigmaSum = np.matrix(sigmaSqd)
      sigmaSum = sigmaSum + sigmaSum.transpose()
    else:
      if len(csigmas) != d2.shape[1]:
        raise ValueError('Length of csigmas does not match d2 dimensions')
      sigmaMat = np.matrix(rsigmas).transpose() * np.matrix(csigmas)
      rsigmaSqd = np.power(rsigmas, 2)
      csigmaSqd = np.power(csigmas, 2)
      sigmaSum = np.matrix(rsigmaSqd).transpose() + np.matrix(csigmaSqd)
    W = np.multiply(np.sqrt(2 * sigmaMat / sigmaSum), np.exp(-d2 / sigmaSum))
    return(W)


def diffuse(data, ndims=20, nsig=5, sigmas=0):
    """Diffusion map function

    Calculates diffusion map dimensionality reduction as described by
    Laleh Haghverdi et al.

    data --  matrix of values with cell in columns and genes in rows
    ndims -- number of dimensions to return (default 20)
    nsig --  number of neighbours when calculating sigmas (default 5)
    sigmas -- Give your own sigma(s). If 0 they will be automatically
              calculated (default 0)

    Returns a list of: eigenvalues, eigenvectors, sigmas, gaussian and
                       redundant first vector
    """
    # d2 = sp.spatial.distance.pdist(data.transpose(), 'cosine')
    # d2 = sp.spatial.distance.squareform(d2)
    d2 = np.power(1 - np.corrcoef(data.T), 2)
    np.fill_diagonal(d2, 0)
    if sigmas == 0:
      sigmas = [np.sqrt(np.sort(d2[i, :])[nsig-1])/2 for i in xrange(0, d2.shape[0])]
      W = applyGaussianKernel(d2, sigmas)
    elif (isinstance(sigmas, float) or isinstance(sigmas, int)):
      W = np.exp(-d2 / (2 * sigmas**2))
    np.fill_diagonal(W, 0)
    D = np.matrix(sum(W))
    markov = W / D
    q = D.transpose() * D
    H = W / q
    dH = sum(H)
    rdH = np.matrix(1. / np.sqrt(dH))
    Hp = np.multiply(H, (rdH.transpose() * rdH))
    n = d2.shape[0]
    evals, evecs = eigsh(Hp, ndims + 1, which='LM')
    # evals, evecs = eigh(Hp)
    srtOrd = np.argsort(evals)[::-1]
    evals = evals[srtOrd]
    evecs = evecs[:, srtOrd]
    evecs = np.array(rdH) * np.array(evecs).transpose()
    evals = evals[1:]
    evec0 = evecs[0,:]
    evecs = evecs.transpose()[:, 1:]
    return([evals, evecs, sigmas, W, evec0])


def c2c(A,B):
    """Calculate pearson correlation coefficient

    Calculates pearson correlation between rows of two arrays.
    Keyword values:
    A -- M x T shaped array
    B -- N x T shaped array

    returns an M x N shaped array

    """
    # Rowwise mean of input arrays & subtract from input arrays themeselves
    A_mA = A - A.mean(1)[:,None]
    B_mB = B - B.mean(1)[:,None]

    # Sum of squares across rows
    ssA = (A_mA**2).sum(1);
    ssB = (B_mB**2).sum(1);

    # Finally get corr coeff
    return np.dot(A_mA,B_mB.T)/np.sqrt(np.dot(ssA[:,None],ssB[None]))


def diffProj(x, data, evals, evecs, sigmas, gaussian, vec0, nsig):
    """Project on diffusion map

    Projects new data on a previously produced diffusion map.
    Keyword values:
    x    -- new data in the form of a G x N array
    data -- original data in the form of a G x M array
    evals -- eigenvalues from base diffusion map
    evecs -- eigenvectors from base diffusion map
    sigmas -- sigmas used for base diffusion map
    gaussian - gaussian generated while calulating base diffusion map
    vec0 -- redundant first eigenvector, usually removed
    nsig -- number of nearest neighbours for calulating new sigmas use
            same as for original diffusion map

    """
    d2 = np.power(1 - c2c(x.T, data.T), 2)
    nsigmas = [np.sqrt(np.sort(d2[i, :])[nsig-1])/2
               for i in xrange(0, d2.shape[0])]
    W = applyGaussianKernel(d2, nsigmas, sigmas)
    W[abs(W-1) < 1e-15] = 0
    H = W / np.sum(W, 1)
    H = H / np.sum(gaussian, 1).T
    Hp = np.matrix(H / np.sum(H, 1))
    predMat = Hp * \
              np.concatenate((np.matrix(vec0).T, np.matrix(evecs)), axis=1) * \
              np.diag(np.append([1],1/evals))
    predMat = np.array(predMat[:,1:])
    return(predMat)