#! /usr/bin/env python3
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (4, 4) # That's 4x4 inches, making it a 400x400px PNG figure.
infile = sys.argv[1]
outfile = sys.argv[2]
# Reading the data table in, with Pandas.
#
df = pd.read_table(infile)
ctype = list(df['type']) # Pull the celltype column out as an array
df = df.drop('type', axis=1) # ... then drop the column out of the main data table
(ncells, ngenes) = df.shape # ... leaving just counts, cells x genes
X = df.values.astype(float) # ... extract that data matrix as an ndarray, X.
# The data are in mapped read counts. As we've seen, clustering of
# RNA-seq counts often works best in log(counts).
#
# Counts can be zero, especially in sparse single cell RNA-seq data,
# and you can't take log(0), so people do log(counts + epsilon) for
# some arbitrary small epsilon.
#
# The choice of epsilon can matter when you go on to calculate
# Euclidean distances between cells, as you do in cluster analyses; if
# epsilon is very small, the distance between zero and one count --
# log(0 + epsilon) and log(1 + epsilon) -- can be large relative to
# actual gene expression differences. epsilon=1.0 seems a little
# better behaved in t-SNE analyses.
#
for i in range(ncells):
for g in range(ngenes):
X[i,g] = np.log( X[i,g] + 1.0)
# Here's PCA, implemented with SVD.
# Easy, now that we know how to do it!
#
Xs = X - np.mean(X, axis=0) # subtract column mean from each column
U, S, Wt = np.linalg.svd(Xs) # SVD
W = np.transpose(Wt)
X2 = Xs @ W[:,:2] # X2 is now our data, projected onto the first two PCs
# I carefully chose a set of 16 colors, and their order, to
# give good visual discrimination of the clusters in Moriarty's
# plots. I started with a list from here:
# https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
#
Color = [ 'xkcd:red', 'xkcd:green', 'xkcd:magenta', 'xkcd:blue',
'xkcd:purple', 'xkcd:orange', 'xkcd:cyan', 'xkcd:lime',
'xkcd:pink', 'xkcd:yellow', 'xkcd:teal', 'xkcd:lavender',
'xkcd:brown', 'xkcd:maroon', 'xkcd:olive', 'xkcd:navy' ]
# Plot each cell as a point in PC1 vs PC2.
#
fig, ax = plt.subplots(1,1)
for i in range(ncells):
ax.plot(X2[i,0], X2[i,1], 'o', markersize=4, mfc='w', mec=Color[ctype[i]%16])
ax.set_title(infile)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.tight_layout()
plt.savefig(outfile)