# User Manual at http://www.wwjohnston.net/famhist/bill-howard-ydna.htm
from scipy.stats import pearsonr
from scipy.cluster.hierarchy import dendrogram, linkage
#from pprint import pprint
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from matplotlib import pyplot as plt
import math

# ---------1 - SET CONTROL VARIABLES------------------------------

I = 'yoemans-111Corrected'
#Specify the file name of the CSV file. 
# This will also be the phylogenetic tree’s title and file name.

Rscale=1.27
Byear = 1950
RCC1added = 13
RCC1markers = 111
# Rscale is the years/RCC ratio, calibrated to this family & number of markers.
# Byear is the Birth Year used to compute the split years.
# Set Byear about the birth year of the test takers.

RCC1added = 13
RCC1markers = 111
# RCC1 is the value to which fabricated vector z1 markers are set.
# RCC1markers is the number of Y-STR markers of each input kit.

# ---------2 - READ INPUT INTO MATRICES------------------------------

# read the Input file
C = pd.read_csv("{input}.csv".format(input=I)).values

# extract the kits as rows of observations and columns of attributes (markers)
K = C[:,2:845]
# Note that the numbering starts with 0 and not 1 for both rows and columns.

# extract the kit numbers and names to produce a labels matrix
L = []
for ix,row in enumerate(C):
    L.append("#{ix} {kit} {name}".format(ix=ix+1,kit=row[0],name=row[1]))
    
# reverse the list and kits
#L = L[::-1]
#K = K[::-1]

# ---------3 - CALCULATE RCC1 MATRIX FOR ALL PAIRS OF KITS-------------
    
# calculate a condensed distance matrix consisting of RCC values as per Bill Howards paper
# the condensed distance matrix is a single dimension matrix containing 
# the top triangle of a two dimensional distance matrix
# e.g. [1,2,3,4]
#      [2,1,2,3]
#      [3,2,1,2]
#      [4,3,2,1]
# becomes a condensed distance matrix Y of [2,3,4,2,3,2]
Y = []
X = []
for i1,v1 in enumerate(K):
    X1 = []
    z1 = np.zeros((RCC1markers,), dtype=int)
# --------3A-Fabricate vectors z1 and z2------------------
    for z1x in range(RCC1markers):
        if z1x == 1:
            z1[z1x] = 1
        else:
            z1[z1x] = RCC1added
    for i2,v2 in enumerate(K):
        z2 = np.zeros((RCC1markers,), dtype=int)
        for x in range(RCC1markers):
            if v2[x] == v1[x]:
                z2[x] = z1[x]
            else:
                z2[x] = z1[x]-1
# --------3B-Calculate RCC1 in Matrix------------------------
        if i2 > i1:
            if np.ndarray.all(z1 == z2):
                Y.append(0)
            else:
                Y.append((1/pearsonr(z1,z2)[0]-1)*10000.0)
        if np.ndarray.all(v1 == v2):
            X1.append(0)
        else:
            X1.append((1/pearsonr(z1,z2)[0]-1)*10000.0)
    X.append(X1)

# ---------4 - GENERATE THE PHYLOGENETIC TREE SPLITS-------------

# perform the agglomerative clustering using the average method as per Bill Howards paper
Z = linkage(Y,method='weighted',optimal_ordering=False)

# perform linkage reordering such that the shorter branch is first, the longer branch second
# the lower index is first, the higher index is second
for link in Z:
    leftDepth = link[0]
    if leftDepth >= len(L):
        leftDepth = Z[int(leftDepth)-len(L),3]
    else:
        leftDepth = 1
    rightDepth = link[1]
    if rightDepth >= len(L):
        rightDepth = Z[int(rightDepth)-len(L),3]
    else:
        rightDepth = 1
    if leftDepth < rightDepth:
        t = link[0]
        link[0] = link[1]
        link[1] = t
    elif link[0] < link[1] and link[0] < len(L) and link[1] < len(L):
            t = link[0]
            link[0] = link[1]
            link[1] = t

# ---------5 - PLOT THE PHYLOGENETIC TREE-------------            
            
# plot the cluster hierarchy produced by linkage as a dendrogram
F = plt.figure(figsize=(16,20),dpi=72) # A1 paper
plt.title(I)
plt.xlabel("RCC1")
plt.grid(True,which='major',axis='x',color='g',linestyle='dashed')
plt.minorticks_on()
plt.tick_params(axis='x',which='minor')
plt.tick_params(axis='y',which='minor',length=0)

#plt.xscale('symlog',basex=2)

plt.xticks(np.arange(0,800,12))
#plt.xticks((0,1,2,4,8,16,32,64,128,256),(0,1,2,4,8,16,32,64,128,256))
# - use this on recent kits within the last 50 RCC: plt.xticks(np.arange(50))
# - use this on kits that split way back BCE: plt.xticks(np.arange(0,800,25))

D = dendrogram(Z,labels=L,color_threshold=3.5,leaf_font_size=12,leaf_rotation=0,orientation='left')
#plt.gca().invert_yaxis()
for i, d, c in zip(D['icoord'], D['dcoord'], D['color_list']):
        y = 0.5 * sum(i[1:3])
        x = d[1]
        if x > 0:
            plt.plot(x, y, 'o', c=c)
            yr = math.floor((Byear - int(x* Rscale))/10)*10
            yr = int(yr)
            if yr >= 0:
                yr_txt = "{yr}".format(yr=yr)
            else:
                yr_txt = "{yr} BCE".format(yr=-yr)
            rcc_txt = "{:.2f}".format(x)
            plt.annotate("%s" % yr_txt, (x, y), xytext=(-6, -12),
                             textcoords='offset points', color='r',
                             va='center', ha='center', rotation=90)
            plt.annotate("%s" % rcc_txt, (x, y), xytext=(+7, 0),
                             textcoords='offset points', color='r',
                             va='center', ha='center', rotation=90)
plt.annotate("RCC1 = {rscale} years".format(rscale=Rscale),(0,0),xytext=(0,-5))        
F.subplots_adjust(left=0.05, right=0.85, top=0.97, bottom=0.05)
plt.savefig("{input}.jupyter.png".format(input=I))

# cite: http://www.jogg.info/pages/72/files/Howard.htm
# Dating Y-DNA Haplotypes on a Phylogenetic Tree: Tying the Genealogy of Pedigrees and Surname Clusters into Genetic Time Scales
# William E. Howard III and Frederic R. Schwab

# This code originally written by David Johnston in Australia.
# Modified by George El Zakhem and Wesley Johnston in the USA.