# User Manual at http://www.wwjohnston.net/famhist/bill-howard-ydna.htm from scipy.stats import pearsonr from scipy.cluster.hierarchy import dendrogram, linkage import numpy as np import pandas as pd from scipy.spatial.distance import pdist from matplotlib import pyplot as plt import math # ---------1 - SET CONTROL VARIABLES------------------------------ I = 'yoemans-111Corrected' #Specify the file name of the CSV file. # This will also be the phylogenetic tree’s title and file name. Rscale = 34.65 Byear = 1950 # Rscale is the years/RCC ratio, calibrated to this family & number of markers. # Byear is the Birth Year used to compute the split years. # Set Byear about the birth year of the test takers. # ---------2 - READ INPUT INTO MATRICES------------------------------ # read the Input file C = pd.read_csv("{input}.csv".format(input=I)).values # extract the kits as rows of observations and columns of attributes (markers) K = C[:,2:845] # Note that the numbering starts with 0 and not 1 for both rows and columns. # extract the kit numbers and names to produce a labels matrix L = [] for ix,row in enumerate(C): L.append("#{ix} {kit} {name}".format(ix=ix+1,kit=row[0],name=row[1])) # reverse the list and kits #L = L[::-1] #K = K[::-1] # ---------3 - CALCULATE RCC MATRIX FOR ALL PAIRS OF KITS------------- # calculate a condensed distance matrix consisting of RCC values as per Bill Howards paper # the condensed distance matrix is a single dimension matrix containing # the top triangle of a two dimensional distance matrix # e.g. [1,2,3,4] # [2,1,2,3] # [3,2,1,2] # [4,3,2,1] # becomes a condensed distance matrix Y of [2,3,4,2,3,2] Y = [] X = [] for i1,v1 in enumerate(K): X1 = [] for i2,v2 in enumerate(K): if i2 > i1: Y.append((1/pearsonr(v1,v2)[0]-1)*10000.0) X1.append((1/pearsonr(v1,v2)[0]-1)*10000.0) X.append(X1) # ---------4 - GENERATE THE PHYLOGENETIC TREE SPLITS------------- # perform the agglomerative clustering using the average method as per Bill Howards paper Z = linkage(Y,method='weighted',optimal_ordering=False) # perform linkage reordering such that the shorter branch is first, the longer branch second # the lower index is first, the higher index is second for link in Z: leftDepth = link[0] if leftDepth >= len(L): leftDepth = Z[int(leftDepth)-len(L),3] else: leftDepth = 1 rightDepth = link[1] if rightDepth >= len(L): rightDepth = Z[int(rightDepth)-len(L),3] else: rightDepth = 1 if leftDepth < rightDepth: t = link[0] link[0] = link[1] link[1] = t elif link[0] < link[1] and link[0] < len(L) and link[1] < len(L): t = link[0] link[0] = link[1] link[1] = t # ---------5 - PLOT THE PHYLOGENETIC TREE------------- # plot the cluster hierarchy produced by linkage as a dendrogram F = plt.figure(figsize=(16,20),dpi=72) # A1 paper plt.title(I) plt.xlabel("RCC") plt.grid(True,which='major',axis='x',color='g',linestyle='dashed') plt.minorticks_on() plt.tick_params(axis='x',which='minor') plt.tick_params(axis='y',which='minor',length=0) #plt.xscale('symlog',basex=2) plt.xticks(np.arange(0,800,12)) #plt.xticks((0,1,2,4,8,16,32,64,128,256),(0,1,2,4,8,16,32,64,128,256)) # - use this on recent kits within the last 50 RCC: plt.xticks(np.arange(50)) # - use this on kits that split way back BCE: plt.xticks(np.arange(0,800,25)) D = dendrogram(Z,labels=L,color_threshold=3.5,leaf_font_size=12,leaf_rotation=0,orientation='left') #plt.gca().invert_yaxis() for i, d, c in zip(D['icoord'], D['dcoord'], D['color_list']): y = 0.5 * sum(i[1:3]) x = d[1] if x > 0: plt.plot(x, y, 'o', c=c) yr = math.floor((Byear - int(x* Rscale))/10)*10 yr = int(yr) if yr >= 0: yr_txt = "{yr}".format(yr=yr) else: yr_txt = "{yr} BCE".format(yr=-yr) #rcc_txt = int(x*10)/10 rcc_txt = "{:.2f}".format(x) plt.annotate("%s" % yr_txt, (x, y), xytext=(-6, -12), textcoords='offset points', color='r', va='center', ha='center', rotation=90) plt.annotate("%s" % rcc_txt, (x, y), xytext=(+7, 0), textcoords='offset points', color='r', va='center', ha='center', rotation=90) plt.annotate("RCC = {rscale} years".format(rscale=Rscale),(0,0),xytext=(0,-5)) F.subplots_adjust(left=0.05, right=0.85, top=0.97, bottom=0.05) plt.savefig("{input}.jupyter.png".format(input=I)) # cite: http://www.jogg.info/pages/72/files/Howard.htm # Dating Y-DNA Haplotypes on a Phylogenetic Tree: Tying the Genealogy of Pedigrees and Surname Clusters into Genetic Time Scales # William E. Howard III and Frederic R. Schwab # http://www.wwjohnston.net/famhist/bill-howard-ydna.htm # This code originally written by David Johnston in Australia. # Modified by George El Zakhem and Wesley Johnston in the USA.