# User Manual at http://www.wwjohnston.net/famhist/bill-howard-ydna.htm from scipy.stats import pearsonr from scipy.cluster.hierarchy import dendrogram, linkage #from pprint import pprint import numpy as np import pandas as pd from scipy.spatial.distance import pdist from matplotlib import pyplot as plt import math # ---------1 - SET CONTROL VARIABLES------------------------------ I = 'yoemans-111Corrected' #Specify the file name of the CSV file. # This will also be the phylogenetic tree’s title and file name. Rscale=1.27 Byear = 1950 RCC1added = 13 RCC1markers = 111 # Rscale is the years/RCC ratio, calibrated to this family & number of markers. # Byear is the Birth Year used to compute the split years. # Set Byear about the birth year of the test takers. RCC1added = 13 RCC1markers = 111 # RCC1 is the value to which fabricated vector z1 markers are set. # RCC1markers is the number of Y-STR markers of each input kit. # ---------2 - READ INPUT INTO MATRICES------------------------------ # read the Input file C = pd.read_csv("{input}.csv".format(input=I)).values # extract the kits as rows of observations and columns of attributes (markers) K = C[:,2:845] # Note that the numbering starts with 0 and not 1 for both rows and columns. # extract the kit numbers and names to produce a labels matrix L = [] for ix,row in enumerate(C): L.append("#{ix} {kit} {name}".format(ix=ix+1,kit=row[0],name=row[1])) # reverse the list and kits #L = L[::-1] #K = K[::-1] # ---------3 - CALCULATE RCC1 MATRIX FOR ALL PAIRS OF KITS------------- # calculate a condensed distance matrix consisting of RCC values as per Bill Howards paper # the condensed distance matrix is a single dimension matrix containing # the top triangle of a two dimensional distance matrix # e.g. [1,2,3,4] # [2,1,2,3] # [3,2,1,2] # [4,3,2,1] # becomes a condensed distance matrix Y of [2,3,4,2,3,2] Y = [] X = [] for i1,v1 in enumerate(K): X1 = [] z1 = np.zeros((RCC1markers,), dtype=int) # --------3A-Fabricate vectors z1 and z2------------------ for z1x in range(RCC1markers): if z1x == 1: z1[z1x] = 1 else: z1[z1x] = RCC1added for i2,v2 in enumerate(K): z2 = np.zeros((RCC1markers,), dtype=int) for x in range(RCC1markers): if v2[x] == v1[x]: z2[x] = z1[x] else: z2[x] = z1[x]-1 # --------3B-Calculate RCC1 in Matrix------------------------ if i2 > i1: if np.ndarray.all(z1 == z2): Y.append(0) else: Y.append((1/pearsonr(z1,z2)[0]-1)*10000.0) if np.ndarray.all(v1 == v2): X1.append(0) else: X1.append((1/pearsonr(z1,z2)[0]-1)*10000.0) X.append(X1) # ---------4 - GENERATE THE PHYLOGENETIC TREE SPLITS------------- # perform the agglomerative clustering using the average method as per Bill Howards paper Z = linkage(Y,method='weighted',optimal_ordering=False) # perform linkage reordering such that the shorter branch is first, the longer branch second # the lower index is first, the higher index is second for link in Z: leftDepth = link[0] if leftDepth >= len(L): leftDepth = Z[int(leftDepth)-len(L),3] else: leftDepth = 1 rightDepth = link[1] if rightDepth >= len(L): rightDepth = Z[int(rightDepth)-len(L),3] else: rightDepth = 1 if leftDepth < rightDepth: t = link[0] link[0] = link[1] link[1] = t elif link[0] < link[1] and link[0] < len(L) and link[1] < len(L): t = link[0] link[0] = link[1] link[1] = t # ---------5 - PLOT THE PHYLOGENETIC TREE------------- # plot the cluster hierarchy produced by linkage as a dendrogram F = plt.figure(figsize=(16,20),dpi=72) # A1 paper plt.title(I) plt.xlabel("RCC1") plt.grid(True,which='major',axis='x',color='g',linestyle='dashed') plt.minorticks_on() plt.tick_params(axis='x',which='minor') plt.tick_params(axis='y',which='minor',length=0) #plt.xscale('symlog',basex=2) plt.xticks(np.arange(0,800,12)) #plt.xticks((0,1,2,4,8,16,32,64,128,256),(0,1,2,4,8,16,32,64,128,256)) # - use this on recent kits within the last 50 RCC: plt.xticks(np.arange(50)) # - use this on kits that split way back BCE: plt.xticks(np.arange(0,800,25)) D = dendrogram(Z,labels=L,color_threshold=3.5,leaf_font_size=12,leaf_rotation=0,orientation='left') #plt.gca().invert_yaxis() for i, d, c in zip(D['icoord'], D['dcoord'], D['color_list']): y = 0.5 * sum(i[1:3]) x = d[1] if x > 0: plt.plot(x, y, 'o', c=c) yr = math.floor((Byear - int(x* Rscale))/10)*10 yr = int(yr) if yr >= 0: yr_txt = "{yr}".format(yr=yr) else: yr_txt = "{yr} BCE".format(yr=-yr) rcc_txt = "{:.2f}".format(x) plt.annotate("%s" % yr_txt, (x, y), xytext=(-6, -12), textcoords='offset points', color='r', va='center', ha='center', rotation=90) plt.annotate("%s" % rcc_txt, (x, y), xytext=(+7, 0), textcoords='offset points', color='r', va='center', ha='center', rotation=90) plt.annotate("RCC1 = {rscale} years".format(rscale=Rscale),(0,0),xytext=(0,-5)) F.subplots_adjust(left=0.05, right=0.85, top=0.97, bottom=0.05) plt.savefig("{input}.jupyter.png".format(input=I)) # cite: http://www.jogg.info/pages/72/files/Howard.htm # Dating Y-DNA Haplotypes on a Phylogenetic Tree: Tying the Genealogy of Pedigrees and Surname Clusters into Genetic Time Scales # William E. Howard III and Frederic R. Schwab # This code originally written by David Johnston in Australia. # Modified by George El Zakhem and Wesley Johnston in the USA.