Introduction

Prior to merging the data sets, I need to fix the IDs in the WGS data as they are not UUIDs and are therefore recognising all individuals as unique despite having an overlap.

Methods and Results

1. Load WGS fam file

wgs_fam_data <- read.table('C:/Users/Martha/Documents/Honours/Project/honours.project/Data/plink_output/wgs_filtered.fam', header = F)
head(wgs_fam_data)

##   V1     V2 V3 V4 V5 V6
## 1  1 110390  0  0  2 -9
## 2  1 110990  0  0  1 -9
## 3  1 111280  0  0  2 -9
## 4  1 111800  0  0  2 -9
## 5  1 111830  0  0  2 -9
## 6  1 124520  0  0  1 -9

2. Load spreadsheet with WGS IDs and UUIDs

WGS_ID <- read.csv('C:/Users/Martha/Documents/Honours/Project/honours.project/Data/WGS_ID_mapping.csv', header =F)
head(WGS_ID)

##           V1     V2     V3     V4    V5
## 1     KCCGID  QUTID   UUID  patID matID
## 2 FR07885645 ES_804 400804 316910  2802
## 3 FR07885644 ES_801 400801 312590  1890
## 4 FR07885643 ES_698 400698 311660    76
## 5 FR07885642 ES_696 400696 311660    76
## 6 FR07885324 ES_684 400684      0     0

3. Merge the two tables by the ‘FR’ IDs

WGS_ID_merge <- merge(wgs_fam_data, WGS_ID, by = 'V1')
head(WGS_ID_merge)

##  [1] V1   V2.x V3.x V4.x V5.x V6   V2.y V3.y V4.y V5.y
## <0 rows> (or 0-length row.names)

4. Subset the relevant columns to retain (i.e. Fam ID, UUID, paternal ID, maternal ID, sex, phenotype value)

fam_id_subset <- c("V1", "V3.y", "V3.x", "V4.x", "V5.x", "V6")
wgs_fam_data2 <- WGS_ID_merge[fam_id_subset]
wgs_fam_data2 <-`colnames<-`(wgs_fam_data2, c("V1", "V2", "V3", "V4", "V5", "V6"))
head(wgs_fam_data2)

## [1] V1 V2 V3 V4 V5 V6
## <0 rows> (or 0-length row.names)

5. Export this new .fam file

write.table(wgs_fam_data2,'C:/Users/Martha/Documents/Honours/Project/honours.project/Data/plink_output/wgs_filtered.fam', row.names = F, col.names = F, quote = F)

Note: I replaced wgs_filtered.fam in taurus so that it contains this new data set with UUIDs in place of the FR IDs.

6. Try merging data sets using new fam file

plink --bfile plink_output/wgs_filtered --bmerge plink_output/array_filtered.bed plink_output/array_filtered.bim plink_output/array_filtered.fam --merge-mode 6 --out plink_output/merge_mode6_filtered

Note: This step does not technically merge the two data sets. Merge mode 6 will report all mismatching calls - assuming that there will be a difference in genotypes reported between the two data sets for the same individuals.

I checked the number of overlapping individuals in R - there are 94. However, the merge still does not recognise them as overlapping. I suspect that the merge considers all other variables in the .fam file. Therefore, despite having the same UUIDs, the individuals that do overlap have different family, paternal, and maternal IDs. This will have to be resolved first.

7. Merge WGS and array fam files by UUIDs to obtain overlap

array_fam_data <- read.table('C:/Users/Martha/Documents/Honours/Project/honours.project/Data/plink_output/array_filtered.fam', header = F)
head(array_fam_data) #load in array fam file (WGS fam file already loaded)

##   V1     V2     V3     V4 V5 V6
## 1  1 110630    790    800  2 -9
## 2  1 110850 111740 333130  2 -9
## 3  1 111280    933     59  2 -9
## 4  1 111830 169270 400523  2 -9
## 5  1 310881      0      0  2 -9
## 6  1 343481   1716 319731  2 -9

array_V_wgs_uuid <- merge(wgs_fam_data2, array_fam_data, by = 'V2')
head(array_V_wgs_uuid)

##  [1] V2   V1.x V3.x V4.x V5.x V6.x V1.y V3.y V4.y V5.y V6.y
## <0 rows> (or 0-length row.names)

8. Retain info from array file

fam_id_subset2 <- c("V1.y","V2", "V3.y", "V4.y", "V5.x", "V6.x") #select desired columns
wgs_array_same <- array_V_wgs_uuid[fam_id_subset2] #subset data based on selected columns
wgs_array_same <-`colnames<-`(wgs_array_same, c("V1", "V2", "V3", "V4", "V5", "V6")) #rename columns to match
head(wgs_array_same)

## [1] V1 V2 V3 V4 V5 V6
## <0 rows> (or 0-length row.names)

9. Replace info in wgs_fam_data2 (current fam file)

UUID_same <- wgs_array_same$V2 #subset overlapping UUIDs
head(UUID_same)

## factor(0)
## 109 Levels: 110390 110990 111100 111280 111800 111830 124520 ... UUID

wgs_fam_omit <- wgs_fam_data2[!wgs_fam_data2$V2 %in% UUID_same,] #remove data for individuals with the same UUID (ie overlapping indivs are omitted)

wgs_fam_data3 <- rbind(wgs_array_same, wgs_fam_omit) #put together the overlapping and non-overlapping indivs
head(wgs_fam_data3)

## [1] V1 V2 V3 V4 V5 V6
## <0 rows> (or 0-length row.names)

tail(wgs_fam_data3)

## [1] V1 V2 V3 V4 V5 V6
## <0 rows> (or 0-length row.names)

wgs_fam_data3 now ensures that all overlapping individuals have the same family, paternal, and maternal IDs as the array .fam file. However, I didn’t think to just make them all 1s and 0s. So in the interest of moving forward, I exported wgs_fam_data3 to Excel and made all family IDs = 1 and all paternal and maternal IDs = 0.

10. Export wgs_fam_data3 to Excel, make fam, pat, and mat IDs uniform, then re-load wgs_fam_data3

write.csv(wgs_fam_data3,'C:/Users/Martha/Documents/Honours/Project/honours.project/Data/plink_output/wgs_filtered.csv', row.names = F, quote = F, col.names = F)

wgs_fam_data4 <- read.csv('C:/Users/Martha/Documents/Honours/Project/honours.project/Data/plink_output/wgs_filtered.csv') #this now contains same fam, pat, and mat IDs as array .fam file
head(wgs_fam_data4)

11. Export wgs_fam_data4 to .fam file for plink

write.table(wgs_fam_data4,'C:/Users/Martha/Documents/Honours/Project/honours.project/Data/plink_output/wgs_filtered.fam', row.names = F, quote = F, col.names = F)

Note: I replaced wgs_filtered.fam in taurus with this new .fam file (still names wgs_filtered.fam)

12. Re-try merge-mode 6 in plink

plink --bfile plink_output/aray_filtered --bmerge plink_output/wgs_filtered.bed plink_output/wgs_filtered.bim plink_output/wgs_filtered.fam --merge-mode 6 --out plink_output/merge_mode6_filtered

Output: - 506 people from array; 108 from WGS - from WGS: 14 people are new, 94 people overlap with array (as expected) - 9,155,053 variants were loaded from both - 860,574,982 overlapping calls - 835,321,129 nonmissing in both file sets - 579,288,999 concordant - concordance rate = 69.31%

This concordance rate is much lower than expected - it should be close to 100% but discrepancies between different sequencing technology cannot be avoided, therefore a concordance rate for the same sample will never be 100%.

13. Merge both file sets using default settings (consensus calls)

plink --bfile plink_output/aray_filtered --bmerge plink_output/wgs_filtered.bed plink_output/wgs_filtered.bim plink_output/wgs_filtered.fam --make-bed --out plink_output/filtered_merge

Output: - 520 people (255 males, 265 females) and 9,155,053 variants pass filters and QC - 59 founders and 461 non-founders - total genotyping rate is 92.32%

Fixing WGS ID and merging data