library(tidyverse)
library(qvalue)
library(reshape)
library(ggpubr)
library(vcfR)
library(microbenchmark)
library(matrixStats)
files <- c("High_Ne_HLG_FST_real_5000simulations.txt",
"High_Ne_ROS_FST_real_5000simulations.txt",
"High_Ne_QUI_FST_real_5000simulations.txt")
FST <- lapply(files, function(x) read.table(x, header = T, stringsAsFactors = F))
obj.vcfR <- vcfR::read.vcfR("VCF_2067_SNPs_at_T1_3pops.vcf", verbose = FALSE)
df_TempDiff <- read.table("Temporal_outliers_TempoDiff_qval0.10.txt", header = T,
stringsAsFactors = F)
df_OutFLANK <- read.table("Temporal_outliers_outflank_qval0.10.txt", header = T, stringsAsFactors = F)
Proportion of simulations with FST values equal or larger to the
observed FST after being corrected for multiple testing
POP <- c("HLG","ROS","QUI")
n = 5000
alpha = 0.10
df_POP_FST_count <- list()
N_observations <-list()
qval <- list()
outliers_HighFST <- list()
header <- c("CHROM","POS","real_FST",paste(rep("simu",n),seq(1,n,1),sep = ""))
for (i in 1: length(POP)){
POP_FST <- as.data.frame(FST[[i]])
colnames(POP_FST) <- header
POP_FST_count <- POP_FST %>%
group_by(CHROM,POS) %>%
dplyr::count(POP_FST[,4:(n+3)] >= real_FST)
df_POP_FST_count[[i]] <- data.frame(as.matrix(POP_FST_count,ncol=(n+3)))
df_POP_FST_count[[i]] <- df_POP_FST_count[[i]][,3:(n+2)]
df_POP_FST_count[[i]]$count <- apply(df_POP_FST_count[[i]] , 1, function(x) sum(x==1,na.rm = TRUE))
colnames(df_POP_FST_count[[i]]) <- c(paste(rep("simu",n),seq(1,n,1),sep = ""),"count_FSTsim>FSTreal")
Na_count <- apply(df_POP_FST_count[[i]] , 1, function(x) sum(is.na(x)))
N_observations[[i]] <- (n - Na_count)
df_POP_FST_count[[i]]$Pvalue_highFST <- df_POP_FST_count[[i]]$`count_FSTsim>FSTreal`/N_observations[[i]]
qval[[i]] <- qvalue(df_POP_FST_count[[i]]$Pvalue_highFST)$qvalues
outliers_HighFST[[i]] <- which(qval[[i]] < alpha)
print(paste(POP[[i]],"number of outliers (pvalue corrected):",length(outliers_HighFST[[i]])))
}
## [1] "HLG number of outliers (pvalue corrected): 6"
## [1] "ROS number of outliers (pvalue corrected): 74"
## [1] "QUI number of outliers (pvalue corrected): 53"
CHROM_POS<-data.frame(seq(1,length(getCHROM(obj.vcfR))), getCHROM(obj.vcfR),getPOS(obj.vcfR),getID(obj.vcfR) ,qval[[1]],qval[[2]],qval[[3]])
colnames(CHROM_POS) <- c("n","CHROM","POS","ID", "qval_HLG", "qval_ROS", "qval_QUI")
outliers_SNP_pop <- list()
for (i in 1: length(POP)){
outlier_SNP <-filter(CHROM_POS, n %in% outliers_HighFST[[i]])
outliers_SNP_pop[[i]] <- cbind(outlier_SNP, rep(POP[[i]],length(outliers_HighFST[[i]])))
}
outliers_df_highFST <- rbind(outliers_SNP_pop[[1]],outliers_SNP_pop[[2]],outliers_SNP_pop[[3]])
colnames(outliers_df_highFST) <- c("n","CHROM","POS","ID", "qval_HLG", "qval_ROS", "qval_QUI","pop_detected")
#report the total number of outlier over populations
outliers_df_highFST_unique <-outliers_df_highFST[!duplicated(outliers_df_highFST$n), ]
print(paste("Total number of outlier over populations",length(unique(outliers_df_highFST$ID))))
## [1] "Total number of outlier over populations 131"
#report the number of SNPs detected twice in different populatons
occ_SNPs <- table(unlist(outliers_df_highFST$ID))
occurences <- melt(occ_SNPs)
multiple_occ <- occurences %>%
filter(value >= 2)
multiple_occ_pop <-filter(outliers_df_highFST, ID %in% multiple_occ[,1])
multiple_occ_pop <-multiple_occ_pop[
order(multiple_occ_pop[,1]),]
99% centile of simulated FST values for each SNP
pop_TempFST <- list()
for (i in 1:length(POP)){
quantile0.01 <-rowQuantiles(as.matrix(FST[[i]][1:n+3]), probs=0.01, na.rm = TRUE)
quantile99 <-rowQuantiles(as.matrix(FST[[i]][1:n+3]), probs=0.99, na.rm = TRUE)
df_pop <-data.frame(CHROM_POS,FST[[i]][,3],quantile0.01,quantile99)
names(df_pop)[8] <- "TempFST"
df_TempDiff_pop<-df_TempDiff%>%filter(pop == POP[[i]])
df_OutFLANK_pop <- df_OutFLANK%>%filter(pop == POP[[i]])
df_pop <- df_pop%>%
mutate(SLiM_simulations= ID %in% outliers_SNP_pop[[i]]$ID,
TempoDiff= ID %in% df_TempDiff_pop$ID,
OutFLANK = ID %in% df_OutFLANK_pop$ID)
df_pop[df_pop == "FALSE"] <- 0
df_pop[df_pop == "TRUE"] <- 1
df_pop <- df_pop[,c(1:4,8:13)]
df_subset <- df_pop%>%
mutate(twotest= rowSums(.[8:10],na.rm = T))
df_neutre <- df_subset%>% filter(twotest ==0)%>%
filter(quantile0.01 != "NA")
df_two_tests <- df_subset%>%filter(twotest ==2)
df_three_tests <- df_subset%>%filter(twotest ==3)
df_SLiM_simulations_uniq <- df_subset%>%
filter(SLiM_simulations ==1 & TempoDiff ==0)
pop_TempFST[[i]] <-ggplot(df_neutre, aes(x = n, y = quantile99))+
geom_line(color = "darkgreen",size = 0.01, alpha = 0.4)+
geom_line(aes(x = n, y = quantile0.01), color = "darkgreen",size = 0.01, alpha = 0.4)+
geom_point(data = df_neutre,aes(x = n, y = TempFST),colour="grey84",size = 1,shape = 4)+
geom_point(data = df_SLiM_simulations_uniq,aes(x = n, y = TempFST),fill = "grey",size = 2,shape = 25)+
geom_point(data = df_two_tests,aes(x = n, y = TempFST),fill = "purple",size = 2.5,shape = 25)+
geom_point(data = df_three_tests,aes(x = n, y = TempFST),fill = "red",size = 2.5,shape = 25)+
scale_y_continuous(limits = c(-0.05,0.25),breaks = c(0,0.05,0.10,0.15,0.20,0.25),name = expression(F[ST]))+
scale_x_continuous(breaks = c(1,500,1000,1500,2067),name = 'SNPs')+
theme(axis.text.x=element_text(colour="black", size = 12,angle = 45,hjust=0.95,vjust=0.9),
axis.title = element_text(colour="black", size=12),
axis.text.y=element_text(colour="black", size = 12),
panel.background = element_rect(fill="white"),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour="black", fill=NA, size=0.5),
strip.text = element_text(colour="black", size=10),
legend.position = "bottom",
legend.text = element_text())
}
#HLG
pop_TempFST[[1]] <- pop_TempFST[[1]] + scale_x_continuous(name = '',expand = c(0.01,0.01)) + theme(axis.text.x=element_blank())
#ROS
pop_TempFST[[2]] <- pop_TempFST[[2]] + scale_x_continuous(name = '',expand = c(0.01,0.01))+ theme(axis.text.x=element_blank())
#QUI
pop_TempFST[[3]] <- pop_TempFST[[3]] + scale_x_continuous(name = 'SNPs',expand = c(0.01,0.01))
plot(ggarrange(pop_TempFST[[1]], ggparagraph(text=" ", face = "italic", size = 10, color = "black"),
pop_TempFST[[2]], ggparagraph(text=" ", face = "italic", size = 10, color = "black"),
pop_TempFST[[3]], ncol = 1, nrow = 5, labels=c("A", "", "B", "", "C"), heights=c(1.5,0.01,1.5,0.01,1.80), widths=c(1, 1, 1, 1, 1), common.legend = TRUE, legend="bottom" ))
Figure 3. Temporal FST per SNP are compared to the upper limit of the neutral expectation based on the high Ne scenario. The green line represents the 99% quantile of FST values expected under genetic drift alone, determined from 5 000 simulations for (A) HLG, (B) ROS, and (C) QUI. Outlier SNPs that are significant after correction for multiple comparisons and detected only by the simulation framework are represented by grey triangles. Those detected by two or three methods are represented by purple and red triangles, respectively. Grey crosses indicate putatively neutral SNPs that were not detected by outlier tests
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=French_France.1252 LC_CTYPE=French_France.1252
## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C
## [5] LC_TIME=French_France.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] matrixStats_0.61.0 microbenchmark_1.4.9 vcfR_1.12.0
## [4] ggpubr_0.4.0 reshape_0.8.8 qvalue_2.18.0
## [7] forcats_0.5.1 stringr_1.4.1 dplyr_1.0.7
## [10] purrr_0.3.4 readr_2.1.3 tidyr_1.2.0
## [13] tibble_3.1.6 ggplot2_3.3.6 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-155 fs_1.5.2.9000 lubridate_1.8.0 httr_1.4.2
## [5] tools_3.6.0 backports_1.4.1 bslib_0.3.1 utf8_1.2.2
## [9] R6_2.5.1 vegan_2.5-7 DBI_1.1.2 mgcv_1.8-38
## [13] colorspace_2.0-3 permute_0.9-7 withr_2.5.0 tidyselect_1.1.1
## [17] compiler_3.6.0 cli_3.4.1 rvest_1.0.2 xml2_1.3.3
## [21] labeling_0.4.2 sass_0.4.0 scales_1.1.1 digest_0.6.29
## [25] rmarkdown_2.13 pkgconfig_2.0.3 htmltools_0.5.2 dbplyr_2.1.1
## [29] fastmap_1.1.0 highr_0.9 rlang_0.4.11 readxl_1.3.1
## [33] rstudioapi_0.13 farver_2.1.1 jquerylib_0.1.4 generics_0.1.2
## [37] jsonlite_1.8.0 car_3.0-12 magrittr_2.0.3 Matrix_1.5-1
## [41] Rcpp_1.0.10 munsell_0.5.0 fansi_1.0.3 ape_5.7-1
## [45] abind_1.4-5 lifecycle_1.0.1 stringi_1.7.8 yaml_2.3.5
## [49] carData_3.0-5 MASS_7.3-55 plyr_1.8.6 pinfsc50_1.2.0
## [53] grid_3.6.0 parallel_3.6.0 crayon_1.5.2 lattice_0.20-45
## [57] cowplot_1.1.1 haven_2.4.3 splines_3.6.0 hms_1.1.2
## [61] knitr_1.37 pillar_1.7.0 ggsignif_0.6.3 reshape2_1.4.4
## [65] reprex_2.0.1 glue_1.6.2 evaluate_0.15 memuse_4.2-1
## [69] modelr_0.1.8 vctrs_0.3.8 tzdb_0.3.0 cellranger_1.1.0
## [73] gtable_0.3.1 assertthat_0.2.1 xfun_0.30 broom_0.7.12
## [77] rstatix_0.7.0 viridisLite_0.4.1 cluster_2.1.2 ellipsis_0.3.2