library(tidyverse)
library(qvalue)
library(reshape)
library(ggpubr)
library(vcfR)
library(microbenchmark)
library(matrixStats)

1. Loading datasets

files <- c("High_Ne_HLG_FST_real_5000simulations.txt",
           "High_Ne_ROS_FST_real_5000simulations.txt",
           "High_Ne_QUI_FST_real_5000simulations.txt")
FST <- lapply(files, function(x) read.table(x, header = T, stringsAsFactors = F)) 

obj.vcfR <- vcfR::read.vcfR("VCF_2067_SNPs_at_T1_3pops.vcf", verbose = FALSE)

df_TempDiff <- read.table("Temporal_outliers_TempoDiff_qval0.10.txt", header = T,
stringsAsFactors = F)
df_OutFLANK <- read.table("Temporal_outliers_outflank_qval0.10.txt", header = T, stringsAsFactors = F)

2. Calculting P-value for each of the three populations

Proportion of simulations with FST values equal or larger to the
observed FST after being corrected for multiple testing

POP <- c("HLG","ROS","QUI")
n = 5000
alpha = 0.10

df_POP_FST_count <- list()
N_observations <-list()
qval <- list()
outliers_HighFST <- list()
header <- c("CHROM","POS","real_FST",paste(rep("simu",n),seq(1,n,1),sep = ""))

for (i in 1: length(POP)){
  
  POP_FST <- as.data.frame(FST[[i]])
  colnames(POP_FST) <- header

  POP_FST_count <- POP_FST %>%
  group_by(CHROM,POS) %>%
  dplyr::count(POP_FST[,4:(n+3)] >= real_FST)
  df_POP_FST_count[[i]] <- data.frame(as.matrix(POP_FST_count,ncol=(n+3)))
  df_POP_FST_count[[i]] <- df_POP_FST_count[[i]][,3:(n+2)]

  df_POP_FST_count[[i]]$count <- apply(df_POP_FST_count[[i]] , 1, function(x) sum(x==1,na.rm = TRUE))
  colnames(df_POP_FST_count[[i]]) <- c(paste(rep("simu",n),seq(1,n,1),sep = ""),"count_FSTsim>FSTreal")
  
  Na_count <- apply(df_POP_FST_count[[i]] , 1, function(x) sum(is.na(x)))
  N_observations[[i]] <- (n - Na_count)
  
  df_POP_FST_count[[i]]$Pvalue_highFST  <- df_POP_FST_count[[i]]$`count_FSTsim>FSTreal`/N_observations[[i]]
  
  qval[[i]] <- qvalue(df_POP_FST_count[[i]]$Pvalue_highFST)$qvalues
  outliers_HighFST[[i]] <- which(qval[[i]] < alpha)
  print(paste(POP[[i]],"number of outliers (pvalue corrected):",length(outliers_HighFST[[i]])))
  
  }
## [1] "HLG number of outliers (pvalue corrected): 6"
## [1] "ROS number of outliers (pvalue corrected): 74"
## [1] "QUI number of outliers (pvalue corrected): 53"

3. Reporting and filtering outlier SNPs previously identified using a qvalue threshold

CHROM_POS<-data.frame(seq(1,length(getCHROM(obj.vcfR))), getCHROM(obj.vcfR),getPOS(obj.vcfR),getID(obj.vcfR) ,qval[[1]],qval[[2]],qval[[3]])
colnames(CHROM_POS) <- c("n","CHROM","POS","ID", "qval_HLG", "qval_ROS", "qval_QUI")

outliers_SNP_pop <- list()

for (i in 1: length(POP)){ 
  outlier_SNP <-filter(CHROM_POS, n %in% outliers_HighFST[[i]])
  outliers_SNP_pop[[i]] <- cbind(outlier_SNP, rep(POP[[i]],length(outliers_HighFST[[i]])))
}

outliers_df_highFST <- rbind(outliers_SNP_pop[[1]],outliers_SNP_pop[[2]],outliers_SNP_pop[[3]])
colnames(outliers_df_highFST) <- c("n","CHROM","POS","ID", "qval_HLG", "qval_ROS", "qval_QUI","pop_detected")

  #report the total number of outlier over populations
  outliers_df_highFST_unique <-outliers_df_highFST[!duplicated(outliers_df_highFST$n), ]
  print(paste("Total number of outlier over populations",length(unique(outliers_df_highFST$ID))))
## [1] "Total number of outlier over populations 131"
  #report the number of SNPs detected twice in different populatons 
  occ_SNPs <- table(unlist(outliers_df_highFST$ID))
  occurences <- melt(occ_SNPs)
  
  multiple_occ <- occurences %>%
    filter(value >= 2)
  
  multiple_occ_pop <-filter(outliers_df_highFST, ID %in% multiple_occ[,1])  
  multiple_occ_pop <-multiple_occ_pop[
    order(multiple_occ_pop[,1]),]

4. Reporting the upper limit of neutral expectation

99% centile of simulated FST values for each SNP

pop_TempFST <- list()
for (i in 1:length(POP)){
  
quantile0.01 <-rowQuantiles(as.matrix(FST[[i]][1:n+3]), probs=0.01, na.rm = TRUE)
quantile99 <-rowQuantiles(as.matrix(FST[[i]][1:n+3]), probs=0.99, na.rm = TRUE)

df_pop <-data.frame(CHROM_POS,FST[[i]][,3],quantile0.01,quantile99)
names(df_pop)[8] <- "TempFST"

    df_TempDiff_pop<-df_TempDiff%>%filter(pop == POP[[i]])
    
    df_OutFLANK_pop <- df_OutFLANK%>%filter(pop == POP[[i]])
    
    df_pop <- df_pop%>%
              mutate(SLiM_simulations= ID %in% outliers_SNP_pop[[i]]$ID,
                     TempoDiff= ID %in% df_TempDiff_pop$ID,
                     OutFLANK = ID %in% df_OutFLANK_pop$ID)
    
      df_pop[df_pop == "FALSE"] <- 0
      df_pop[df_pop == "TRUE"] <- 1   
      df_pop <- df_pop[,c(1:4,8:13)]

    df_subset <- df_pop%>%
      mutate(twotest= rowSums(.[8:10],na.rm = T))
    
    df_neutre <- df_subset%>% filter(twotest ==0)%>%
      filter(quantile0.01 != "NA")
    
    df_two_tests <- df_subset%>%filter(twotest ==2)
    df_three_tests <- df_subset%>%filter(twotest ==3)
    
    df_SLiM_simulations_uniq <- df_subset%>%
      filter(SLiM_simulations ==1 & TempoDiff ==0)
    
pop_TempFST[[i]] <-ggplot(df_neutre, aes(x = n, y = quantile99))+
  geom_line(color = "darkgreen",size = 0.01, alpha = 0.4)+
  geom_line(aes(x = n, y = quantile0.01), color = "darkgreen",size = 0.01, alpha = 0.4)+
  geom_point(data = df_neutre,aes(x = n, y = TempFST),colour="grey84",size = 1,shape = 4)+
    geom_point(data = df_SLiM_simulations_uniq,aes(x = n, y = TempFST),fill = "grey",size = 2,shape = 25)+
    geom_point(data = df_two_tests,aes(x = n, y = TempFST),fill = "purple",size = 2.5,shape = 25)+
    geom_point(data = df_three_tests,aes(x = n, y = TempFST),fill = "red",size = 2.5,shape = 25)+
  scale_y_continuous(limits = c(-0.05,0.25),breaks = c(0,0.05,0.10,0.15,0.20,0.25),name = expression(F[ST]))+
  scale_x_continuous(breaks = c(1,500,1000,1500,2067),name = 'SNPs')+
  theme(axis.text.x=element_text(colour="black", size = 12,angle = 45,hjust=0.95,vjust=0.9),
        axis.title = element_text(colour="black", size=12),
        axis.text.y=element_text(colour="black", size = 12),
        panel.background = element_rect(fill="white"),
        panel.grid.minor = element_blank(),
        panel.border = element_rect(colour="black", fill=NA, size=0.5),
        strip.text = element_text(colour="black", size=10),
        legend.position = "bottom",
        legend.text = element_text())
}

#HLG 
pop_TempFST[[1]] <- pop_TempFST[[1]] + scale_x_continuous(name = '',expand = c(0.01,0.01)) + theme(axis.text.x=element_blank())
#ROS
pop_TempFST[[2]] <- pop_TempFST[[2]] +  scale_x_continuous(name = '',expand = c(0.01,0.01))+ theme(axis.text.x=element_blank())
#QUI
pop_TempFST[[3]] <- pop_TempFST[[3]] + scale_x_continuous(name = 'SNPs',expand = c(0.01,0.01))
plot(ggarrange(pop_TempFST[[1]], ggparagraph(text=" ", face = "italic", size = 10, color = "black"), 
          pop_TempFST[[2]], ggparagraph(text=" ", face = "italic", size = 10, color = "black"), 
          pop_TempFST[[3]], ncol = 1, nrow = 5, labels=c("A", "", "B", "", "C"), heights=c(1.5,0.01,1.5,0.01,1.80), widths=c(1, 1, 1, 1, 1), common.legend = TRUE, legend="bottom" ))

Figure 3. Temporal FST per SNP are compared to the upper limit of the neutral expectation based on the high Ne scenario. The green line represents the 99% quantile of FST values expected under genetic drift alone, determined from 5 000 simulations for (A) HLG, (B) ROS, and (C) QUI. Outlier SNPs that are significant after correction for multiple comparisons and detected only by the simulation framework are represented by grey triangles. Those detected by two or three methods are represented by purple and red triangles, respectively. Grey crosses indicate putatively neutral SNPs that were not detected by outlier tests

## R version 3.6.0 (2019-04-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=French_France.1252  LC_CTYPE=French_France.1252   
## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C                  
## [5] LC_TIME=French_France.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] matrixStats_0.61.0   microbenchmark_1.4.9 vcfR_1.12.0         
##  [4] ggpubr_0.4.0         reshape_0.8.8        qvalue_2.18.0       
##  [7] forcats_0.5.1        stringr_1.4.1        dplyr_1.0.7         
## [10] purrr_0.3.4          readr_2.1.3          tidyr_1.2.0         
## [13] tibble_3.1.6         ggplot2_3.3.6        tidyverse_1.3.1     
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-155      fs_1.5.2.9000     lubridate_1.8.0   httr_1.4.2       
##  [5] tools_3.6.0       backports_1.4.1   bslib_0.3.1       utf8_1.2.2       
##  [9] R6_2.5.1          vegan_2.5-7       DBI_1.1.2         mgcv_1.8-38      
## [13] colorspace_2.0-3  permute_0.9-7     withr_2.5.0       tidyselect_1.1.1 
## [17] compiler_3.6.0    cli_3.4.1         rvest_1.0.2       xml2_1.3.3       
## [21] labeling_0.4.2    sass_0.4.0        scales_1.1.1      digest_0.6.29    
## [25] rmarkdown_2.13    pkgconfig_2.0.3   htmltools_0.5.2   dbplyr_2.1.1     
## [29] fastmap_1.1.0     highr_0.9         rlang_0.4.11      readxl_1.3.1     
## [33] rstudioapi_0.13   farver_2.1.1      jquerylib_0.1.4   generics_0.1.2   
## [37] jsonlite_1.8.0    car_3.0-12        magrittr_2.0.3    Matrix_1.5-1     
## [41] Rcpp_1.0.10       munsell_0.5.0     fansi_1.0.3       ape_5.7-1        
## [45] abind_1.4-5       lifecycle_1.0.1   stringi_1.7.8     yaml_2.3.5       
## [49] carData_3.0-5     MASS_7.3-55       plyr_1.8.6        pinfsc50_1.2.0   
## [53] grid_3.6.0        parallel_3.6.0    crayon_1.5.2      lattice_0.20-45  
## [57] cowplot_1.1.1     haven_2.4.3       splines_3.6.0     hms_1.1.2        
## [61] knitr_1.37        pillar_1.7.0      ggsignif_0.6.3    reshape2_1.4.4   
## [65] reprex_2.0.1      glue_1.6.2        evaluate_0.15     memuse_4.2-1     
## [69] modelr_0.1.8      vctrs_0.3.8       tzdb_0.3.0        cellranger_1.1.0 
## [73] gtable_0.3.1      assertthat_0.2.1  xfun_0.30         broom_0.7.12     
## [77] rstatix_0.7.0     viridisLite_0.4.1 cluster_2.1.2     ellipsis_0.3.2