library(VennDiagram)
library(tidyr)
library(dplyr)
library(plyr)
library(ff)
library(data.table)

Peptides

#Import peptide data
df <- read.csv("V2-V4-NCBI-UniProt comparison 091619.csv", stringsAsFactors = F)
#Make ID column
df$ID <- 1:nrow(df)
#Identify peptide accessions by grep
V4_IDs <- df$ID[(grep("Zm00", df$Accessions))]
V2_IDs <- df$ID[(grep("GRMZM", df$Accessions))]
Uniprot_IDs <- df$ID[(grep("A0", df$Accessions))]
NCBI_IDs <- df$ID[(grep("P_", df$Accessions))]

#Identify Peptides that were missed in grep
missings <- df$ID[!df$ID%in%(unique(c(V4_IDs,V2_IDs,Uniprot_IDs,NCBI_IDs)))]
df_missings <- df[missings,c(6,8)]

df_missings_uniprot <- df_missings[c(1:11,26:43,46:57,59:60),]
df_missings_ncbi <- df_missings[c(12:25,73),]
df_missings_v2 <- df_missings[c(44:45,58,61:72,74:75),]

#Add back in the missing peptides
V2_IDs <- c(V2_IDs, df_missings_v2$ID)
Uniprot_IDs <- c(Uniprot_IDs, df_missings_uniprot$ID)
NCBI_IDs <- c(NCBI_IDs, df_missings_ncbi$ID)

#Check that every peptide is now accounted for
length(unique(c(V4_IDs,V2_IDs,Uniprot_IDs,NCBI_IDs)))==nrow(df)
## [1] TRUE
#Venn Diagram of Peptide distrubution across databases
venn.diagram(x=list(V2=V2_IDs, V4=V4_IDs, NCBI=NCBI_IDs, Uniprot=Uniprot_IDs),
             fill=c("#BDB7D6", "#96CAC2", "#CAC69C", "#D6B4A7"),
             filename = "PeptideVen.png")
## [1] 1

Create master comparison table - accessions X protein

#Function to separate "Accessions" column into individual rows with one accession each
SepFun <- function(df){
  separate_rows(df, Accessions, sep = "\\|")
}
#First split df by Group
df_split <- split(df, df$Group)
#Split again by Accession
df_split_sep <- lapply(df_split, SepFun)
#Function to collect unique accessions from each DF (of df_split_sep) and make it into a single row per group name
SplitIntoAccessionDFsfun <- function(x){
test2 <- data.frame(unique(x$Accessions))
test3 <- as.data.frame(t(test2))
rownames(test3)<- paste0("Group",(x[1,3]))
test3
}

df_split_sep_AccDFs <- lapply(df_split_sep, SplitIntoAccessionDFsfun)
df_split_sep_AccDFs_rbind <- t(rbind.fill(lapply(df_split_sep_AccDFs, as.data.table, keep.rownames=T)))

#Save data
saveRDS(df_split_sep_AccDFs_rbind, "df_split_sep_AccDFs_rbind")

Proteins

#Import protein data
ProtDF <- read.csv("V2-V4-NCBI-UniProt_Proteins.csv", stringsAsFactors = F)
#Trim to just Group, Score, and Accession
ProtDF <- ProtDF[,c(2,3,6)]
#Split by Group
ProtDF_split <- split(ProtDF, ProtDF$Group)
#Identify proteins accessions by grep
ProtDF_V4 <- ProtDF[(grep("Zm00", ProtDF$Accession)),]
ProtDF_V2 <- ProtDF[(grep("GRMZM", ProtDF$Accession)),]
ProtDF_NCBI <- ProtDF[(grep("P_", ProtDF$Accession)),]
ProtDF_NCBI <- rbind(ProtDF_NCBI, ProtDF[(grep("gi_", ProtDF$Accession)),])
ProtDF_Uniprot <- ProtDF[(grep("A0", ProtDF$Accession)),]
#Identify proteins that were missed in grep
ProtDF_uniquegroups <- unique(c(ProtDF_V4$Group, ProtDF_V2$Group, ProtDF_NCBI$Group, ProtDF_Uniprot$Group))
ProtDF_missings <- ProtDF[!(ProtDF$Group%in%ProtDF_uniquegroups),]
ProtDF_Uniprot <- rbind(ProtDF_Uniprot, ProtDF_missings[c(1:19,22,23),])
ProtDF_V2 <- rbind(ProtDF_V2, ProtDF_missings[c(20,21,24:40),])
#Check that every protein is now accounted for
length(unique(c(ProtDF_V4$Group, ProtDF_V2$Group, ProtDF_NCBI$Group, ProtDF_Uniprot$Group)))==length(unique(ProtDF$Group))
## [1] TRUE
#Define Proteins (i.e. Groups)
V4_groups <- unique(ProtDF_V4$Group)
V2_groups <- unique(ProtDF_V2$Group)
NCBI_groups <- unique(ProtDF_NCBI$Group)
Uniprot_groups <- unique(ProtDF_Uniprot$Group)
#Venn Diagram of Protein distrubution across databases
venn.diagram(x=list(V2=V2_groups, V4=V4_groups, NCBI=NCBI_groups, Uniprot=Uniprot_groups),
             fill=c("#BDB7D6", "#96CAC2", "#CAC69C", "#D6B4A7"),
             filename = "ProteinVen.png")
## [1] 1

Make new genome annotation “V4pro” by finding non-redundant gene set and re-naming

#define venn diagram overlaping areas
V2_NCBI_Uniprot_groups <-unique(c(V2_groups, NCBI_groups, Uniprot_groups))
GroupsNotInV4 <- V2_NCBI_Uniprot_groups[!(V2_NCBI_Uniprot_groups%in%V4_groups)]
V4_only_groups <- V4_groups[!(V4_groups%in%V2_NCBI_Uniprot_groups)]

#function to pick name
V4proFun <- function(x){
  DF <- data.frame("Group"=c(1:length(x)), "V4"=NA, "Preferred"=NA)
  for(i in 1:length(x)){
    if(i%in%V4_groups){
      df <- x[[i]]
      df2 <- df[(grep("Zm00", df$Accession)),]
      df3 <- df2[order(-df2$Score, df2$Accession),]
      DF[i,"V4"]<- df3[1,3]}
      else{DF[i,"V4"]=NA}
  }
  for(i in 1:length(x)){
    df <- x[[i]]
    df2 <- df[(df$Score==max(df$Score)),]
    if(sum(grepl("Zm00",df2$Accession))>0){DF[i,"Preferred"]=NA}
    else if(sum(grepl("NP_",df2$Accession))>0){
      df3<-df2[(grep("NP_",df2$Accession)),]
      DF[i,"Preferred"]<- df3[1,3]
    }
    else if(sum(grepl("XP_",df2$Accession))>0){
      df3<-df2[(grep("XP_",df2$Accession)),]
      DF[i,"Preferred"]<- df3[1,3]
    }
    else if(sum(grepl("gi_",df2$Accession))>0){
      df3<-df2[(grep("gi_",df2$Accession)),]
      DF[i,"Preferred"]<- df3[1,3]
    }
    else if(sum(grepl("GRMZM",df2$Accession))>0){
      df3<-df2[(grep("GRMZM",df2$Accession)),]
      DF[i,"Preferred"]<- df3[1,3]
    }
    else{DF[i,"Preferred"]<- df2[1,3]}
  }
  DF
}

#run
V4proDF <- V4proFun(ProtDF_split)
#Make v4pro accession call
V4proDF$V4pro <- V4proDF$Preferred
#Set to v4 accession if there is no preferred accession
V4proDF$V4pro[is.na(V4proDF$V4pro)] <- V4proDF$V4[is.na(V4proDF$V4pro)]