# Set the working directory; the relative input and output file names used in
# the rest of this script are resolved against it.
# NOTE(review): hard-coded, machine-specific path -- confirm before running on
# another machine.
setwd("~/Desktop")

library(jsonlite)
library(tibble)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the genome listings saved from each site's API and turn each one into a
# data frame
## Galaxy API: https://usegalaxy.org/api/genomes
## Galaxy EU API: https://usegalaxy.eu/api/genomes
## UCSC API: https://api.genome.ucsc.edu/list/ucscGenomes
galaxy_us_genomes <- as.data.frame(fromJSON("genomes.json"))
galaxy_eu_genomes <- as.data.frame(fromJSON("genomes_eu.json"))

# For UCSC, only the "ucscGenomes" element of the parsed file is converted
sample_data <- fromJSON("ucscGenomes.json")
ucsc_genomes <- as.data.frame(sample_data[["ucscGenomes"]])

# Combine the US and EU Galaxy listings into one data frame; rows that appear
# in only one of the two are kept as well
galaxy_genomes <- merge(
  x = galaxy_us_genomes,
  y = galaxy_eu_genomes,
  all.x = TRUE,
  all.y = TRUE
)
head(galaxy_genomes)
##                                                                         V1
## 1 A. ceylanicum Mar. 2014 (WS243/Acey_2013.11.30.genDNA/ancCey1) (ancCey1)
## 2                      A. gambiae Feb. 2003 (IAGEC MOZ2/anoGam1) (anoGam1)
## 3                          A. gambiae Oct. 2006 (AgamP3/anoGam3) (anoGam3)
## 4                    A. mellifera 04 Nov 2010 (Amel_4.5/apiMel4) (apiMel4)
## 5                      A. mellifera genome (apiMel3, Baylor HGSC Amel_3.0)
## 6                         A. mellifera genome (apiMel3.1 NCBI Amel_HAv3.1)
##          V2
## 1   ancCey1
## 2   anoGam1
## 3   anoGam3
## 4   apiMel4
## 5   apiMel3
## 6 apiMel3_1
# Clean up the UCSC dataset into two columns: V1 (scientific name) and V2
# (genome key)
# The ".scientificName" suffix is matched literally: an unescaped "." in a
# regular expression matches any character, so it is escaped here, and the
# pattern is anchored to the end of the name.
name_suffix <- "\\.scientificName$"
# First, keep only those columns with the name identifier. drop = FALSE keeps
# a data frame (and its column names) even when only one column matches.
temp <- ucsc_genomes[, grepl(name_suffix, names(ucsc_genomes)), drop = FALSE]
# Transpose the dataset so each column becomes a row, then turn the former
# column names (now row names) into a new column called "V2"
ucsc_genomes <- rownames_to_column(as.data.frame(t(temp)), var = "V2")
# Reorder the dataset so that the columns are in a logical order
ucsc_genomes <- ucsc_genomes[, c("V1", "V2")]
# Remove the ".scientificName" suffix from those name identifiers
ucsc_genomes$V2 <- sub(name_suffix, "", ucsc_genomes$V2)
head(ucsc_genomes)
##                           V1      V2
## 1     Ailuropoda melanoleuca ailMel1
## 2 Alligator mississippiensis allMis1
## 3        Anolis carolinensis anoCar1
## 4        Anolis carolinensis anoCar2
## 5          Anopheles gambiae anoGam1
## 6          Anopheles gambiae anoGam3
# Join the combined Galaxy listing with the UCSC listing on the genome key
# (V2), keeping keys that appear on only one side
unique_genomes <- merge(
  x = galaxy_genomes,
  y = ucsc_genomes,
  by = "V2",
  all.x = TRUE,
  all.y = TRUE
)
head(unique_genomes)
##      V2                                                    V1.x V1.y
## 1     ?                                         unspecified (?) <NA>
## 2   100                      Mycoplasma pulmonis UAB CTIP (100) <NA>
## 3   106                        Halobacterium salinarum R1 (106) <NA>
## 4 10616 Mycoplasma mycoides subsp. mycoides SC str. PG1 (10616) <NA>
## 5 10638      Yersinia pestis biovar Microtus str. 91001 (10638) <NA>
## 6 10639                   Mycoplasma hyopneumoniae 7448 (10639) <NA>
# Load the IGB genomes from the tab-separated, headerless synonyms.txt
# NOTE(review): only three columns are named here -- presumably no line has
# more than three fields; confirm against synonyms.txt
igb_genomes <- read.table(
  file = "synonyms.txt",
  header = FALSE,
  sep = "\t",
  fill = TRUE,
  col.names = c("V1", "V2", "V0")
)
head(igb_genomes)
##                                 V1      V2
## 1            A_albopictus_Jan_2024  AalbF5
## 2             A_australis_Jun_2015 aptMan1
## 3           A_californica_Sep_2008 aplCal1
## 4          A_carolinensis_Feb_2007 anoCar1
## 5          A_carolinensis_May_2010 anoCar2
## 6 A_chrysaetos_canadensis_Oct_2014 aquChr2
##                                        V0
## 1                                        
## 2                                        
## 3                                        
## 4 Anolis_carolinensis.AnoCar1.0.reference
## 5                               AnoCar2.0
## 6
# Attach the IGB names to unique_genomes by genome key (V2), keeping every row
# of unique_genomes whether or not IGB has a match
unique_genomes_2 <- merge(
  x = unique_genomes,
  y = igb_genomes,
  by = "V2",
  all.x = TRUE
)
names(unique_genomes_2) <- c(
  "V2", "Galaxy genome", "UCSC genome", "IGB genome", "V0"
)

# Keep the key and the three genome-name columns (this drops V0)
unique_genomes_2 <- select(
  unique_genomes_2,
  c("V2", "Galaxy genome", "UCSC genome", "IGB genome")
)
# Keep only the first row for each genome key
unique_genomes_2 <- distinct(unique_genomes_2, V2, .keep_all = TRUE)
# Move the genome keys out of the V2 column and into the row names
unique_genomes_2 <- column_to_rownames(unique_genomes_2, var = "V2")

head(unique_genomes_2)
##                                                 Galaxy genome UCSC genome
## ?                                             unspecified (?)        <NA>
## 100                        Mycoplasma pulmonis UAB CTIP (100)        <NA>
## 106                          Halobacterium salinarum R1 (106)        <NA>
## 10616 Mycoplasma mycoides subsp. mycoides SC str. PG1 (10616)        <NA>
## 10638      Yersinia pestis biovar Microtus str. 91001 (10638)        <NA>
## 10639                   Mycoplasma hyopneumoniae 7448 (10639)        <NA>
##       IGB genome
## ?           <NA>
## 100         <NA>
## 106         <NA>
## 10616       <NA>
## 10638       <NA>
## 10639       <NA>
# Save the combined table as a CSV file; the file name is relative, so it is
# written to the current working directory
write.csv(
  x = unique_genomes_2,
  file = "galaxy-genomes-to-add-to-synonyms.csv"
)