# Switch the working directory to ~/Desktop. The JSON/TXT inputs read below and
# the CSV written at the end of the script are all given as bare file names, so
# they resolve against this directory.
# NOTE(review): this path is machine-specific; anyone running the script
# elsewhere has to edit it or place the input files in ~/Desktop.
setwd("~/Desktop")
library(jsonlite)
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the genome listings saved from each site's API and turn each one into a
# data frame. The APIs the listings come from:
## Galaxy API: https://usegalaxy.org/api/genomes
## Galaxy EU API: https://usegalaxy.eu/api/genomes
## UCSC API: https://api.genome.ucsc.edu/list/ucscGenomes
galaxy_us_genomes <- as.data.frame(fromJSON("genomes.json"))
galaxy_eu_genomes <- as.data.frame(fromJSON("genomes_eu.json"))
# For UCSC only the "ucscGenomes" element of the parsed file is converted.
sample_data <- fromJSON("ucscGenomes.json")
ucsc_genomes <- as.data.frame(sample_data[["ucscGenomes"]])
# Combine the US and EU Galaxy listings into a single table. No `by` is given,
# so merge() matches on every column the two tables share; rows without a match
# on the other side are kept from both inputs.
galaxy_genomes <- merge(
  x = galaxy_us_genomes,
  y = galaxy_eu_genomes,
  all.x = TRUE,
  all.y = TRUE
)
head(galaxy_genomes)
## V1
## 1 A. ceylanicum Mar. 2014 (WS243/Acey_2013.11.30.genDNA/ancCey1) (ancCey1)
## 2 A. gambiae Feb. 2003 (IAGEC MOZ2/anoGam1) (anoGam1)
## 3 A. gambiae Oct. 2006 (AgamP3/anoGam3) (anoGam3)
## 4 A. mellifera 04 Nov 2010 (Amel_4.5/apiMel4) (apiMel4)
## 5 A. mellifera genome (apiMel3, Baylor HGSC Amel_3.0)
## 6 A. mellifera genome (apiMel3.1 NCBI Amel_HAv3.1)
## V2
## 1 ancCey1
## 2 anoGam1
## 3 anoGam3
## 4 apiMel4
## 5 apiMel3
## 6 apiMel3_1
# Clean up the UCSC dataset
# The flattened UCSC table is wide: the scientific name of each genome sits in
# a column named "<genome key>.scientificName". Reshape it into the same
# two-column layout as the Galaxy tables (V1 = name, V2 = genome key).
#
# The suffix is matched as an escaped, end-anchored pattern: an unescaped "."
# is a regex wildcard, and an unanchored match could also hit (and later
# strip) the text in the middle of a column name.
name_col_pattern <- "\\.scientificName$"
# First, keep only those columns with the name identifier. drop = FALSE keeps
# the result a data frame even if only one column matches, so the column name
# (which becomes the genome key below) is not lost.
temp <- ucsc_genomes[, grepl(name_col_pattern, names(ucsc_genomes)),
                     drop = FALSE]
# Transpose the dataset and turn those name identifiers into a new column
# called "V2"
ucsc_genomes <- rownames_to_column(as.data.frame(t(temp)), var = "V2")
# Reorder the dataset so that the columns are in a logical order
ucsc_genomes <- ucsc_genomes[, c("V1", "V2")]
# Remove the trailing ".scientificName" from those name identifiers
ucsc_genomes$V2 <- sub(name_col_pattern, "", as.character(ucsc_genomes$V2))
head(ucsc_genomes)
## V1 V2
## 1 Ailuropoda melanoleuca ailMel1
## 2 Alligator mississippiensis allMis1
## 3 Anolis carolinensis anoCar1
## 4 Anolis carolinensis anoCar2
## 5 Anopheles gambiae anoGam1
## 6 Anopheles gambiae anoGam3
# Join the Galaxy and UCSC tables on the genome key column V2. Keys present in
# only one of the two tables are kept, with NA on the missing side. Both tables
# also have a V1 column, which merge() suffixes as V1.x (Galaxy) and V1.y (UCSC).
unique_genomes <- merge(
  x = galaxy_genomes,
  y = ucsc_genomes,
  by.x = "V2",
  by.y = "V2",
  all.x = TRUE,
  all.y = TRUE
)
head(unique_genomes)
## V2 V1.x V1.y
## 1 ? unspecified (?) <NA>
## 2 100 Mycoplasma pulmonis UAB CTIP (100) <NA>
## 3 106 Halobacterium salinarum R1 (106) <NA>
## 4 10616 Mycoplasma mycoides subsp. mycoides SC str. PG1 (10616) <NA>
## 5 10638 Yersinia pestis biovar Microtus str. 91001 (10638) <NA>
## 6 10639 Mycoplasma hyopneumoniae 7448 (10639) <NA>
# Load the IGB genome list from the tab-separated, header-less synonyms.txt.
# Three column names are supplied and fill = TRUE pads rows that have fewer
# fields, so V0 is blank wherever a row provides no third value.
igb_genomes <- read.table(
  file = "synonyms.txt",
  sep = "\t",
  header = FALSE,
  fill = TRUE,
  col.names = c("V1", "V2", "V0")
)
head(igb_genomes)
## V1 V2
## 1 A_albopictus_Jan_2024 AalbF5
## 2 A_australis_Jun_2015 aptMan1
## 3 A_californica_Sep_2008 aplCal1
## 4 A_carolinensis_Feb_2007 anoCar1
## 5 A_carolinensis_May_2010 anoCar2
## 6 A_chrysaetos_canadensis_Oct_2014 aquChr2
## V0
## 1
## 2
## 3
## 4 Anolis_carolinensis.AnoCar1.0.reference
## 5 AnoCar2.0
## 6
# Attach the IGB entries to the combined Galaxy/UCSC table by genome key (V2).
# Every row of unique_genomes is kept; IGB rows whose key is not already in the
# table are dropped.
unique_genomes_2 <- merge(
  x = unique_genomes,
  y = igb_genomes,
  by = "V2",
  all.x = TRUE,
  all.y = FALSE
)
# Give the columns readable labels (assigned by position)
names(unique_genomes_2) <- c(
  "V2", "Galaxy genome", "UCSC genome", "IGB genome", "V0"
)
# Keep the key and the three genome-name columns (dropping V0), keep only the
# first row for each key, then move the keys into the row names
unique_genomes_2 <- unique_genomes_2 %>%
  select(V2, `Galaxy genome`, `UCSC genome`, `IGB genome`) %>%
  distinct(V2, .keep_all = TRUE) %>%
  column_to_rownames(var = "V2")
head(unique_genomes_2)
## Galaxy genome UCSC genome
## ? unspecified (?) <NA>
## 100 Mycoplasma pulmonis UAB CTIP (100) <NA>
## 106 Halobacterium salinarum R1 (106) <NA>
## 10616 Mycoplasma mycoides subsp. mycoides SC str. PG1 (10616) <NA>
## 10638 Yersinia pestis biovar Microtus str. 91001 (10638) <NA>
## 10639 Mycoplasma hyopneumoniae 7448 (10639) <NA>
## IGB genome
## ? <NA>
## 100 <NA>
## 106 <NA>
## 10616 <NA>
## 10638 <NA>
## 10639 <NA>
# Save the final table as a CSV file in the working directory. Row names are
# written too, so the genome keys end up in the first column of the file.
write.csv(
  x = unique_genomes_2,
  file = "galaxy-genomes-to-add-to-synonyms.csv",
  row.names = TRUE
)