SRA toolkit tutorial
Rémy VILLETTE
04/05/2021
|
NCBI made a very useful tool to download quickly and accurately large sequencing data stored on SRA, ENA or DDBJ servers. You will need to install, configure and get accession list for your samples. The toolkit will download SRA formatted files and then transform them in fastq.
Install the SRA toolkit
You first need to download the SRA toolkit from NCBI : https://github.com/ncbi/sra-tools/wiki/01.-Downloading-SRA-Toolkit
#for LINUX UBUNTU
wget --output-document sratoolkit.tar.gz http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-ubuntu64.tar.gz
#FOR MAC OS
curl --output sratoolkit.tar.gz http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-mac64.tar.gz
Configure your toolkit
You need to configure it as explained here : https://github.com/ncbi/sra-tools/wiki/03.-Quick-Toolkit-Configuration
Rapidly you can just run this in your terminal and chose the directory you want the SRA toolkit to download into. If you don’t perform this step the toolkit will download in the directory were you put it, usually in your root. So change this to a directory with a large storage capacity.
/Users/mala76/sratoolkit/sratoolkit.2.10.9-mac64/bin/vdb-config -i
Download the SRA formatted files.
Once the directory is set you can start to get your data. Go to SRA entrez in NCBI : https://www.ncbi.nlm.nih.gov/sra/docs/sradownload/
First get the filenames in .txt format from the run selector or the sra entrez search. You need to go to the SRA entrez on NCBI and use the accession number to find your samples. Paste the accession number in the searching zone, press enter. Then on the top right of the panel you’ll find “Send to” button. Select “Run selector”. Select .SRA entrez usually covers for SRA, ENA, DDBJ and more.
setwd("/media/remy/Stockage/Database_fastq/SOP_Early_LIFE")
path <- "/home/remy/sratoolkit.current-centos_linux64/sratoolkit.2.10.7-centos_linux64/bin" #make the path to the toolkit directory
func_fetch<- paste0(path, "/prefetch --option-file") #prepare the function
Here you can chose the directory where you want your SRA formatted files to be stored. If you already did it in the vdb -configure -i you don’t need to redo it. You can also use this code if you want to change temporarily the output-directory.
# This section of code is to make sure that files do not contain WGS data
library(tidyverse)
# give an output file to where the files will be downloaded
f <- list.files("/home/remy/Documents/SOP_early_life_R/Datas", pattern = "SraRunTable.txt")
accession <- list()
metadata <- list()
not_shotgun <- list()
shotgun <- list()
sink("metadata cleaning.txt")
for(i in f){
tmp= str_split(i, pattern="_", simplify = T)[1]
tmp= str_split(i, pattern =" ", simplify = T )[1]
regex_match <- "shotgun|WGS|whole genome|all_genome|WXS|WholeGenomeShotgun|Whole genome shotgun|Metatranscriptomic|WXS|NextSeq"
metadata[[tmp]] <- read.delim(paste0("/home/remy/Documents/SOP_early_life_R/Datas/", tmp, " SraRunTable.txt"), sep=",")
bool= sum( grepl(regex_match, metadata[[tmp]]))
if( bool > 0)
{
#find out which accession_list contains shotgun data
nb = matches(vars= metadata[[tmp]], c("shotgun", "WGS", "whole genome", "all_genome", "WXS", "METATRANSCRIPTOMIC", "NextSeq", "WholeGenomeShotgun"))
nb=
cat(tmp, "\n", "column",paste0(colnames(metadata[[tmp]])[nb], collapse = " | "), "\n", "contains a pattern related to SHOTGUN sequencing, there is shotgun innit \n")
shotgun[tmp] = tmp
}
# For those that are only 16S
if (bool == 0)
{
not_nb = matches(vars= metadata[[tmp]], c("pcr", "Amplicon", "OTHER", "16S"))
cat(tmp, "\n", "column", paste0(colnames(metadata[[tmp]])[not_nb], collapse = " | "), "\n", "seems to be only 16S rRNA gene \n")
# tmp2 = str_split(i, pattern="_", simplify = T)[1]
not_shotgun[[tmp]]<- tmp
}
}
sink(file = NULL)
#remove them from the accession_list and give a new accession_list with only the 16S samples
`%notin%` = Negate(`%in%`)
dir.create("/clean_Datas")
for(j in shotgun){
meta= metadata[[j]]
tmp = meta %>% filter(if_any(everything(),~str_detect(., regex_match)))
meta= meta[meta$Run%notin%tmp$Run,]
vec= meta %>% select(Run)
if(dim(meta)[1]>0){
write.table(x= vec, file = paste0("./clean_datas/", j , "_SRR_Acc_List.txt"), sep = "\t", row.names = F, col.names = F, quote = F)
write.table(x= meta, file = paste0("./clean_datas/", j , "_SraRunTable.txt"), sep="\t", row.names = F, col.names = T, quote = F)
}
}
# export the modified data
for(i in not_shotgun) {
write.table(x= metadata[[i]], file = paste0("./clean_datas/",i, "_SraRunTable.txt"), row.names = F, sep = "\t", col.names = T, quote = F)
write.table(x= metadata[[i]][,'Run'], file = paste0("./clean_datas/", i, "_SRR_Acc_List.txt"), sep = "\t", row.names = F, col.names = F, quote = F)
}
new_meta=list()
for(i in list.files("./clean_datas", pattern = "_SraRunTable.txt")){
tmp= str_split(i, pattern="_", simplify = T)[1]
tmp= str_split(tmp, " ")[[1]][1]
new_meta[[tmp]] = read.delim(file = paste0("/home/remy/Documents/SOP_early_life_R/clean_datas/", tmp, "_SraRunTable.txt"), sep="\t", header = T)
}
“PRJEB26419” project contains hidden WGS and transcriptomic datas. For this specific paper we need to subset the data manually
You can now proceed with the function. It will download SRA formatted files (not compressed however) in the directory you chose. Give the accession list you downloaded from NCBI.
For this tutorial we will use the accession number : PRJEB2079.
lstfil <- list.files("/home/remy/Documents/SOP_early_life_R/clean_datas", pattern = "_SRR_Acc_List")
#store the accession name for each project
project_number = str_split(lstfil, "_", simplify = T)[,1]
nb=31
outdir <- paste0("/media/remy/Stockage/Database_fastq/SOP_Early_LIFE/sra/", project_number[nb])
# accession<- file.choose() # the accession list to download
accession<- paste0( "/home/remy/Documents/SOP_early_life_R/clean_datas/", project_number[nb], "_SRR_Acc_List.txt" )
cmd1<- paste(func_fetch, accession , "--output-directory", outdir, "--progress") # here goes the function
system(cmd1) # launch it, it will take some time depending on the number of sample that you are downloading
#Troubleshooting In case you need to redownload some files or you’re code stopped. If the utils function find an existing fastq it will stop the loop. For that we remove these fastqs from the list. For the files that stopped downloading
library(tidyverse)
dir2<- outdir #Get the directory were files are stored
non_files = list.files(dir2, pattern=".prf|lock|.tmp")
for(i in non_files){
x= str_remove(i, ".sra.prf|.sra.tmp|.sra.lock")
print(x)
cmd1<- paste( paste0(path, "/prefetch "), x , "--output-directory", outdir, "--progress") # here goes the function
system(cmd1)
}
Transform SRA formatted data into fastq.
There is a subtility there : be aware that some files will be uploaded as single-end sequence files, meaning there is no Forward and Reverse files. In that case you just need to adjust the functions gzip.
Presently fasterq-dump doesn’t allow compression so we need to use external compression function.
# You may encounter a problem and the code stopped. If a file has allready been done the utils function will block the loop
# In that manner there is a character vector to detect the files allready done
files<- list.files(dir2) #make a lit of the samples
non_files = list.files(dir2, pattern=".prf|lock|.tmp")
redo= str_remove(non_files, pattern = ".prf|.lock|.tmp")
outdir2<- paste0("/media/remy/Stockage/Database_fastq/SOP_Early_LIFE/fastq/", project_number[nb]) # chose an directory to store the fastq
done= list.files(outdir2)
done = str_remove(done, pattern = ".fastq.gz|_1.fastq.gz|_2.fastq.gz")
for(f in files[!(files%in%done | files%in%redo | files%in%non_files)]) {
dif<- paste0(dir2,"/", f)
func<- paste0(path, "/fasterq-dump --split-files --progress")
cmd2 = paste(func, dif, "--outdir", outdir2)
cat(f,"\n")#print the current command
system(cmd2) # invoke command
if(file.exists(paste0(outdir2,"/", f, "_1.fastq"))){
# now compress the files
R.utils::gzip(paste0(outdir2,"/", f, "_1.fastq"), remove=T)
R.utils::gzip(paste0(outdir2,"/", f, "_2.fastq"), remove=T)
} else{
R.utils::gzip(paste0(outdir2,"/", f, ".fastq"), remove=T)
}
}
Finally, we want to remove SRA formatted files as they are big and useless now.
outdir2 <- list.files(outdir)
outdir2 <- paste0(outdir,"/",outdir2)
for (dl in outdir2){
file.remove(dl)
}
Work metadata from multiple SRA projects.
The challenge is now to homogenize the metadata in order to merge them later. Metadata are given by authors and can vary from 6 columns to 50, with different column names for the same information (ex: “Host_age” and “Age”). For now we just want to import the metadata files in R to analyze the names of the columns and their frequencies before considering any modifications.
# Get frequencies of data types - used for metadata harmonization
colname <- NULL
data_name= NULL
dim_metadata= NULL
for (i in names(new_meta)){
colname <- c(colname, colnames(new_meta[[i]]))
data_name[[i]]= colnames(new_meta[[i]])
dim_metadata= rbind(dim_metadata,data_frame(project= i, factors= dim(new_meta[[i]])[2], samples= dim(new_meta[[i]])[1]))
}
tmp = as.data.frame(table(colname))
tmp = tmp[order(-tmp$Freq),]
sum(tmp$Freq==49)
filter(tmp, Freq==49)%>%View()
write.csv(. ,"shared_col.csv")
filter(tmp, Freq==1)%>% View()
sum(tmp$Freq==1)
sum(dim_metadata$samples)
Change the column names work on progress.
metadat= metadata
for(i in names(new_meta)){
nb = str_detect(colnames(new_meta[[i]]), "Age|_age_|AGE")
print(colnames(new_meta[[i]][1:5,])[ nb])
}