After fertilization but prior to the onset of zygotic transcription, the C. elegans zygote cleaves asymmetrically to create the anterior AB and posterior P1 blastomeres, each of which goes on to generate distinct cell lineages. To understand how patterns of RNA inheritance and abundance arise after this first asymmetric cell division, we pooled hand-dissected AB and P1 blastomeres and performed RNA-seq (Study GSE59943).
The information which connects the sample information from GEO with the SRA run id is downloaded from SRA using the Send to: File button.
We used the info in the downloaded SraAccList.txt file to download the SRA files. We used the system function to invoke aan OS command. xargs can converts each line of a file into an argument.
The fasterq-dump function will then download and convert each corresponding sra file from the SRA archive to a fastq file.
The fasterq-dump is a tool from the sra-tools suite that can be used to download sra files and to convert them into the fastq format.
system("xargs fasterq-dump -p < SraAccList.txt")
For illustration purposes we will download subsampled fastq files from the course github site.
<- "https://github.com/statOmics/SGA/archive/refs/heads/elegansFastq.zip"
url <- "elegansFastq.zip"
destFile
download.file(url, destFile)
unzip(destFile, exdir = "./", overwrite = TRUE)
if (file.exists(destFile)) {
#Delete file if it exists
file.remove(destFile)
}
## [1] TRUE
Assess the fastq files with fastQC.
We will make the input file for the hisat aligner that will be called through the QUASAR package.
<- list.files(path = "SGA-elegansFastq",full.names = TRUE, pattern = "fastq")
fastqFiles
<- fastqFiles |>
samples substr(start=18,stop=27)
<- tibble(FileName=fastqFiles,SampleName=samples)
fileInfo fileInfo
## # A tibble: 6 × 2
## FileName SampleName
## <chr> <chr>
## 1 SGA-elegansFastq/SRR1532959sub.fastq.gz SRR1532959
## 2 SGA-elegansFastq/SRR1532960sub.fastq.gz SRR1532960
## 3 SGA-elegansFastq/SRR1532961sub.fastq.gz SRR1532961
## 4 SGA-elegansFastq/SRR1532962sub.fastq.gz SRR1532962
## 5 SGA-elegansFastq/SRR1532963sub.fastq.gz SRR1532963
## 6 SGA-elegansFastq/SRR1532964sub.fastq.gz SRR1532964
We write the fileInfo to disk
write_tsv(fileInfo,file="fastQfiles.txt")
<- "https://ftp.ensembl.org/pub/release-108/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz"
urlGenome <- "Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz"
genomeDestFile <- "https://ftp.ensembl.org/pub/release-108/gff3/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.108.gff3.gz"
urlGff <- "Caenorhabditis_elegans.WBcel235.108.gff3.gz"
gffDestFile
download.file(url = urlGenome ,
destfile = genomeDestFile)
download.file(url = urlGff,
destfile = gffDestFile)
gunzip(genomeDestFile, overwrite = TRUE, remove=TRUE)
gunzip(gffDestFile, overwrite = TRUE, remove=TRUE)
list.files()
## [1] "AhlmannEltze2021.pdf"
## [2] "backgroundProteomicsDataAnalysis.pdf"
## [3] "Caenorhabditis_elegans.WBcel235.108.gff3"
## [4] "Caenorhabditis_elegans.WBcel235.dna.toplevel.fa"
## [5] "elegansAlignmentCountTable.html"
## [6] "elegansAlignmentCountTable.Rmd"
## [7] "fastQfiles.txt"
## [8] "Genome Res.-2008-Marioni-1509-17.pdf"
## [9] "illumina_sequencing_introduction.pdf"
## [10] "intro.pdf"
## [11] "martens_proteomics_bioinformatics.pdf"
## [12] "proteomics_data_analysis.pdf"
## [13] "SGA-elegansFastq"
## [14] "stagewiseTesting.pdf"
We are aligning RNA-seq reads and have to use a gap aware
alignment modus. We therefore use the argument
splicedAlignment = TRUE
The project is a single-end sequencing project! So we use the
argument paired = "no"
.
suppressPackageStartupMessages({
library(QuasR)
library(Rhisat2)
})<- "fastQfiles.txt"
sampleFile <- "Caenorhabditis_elegans.WBcel235.dna.toplevel.fa"
genomeFile <- qAlign(
projElegans sampleFile = sampleFile,
genome = genomeFile,
splicedAlignment = TRUE,
aligner = "Rhisat2",
paired="no")
## Creating .fai file for: /Users/lclement/Dropbox/statOmics/SGA/docs/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa
## alignment files missing - need to:
## create alignment index for the genome
## create 6 genomic alignment(s)
## Creating an Rhisat2 index for /Users/lclement/Dropbox/statOmics/SGA/docs/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa
## Finished creating index
## Testing the compute nodes...OK
## Loading QuasR on the compute nodes...preparing to run on 1 nodes...done
## Available cores:
## lievens-MacBook-Pro.local: 1
## Performing genomic alignments for 6 samples. See progress in the log file:
## /Users/lclement/Dropbox/statOmics/SGA/docs/QuasR_log_de5935c6c78.txt
## Genomic alignments have been created successfully
The gff3 file contains information on the position of features, e.g. exons and genes, along the chromosome.
suppressPackageStartupMessages({
library(GenomicFeatures)
library(Rsamtools)
})<- "Caenorhabditis_elegans.WBcel235.108.gff3"
annotFile
#chrLen <- scanFaIndex(genomeFile)
#chrominfo <- data.frame(chrom = as.character(seqnames(chrLen)),
# length = width(chrLen),
# is_circular = rep(FALSE, length(chrLen)))
<- makeTxDbFromGFF(file = annotFile,
txdb dataSource = "Ensembl",
organism = "Caenorhabditis elegans")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .extract_exons_from_GRanges(exon_IDX, gr, mcols0, tx_IDX, feature = "exon", : 15639 exons couldn't be linked to a transcript so were dropped (showing
## only the first 6):
## seqid start end strand ID Name Parent Parent_type
## 1 I 31523 31543 + <NA> Y74C9A.7.e1 transcript:Y74C9A.7 <NA>
## 2 I 32415 32435 + <NA> Y74C9A.8.e1 transcript:Y74C9A.8 <NA>
## 3 I 115656 115676 - <NA> F53G12.14.e1 transcript:F53G12.14 <NA>
## 4 I 272603 272623 - <NA> C53D5.8.e1 transcript:C53D5.8 <NA>
## 5 I 272607 272627 - <NA> C53D5.7.e1 transcript:C53D5.7 <NA>
## 6 I 463304 463324 + <NA> W04C9.7.e1 transcript:W04C9.7 <NA>
## OK
With the qCount function we can count the overlap between the aligned reads and the genomic features of interest.
<- qCount(projElegans, txdb, reportLevel = "gene") geneCounts
## extracting gene regions from TxDb...done
## counting alignments...done
## collapsing counts by query name...done
head(geneCounts)
## width SRR1532959 SRR1532960 SRR1532961 SRR1532962 SRR1532963
## WBGene00000001 1806 109 57 135 67 202
## WBGene00000002 1739 0 0 0 0 1
## WBGene00000003 1738 1 3 4 4 3
## WBGene00000004 2027 3 2 2 1 3
## WBGene00000005 1821 0 0 0 0 0
## WBGene00000006 1937 0 0 0 0 0
## SRR1532964
## WBGene00000001 147
## WBGene00000002 1
## WBGene00000003 6
## WBGene00000004 5
## WBGene00000005 0
## WBGene00000006 0
Save count table for future use.
write.csv(as.data.frame(geneCounts),file = "elegansCountTable.csv")
With respect to reproducibility, it is highly recommended to include a session info in your script so that readers of your output can see your particular setup of R.
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Big Sur 11.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] Rsamtools_2.12.0 Biostrings_2.64.0 XVector_0.36.0
## [4] GenomicFeatures_1.48.3 AnnotationDbi_1.58.0 Biobase_2.56.0
## [7] Rhisat2_1.13.1 QuasR_1.36.0 Rbowtie_1.36.0
## [10] GenomicRanges_1.48.0 GenomeInfoDb_1.32.2 IRanges_2.30.0
## [13] S4Vectors_0.34.0 BiocGenerics_0.42.0 R.utils_2.11.0
## [16] R.oo_1.24.0 R.methodsS3_1.8.1 forcats_0.5.1
## [19] stringr_1.4.1 dplyr_1.0.9 purrr_0.3.4
## [22] readr_2.1.2 tidyr_1.2.0 tibble_3.1.7
## [25] ggplot2_3.3.6 tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] googledrive_2.0.0 colorspace_2.0-3
## [3] deldir_1.0-6 hwriter_1.3.2.1
## [5] rjson_0.2.21 ellipsis_0.3.2
## [7] fs_1.5.2 rstudioapi_0.13
## [9] bit64_4.0.5 fansi_1.0.3
## [11] lubridate_1.8.0 xml2_1.3.3
## [13] cachem_1.0.6 knitr_1.40.1
## [15] jsonlite_1.8.0 broom_0.8.0
## [17] dbplyr_2.1.1 png_0.1-7
## [19] SGSeq_1.30.0 compiler_4.2.0
## [21] httr_1.4.3 backports_1.4.1
## [23] assertthat_0.2.1 Matrix_1.4-1
## [25] fastmap_1.1.0 gargle_1.2.0
## [27] cli_3.3.0 prettyunits_1.1.1
## [29] htmltools_0.5.2 tools_4.2.0
## [31] igraph_1.3.2 gtable_0.3.0
## [33] glue_1.6.2 GenomeInfoDbData_1.2.8
## [35] rappdirs_0.3.3 ShortRead_1.54.0
## [37] Rcpp_1.0.8.3 cellranger_1.1.0
## [39] jquerylib_0.1.4 vctrs_0.4.1
## [41] rtracklayer_1.56.0 xfun_0.33
## [43] rvest_1.0.2 lifecycle_1.0.1
## [45] restfulr_0.0.13 XML_3.99-0.9
## [47] googlesheets4_1.0.0 zlibbioc_1.42.0
## [49] scales_1.2.0 vroom_1.5.7
## [51] BSgenome_1.64.0 VariantAnnotation_1.42.1
## [53] hms_1.1.1 MatrixGenerics_1.8.0
## [55] SummarizedExperiment_1.26.1 RColorBrewer_1.1-3
## [57] curl_4.3.2 yaml_2.3.5
## [59] memoise_2.0.1 sass_0.4.1
## [61] biomaRt_2.52.0 latticeExtra_0.6-30
## [63] stringi_1.7.8 RSQLite_2.2.14
## [65] highr_0.9 BiocIO_1.6.0
## [67] filelock_1.0.2 BiocParallel_1.30.2
## [69] GenomicFiles_1.32.1 rlang_1.0.2
## [71] pkgconfig_2.0.3 bitops_1.0-7
## [73] matrixStats_0.62.0 evaluate_0.16
## [75] lattice_0.20-45 GenomicAlignments_1.32.0
## [77] bit_4.0.4 tidyselect_1.1.2
## [79] magrittr_2.0.3 R6_2.5.1
## [81] generics_0.1.2 RUnit_0.4.32
## [83] DelayedArray_0.22.0 DBI_1.1.2
## [85] pillar_1.7.0 haven_2.5.0
## [87] withr_2.5.0 KEGGREST_1.36.0
## [89] RCurl_1.98-1.6 modelr_0.1.8
## [91] crayon_1.5.1 interp_1.1-3
## [93] utf8_1.2.2 BiocFileCache_2.4.0
## [95] tzdb_0.3.0 rmarkdown_2.14
## [97] jpeg_0.1-9 progress_1.2.2
## [99] grid_4.2.0 readxl_1.4.0
## [101] blob_1.2.3 reprex_2.0.1
## [103] digest_0.6.29 munsell_0.5.0
## [105] bslib_0.3.1