LongBench is a comprehensive benchmarking resource for evaluating long-read RNA-seq analysis tools and methods. It provides a collection of datasets, reference files, and analysis workflows to facilitate the assessment of various aspects of long-read RNA-seq data analysis. In this tutorial, we will focus on differential gene expression (DGE) and differential transcript expression (DTE) analysis using edgeR and compare the results across different sequencing platforms, including Illumina, PacBio, and Oxford Nanopore Technologies (ONT) (cDNA and dRNA sequencing).
Bulk RNA-seq was performed to analyse gene and transcript expression across various cancer cell lines using both short- and long-read sequencing platforms.
Samples:
Total: 8 cancer cell lines (H146, H69, H526, H211, SHP77, H1975, H2228, HCC827) for each platform.
Sequencing Platforms:
Short-Read Sequencing:
Long-Read Sequencing:
PacBio
ONT PCR cDNA
ONT direct RNA (dRNA)
Groups by Cancer Cell Type:
Small Cell Lung Cancer (SCLC):
Subtype A (SCLC_A): H146, H69, SHP77.
Subtype P (SCLC_P): H526, H211.
Lung Adenocarcinoma (LUAD): H1975, H2228, HCC827.
Note: In this tutorial, we will focus on comparing SCLC_A vs LUAD. However, the same analysis can be applied to other comparisons, such as SCLC_P vs LUAD and SCLC_A vs SCLC_P.
# Define a consistent color palette for the sequencing platforms
color_palette <- c(
PacBio = "#df1995",
"ONT dRNA" = "#00789b",
"ONT cDNA" = "#04476c",
Illumina = "#e88b20"
)
The code below will read the RDS file directly from AWS S3 and load the following DGE objects into the global environment:
# read RDS file directly (TBD)
obj <- aws.s3::s3readRDS(object = "path/to/file.Rds", bucket = "bucket-name")
# load to global environment
list2env(obj, .GlobalEnv)
# remove the object from memory
rm(obj)
ill_bulk.gene.dge: Illuminaont_bulk.gene.dge: ONT cDNApb_bulk.gene.dge: PacBiodrna_bulk.gene.dge: ONT dRNAill_bulk.tx.dge: Illuminaont_bulk.tx.dge: ONT cDNApb_bulk.tx.dge: PacBiodrna_bulk.tx.dge: ONT dRNAAfter loading the DGE objects, we will filter for human genes only based on Ensembl IDs from Spike-ins.
# Function to identify human genes based on Ensembl IDs from Spike-ins
is.human <- function(dge) {grepl("^ENS", rownames(dge))}
ill_bulk.gene.dge <- ill_bulk.gene.dge[is.human(ill_bulk.gene.dge), ]
ont_bulk.gene.dge <- ont_bulk.gene.dge[is.human(ont_bulk.gene.dge), ]
pb_bulk.gene.dge <- pb_bulk.gene.dge[is.human(pb_bulk.gene.dge), ]
drna_bulk.gene.dge <- drna_bulk.gene.dge[is.human(drna_bulk.gene.dge), ]
ill_bulk.tx.dge <- ill_bulk.tx.dge[is.human(ill_bulk.tx.dge), ]
ont_bulk.tx.dge <- ont_bulk.tx.dge[is.human(ont_bulk.tx.dge), ]
pb_bulk.tx.dge <- pb_bulk.tx.dge[is.human(pb_bulk.tx.dge), ]
drna_bulk.tx.dge <- drna_bulk.tx.dge[is.human(drna_bulk.tx.dge), ]
Let’s first examine the library sizes across different sequencing platforms.
# Build combined data frame of library sizes
lib_sizes <- data.frame(
Sample = colnames(ill_bulk.tx.dge),
LibSize = colSums(ill_bulk.tx.dge$counts),
Dataset = "Illumina"
) %>%
rbind(data.frame(
Sample = colnames(drna_bulk.tx.dge),
LibSize = colSums(drna_bulk.tx.dge$counts),
Dataset = "ONT dRNA"
)) %>%
rbind(data.frame(
Sample = colnames(pb_bulk.tx.dge),
LibSize = colSums(pb_bulk.tx.dge$counts),
Dataset = "PacBio"
)) %>%
rbind(data.frame(
Sample = colnames(ont_bulk.tx.dge),
LibSize = colSums(ont_bulk.tx.dge$counts),
Dataset = "ONT cDNA"
))
# Plot with custom colors
ggplot(lib_sizes, aes(x = Dataset, y = LibSize, fill = Dataset)) +
geom_boxplot(outlier.shape = NA, alpha = 0.9) +
geom_jitter(width = 0.2, alpha = 0.6, size = 1) +
scale_fill_manual(values = color_palette) +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Library Size per Sample",
y = "Total Counts (Library Size)",
x = "Sequencing protocols"
) +
ylim(0, 65000000) +
theme_minimal() +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
For benchmarking purposes, only genes and transcripts common to all
datasets will be retained for downstream differential expression
analysis. Genes and transcripts are filtered using
edgeR::filterByExpr: the gene-level count matrix uses the
default minimum count threshold of 5, while the transcript-level count
matrix uses a reduced minimum count threshold of 5.
#GENE
## 1. get common genes
common_genes <- Reduce(intersect, lapply(list(ill_bulk.gene.dge, ont_bulk.gene.dge, pb_bulk.gene.dge, drna_bulk.gene.dge), rownames))
ill_bulk.gene.dge <- ill_bulk.gene.dge[common_genes, ]
ont_bulk.gene.dge <- ont_bulk.gene.dge[common_genes, ]
pb_bulk.gene.dge <- pb_bulk.gene.dge[common_genes, ]
drna_bulk.gene.dge <- drna_bulk.gene.dge[common_genes, ]
## 2. filter by expression
keep <- filterByExpr(ill_bulk.gene.dge, group = ill_bulk.gene.dge$samples$group) &
filterByExpr(ont_bulk.gene.dge, group = ont_bulk.gene.dge$samples$group) &
filterByExpr(pb_bulk.gene.dge, group = pb_bulk.gene.dge$samples$group) &
filterByExpr(drna_bulk.gene.dge, group = drna_bulk.gene.dge$samples$group)
#table(keep)
ill_bulk.gene.dge <- ill_bulk.gene.dge[keep, ]
ont_bulk.gene.dge <- ont_bulk.gene.dge[keep, ]
pb_bulk.gene.dge <- pb_bulk.gene.dge[keep, ]
drna_bulk.gene.dge <- drna_bulk.gene.dge[keep, ]
# Transcript
## 1. get common transcripts
common_tx <- Reduce(intersect, lapply(list(ill_bulk.tx.dge, ont_bulk.tx.dge, pb_bulk.tx.dge, drna_bulk.tx.dge), rownames))
ill_bulk.tx.dge <- ill_bulk.tx.dge[common_tx, ]
ont_bulk.tx.dge <- ont_bulk.tx.dge[common_tx, ]
pb_bulk.tx.dge <- pb_bulk.tx.dge[common_tx, ]
drna_bulk.tx.dge <- drna_bulk.tx.dge[common_tx, ]
## 2. filter by expression
keep <- filterByExpr(ill_bulk.tx.dge, group = ill_bulk.tx.dge$samples$group, min.count=5) &
filterByExpr(ont_bulk.tx.dge, group = ont_bulk.tx.dge$samples$group, min.count=5) &
filterByExpr(pb_bulk.tx.dge, group = pb_bulk.tx.dge$samples$group, min.count=5) &
filterByExpr(drna_bulk.tx.dge, group = drna_bulk.tx.dge$samples$group, min.count=5)
#table(keep)
ill_bulk.tx.dge <- ill_bulk.tx.dge[keep, ]
ont_bulk.tx.dge <- ont_bulk.tx.dge[keep, ]
pb_bulk.tx.dge <- pb_bulk.tx.dge[keep, ]
drna_bulk.tx.dge <- drna_bulk.tx.dge[keep, ]
Next, the gene and transcript counts using
calcNormFactors with “TMM” methods for each dataset.
ill_bulk.gene.dge <- calcNormFactors(ill_bulk.gene.dge, method = "TMM")
ont_bulk.gene.dge <- calcNormFactors(ont_bulk.gene.dge, method = "TMM")
pb_bulk.gene.dge <- calcNormFactors(pb_bulk.gene.dge, method = "TMM")
drna_bulk.gene.dge <- calcNormFactors(drna_bulk.gene.dge, method = "TMM")
ill_bulk.tx.dge <- calcNormFactors(ill_bulk.tx.dge, method = "TMM")
ont_bulk.tx.dge <- calcNormFactors(ont_bulk.tx.dge, method = "TMM")
pb_bulk.tx.dge <- calcNormFactors(pb_bulk.tx.dge, method = "TMM")
drna_bulk.tx.dge <- calcNormFactors(drna_bulk.tx.dge, method = "TMM")
Up until now, we have prepared the DGElist objects for each dataset (gene and transcript level) and normalised the counts using TMM. Let’s now explore the data using MDS plots and then proceed to differential expression analysis.
To explore the data across all datasets, we will merge the DGElist objects from each dataset (gene and transcript level) into a single DGElist object for gene-level and transcript-level counts, respectively. Please NOTE: The merged DGElist will only be used for the MDS plot. The DE analysis will be done separately for each dataset.
Now let’s create MDS plots to visualize the relationships between samples based on their gene and transcript expression profiles.
We first set up a function and myMDS to create the MDS
plot using ggplot2, so that we can easily reuse it for both gene-level
and transcript-level data and for different platforms
From the MDS plot, we can clearly see that samples cluster by cancer type (LUAD, SCLC type a, SCLC type p). However, at the transcript level, the samples also cluster by sequencing platform (Illumina, ONT cDNA, ONT dRNA, PacBio), suggesting that technical differences between platforms also contribute to the observed variation.
Now, let’s proceed to differential gene expression (DGE) analysis using the edgeR Quasi-Likelihood pipeline. We will perform DGE analysis separately for each dataset (Illumina, ONT cDNA, ONT dRNA, PacBio) using the filtered and normalised DGElist objects created earlier.SCLC type A vs. LUAD with the model:
Model:
model.matrix(~0 + group, data = dge$samples)
Differential expression gene and transcript lists are defined using
DecideTests with a log2 fold change cutoff of 0 and
FDR<0.05.
Let’s first set up the workflow as a function so we can repeat it across different platforms:
DGE.edgeR_human <- function(dge, norm.method = "TMM") {
design <- model.matrix(~0 + group, data = dge$samples)
dge <- estimateDisp(dge, design)
qlfit <- glmQLFit(dge, design)
contrasts <- makeContrasts(
AvsL = groupsclc_a - groupluad,
levels = design
)
AvsL.QLF.test <- glmQLFTest(qlfit, contrast = contrasts[, "AvsL"])
list(
AvsL.DE = AvsL.QLF.test
)
}
Now, let’s apply the function to the DGElist
objects.
# DEG analsysis on human genes only
ill_bulk.dge_human.list <- DGE.edgeR_human(ill_bulk.gene.dge)
ont_bulk.dge_human.list <- DGE.edgeR_human(ont_bulk.gene.dge)
drna_bulk.deg_human.list <- DGE.edgeR_human(drna_bulk.gene.dge)
pb_bulk.dge_human.list <- DGE.edgeR_human(pb_bulk.gene.dge)
After we get the DE genes from each sequencing platform, we can
overlap the DE genes and visualise them using an upset plot with
ComplexUpset
Again, let’s first set up the function for the plot:
Now we can plot the upset plot.
The results show that differentially expressed genes are largely consistent across platforms, with PacBio detecting the most DE genes, followed by ONT dRNA.
The same analysis can be done at the transcript level. Since the read-to-transcript assignment is sometimes uncertain, we applied the pipeline suggested in Baldoni et al. (2024).
The transcript level counts are scaled by the dispersion values recovered from Bootstrap sampling (50 Bootstrap sampling runs for each sample)
Normalisation: Scaled transcript counts are normalised using the
calcNormFactors function in edgeR with the TMM
method for each subset.
Differential Expression: Differential expression is assessed
using a GLM-based model with the glmQLFit and
glmQLFTest functions
DTE.edgeR_human <- function(dge, norm.method="TMM") {
# Scale
dge.scaled <- dge
dge.scaled$counts <- dge.scaled$counts / dge.scaled$genes$Overdispersion
# Normalisation
if (!is.null(norm.method)) {
dge <- calcNormFactors(dge, method = norm.method)
}
design <- model.matrix(~0+group, data = dge.scaled$samples)
dge.scaled <- estimateDisp(dge.scaled, design)
qlfit <- glmQLFit(dge.scaled, design)
contrasts <- makeContrasts(
AvsL = groupsclc_a - groupluad,
levels = design
)
AvsL.QLF.test <- glmQLFTest(qlfit, contrast = contrasts[, "AvsL"])
list(
AvsL.DE = AvsL.QLF.test
)
}
# get human list
ill_bulk.scl_dte_human.list <- DTE.edgeR_human(ill_bulk.tx.dge)
ont_bulk.scl_dte_human.list <- DTE.edgeR_human(ont_bulk.tx.dge)
pb_bulk.scl_dte_human.list <- DTE.edgeR_human(pb_bulk.tx.dge)
drna_bulk.scl_dte_human.list <- DTE.edgeR_human(drna_bulk.tx.dge)
# Upset plot
## SCLC_A vs LUCA (AvsL)
lst <- list(
"Illumina" = ill_bulk.scl_dte_human.list$AvsL.DE,
"ONT cDNA" = ont_bulk.scl_dte_human.list$AvsL.DE,
"PacBio" = pb_bulk.scl_dte_human.list$AvsL.DE,
"ONT dRNA" = drna_bulk.scl_dte_human.list$AvsL.DE
)
p <- complex_upset_plot(lst)$plot + ggtitle("SCLC_A vs LUCA") + theme(plot.title = element_text(size = 20))
mat.AvsL.DTE <- complex_upset_plot(lst)$matrix
p + patchwork::plot_annotation(
title = "DTE Human",
theme = theme(plot.title = element_text(size =25))
)
Here, the differential transcript expression (DTE) results visualised in the upset plots show less consistency when comparing the gene-level DE results.