Skip to contents

Chi-square based differential transcription usage analysis. This variant is meant for single cell data. Takes the SingleCellExperiment object from sc_long_pipeline as input. Alternatively, the path to the output folder could be provided instead of the SCE object.

Usage

sc_DTU_analysis(sce, min_count = 15)

Arguments

sce

The SingleCellExperiment object from sc_long_pipeline, with the following metadata: file is required under the output folder of the SCE object.

min_count

The minimum UMI count threshold for filtering isoforms.

Value

a data.frame containing the following columns:

gene_id

- differentially transcribed genes

X_value

- the X value for the DTU gene

df

- degrees of freedom of the approximate chi-squared distribution of the test statistic

DTU_tr

- the transcript_id with the highest squared residuals

DTU_group

- the cell group with the highest squared residuals

p_value

- the p-value for the test

adj_p

- the adjusted p-value (by Benjamini–Hochberg correction)

The table is sorted by decreasing P-values. It will also be saved as sc_DTU_analysis.csv under the output folder.

Details

This function will search for genes that have at least two isoforms, each with more than min_count UMI counts. For each gene, the per cell transcript counts were merged by group to generate pseudo bulk samples. Grouping is specified by the colLabels of the SCE object. The top 2 highly expressed transcripts for each group were selected and a UMI count matrix where the rows are selected transcripts and columns are groups was used as input to a chi-square test of independence (chisq.test). Adjusted P-values were calculated by Benjamini–Hochberg correction.

Examples

outdir <- tempfile()
dir.create(outdir)
bc_allow <- file.path(outdir, "bc_allow.tsv")
genome_fa <- file.path(outdir, "rps24.fa")
R.utils::gunzip(
  filename = system.file("extdata", "bc_allow.tsv.gz", package = "FLAMES"),
  destname = bc_allow, remove = FALSE
)
R.utils::gunzip(
  filename = system.file("extdata", "rps24.fa.gz", package = "FLAMES"),
  destname = genome_fa, remove = FALSE
)

sce <- FLAMES::sc_long_pipeline(
  genome_fa = genome_fa,
  fastq = system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES"),
  annotation = system.file("extdata", "rps24.gtf.gz", package = "FLAMES"),
  outdir = outdir,
  barcodes_file = bc_allow,
  config_file = create_config(outdir, oarfish_quantification = FALSE)
)
#> Writing configuration parameters to:  /tmp/RtmpoS7Kzz/file1f6d5ee62b7f/config_file_8045.json 
#> 02:14:06 AM Fri Oct 25 2024 Start running
#> 02:14:06 AM Fri Oct 25 2024 Demultiplexing using flexiplex...
#> Matching cell barcodes...
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be  replaced
#> Setting number of threads to 8
#> Search pattern: 
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpoS7Kzz/file1f6d5ee62b7f/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /__w/_temp/Library/FLAMES/extdata/fastq/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 393
#> Number of reads where at least one barcode was found: 368
#> Number of reads with exactly one barcode match: 364
#> Number of chimera reads: 1
#> All done!
#> 02:14:06 AM Fri Oct 25 2024 Demultiplex done
#> Running FLAMES pipeline...
#> #### Input parameters:
#> {
#>   "pipeline_parameters": {
#>     "seed": [2022],
#>     "threads": [8],
#>     "do_barcode_demultiplex": [true],
#>     "do_gene_quantification": [true],
#>     "do_genome_alignment": [true],
#>     "do_isoform_identification": [true],
#>     "bambu_isoform_identification": [false],
#>     "multithread_isoform_identification": [false],
#>     "do_read_realignment": [true],
#>     "do_transcript_quantification": [true],
#>     "oarfish_quantification": [false]
#>   },
#>   "barcode_parameters": {
#>     "max_bc_editdistance": [2],
#>     "max_flank_editdistance": [8],
#>     "pattern": {
#>       "primer": ["CTACACGACGCTCTTCCGATCT"],
#>       "BC": ["NNNNNNNNNNNNNNNN"],
#>       "UMI": ["NNNNNNNNNNNN"],
#>       "polyT": ["TTTTTTTTT"]
#>     },
#>     "strand": ["-"],
#>     "TSO_seq": ["AAGCAGTGGTATCAACGCAGAGTACATGGG"],
#>     "TSO_prime": [3],
#>     "full_length_only": [false]
#>   },
#>   "isoform_parameters": {
#>     "generate_raw_isoform": [false],
#>     "max_dist": [10],
#>     "max_ts_dist": [100],
#>     "max_splice_match_dist": [10],
#>     "min_fl_exon_len": [40],
#>     "max_site_per_splice": [3],
#>     "min_sup_cnt": [5],
#>     "min_cnt_pct": [0.001],
#>     "min_sup_pct": [0.2],
#>     "bambu_trust_reference": [true],
#>     "strand_specific": [0],
#>     "remove_incomp_reads": [4],
#>     "downsample_ratio": [1]
#>   },
#>   "alignment_parameters": {
#>     "use_junctions": [true],
#>     "no_flank": [false]
#>   },
#>   "realign_parameters": {
#>     "use_annotation": [true]
#>   },
#>   "transcript_counting": {
#>     "min_tr_coverage": [0.4],
#>     "min_read_coverage": [0.4]
#>   }
#> } 
#> gene annotation: /__w/_temp/Library/FLAMES/extdata/rps24.gtf.gz 
#> genome fasta: /tmp/RtmpoS7Kzz/file1f6d5ee62b7f/rps24.fa 
#> input fastq: /tmp/RtmpoS7Kzz/file1f6d5ee62b7f/matched_reads.fastq 
#> output directory: /tmp/RtmpoS7Kzz/file1f6d5ee62b7f 
#> minimap2 path: 
#> k8 path: 
#> #### Aligning reads to genome using minimap2
#> 02:14:06 AM Fri Oct 25 2024 minimap2_align
#> 02:14:06 AM Fri Oct 25 2024 Start gene quantification and UMI deduplication
#> 02:14:06 AM Fri Oct 25 2024 quantify genes 
#> Found genome alignment file(s): 	align2genome.bam
#> 02:14:07 AM Fri Oct 25 2024 Gene quantification and UMI deduplication done!
#> 02:14:07 AM Fri Oct 25 2024 Start isoform identificaiton
#> 02:14:07 AM Fri Oct 25 2024 find_isoform
#> 02:14:07 AM Fri Oct 25 2024 Isoform identificaiton done
#> #### Realigning deduplicated reads to transcript using minimap2
#> 02:14:07 AM Fri Oct 25 2024 minimap2_realign
#> Sorting by position
#> #### Generating transcript count matrix
#> 02:14:07 AM Fri Oct 25 2024 quantify transcripts 
#> Found realignment file(s): 	realign2transcript.bam
group_anno <- data.frame(barcode_seq = colnames(sce), groups = SingleCellExperiment::counts(sce)["ENSMUST00000169826.2", ] > 1)
SingleCellExperiment::colLabels(sce) <- group_anno$groups
sc_DTU_analysis(sce, min_count = 1)
#> Loading isoform_FSM_annotation.csv ...
#> Selecting transcript_ids with full splice match ...
#> Summing transcripts with same FSM ... 
#> Creating FSM_count.csv.gz ...
#> /tmp/RtmpoS7Kzz/file1f6d5ee62b7f/FSM_count.csv.gz saved.
#> 	 3 FSM_match(s) found.
#> Filtering for genes with at least 2 detected isforms ...      3 FSM_match(s) left.
#> Keeping only the top 4 expressed FSM_matches for each gene ...      3 FSM_match(s) left.
#> Aggregating counts by cluster labels ...
#> Filtering isoforms ... 3  transcript_id(s) remaining.
#> Performing Chi-square tests ...
#> Warning: Chi-squared approximation(s) may be incorrect
#> Results saved to  /tmp/RtmpoS7Kzz/file1f6d5ee62b7f/sc_DTU_analysis.csv 
#>                         gene_id  X_value df                          DTU_tr
#> X-squared ENSMUSG00000025290.17 13.17771  2 ENSMUSG00000025290.17_19_5159_1
#>           DTU_group     p_value       adj_p
#> X-squared      TRUE 0.001375616 0.001375616