demultiplex reads with flexiplex
Usage
find_barcode(
fastq,
barcodes_file,
max_bc_editdistance = 2,
max_flank_editdistance = 8,
reads_out,
stats_out,
threads = 1,
pattern = c(primer = "CTACACGACGCTCTTCCGATCT", BC = paste0(rep("N", 16), collapse =
""), UMI = paste0(rep("N", 12), collapse = ""), polyT = paste0(rep("T", 9), collapse
= "")),
TSO_seq = "",
TSO_prime = 3,
strand = "+",
cutadapt_minimum_length = 1,
full_length_only = FALSE
)
Arguments
- fastq
character vector of paths to FASTQ files or folders, if named, the names will be used as sample names, otherwise the file names will be used
- barcodes_file
path to file containing barcode allow-list, with one barcode in each line
- max_bc_editdistance
max edit distances for the barcode sequence
- max_flank_editdistance
max edit distances for the flanking sequences (primer and polyT)
- reads_out
path to output FASTQ file; if multiple samples are processed, the sample name will be appended to this argument, e.g. provide
path/out.fq
for single sample, andpath/prefix
for multiple samples.- stats_out
path of output stats file; similar to
reads_out
, e.g. providepath/stats.tsv
for single sample, andpath/prefix
for multiple samples.- threads
number of threads to be used
- pattern
named character vector defining the barcode pattern
- TSO_seq
TSO sequence to be trimmed
- TSO_prime
either 3 (when
TSO_seq
is on 3' the end) or 5 (on 5' end)- strand
strand of the barcode pattern, either '+' or '-' (read will be reverse complemented after barcode matching if '-')
- cutadapt_minimum_length
minimum read length after TSO trimming (cutadapt's –minimum-length)
- full_length_only
boolean, when TSO sequence is provided, whether reads without TSO are to be discarded
Value
a list containing: reads_tb
(tibble of read demultiplexed information) and
input
, output
, read1_with_adapter
from cutadapt report
(if TSO trimming is performed)
Details
This function demultiplexes reads by searching for flanking sequences (adaptors) around the barcode sequence, and then matching against allowed barcodes. For single sample, either provide a single FASTQ file or a folder containing FASTQ files. For multiple samples, provide a vector of paths (either to FASTQ files or folders containing FASTQ files). Gzipped file input are supported but the output will be uncompressed.
Examples
outdir <- tempfile()
dir.create(outdir)
bc_allow <- file.path(outdir, "bc_allow.tsv")
R.utils::gunzip(
filename = system.file("extdata", "bc_allow.tsv.gz", package = "FLAMES"),
destname = bc_allow, remove = FALSE
)
# single sample
find_barcode(
fastq = system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES"),
stats_out = file.path(outdir, "bc_stat"),
reads_out = file.path(outdir, "demultiplexed.fq"),
barcodes_file = bc_allow,
TSO_seq = "AAGCAGTGGTATCAACGCAGAGTACATGGG", TSO_prime = 5,
strand = '-', cutadapt_minimum_length = 10, full_length_only = TRUE
)
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be replaced
#> Setting number of threads to 1
#> Search pattern:
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpqmWKMU/filecd677dec44dc/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /__w/_temp/Library/FLAMES/extdata/fastq/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 393
#> Number of reads where at least one barcode was found: 368
#> Number of reads with exactly one barcode match: 364
#> Number of chimera reads: 1
#> All done!
#> $musc_rps24
#> $musc_rps24$cutadapt
#> read1_with_adapter input output
#> 248 372 248
#>
#> $musc_rps24$reads_tb
#> # A tibble: 372 × 8
#> Read CellBarcode FlankEditDist BarcodeEditDist UMI TooShort Outfile Sample
#> <chr> <chr> <int> <int> <chr> <lgl> <chr> <chr>
#> 1 5040… CGGGTCAGTA… 1 0 CCGG… FALSE /tmp/R… musc_…
#> 2 d54e… CGGACTGAGT… 1 0 GGGA… FALSE /tmp/R… musc_…
#> 3 94bf… CTGTGCTTCC… 1 2 TTCT… FALSE /tmp/R… musc_…
#> 4 d644… GTCGTAATCC… 7 0 GGTA… FALSE /tmp/R… musc_…
#> 5 055d… TTTATGCAGA… 1 0 ATTC… FALSE /tmp/R… musc_…
#> 6 66cb… GTAACTGAGA… 2 0 TGAG… FALSE /tmp/R… musc_…
#> 7 e427… CGAACATGTC… 2 0 ATAC… FALSE /tmp/R… musc_…
#> 8 5f74… ACCGTAACAA… 3 1 CCTC… FALSE /tmp/R… musc_…
#> 9 f164… GACTAACTCC… 3 1 GTTC… FALSE /tmp/R… musc_…
#> 10 c7ae… CATCGAACAG… 2 1 TTAA… FALSE /tmp/R… musc_…
#> # ℹ 362 more rows
#>
#> $musc_rps24$read_counts
#> total reads
#> 393
#> demultiplexed reads
#> 368
#> single match reads
#> 364
#> single strand single barcode multi-matching reads
#> 0
#> single strand multiple barcode reads
#> 3
#> both strands single barcode reads
#> 0
#> both strands multiple barcode reads
#> 1
#>
#>
# multi-sample
fastq_dir <- tempfile()
dir.create(fastq_dir)
file.copy(system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES"),
file.path(fastq_dir, "musc_rps24.fastq.gz"))
#> [1] TRUE
sampled_lines <- readLines(file.path(fastq_dir, "musc_rps24.fastq.gz"), n = 400)
writeLines(sampled_lines, file.path(fastq_dir, "copy.fastq"))
result <- find_barcode(
# you can mix folders and files. each path will be considered as a sample
fastq = c(fastq_dir, system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES")),
stats_out = file.path(outdir, "bc_stat"),
reads_out = file.path(outdir, "demultiplexed.fq"),
barcodes_file = bc_allow, TSO_seq = "CCCATGTACTCTGCGTTGATACCACTGCTT"
)
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be replaced
#> Setting number of threads to 1
#> Search pattern:
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpqmWKMU/filecd677dec44dc/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /tmp/RtmpqmWKMU/filecd6778889a60/copy.fastq
#> Searching for barcodes...
#> Processing file: /tmp/RtmpqmWKMU/filecd6778889a60/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 493
#> Number of reads where at least one barcode was found: 460
#> Number of reads with exactly one barcode match: 455
#> Number of chimera reads: 2
#> All done!
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be replaced
#> Setting number of threads to 1
#> Search pattern:
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpqmWKMU/filecd677dec44dc/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /__w/_temp/Library/FLAMES/extdata/fastq/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 393
#> Number of reads where at least one barcode was found: 368
#> Number of reads with exactly one barcode match: 364
#> Number of chimera reads: 1
#> All done!