Skip to contents

demultiplex reads with flexiplex

Usage

find_barcode(
  fastq,
  barcodes_file,
  max_bc_editdistance = 2,
  max_flank_editdistance = 8,
  reads_out,
  stats_out,
  threads = 1,
  pattern = c(primer = "CTACACGACGCTCTTCCGATCT", BC = paste0(rep("N", 16), collapse =
    ""), UMI = paste0(rep("N", 12), collapse = ""), polyT = paste0(rep("T", 9), collapse
    = "")),
  TSO_seq = "",
  TSO_prime = 3,
  strand = "+",
  cutadapt_minimum_length = 1,
  full_length_only = FALSE
)

Arguments

fastq

character vector of paths to FASTQ files or folders, if named, the names will be used as sample names, otherwise the file names will be used

barcodes_file

path to file containing barcode allow-list, with one barcode in each line

max_bc_editdistance

max edit distances for the barcode sequence

max_flank_editdistance

max edit distances for the flanking sequences (primer and polyT)

reads_out

path to output FASTQ file; if multiple samples are processed, the sample name will be appended to this argument, e.g. provide path/out.fq for single sample, and path/prefix for multiple samples.

stats_out

path of output stats file; similar to reads_out, e.g. provide path/stats.tsv for single sample, and path/prefix for multiple samples.

threads

number of threads to be used

pattern

named character vector defining the barcode pattern

TSO_seq

TSO sequence to be trimmed

TSO_prime

either 3 (when TSO_seq is on 3' the end) or 5 (on 5' end)

strand

strand of the barcode pattern, either '+' or '-' (read will be reverse complemented after barcode matching if '-')

cutadapt_minimum_length

minimum read length after TSO trimming (cutadapt's –minimum-length)

full_length_only

boolean, when TSO sequence is provided, whether reads without TSO are to be discarded

Value

a list containing: reads_tb (tibble of read demultiplexed information) and input, output, read1_with_adapter from cutadapt report (if TSO trimming is performed)

Details

This function demultiplexes reads by searching for flanking sequences (adaptors) around the barcode sequence, and then matching against allowed barcodes. For single sample, either provide a single FASTQ file or a folder containing FASTQ files. For multiple samples, provide a vector of paths (either to FASTQ files or folders containing FASTQ files). Gzipped file input are supported but the output will be uncompressed.

Examples

outdir <- tempfile()
dir.create(outdir)
bc_allow <- file.path(outdir, "bc_allow.tsv")
R.utils::gunzip(
  filename = system.file("extdata", "bc_allow.tsv.gz", package = "FLAMES"),
  destname = bc_allow, remove = FALSE
)
# single sample
find_barcode(
  fastq = system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES"),
  stats_out = file.path(outdir, "bc_stat"),
  reads_out = file.path(outdir, "demultiplexed.fq"),
  barcodes_file = bc_allow, 
  TSO_seq = "AAGCAGTGGTATCAACGCAGAGTACATGGG", TSO_prime = 5,
  strand = '-', cutadapt_minimum_length = 10, full_length_only = TRUE
)
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be  replaced
#> Setting number of threads to 1
#> Search pattern: 
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpqmWKMU/filecd677dec44dc/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /__w/_temp/Library/FLAMES/extdata/fastq/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 393
#> Number of reads where at least one barcode was found: 368
#> Number of reads with exactly one barcode match: 364
#> Number of chimera reads: 1
#> All done!
#> $musc_rps24
#> $musc_rps24$cutadapt
#> read1_with_adapter              input             output 
#>                248                372                248 
#> 
#> $musc_rps24$reads_tb
#> # A tibble: 372 × 8
#>    Read  CellBarcode FlankEditDist BarcodeEditDist UMI   TooShort Outfile Sample
#>    <chr> <chr>               <int>           <int> <chr> <lgl>    <chr>   <chr> 
#>  1 5040… CGGGTCAGTA…             1               0 CCGG… FALSE    /tmp/R… musc_…
#>  2 d54e… CGGACTGAGT…             1               0 GGGA… FALSE    /tmp/R… musc_…
#>  3 94bf… CTGTGCTTCC…             1               2 TTCT… FALSE    /tmp/R… musc_…
#>  4 d644… GTCGTAATCC…             7               0 GGTA… FALSE    /tmp/R… musc_…
#>  5 055d… TTTATGCAGA…             1               0 ATTC… FALSE    /tmp/R… musc_…
#>  6 66cb… GTAACTGAGA…             2               0 TGAG… FALSE    /tmp/R… musc_…
#>  7 e427… CGAACATGTC…             2               0 ATAC… FALSE    /tmp/R… musc_…
#>  8 5f74… ACCGTAACAA…             3               1 CCTC… FALSE    /tmp/R… musc_…
#>  9 f164… GACTAACTCC…             3               1 GTTC… FALSE    /tmp/R… musc_…
#> 10 c7ae… CATCGAACAG…             2               1 TTAA… FALSE    /tmp/R… musc_…
#> # ℹ 362 more rows
#> 
#> $musc_rps24$read_counts
#>                                       total reads 
#>                                               393 
#>                               demultiplexed reads 
#>                                               368 
#>                                single match reads 
#>                                               364 
#> single strand single barcode multi-matching reads 
#>                                                 0 
#>              single strand multiple barcode reads 
#>                                                 3 
#>                 both strands single barcode reads 
#>                                                 0 
#>               both strands multiple barcode reads 
#>                                                 1 
#> 
#> 
# multi-sample
fastq_dir <- tempfile()
dir.create(fastq_dir)
file.copy(system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES"),
  file.path(fastq_dir, "musc_rps24.fastq.gz"))
#> [1] TRUE
sampled_lines <- readLines(file.path(fastq_dir, "musc_rps24.fastq.gz"), n = 400)
writeLines(sampled_lines, file.path(fastq_dir, "copy.fastq"))
result <- find_barcode(
  # you can mix folders and files. each path will be considered as a sample
  fastq = c(fastq_dir, system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES")),
  stats_out = file.path(outdir, "bc_stat"),
  reads_out = file.path(outdir, "demultiplexed.fq"),
  barcodes_file = bc_allow, TSO_seq = "CCCATGTACTCTGCGTTGATACCACTGCTT"
)
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be  replaced
#> Setting number of threads to 1
#> Search pattern: 
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpqmWKMU/filecd677dec44dc/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /tmp/RtmpqmWKMU/filecd6778889a60/copy.fastq
#> Searching for barcodes...
#> Processing file: /tmp/RtmpqmWKMU/filecd6778889a60/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 493
#> Number of reads where at least one barcode was found: 460
#> Number of reads with exactly one barcode match: 455
#> Number of chimera reads: 2
#> All done!
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be  replaced
#> Setting number of threads to 1
#> Search pattern: 
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpqmWKMU/filecd677dec44dc/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /__w/_temp/Library/FLAMES/extdata/fastq/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 393
#> Number of reads where at least one barcode was found: 368
#> Number of reads with exactly one barcode match: 364
#> Number of chimera reads: 1
#> All done!