Skip to contents

demultiplex reads with flexiplex

Usage

find_barcode(
  fastq,
  barcodes_file,
  max_bc_editdistance = 2,
  max_flank_editdistance = 8,
  reads_out,
  stats_out,
  threads = 1,
  pattern = c(primer = "CTACACGACGCTCTTCCGATCT", BC = paste0(rep("N", 16), collapse =
    ""), UMI = paste0(rep("N", 12), collapse = ""), polyT = paste0(rep("T", 9), collapse
    = "")),
  TSO_seq = "",
  TSO_prime = 3,
  strand = "+",
  cutadapt_minimum_length = 1,
  full_length_only = FALSE
)

Arguments

fastq

A path to a FASTQ file or a directory containing FASTQ files.

barcodes_file

path to file containing barcode allow-list, with one barcode in each line

max_bc_editdistance

max edit distances for the barcode sequence

max_flank_editdistance

max edit distances for the flanking sequences (primer and polyT)

reads_out

path to output FASTQ file

stats_out

path of output stats file

threads

number of threads to be used

pattern

named character vector defining the barcode pattern

TSO_seq

TSO sequence to be trimmed

TSO_prime

either 3 (when TSO_seq is on 3' the end) or 5 (on 5' end)

strand

strand of the barcode pattern, either '+' or '-' (read will be reverse complemented after barcode matching if '-')

cutadapt_minimum_length

minimum read length after TSO trimming (cutadapt's –minimum-length)

full_length_only

boolean, when TSO sequence is provided, whether reads without TSO are to be discarded

Value

a list containing: reads_tb (tibble of read demultiplexed information) and input, output, read1_with_adapter from cutadapt report (if TSO trimming is performed)

Details

This function demultiplexes reads by searching for flanking sequences (adaptors) around the barcode sequence, and then matching against allowed barcodes.

Examples

outdir <- tempfile()
dir.create(outdir)
bc_allow <- file.path(outdir, "bc_allow.tsv")
R.utils::gunzip(
  filename = system.file("extdata", "bc_allow.tsv.gz", package = "FLAMES"),
  destname = bc_allow, remove = FALSE
)
find_barcode(
  fastq = system.file("extdata", "fastq", "musc_rps24.fastq.gz", package = "FLAMES"),
  stats_out = file.path(outdir, "bc_stat.tsv.gz"),
  reads_out = file.path(outdir, "demultiplexed.fastq.gz"),
  barcodes_file = bc_allow, 
  TSO_seq = "AAGCAGTGGTATCAACGCAGAGTACATGGG", TSO_prime = 5,
  strand = '-', cutadapt_minimum_length = 10, full_length_only = TRUE
)
#> FLEXIPLEX 0.96.2
#> Setting max barcode edit distance to 2
#> Setting max flanking sequence edit distance to 8
#> Setting read IDs to be  replaced
#> Setting number of threads to 1
#> Search pattern: 
#> primer: CTACACGACGCTCTTCCGATCT
#> BC: NNNNNNNNNNNNNNNN
#> UMI: NNNNNNNNNNNN
#> polyT: TTTTTTTTT
#> Setting known barcodes from /tmp/RtmpA5lNG6/file329064c67719/bc_allow.tsv
#> Number of known barcodes: 143
#> Processing file: /__w/_temp/Library/FLAMES/extdata/fastq/musc_rps24.fastq.gz
#> Searching for barcodes...
#> Number of reads processed: 393
#> Number of reads where at least one barcode was found: 368
#> Number of reads with exactly one barcode match: 364
#> Number of chimera reads: 1
#> All done!
#> $read_counts
#>                                       total reads 
#>                                               393 
#>                               demultiplexed reads 
#>                                               368 
#>                                single match reads 
#>                                               364 
#> single strand single barcode multi-matching reads 
#>                                                 0 
#>              single strand multiple barcode reads 
#>                                                 3 
#>                 both strands single barcode reads 
#>                                                 0 
#>               both strands multiple barcode reads 
#>                                                 1 
#> 
#> $stats_out
#> [1] "/tmp/RtmpA5lNG6/file329064c67719/bc_stat.tsv.gz"
#> 
#> $cutadapt
#> $cutadapt$tag
#> [1] "Cutadapt report"
#> 
#> $cutadapt$schema_version
#> [1] 0 3
#> 
#> $cutadapt$cutadapt_version
#> [1] "4.9"
#> 
#> $cutadapt$python_version
#> [1] "3.11.9"
#> 
#> $cutadapt$command_line_arguments
#>  [1] "-g"                                                               
#>  [2] "AAGCAGTGGTATCAACGCAGAGTACATGGG"                                   
#>  [3] "-o"                                                               
#>  [4] "/tmp/RtmpA5lNG6/file329064c67719/demultiplexed.fastq.gz"          
#>  [5] "/tmp/RtmpA5lNG6/file329064c67719/untrimmed_demultiplexed.fastq.gz"
#>  [6] "--json"                                                           
#>  [7] "/tmp/RtmpA5lNG6/file329064c67719/file329018b09ffd.json"           
#>  [8] "--untrimmed-output"                                               
#>  [9] "/tmp/RtmpA5lNG6/file329064c67719/noTSO_demultiplexed.fastq.gz"    
#> [10] "--minimum-length"                                                 
#> [11] "10"                                                               
#> 
#> $cutadapt$cores
#> [1] 1
#> 
#> $cutadapt$input
#> $cutadapt$input$path1
#> [1] "/tmp/RtmpA5lNG6/file329064c67719/untrimmed_demultiplexed.fastq.gz"
#> 
#> $cutadapt$input$path2
#> NULL
#> 
#> $cutadapt$input$paired
#> [1] FALSE
#> 
#> 
#> $cutadapt$read_counts
#> $cutadapt$read_counts$input
#> [1] 372
#> 
#> $cutadapt$read_counts$filtered
#> $cutadapt$read_counts$filtered$too_short
#> [1] 0
#> 
#> $cutadapt$read_counts$filtered$too_long
#> NULL
#> 
#> $cutadapt$read_counts$filtered$too_many_n
#> NULL
#> 
#> $cutadapt$read_counts$filtered$too_many_expected_errors
#> NULL
#> 
#> $cutadapt$read_counts$filtered$casava_filtered
#> NULL
#> 
#> $cutadapt$read_counts$filtered$discard_trimmed
#> NULL
#> 
#> $cutadapt$read_counts$filtered$discard_untrimmed
#> NULL
#> 
#> 
#> $cutadapt$read_counts$output
#> [1] 248
#> 
#> $cutadapt$read_counts$reverse_complemented
#> NULL
#> 
#> $cutadapt$read_counts$read1_with_adapter
#> [1] 248
#> 
#> $cutadapt$read_counts$read2_with_adapter
#> NULL
#> 
#> 
#> $cutadapt$basepair_counts
#> $cutadapt$basepair_counts$input
#> [1] 248786
#> 
#> $cutadapt$basepair_counts$input_read1
#> [1] 248786
#> 
#> $cutadapt$basepair_counts$input_read2
#> NULL
#> 
#> $cutadapt$basepair_counts$quality_trimmed
#> NULL
#> 
#> $cutadapt$basepair_counts$quality_trimmed_read1
#> NULL
#> 
#> $cutadapt$basepair_counts$quality_trimmed_read2
#> NULL
#> 
#> $cutadapt$basepair_counts$poly_a_trimmed
#> NULL
#> 
#> $cutadapt$basepair_counts$poly_a_trimmed_read1
#> NULL
#> 
#> $cutadapt$basepair_counts$poly_a_trimmed_read2
#> NULL
#> 
#> $cutadapt$basepair_counts$output
#> [1] 154283
#> 
#> $cutadapt$basepair_counts$output_read1
#> [1] 154283
#> 
#> $cutadapt$basepair_counts$output_read2
#> NULL
#> 
#> 
#> $cutadapt$adapters_read1
#>   name total_matches on_reverse_complement linked five_prime_end.type
#> 1    1           248                    NA  FALSE  regular_five_prime
#>          five_prime_end.sequence five_prime_end.error_rate
#> 1 AAGCAGTGGTATCAACGCAGAGTACATGGG                       0.1
#>   five_prime_end.indels five_prime_end.error_lengths five_prime_end.matches
#> 1                  TRUE                 9, 19, 2....                    248
#>   five_prime_end.adjacent_bases five_prime_end.dominant_adjacent_base
#> 1                            NA                                    NA
#>   five_prime_end.trimmed_lengths three_prime_end
#> 1                   c(3, 16,....              NA
#> 
#> $cutadapt$adapters_read2
#> NULL
#> 
#> $cutadapt$poly_a_trimmed_read1
#> NULL
#> 
#> $cutadapt$poly_a_trimmed_read2
#> NULL
#> 
#>